fix types for huggingface_hub
severo committed Jan 28, 2025
1 parent 223973d commit bd31ec5
Showing 18 changed files with 55 additions and 38 deletions.
3 changes: 1 addition & 2 deletions e2e/pyproject.toml
@@ -34,8 +34,7 @@ strict = true
module = [
"huggingface_hub.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
no_implicit_reexport = false

[tool.ruff]
line-length = 119
3 changes: 1 addition & 2 deletions jobs/cache_maintenance/pyproject.toml
@@ -33,8 +33,7 @@ strict = true
module = [
"huggingface_hub.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
no_implicit_reexport = false

[tool.ruff]
line-length = 119
1 change: 0 additions & 1 deletion libs/libapi/pyproject.toml
@@ -40,7 +40,6 @@ strict = true
module = [
"datasets.*",
"ecdsa.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*"
14 changes: 12 additions & 2 deletions libs/libcommon/pyproject.toml
@@ -61,12 +61,13 @@ markers = [

[tool.mypy]
strict = true
# allow calling untyped methods in huggingface_hub (eg: DatasetInfo(...))
untyped_calls_exclude = "huggingface_hub"

[[tool.mypy.overrides]]
module = [
"datasets.*",
"networkx.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"pymongoarrow.*",
@@ -77,9 +78,18 @@ module = [
"aiobotocore.*",
"requests.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
# allow
# from huggingface_hub.utils import build_hf_headers
# even if the module does not explicitly export it
# https://github.com/huggingface/huggingface_hub/blob/07896ee75b37da0d1744c9d03472485b985b3213/src/huggingface_hub/utils/__init__.py
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
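
Note on the two settings added here (and repeated in services/worker below): a minimal sketch, assuming a strict-mode module in this repository, of the kind of code they accept. `untyped_calls_exclude = "huggingface_hub"` tolerates calls into huggingface_hub that mypy sees as untyped (eg: DatasetInfo(...)), and the `no_implicit_reexport = false` override accepts importing names that huggingface_hub.utils re-exports without declaring them explicitly. The snippet is illustrative only, not part of this commit.

from typing import Any

from huggingface_hub.hf_api import DatasetInfo
from huggingface_hub.utils import build_hf_headers  # re-exported name; accepted thanks to no_implicit_reexport = false


def build_auth_headers(token: str) -> dict[str, str]:
    # build_hf_headers returns the HTTP headers (user-agent, authorization) used to call the Hub API.
    return build_hf_headers(token=token)


def fake_dataset_info(dataset: str, **kwargs: Any) -> DatasetInfo:
    # DatasetInfo(...) is an untyped call for mypy (its __init__ takes **kwargs);
    # untyped_calls_exclude = "huggingface_hub" keeps strict mode from flagging it.
    return DatasetInfo(id=dataset, **kwargs)
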
7 changes: 6 additions & 1 deletion services/admin/pyproject.toml
@@ -41,12 +41,17 @@ strict = true

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
"prometheus_client.*"
]
ignore_missing_imports = true
# ^ prometheus_client is now typed, but starlette-prometheus requires an old version

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
1 change: 0 additions & 1 deletion services/api/pyproject.toml
@@ -43,7 +43,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/rows/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/search/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
module = [
"datasets.*",
"fsspec.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*"
1 change: 0 additions & 1 deletion services/sse-api/pyproject.toml
@@ -46,7 +46,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/webhook/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
9 changes: 8 additions & 1 deletion services/worker/pyproject.toml
@@ -57,12 +57,13 @@ markers = [

[tool.mypy]
strict = true
# allow calling untyped methods in huggingface_hub (eg: DatasetInfo(...))
untyped_calls_exclude = "huggingface_hub"

[[tool.mypy.overrides]]
module = [
"aiolimiter.*",
"datasets.*",
"huggingface_hub.*",
"pyarrow.*",
"tqdm.*",
"fsspec.*",
@@ -73,6 +74,12 @@ module = [
ignore_missing_imports = true
# ^ prometheus_client is now typed, but we use an old version to stick with the other services

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
@@ -39,7 +39,7 @@ def set_datasets_cache(self, cache_subdirectory: Optional[Path]) -> None:
datasets.config.EXTRACTED_DATASETS_PATH = (
datasets.config.HF_DATASETS_CACHE / datasets.config.EXTRACTED_DATASETS_DIR
)
huggingface_hub.constants.HF_HUB_CACHE = cache_subdirectory / "hub"
huggingface_hub.constants.HF_HUB_CACHE = str(cache_subdirectory / "hub")
logging.debug(f"huggingface_hub cache set to: {huggingface_hub.constants.HF_HUB_CACHE}")

def pre_compute(self) -> None:
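
The cast to str above (mirrored in the tests at the end of this diff) follows from huggingface_hub.constants.HF_HUB_CACHE being a plain str rather than a Path. A small illustrative sketch of the resulting pattern, not part of this commit:

from pathlib import Path

import huggingface_hub.constants


def set_hub_cache(cache_subdirectory: Path) -> None:
    # The constant is typed as str, so store a string value...
    huggingface_hub.constants.HF_HUB_CACHE = str(cache_subdirectory / "hub")


def hub_cache_is_under(base_path: Path) -> bool:
    # ...and wrap it back in Path whenever path operations such as is_relative_to are needed.
    return Path(huggingface_hub.constants.HF_HUB_CACHE).is_relative_to(base_path)
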
26 changes: 14 additions & 12 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -51,7 +51,7 @@
CommitOperationDelete,
)
from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
from huggingface_hub.hf_api import CommitInfo, DatasetInfo, HfApi, RepoFile
from huggingface_hub.hf_api import CommitInfo, DatasetInfo, HfApi, RepoFile, RepoSibling
from huggingface_hub.utils._http import HTTP_METHOD_T, Response, http_backoff
from libcommon.constants import (
PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS,
@@ -105,9 +105,7 @@ def http_backoff_with_timeout(method: HTTP_METHOD_T, url: str, **kwargs: Any) ->
return http_backoff(method, url, **kwargs)


def repo_file_rfilename_sort_key(repo_file: RepoFile) -> str:
if not isinstance(repo_file.rfilename, str): # check type for mypy
raise ValueError(f"Expected a string for repo_file.rfilename, but got a '{type(repo_file.rfilename)}'.")
def repo_file_rfilename_sort_key(repo_file: RepoSibling) -> str:
return repo_file.rfilename


@@ -178,7 +176,7 @@ def parse_repo_filename(filename: str) -> tuple[str, str]:


def create_parquet_file_item(
repo_file: RepoFile,
repo_file: RepoSibling,
dataset: str,
config: str,
hf_endpoint: str,
@@ -235,7 +233,7 @@ def _is_too_big_from_hub(
Returns:
`bool`: if dataset size is bigger than max value.
"""
dataset_size: int = sum(sibling.size for sibling in dataset_info.siblings if sibling.size is not None)
dataset_size: int = sum(sibling.size for sibling in (dataset_info.siblings or []) if sibling.size is not None)
return bool(dataset_size > max_dataset_size_bytes)


@@ -1147,7 +1145,10 @@ def create_commits(


def get_delete_operations(
parquet_operations: list[CommitOperationAdd], all_repo_files: set[str], config_names: set[str], config: str
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]],
all_repo_files: set[str],
config_names: set[str],
config: str,
) -> list[CommitOperationDelete]:
# - get files that will be preserved in repo:
# 1. parquet files belonging to any other config (otherwise outdated files might be preserved)
@@ -1179,7 +1180,7 @@ def commit_parquet_conversion(
dataset: str,
config: str,
config_names: set[str],
parquet_operations: list[CommitOperation],
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]],
commit_message: str,
target_revision: Optional[str],
) -> None:
@@ -1199,7 +1200,7 @@ config_names (`list[str]`):
config_names (`list[str]`):
The list of all the configurations of this dataset. This is used to clean
the other files and directories in the repo, if any.
parquet_operations (`list[huggingface_hub.hf_api.CommitOperation]`):
parquet_operations (`Union[list[CommitOperationAdd], list[CommitOperationCopy]]`):
List of commit operation for the parquet conversion. It could be
file additions or file copies for example.
commit_message (`str`):
Expand All @@ -1219,11 +1220,11 @@ def commit_parquet_conversion(
If one of the commits could not be created on the Hub.
"""
target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
all_repo_files: set[str] = {f.rfilename for f in target_dataset_info.siblings}
all_repo_files: set[str] = {f.rfilename for f in (target_dataset_info.siblings or [])}
delete_operations = get_delete_operations(
parquet_operations=parquet_operations, all_repo_files=all_repo_files, config_names=config_names, config=config
)
operations = delete_operations + parquet_operations
operations: list[CommitOperation] = list(delete_operations + parquet_operations)
logging.info(f"{len(operations)} git operations to do for {dataset=} {config=}.")
create_commits(
committer_hf_api,
@@ -1372,6 +1373,7 @@ def compute_config_parquet_and_info_response(

partial = False
estimated_dataset_info: Optional[dict[str, Any]] = None
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]] = []
try:
if is_parquet_builder_with_hub_files(builder):
try:
@@ -1481,7 +1483,7 @@

repo_files = [
repo_file
for repo_file in target_dataset_info.siblings
for repo_file in (target_dataset_info.siblings or [])
if repo_file.rfilename.startswith(f"{config}/") and repo_file.rfilename.endswith(".parquet")
]
repo_files.sort(key=repo_file_rfilename_sort_key)
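
The recurring pattern in this file: DatasetInfo.siblings is typed as Optional[list[RepoSibling]], and RepoSibling.rfilename is always a str, so the isinstance check kept for mypy is no longer needed and a `(siblings or [])` fallback covers the None case. A minimal sketch (hypothetical helper, not from this commit) of the same pattern:

from huggingface_hub.hf_api import DatasetInfo, RepoSibling


def sorted_parquet_siblings(dataset_info: DatasetInfo, config: str) -> list[RepoSibling]:
    # siblings is Optional on DatasetInfo, hence the `or []` fallback.
    parquet_siblings = [
        sibling
        for sibling in (dataset_info.siblings or [])
        if sibling.rfilename.startswith(f"{config}/") and sibling.rfilename.endswith(".parquet")
    ]
    # rfilename is declared as str on RepoSibling, so it can be used directly as a sort key.
    return sorted(parquet_siblings, key=lambda sibling: sibling.rfilename)
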
4 changes: 2 additions & 2 deletions services/worker/src/worker/job_runners/dataset/filetypes.py
@@ -94,13 +94,13 @@ def compute_filetypes_response(
raise DatasetNotFoundError(f"Cannot get the dataset info for {dataset=}") from err

# get file types count
filetypes = get_filetypes(info.siblings)
filetypes = get_filetypes(info.siblings or [])

# look into the zip archives to get the file types
SUPPORTED_ARCHIVE_EXTENSIONS = [".zip"]
archive_filenames = [
sibling.rfilename
for sibling in info.siblings
for sibling in (info.siblings or [])
if get_file_extension(sibling.rfilename, recursive=False, clean=False).extension
in SUPPORTED_ARCHIVE_EXTENSIONS
]
6 changes: 3 additions & 3 deletions services/worker/src/worker/job_runners/split/duckdb_index.py
@@ -130,7 +130,7 @@ def check_indexable(feature: FeatureType) -> None:
return indexable_columns


def get_monolingual_stemmer(card_data: DatasetCardData) -> str:
def get_monolingual_stemmer(card_data: Optional[DatasetCardData]) -> str:
if card_data is None:
return DEFAULT_STEMMER
all_languages = card_data["language"]
@@ -388,7 +388,7 @@ def compute_split_duckdb_index_response(

logging.debug(f"get dataset info for {dataset=} with {target_revision=}")
target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
all_repo_files: set[str] = {f.rfilename for f in target_dataset_info.siblings}
all_repo_files: set[str] = {f.rfilename for f in (target_dataset_info.siblings or [])}
delete_operations = get_delete_operations(
all_repo_files=all_repo_files,
split_names=get_split_names(dataset=dataset, config=config),
@@ -458,7 +458,7 @@ def compute_split_duckdb_index_response(
raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err

repo_files = [
repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location
repo_file for repo_file in (target_dataset_info.siblings or []) if repo_file.rfilename == index_file_location
]

if not repo_files or len(repo_files) != 1:
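
The card data attached to a dataset repo can be absent, so the parameter above becomes Optional and the function falls back to the default stemmer. An illustrative, simplified sketch of that handling (DEFAULT_STEMMER here is a placeholder; the real default and full logic live in this module):

from typing import Optional

from huggingface_hub import DatasetCardData, HfApi

DEFAULT_STEMMER = "porter"  # placeholder value for this sketch


def stemmer_for_card(card_data: Optional[DatasetCardData]) -> str:
    # Simplified stand-in for get_monolingual_stemmer: accept None (repos without card
    # metadata) and fall back to the default instead of assuming card data exists.
    if card_data is None:
        return DEFAULT_STEMMER
    language = card_data["language"]
    return language if isinstance(language, str) else DEFAULT_STEMMER


def stemmer_for_dataset(dataset: str) -> str:
    # DatasetInfo.card_data is itself Optional, so it can be passed straight through.
    return stemmer_for_card(HfApi().dataset_info(repo_id=dataset).card_data)
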
5 changes: 3 additions & 2 deletions services/worker/src/worker/utils.py
@@ -156,7 +156,8 @@ def hf_hub_open_file(
) -> HfFileSystemFile:
"""Open file with the HfFileSystem."""
fs = HfFileSystem(endpoint=hf_endpoint, token=hf_token)
return fs.open(file_url, revision=revision)
file: HfFileSystemFile = fs.open(file_url, revision=revision)
return file


# used by `config-parquet-and-info` and `config-parquet-metadata` steps
@@ -215,7 +216,7 @@ def check_split_exists(dataset: str, config: str, split: str) -> None:
splits_content = split_names_response["content"]["splits"]
except Exception as e:
raise PreviousStepFormatError(
"Previous step 'config-split-names' did not return" " the expected content.",
"Previous step 'config-split-names' did not return the expected content.",
e,
) from e

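
The change to hf_hub_open_file above binds the result of fs.open() to an annotated variable: fsspec's open() is not precisely typed, so the annotation gives strict mypy (and readers) the HfFileSystemFile type the caller expects. A hedged sketch of the same pattern, assuming the HfFileSystem API shown above:

from typing import Optional

from huggingface_hub.hf_file_system import HfFileSystem, HfFileSystemFile


def open_hub_file(
    file_url: str, hf_endpoint: str, hf_token: Optional[str], revision: Optional[str] = None
) -> HfFileSystemFile:
    fs = HfFileSystem(endpoint=hf_endpoint, token=hf_token)
    # fs.open() is inherited from fsspec and loosely typed; annotating the local variable
    # gives the value a concrete type so the function's return type checks under strict mode.
    file: HfFileSystemFile = fs.open(file_url, revision=revision)
    return file
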
@@ -23,7 +23,7 @@
Generator as ParametrizedGeneratorBasedBuilder,
)
from datasets.utils.py_utils import asdict
from huggingface_hub.hf_api import CommitOperationAdd, HfApi
from huggingface_hub.hf_api import CommitOperation, CommitOperationAdd, HfApi
from libcommon.dtos import JobInfo, JobParams, Priority
from libcommon.queue.jobs import Queue
from libcommon.resources import CacheMongoResource, QueueMongoResource
@@ -400,7 +400,7 @@ def test_create_commits(
else:
parent_commit = None
directory = f".test_create_commits_{max_operations_per_commit}_{use_parent_commit}"
operations: list[CommitOperationAdd] = [
operations: list[CommitOperation] = [
CommitOperationAdd(path_in_repo=f"{directory}/file{i}.txt", path_or_fileobj=f"content{i}".encode("UTF-8"))
for i in range(NUM_FILES)
]
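
The annotation change in test_create_commits is consistent with mypy's invariance rule for list: a list[CommitOperationAdd] is not accepted where a parameter is typed list[CommitOperation], so the variable is annotated with the wider union type up front. A small illustrative sketch, not part of this commit:

from huggingface_hub.hf_api import CommitOperation, CommitOperationAdd, CommitOperationDelete


def count_operations(operations: list[CommitOperation]) -> int:
    return len(operations)


adds: list[CommitOperationAdd] = [CommitOperationAdd(path_in_repo="a.txt", path_or_fileobj=b"a")]
# count_operations(adds)  # rejected by mypy: list is invariant, so list[CommitOperationAdd] != list[CommitOperation]

operations: list[CommitOperation] = [
    CommitOperationAdd(path_in_repo="a.txt", path_or_fileobj=b"a"),
    CommitOperationDelete(path_in_repo="b.txt"),
]
count_operations(operations)  # accepted: the variable is annotated with the union type from the start
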
@@ -76,7 +76,7 @@ def test_set_datasets_cache(app_config: AppConfig, get_job_runner: GetJobRunner)
dummy_path = base_path / "dummy"
job_runner.set_datasets_cache(dummy_path)
assert datasets.config.HF_DATASETS_CACHE.is_relative_to(dummy_path)
assert huggingface_hub.constants.HF_HUB_CACHE.is_relative_to(dummy_path)
assert Path(huggingface_hub.constants.HF_HUB_CACHE).is_relative_to(dummy_path)


def test_pre_compute_post_compute(app_config: AppConfig, get_job_runner: GetJobRunner) -> None:
@@ -105,4 +105,4 @@ def assert_datasets_cache_path(path: Optional[Path], exists: bool) -> None:
datasets.config.DOWNLOADED_DATASETS_PATH == datasets_cache_path / datasets.config.DOWNLOADED_DATASETS_DIR
)
assert datasets.config.EXTRACTED_DATASETS_PATH == datasets_cache_path / datasets.config.EXTRACTED_DATASETS_DIR
assert huggingface_hub.constants.HF_HUB_CACHE == hub_cache_path
assert huggingface_hub.constants.HF_HUB_CACHE == str(hub_cache_path)
