fix types for huggingface_hub
severo committed Jan 28, 2025
1 parent 223973d commit bd31ec5
Showing 18 changed files with 55 additions and 38 deletions.
3 changes: 1 addition & 2 deletions e2e/pyproject.toml
@@ -34,8 +34,7 @@ strict = true
module = [
"huggingface_hub.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
no_implicit_reexport = false

[tool.ruff]
line-length = 119
3 changes: 1 addition & 2 deletions jobs/cache_maintenance/pyproject.toml
@@ -33,8 +33,7 @@ strict = true
module = [
"huggingface_hub.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
no_implicit_reexport = false

[tool.ruff]
line-length = 119
1 change: 0 additions & 1 deletion libs/libapi/pyproject.toml
@@ -40,7 +40,6 @@ strict = true
module = [
"datasets.*",
"ecdsa.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*"
14 changes: 12 additions & 2 deletions libs/libcommon/pyproject.toml
@@ -61,12 +61,13 @@ markers = [

[tool.mypy]
strict = true
# allow calling untyped methods in huggingface_hub (eg: DatasetInfo(...))
untyped_calls_exclude = "huggingface_hub"

[[tool.mypy.overrides]]
module = [
"datasets.*",
"networkx.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"pymongoarrow.*",
@@ -77,9 +78,18 @@ module = [
"aiobotocore.*",
"requests.*",
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
# allow
# from huggingface_hub.utils import build_hf_headers
# even if the module does not explicitly export it
# https://github.com/huggingface/huggingface_hub/blob/07896ee75b37da0d1744c9d03472485b985b3213/src/huggingface_hub/utils/__init__.py
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
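
Note on the two settings added here (and repeated in services/worker below): a minimal sketch, assuming a strict-mode module in this repository, of the kind of code they accept. `untyped_calls_exclude = "huggingface_hub"` tolerates calls into huggingface_hub that mypy sees as untyped (eg: DatasetInfo(...)), and the `no_implicit_reexport = false` override accepts importing names that huggingface_hub.utils re-exports without declaring them explicitly. The snippet is illustrative only, not part of this commit.

from typing import Any

from huggingface_hub.hf_api import DatasetInfo
from huggingface_hub.utils import build_hf_headers  # re-exported name; accepted thanks to no_implicit_reexport = false


def build_auth_headers(token: str) -> dict[str, str]:
    # build_hf_headers returns the HTTP headers (user-agent, authorization) used to call the Hub API.
    return build_hf_headers(token=token)


def fake_dataset_info(dataset: str, **kwargs: Any) -> DatasetInfo:
    # DatasetInfo(...) is an untyped call for mypy (its __init__ takes **kwargs);
    # untyped_calls_exclude = "huggingface_hub" keeps strict mode from flagging it.
    return DatasetInfo(id=dataset, **kwargs)
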
7 changes: 6 additions & 1 deletion services/admin/pyproject.toml
@@ -41,12 +41,17 @@ strict = true

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
"prometheus_client.*"
]
ignore_missing_imports = true
# ^ prometheus_client is now typed, but starlette-prometheus requires an old version

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
1 change: 0 additions & 1 deletion services/api/pyproject.toml
@@ -43,7 +43,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/rows/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/search/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
module = [
"datasets.*",
"fsspec.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*"
1 change: 0 additions & 1 deletion services/sse-api/pyproject.toml
@@ -46,7 +46,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
1 change: 0 additions & 1 deletion services/webhook/pyproject.toml
@@ -42,7 +42,6 @@ disallow_untyped_calls = false
[[tool.mypy.overrides]]
module = [
"datasets.*",
"huggingface_hub.*",
"prometheus_client.*",
"pyarrow.*",
"tqdm.*",
9 changes: 8 additions & 1 deletion services/worker/pyproject.toml
@@ -57,12 +57,13 @@ markers = [

[tool.mypy]
strict = true
# allow calling untyped methods in huggingface_hub (eg: DatasetInfo(...))
untyped_calls_exclude = "huggingface_hub"

[[tool.mypy.overrides]]
module = [
"aiolimiter.*",
"datasets.*",
"huggingface_hub.*",
"pyarrow.*",
"tqdm.*",
"fsspec.*",
@@ -73,6 +74,12 @@ module = [
ignore_missing_imports = true
# ^ prometheus_client is now typed, but we use an old version to stick with the other services

[[tool.mypy.overrides]]
module = [
"huggingface_hub.*",
]
no_implicit_reexport = false

[tool.ruff]
line-length = 119
src = ["src"]
@@ -39,7 +39,7 @@ def set_datasets_cache(self, cache_subdirectory: Optional[Path]) -> None:
datasets.config.EXTRACTED_DATASETS_PATH = (
datasets.config.HF_DATASETS_CACHE / datasets.config.EXTRACTED_DATASETS_DIR
)
huggingface_hub.constants.HF_HUB_CACHE = cache_subdirectory / "hub"
huggingface_hub.constants.HF_HUB_CACHE = str(cache_subdirectory / "hub")
logging.debug(f"huggingface_hub cache set to: {huggingface_hub.constants.HF_HUB_CACHE}")

def pre_compute(self) -> None:
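
The cast to str above (mirrored in the tests at the end of this diff) follows from huggingface_hub.constants.HF_HUB_CACHE being a plain str rather than a Path. A small illustrative sketch of the resulting pattern, not part of this commit:

from pathlib import Path

import huggingface_hub.constants


def set_hub_cache(cache_subdirectory: Path) -> None:
    # The constant is typed as str, so store a string value...
    huggingface_hub.constants.HF_HUB_CACHE = str(cache_subdirectory / "hub")


def hub_cache_is_under(base_path: Path) -> bool:
    # ...and wrap it back in Path whenever path operations such as is_relative_to are needed.
    return Path(huggingface_hub.constants.HF_HUB_CACHE).is_relative_to(base_path)
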
26 changes: 14 additions & 12 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -51,7 +51,7 @@
CommitOperationDelete,
)
from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
from huggingface_hub.hf_api import CommitInfo, DatasetInfo, HfApi, RepoFile
from huggingface_hub.hf_api import CommitInfo, DatasetInfo, HfApi, RepoFile, RepoSibling
from huggingface_hub.utils._http import HTTP_METHOD_T, Response, http_backoff
from libcommon.constants import (
PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS,
@@ -105,9 +105,7 @@ def http_backoff_with_timeout(method: HTTP_METHOD_T, url: str, **kwargs: Any) ->
return http_backoff(method, url, **kwargs)


def repo_file_rfilename_sort_key(repo_file: RepoFile) -> str:
if not isinstance(repo_file.rfilename, str): # check type for mypy
raise ValueError(f"Expected a string for repo_file.rfilename, but got a '{type(repo_file.rfilename)}'.")
def repo_file_rfilename_sort_key(repo_file: RepoSibling) -> str:
return repo_file.rfilename


@@ -178,7 +176,7 @@ def parse_repo_filename(filename: str) -> tuple[str, str]:


def create_parquet_file_item(
repo_file: RepoFile,
repo_file: RepoSibling,
dataset: str,
config: str,
hf_endpoint: str,
@@ -235,7 +233,7 @@ def _is_too_big_from_hub(
Returns:
`bool`: if dataset size is bigger than max value.
"""
dataset_size: int = sum(sibling.size for sibling in dataset_info.siblings if sibling.size is not None)
dataset_size: int = sum(sibling.size for sibling in (dataset_info.siblings or []) if sibling.size is not None)
return bool(dataset_size > max_dataset_size_bytes)


@@ -1147,7 +1145,10 @@ def create_commits(


def get_delete_operations(
parquet_operations: list[CommitOperationAdd], all_repo_files: set[str], config_names: set[str], config: str
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]],
all_repo_files: set[str],
config_names: set[str],
config: str,
) -> list[CommitOperationDelete]:
# - get files that will be preserved in repo:
# 1. parquet files belonging to any other config (otherwise outdated files might be preserved)
@@ -1179,7 +1180,7 @@ def commit_parquet_conversion(
dataset: str,
config: str,
config_names: set[str],
parquet_operations: list[CommitOperation],
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]],
commit_message: str,
target_revision: Optional[str],
) -> None:
@@ -1199,7 +1200,7 @@ config_names (`list[str]`):
config_names (`list[str]`):
The list of all the configurations of this dataset. This is used to clean
the other files and directories in the repo, if any.
parquet_operations (`list[huggingface_hub.hf_api.CommitOperation]`):
parquet_operations (`Union[list[CommitOperationAdd], list[CommitOperationCopy]]`):
List of commit operation for the parquet conversion. It could be
file additions or file copies for example.
commit_message (`str`):
Expand All @@ -1219,11 +1220,11 @@ def commit_parquet_conversion(
If one of the commits could not be created on the Hub.
"""
target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
all_repo_files: set[str] = {f.rfilename for f in target_dataset_info.siblings}
all_repo_files: set[str] = {f.rfilename for f in (target_dataset_info.siblings or [])}
delete_operations = get_delete_operations(
parquet_operations=parquet_operations, all_repo_files=all_repo_files, config_names=config_names, config=config
)
operations = delete_operations + parquet_operations
operations: list[CommitOperation] = list(delete_operations + parquet_operations)
logging.info(f"{len(operations)} git operations to do for {dataset=} {config=}.")
create_commits(
committer_hf_api,
@@ -1372,6 +1373,7 @@ def compute_config_parquet_and_info_response(

partial = False
estimated_dataset_info: Optional[dict[str, Any]] = None
parquet_operations: Union[list[CommitOperationAdd], list[CommitOperationCopy]] = []
try:
if is_parquet_builder_with_hub_files(builder):
try:
@@ -1481,7 +1483,7 @@

repo_files = [
repo_file
for repo_file in target_dataset_info.siblings
for repo_file in (target_dataset_info.siblings or [])
if repo_file.rfilename.startswith(f"{config}/") and repo_file.rfilename.endswith(".parquet")
]
repo_files.sort(key=repo_file_rfilename_sort_key)
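
The recurring pattern in this file: DatasetInfo.siblings is typed as Optional[list[RepoSibling]], and RepoSibling.rfilename is always a str, so the isinstance check kept for mypy is no longer needed and a `(siblings or [])` fallback covers the None case. A minimal sketch (hypothetical helper, not from this commit) of the same pattern:

from huggingface_hub.hf_api import DatasetInfo, RepoSibling


def sorted_parquet_siblings(dataset_info: DatasetInfo, config: str) -> list[RepoSibling]:
    # siblings is Optional on DatasetInfo, hence the `or []` fallback.
    parquet_siblings = [
        sibling
        for sibling in (dataset_info.siblings or [])
        if sibling.rfilename.startswith(f"{config}/") and sibling.rfilename.endswith(".parquet")
    ]
    # rfilename is declared as str on RepoSibling, so it can be used directly as a sort key.
    return sorted(parquet_siblings, key=lambda sibling: sibling.rfilename)
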
4 changes: 2 additions & 2 deletions services/worker/src/worker/job_runners/dataset/filetypes.py
@@ -94,13 +94,13 @@ def compute_filetypes_response(
raise DatasetNotFoundError(f"Cannot get the dataset info for {dataset=}") from err

# get file types count
filetypes = get_filetypes(info.siblings)
filetypes = get_filetypes(info.siblings or [])

# look into the zip archives to get the file types
SUPPORTED_ARCHIVE_EXTENSIONS = [".zip"]
archive_filenames = [
sibling.rfilename
for sibling in info.siblings
for sibling in (info.siblings or [])
if get_file_extension(sibling.rfilename, recursive=False, clean=False).extension
in SUPPORTED_ARCHIVE_EXTENSIONS
]
6 changes: 3 additions & 3 deletions services/worker/src/worker/job_runners/split/duckdb_index.py
@@ -130,7 +130,7 @@ def check_indexable(feature: FeatureType) -> None:
return indexable_columns


def get_monolingual_stemmer(card_data: DatasetCardData) -> str:
def get_monolingual_stemmer(card_data: Optional[DatasetCardData]) -> str:
if card_data is None:
return DEFAULT_STEMMER
all_languages = card_data["language"]
@@ -388,7 +388,7 @@ def compute_split_duckdb_index_response(

logging.debug(f"get dataset info for {dataset=} with {target_revision=}")
target_dataset_info = hf_api.dataset_info(repo_id=dataset, revision=target_revision, files_metadata=False)
all_repo_files: set[str] = {f.rfilename for f in target_dataset_info.siblings}
all_repo_files: set[str] = {f.rfilename for f in (target_dataset_info.siblings or [])}
delete_operations = get_delete_operations(
all_repo_files=all_repo_files,
split_names=get_split_names(dataset=dataset, config=config),
@@ -458,7 +458,7 @@ def compute_split_duckdb_index_response(
raise DatasetNotFoundError("The dataset does not exist on the Hub.") from err

repo_files = [
repo_file for repo_file in target_dataset_info.siblings if repo_file.rfilename == index_file_location
repo_file for repo_file in (target_dataset_info.siblings or []) if repo_file.rfilename == index_file_location
]

if not repo_files or len(repo_files) != 1:
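
The card data attached to a dataset repo can be absent, so the parameter above becomes Optional and the function falls back to the default stemmer. An illustrative, simplified sketch of that handling (DEFAULT_STEMMER here is a placeholder; the real default and full logic live in this module):

from typing import Optional

from huggingface_hub import DatasetCardData, HfApi

DEFAULT_STEMMER = "porter"  # placeholder value for this sketch


def stemmer_for_card(card_data: Optional[DatasetCardData]) -> str:
    # Simplified stand-in for get_monolingual_stemmer: accept None (repos without card
    # metadata) and fall back to the default instead of assuming card data exists.
    if card_data is None:
        return DEFAULT_STEMMER
    language = card_data["language"]
    return language if isinstance(language, str) else DEFAULT_STEMMER


def stemmer_for_dataset(dataset: str) -> str:
    # DatasetInfo.card_data is itself Optional, so it can be passed straight through.
    return stemmer_for_card(HfApi().dataset_info(repo_id=dataset).card_data)
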
5 changes: 3 additions & 2 deletions services/worker/src/worker/utils.py
@@ -156,7 +156,8 @@ def hf_hub_open_file(
) -> HfFileSystemFile:
"""Open file with the HfFileSystem."""
fs = HfFileSystem(endpoint=hf_endpoint, token=hf_token)
return fs.open(file_url, revision=revision)
file: HfFileSystemFile = fs.open(file_url, revision=revision)
return file


# used by `config-parquet-and-info` and `config-parquet-metadata` steps
@@ -215,7 +216,7 @@ def check_split_exists(dataset: str, config: str, split: str) -> None:
splits_content = split_names_response["content"]["splits"]
except Exception as e:
raise PreviousStepFormatError(
"Previous step 'config-split-names' did not return" " the expected content.",
"Previous step 'config-split-names' did not return the expected content.",
e,
) from e

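
The change to hf_hub_open_file above binds the result of fs.open() to an annotated variable: fsspec's open() is not precisely typed, so the annotation gives strict mypy (and readers) the HfFileSystemFile type the caller expects. A hedged sketch of the same pattern, assuming the HfFileSystem API shown above:

from typing import Optional

from huggingface_hub.hf_file_system import HfFileSystem, HfFileSystemFile


def open_hub_file(
    file_url: str, hf_endpoint: str, hf_token: Optional[str], revision: Optional[str] = None
) -> HfFileSystemFile:
    fs = HfFileSystem(endpoint=hf_endpoint, token=hf_token)
    # fs.open() is inherited from fsspec and loosely typed; annotating the local variable
    # gives the value a concrete type so the function's return type checks under strict mode.
    file: HfFileSystemFile = fs.open(file_url, revision=revision)
    return file
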
@@ -23,7 +23,7 @@
Generator as ParametrizedGeneratorBasedBuilder,
)
from datasets.utils.py_utils import asdict
from huggingface_hub.hf_api import CommitOperationAdd, HfApi
from huggingface_hub.hf_api import CommitOperation, CommitOperationAdd, HfApi
from libcommon.dtos import JobInfo, JobParams, Priority
from libcommon.queue.jobs import Queue
from libcommon.resources import CacheMongoResource, QueueMongoResource
@@ -400,7 +400,7 @@ def test_create_commits(
else:
parent_commit = None
directory = f".test_create_commits_{max_operations_per_commit}_{use_parent_commit}"
operations: list[CommitOperationAdd] = [
operations: list[CommitOperation] = [
CommitOperationAdd(path_in_repo=f"{directory}/file{i}.txt", path_or_fileobj=f"content{i}".encode("UTF-8"))
for i in range(NUM_FILES)
]
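
The annotation change in test_create_commits is consistent with mypy's invariance rule for list: a list[CommitOperationAdd] is not accepted where a parameter is typed list[CommitOperation], so the variable is annotated with the wider union type up front. A small illustrative sketch, not part of this commit:

from huggingface_hub.hf_api import CommitOperation, CommitOperationAdd, CommitOperationDelete


def count_operations(operations: list[CommitOperation]) -> int:
    return len(operations)


adds: list[CommitOperationAdd] = [CommitOperationAdd(path_in_repo="a.txt", path_or_fileobj=b"a")]
# count_operations(adds)  # rejected by mypy: list is invariant, so list[CommitOperationAdd] != list[CommitOperation]

operations: list[CommitOperation] = [
    CommitOperationAdd(path_in_repo="a.txt", path_or_fileobj=b"a"),
    CommitOperationDelete(path_in_repo="b.txt"),
]
count_operations(operations)  # accepted: the variable is annotated with the union type from the start
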
@@ -76,7 +76,7 @@ def test_set_datasets_cache(app_config: AppConfig, get_job_runner: GetJobRunner)
dummy_path = base_path / "dummy"
job_runner.set_datasets_cache(dummy_path)
assert datasets.config.HF_DATASETS_CACHE.is_relative_to(dummy_path)
assert huggingface_hub.constants.HF_HUB_CACHE.is_relative_to(dummy_path)
assert Path(huggingface_hub.constants.HF_HUB_CACHE).is_relative_to(dummy_path)


def test_pre_compute_post_compute(app_config: AppConfig, get_job_runner: GetJobRunner) -> None:
@@ -105,4 +105,4 @@ def assert_datasets_cache_path(path: Optional[Path], exists: bool) -> None:
datasets.config.DOWNLOADED_DATASETS_PATH == datasets_cache_path / datasets.config.DOWNLOADED_DATASETS_DIR
)
assert datasets.config.EXTRACTED_DATASETS_PATH == datasets_cache_path / datasets.config.EXTRACTED_DATASETS_DIR
assert huggingface_hub.constants.HF_HUB_CACHE == hub_cache_path
assert huggingface_hub.constants.HF_HUB_CACHE == str(hub_cache_path)
