Remove wget; replace it with the Python-native "requests" library #293

Open
wants to merge 3 commits into base: main
21 changes: 2 additions & 19 deletions adalflow/adalflow/datasets/big_bench_hard.py
@@ -3,11 +3,10 @@
 import os
 import uuid
 from typing import Literal
-import subprocess
 from adalflow.utils.data import Dataset
 from adalflow.datasets.types import Example
 
-from adalflow.utils.file_io import save_csv
+from adalflow.utils.file_io import save_csv, download_large_file
 from adalflow.datasets.utils import prepare_dataset_path
 
@@ -75,23 +74,7 @@ def _check_or_download_dataset(self, data_path: str = None, split: str = "train"
 
         print(f"Downloading dataset to {json_path}")
         try:
-            # Use subprocess and capture the return code
-            result = subprocess.call(
-                [
-                    "wget",
-                    f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.task_name}.json",
-                    "-O",
-                    json_path,
-                ]
-            )
-
-            # Check if wget failed (non-zero exit code)
-            if result != 0:
-                raise ValueError(
-                    f"Failed to download dataset for task '{self.task_name}'.\n"
-                    "Please verify the task name (the JSON file name) by checking the following link:\n"
-                    "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"
-                )
+            download_large_file(f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.task_name}.json", json_path)
 
             # Check if the file is non-empty
             if not os.path.exists(json_path) or os.path.getsize(json_path) == 0:
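The call site now delegates the download to the new helper instead of shelling out to wget. A minimal standalone sketch of the same step, assuming the helper is imported as in the diff above; the task name and target directory are illustrative values, not taken from the PR:

    import os
    from adalflow.utils.file_io import download_large_file

    task_name = "word_sorting"  # illustrative BBH task name, not part of the PR
    json_path = os.path.join("/tmp/bbh", f"{task_name}.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)

    download_large_file(
        f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{task_name}.json",
        json_path,
    )

    # Mirror the non-empty check that the dataset class keeps after the download
    if not os.path.exists(json_path) or os.path.getsize(json_path) == 0:
        raise ValueError(f"Download for task '{task_name}' produced an empty file.")

Note that, unlike the old wget exit-code check, an HTTP 404 does not raise here; see the hardened variant sketched after the file_io.py diff below.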
2 changes: 2 additions & 0 deletions adalflow/adalflow/utils/__init__.py
@@ -13,6 +13,7 @@
     load_jsonl,
     append_to_jsonl,
     write_list_to_jsonl,
+    download_large_file,
 )
 from .logger import printc, get_logger
 from .registry import EntityMapping
@@ -44,6 +45,7 @@
     "load_jsonl",
     "append_to_jsonl",
     "write_list_to_jsonl",
+    "download_large_file",
     "safe_import",
     "setup_env",
     "DataLoader",
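With the re-export and the __all__ entry in place, the helper is also importable at the package level; a quick smoke test (the URL and filename are illustrative, not part of the PR):

    from adalflow.utils import download_large_file

    download_large_file(
        "https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/word_sorting.json",
        "word_sorting.json",
    )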
19 changes: 19 additions & 0 deletions adalflow/adalflow/utils/file_io.py
@@ -2,6 +2,9 @@
 import os
 import pickle
 import logging
+import requests
+import shutil
+import functools
 from typing import Mapping, Any, Optional, List, Dict
 
 
@@ -192,3 +195,19 @@ def write_list_to_jsonl(f: str, data: List[Dict[str, Any]]) -> None:
                 writer.write(d)
     except Exception as e:
         log.error(f"Error writing data to jsonl file {f}: {e}")
+
+def download_large_file(url: str, destination: str):
+    r"""Download very large files without staging them in memory.
+
+    Args:
+        url (str): URL of the data to download.
+        destination (str): The name of the file to write the data to.
+    """
+    try:
+        response = requests.get(url, stream=True)
+        with open(destination, 'wb') as out_file:
+            response.raw.read = functools.partial(response.raw.read, decode_content=True)
+            shutil.copyfileobj(response.raw, out_file)
+        log.info("File downloaded successfully!")
+    except requests.exceptions.RequestException as e:
+        log.exception("Error downloading the file: %s", e)
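As written, download_large_file swallows request errors (it only logs them), never checks the HTTP status, and calls requests.get without a timeout, so a 404 from raw.githubusercontent.com is silently written to the destination file. A possible hardened variant is sketched below as a suggestion only, not as part of this PR; the function name, timeout, and chunk size are arbitrary choices:

    import logging
    import requests

    log = logging.getLogger(__name__)


    def download_large_file_strict(url: str, destination: str, timeout: float = 30.0) -> None:
        """Stream a URL to disk, raising on HTTP errors instead of logging and continuing."""
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Surface 4xx/5xx responses instead of writing the error body to the file
            response.raise_for_status()
            with open(destination, "wb") as out_file:
                # iter_content already undoes gzip/deflate content-encoding, so the
                # functools.partial(..., decode_content=True) trick is not needed here
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
        log.info("File downloaded successfully!")

With a variant like this, the caller in big_bench_hard.py could keep a try/except and convert the requests exception into the same ValueError message the wget version used to raise.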