Remove wget; replace it with the Python-native "requests" library #293

Open
wants to merge 3 commits into base: main
21 changes: 2 additions & 19 deletions adalflow/adalflow/datasets/big_bench_hard.py
@@ -3,11 +3,10 @@
 import os
 import uuid
 from typing import Literal
-import subprocess
 from adalflow.utils.data import Dataset
 from adalflow.datasets.types import Example
 
-from adalflow.utils.file_io import save_csv
+from adalflow.utils.file_io import save_csv, download_large_file
 from adalflow.datasets.utils import prepare_dataset_path
 
@@ -75,23 +74,7 @@ def _check_or_download_dataset(self, data_path: str = None, split: str = "train"
 
         print(f"Downloading dataset to {json_path}")
         try:
-            # Use subprocess and capture the return code
-            result = subprocess.call(
-                [
-                    "wget",
-                    f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.task_name}.json",
-                    "-O",
-                    json_path,
-                ]
-            )
-
-            # Check if wget failed (non-zero exit code)
-            if result != 0:
-                raise ValueError(
-                    f"Failed to download dataset for task '{self.task_name}'.\n"
-                    "Please verify the task name (the JSON file name) by checking the following link:\n"
-                    "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"
-                )
+            download_large_file(f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{self.task_name}.json", json_path)
 
             # Check if the file is non-empty
             if not os.path.exists(json_path) or os.path.getsize(json_path) == 0:
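The call site now delegates the download to the new helper instead of shelling out to wget. A minimal standalone sketch of the same step, assuming the helper is imported as in the diff above; the task name and target directory are illustrative values, not taken from the PR:

    import os
    from adalflow.utils.file_io import download_large_file

    task_name = "word_sorting"  # illustrative BBH task name, not part of the PR
    json_path = os.path.join("/tmp/bbh", f"{task_name}.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)

    download_large_file(
        f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/{task_name}.json",
        json_path,
    )

    # Mirror the non-empty check that the dataset class keeps after the download
    if not os.path.exists(json_path) or os.path.getsize(json_path) == 0:
        raise ValueError(f"Download for task '{task_name}' produced an empty file.")

Note that, unlike the old wget exit-code check, an HTTP 404 does not raise here; see the hardened variant sketched after the file_io.py diff below.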
2 changes: 2 additions & 0 deletions adalflow/adalflow/utils/__init__.py
@@ -13,6 +13,7 @@
     load_jsonl,
     append_to_jsonl,
     write_list_to_jsonl,
+    download_large_file,
 )
 from .logger import printc, get_logger
 from .registry import EntityMapping
@@ -44,6 +45,7 @@
     "load_jsonl",
     "append_to_jsonl",
     "write_list_to_jsonl",
+    "download_large_file",
     "safe_import",
     "setup_env",
     "DataLoader",
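With the re-export and the __all__ entry in place, the helper is also importable at the package level; a quick smoke test (the URL and filename are illustrative, not part of the PR):

    from adalflow.utils import download_large_file

    download_large_file(
        "https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/bbh/word_sorting.json",
        "word_sorting.json",
    )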
19 changes: 19 additions & 0 deletions adalflow/adalflow/utils/file_io.py
@@ -2,6 +2,9 @@
 import os
 import pickle
 import logging
+import requests
+import shutil
+import functools
 from typing import Mapping, Any, Optional, List, Dict
 
 
@@ -192,3 +195,19 @@ def write_list_to_jsonl(f: str, data: List[Dict[str, Any]]) -> None:
                 writer.write(d)
     except Exception as e:
         log.error(f"Error writing data to jsonl file {f}: {e}")
+
+def download_large_file(url: str, destination: str):
+    r"""Download very large files without staging them in memory.
+
+    Args:
+        url (str): URL of the data to download.
+        destination (str): The name of the file to write the data to.
+    """
+    try:
+        response = requests.get(url, stream=True)
+        with open(destination, 'wb') as out_file:
+            response.raw.read = functools.partial(response.raw.read, decode_content=True)
+            shutil.copyfileobj(response.raw, out_file)
+        log.info("File downloaded successfully!")
+    except requests.exceptions.RequestException as e:
+        log.exception("Error downloading the file: %s", e)
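As written, download_large_file swallows request errors (it only logs them), never checks the HTTP status, and calls requests.get without a timeout, so a 404 from raw.githubusercontent.com is silently written to the destination file. A possible hardened variant is sketched below as a suggestion only, not as part of this PR; the function name, timeout, and chunk size are arbitrary choices:

    import logging
    import requests

    log = logging.getLogger(__name__)


    def download_large_file_strict(url: str, destination: str, timeout: float = 30.0) -> None:
        """Stream a URL to disk, raising on HTTP errors instead of logging and continuing."""
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Surface 4xx/5xx responses instead of writing the error body to the file
            response.raise_for_status()
            with open(destination, "wb") as out_file:
                # iter_content already undoes gzip/deflate content-encoding, so the
                # functools.partial(..., decode_content=True) trick is not needed here
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
        log.info("File downloaded successfully!")

With a variant like this, the caller in big_bench_hard.py could keep a try/except and convert the requests exception into the same ValueError message the wget version used to raise.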