camel-ai · Wendong-Fan · Dec 8, 2024 · Jul 10, 2024 · Jul 11, 2024 · Jul 11, 2024
diff --git a/camel/benchmarks/__init__.py b/camel/benchmarks/__init__.py
@@ -0,0 +1,18 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from .base import BaseBenchmark
+from .gaia import DefaultGAIARetriever, GAIABenchmark
+
+__all__ = ["BaseBenchmark", "GAIABenchmark", "DefaultGAIARetriever"]
diff --git a/camel/benchmarks/base.py b/camel/benchmarks/base.py
@@ -0,0 +1,152 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional
+
+from camel.agents import ChatAgent
+
+logger = logging.getLogger(__name__)
+
+
+class BaseBenchmark(ABC):
+    r"""Base class for benchmarks.
+
+    Attributes:
+        name (str): Name of the benchmark.
+        data_dir (str): Path to the data directory.
+        save_to (str): Path to save the results.
+        processes (int): Number of processes to use for parallel
+            processing. :(default: :obj:`1`)
+    """
+
+    def __init__(
+        self, name: str, data_dir: str, save_to: str, processes: int = 1
+    ):
+        r"""Initialize the benchmark.
+
+        Args:
+            name (str): Name of the benchmark.
+            data_dir (str): Path to the data directory.
+            save_to (str): Path to save the results.
+            processes (int): Number of processes to use for parallel
+                processing. :(default: :obj:`1`)
+
+        """
+        self.name = name
+        self.data_dir = Path(data_dir)
+        self.processes = processes
+        self.save_to = save_to
+        if not self.data_dir.exists():
+            logger.info(
+                f"Data directory {data_dir} does not exist. Creating it."
+            )
+            self.data_dir.mkdir(parents=True, exist_ok=True)
+        if not self.data_dir.is_dir():
+            raise NotADirectoryError(
+                f"Data directory {data_dir} is not a directory"
+            )
+        self._data: Dict[str, List[Dict[str, Any]]] = dict()
+        self._results: List[Dict[str, Any]] = []
+
+    @abstractmethod
+    def download(self) -> "BaseBenchmark":
+        r"""Download the benchmark data.
+
+        Returns:
+            BaseBenchmark: The benchmark instance.
+        """
+        pass
+
+    @abstractmethod
+    def load(self, force_download: bool = False) -> "BaseBenchmark":
+        r"""Load the benchmark data.
+
+        Args:
+            force_download (bool): Whether to force download the data.
+
+        Returns:
+            BaseBenchmark: The benchmark instance.
+        """
+        pass
+
+    @property
+    def train(self) -> List[Dict[str, Any]]:
+        r"""Get the training data.
+
+        Returns:
+            List[Dict[str, Any]]: The training data.
+        """
+        if not self._data:
+            logger.info("Data not loaded. Loading data.")
+            self.load()
+        return self._data["train"]
+
+    @property
+    def valid(self) -> List[Dict[str, Any]]:
+        r"""Get the validation data.
+
+        Returns:
+            List[Dict[str, Any]]: The validation data.
+        """
+        if not self._data:
+            logger.info("Data not loaded. Loading data.")
+            self.load()
+        return self._data["valid"]
+
+    @property
+    def test(self) -> List[Dict[str, Any]]:
+        r"""Get the test data.
+
+        Returns:
+            List[Dict[str, Any]]: The test data.
+        """
+        if not self._data:
+            logger.info("Data not loaded. Loading data.")
+            self.load()
+        return self._data["test"]
+
+    @abstractmethod
+    def run(
+        self,
+        agent: ChatAgent,
+        on: Literal["train", "valid", "test"],
+        randomize: bool = False,
+        subset: Optional[int] = None,
+        *args,
+        **kwargs,
+    ) -> "BaseBenchmark":
+        r"""Run the benchmark.
+
+        Args:
+            agent (ChatAgent): The chat agent.
+            on (str): The data split to run the benchmark on.
+            randomize (bool): Whether to randomize the data.
+            subset (int): The subset of the data to run the benchmark on.
+
+        Returns:
+            BaseBenchmark: The benchmark instance.
+        """
+        pass
+
+    @property
+    def results(self) -> List[Dict[str, Any]]:
+        r"""Get the results.
+
+        Returns:
+            List[Dict[str, Any]]: The results.
+        """
+        return self._results