diff --git a/packages/ragbits-evaluate/examples/document-search/config/config.yaml b/packages/ragbits-evaluate/examples/document-search/config/config.yaml
index 505fd2327..04eb09201 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/config.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/config.yaml
@@ -3,6 +3,9 @@ defaults:
   - setup: baseline
   - _self_
 
+task:
+  name: document-search
+
 neptune:
   project: ragbits
   run: False
diff --git a/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml b/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
index 4763cdbba..27ce786f5 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
@@ -1,2 +1,6 @@
-path: "m-ric/huggingface_doc_qa_eval"
-split: "train"
+ingest:
+  path: "micpst/hf-docs"
+  split: "train"
+eval:
+  path: "micpst/hf-docs-retrieval"
+  split: "train"
diff --git a/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml b/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
index 2dd3c3398..f6caad81d 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
@@ -1 +1 @@
-name: BASELINE
+name: Baseline
diff --git a/packages/ragbits-evaluate/examples/document-search/evaluate.py b/packages/ragbits-evaluate/examples/document-search/evaluate.py
index 620a212b3..d805cbcbc 100644
--- a/packages/ragbits-evaluate/examples/document-search/evaluate.py
+++ b/packages/ragbits-evaluate/examples/document-search/evaluate.py
@@ -3,16 +3,15 @@
 from pathlib import Path
 
 import hydra
-import neptune
 from hydra.core.hydra_config import HydraConfig
-from neptune.utils import stringify_unsupported
 from omegaconf import DictConfig
 
 from ragbits.evaluate.evaluator import Evaluator
 from ragbits.evaluate.loaders import HuggingFaceDataLoader
-from ragbits.evaluate.metrics import MetricSet
-from ragbits.evaluate.pipelines import DocumentSearchEvaluationPipeline
-from ragbits.evaluate.utils import save
+from ragbits.evaluate.metrics import DocumentSearchPrecision, DocumentSearchRecall, MetricSet
+from ragbits.evaluate.metrics.document_search import DocumentSearchF1
+from ragbits.evaluate.pipelines import DocumentSearchPipeline
+from ragbits.evaluate.utils import log_to_file, log_to_neptune
 
 logging.getLogger("LiteLLM").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)
@@ -29,10 +28,14 @@ async def bench(config: DictConfig) -> None:
     log.info("Starting evaluation: %s", config.setup.name)
 
     dataloader = HuggingFaceDataLoader(config)
-    pipeline = DocumentSearchEvaluationPipeline(config)
-    metrics = MetricSet()(config)
-
-    evaluator = Evaluator(task="document_search")
+    pipeline = DocumentSearchPipeline(config)
+    metrics = MetricSet(
+        DocumentSearchPrecision,
+        DocumentSearchRecall,
+        DocumentSearchF1,
+    )(config)
+
+    evaluator = Evaluator()
     results = await evaluator.compute(
         pipeline=pipeline,
         dataloader=dataloader,
@@ -42,23 +45,13 @@ async def bench(config: DictConfig) -> None:
     log.info("Evaluation finished. Saving results...")
 
     output_dir = Path(HydraConfig.get().runtime.output_dir)
-    metrics_file = output_dir / "metrics.json"
-    results_file = output_dir / "results.json"
+    log_to_file(results, output_dir)
 
-    save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
-    save(results_file, results=results["results"])
+    if config.neptune.run:
+        log_to_neptune(config, results, output_dir)
 
     log.info("Evaluation results saved under directory: %s", output_dir)
 
-    if config.neptune.run:
-        run = neptune.init_run(project=config.neptune.project)
-        run["sys/tags"].add(config.setup.name)
-        run["config"] = stringify_unsupported(config)
-        run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-        run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-        run["evaluation/metrics.json"].upload(metrics_file.as_posix())
-        run["evaluation/results.json"].upload(results_file.as_posix())
-
 
 @hydra.main(config_path="config", config_name="config", version_base="3.2")
 def main(config: DictConfig) -> None:
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
index 875c2ea6a..f11b38d0e 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
@@ -1,8 +1,9 @@
 from .evaluator import Evaluator
 from .loaders import DataLoader, HuggingFaceDataLoader
-from .metrics import Metric, MetricSet
-from .pipelines import DocumentSearchEvaluationPipeline
-from .utils import save
+from .metrics.base import Metric, MetricSet
+from .pipelines.base import EvaluationPipeline, EvaluationResult
+from .pipelines.document_search import DocumentSearchPipeline, DocumentSearchResult
+from .utils import log_to_file, log_to_neptune
 
 __all__ = [
     "Evaluator",
@@ -10,6 +11,10 @@
     "HuggingFaceDataLoader",
     "MetricSet",
     "Metric",
-    "DocumentSearchEvaluationPipeline",
-    "save",
+    "EvaluationPipeline",
+    "DocumentSearchPipeline",
+    "EvaluationResult",
+    "DocumentSearchResult",
+    "log_to_file",
+    "log_to_neptune",
 ]
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py b/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
index def383e56..ff410233f 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
@@ -14,15 +14,6 @@ class Evaluator:
     Evaluator class.
     """
 
-    def __init__(self, task: str) -> None:
-        """
-        Constructs the evaluator.
-
-        Args:
-            task: The task for the evaluator.
-        """
-        self.task = task
-
     async def compute(
         self,
         pipeline: EvaluationPipeline,
@@ -41,6 +32,7 @@ async def compute(
             The evaluation results.
         """
         dataset = await dataloader.load()
+        await pipeline.prepare()
         results, perf_results = await self._call_pipeline(pipeline, dataset)
         computed_metrics = self._compute_metrics(metrics, results)
         processed_results = self._results_processor(results)
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py b/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
index 50b592f7d..e5b61af71 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
@@ -39,6 +39,6 @@ async def load(self) -> HFData:
             The loaded data.
         """
         return load_dataset(
-            path=self.config.data.path,
-            split=self.config.data.split,
+            path=self.config.data.eval.path,
+            split=self.config.data.eval.split,
         )
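The data config is now split into an `ingest` section (the corpus the pipeline indexes) and an `eval` section (the query/passage pairs fed to the evaluator). A minimal sketch of how the new layout resolves, using the dataset paths from `hf-docs.yaml` above:

```python
from omegaconf import OmegaConf

# Mirror of the new nested data config from hf-docs.yaml.
config = OmegaConf.create(
    {
        "data": {
            "ingest": {"path": "micpst/hf-docs", "split": "train"},
            "eval": {"path": "micpst/hf-docs-retrieval", "split": "train"},
        }
    }
)

# HuggingFaceDataLoader now reads the evaluation split...
assert config.data.eval.path == "micpst/hf-docs-retrieval"
# ...while DocumentSearchPipeline ingests from the ingestion split.
assert config.data.ingest.path == "micpst/hf-docs"
```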
""" return load_dataset( - path=self.config.data.path, - split=self.config.data.split, + path=self.config.data.eval.path, + split=self.config.data.eval.split, ) diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py index 8a8b6e240..c4d10448c 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py @@ -1,3 +1,4 @@ from .base import Metric, MetricSet +from .document_search import DocumentSearchPrecision, DocumentSearchRecall -__all__ = ["Metric", "MetricSet"] +__all__ = ["Metric", "MetricSet", "DocumentSearchPrecision", "DocumentSearchRecall"] diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py index ce7ba2eb6..0ebf7235a 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py @@ -1,13 +1,15 @@ from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Generic, Optional, TypeVar from omegaconf import DictConfig from typing_extensions import Self from ..pipelines import EvaluationResult +ResultT = TypeVar("ResultT", bound=EvaluationResult) -class Metric(ABC): + +class Metric(Generic[ResultT], ABC): """ Base class for metrics. """ @@ -20,10 +22,10 @@ def __init__(self, config: Optional[DictConfig] = None) -> None: config: The metric configuration. """ super().__init__() - self.config = config or {} + self.config = config or DictConfig({}) @abstractmethod - def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: + def compute(self, results: list[ResultT]) -> dict[str, Any]: """ Compute the metric. @@ -35,12 +37,12 @@ def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: """ -class MetricSet: +class MetricSet(Generic[ResultT]): """ Represents a set of metrics. """ - def __init__(self, *metrics: type[Metric]) -> None: + def __init__(self, *metrics: type[Metric[ResultT]]) -> None: """ Initializes the metric set. @@ -48,7 +50,7 @@ def __init__(self, *metrics: type[Metric]) -> None: metrics: The metrics. """ self._metrics = metrics - self.metrics: list[Metric] = [] + self.metrics: list[Metric[ResultT]] = [] def __call__(self, config: Optional[DictConfig] = None) -> Self: """ @@ -63,7 +65,7 @@ def __call__(self, config: Optional[DictConfig] = None) -> Self: self.metrics = [metric(config) for metric in self._metrics] return self - def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: + def compute(self, results: list[ResultT]) -> dict[str, Any]: """ Compute the metrics. diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py new file mode 100644 index 000000000..000f3014d --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py @@ -0,0 +1,109 @@ +from typing import Any, Optional + +from omegaconf import DictConfig + +from ragbits.evaluate.metrics.base import Metric +from ragbits.evaluate.pipelines.document_search import DocumentSearchResult + + +class DocumentSearchPrecision(Metric[DocumentSearchResult]): + """ + Precision measures the accuracy of the retrieved documents. It is the ratio of the number of relevant + documents retrieved to the total number of documents retrieved. 
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py
new file mode 100644
index 000000000..000f3014d
--- /dev/null
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py
@@ -0,0 +1,109 @@
+from typing import Any, Optional
+
+from omegaconf import DictConfig
+
+from ragbits.evaluate.metrics.base import Metric
+from ragbits.evaluate.pipelines.document_search import DocumentSearchResult
+
+
+class DocumentSearchPrecision(Metric[DocumentSearchResult]):
+    """
+    Precision measures the accuracy of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of documents retrieved.
+
+    Precision = Total Number of Relevant Documents Retrieved / Total Number of Documents Retrieved
+
+    Precision evaluates: "Out of all the documents that the system retrieved, how many were actually relevant?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_documents_retrieved = sum(len(set(result.predicted_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/PRECISION": (total_relevant_documents_retrieved / total_documents_retrieved)
+            if total_documents_retrieved > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchRecall(Metric[DocumentSearchResult]):
+    """
+    Recall measures the comprehensiveness of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of relevant documents in the database for the given query.
+
+    Recall = Total Number of Relevant Documents Retrieved / Total Number of Relevant Documents in the Database
+
+    Recall evaluates: "Out of all the relevant documents that exist in the database,
+    how many did the system manage to retrieve?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_relevant_documents = sum(len(set(result.reference_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/RECALL": (total_relevant_documents_retrieved / total_relevant_documents)
+            if total_relevant_documents > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchF1(Metric[DocumentSearchResult]):
+    """
+    F1 Score is the harmonic mean of precision and recall, balancing both measures in a single score.
+
+    F1 = 2 * (Precision * Recall) / (Precision + Recall)
+    """
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the metric.
+
+        Args:
+            config: The metric configuration.
+        """
+        super().__init__(config)
+        self.precision = DocumentSearchPrecision(config)
+        self.recall = DocumentSearchRecall(config)
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        precision = self.precision.compute(results)["DOCUMENT_SEARCH/PRECISION"]
+        recall = self.recall.compute(results)["DOCUMENT_SEARCH/RECALL"]
+
+        return {
+            "DOCUMENT_SEARCH/F1": (2 * (precision * recall) / (precision + recall))
+            if (precision + recall) > 0
+            else 0.0,
+        }
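A worked example of the three metrics on a single query, assuming both `ragbits-evaluate` and `ragbits-document-search` are installed; the passage strings are made up for illustration:

```python
from ragbits.evaluate.metrics.document_search import (
    DocumentSearchF1,
    DocumentSearchPrecision,
    DocumentSearchRecall,
)
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult

# Two passages retrieved, one of them relevant; one relevant passage missed.
results = [
    DocumentSearchResult(
        question="What is RAG?",
        reference_passages=["passage-a", "passage-b"],
        predicted_passages=["passage-a", "passage-c"],
    )
]

print(DocumentSearchPrecision().compute(results))  # {'DOCUMENT_SEARCH/PRECISION': 0.5}
print(DocumentSearchRecall().compute(results))     # {'DOCUMENT_SEARCH/RECALL': 0.5}
# Harmonic mean: 2 * (0.5 * 0.5) / (0.5 + 0.5) = 0.5
print(DocumentSearchF1().compute(results))         # {'DOCUMENT_SEARCH/F1': 0.5}
```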
+ """ + precision = self.precision.compute(results)["DOCUMENT_SEARCH/PRECISION"] + recall = self.recall.compute(results)["DOCUMENT_SEARCH/RECALL"] + + return { + "DOCUMENT_SEARCH/F1": (2 * (precision * recall) / (precision + recall)) + if (precision + recall) > 0 + else 0.0, + } diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py index 9ba40746a..eacb77a26 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py @@ -1,8 +1,9 @@ from .base import EvaluationPipeline, EvaluationResult -from .document_search import DocumentSearchEvaluationPipeline +from .document_search import DocumentSearchPipeline, DocumentSearchResult __all__ = [ - "DocumentSearchEvaluationPipeline", + "DocumentSearchPipeline", + "DocumentSearchResult", "EvaluationPipeline", "EvaluationResult", ] diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py index 991871aae..9a790041f 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py @@ -6,7 +6,7 @@ @dataclass -class EvaluationResult: +class EvaluationResult(ABC): """ Represents the result of a single evaluation. """ @@ -25,7 +25,12 @@ def __init__(self, config: Optional[DictConfig] = None) -> None: config: The evaluation pipeline configuration. """ super().__init__() - self.config = config or {} + self.config = config or DictConfig({}) + + async def prepare(self) -> None: + """ + Prepares the document search evaluation pipeline. + """ @abstractmethod async def __call__(self, data: dict[str, Any]) -> EvaluationResult: diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py index 1afa86f72..d0dffee03 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py @@ -1,14 +1,90 @@ -from typing import Any +from dataclasses import dataclass +from functools import cached_property +from typing import Any, Optional + +from datasets import load_dataset +from omegaconf import DictConfig +from tqdm.asyncio import tqdm + +from ragbits.core.embeddings import LiteLLMEmbeddings +from ragbits.core.vector_store import InMemoryVectorStore +from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.element import TextElement + +try: + from ragbits.document_search import DocumentSearch +except ImportError: + HAS_RAGBITS_DOCUMENT_SEARCH = False +else: + HAS_RAGBITS_DOCUMENT_SEARCH = True from .base import EvaluationPipeline, EvaluationResult -class DocumentSearchEvaluationPipeline(EvaluationPipeline): +@dataclass +class DocumentSearchResult(EvaluationResult): + """ + Represents the result of a single evaluation. + """ + + question: str + reference_passages: list[str] + predicted_passages: list[str] + + +class DocumentSearchPipeline(EvaluationPipeline): """ Document search evaluation pipeline. """ - async def __call__(self, data: dict[str, Any]) -> EvaluationResult: + def __init__(self, config: Optional[DictConfig] = None) -> None: + """ + Initializes the document search evaluation pipeline. 
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
index 1afa86f72..d0dffee03 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
@@ -1,14 +1,90 @@
-from typing import Any
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Any, Optional
+
+from datasets import load_dataset
+from omegaconf import DictConfig
+from tqdm.asyncio import tqdm
+
+from ragbits.core.embeddings import LiteLLMEmbeddings
+from ragbits.core.vector_store import InMemoryVectorStore
+
+try:
+    from ragbits.document_search import DocumentSearch
+    from ragbits.document_search.documents.document import DocumentMeta
+    from ragbits.document_search.documents.element import TextElement
+except ImportError:
+    HAS_RAGBITS_DOCUMENT_SEARCH = False
+else:
+    HAS_RAGBITS_DOCUMENT_SEARCH = True
 
 from .base import EvaluationPipeline, EvaluationResult
 
 
-class DocumentSearchEvaluationPipeline(EvaluationPipeline):
+@dataclass
+class DocumentSearchResult(EvaluationResult):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    reference_passages: list[str]
+    predicted_passages: list[str]
+
+
+class DocumentSearchPipeline(EvaluationPipeline):
     """
     Document search evaluation pipeline.
     """
 
-    async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the document search evaluation pipeline.
+
+        Raises:
+            ImportError: If the ragbits-document-search package is not installed.
+        """
+        super().__init__(config)
+        if not HAS_RAGBITS_DOCUMENT_SEARCH:
+            raise ImportError("You need to install the 'ragbits-document-search' package to use this pipeline.")
+
+    @cached_property
+    def documents(self) -> list[DocumentMeta]:
+        """
+        Returns the documents to be ingested.
+
+        Returns:
+            The documents to be ingested.
+        """
+        # TODO: Implement HF doc loader.
+        docs = load_dataset(
+            path=self.config.data.ingest.path,
+            split=self.config.data.ingest.split,
+        )
+        return [DocumentMeta.create_text_document_from_literal(doc["content"]) for doc in docs]
+
+    @cached_property
+    def document_search(self) -> DocumentSearch:
+        """
+        Returns the document search instance.
+
+        Returns:
+            The document search instance.
+        """
+        return DocumentSearch(
+            embedder=LiteLLMEmbeddings(),
+            vector_store=InMemoryVectorStore(),
+        )
+
+    async def prepare(self) -> None:
+        """
+        Prepares the document search evaluation pipeline.
+        """
+        await tqdm.gather(
+            *[self.document_search.ingest_document(document) for document in self.documents], desc="Ingestion"
+        )
+
+    async def __call__(self, data: dict[str, Any]) -> DocumentSearchResult:
         """
         Runs the document search evaluation pipeline.
 
@@ -18,4 +94,9 @@ async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
         Returns:
             The evaluation result.
         """
-        return EvaluationResult()
+        elements = await self.document_search.search(data["question"])
+        return DocumentSearchResult(
+            question=data["question"],
+            reference_passages=data["passages"],
+            predicted_passages=[element.content for element in elements if isinstance(element, TextElement)],
+        )
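Putting the pieces together, the pipeline can also be driven without Hydra. A standalone sketch, under the assumption that the datasets above are reachable and that `LiteLLMEmbeddings` can find its API credentials in the environment:

```python
import asyncio

from omegaconf import OmegaConf

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders import HuggingFaceDataLoader
from ragbits.evaluate.metrics import DocumentSearchPrecision, DocumentSearchRecall, MetricSet
from ragbits.evaluate.pipelines import DocumentSearchPipeline

config = OmegaConf.create(
    {
        "data": {
            "ingest": {"path": "micpst/hf-docs", "split": "train"},
            "eval": {"path": "micpst/hf-docs-retrieval", "split": "train"},
        }
    }
)


async def main() -> None:
    # prepare() ingests the corpus once, then each eval row runs through __call__.
    results = await Evaluator().compute(
        pipeline=DocumentSearchPipeline(config),
        dataloader=HuggingFaceDataLoader(config),
        metrics=MetricSet(DocumentSearchPrecision, DocumentSearchRecall)(config),
    )
    print(results["metrics"])


asyncio.run(main())
```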
+ """ + run = neptune.init_run(project=config.neptune.project) + run["sys/tags"].add( + [ + config.task.name, + config.setup.name, + ] + ) + run["config"] = stringify_unsupported(config) + run["evaluation/metrics"] = stringify_unsupported(results["metrics"]) + run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"]) + run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix()) + run["evaluation/results.json"].upload((output_dir / "results.json").as_posix()) diff --git a/uv.lock b/uv.lock index 23f2b39e5..82205c530 100644 --- a/uv.lock +++ b/uv.lock @@ -3581,10 +3581,6 @@ source = { editable = "packages/ragbits-evaluate" } dependencies = [ { name = "datasets" }, { name = "hydra-core" }, -] - -[package.optional-dependencies] -neptune = [ { name = "neptune" }, ] @@ -3600,8 +3596,8 @@ dev = [ [package.metadata] requires-dist = [ { name = "datasets", specifier = "~=3.0.1" }, - { name = "hydra-core", specifier = ">=1.3.2" }, - { name = "neptune", marker = "extra == 'neptune'", specifier = "~=1.12.0" }, + { name = "hydra-core", specifier = "~=1.3.2" }, + { name = "neptune", specifier = "~=1.12.0" }, ] [package.metadata.requires-dev]