Commit: add document search pipeline
micpst committed Oct 14, 2024
1 parent 541eb98 commit 014937b

Showing 15 changed files with 299 additions and 65 deletions.
@@ -3,6 +3,9 @@ defaults:
   - setup: baseline
   - _self_
 
+task:
+  name: document-search
+
 neptune:
   project: ragbits
   run: False
@@ -1,2 +1,6 @@
-path: "m-ric/huggingface_doc_qa_eval"
-split: "train"
+ingest:
+  path: "micpst/hf-docs"
+  split: "train"
+eval:
+  path: "micpst/hf-docs-retrieval"
+  split: "train"
@@ -1 +1 @@
-name: BASELINE
+name: Baseline
37 changes: 15 additions & 22 deletions packages/ragbits-evaluate/examples/document-search/evaluate.py
@@ -3,16 +3,15 @@
 from pathlib import Path
 
 import hydra
-import neptune
 from hydra.core.hydra_config import HydraConfig
-from neptune.utils import stringify_unsupported
 from omegaconf import DictConfig
 
 from ragbits.evaluate.evaluator import Evaluator
 from ragbits.evaluate.loaders import HuggingFaceDataLoader
-from ragbits.evaluate.metrics import MetricSet
-from ragbits.evaluate.pipelines import DocumentSearchEvaluationPipeline
-from ragbits.evaluate.utils import save
+from ragbits.evaluate.metrics import DocumentSearchPrecision, DocumentSearchRecall, MetricSet
+from ragbits.evaluate.metrics.document_search import DocumentSearchF1
+from ragbits.evaluate.pipelines import DocumentSearchPipeline
+from ragbits.evaluate.utils import log_to_file, log_to_neptune
 
 logging.getLogger("LiteLLM").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)
@@ -29,10 +28,14 @@ async def bench(config: DictConfig) -> None:
     log.info("Starting evaluation: %s", config.setup.name)
 
     dataloader = HuggingFaceDataLoader(config)
-    pipeline = DocumentSearchEvaluationPipeline(config)
-    metrics = MetricSet()(config)
-
-    evaluator = Evaluator(task="document_search")
+    pipeline = DocumentSearchPipeline(config)
+    metrics = MetricSet(
+        DocumentSearchPrecision,
+        DocumentSearchRecall,
+        DocumentSearchF1,
+    )(config)
+
+    evaluator = Evaluator()
     results = await evaluator.compute(
         pipeline=pipeline,
         dataloader=dataloader,
@@ -42,23 +45,13 @@ async def bench(config: DictConfig) -> None:
     log.info("Evaluation finished. Saving results...")
 
     output_dir = Path(HydraConfig.get().runtime.output_dir)
-    metrics_file = output_dir / "metrics.json"
-    results_file = output_dir / "results.json"
+    log_to_file(results, output_dir)
 
-    save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
-    save(results_file, results=results["results"])
+    if config.neptune.run:
+        log_to_neptune(config, results, output_dir)
 
     log.info("Evaluation results saved under directory: %s", output_dir)
 
-    if config.neptune.run:
-        run = neptune.init_run(project=config.neptune.project)
-        run["sys/tags"].add(config.setup.name)
-        run["config"] = stringify_unsupported(config)
-        run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-        run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-        run["evaluation/metrics.json"].upload(metrics_file.as_posix())
-        run["evaluation/results.json"].upload(results_file.as_posix())
-
 
 @hydra.main(config_path="config", config_name="config", version_base="3.2")
 def main(config: DictConfig) -> None:
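Side note: the inline Neptune logging removed above is a good indication of what the new log_to_neptune helper has to cover. The following is only a hypothetical sketch reconstructed from the removed lines, not the actual ragbits implementation; in particular, it assumes log_to_file writes the same metrics.json and results.json files the old code did.

from pathlib import Path

import neptune
from neptune.utils import stringify_unsupported
from omegaconf import DictConfig


def log_to_neptune(config: DictConfig, results: dict, output_dir: Path) -> None:
    """Hypothetical sketch mirroring the removed inline block; not the ragbits implementation."""
    run = neptune.init_run(project=config.neptune.project)
    run["sys/tags"].add(config.setup.name)
    run["config"] = stringify_unsupported(config)
    run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
    run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
    # Assumes log_to_file(results, output_dir) wrote these two files, as the old code did.
    run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
    run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())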
15 changes: 10 additions & 5 deletions packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
@@ -1,15 +1,20 @@
 from .evaluator import Evaluator
 from .loaders import DataLoader, HuggingFaceDataLoader
-from .metrics import Metric, MetricSet
-from .pipelines import DocumentSearchEvaluationPipeline
-from .utils import save
+from .metrics.base import Metric, MetricSet
+from .pipelines.base import EvaluationPipeline, EvaluationResult
+from .pipelines.document_search import DocumentSearchPipeline, DocumentSearchResult
+from .utils import log_to_file, log_to_neptune
 
 __all__ = [
     "Evaluator",
     "DataLoader",
     "HuggingFaceDataLoader",
     "MetricSet",
     "Metric",
-    "DocumentSearchEvaluationPipeline",
-    "save",
+    "EvaluationPipeline",
+    "DocumentSearchPipeline",
+    "EvaluationResult",
+    "DocumentSearchResult",
+    "log_to_file",
+    "log_to_neptune",
 ]
10 changes: 1 addition & 9 deletions packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
@@ -14,15 +14,6 @@ class Evaluator:
     Evaluator class.
     """
 
-    def __init__(self, task: str) -> None:
-        """
-        Constructs the evaluator.
-
-        Args:
-            task: The task for the evaluator.
-        """
-        self.task = task
-
     async def compute(
         self,
         pipeline: EvaluationPipeline,
@@ -41,6 +32,7 @@ async def compute(
             The evaluation results.
         """
         dataset = await dataloader.load()
+        await pipeline.prepare()
         results, perf_results = await self._call_pipeline(pipeline, dataset)
         computed_metrics = self._compute_metrics(metrics, results)
         processed_results = self._results_processor(results)
4 changes: 2 additions & 2 deletions packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
@@ -39,6 +39,6 @@ async def load(self) -> HFData:
             The loaded data.
         """
         return load_dataset(
-            path=self.config.data.path,
-            split=self.config.data.split,
+            path=self.config.data.eval.path,
+            split=self.config.data.eval.split,
         )
@@ -1,3 +1,4 @@
 from .base import Metric, MetricSet
+from .document_search import DocumentSearchPrecision, DocumentSearchRecall
 
-__all__ = ["Metric", "MetricSet"]
+__all__ = ["Metric", "MetricSet", "DocumentSearchPrecision", "DocumentSearchRecall"]
18 changes: 10 additions & 8 deletions packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py
@@ -1,13 +1,15 @@
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Generic, Optional, TypeVar
 
 from omegaconf import DictConfig
 from typing_extensions import Self
 
 from ..pipelines import EvaluationResult
 
+ResultT = TypeVar("ResultT", bound=EvaluationResult)
+
 
-class Metric(ABC):
+class Metric(Generic[ResultT], ABC):
     """
     Base class for metrics.
     """
@@ -20,10 +22,10 @@ def __init__(self, config: Optional[DictConfig] = None) -> None:
             config: The metric configuration.
         """
         super().__init__()
-        self.config = config or {}
+        self.config = config or DictConfig({})
 
     @abstractmethod
-    def compute(self, results: list[EvaluationResult]) -> dict[str, Any]:
+    def compute(self, results: list[ResultT]) -> dict[str, Any]:
         """
         Compute the metric.
@@ -35,20 +37,20 @@ def compute(self, results: list[EvaluationResult]) -> dict[str, Any]:
         """
 
 
-class MetricSet:
+class MetricSet(Generic[ResultT]):
     """
     Represents a set of metrics.
     """
 
-    def __init__(self, *metrics: type[Metric]) -> None:
+    def __init__(self, *metrics: type[Metric[ResultT]]) -> None:
         """
         Initializes the metric set.
 
         Args:
             metrics: The metrics.
         """
         self._metrics = metrics
-        self.metrics: list[Metric] = []
+        self.metrics: list[Metric[ResultT]] = []
 
     def __call__(self, config: Optional[DictConfig] = None) -> Self:
         """
@@ -63,7 +65,7 @@ def __call__(self, config: Optional[DictConfig] = None) -> Self:
         self.metrics = [metric(config) for metric in self._metrics]
         return self
 
-    def compute(self, results: list[EvaluationResult]) -> dict[str, Any]:
+    def compute(self, results: list[ResultT]) -> dict[str, Any]:
         """
         Compute the metrics.
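To illustrate the new generics: a concrete metric picks a result type, and MetricSet is parameterised with the same type. Below is a minimal sketch against the interfaces shown in this diff; the hit-rate metric is invented for the example, and the idea that MetricSet.compute merges the per-metric dictionaries is an assumption, since that method body is truncated above.

from typing import Any

from omegaconf import DictConfig

from ragbits.evaluate.metrics.base import Metric, MetricSet
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult


class DocumentSearchHitRate(Metric[DocumentSearchResult]):
    """Toy metric: fraction of queries for which at least one relevant passage was retrieved."""

    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
        hits = sum(
            bool(set(result.reference_passages) & set(result.predicted_passages)) for result in results
        )
        return {"DOCUMENT_SEARCH/HIT_RATE": hits / len(results) if results else 0.0}


# Classes go in first, then the set is called with a config, as in evaluate.py.
metrics = MetricSet(DocumentSearchHitRate)(DictConfig({}))
# metrics.compute(list_of_results) would then (presumably) merge each metric's dict into one.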
@@ -0,0 +1,109 @@
+from typing import Any, Optional
+
+from omegaconf import DictConfig
+
+from ragbits.evaluate.metrics.base import Metric
+from ragbits.evaluate.pipelines.document_search import DocumentSearchResult
+
+
+class DocumentSearchPrecision(Metric[DocumentSearchResult]):
+    """
+    Precision measures the accuracy of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of documents retrieved.
+
+    Precision = Total Number of Relevant Documents Retrieved / Total Number of Documents Retrieved
+
+    Precision evaluates: "Out of all the documents that the system retrieved, how many were actually relevant?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_documents_retrieved = sum(len(set(result.predicted_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/PRECISION": (total_relevant_documents_retrieved / total_documents_retrieved)
+            if total_documents_retrieved > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchRecall(Metric[DocumentSearchResult]):
+    """
+    Recall measures the comprehensiveness of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of relevant documents in the database for the given query.
+
+    Recall = Total Number of Relevant Documents Retrieved / Total Number of Relevant Documents in the Database
+
+    Recall evaluates: "Out of all the relevant documents that exist in the database,
+    how many did the system manage to retrieve?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_relevant_documents = sum(len(set(result.reference_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/RECALL": (total_relevant_documents_retrieved / total_relevant_documents)
+            if total_relevant_documents > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchF1(Metric[DocumentSearchResult]):
+    """
+    F1 Score is the harmonic mean of precision and recall. It is the weighted average of Precision and Recall.
+
+    F1 = 2 * (Precision * Recall) / (Precision + Recall)
+    """
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the metric.
+
+        Args:
+            config: The metric configuration.
+        """
+        super().__init__(config)
+        self.precision = DocumentSearchPrecision(config)
+        self.recall = DocumentSearchRecall(config)
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        precision = self.precision.compute(results)["DOCUMENT_SEARCH/PRECISION"]
+        recall = self.recall.compute(results)["DOCUMENT_SEARCH/RECALL"]
+
+        return {
+            "DOCUMENT_SEARCH/F1": (2 * (precision * recall) / (precision + recall))
+            if (precision + recall) > 0
+            else 0.0,
+        }
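A quick worked example of the three formulas above for a single query, using plain sets instead of DocumentSearchResult objects, so nothing beyond the arithmetic in the docstrings is assumed:

# 4 relevant passages exist, 5 were retrieved, 3 of the retrieved ones are relevant.
reference = {"p1", "p2", "p3", "p4"}
predicted = {"p1", "p2", "p3", "p9", "p10"}

relevant_retrieved = len(reference & predicted)     # 3
precision = relevant_retrieved / len(predicted)     # 3 / 5 = 0.60
recall = relevant_retrieved / len(reference)        # 3 / 4 = 0.75
f1 = 2 * precision * recall / (precision + recall)  # 0.9 / 1.35 ≈ 0.67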
@@ -1,8 +1,9 @@
 from .base import EvaluationPipeline, EvaluationResult
-from .document_search import DocumentSearchEvaluationPipeline
+from .document_search import DocumentSearchPipeline, DocumentSearchResult
 
 __all__ = [
-    "DocumentSearchEvaluationPipeline",
+    "DocumentSearchPipeline",
+    "DocumentSearchResult",
     "EvaluationPipeline",
     "EvaluationResult",
 ]
@@ -6,7 +6,7 @@
 
 
 @dataclass
-class EvaluationResult:
+class EvaluationResult(ABC):
     """
     Represents the result of a single evaluation.
     """
@@ -25,7 +25,12 @@ def __init__(self, config: Optional[DictConfig] = None) -> None:
             config: The evaluation pipeline configuration.
         """
         super().__init__()
-        self.config = config or {}
+        self.config = config or DictConfig({})
+
+    async def prepare(self) -> None:
+        """
+        Prepares the document search evaluation pipeline.
+        """
 
     @abstractmethod
     async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
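The new prepare() hook is awaited by Evaluator.compute before the pipeline is called (see the evaluator change above), giving pipelines a place for one-off setup such as ingestion. A minimal sketch of a custom pipeline against the base-class surface shown here; the result fields and the "question" key are invented for the example, and the real EvaluationResult base may carry fields of its own.

from dataclasses import dataclass
from typing import Any

from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult


@dataclass
class KeywordSearchResult(EvaluationResult):
    """Hypothetical result type for this sketch."""

    query: str
    matched: list[str]


class KeywordSearchPipeline(EvaluationPipeline):
    """Toy pipeline that 'retrieves' corpus entries containing the query string."""

    async def prepare(self) -> None:
        # One-off setup, awaited once by Evaluator.compute before evaluation starts;
        # a real pipeline would ingest documents into its search backend here.
        self.corpus = ["hydra configs", "neptune logging", "document search"]

    async def __call__(self, data: dict[str, Any]) -> KeywordSearchResult:
        query = data["question"]  # the key name is an assumption about the dataset schema
        return KeywordSearchResult(query=query, matched=[doc for doc in self.corpus if query in doc])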