Skip to content

Commit

Permalink
optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
kdziedzic68 committed Nov 6, 2024
1 parent e674abc commit 5a9e0bd
Show file tree
Hide file tree
Showing 15 changed files with 354 additions and 81 deletions.
8 changes: 5 additions & 3 deletions examples/evaluation/document-search/config/data/qa.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "hf-docs-retrieval"
path: "micpst/hf-docs-retrieval"
split: "train"
type: ragbits.evaluate.loaders.hf:HFDataLoader
options:
name: "hf-docs-retrieval"
path: "micpst/hf-docs-retrieval"
split: "train"
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
task:
name: chunking-1000

pipeline:
# used only for ingestion
providers:
txt:
config:
chunking_kwargs:
max_characters: 1000
md:
config:
chunking_kwargs:
max_characters: 1000
providers:
txt:
config:
chunking_kwargs:
max_characters: 1000
md:
config:
chunking_kwargs:
max_characters: 1000

# used for both ingestion and evaluation
vector_store:
config:
index_name: chunk-1000
# used for both ingestion and evaluation
vector_store:
config:
index_name: chunk-1000
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@ task:
name: chunking-250

# used only for ingestion
providers:
txt:
config:
chunking_kwargs:
max_characters: 250
md:
config:
chunking_kwargs:
max_characters: 250
pipeline:
providers:
txt:
config:
chunking_kwargs:
max_characters: 250
md:
config:
chunking_kwargs:
max_characters: 250

# used for both ingestion and evaluation
vector_store:
config:
index_name: chunk-250
# used for both ingestion and evaluation
vector_store:
config:
index_name: chunk-250
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@ task:
name: chunking-500

# used only for ingestion
providers:
txt:
config:
chunking_kwargs:
max_characters: 500
md:
config:
chunking_kwargs:
max_characters: 500

pipeline:
providers:
txt:
config:
chunking_kwargs:
max_characters: 500
md:
config:
chunking_kwargs:
max_characters: 500
# used for both ingestion and evaluation
vector_store:
config:
index_name: chunk-500
vector_store:
config:
index_name: chunk-500
4 changes: 1 addition & 3 deletions examples/evaluation/document-search/config/ingestion.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
defaults:
- data: corpus
- embedder: litellm
- providers: unstructured
- vector_store: chroma
- pipeline: document_ingestion
- _self_
11 changes: 3 additions & 8 deletions examples/evaluation/document-search/config/retrieval.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
defaults:
- data: qa
- embedder: litellm
- providers: unstructured
- vector_store: chroma
- rephraser: noop
- reranker: noop
- _self_
- pipeline: document_search

task:
name: default
type: document-search

metrics:
DocumentSearchPrecisionRecallF1:
- type: ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1
matching_strategy: RougeChunkMatch
options:
threshold: 0.5
DocumentSearchRankedRetrievalMetrics:
- type: ragbits.evaluate.metrics.document_search:DocumentSearchRankedRetrievalMetrics
matching_strategy: RougeChunkMatch
options:
threshold: 0.5
Expand Down
10 changes: 5 additions & 5 deletions examples/evaluation/document-search/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.loaders import dataloader_factory
from ragbits.evaluate.metrics import metric_set_factory
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune

Expand All @@ -34,9 +34,9 @@ async def bench(config: DictConfig) -> None:

log.info("Starting evaluation...")

dataloader = HFDataLoader(config.data)
pipeline = DocumentSearchPipeline(config)
metrics = document_search_metrics(config.metrics)
dataloader = dataloader_factory(config.data)
pipeline = DocumentSearchPipeline(config.pipeline)
metrics = metric_set_factory(config.metrics)

evaluator = Evaluator()
results = await evaluator.compute(
Expand Down
2 changes: 1 addition & 1 deletion examples/evaluation/document-search/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ async def ingest(config: DictConfig) -> None:
"""
log.info("Ingesting documents...")

document_search = DocumentSearch.from_config(config) # type: ignore
document_search = DocumentSearch.from_config(config.pipeline) # type: ignore

documents = await tqdm.gather(
*[
Expand Down
2 changes: 1 addition & 1 deletion packages/ragbits-evaluate/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.2.0"]
dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.2.0", "optuna==4.0.0"]

[project.optional-dependencies]
relari = [
Expand Down
16 changes: 16 additions & 0 deletions packages/ragbits-evaluate/src/ragbits/evaluate/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sys

from omegaconf import DictConfig

from ragbits.core.utils.config_handling import get_cls_from_config

from .base import DataLoader

# __all__ must contain strings, not class objects: a non-string entry makes
# `from ragbits.evaluate.loaders import *` raise TypeError.
__all__ = ["DataLoader"]

# Handle to this module, used as the default namespace when resolving
# dataloader class paths from configuration.
module = sys.modules[__name__]


def dataloader_factory(config: DictConfig) -> DataLoader:
    """
    Instantiate a data loader from its configuration.

    Args:
        config: Configuration with a `type` field (a `module:Class` path
            resolvable by `get_cls_from_config`) and an `options` field that
            is passed to the loader's constructor.

    Returns:
        The constructed data loader instance.
    """
    dataloader_class = get_cls_from_config(config.type, module)
    return dataloader_class(config.options)
19 changes: 19 additions & 0 deletions packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import sys

from omegaconf import DictConfig

from ragbits.core.utils.config_handling import get_cls_from_config

from .base import MetricSet

# __all__ must contain strings, not class objects: a non-string entry makes
# star-imports of this package raise TypeError.
__all__ = ["MetricSet"]

# Handle to this module, used as the default namespace when resolving
# metric class paths from configuration.
module = sys.modules[__name__]


def metric_set_factory(cfg: DictConfig) -> MetricSet:
    """
    Build a `MetricSet` from a list of metric configurations.

    Args:
        cfg: Iterable of metric configurations; each entry carries a `type`
            field (a `module:Class` path resolvable by `get_cls_from_config`)
            and is passed whole to the metric's constructor.

    Returns:
        A `MetricSet` wrapping one instantiated metric per config entry.
    """
    metrics = [get_cls_from_config(metric_cfg.type, module)(metric_cfg) for metric_cfg in cfg]
    return MetricSet(*metrics)
64 changes: 45 additions & 19 deletions packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Any, Generic, TypeVar

from omegaconf import DictConfig
from typing_extensions import Self

from ragbits.evaluate.pipelines.base import EvaluationResult

Expand All @@ -22,7 +21,8 @@ def __init__(self, config: DictConfig | None = None) -> None:
config: The metric configuration.
"""
super().__init__()
self.config = getattr(config, self.__class__.__name__, DictConfig({}))
self.config = config
self.weight : float = getattr(self.config, "weight", 1.)

@abstractmethod
def compute(self, results: list[ResultT]) -> dict[str, Any]:
Expand All @@ -36,34 +36,60 @@ def compute(self, results: list[ResultT]) -> dict[str, Any]:
The computed metric.
"""

# class MetricSet(Generic[ResultT]):
# """
# Represents a set of metrics.
# """
#
# def __init__(self, *metrics: type[Metric[ResultT]]) -> None:
# """
# Initializes the metric set.
#
# Args:
# metrics: The metrics.
# """
# self._metrics = metrics
# self.metrics: list[Metric[ResultT]] = []
#
# def __call__(self, config: DictConfig | None = None) -> Self:
# """
# Initializes the metrics.
#
# Args:
# config: The configuration for the metrics.
#
# Returns:
# The initialized metric set.
# """
# self.metrics = [metric(config) for metric in self._metrics]
# return self
#
# def compute(self, results: list[ResultT]) -> dict[str, Any]:
# """
# Compute the metrics.
#
# Args:
# results: The evaluation results.
#
# Returns:
# The computed metrics.
# """
# return {name: value for metric in self.metrics for name, value in metric.compute(results).items()}


class MetricSet(Generic[ResultT]):
    """
    Represents a set of metrics.

    Metrics are passed in already instantiated (construction from config is
    handled by the metrics package factory), so this class only aggregates
    their results.
    """

    def __init__(self, *metrics: Metric[ResultT]) -> None:
        """
        Initializes the metric set.

        Args:
            metrics: The instantiated metrics to aggregate.
        """
        self.metrics = metrics

    def compute(self, results: list[ResultT]) -> dict[str, Any]:
        """
        Compute the metrics.

        Args:
            results: The evaluation results.

        Returns:
            The computed metrics, with each value scaled by its metric's
            `weight` attribute.
        """
        return {
            name: metric.weight * value
            for metric in self.metrics
            for name, value in metric.compute(results).items()
        }
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import asyncio
import uuid
from dataclasses import dataclass
from functools import cached_property

from omegaconf import DictConfig
from tqdm.asyncio import tqdm

from ragbits.document_search import DocumentSearch
from ragbits.document_search.documents.document import DocumentMeta
from ragbits.document_search.documents.element import TextElement
from ragbits.document_search.documents.sources import HuggingFaceSource
from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult


Expand Down Expand Up @@ -42,9 +50,40 @@ async def __call__(self, data: dict) -> DocumentSearchResult:
The evaluation result.
"""
elements = await self.document_search.search(data["question"])
predicted_passages = [element.get_text_representation() for element in elements]
predicted_passages = [element.content for element in elements if isinstance(element, TextElement)]
return DocumentSearchResult(
question=data["question"],
reference_passages=data["passages"],
predicted_passages=predicted_passages,
)


class DocumentSearchWithIngestionPipeline(DocumentSearchPipeline):
    """
    Document search pipeline that lazily ingests its corpus before serving
    the first query.

    Each instance writes into a freshly named (uuid4) vector store index so
    that concurrent evaluation runs do not collide on a shared index.
    """

    def __init__(self, config: DictConfig | None = None) -> None:
        """
        Args:
            config: Pipeline configuration. `config.answer_data_source`
                (fields `path`, `split`, `num_docs`) describes the
                HuggingFace dataset rows to ingest.
        """
        super().__init__(config)
        # Randomize the index name so this instance ingests into its own
        # isolated collection.
        self.config.vector_store.config.index_name = str(uuid.uuid4())
        self._ingested = False
        # Serializes the first-call ingestion when __call__ runs concurrently.
        self._lock = asyncio.Lock()

    async def __call__(self, data: dict) -> DocumentSearchResult:
        """
        Evaluate a single record, performing one-time ingestion on first use.

        Args:
            data: Evaluation record; the parent __call__ reads at least
                `question` and `passages`.

        Returns:
            The document search evaluation result.
        """
        # Double work is prevented by the lock + flag: only the first caller
        # pays the ingestion cost; later callers fall straight through.
        async with self._lock:
            if not self._ingested:
                await self._ingest_documents()
                self._ingested = True
        return await super().__call__(data)

    async def _ingest_documents(self) -> None:
        # Downloads config.answer_data_source.num_docs rows from the
        # configured HuggingFace dataset and ingests them into the index.
        documents = await tqdm.gather(
            *[
                DocumentMeta.from_source(
                    HuggingFaceSource(
                        path=self.config.answer_data_source.path,
                        split=self.config.answer_data_source.split,
                        row=i,
                    )
                )
                for i in range(self.config.answer_data_source.num_docs)
            ],
            desc="Download",
        )
        await self.document_search.ingest(documents)
Loading

0 comments on commit 5a9e0bd

Please sign in to comment.