Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Simplification of Document Search Evaluation interface #258

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ metrics:
threshold: 0.5


callbacks:
- type: ragbits.evaluate.callbacks.neptune:NeptuneCallbackConfigurator
args:
callback_type: neptune.integrations.optuna:NeptuneCallback
project: deepsense-ai/ragbits
#callbacks:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is it commented?

# - type: ragbits.evaluate.callbacks.neptune:NeptuneCallbackConfigurator
# args:
# callback_type: neptune.integrations.optuna:NeptuneCallback
# project: deepsense-ai/ragbits
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
defaults:
- embedder: litellm
- providers: unstructured
- vector_store: chroma
- _self_

type: ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline
ingest: true
search: false
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ defaults:
- vector_store: chroma
- rephraser: noop
- reranker: noop
- _self_
- _self_


type: ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ defaults:
- _self_

type: ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline
ingest: true

Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@
from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders import dataloader_factory
from ragbits.evaluate.metrics import metric_set_factory
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune

logging.getLogger("LiteLLM").setLevel(logging.ERROR)
Expand All @@ -31,20 +28,8 @@ async def bench(config: DictConfig) -> None:
config: Hydra configuration.
"""
run = setup_neptune(config)

log.info("Starting evaluation...")

dataloader = dataloader_factory(config.data)
pipeline = DocumentSearchPipeline(config.pipeline)
metrics = metric_set_factory(config.metrics)

evaluator = Evaluator()
results = await evaluator.compute(
pipeline=pipeline,
dataloader=dataloader,
metrics=metrics,
)

log.info("Starting the experiment...")
results = await Evaluator.run_experiment_from_config(config=config)
output_dir = log_to_file(results)
if run:
log_to_neptune(run, results, output_dir)
Expand Down
26 changes: 26 additions & 0 deletions examples/evaluation/document-search/advanced/optimize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sys

import hydra
from omegaconf import DictConfig, OmegaConf

from ragbits.evaluate.optimizer import Optimizer
from ragbits.evaluate.utils import log_optimization_to_file

# NOTE(review): `module` is not referenced anywhere else in this script as shown —
# looks like a leftover; confirm nothing external relies on it before removing.
module = sys.modules[__name__]


@hydra.main(config_path="config", config_name="optimization", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Run the optimization experiment for the given hydra config.

    The hydra config is used as the experiment configuration and is combined
    with fixed optimizer settings (direction ``maximize``, 10 trials). The
    value returned by ``Optimizer.run_experiment_from_config`` is passed to
    ``log_optimization_to_file``.

    Args:
        config: Hydra configuration.
    """
    optimizer_settings = OmegaConf.create({"direction": "maximize", "n_trials": 10})
    scored_configs = Optimizer.run_experiment_from_config(
        config={"optimizer": optimizer_settings, "experiment_config": config}
    )
    log_optimization_to_file(scored_configs)


if __name__ == "__main__":
main()
79 changes: 79 additions & 0 deletions examples/evaluation/document-search/basic/basic_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-document-search[huggingface]",
# "ragbits-core[chroma]",
# "hydra-core~=1.3.2",
# "unstructured[md]>=0.15.13",
# ]
# ///
import asyncio
import logging
import uuid
from pathlib import Path

from omegaconf import OmegaConf

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.utils import log_to_file

# Raise the LiteLLM and httpx loggers to ERROR so only their errors are emitted.
logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
# Module-level logger used by the functions in this script.
log = logging.getLogger(__name__)


async def evaluate() -> dict:
    """
    Basic example of document search evaluation.

    Builds the experiment configuration (pipeline, data loader and metrics)
    in code and runs it through ``Evaluator.run_experiment_from_config``.
    The pipeline is configured with ``ingest: False`` and ``search: True``.
    NOTE(review): presumably the documents are expected to be ingested
    beforehand — confirm against the pipeline implementation.

    Returns:
        The results returned by ``Evaluator.run_experiment_from_config``.
    """
    # This function evaluates and does not ingest, so the start message says so
    # (the previous "Ingesting documents..." text did not match what runs here).
    log.info("Starting evaluation...")

    config = OmegaConf.create(
        {
            "pipeline": {
                "type": "ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline",
                "ingest": False,
                "search": True,
                "providers": {
                    "txt": {
                        "type": "ragbits.document_search.ingestion.providers.unstructured:UnstructuredDefaultProvider"
                    }
                },
            },
            "data": {
                "type": "ragbits.evaluate.loaders.hf:HFDataLoader",
                "options": {"name": "hf-docs-retrieval", "path": "micpst/hf-docs-retrieval", "split": "train"},
            },
            "metrics": [
                {
                    "type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
                    "matching_strategy": "RougeChunkMatch",
                    "options": {"threshold": 0.5},
                }
            ],
            "neptune": {"project": "ragbits", "run": False},
            "task": {"name": "default", "type": "document-search"},
        }
    )

    results = await Evaluator.run_experiment_from_config(config=config)

    log.info("Evaluation finished.")

    return results


def main() -> None:
    """
    Run the evaluation and pass its results to ``log_to_file``.

    A new directory named with a random UUID4 is created relative to the
    current working directory and given to ``log_to_file`` as ``output_dir``.
    """
    evaluation_results = asyncio.run(evaluate())

    # A random UUID keeps repeated runs from colliding on the directory name.
    target_dir = Path(f"{uuid.uuid4()}")
    target_dir.mkdir()

    log_to_file(evaluation_results, output_dir=target_dir)


if __name__ == "__main__":
main() # pylint: disable=no-value-for-parameter
61 changes: 61 additions & 0 deletions examples/evaluation/document-search/basic/basic_ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-document-search[huggingface]",
# "ragbits-core[chroma]",
# "hydra-core~=1.3.2",
# "unstructured[md]>=0.15.13",
# ]
# ///
import asyncio
import logging

from omegaconf import OmegaConf

from ragbits.evaluate.pipelines import pipeline_factory

# Raise the LiteLLM and httpx loggers to ERROR so only their errors are emitted.
logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
# Module-level logger used by the functions in this script.
log = logging.getLogger(__name__)


async def ingest() -> None:
    """
    Ingest documents into the document search system.

    The pipeline configuration is defined in code (``ingest: True``,
    ``search: False``); the pipeline is created with ``pipeline_factory``
    and awaited once. The function takes no arguments.
    """
    log.info("Ingesting documents...")

    pipeline_settings = {
        "type": "ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline",
        "ingest": True,
        "search": False,
        "answer_data_source": {"name": "hf-docs", "path": "micpst/hf-docs", "split": "train", "num_docs": 5},
        "providers": {
            "txt": {"type": "ragbits.document_search.ingestion.providers.unstructured:UnstructuredDefaultProvider"}
        },
    }
    ingestion_pipeline = pipeline_factory(OmegaConf.create(pipeline_settings))  # type: ignore

    await ingestion_pipeline()

    log.info("Ingestion finished.")


def main() -> None:
    """
    Run the ingestion process.

    Takes no arguments; runs the ``ingest`` coroutine to completion with
    ``asyncio.run``.
    """
    asyncio.run(ingest())


if __name__ == "__main__":
main() # pylint: disable=no-value-for-parameter
61 changes: 61 additions & 0 deletions examples/evaluation/document-search/basic/basic_optimize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import sys

from omegaconf import OmegaConf

from ragbits.evaluate.optimizer import Optimizer
from ragbits.evaluate.utils import log_optimization_to_file

# NOTE(review): `module` is not referenced anywhere else in this script as shown —
# looks like a leftover; confirm nothing external relies on it before removing.
module = sys.modules[__name__]


def main() -> None:
"""
Function running evaluation for all datasets and evaluation tasks defined in config.
"""
config = OmegaConf.create(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have to wrap these configs with hydra?

{
"pipeline": {
"type": "ragbits.evaluate.pipelines.document_search:DocumentSearchWithIngestionPipeline",
"ingest": True,
"search": True,
"answer_data_source": {
"name": "hf-docs",
"path": "micpst/hf-docs",
"split": "train",
"num_docs": 5,
},
"providers": {
"txt": {
"type": "ragbits.document_search.ingestion.providers.unstructured:UnstructuredDefaultProvider"
}
},
"embedder": {
"type": "ragbits.core.embeddings.litellm:LiteLLMEmbeddings",
"config": {
"model": "text-embedding-3-small",
"options": {
"dimensions": {"optimize": True, "range": [32, 512]},
},
},
},
},
"data": {
"type": "ragbits.evaluate.loaders.hf:HFDataLoader",
"options": {"name": "hf-docs-retrieval", "path": "micpst/hf-docs-retrieval", "split": "train"},
},
"metrics": [
{
"type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
"matching_strategy": "RougeChunkMatch",
"options": {"threshold": 0.5},
}
],
}
)
exp_config = {"optimizer": OmegaConf.create({"direction": "maximize", "n_trials": 10}), "experiment_config": config}
configs_with_scores = Optimizer.run_experiment_from_config(config=exp_config)
log_optimization_to_file(configs_with_scores)


if __name__ == "__main__":
main()

This file was deleted.

45 changes: 0 additions & 45 deletions examples/evaluation/document-search/optimize.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
subclass = import_by_path(config.type, cls.default_module)
if not issubclass(subclass, cls):
raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

return subclass.from_config(config.config)

@classmethod
Expand Down
12 changes: 11 additions & 1 deletion packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery
from ragbits.core.vector_stores.chroma import ChromaVectorStore
from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
from ragbits.core.vector_stores.qdrant import QdrantVectorStore

__all__ = ["InMemoryVectorStore", "VectorStore", "VectorStoreEntry", "VectorStoreOptions", "WhereQuery"]
__all__ = [
"ChromaVectorStore",
"InMemoryVectorStore",
"QdrantVectorStore",
"VectorStore",
"VectorStoreEntry",
"VectorStoreOptions",
"WhereQuery",
]
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ class DocumentSearchConfig(BaseModel):
Schema for the dict taken by DocumentSearch.from_config method.
"""

embedder: ObjectContructionConfig
vector_store: ObjectContructionConfig
embedder: ObjectContructionConfig = ObjectContructionConfig(type="LiteLLMEmbeddings")
vector_store: ObjectContructionConfig = ObjectContructionConfig(
type="ChromaVectorStore",
config={"client": {"type": "PersistentClient"}, "index_name": "default"},
)
rephraser: ObjectContructionConfig = ObjectContructionConfig(type="NoopQueryRephraser")
reranker: ObjectContructionConfig = ObjectContructionConfig(type="NoopReranker")
processing_strategy: ObjectContructionConfig = ObjectContructionConfig(type="SequentialProcessing")
Expand Down
Loading
Loading