Skip to content

Commit

Permalink
Merge branch 'main' into kc/support-for-different-metadatastores-in-vectorstore
Browse files Browse the repository at this point in the history
  • Loading branch information
konrad-czarnota-ds committed Oct 24, 2024
2 parents 4d655c3 + 99d91f9 commit e7d7fc0
Show file tree
Hide file tree
Showing 60 changed files with 2,399 additions and 152 deletions.
4 changes: 4 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### Checklist

- [ ] I have updated the documentation accordingly.
- [ ] I have updated the CHANGELOG.md file accordingly.
67 changes: 67 additions & 0 deletions .github/workflows/semantic_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Manually triggered release workflow: bumps the chosen package's version,
# creates a dated release branch, and opens a PR with the update.
name: Semantic Release

on:
  workflow_dispatch:
    inputs:
      updateType:
        description: "version update type"
        required: true
        type: choice
        default: "patch"
        options:
          - "major"
          - "minor"
          - "patch"
      packageName:
        description: "name of the package to update"
        required: true
        type: choice
        options:
          - "ragbits"
          - "ragbits-cli"
          - "ragbits-core"
          - "ragbits-document-search"

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2
        with:
          version: "0.4.10"

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      # Branch name embeds the package and today's date, e.g. release/ragbits-2024-10-24.
      - name: Create release branch
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git checkout -b release/${{ github.event.inputs.packageName }}-$(date +%Y-%m-%d)

      # Captures the version from pyproject.toml before and after the bump
      # so the PR body can show the old -> new transition.
      - name: Update packages
        id: packages_update
        run: |
          echo old_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT
          uv run scripts/update_ragbits_package.py ${{ github.event.inputs.packageName }} ${{ github.event.inputs.updateType }}
          echo new_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT
          uv sync

      - name: Create PR with updated packages
        run: |
          COMMIT_MESSAGE="release(${{ github.event.inputs.packageName }}): update to v${{ steps.packages_update.outputs.new_version }}"
          git add .
          git commit -m "$COMMIT_MESSAGE"
          git push -u origin HEAD
          gh pr create -B main --title "$COMMIT_MESSAGE" \
            --body 'Update ${{ github.event.inputs.packageName }} version from ${{ steps.packages_update.outputs.old_version }} to ${{ steps.packages_update.outputs.new_version }}'
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
3 changes: 2 additions & 1 deletion .libraries-whitelist.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pkg_resources
tiktoken
chardet
chroma-hnswlib
chroma-hnswlib
rouge
15 changes: 12 additions & 3 deletions examples/document-search/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@

from ragbits.core.embeddings import LiteLLMEmbeddings
from ragbits.core.vector_store.chromadb_store import ChromaDBStore
from ragbits.document_search import DocumentSearch
from ragbits.document_search import DocumentSearch, SearchConfig
from ragbits.document_search.documents.document import DocumentMeta

# Sample documents to ingest into the vector store for the demo search.
documents = [
    DocumentMeta.create_text_document_from_literal("RIP boiled water. You will be mist."),
    DocumentMeta.create_text_document_from_literal(
        "Why programmers don't like to swim? Because they're scared of the floating points."
    ),
    DocumentMeta.create_text_document_from_literal("This one is completely unrelated."),
]


Expand All @@ -37,8 +38,16 @@ async def main():

await document_search.ingest(documents)

results = await document_search.search("I'm boiling my water and I need a joke")
print(results)
print()
print("All documents:")
all_documents = await vector_store.list()
print([doc.metadata["content"] for doc in all_documents])

query = "I'm boiling my water and I need a joke"
print()
print(f"Documents similar to: {query}")
results = await document_search.search(query, search_config=SearchConfig(vector_store_kwargs={"k": 2}))
print([element.get_key() for element in results])


if __name__ == "__main__":
Expand Down
12 changes: 12 additions & 0 deletions examples/document-search/from_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@
},
"reranker": {"type": "ragbits.document_search.retrieval.rerankers.noop:NoopReranker"},
"providers": {"txt": {"type": "DummyProvider"}},
"rephraser": {
"type": "LLMQueryRephraser",
"config": {
"llm": {
"type": "LiteLLM",
"config": {
"model_name": "gpt-4-turbo",
},
},
"prompt": "QueryRephraserPrompt",
},
},
}


Expand Down
35 changes: 35 additions & 0 deletions examples/evaluation/document-search/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Document Search Evaluation

## Ingest

```sh
uv run ingest.py
```

```sh
uv run ingest.py +experiments=chunking-250
```

```sh
uv run ingest.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
```

## Evaluate

```sh
uv run evaluate.py
```

```sh
uv run evaluate.py +experiments=chunking-250
```

```sh
uv run evaluate.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
```

### Log to Neptune

```sh
uv run evaluate.py neptune.run=True
```
4 changes: 4 additions & 0 deletions examples/evaluation/document-search/config/data/corpus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Corpus dataset (Hugging Face) with the documents to ingest.
name: "hf-docs"
path: "micpst/hf-docs"
split: "train"
# Number of documents taken from the dataset for ingestion.
num_docs: 5
3 changes: 3 additions & 0 deletions examples/evaluation/document-search/config/data/qa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Retrieval evaluation dataset (Hugging Face).
name: "hf-docs-retrieval"
path: "micpst/hf-docs-retrieval"
split: "train"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Embedder configuration: LiteLLM-backed embeddings.
type: LiteLLMEmbeddings
config:
  model: "text-embedding-3-small"
  options:
    dimensions: 768
    encoding_format: float
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-1000

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 1000
  md:
    config:
      chunking_kwargs:
        max_characters: 1000

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-1000
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-250

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 250
  md:
    config:
      chunking_kwargs:
        max_characters: 250

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-250
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-500

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 500
  md:
    config:
      chunking_kwargs:
        max_characters: 500

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-500
6 changes: 6 additions & 0 deletions examples/evaluation/document-search/config/ingestion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Hydra defaults composition for the ingestion run.
defaults:
  - data: corpus
  - embedder: litellm
  - providers: unstructured
  - vector_store: chroma
  - _self_
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Per-extension document providers used during ingestion.
txt:
  type: UnstructuredDefaultProvider
  config:
    use_api: false
    partition_kwargs:
      strategy: hi_res
    chunking_kwargs:
      include_orig_elements: true
      max_characters: 1000
      new_after_n_chars: 1000
      overlap: 0
      overlap_all: 0

md:
  type: UnstructuredDefaultProvider
  config:
    use_api: false
    partition_kwargs:
      strategy: hi_res
    chunking_kwargs:
      include_orig_elements: true
      max_characters: 1000
      new_after_n_chars: 1000
      overlap: 0
      overlap_all: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Query rephraser component; the Noop variant applies no rephrasing.
type: NoopQueryRephraser
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Reranker component; the Noop variant applies no reranking.
type: NoopReranker
26 changes: 26 additions & 0 deletions examples/evaluation/document-search/config/retrieval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Hydra defaults composition for the retrieval evaluation run.
defaults:
  - data: qa
  - embedder: litellm
  - providers: unstructured
  - vector_store: chroma
  - rephraser: noop
  - reranker: noop
  - _self_

task:
  name: default
  type: document-search

metrics:
  DocumentSearchPrecisionRecallF1:
    matching_strategy: RougeChunkMatch
    options:
      threshold: 0.5
  DocumentSearchRankedRetrievalMetrics:
    matching_strategy: RougeChunkMatch
    options:
      threshold: 0.5

# Neptune experiment tracking; disabled by default (enable with neptune.run=True).
neptune:
  project: ragbits
  run: False
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# ChromaDB-backed vector store with an on-disk (persistent) client.
# NOTE(review): nesting of embedding_function/index_name reconstructed from a
# whitespace-mangled source — confirm against ChromaDBStore.from_config.
type: ChromaDBStore
config:
  chroma_client:
    type: PersistentClient
    config:
      path: chroma
  embedding_function:
    type: ragbits.core.embeddings.litellm:LiteLLMEmbeddings
  index_name: default
67 changes: 67 additions & 0 deletions examples/evaluation/document-search/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-document-search",
# "ragbits-evaluate[relari]",
# "ragbits[litellm,chromadb]",
# ]
# ///
import asyncio
import logging

import hydra
from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune

# Silence chatty third-party loggers so evaluation output stays readable.
logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
# Module-level logger for this script's own messages.
log = logging.getLogger(__name__)


async def bench(config: DictConfig) -> None:
    """
    Run the document-search evaluation described by the Hydra configuration.

    Loads the dataset, builds the pipeline and metrics, computes the results,
    writes them to disk, and optionally logs them to Neptune.

    Args:
        config: Hydra configuration.
    """
    # Returns a run handle when neptune.run is enabled, falsy otherwise.
    run = setup_neptune(config)

    log.info("Starting evaluation...")

    loader = HFDataLoader(config.data)
    search_pipeline = DocumentSearchPipeline(config)
    eval_metrics = document_search_metrics(config.metrics)

    results = await Evaluator().compute(
        pipeline=search_pipeline,
        dataloader=loader,
        metrics=eval_metrics,
    )

    output_dir = log_to_file(results)
    if run:
        log_to_neptune(run, results, output_dir)

    log.info("Evaluation results saved under directory: %s", output_dir)


# NOTE(review): version_base="3.2" looks unusual (Hydra version_base values are
# typically "1.1"–"1.3" or None) — confirm it is intended.
@hydra.main(config_path="config", config_name="retrieval", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Hydra CLI entry point: compose the configuration and run the async benchmark.

    Args:
        config: Hydra configuration.
    """
    asyncio.run(bench(config))


if __name__ == "__main__":
    main()  # pylint: disable=no-value-for-parameter
Loading

0 comments on commit e7d7fc0

Please sign in to comment.