-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into kc/support-for-different-metadatastores-in-v…
…ectorstore
- Loading branch information
Showing
60 changed files
with
2,399 additions
and
152 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
### Checklist | ||
|
||
- [ ] I have updated the documentation accordingly. | ||
- [ ] I have updated the CHANGELOG.md file accordingly. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
name: Semantic Release | ||
|
||
on: | ||
workflow_dispatch: | ||
inputs: | ||
updateType: | ||
description: "version update type" | ||
required: true | ||
type: choice | ||
default: "patch" | ||
options: | ||
- "major" | ||
- "minor" | ||
- "patch" | ||
packageName: | ||
description: "name of the package to update" | ||
required: true | ||
type: choice | ||
options: | ||
- "ragbits" | ||
- "ragbits-cli" | ||
- "ragbits-core" | ||
- "ragbits-document-search" | ||
|
||
jobs: | ||
release: | ||
runs-on: ubuntu-latest | ||
permissions: | ||
id-token: write | ||
contents: write | ||
steps: | ||
- uses: actions/checkout@v4 | ||
|
||
- name: Install uv | ||
uses: astral-sh/setup-uv@v2 | ||
with: | ||
version: "0.4.10" | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v4 | ||
with: | ||
python-version: "3.10" | ||
|
||
- name: Create release branch | ||
run: | | ||
git config user.name "github-actions[bot]" | ||
git config user.email "github-actions[bot]@users.noreply.github.com" | ||
git checkout -b release/${{ github.event.inputs.packageName }}-$(date +%Y-%m-%d) | ||
- name: Update packages | ||
id: packages_update | ||
run: | | ||
echo old_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT | ||
uv run scripts/update_ragbits_package.py ${{ github.event.inputs.packageName }} ${{ github.event.inputs.updateType }} | ||
echo new_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT | ||
uv sync | ||
- name: Create PR with updated packages | ||
run: | | ||
COMMIT_MESSAGE="release(${{ github.event.inputs.packageName }}): update to v${{ steps.packages_update.outputs.new_version }}" | ||
git add . | ||
git commit -m "$COMMIT_MESSAGE" | ||
git push -u origin HEAD | ||
gh pr create -B main --title "$COMMIT_MESSAGE" \ | ||
--body 'Update ${{ github.event.inputs.packageName }} version from ${{ steps.packages_update.outputs.old_version }} to ${{ steps.packages_update.outputs.new_version }}' | ||
env: | ||
GH_TOKEN: ${{ secrets.GH_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
pkg_resources | ||
tiktoken | ||
chardet | ||
chroma-hnswlib | ||
chroma-hnswlib | ||
rouge |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Document Search Evaluation | ||
|
||
## Ingest | ||
|
||
```sh | ||
uv run ingest.py | ||
``` | ||
|
||
```sh | ||
uv run ingest.py +experiments=chunking-250 | ||
``` | ||
|
||
```sh | ||
uv run ingest.py --multirun +experiments=chunking-250,chunking-500,chunking-1000 | ||
``` | ||
|
||
## Evaluate | ||
|
||
```sh | ||
uv run evaluate.py | ||
``` | ||
|
||
```sh | ||
uv run evaluate.py +experiments=chunking-250 | ||
``` | ||
|
||
```sh | ||
uv run evaluate.py --multirun +experiments=chunking-250,chunking-500,chunking-1000 | ||
``` | ||
|
||
### Log to Neptune | ||
|
||
```sh | ||
uv run evaluate.py neptune.run=True | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
name: "hf-docs" | ||
path: "micpst/hf-docs" | ||
split: "train" | ||
num_docs: 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
name: "hf-docs-retrieval" | ||
path: "micpst/hf-docs-retrieval" | ||
split: "train" |
6 changes: 6 additions & 0 deletions
6
examples/evaluation/document-search/config/embedder/litellm.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
type: LiteLLMEmbeddings | ||
config: | ||
model: "text-embedding-3-small" | ||
options: | ||
dimensions: 768 | ||
encoding_format: float |
20 changes: 20 additions & 0 deletions
20
examples/evaluation/document-search/config/experiments/chunking-1000.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# @package _global_ | ||
|
||
task: | ||
name: chunking-1000 | ||
|
||
# used only for ingestion | ||
providers: | ||
txt: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 1000 | ||
md: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 1000 | ||
|
||
# used for both ingestion and evaluation | ||
vector_store: | ||
config: | ||
index_name: chunk-1000 |
20 changes: 20 additions & 0 deletions
20
examples/evaluation/document-search/config/experiments/chunking-250.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# @package _global_ | ||
|
||
task: | ||
name: chunking-250 | ||
|
||
# used only for ingestion | ||
providers: | ||
txt: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 250 | ||
md: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 250 | ||
|
||
# used for both ingestion and evaluation | ||
vector_store: | ||
config: | ||
index_name: chunk-250 |
20 changes: 20 additions & 0 deletions
20
examples/evaluation/document-search/config/experiments/chunking-500.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# @package _global_ | ||
|
||
task: | ||
name: chunking-500 | ||
|
||
# used only for ingestion | ||
providers: | ||
txt: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 500 | ||
md: | ||
config: | ||
chunking_kwargs: | ||
max_characters: 500 | ||
|
||
# used for both ingestion and evaluation | ||
vector_store: | ||
config: | ||
index_name: chunk-500 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
defaults: | ||
- data: corpus | ||
- embedder: litellm | ||
- providers: unstructured | ||
- vector_store: chroma | ||
- _self_ |
25 changes: 25 additions & 0 deletions
25
examples/evaluation/document-search/config/providers/unstructured.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
txt: | ||
type: UnstructuredDefaultProvider | ||
config: | ||
use_api: false | ||
partition_kwargs: | ||
strategy: hi_res | ||
chunking_kwargs: | ||
include_orig_elements: true | ||
max_characters: 1000 | ||
new_after_n_chars: 1000 | ||
overlap: 0 | ||
overlap_all: 0 | ||
|
||
md: | ||
type: UnstructuredDefaultProvider | ||
config: | ||
use_api: false | ||
partition_kwargs: | ||
strategy: hi_res | ||
chunking_kwargs: | ||
include_orig_elements: true | ||
max_characters: 1000 | ||
new_after_n_chars: 1000 | ||
overlap: 0 | ||
overlap_all: 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
type: NoopQueryRephraser |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
type: NoopReranker |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
defaults: | ||
- data: qa | ||
- embedder: litellm | ||
- providers: unstructured | ||
- vector_store: chroma | ||
- rephraser: noop | ||
- reranker: noop | ||
- _self_ | ||
|
||
task: | ||
name: default | ||
type: document-search | ||
|
||
metrics: | ||
DocumentSearchPrecisionRecallF1: | ||
matching_strategy: RougeChunkMatch | ||
options: | ||
threshold: 0.5 | ||
DocumentSearchRankedRetrievalMetrics: | ||
matching_strategy: RougeChunkMatch | ||
options: | ||
threshold: 0.5 | ||
|
||
neptune: | ||
project: ragbits | ||
run: False |
9 changes: 9 additions & 0 deletions
9
examples/evaluation/document-search/config/vector_store/chroma.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
type: ChromaDBStore | ||
config: | ||
chroma_client: | ||
type: PersistentClient | ||
config: | ||
path: chroma | ||
embedding_function: | ||
type: ragbits.core.embeddings.litellm:LiteLLMEmbeddings | ||
index_name: default |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# /// script | ||
# requires-python = ">=3.10" | ||
# dependencies = [ | ||
# "ragbits-document-search", | ||
# "ragbits-evaluate[relari]", | ||
# "ragbits[litellm,chromadb]", | ||
# ] | ||
# /// | ||
import asyncio | ||
import logging | ||
|
||
import hydra | ||
from omegaconf import DictConfig | ||
|
||
from ragbits.evaluate.evaluator import Evaluator | ||
from ragbits.evaluate.loaders.hf import HFDataLoader | ||
from ragbits.evaluate.metrics.document_search import document_search_metrics | ||
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline | ||
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune | ||
|
||
# Raise the threshold of the third-party loggers to ERROR so that only their
# errors show up next to this script's own log output.
for _noisy_logger in ("LiteLLM", "httpx"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)

log = logging.getLogger(__name__)
|
||
|
||
async def bench(config: DictConfig) -> None:
    """
    Evaluate the document search pipeline described by the Hydra config.

    Builds the data loader, pipeline and metrics from the config, runs the
    evaluator, and stores the results via `log_to_file`. When `setup_neptune`
    returns a truthy run object, the results are also passed to `log_to_neptune`.

    Args:
        config: Hydra configuration.
    """
    neptune_run = setup_neptune(config)

    log.info("Starting evaluation...")

    # Build every evaluation component from its own section of the config.
    data_loader = HFDataLoader(config.data)
    search_pipeline = DocumentSearchPipeline(config)
    search_metrics = document_search_metrics(config.metrics)

    scores = await Evaluator().compute(
        pipeline=search_pipeline,
        dataloader=data_loader,
        metrics=search_metrics,
    )

    results_dir = log_to_file(scores)
    if neptune_run:
        log_to_neptune(neptune_run, scores, results_dir)

    log.info("Evaluation results saved under directory: %s", results_dir)
|
||
|
||
@hydra.main(config_path="config", config_name="retrieval", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Hydra entry point that runs the asynchronous evaluation to completion.

    The configuration is loaded by Hydra from the `config` directory using the
    `retrieval` config name.

    Args:
        config: Hydra configuration.
    """
    evaluation = bench(config)
    asyncio.run(evaluation)


if __name__ == "__main__":
    # Hydra's decorator supplies `config`, hence the call without arguments.
    main()  # pylint: disable=no-value-for-parameter
Oops, something went wrong.