feat: add RecursiveSplitter
component for Document
preprocessing
#15511
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow name. When renaming it, apply the same change in
# tests_skipper.yml and ci_metrics.yml.
name: Tests

on:
  # Lets the workflow be started manually
  workflow_dispatch:
  push:
    branches:
      - main
      # release branches look like v1.9.x
      - 'v[0-9].*[0-9].x'
  pull_request:
    types: [opened, reopened, synchronize, ready_for_review]
    paths:
      # This list must stay aligned with the paths declared in the
      # `tests_skipper.yml` workflow
      - 'haystack/**/*.py'
      - 'haystack/core/pipeline/predefined/*'
      - 'test/**/*.py'
      - 'pyproject.toml'

# Workflow-level environment: service credentials read from repository
# secrets, plus the tool versions that the jobs reference via `env.*`.
env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }}
  CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
  # Quoted so the versions stay strings instead of being parsed as numbers
  PYTHON_VERSION: '3.9'
  HATCH_VERSION: '1.13.0'

jobs:
# Job (entry of the top-level `jobs:` mapping).
# Runs the format check, the lint check and the license-header check and,
# on the `main` branch only, reports the job outcome to Datadog.
format:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: '${{ env.PYTHON_VERSION }}'

    - name: Install Hatch
      run: pip install hatch==${{ env.HATCH_VERSION }}

    - name: Check file format
      run: hatch run format-check

    - name: Check linting
      run: hatch run check

    - name: Check presence of license header
      run: docker run --rm -v "$(pwd):/github/workspace" ghcr.io/korandoru/hawkeye check

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      if: (success() || failure()) && github.ref_name == 'main'
      shell: bash
      run: |
        alert_type=error
        if [ "${{ job.status }}" = "success" ]; then
          alert_type=success
        fi
        echo "alert_type=${alert_type}" >> "$GITHUB_OUTPUT";

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# Starts once `format` has succeeded, runs the import-check script through
# Hatch and, on the `main` branch only, reports the outcome to Datadog.
check-imports:
  needs: format
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: '${{ env.PYTHON_VERSION }}'

    - name: Install Hatch
      run: pip install hatch==${{ env.HATCH_VERSION }}

    - name: Check imports
      run: hatch run python .github/utils/check_imports.py

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      if: (success() || failure()) && github.ref_name == 'main'
      shell: bash
      run: |
        alert_type=error
        if [ "${{ job.status }}" = "success" ]; then
          alert_type=success
        fi
        echo "alert_type=${alert_type}" >> "$GITHUB_OUTPUT";

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# Runs the unit tests on Ubuntu, Windows and macOS once `format` has
# succeeded, saves the Hatch `test` environment to the cache for the jobs
# that restore it with the same key, uploads coverage from the Ubuntu leg
# and, on the `main` branch only, reports the outcome to Datadog.
unit-tests:
  name: Unit / ${{ matrix.os }}
  needs: format
  strategy:
    # Let every OS leg finish even if another one fails
    fail-fast: false
    matrix:
      os:
        - ubuntu-latest
        - windows-latest
        - macos-latest
  runs-on: ${{ matrix.os }}
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: "${{ env.PYTHON_VERSION }}"

    - name: Install Hatch
      id: hatch
      # bash on every OS so the same script also works on Windows runners
      shell: bash
      run: |
        pip install hatch==${{ env.HATCH_VERSION }}
        # Expose the location of the Hatch `test` environment to later steps
        echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"

    - name: Run
      run: hatch run test:unit

    # Cache key is per OS and per commit
    - uses: actions/cache/save@v4
      id: cache
      with:
        path: ${{ steps.hatch.outputs.env }}
        key: ${{ runner.os }}-${{ github.sha }}

    - name: Coveralls
      # We upload only coverage for ubuntu as handling both os
      # complicates the workflow too much for little to no gain
      if: matrix.os == 'ubuntu-latest'
      uses: coverallsapp/github-action@v2
      with:
        # `file` replaces the deprecated `path-to-lcov` input of the v2 action
        file: coverage.xml

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      shell: bash
      if: (success() || failure()) && github.ref_name == 'main'
      run: |
        if [ "${{ job.status }}" = "success" ]; then
          echo "alert_type=success" >> "$GITHUB_OUTPUT";
        else
          echo "alert_type=error" >> "$GITHUB_OUTPUT";
        fi

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# After `unit-tests` succeeds, runs mypy and pylint on the changed Python
# files outside `test/`, reusing the Hatch `test` environment restored from
# the cache, and on the `main` branch only reports the outcome to Datadog.
lint:
  needs: unit-tests
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        # With the default value of 1, there are corner cases where tj-actions/changed-files
        # fails with a `no merge base` error
        fetch-depth: 0

    - name: Get changed files
      id: files
      # NOTE(review): this third-party action is referenced by a mutable tag;
      # consider pinning it to a full commit SHA.
      uses: tj-actions/changed-files@v45
      with:
        files: |
          **/*.py
        files_ignore: |
          test/**

    - uses: actions/setup-python@v5
      with:
        python-version: "${{ env.PYTHON_VERSION }}"

    - name: Install Hatch
      id: hatch
      shell: bash
      run: |
        pip install hatch==${{ env.HATCH_VERSION }}
        # Expose the location of the Hatch `test` environment to later steps
        echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"

    # Same per-OS, per-commit key that the `unit-tests` job saves under
    - uses: actions/cache/restore@v4
      id: cache
      with:
        path: ${{ steps.hatch.outputs.env }}
        key: ${{ runner.os }}-${{ github.sha }}

    - name: Mypy
      if: steps.files.outputs.any_changed == 'true'
      env:
        # File names come from the pull request and are untrusted: hand them
        # to the shell through the environment instead of expanding the
        # expression inside the script, which would allow script injection.
        ALL_CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
      run: |
        # -p: do not fail if the directory already exists
        mkdir -p .mypy_cache
        # Left unquoted on purpose so that each file becomes its own argument
        hatch run test:types $ALL_CHANGED_FILES

    - name: Pylint
      if: steps.files.outputs.any_changed == 'true'
      env:
        # Same reason as in the Mypy step: keep untrusted names out of the script text
        ALL_CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
      run: |
        # Left unquoted on purpose so that each file becomes its own argument
        hatch run test:lint $ALL_CHANGED_FILES

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      shell: bash
      if: (success() || failure()) && github.ref_name == 'main'
      run: |
        if [ "${{ job.status }}" = "success" ]; then
          echo "alert_type=success" >> "$GITHUB_OUTPUT";
        else
          echo "alert_type=error" >> "$GITHUB_OUTPUT";
        fi

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# After `unit-tests` succeeds, runs the integration tests on Ubuntu with a
# Tika service container, reusing the Hatch `test` environment restored from
# the cache, and on the `main` branch only reports the outcome to Datadog.
integration-tests-linux:
  name: Integration / ubuntu-latest
  needs: unit-tests
  runs-on: ubuntu-latest
  services:
    tika:
      image: apache/tika:2.9.0.0
      ports:
        # Quoted so the mapping is always read as a string
        - "9998:9998"
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: "${{ env.PYTHON_VERSION }}"

    - name: Install Hatch
      id: hatch
      shell: bash
      run: |
        pip install hatch==${{ env.HATCH_VERSION }}
        # Expose the location of the Hatch `test` environment to later steps
        echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"

    # Same per-OS, per-commit key that the `unit-tests` job saves under
    - uses: actions/cache/restore@v4
      id: cache
      with:
        path: ${{ steps.hatch.outputs.env }}
        key: ${{ runner.os }}-${{ github.sha }}

    - name: Install dependencies
      run: |
        # apt-get is the interface intended for scripts; -y keeps the install
        # non-interactive without relying on the runner image's apt settings
        sudo apt-get update
        sudo apt-get install -y ffmpeg # for local Whisper tests

    - name: Run
      run: hatch run test:integration

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      shell: bash
      if: (success() || failure()) && github.ref_name == 'main'
      run: |
        if [ "${{ job.status }}" = "success" ]; then
          echo "alert_type=success" >> "$GITHUB_OUTPUT";
        else
          echo "alert_type=error" >> "$GITHUB_OUTPUT";
        fi

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# After `unit-tests` succeeds, runs the macOS integration tests, reusing the
# Hatch `test` environment restored from the cache, and on the `main` branch
# only reports the outcome to Datadog.
integration-tests-macos:
  name: Integration / macos-latest
  needs: unit-tests
  runs-on: macos-latest
  env:
    # Environment values reach the process as strings; quoted to make that explicit
    HAYSTACK_MPS_ENABLED: "false"
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: '${{ env.PYTHON_VERSION }}'

    - name: Install Hatch
      id: hatch
      shell: bash
      run: |
        pip install hatch==${{ env.HATCH_VERSION }}
        echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"

    # Same per-OS, per-commit key that the `unit-tests` job saves under
    - id: cache
      uses: actions/cache/restore@v4
      with:
        key: ${{ runner.os }}-${{ github.sha }}
        path: ${{ steps.hatch.outputs.env }}

    - name: Install dependencies
      run: |
        brew install ffmpeg # for local Whisper tests

    - name: Run
      run: hatch run test:integration-mac

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      if: (success() || failure()) && github.ref_name == 'main'
      shell: bash
      run: |
        alert_type=error
        if [ "${{ job.status }}" = "success" ]; then
          alert_type=success
        fi
        echo "alert_type=${alert_type}" >> "$GITHUB_OUTPUT";

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# After `unit-tests` succeeds, runs the Windows integration tests, reusing
# the Hatch `test` environment restored from the cache, and on the `main`
# branch only reports the outcome to Datadog.
integration-tests-windows:
  name: Integration / windows-latest
  needs: unit-tests
  runs-on: windows-latest
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: '${{ env.PYTHON_VERSION }}'

    - name: Install Hatch
      id: hatch
      # Explicit bash: the script below uses bash syntax
      shell: bash
      run: |
        pip install hatch==${{ env.HATCH_VERSION }}
        echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"

    # Same per-OS, per-commit key that the `unit-tests` job saves under
    - id: cache
      uses: actions/cache/restore@v4
      with:
        key: ${{ runner.os }}-${{ github.sha }}
        path: ${{ steps.hatch.outputs.env }}

    - name: Run
      run: hatch run test:integration-windows

    # Translates the job status into the `alert_type` output consumed by the
    # Datadog step. Runs after both passing and failing steps, on `main` only.
    - name: Calculate alert data
      id: calculator
      if: (success() || failure()) && github.ref_name == 'main'
      shell: bash
      run: |
        alert_type=error
        if [ "${{ job.status }}" = "success" ]; then
          alert_type=success
        fi
        echo "alert_type=${alert_type}" >> "$GITHUB_OUTPUT";

    - name: Send event to Datadog
      if: (success() || failure()) && github.ref_name == 'main'
      uses: masci/datadog@v1
      with:
        api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
        api-url: https://api.datadoghq.eu
        # `events` is handed to the action as a single multi-line string
        events: |
          - title: "${{ github.workflow }} workflow"
            text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
            alert_type: "${{ steps.calculator.outputs.alert_type }}"
            source_type_name: "Github"
            host: ${{ github.repository_owner }}
            tags:
              - "project:${{ github.repository }}"
              - "job:${{ github.job }}"
              - "run_id:${{ github.run_id }}"
              - "workflow:${{ github.workflow }}"
              - "branch:${{ github.ref_name }}"
              - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Job (entry of the top-level `jobs:` mapping).
# Runs only after all three integration-test jobs have succeeded, so this
# single job can be marked as the required check and skipped accordingly.
# It calls the local reusable workflow, telling it the tests were not skipped.
trigger-catch-all:
  name: Tests completed
  needs: [integration-tests-linux, integration-tests-macos, integration-tests-windows]
  uses: ./.github/workflows/tests_skipper_workflow.yml
  with:
    tests_were_skipped: false