Merge branch 'deepset-ai:main' into ci-deepset-ai#5931-isort

mjspeck · Oct 18, 2023 · 49b7dd9 · 49b7dd9
2 parents 682ddec + 21d894d
commit 49b7dd9
Show file tree

Hide file tree

Showing 130 changed files with 3,887 additions and 648 deletions.
diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml
@@ -42,7 +42,9 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[all,dev]
+        run: |
+          pip install --upgrade pip
+          pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
 
       - name: Run
         run: pytest examples/

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -6,6 +6,9 @@ on:
     paths:
       - "**.py"
       - "**/pyproject.toml"
+      - "!haystack/preview/**/*.py"
+      - "!test/preview/**/*.py"
+      - "!e2e/preview/**/*.py"
 
 env:
   PYTHON_VERSION: "3.8"

diff --git a/.github/workflows/linting_preview.yml b/.github/workflows/linting_preview.yml
@@ -0,0 +1,81 @@
+# If you change this name, also change it in linting_skipper.yml and ci_metrics.yml
+name: Linting (Preview)
+
+on:
+  pull_request:
+    paths:
+      - "haystack/preview/**/*.py"
+      - "test/preview/**/*.py"
+      - "e2e/preview/**/*.py"
+      - "**/pyproject.toml"
+
+env:
+  PYTHON_VERSION: "3.8"
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            **/*.py
+          files_ignore: |
+            test/**
+            rest_api/test/**
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Haystack
+        run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+
+      # Type-check only the Python files changed in this pull request.
+      - name: Mypy
+        if: steps.files.outputs.any_changed == 'true'
+        env:
+          # Pass the file list through the environment instead of expanding the
+          # expression inside the script: file names come from the pull request,
+          # so expanding them inline would let a crafted name run shell commands.
+          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
+        run: |
+          # -p keeps the step from failing if the cache directory already exists
+          mkdir -p .mypy_cache/
+          # CHANGED_FILES is intentionally unquoted: it is a space-separated list
+          mypy --install-types --non-interactive $CHANGED_FILES --exclude=rest_api/build/ --exclude=rest_api/test/
+
+  pylint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            **/*.py
+          files_ignore: |
+            test/**
+            rest_api/test/**
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Haystack
+        run: |
+          pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+          pip install ./haystack-linter
+
+      # Lint only the Python files changed in this pull request.
+      - name: Pylint
+        if: steps.files.outputs.any_changed == 'true'
+        env:
+          # Pass the file list through the environment instead of expanding the
+          # expression inside the script: file names come from the pull request,
+          # so expanding them inline would let a crafted name run shell commands.
+          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
+        run: |
+          # CHANGED_FILES is intentionally unquoted: it is a space-separated list
+          pylint -ry -j 0 $CHANGED_FILES
diff --git a/.github/workflows/linting_skipper.yml b/.github/workflows/linting_skipper.yml
@@ -6,6 +6,9 @@ on:
     paths-ignore:
       - "**.py"
       - "**/pyproject.toml"
+      - "!haystack/preview/**/*.py"
+      - "!test/preview/**/*.py"
+      - "!e2e/preview/**/*.py"
 
 jobs:
   mypy:

diff --git a/.github/workflows/preview_imports.yml b/.github/workflows/preview_imports.yml
@@ -0,0 +1,55 @@
+name: Verify preview imports only preview
+
+on:
+  pull_request:
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - ready_for_review
+    paths:
+      - "haystack/preview/**.py"
+
+jobs:
+  verify-imports:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            haystack/preview/**.py
+
+      # Fail the job when a changed file under haystack/preview has an import
+      # line that starts with `from haystack` / `import haystack` and is not
+      # followed by `.preview` or ` import preview`.
+      - name: Check imports
+        shell: python
+        env:
+          # Hand the file list to the script through the environment instead of
+          # interpolating it into the Python source: a crafted file name (for
+          # example one containing a double quote) could otherwise inject code.
+          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
+        run: |
+          import os
+          import re
+          import sys
+
+          regex = r"^(from haystack|import haystack)(?!\.preview| import preview)(.*)"
+
+          # Space-separated list of paths; an empty value yields an empty list
+          changed_files = os.environ["CHANGED_FILES"].split()
+          matches = {}
+          for path in changed_files:
+            with open(path, "r") as f:
+              file_matches = []
+              for line in f:
+                # Strip first so indented (function-level) imports are caught too
+                file_matches.extend(re.finditer(regex, line.strip()))
+              if file_matches:
+                matches[path] = file_matches
+
+          for path, match in matches.items():
+            print(f"Bad imports in file '{path}'")
+            for m in match:
+              print(m.group())
+            print()
+
+          if matches:
+            # `::error::` is a workflow command that surfaces the message as an annotation
+            print("::error:: Imports in haystack.preview can only import from haystack.preview")
+            sys.exit(1)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -19,6 +19,7 @@ on:
       - "pyproject.toml"
       - "!haystack/preview/**/*.py"  # See tests_preview.yml
       - "!test/preview/**/*.py"  # See tests_preview.yml
+      - "!e2e/preview/**/*.py"  # See e2e_preview.yml
       - "!.github/**/*.py"
       - "!rest_api/**/*.py"
       - "!docs/**/*.py"
@@ -124,10 +125,10 @@ jobs:
         include:
           - topic: document_stores
             os: ubuntu-latest
-            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
           - topic: document_stores
             os: windows-latest
-            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
@@ -329,7 +330,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     services:
       elasticsearch:
-        image: elasticsearch:8.8.0
+        image: elasticsearch:8.10.2
         env:
           discovery.type: "single-node"
           xpack.security.enabled: "false"
@@ -346,9 +347,36 @@ jobs:
       - name: Install Haystack
         run: pip install .[elasticsearch8,dev,preprocessing,inference]
 
+      - name: Make elasticsearch comfortable with a disk almost full
+        run: |
+          curl -X PUT "localhost:9200/_cluster/settings?pretty" -H 'Content-Type: application/json' -d'
+          {
+            "persistent": {
+              "cluster.routing.allocation.disk.watermark.low": "90%",
+              "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
+              "cluster.routing.allocation.disk.watermark.high": "95%",
+              "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+              "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+              "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+              "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+              "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
+            }
+          }
+          '
+          curl -X PUT "localhost:9200/*/_settings?expand_wildcards=all&pretty" -H 'Content-Type: application/json' -d'
+          {
+            "index.blocks.read_only_allow_delete": null
+          }
+          '
+
       - name: Run tests
         run: |
-          pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_elasticsearch.py
+          pytest -x -m"document_store and integration" test/document_stores/test_elasticsearch.py
+
+      - name: logs
+        if: failure()
+        run: |
+          docker logs "${{ job.services.elasticsearch.id }}"
 
       - name: Calculate alert data
         id: calculator

diff --git a/.github/workflows/tests_skipper.yml b/.github/workflows/tests_skipper.yml
@@ -10,9 +10,10 @@ on:
       - ready_for_review
     paths-ignore:
       - "**.py"
+      - "pyproject.toml"
       - "!haystack/preview/**/*.py"  # See tests_preview.yml
       - "!test/preview/**/*.py"  # See tests_preview.yml
-      - "pyproject.toml"
+      - "!e2e/preview/**/*.py"  # See e2e_preview.yml
       - "!.github/**/*.py"
       - "!rest_api/**/*.py"
       - "!docs/**/*.py"

diff --git a/README.md b/README.md
@@ -9,7 +9,57 @@
 | Meta    | ![Discord](https://img.shields.io/discord/993534733298450452?logo=discord) ![Twitter Follow](https://img.shields.io/twitter/follow/deepset_ai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
 </div>
 
-[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.
+[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.
+
+## Quickstart
+
+Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data.
+
+Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach 👇
+
+First, run the minimal Haystack installation:
+
+```sh
+pip install farm-haystack
+```
+
+Then, index your data into the DocumentStore, build a RAG pipeline, and ask a question about your data:
+
+```python
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.utils import build_pipeline, add_example_data, print_answers
+
+# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
+provider = "openai"
+API_KEY = "sk-..." # ADD YOUR KEY HERE
+
+# We support many different databases. Here, we load a simple and lightweight in-memory database.
+document_store = InMemoryDocumentStore(use_bm25=True)
+
+# Download and add Game of Thrones TXT articles to Haystack DocumentStore.
+# You can also provide a folder with your local documents.
+add_example_data(document_store, "data/GoT_getting_started")
+
+# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt.
+pipeline = build_pipeline(provider, API_KEY, document_store)
+
+# Ask a question on the data you just added.
+result = pipeline.run(query="Who is the father of Arya Stark?")
+
+# For details, like which documents were used to generate the answer, look into the <result> object
+print_answers(result, details="medium")
+```
+
+The output of the pipeline will reference the documents used to generate the answer:
+
+```
+'Query: Who is the father of Arya Stark?'
+'Answers:'
+[{'answer': 'The father of Arya Stark is Lord Eddard Stark of '
+                'Winterfell. [Document 1, Document 4, Document 5]'}]
+```
+
+Congratulations, you have just built your first Haystack app!
 
 ## Core Concepts
 

diff --git a/annotation_tool/docker-compose.yml b/annotation_tool/docker-compose.yml
@@ -13,6 +13,7 @@ services:
       # DEFAULT_ADMIN_PASSWORD: "DEMO_PASSWORD"
       # COOKIE_KEYS: "somesafecookiekeys"
       # JWT_SECRET: "somesafesecret"
+      # DOMAIN_WHITELIST: "*"
     ports:
       - "7001:7001"
     links:

diff --git a/e2e/preview/pipelines/test_extractive_qa_pipeline.py b/e2e/preview/pipelines/test_extractive_qa_pipeline.py
@@ -1,25 +1,39 @@
+import json
+
 from haystack.preview import Pipeline, Document
-from haystack.preview.document_stores import MemoryDocumentStore
-from haystack.preview.components.retrievers import MemoryBM25Retriever
+from haystack.preview.document_stores import InMemoryDocumentStore
+from haystack.preview.components.retrievers import InMemoryBM25Retriever
 from haystack.preview.components.readers import ExtractiveReader
 
 
-def test_extractive_qa_pipeline():
-    document_store = MemoryDocumentStore()
+def test_extractive_qa_pipeline(tmp_path):
+    # Create the pipeline
+    qa_pipeline = Pipeline()
+    qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
+    qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
+    qa_pipeline.connect("retriever", "reader")
+
+    # Draw the pipeline
+    qa_pipeline.draw(tmp_path / "test_extractive_qa_pipeline.png")
+
+    # Serialize the pipeline to JSON
+    with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:
+        print(json.dumps(qa_pipeline.to_dict(), indent=4))
+        json.dump(qa_pipeline.to_dict(), f)
 
+    # Load the pipeline back
+    with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:
+        qa_pipeline = Pipeline.from_dict(json.load(f))
+
+    # Populate the document store
     documents = [
         Document(text="My name is Jean and I live in Paris."),
         Document(text="My name is Mark and I live in Berlin."),
         Document(text="My name is Giorgio and I live in Rome."),
     ]
+    qa_pipeline.get_component("retriever").document_store.write_documents(documents)
 
-    document_store.write_documents(documents)
-
-    qa_pipeline = Pipeline()
-    qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever")
-    qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
-    qa_pipeline.connect("retriever", "reader")
-
+    # Query and assert
     questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
     answers_spywords = ["Jean", "Mark", "Giorgio"]