Fix / update LLM Complete Guide (side quest) (#134)
* remove extra step invocation

* update requirements

* fixes and updates

* update chunking pipeline logic

* handle JSON changes

* formatting

* add a typos.toml

* update typos.toml

* move typos.toml to repo root
strickvl authored Oct 23, 2024
1 parent 42f952a commit cbe263d
Showing 12 changed files with 171 additions and 89 deletions.
53 changes: 26 additions & 27 deletions .typos.toml
@@ -1,37 +1,36 @@
[files]
extend-exclude = [
"*.csv",
"sign-language-detection-yolov5/*",
"orbit-user-analysis/steps/report.py",
"customer-satisfaction/pipelines/deployment_pipeline.py",
"customer-satisfaction/streamlit_app.py",
"nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb",
"customer-satisfaction/tests/data_test.py",
"end-to-end-computer-vision/**/*.ipynb",
"classifier-e2e/run_skip_basics.ipynb",
"classifier-e2e/run_full.ipynb",
"classifier-e2e/run_skip_basics.ipynb",
"classifier-e2e/run_full.ipynb",
"classifier-e2e/run_skip_basics.ipynb"
"*.json",
"*.js",
"*.ipynb",
]

[default.extend-identifiers]
# HashiCorp = "HashiCorp"
connexion = "connexion"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"
HashiCorp = "HashiCorp"
NDArray = "NDArray"
K_Scatch = "K_Scatch"
MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h = "MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h"
VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi = "VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi"
MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1 = "MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1"

[default.extend-words]
# aks = "aks"
GOES = "GOES"
lenght = "lenght"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"
Implicitly = "Implicitly"
fo = "fo"
mapp = "mapp"
polution = "polution"
magent = "magent"
# Don't correct the surname "Teh"
aks = "aks"
hashi = "hashi"
womens = "womens"
prepend = "prepend"
prepended = "prepended"
goes = "goes"
bare = "bare"
prepending = "prepending"
prev = "prev"
creat = "creat"
ret = "ret"
daa = "daa"
arange = "arange"
cachable = "cachable"
OT = "OT"
cll = "cll"

[default]
locale = "en-us"
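
For context, `.typos.toml` configures the `typos` spell-checker: `[files].extend-exclude` lists glob patterns to skip, while the `extend-identifiers` and `extend-words` tables whitelist strings that would otherwise be flagged. Below is a minimal sketch of what the exclusion globs mean, using only the standard library; the real tool uses gitignore-style matching, so `fnmatch` is only an approximation, and the example paths are hypothetical.

```python
import fnmatch
import tomllib  # Python 3.11+

with open(".typos.toml", "rb") as f:
    config = tomllib.load(f)

exclude_patterns = config["files"]["extend-exclude"]

def is_excluded(path: str) -> bool:
    # Approximate the tool's behaviour: skip any path matching an exclude glob.
    return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)

print(is_excluded("llm-complete-guide/notebooks/demo.ipynb"))  # True, via "*.ipynb"
print(is_excluded("llm-complete-guide/run.py"))                # False, so it gets checked
```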
4 changes: 2 additions & 2 deletions llm-complete-guide/pipelines/generate_chunk_questions.py
@@ -19,10 +19,10 @@
from zenml.client import Client


@pipeline
@pipeline(enable_cache=False)
def generate_chunk_questions():
"""Pipeline to generate questions from chunks."""
local_setting = ExternalArtifact(value=True)
local_setting = ExternalArtifact(value=False)
client = Client()
docs_with_embeddings = client.get_artifact_version(
name_id_or_prefix="documents_with_embeddings"
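
The two changes here disable caching on the pipeline (so question generation re-runs instead of returning cached outputs) and flip the `local_setting` flag passed in as an `ExternalArtifact` to `False`. A minimal sketch of the same pattern follows, assuming recent ZenML import paths; the step body and its behaviour are placeholders for the real question-generation step.

```python
from zenml import pipeline, step
from zenml.artifacts.external_artifact import ExternalArtifact
from zenml.client import Client


@step
def generate_questions(docs: str, local: bool) -> None:
    # Placeholder for the real question-generation step.
    print(f"local={local}, got {len(docs)} characters of documents")


@pipeline(enable_cache=False)  # always re-run; never reuse cached step outputs
def demo_generate_chunk_questions() -> None:
    local_setting = ExternalArtifact(value=False)  # plain value passed in as an artifact
    docs_with_embeddings = Client().get_artifact_version(
        name_id_or_prefix="documents_with_embeddings"
    )
    generate_questions(docs=docs_with_embeddings, local=local_setting)


if __name__ == "__main__":
    demo_generate_chunk_questions()
```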
2 changes: 1 addition & 1 deletion llm-complete-guide/requirements.txt
@@ -17,7 +17,7 @@ tiktoken
umap-learn
matplotlib
pyarrow
rerankers[all]
rerankers[flashrank]
datasets

# optional requirements for S3 artifact store
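
Narrowing `rerankers[all]` to `rerankers[flashrank]` installs only the FlashRank backend the guide's reranking step actually uses, keeping the dependency set smaller. A rough usage sketch, assuming the `rerankers` package exposes a `Reranker(...).rank(...)` API along these lines; exact constructor arguments and result types may differ between versions, and the query and documents are made up.

```python
from rerankers import Reranker

ranker = Reranker("flashrank")  # FlashRank backend; pulled in by rerankers[flashrank]

results = ranker.rank(
    query="How do I configure a ZenML artifact store?",
    docs=[
        "ZenML supports S3, GCS, and Azure artifact stores.",
        "Matplotlib draws bar charts.",
    ],
)
print(results.top_k(1))  # best-matching document first
```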
12 changes: 12 additions & 0 deletions llm-complete-guide/run.py
@@ -42,6 +42,7 @@
from materializers.document_materializer import DocumentMaterializer
from pipelines import (
finetune_embeddings,
generate_chunk_questions,
generate_synthetic_data,
llm_basic_rag,
llm_eval,
@@ -145,6 +146,13 @@
default=False,
help="Whether to use the reranker.",
)
@click.option(
"--chunks",
"chunks",
is_flag=True,
default=False,
help="Generate chunks for Hugging Face dataset",
)
def main(
rag: bool = False,
evaluation: bool = False,
@@ -157,6 +165,7 @@ def main(
dummyembeddings: bool = False,
argilla: bool = False,
reranked: bool = False,
chunks: bool = False,
):
"""Main entry point for the pipeline execution.
@@ -170,6 +179,7 @@ def main(
local (bool): If `True`, the local LLM via Ollama will be used.
embeddings (bool): If `True`, the embeddings will be fine-tuned.
argilla (bool): If `True`, the Argilla annotations will be used.
chunks (bool): If `True`, the chunks pipeline will be run.
"""
pipeline_args = {"enable_cache": not no_cache}
embeddings_finetune_args = {
@@ -201,6 +211,8 @@ def main(
finetune_embeddings.with_options(**embeddings_finetune_args)()
if dummyembeddings:
chunking_experiment.with_options(**pipeline_args)()
if chunks:
generate_chunk_questions.with_options(**pipeline_args)()


if __name__ == "__main__":
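
The new `--chunks` flag wires the `generate_chunk_questions` pipeline into the CLI, so it can be launched like the other pipelines (roughly `python run.py --chunks` from the `llm-complete-guide` directory). A small sketch using Click's test runner to exercise the flag without a shell; it assumes a configured ZenML stack and that `run.py` is importable from the current working directory.

```python
from click.testing import CliRunner

from run import main  # llm-complete-guide/run.py

runner = CliRunner()
result = runner.invoke(main, ["--chunks"])
print(result.exit_code, result.output)
```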
2 changes: 1 addition & 1 deletion llm-complete-guide/steps/eval_retrieval.py
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(

if all(url_ending not in url for url in urls):
logging.error(
f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
)
failures += 1

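
The log message is updated to match what the check actually does: `url_ending not in url` is a substring-containment test, not a suffix test, so "containing" is the accurate wording. A tiny illustration with a made-up URL:

```python
url = "https://docs.zenml.io/how-to/artifact-stores#s3"  # hypothetical example URL
url_ending = "artifact-stores"

print(url_ending in url)         # True  -> what the evaluation actually checks
print(url.endswith(url_ending))  # False -> what "ending" would have implied
```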
18 changes: 16 additions & 2 deletions llm-complete-guide/steps/finetune_embeddings.py
@@ -373,7 +373,14 @@ def visualize_results(
color="red",
)
for i, v in enumerate(finetuned_values):
ax.text(v - 1.5, i - height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.text(
v - 1.5,
i - height / 2,
f"{v:.1f}",
va="center",
ha="right",
color="white",
)
ax.barh(
[i + height / 2 for i in y],
base_values,
@@ -382,7 +389,14 @@
color="blue",
)
for i, v in enumerate(base_values):
ax.text(v - 1.5, i + height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.text(
v - 1.5,
i + height / 2,
f"{v:.1f}",
va="center",
ha="right",
color="white",
)

ax.set_xlabel("Scores (%)")
ax.set_title("Evaluation Results")
3 changes: 0 additions & 3 deletions llm-complete-guide/steps/hf_dataset_loader.py
@@ -29,6 +29,3 @@ def load_hf_dataset() -> (
train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train")
test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test")
return train_dataset, test_dataset


load_hf_dataset()
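
The deleted `load_hf_dataset()` call at module scope is the "extra step invocation" from the commit message: a `@step`-decorated function should only be called from inside a pipeline, not as a side effect of importing the module. A minimal sketch of the intended usage, with a placeholder dataset name and an illustrative pipeline:

```python
from datasets import load_dataset
from zenml import pipeline, step

DATASET_NAME = "your-org/your-dataset"  # placeholder; the real name comes from constants.py


@step
def load_hf_dataset():
    train_dataset = load_dataset(DATASET_NAME, split="train")
    test_dataset = load_dataset(DATASET_NAME, split="test")
    return train_dataset, test_dataset


@pipeline
def embeddings_pipeline() -> None:
    load_hf_dataset()  # correct: the step is invoked inside a pipeline


if __name__ == "__main__":
    embeddings_pipeline()  # not load_hf_dataset() at module scope
```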
61 changes: 39 additions & 22 deletions llm-complete-guide/steps/populate_index.py
@@ -19,9 +19,10 @@
# https://www.timescale.com/blog/postgresql-as-a-vector-database-create-store-and-query-openai-embeddings-with-pgvector/
# for providing the base implementation for this indexing functionality

import json
import logging
import math
from typing import Annotated, List
from typing import Annotated

from constants import (
CHUNK_OVERLAP,
@@ -41,16 +42,16 @@

@step
def preprocess_documents(
documents: List[Document],
) -> Annotated[List[Document], ArtifactConfig(name="split_chunks")]:
documents: str,
) -> Annotated[str, ArtifactConfig(name="split_chunks")]:
"""
Preprocesses a list of documents by splitting them into chunks.
Preprocesses a JSON string of documents by splitting them into chunks.
Args:
documents (List[Document]): A list of documents to be preprocessed.
documents (str): A JSON string containing a list of documents to be preprocessed.
Returns:
Annotated[List[Document], ArtifactConfig(name="split_chunks")]: A list of preprocessed documents annotated with an ArtifactConfig.
Annotated[str, ArtifactConfig(name="split_chunks")]: A JSON string containing a list of preprocessed documents annotated with an ArtifactConfig.
Raises:
Exception: If an error occurs during preprocessing.
@@ -64,29 +65,34 @@ def preprocess_documents(
},
)

# Parse the JSON string into a list of Document objects
document_list = [Document(**doc) for doc in json.loads(documents)]

split_docs = split_documents(
documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
document_list, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
return split_docs

# Convert the list of Document objects back to a JSON string
split_docs_json = json.dumps([doc.__dict__ for doc in split_docs])

return split_docs_json
except Exception as e:
logger.error(f"Error in preprocess_documents: {e}")
raise


@step
def generate_embeddings(
split_documents: List[Document],
) -> Annotated[
List[Document], ArtifactConfig(name="documents_with_embeddings")
]:
split_documents: str,
) -> Annotated[str, ArtifactConfig(name="documents_with_embeddings")]:
"""
Generates embeddings for a list of split documents using a SentenceTransformer model.
Args:
split_documents (List[Document]): A list of Document objects that have been split into chunks.
Returns:
Annotated[List[Document], ArtifactConfig(name="embeddings")]: The list of Document objects with generated embeddings, annotated with an ArtifactConfig.
Annotated[str, ArtifactConfig(name="documents_with_embeddings")]: A JSON string containing the Document objects with generated embeddings, annotated with an ArtifactConfig.
Raises:
Exception: If an error occurs during the generation of embeddings.
@@ -95,28 +101,36 @@ def generate_embeddings(
model = SentenceTransformer(EMBEDDINGS_MODEL)

log_artifact_metadata(
artifact_name="embeddings",
artifact_name="documents_with_embeddings",
metadata={
"embedding_type": EMBEDDINGS_MODEL,
"embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
},
)

document_texts = [doc.page_content for doc in split_documents]
# Parse the JSON string into a list of Document objects
document_list = [
Document(**doc) for doc in json.loads(split_documents)
]

document_texts = [doc.page_content for doc in document_list]
embeddings = model.encode(document_texts)

for doc, embedding in zip(split_documents, embeddings):
doc.embedding = embedding
for doc, embedding in zip(document_list, embeddings):
doc.embedding = embedding.tolist()

# Convert the list of Document objects to a JSON string
documents_json = json.dumps([doc.__dict__ for doc in document_list])

return split_documents
return documents_json
except Exception as e:
logger.error(f"Error in generate_embeddings: {e}")
raise


@step
def index_generator(
documents: List[Document],
documents: str,
) -> None:
"""Generates an index for the given documents.
@@ -126,7 +140,7 @@ def index_generator(
using the cosine distance measure.
Args:
documents (List[Document]): The list of Document objects with generated embeddings.
documents (str): A JSON string containing the Document objects with generated embeddings.
Raises:
Exception: If an error occurs during the index generation.
@@ -155,11 +169,14 @@

register_vector(conn)

# Parse the JSON string into a list of Document objects
document_list = [Document(**doc) for doc in json.loads(documents)]

# Insert data only if it doesn't already exist
for doc in documents:
for doc in document_list:
content = doc.page_content
token_count = doc.token_count
embedding = doc.embedding.tolist()
embedding = doc.embedding
filename = doc.filename
parent_section = doc.parent_section
url = doc.url
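
This is the "handle JSON changes" part of the commit: documents now travel between steps as JSON strings rather than `List[Document]`, each consuming step re-hydrates them with `Document(**doc)`, and embeddings are stored as plain lists so they serialize. A round-trip sketch with a stand-in `Document` dataclass (field names follow the diff; the real class lives in the project's materializers):

```python
import json
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Document:
    page_content: str
    filename: str = ""
    parent_section: str = ""
    url: str = ""
    token_count: int = 0
    embedding: Optional[List[float]] = None


docs = [Document(page_content="ZenML pipelines are composed of steps.", url="docs/steps")]

# Producing step: attach embeddings as plain lists (hence `embedding.tolist()` in the
# diff), then serialize the whole batch to a JSON string for the next step.
docs[0].embedding = [0.12, -0.03, 0.44]  # stand-in for model.encode(...) output
payload = json.dumps([doc.__dict__ for doc in docs])

# Consuming step: re-hydrate the documents exactly as preprocess/generate/index do.
restored = [Document(**d) for d in json.loads(payload)]
assert restored[0].embedding == [0.12, -0.03, 0.44]
```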
