Fix / update LLM Complete Guide (side quest) (#134)
* remove extra step invocation

* update requirements

* fixes and updates

* update chunking pipeline logic

* handle JSON changes

* formatting

* add a typos.toml

* update typos.toml

* move typos.toml to repo root
strickvl authored Oct 23, 2024
1 parent 42f952a commit cbe263d
Showing 12 changed files with 171 additions and 89 deletions.
53 changes: 26 additions & 27 deletions .typos.toml
@@ -1,37 +1,36 @@
[files]
extend-exclude = [
"*.csv",
"sign-language-detection-yolov5/*",
"orbit-user-analysis/steps/report.py",
"customer-satisfaction/pipelines/deployment_pipeline.py",
"customer-satisfaction/streamlit_app.py",
"nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb",
"customer-satisfaction/tests/data_test.py",
"end-to-end-computer-vision/**/*.ipynb",
"classifier-e2e/run_skip_basics.ipynb",
"classifier-e2e/run_full.ipynb",
"classifier-e2e/run_skip_basics.ipynb",
"classifier-e2e/run_full.ipynb",
"classifier-e2e/run_skip_basics.ipynb"
"*.json",
"*.js",
"*.ipynb",
]

[default.extend-identifiers]
# HashiCorp = "HashiCorp"
connexion = "connexion"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"
HashiCorp = "HashiCorp"
NDArray = "NDArray"
K_Scatch = "K_Scatch"
MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h = "MCAGA1UECgwZQW1hem9uIFdlYiBTZXJ2aWNlcywgSW5jLjETMBEGA1UECwwKQW1h"
VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi = "VQQGEwJVUzEQMA4GA1UEBwwHU2VhdHRsZTETMBEGA1UECAwKV2FzaGluZ3RvbjEi"
MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1 = "MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1"

[default.extend-words]
# aks = "aks"
GOES = "GOES"
lenght = "lenght"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"
Implicitly = "Implicitly"
fo = "fo"
mapp = "mapp"
polution = "polution"
magent = "magent"
# Don't correct the surname "Teh"
aks = "aks"
hashi = "hashi"
womens = "womens"
prepend = "prepend"
prepended = "prepended"
goes = "goes"
bare = "bare"
prepending = "prepending"
prev = "prev"
creat = "creat"
ret = "ret"
daa = "daa"
arange = "arange"
cachable = "cachable"
OT = "OT"
cll = "cll"

[default]
locale = "en-us"
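
For context, `.typos.toml` configures the `typos` spell-checker: `[files].extend-exclude` lists glob patterns to skip, while the `extend-identifiers` and `extend-words` tables whitelist strings that would otherwise be flagged. Below is a minimal sketch of what the exclusion globs mean, using only the standard library; the real tool uses gitignore-style matching, so `fnmatch` is only an approximation, and the example paths are hypothetical.

```python
import fnmatch
import tomllib  # Python 3.11+

with open(".typos.toml", "rb") as f:
    config = tomllib.load(f)

exclude_patterns = config["files"]["extend-exclude"]

def is_excluded(path: str) -> bool:
    # Approximate the tool's behaviour: skip any path matching an exclude glob.
    return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)

print(is_excluded("llm-complete-guide/notebooks/demo.ipynb"))  # True, via "*.ipynb"
print(is_excluded("llm-complete-guide/run.py"))                # False, so it gets checked
```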
4 changes: 2 additions & 2 deletions llm-complete-guide/pipelines/generate_chunk_questions.py
@@ -19,10 +19,10 @@
from zenml.client import Client


@pipeline
@pipeline(enable_cache=False)
def generate_chunk_questions():
"""Pipeline to generate questions from chunks."""
local_setting = ExternalArtifact(value=True)
local_setting = ExternalArtifact(value=False)
client = Client()
docs_with_embeddings = client.get_artifact_version(
name_id_or_prefix="documents_with_embeddings"
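
The two changes here disable caching on the pipeline (so question generation re-runs instead of returning cached outputs) and flip the `local_setting` flag passed in as an `ExternalArtifact` to `False`. A minimal sketch of the same pattern follows, assuming recent ZenML import paths; the step body and its behaviour are placeholders for the real question-generation step.

```python
from zenml import pipeline, step
from zenml.artifacts.external_artifact import ExternalArtifact
from zenml.client import Client


@step
def generate_questions(docs: str, local: bool) -> None:
    # Placeholder for the real question-generation step.
    print(f"local={local}, got {len(docs)} characters of documents")


@pipeline(enable_cache=False)  # always re-run; never reuse cached step outputs
def demo_generate_chunk_questions() -> None:
    local_setting = ExternalArtifact(value=False)  # plain value passed in as an artifact
    docs_with_embeddings = Client().get_artifact_version(
        name_id_or_prefix="documents_with_embeddings"
    )
    generate_questions(docs=docs_with_embeddings, local=local_setting)


if __name__ == "__main__":
    demo_generate_chunk_questions()
```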
2 changes: 1 addition & 1 deletion llm-complete-guide/requirements.txt
@@ -17,7 +17,7 @@ tiktoken
umap-learn
matplotlib
pyarrow
rerankers[all]
rerankers[flashrank]
datasets

# optional requirements for S3 artifact store
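
Narrowing `rerankers[all]` to `rerankers[flashrank]` installs only the FlashRank backend the guide's reranking step actually uses, keeping the dependency set smaller. A rough usage sketch, assuming the `rerankers` package exposes a `Reranker(...).rank(...)` API along these lines; exact constructor arguments and result types may differ between versions, and the query and documents are made up.

```python
from rerankers import Reranker

ranker = Reranker("flashrank")  # FlashRank backend; pulled in by rerankers[flashrank]

results = ranker.rank(
    query="How do I configure a ZenML artifact store?",
    docs=[
        "ZenML supports S3, GCS, and Azure artifact stores.",
        "Matplotlib draws bar charts.",
    ],
)
print(results.top_k(1))  # best-matching document first
```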
12 changes: 12 additions & 0 deletions llm-complete-guide/run.py
@@ -42,6 +42,7 @@
from materializers.document_materializer import DocumentMaterializer
from pipelines import (
finetune_embeddings,
generate_chunk_questions,
generate_synthetic_data,
llm_basic_rag,
llm_eval,
@@ -145,6 +146,13 @@
default=False,
help="Whether to use the reranker.",
)
@click.option(
"--chunks",
"chunks",
is_flag=True,
default=False,
help="Generate chunks for Hugging Face dataset",
)
def main(
rag: bool = False,
evaluation: bool = False,
@@ -157,6 +165,7 @@ def main(
dummyembeddings: bool = False,
argilla: bool = False,
reranked: bool = False,
chunks: bool = False,
):
"""Main entry point for the pipeline execution.
@@ -170,6 +179,7 @@ def main(
local (bool): If `True`, the local LLM via Ollama will be used.
embeddings (bool): If `True`, the embeddings will be fine-tuned.
argilla (bool): If `True`, the Argilla annotations will be used.
chunks (bool): If `True`, the chunks pipeline will be run.
"""
pipeline_args = {"enable_cache": not no_cache}
embeddings_finetune_args = {
@@ -201,6 +211,8 @@ def main(
finetune_embeddings.with_options(**embeddings_finetune_args)()
if dummyembeddings:
chunking_experiment.with_options(**pipeline_args)()
if chunks:
generate_chunk_questions.with_options(**pipeline_args)()


if __name__ == "__main__":
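
The new `--chunks` flag wires the `generate_chunk_questions` pipeline into the CLI, so it can be launched like the other pipelines (roughly `python run.py --chunks` from the `llm-complete-guide` directory). A small sketch using Click's test runner to exercise the flag without a shell; it assumes a configured ZenML stack and that `run.py` is importable from the current working directory.

```python
from click.testing import CliRunner

from run import main  # llm-complete-guide/run.py

runner = CliRunner()
result = runner.invoke(main, ["--chunks"])
print(result.exit_code, result.output)
```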
2 changes: 1 addition & 1 deletion llm-complete-guide/steps/eval_retrieval.py
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(

if all(url_ending not in url for url in urls):
logging.error(
f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
)
failures += 1

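
The log message is updated to match what the check actually does: `url_ending not in url` is a substring-containment test, not a suffix test, so "containing" is the accurate wording. A tiny illustration with a made-up URL:

```python
url = "https://docs.zenml.io/how-to/artifact-stores#s3"  # hypothetical example URL
url_ending = "artifact-stores"

print(url_ending in url)         # True  -> what the evaluation actually checks
print(url.endswith(url_ending))  # False -> what "ending" would have implied
```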
18 changes: 16 additions & 2 deletions llm-complete-guide/steps/finetune_embeddings.py
@@ -373,7 +373,14 @@ def visualize_results(
color="red",
)
for i, v in enumerate(finetuned_values):
ax.text(v - 1.5, i - height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.text(
v - 1.5,
i - height / 2,
f"{v:.1f}",
va="center",
ha="right",
color="white",
)
ax.barh(
[i + height / 2 for i in y],
base_values,
@@ -382,7 +389,14 @@
color="blue",
)
for i, v in enumerate(base_values):
ax.text(v - 1.5, i + height / 2, f"{v:.1f}", va="center", ha="right", color="white")
ax.text(
v - 1.5,
i + height / 2,
f"{v:.1f}",
va="center",
ha="right",
color="white",
)

ax.set_xlabel("Scores (%)")
ax.set_title("Evaluation Results")
3 changes: 0 additions & 3 deletions llm-complete-guide/steps/hf_dataset_loader.py
@@ -29,6 +29,3 @@ def load_hf_dataset() -> (
train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train")
test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test")
return train_dataset, test_dataset


load_hf_dataset()
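
The deleted `load_hf_dataset()` call at module scope is the "extra step invocation" from the commit message: a `@step`-decorated function should only be called from inside a pipeline, not as a side effect of importing the module. A minimal sketch of the intended usage, with a placeholder dataset name and an illustrative pipeline:

```python
from datasets import load_dataset
from zenml import pipeline, step

DATASET_NAME = "your-org/your-dataset"  # placeholder; the real name comes from constants.py


@step
def load_hf_dataset():
    train_dataset = load_dataset(DATASET_NAME, split="train")
    test_dataset = load_dataset(DATASET_NAME, split="test")
    return train_dataset, test_dataset


@pipeline
def embeddings_pipeline() -> None:
    load_hf_dataset()  # correct: the step is invoked inside a pipeline


if __name__ == "__main__":
    embeddings_pipeline()  # not load_hf_dataset() at module scope
```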
61 changes: 39 additions & 22 deletions llm-complete-guide/steps/populate_index.py
@@ -19,9 +19,10 @@
# https://www.timescale.com/blog/postgresql-as-a-vector-database-create-store-and-query-openai-embeddings-with-pgvector/
# for providing the base implementation for this indexing functionality

import json
import logging
import math
from typing import Annotated, List
from typing import Annotated

from constants import (
CHUNK_OVERLAP,
@@ -41,16 +42,16 @@

@step
def preprocess_documents(
documents: List[Document],
) -> Annotated[List[Document], ArtifactConfig(name="split_chunks")]:
documents: str,
) -> Annotated[str, ArtifactConfig(name="split_chunks")]:
"""
Preprocesses a list of documents by splitting them into chunks.
Preprocesses a JSON string of documents by splitting them into chunks.
Args:
documents (List[Document]): A list of documents to be preprocessed.
documents (str): A JSON string containing a list of documents to be preprocessed.
Returns:
Annotated[List[Document], ArtifactConfig(name="split_chunks")]: A list of preprocessed documents annotated with an ArtifactConfig.
Annotated[str, ArtifactConfig(name="split_chunks")]: A JSON string containing a list of preprocessed documents annotated with an ArtifactConfig.
Raises:
Exception: If an error occurs during preprocessing.
@@ -64,29 +65,34 @@ def preprocess_documents(
},
)

# Parse the JSON string into a list of Document objects
document_list = [Document(**doc) for doc in json.loads(documents)]

split_docs = split_documents(
documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
document_list, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
return split_docs

# Convert the list of Document objects back to a JSON string
split_docs_json = json.dumps([doc.__dict__ for doc in split_docs])

return split_docs_json
except Exception as e:
logger.error(f"Error in preprocess_documents: {e}")
raise


@step
def generate_embeddings(
split_documents: List[Document],
) -> Annotated[
List[Document], ArtifactConfig(name="documents_with_embeddings")
]:
split_documents: str,
) -> Annotated[str, ArtifactConfig(name="documents_with_embeddings")]:
"""
Generates embeddings for a list of split documents using a SentenceTransformer model.
Args:
split_documents (List[Document]): A list of Document objects that have been split into chunks.
Returns:
Annotated[List[Document], ArtifactConfig(name="embeddings")]: The list of Document objects with generated embeddings, annotated with an ArtifactConfig.
Annotated[str, ArtifactConfig(name="documents_with_embeddings")]: A JSON string containing the Document objects with generated embeddings, annotated with an ArtifactConfig.
Raises:
Exception: If an error occurs during the generation of embeddings.
@@ -95,28 +101,36 @@ def generate_embeddings(
model = SentenceTransformer(EMBEDDINGS_MODEL)

log_artifact_metadata(
artifact_name="embeddings",
artifact_name="documents_with_embeddings",
metadata={
"embedding_type": EMBEDDINGS_MODEL,
"embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
},
)

document_texts = [doc.page_content for doc in split_documents]
# Parse the JSON string into a list of Document objects
document_list = [
Document(**doc) for doc in json.loads(split_documents)
]

document_texts = [doc.page_content for doc in document_list]
embeddings = model.encode(document_texts)

for doc, embedding in zip(split_documents, embeddings):
doc.embedding = embedding
for doc, embedding in zip(document_list, embeddings):
doc.embedding = embedding.tolist()

# Convert the list of Document objects to a JSON string
documents_json = json.dumps([doc.__dict__ for doc in document_list])

return split_documents
return documents_json
except Exception as e:
logger.error(f"Error in generate_embeddings: {e}")
raise


@step
def index_generator(
documents: List[Document],
documents: str,
) -> None:
"""Generates an index for the given documents.
@@ -126,7 +140,7 @@ def index_generator(
using the cosine distance measure.
Args:
documents (List[Document]): The list of Document objects with generated embeddings.
documents (str): A JSON string containing the Document objects with generated embeddings.
Raises:
Exception: If an error occurs during the index generation.
@@ -155,11 +169,14 @@

register_vector(conn)

# Parse the JSON string into a list of Document objects
document_list = [Document(**doc) for doc in json.loads(documents)]

# Insert data only if it doesn't already exist
for doc in documents:
for doc in document_list:
content = doc.page_content
token_count = doc.token_count
embedding = doc.embedding.tolist()
embedding = doc.embedding
filename = doc.filename
parent_section = doc.parent_section
url = doc.url
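
This is the "handle JSON changes" part of the commit: documents now travel between steps as JSON strings rather than `List[Document]`, each consuming step re-hydrates them with `Document(**doc)`, and embeddings are stored as plain lists so they serialize. A round-trip sketch with a stand-in `Document` dataclass (field names follow the diff; the real class lives in the project's materializers):

```python
import json
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Document:
    page_content: str
    filename: str = ""
    parent_section: str = ""
    url: str = ""
    token_count: int = 0
    embedding: Optional[List[float]] = None


docs = [Document(page_content="ZenML pipelines are composed of steps.", url="docs/steps")]

# Producing step: attach embeddings as plain lists (hence `embedding.tolist()` in the
# diff), then serialize the whole batch to a JSON string for the next step.
docs[0].embedding = [0.12, -0.03, 0.44]  # stand-in for model.encode(...) output
payload = json.dumps([doc.__dict__ for doc in docs])

# Consuming step: re-hydrate the documents exactly as preprocess/generate/index do.
restored = [Document(**d) for d in json.loads(payload)]
assert restored[0].embedding == [0.12, -0.03, 0.44]
```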
