From f926569b06ba66ae0b0005fb4d5327c9f93117c7 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 28 Jan 2025 11:22:19 -0500 Subject: [PATCH 1/4] Make lemonade examples folder --- examples/{llm => lemonade}/leap_basic.py | 0 .../{llm => lemonade}/leap_ryzenai_npu.py | 0 examples/{llm => lemonade}/leap_streaming.py | 0 examples/llm/getting_started_llms.ipynb | 184 ------------------ 4 files changed, 184 deletions(-) rename examples/{llm => lemonade}/leap_basic.py (100%) rename examples/{llm => lemonade}/leap_ryzenai_npu.py (100%) rename examples/{llm => lemonade}/leap_streaming.py (100%) delete mode 100644 examples/llm/getting_started_llms.ipynb diff --git a/examples/llm/leap_basic.py b/examples/lemonade/leap_basic.py similarity index 100% rename from examples/llm/leap_basic.py rename to examples/lemonade/leap_basic.py diff --git a/examples/llm/leap_ryzenai_npu.py b/examples/lemonade/leap_ryzenai_npu.py similarity index 100% rename from examples/llm/leap_ryzenai_npu.py rename to examples/lemonade/leap_ryzenai_npu.py diff --git a/examples/llm/leap_streaming.py b/examples/lemonade/leap_streaming.py similarity index 100% rename from examples/llm/leap_streaming.py rename to examples/lemonade/leap_streaming.py diff --git a/examples/llm/getting_started_llms.ipynb b/examples/llm/getting_started_llms.ipynb deleted file mode 100644 index 837661c..0000000 --- a/examples/llm/getting_started_llms.ipynb +++ /dev/null @@ -1,184 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LLMs on RyzenAI with TurnkeyML\n", - "\n", - "This notebook will demonstrate how to bring up an example application that uses a RyzenAI to perform LLM inference. We will use the `lemonade` APIs in order to make this as quick as possible. This notebook makes use of both the `RyzenAI NPU`, as well as the `RyzenAI Radeon integrated GPU (iGPU)`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Application\n", - "\n", - "Our example application will prompt the user for input and then return the LLM's reponse. This is the same technology stack used to create AMD GAIA, which shows how Retrieval Augmented Generation (RAG), agentic workflows, and other advanced techniques can be layered on top of RyzenAI." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Define our application: prompt the user and print the LLM's response\n", - "# We define this in a way that makes the NPU and iGPU interchangable\n", - "\n", - "def application(model, tokenizer):\n", - " while True:\n", - " # Prompt the user\n", - " user_prompt = input(\"What is your prompt to the LLM? 
\")\n", - " print(\"Prompt:\",user_prompt)\n", - "\n", - " # Exit the application if the user prompts \"exit\"\n", - " if user_prompt == \"exit\":\n", - " print(\"Done!\")\n", - " return\n", - "\n", - " # Tokenize the user's prompt\n", - " input_ids = tokenizer(user_prompt, return_tensors=\"pt\").input_ids\n", - "\n", - " # Generate the response\n", - " # Limit the response length to 30 tokens so that we have time to\n", - " # try a few prompts\n", - " llm_response = model.generate(input_ids, max_new_tokens=30)\n", - "\n", - " # Decode the response into text\n", - " decoded_response = tokenizer.decode(llm_response[0])\n", - "\n", - " # Print the response, then prompt for another input\n", - " print(\"Response:\",decoded_response)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RyzenAI NPU Model Initialization\n", - "\n", - "### Prequisites for NPU\n", - "\n", - "- `ryzenai-transformers` conda environment is installed and activated.\n", - "- Access to `meta-llama/Llama-2-7b-chat-hf` on Hugging Face.\n", - "- Install the TurnkeyML-LLM in your `ryzenai-transformers` environment, see https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm/README.md#install-ryzenai-npu\n", - "- Also `pip install jupyter` in your `ryzenai-transformers` environment.\n", - "\n", - "### Starting Up\n", - "\n", - "- Run `conda activate ryzenai-transformers`\n", - "- Run `setup_phx.bat` or `setup_stx.bat` on your PHX (RyzenAI 7000 or RyzenAI 300, respectively)\n", - "- Run `jupyter notebook`\n", - "- Copy the URL printed from the previous command, and use that as the kernel for this notebook.\n", - " - Example: \n", - " > Or copy and paste one of these URLs:\n", - " >\n", - " > http://localhost:8888/tree?token=14796c43ce39ef9a3296b7c7c26335e01f7bdc8b0fd4efce\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the lemonade APIs\n", - "from lemonade import leap\n", - "\n", - "# Load the model on to RyzenAI NPU\n", - "# NOTE: this takes a couple of minutes, but after you've done it once\n", - "# you can keep reusing the `model` instance in subsequent notebook cells\n", - "npu_model, npu_tokenizer = leap.from_pretrained(\n", - " \"meta-llama/Llama-2-7b-chat-hf\", recipe=\"ryzenai-npu\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NPU Application" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the application on NPU\n", - "# User should prompt \"exit\" to stop the application\n", - "application(npu_model, npu_tokenizer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Radeon iGPU Initialization\n", - "\n", - "### Prequisites for iGPU\n", - "\n", - "- `turnkeyml[oga-dml]` is installed into an activated conda environment.\n", - "- Download a copy of `Phi-3-mini`\n", - "- See https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm/README.md#install-onnxruntime-genai for details" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the turnkey APIs\n", - "from lemonade import leap\n", - "\n", - "# Load the model on iGPU\n", - "igpu_model, igpu_tokenizer = leap.from_pretrained(\n", - " \"microsoft/Phi-3-mini-4k-instruct\", recipe=\"oga-dml-igpu\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Radeon iGPU Application" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the application on iGPU\n", - "# User should prompt \"exit\" to stop the application\n", - "application(igpu_model, igpu_tokenizer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From a648f21b4714911fe25de55016b8ba35948eb53a Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 28 Jan 2025 11:28:37 -0500 Subject: [PATCH 2/4] Move tests, docs, and examples --- .../getting_started.md} | 0 docs/{ => lemonade}/humaneval_accuracy.md | 0 docs/{ => lemonade}/llamacpp.md | 0 docs/{ => lemonade}/mmlu_accuracy.md | 0 docs/{ => lemonade}/ort_genai_hybrid.md | 0 docs/{ => lemonade}/ort_genai_igpu.md | 0 docs/{ => lemonade}/ort_genai_npu.md | 0 docs/{ => lemonade}/perplexity.md | 0 .../getting_started.md} | 0 docs/{ => turnkey}/tools_user_guide.md | 0 examples/lemonade/leap_ryzenai_npu.py | 21 ---- examples/{ => turnkey}/api/loading_a_build.py | 0 examples/{ => turnkey}/api/onnx_opset.py | 0 .../{ => turnkey}/api/scripts/hello_world.py | 0 examples/{ => turnkey}/cli/onnx/README.md | 0 .../{ => turnkey}/cli/onnx/hello_world.onnx | Bin .../cli/plugins/example_tool/setup.py | 0 .../turnkeyml_plugin_example_tool/__init__.py | 0 .../turnkeyml_plugin_example_tool/tool.py | 0 examples/{ => turnkey}/cli/plugins/readme.md | 0 examples/{ => turnkey}/cli/readme.md | 0 examples/{ => turnkey}/cli/scripts/README.md | 0 .../{ => turnkey}/cli/scripts/hello_world.py | 0 .../{ => turnkey}/cli/scripts/max_depth.py | 0 .../cli/scripts/multiple_invocations.py | 0 .../{ => turnkey}/cli/scripts/two_models.py | 0 test/gpu.py | 91 ------------------ test/{ => lemonade}/llm_api.py | 0 test/{ => lemonade}/oga_cpu_api.py | 0 test/{ => turnkey}/analysis.py | 0 test/{ => turnkey}/cli.py | 0 test/{ => turnkey}/unit.py | 0 32 files changed, 112 deletions(-) rename docs/{lemonade_getting_started.md => lemonade/getting_started.md} (100%) rename docs/{ => lemonade}/humaneval_accuracy.md (100%) rename docs/{ => lemonade}/llamacpp.md (100%) rename docs/{ => lemonade}/mmlu_accuracy.md (100%) rename docs/{ => lemonade}/ort_genai_hybrid.md (100%) rename docs/{ => lemonade}/ort_genai_igpu.md (100%) rename docs/{ => lemonade}/ort_genai_npu.md (100%) rename docs/{ => lemonade}/perplexity.md (100%) rename docs/{classic_getting_started.md => turnkey/getting_started.md} (100%) rename docs/{ => turnkey}/tools_user_guide.md (100%) delete mode 100644 examples/lemonade/leap_ryzenai_npu.py rename examples/{ => turnkey}/api/loading_a_build.py (100%) rename examples/{ => turnkey}/api/onnx_opset.py (100%) rename examples/{ => turnkey}/api/scripts/hello_world.py (100%) rename examples/{ => turnkey}/cli/onnx/README.md (100%) rename examples/{ => turnkey}/cli/onnx/hello_world.onnx (100%) rename examples/{ => turnkey}/cli/plugins/example_tool/setup.py (100%) rename examples/{ => turnkey}/cli/plugins/example_tool/turnkeyml_plugin_example_tool/__init__.py (100%) rename examples/{ => turnkey}/cli/plugins/example_tool/turnkeyml_plugin_example_tool/tool.py (100%) rename examples/{ => turnkey}/cli/plugins/readme.md (100%) rename examples/{ => turnkey}/cli/readme.md (100%) rename examples/{ => 
turnkey}/cli/scripts/README.md (100%) rename examples/{ => turnkey}/cli/scripts/hello_world.py (100%) rename examples/{ => turnkey}/cli/scripts/max_depth.py (100%) rename examples/{ => turnkey}/cli/scripts/multiple_invocations.py (100%) rename examples/{ => turnkey}/cli/scripts/two_models.py (100%) delete mode 100644 test/gpu.py rename test/{ => lemonade}/llm_api.py (100%) rename test/{ => lemonade}/oga_cpu_api.py (100%) rename test/{ => turnkey}/analysis.py (100%) rename test/{ => turnkey}/cli.py (100%) rename test/{ => turnkey}/unit.py (100%) diff --git a/docs/lemonade_getting_started.md b/docs/lemonade/getting_started.md similarity index 100% rename from docs/lemonade_getting_started.md rename to docs/lemonade/getting_started.md diff --git a/docs/humaneval_accuracy.md b/docs/lemonade/humaneval_accuracy.md similarity index 100% rename from docs/humaneval_accuracy.md rename to docs/lemonade/humaneval_accuracy.md diff --git a/docs/llamacpp.md b/docs/lemonade/llamacpp.md similarity index 100% rename from docs/llamacpp.md rename to docs/lemonade/llamacpp.md diff --git a/docs/mmlu_accuracy.md b/docs/lemonade/mmlu_accuracy.md similarity index 100% rename from docs/mmlu_accuracy.md rename to docs/lemonade/mmlu_accuracy.md diff --git a/docs/ort_genai_hybrid.md b/docs/lemonade/ort_genai_hybrid.md similarity index 100% rename from docs/ort_genai_hybrid.md rename to docs/lemonade/ort_genai_hybrid.md diff --git a/docs/ort_genai_igpu.md b/docs/lemonade/ort_genai_igpu.md similarity index 100% rename from docs/ort_genai_igpu.md rename to docs/lemonade/ort_genai_igpu.md diff --git a/docs/ort_genai_npu.md b/docs/lemonade/ort_genai_npu.md similarity index 100% rename from docs/ort_genai_npu.md rename to docs/lemonade/ort_genai_npu.md diff --git a/docs/perplexity.md b/docs/lemonade/perplexity.md similarity index 100% rename from docs/perplexity.md rename to docs/lemonade/perplexity.md diff --git a/docs/classic_getting_started.md b/docs/turnkey/getting_started.md similarity index 100% rename from docs/classic_getting_started.md rename to docs/turnkey/getting_started.md diff --git a/docs/tools_user_guide.md b/docs/turnkey/tools_user_guide.md similarity index 100% rename from docs/tools_user_guide.md rename to docs/turnkey/tools_user_guide.md diff --git a/examples/lemonade/leap_ryzenai_npu.py b/examples/lemonade/leap_ryzenai_npu.py deleted file mode 100644 index 30ee022..0000000 --- a/examples/lemonade/leap_ryzenai_npu.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -This example demonstrates how to use the LEAP API to load a model for -inference on a Ryzen AI NPU using the ryzenai-npu-load recipe, -and then use it to generate the response to a prompt. - -Note that this example will only run if the Ryzen AI NPU Private recipe is installed. -See genai/docs/ryzenai_npu.md for instructions. - -You can try the same model on CPU by changing the recipe to "hf-cpu". 
-""" - -from lemonade import leap - -model, tokenizer = leap.from_pretrained( - "meta-llama/Llama-2-7b-chat-hf", recipe="ryzenai-npu" -) - -input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids -response = model.generate(input_ids, max_new_tokens=30) - -print(tokenizer.decode(response[0])) diff --git a/examples/api/loading_a_build.py b/examples/turnkey/api/loading_a_build.py similarity index 100% rename from examples/api/loading_a_build.py rename to examples/turnkey/api/loading_a_build.py diff --git a/examples/api/onnx_opset.py b/examples/turnkey/api/onnx_opset.py similarity index 100% rename from examples/api/onnx_opset.py rename to examples/turnkey/api/onnx_opset.py diff --git a/examples/api/scripts/hello_world.py b/examples/turnkey/api/scripts/hello_world.py similarity index 100% rename from examples/api/scripts/hello_world.py rename to examples/turnkey/api/scripts/hello_world.py diff --git a/examples/cli/onnx/README.md b/examples/turnkey/cli/onnx/README.md similarity index 100% rename from examples/cli/onnx/README.md rename to examples/turnkey/cli/onnx/README.md diff --git a/examples/cli/onnx/hello_world.onnx b/examples/turnkey/cli/onnx/hello_world.onnx similarity index 100% rename from examples/cli/onnx/hello_world.onnx rename to examples/turnkey/cli/onnx/hello_world.onnx diff --git a/examples/cli/plugins/example_tool/setup.py b/examples/turnkey/cli/plugins/example_tool/setup.py similarity index 100% rename from examples/cli/plugins/example_tool/setup.py rename to examples/turnkey/cli/plugins/example_tool/setup.py diff --git a/examples/cli/plugins/example_tool/turnkeyml_plugin_example_tool/__init__.py b/examples/turnkey/cli/plugins/example_tool/turnkeyml_plugin_example_tool/__init__.py similarity index 100% rename from examples/cli/plugins/example_tool/turnkeyml_plugin_example_tool/__init__.py rename to examples/turnkey/cli/plugins/example_tool/turnkeyml_plugin_example_tool/__init__.py diff --git a/examples/cli/plugins/example_tool/turnkeyml_plugin_example_tool/tool.py b/examples/turnkey/cli/plugins/example_tool/turnkeyml_plugin_example_tool/tool.py similarity index 100% rename from examples/cli/plugins/example_tool/turnkeyml_plugin_example_tool/tool.py rename to examples/turnkey/cli/plugins/example_tool/turnkeyml_plugin_example_tool/tool.py diff --git a/examples/cli/plugins/readme.md b/examples/turnkey/cli/plugins/readme.md similarity index 100% rename from examples/cli/plugins/readme.md rename to examples/turnkey/cli/plugins/readme.md diff --git a/examples/cli/readme.md b/examples/turnkey/cli/readme.md similarity index 100% rename from examples/cli/readme.md rename to examples/turnkey/cli/readme.md diff --git a/examples/cli/scripts/README.md b/examples/turnkey/cli/scripts/README.md similarity index 100% rename from examples/cli/scripts/README.md rename to examples/turnkey/cli/scripts/README.md diff --git a/examples/cli/scripts/hello_world.py b/examples/turnkey/cli/scripts/hello_world.py similarity index 100% rename from examples/cli/scripts/hello_world.py rename to examples/turnkey/cli/scripts/hello_world.py diff --git a/examples/cli/scripts/max_depth.py b/examples/turnkey/cli/scripts/max_depth.py similarity index 100% rename from examples/cli/scripts/max_depth.py rename to examples/turnkey/cli/scripts/max_depth.py diff --git a/examples/cli/scripts/multiple_invocations.py b/examples/turnkey/cli/scripts/multiple_invocations.py similarity index 100% rename from examples/cli/scripts/multiple_invocations.py rename to 
examples/turnkey/cli/scripts/multiple_invocations.py diff --git a/examples/cli/scripts/two_models.py b/examples/turnkey/cli/scripts/two_models.py similarity index 100% rename from examples/cli/scripts/two_models.py rename to examples/turnkey/cli/scripts/two_models.py diff --git a/test/gpu.py b/test/gpu.py deleted file mode 100644 index bcae9e8..0000000 --- a/test/gpu.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -GPU tests -""" - -import os -import unittest -from unittest.mock import patch -import sys -import shutil -from turnkeyml.cli.cli import main as turnkeycli -import turnkeyml.common.filesystem as filesystem -from cli import assert_success_of_builds, flatten - - -class Testing(unittest.TestCase): - def setUp(self) -> None: - filesystem.rmdir(cache_dir) - - return super().setUp() - - def test_basic(self): - test_script = "linear.py" - # Benchmark with Pytorch - testargs = [ - "turnkey", - "benchmark", - os.path.join(corpus_dir, test_script), - "--cache-dir", - cache_dir, - "--device", - "nvidia", - ] - with patch.object(sys, "argv", flatten(testargs)): - turnkeycli() - - assert_success_of_builds( - [test_script], cache_dir, check_perf=True, runtime="trt" - ) - - -if __name__ == "__main__": - test_scripts_dot_py = { - "linear.py": """# labels: name::linear author::turnkey license::mit test_group::a -import torch - -torch.manual_seed(0) - - -class LinearTestModel(torch.nn.Module): - def __init__(self, input_features, output_features): - super(LinearTestModel, self).__init__() - self.fc = torch.nn.Linear(input_features, output_features) - - def forward(self, x): - output = self.fc(x) - return output - - -input_features = 10 -output_features = 10 - -# Model and input configurations -model = LinearTestModel(input_features, output_features) -inputs = {"x": torch.rand(input_features)} - -output = model(**inputs) - - """ - } - - # Create a test directory and make it the CWD - test_dir = os.path.join(os.path.dirname(__file__), "generated", "gpu_test_dir") - cache_dir = os.path.join(os.path.dirname(__file__), "generated", "cache-dir") - if os.path.isdir(test_dir): - shutil.rmtree(test_dir) - if os.path.isdir(cache_dir): - shutil.rmtree(cache_dir) - os.makedirs(test_dir) - os.chdir(test_dir) - - corpus_dir = os.path.join(os.getcwd(), "test_corpus") - extras_dir = os.path.join(corpus_dir, "extras") - os.makedirs(extras_dir, exist_ok=True) - - for key, value in test_scripts_dot_py.items(): - model_path = os.path.join(corpus_dir, key) - - with open(model_path, "w", encoding="utf") as f: - f.write(value) - - unittest.main() diff --git a/test/llm_api.py b/test/lemonade/llm_api.py similarity index 100% rename from test/llm_api.py rename to test/lemonade/llm_api.py diff --git a/test/oga_cpu_api.py b/test/lemonade/oga_cpu_api.py similarity index 100% rename from test/oga_cpu_api.py rename to test/lemonade/oga_cpu_api.py diff --git a/test/analysis.py b/test/turnkey/analysis.py similarity index 100% rename from test/analysis.py rename to test/turnkey/analysis.py diff --git a/test/cli.py b/test/turnkey/cli.py similarity index 100% rename from test/cli.py rename to test/turnkey/cli.py diff --git a/test/unit.py b/test/turnkey/unit.py similarity index 100% rename from test/unit.py rename to test/turnkey/unit.py From d246b9356733d0858d5d9104a62abd0ec9eb0c6b Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 28 Jan 2025 11:31:41 -0500 Subject: [PATCH 3/4] Add new lemonade examples and installer. Update source code. 
--- .github/workflows/test_lemonade.yml | 11 +- .github/workflows/test_lemonade_oga_cpu.yml | 9 +- .github/workflows/test_turnkey.yml | 63 ++---- README.md | 6 +- docs/code.md | 10 +- docs/contribute.md | 2 +- docs/lemonade/getting_started.md | 16 +- docs/readme.md | 39 +++- docs/turnkey/getting_started.md | 6 +- docs/turnkey/tools_user_guide.md | 2 +- examples/lemonade/README.md | 18 ++ examples/lemonade/demos/README.md | 17 ++ examples/lemonade/demos/chat/chat_hybrid.py | 65 ++++++ examples/lemonade/demos/chat/chat_start.py | 105 +++++++++ .../lemonade/demos/search/search_hybrid.py | 87 ++++++++ .../lemonade/demos/search/search_start.py | 140 ++++++++++++ examples/lemonade/leap_basic.py | 2 +- examples/lemonade/leap_oga_cpu.py | 18 ++ examples/lemonade/leap_oga_cpu_streaming.py | 36 +++ examples/lemonade/leap_oga_hybrid.py | 20 ++ .../lemonade/leap_oga_hybrid_streaming.py | 39 ++++ examples/lemonade/leap_oga_igpu.py | 18 ++ examples/lemonade/leap_oga_igpu_streaming.py | 39 ++++ examples/lemonade/leap_oga_npu.py | 21 ++ examples/lemonade/leap_oga_npu_streaming.py | 39 ++++ examples/lemonade/leap_streaming.py | 9 +- examples/readme.md | 9 +- examples/turnkey/cli/onnx/README.md | 2 +- examples/turnkey/cli/readme.md | 6 +- examples/turnkey/cli/scripts/README.md | 2 +- models/readme.md | 6 +- plugins/devices/README.md | 4 +- setup.py | 3 + src/lemonade/cache.py | 2 + src/lemonade/cli.py | 14 ++ src/lemonade/leap.py | 64 ++---- src/lemonade/tools/chat.py | 131 +++++++++-- src/lemonade/tools/huggingface_load.py | 6 +- src/lemonade/tools/humaneval.py | 2 +- src/lemonade/tools/mmlu.py | 2 +- src/lemonade/tools/ort_genai/oga.py | 46 ++-- src/lemonade/tools/perplexity.py | 3 +- src/lemonade_install/__init__.py | 1 + src/lemonade_install/install.py | 211 ++++++++++++++++++ src/turnkeyml/cli/cli.py | 2 +- src/turnkeyml/common/filesystem.py | 7 +- src/turnkeyml/common/status.py | 53 ++++- src/turnkeyml/sequence/sequence.py | 167 +++++++++++++- src/turnkeyml/tools/discovery/discover.py | 2 +- src/turnkeyml/tools/tool.py | 27 ++- test/lemonade/llm_api.py | 196 ++++++++++------ test/lemonade/oga_cpu_api.py | 30 ++- 52 files changed, 1559 insertions(+), 276 deletions(-) create mode 100644 examples/lemonade/README.md create mode 100644 examples/lemonade/demos/README.md create mode 100644 examples/lemonade/demos/chat/chat_hybrid.py create mode 100644 examples/lemonade/demos/chat/chat_start.py create mode 100644 examples/lemonade/demos/search/search_hybrid.py create mode 100644 examples/lemonade/demos/search/search_start.py create mode 100644 examples/lemonade/leap_oga_cpu.py create mode 100644 examples/lemonade/leap_oga_cpu_streaming.py create mode 100644 examples/lemonade/leap_oga_hybrid.py create mode 100644 examples/lemonade/leap_oga_hybrid_streaming.py create mode 100644 examples/lemonade/leap_oga_igpu.py create mode 100644 examples/lemonade/leap_oga_igpu_streaming.py create mode 100644 examples/lemonade/leap_oga_npu.py create mode 100644 examples/lemonade/leap_oga_npu_streaming.py create mode 100644 src/lemonade_install/__init__.py create mode 100644 src/lemonade_install/install.py diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index 49de72c..6073d04 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -55,7 +55,14 @@ jobs: - name: Run lemonade tests shell: bash -el {0} run: | - lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10 - python test/llm_api.py + # Test CLI + lemonade -m -i 
facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10 + # Test low-level APIs + python test/lemonade/llm_api.py + + + # Test high-level LEAP APIs + python examples/lemonade/leap_basic.py + python examples/lemonade/leap_streaming.py diff --git a/.github/workflows/test_lemonade_oga_cpu.yml b/.github/workflows/test_lemonade_oga_cpu.yml index a6619b4..33edd22 100644 --- a/.github/workflows/test_lemonade_oga_cpu.yml +++ b/.github/workflows/test_lemonade_oga_cpu.yml @@ -55,6 +55,13 @@ jobs: env: HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions run: | + # Test CLI lemonade -i TinyPixel/small-llama2 oga-load --device cpu --dtype int4 llm-prompt -p "tell me a story" --max-new-tokens 5 - python test/oga_cpu_api.py + + # Test low-level APIs + python test/lemonade/oga_cpu_api.py + + # Test high-level LEAP APIs + python examples/lemonade/leap_oga_cpu.py + python examples/lemonade/leap_oga_cpu_streaming.py diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index 52298a6..12ded9a 100644 --- a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -8,6 +8,11 @@ on: branches: ["main", "canary", "refresh"] pull_request: branches: ["main", "canary", "refresh"] + paths: + - src/turnkeyml/** + - test/turnkey/** + - examples/turnkey/** + - .github/workflows/test_turnkey.yml permissions: contents: read @@ -50,68 +55,32 @@ jobs: shell: bash -el {0} run: | # Unit tests - python test/unit.py + python test/turnkey/unit.py # turnkey examples # Note: we clear the default cache location prior to each block of example runs rm -rf ~/.cache/turnkey - python examples/api/onnx_opset.py --onnx-opset 15 - python examples/api/loading_a_build.py + python examples/turnkey/api/onnx_opset.py --onnx-opset 15 + python examples/turnkey/api/loading_a_build.py rm -rf ~/.cache/turnkey - turnkey -i examples/cli/scripts/hello_world.py discover export-pytorch benchmark + turnkey -i examples/turnkey/cli/scripts/hello_world.py discover export-pytorch benchmark rm -rf ~/.cache/turnkey - turnkey -i examples/cli/scripts/multiple_invocations.py discover export-pytorch benchmark + turnkey -i examples/turnkey/cli/scripts/multiple_invocations.py discover export-pytorch benchmark rm -rf ~/.cache/turnkey - turnkey -i examples/cli/scripts/max_depth.py discover --max-depth 1 export-pytorch benchmark + turnkey -i examples/turnkey/cli/scripts/max_depth.py discover --max-depth 1 export-pytorch benchmark rm -rf ~/.cache/turnkey - turnkey -i examples/cli/scripts/two_models.py discover export-pytorch benchmark + turnkey -i examples/turnkey/cli/scripts/two_models.py discover export-pytorch benchmark rm -rf ~/.cache/turnkey - turnkey -i examples/cli/onnx/hello_world.onnx load-onnx benchmark + turnkey -i examples/turnkey/cli/onnx/hello_world.onnx load-onnx benchmark # E2E tests - cd test/ + cd test/turnkey python cli.py python analysis.py - name: Test example plugins shell: bash -el {0} run: | rm -rf ~/.cache/turnkey - pip install -e examples/cli/plugins/example_tool - turnkey -i examples/cli/scripts/hello_world.py discover export-pytorch example-plugin-tool benchmark - # - name: Install and Start Slurm - # if: runner.os != 'Windows' - # shell: bash -el {0} - # run: | - # sudo apt update -y - # sudo apt install slurm-wlm -y - # cp test/helpers/slurm.conf test/helpers/slurm_modified.conf - # sed -i "s/YOUR_HOSTNAME_HERE/$HOSTNAME/" test/helpers/slurm_modified.conf - # sudo mv test/helpers/slurm_modified.conf 
/etc/slurm/slurm.conf - # sudo service slurmd start - # sudo service slurmctld start - # sudo service munge start - # - name: Test turnkey on Slurm - # if: runner.os != 'Windows' - # shell: bash -el {0} - # run: | - # # Create conda environment for Slurm using srun (sbatch + wait) - # export SKIP_REQUIREMENTS_INSTALL="True" - # export TORCH_CPU="True" - # srun src/turnkeyml/cli/setup_venv.sh - - # # Run tests on Slurm - # export TURNKEY_SLURM_USE_DEFAULT_MEMORY="True" - # turnkey -i models/selftest/linear.py --use-slurm --cache-dir local_cache discover export-pytorch - # bash test/helpers/check_slurm_output.sh slurm-2.out - - # Below tests are commented out as the GitHub runner runs out of space installing the requirements - # - name: Check installation of requirements.txt and their compatibility with turnkey - # shell: bash -el {0} - # run: | - # conda create --name test-requirements python=3.8 - # conda activate test-requirements - # pip install -r models/requirements.txt - # python -m pip check - # python -c "import torch_geometric" - # conda deactivate \ No newline at end of file + pip install -e examples/turnkey/cli/plugins/example_tool + turnkey -i examples/turnkey/cli/scripts/hello_world.py discover export-pytorch example-plugin-tool benchmark diff --git a/README.md b/README.md index 2a6156e..16be201 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ We are on a mission to make it easy to use the most important tools in the ONNX ecosystem. TurnkeyML accomplishes this by providing no-code CLIs and low-code APIs for both general ONNX workflows with `turnkey` as well as LLMs with `lemonade`. -| [**Lemonade**](https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm) | [**Turnkey**](https://github.com/onnx/turnkeyml/blob/main/docs/classic_getting_started.md) | +| [**Lemonade**](https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm) | [**Turnkey**](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/getting_started.md) | |:----------------------------------------------: |:-----------------------------------------------------------------: | -| Serve and benchmark LLMs on CPU, GPU, and NPU.
[Click here to get started with `lemonade`.](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade_getting_started.md) | Export and optimize ONNX models for CNNs and Transformers.
[Click here to get started with `turnkey`.](https://github.com/onnx/turnkeyml/blob/main/docs/classic_getting_started.md) | -| | | +| Serve and benchmark LLMs on CPU, GPU, and NPU.
[Click here to get started with `lemonade`.](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md) | Export and optimize ONNX models for CNNs and Transformers.
[Click here to get started with `turnkey`.](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/getting_started.md) | +| | | ## How It Works diff --git a/docs/code.md b/docs/code.md index 794f693..061626c 100644 --- a/docs/code.md +++ b/docs/code.md @@ -5,9 +5,9 @@ The TurnkeyML source code has a few major top-level directories: - `docs`: documentation for the entire project. - `examples`: example scripts for use with the TurnkeyML tools. - - `examples/cli`: tutorial series starting in `examples/cli/readme.md` to help learn the `turnkey` CLI. - - `examples/cli/scripts`: example scripts that can be fed as input into the `turnkey` CLI. These scripts each have a docstring that recommends one or more `turnkey` CLI commands to try out. - - `examples/api`: examples scripts that invoke `Tools` via APIs. + - `examples/turnkey/cli`: tutorial series starting in `examples/turnkey/cli/readme.md` to help learn the `turnkey` CLI. + - `examples/turnkey/cli/scripts`: example scripts that can be fed as input into the `turnkey` CLI. These scripts each have a docstring that recommends one or more `turnkey` CLI commands to try out. + - `examples/turnkey/api`: examples scripts that invoke `Tools` via APIs. - `models`: the corpora of models that makes up the TurnkeyML models (see [the models readme](https://github.com/onnx/turnkeyml/blob/main/models/readme.md)). - Each subdirectory under `models` represents a corpus of models pulled from somewhere on the internet. For example, `models/torch_hub` is a corpus of models from [Torch Hub](https://github.com/pytorch/hub). - `src/turnkeyml`: source code for the TurnkeyML package. @@ -20,8 +20,8 @@ The TurnkeyML source code has a few major top-level directories: - `src/turnkeyml/state.py`: implements the `State` class. - `src/turnkeyml/files_api.py`: implements the `evaluate_files()` API, which is the top-level API called by the CLI. - `test`: tests for the TurnkeyML tools. - - `test/analysis.py`: tests focusing on the `discover` `Tool`. - - `test/cli.py`: tests focusing on top-level CLI features. + - `test/turnkey/analysis.py`: tests focusing on the `discover` `Tool`. + - `test/turnkey/cli.py`: tests focusing on top-level CLI features. ## Tool Classes diff --git a/docs/contribute.md b/docs/contribute.md index 8c76550..b3419d8 100644 --- a/docs/contribute.md +++ b/docs/contribute.md @@ -88,7 +88,7 @@ We require the following naming scheme: ### Example -See the [example_tool](https://github.com/onnx/turnkeyml/tree/main/examples/cli/plugins/example_tool) plugin for an example. +See the [example_tool](https://github.com/onnx/turnkeyml/tree/main/examples/turnkey/cli/plugins/example_tool) plugin for an example. The `__init__.py` file with its `implements` dictionary looks like: diff --git a/docs/lemonade/getting_started.md b/docs/lemonade/getting_started.md index 924118b..72362a0 100644 --- a/docs/lemonade/getting_started.md +++ b/docs/lemonade/getting_started.md @@ -64,6 +64,16 @@ That command will run a few warmup iterations, then a few generation iterations The prompt size, number of output tokens, and number iterations are all parameters. Learn more by running `lemonade huggingface-bench -h`. +## Memory Usage + +The peak memory used by the lemonade build is captured in the build output. To capture more granular +memory usage information, use the `--memory` flag. For example: + +`lemonade -i facebook/opt-125m --memory huggingface-load huggingface-bench` + +In this case a `memory_usage.png` file will be generated and stored in the build folder. 
This file +contains a figure plotting the memory usage over the build time. Learn more by running `lemonade -h`. + ## Serving You can launch a WebSocket server for your LLM with: @@ -111,9 +121,9 @@ You can also try Phi-3-Mini-128k-Instruct with the following commands: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve` -You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md). +You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md). -> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information. +> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_npu.md) for more information. ## Install RyzenAI NPU for PyTorch @@ -131,7 +141,7 @@ If you decide to contribute, please: - do so via a pull request. - write your code in keeping with the same style as the rest of this repo's code. -- add a test under `test/llm_api.py` that provides coverage of your new feature. +- add a test under `test/lemonade/llm_api.py` that provides coverage of your new feature. The best way to contribute is to add new tools to cover more devices and usage scenarios. diff --git a/docs/readme.md b/docs/readme.md index 8355044..a9aee42 100644 --- a/docs/readme.md +++ b/docs/readme.md @@ -1,11 +1,38 @@ # TurnkeyML Documentation -This directory contains documentation for the TurnkeyML project: +## LLMs: `lemonade` tooling + +The `docs/lemonade` directory has documentation for the LLM-focused `lemonade` tooling: +- [Getting Started](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md): start here for LLMs. +- Accuracy tests (task performance): + - [HumanEval](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/humaneval_accuracy.md): details of the HumanEval coding task test. + - [MMLU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/mmlu_accuracy.md): details of the MMLU general reasoning test. + - [Perplexity](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/perplexity.md): details of the Perplexity test for LLMs. +- Tool-specific setup guides: + - [llama.cpp](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/llamacpp.md) + - OnnxRuntime GenaI: + - [iGPU/NPU hybrid](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_hybrid.md) + - [iGPU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md) + - [NPU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_npu.md) + +## CNNs and Transformers: `turnkey` tooling + +The `docs/turnkey` directory contains documentation for the CNN/Transformer-focused `turnkey` tooling: + +- [getting_started.md](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/getting_started.md) +- [tools_user_guide.md](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md): User guide for the tools: the `turnkey` CLI and the APIs. + + +There is more useful documentation available in: +- [examples/turnkey/cli/readme.md](https://github.com/onnx/turnkeyml/blob/main/examples/turnkey/cli/readme.md): Tutorial series for learning the `turnkey` CLI. 
+- [models/readme.md](https://github.com/onnx/turnkeyml/blob/main/models/readme.md): Tutorial for understanding the models and how to use `turnkey` to evaluate the models. + +## General Information + +This directory also contains documentation for the TurnkeyML project as a whole: + - [code.md](https://github.com/onnx/turnkeyml/blob/main/docs/code.md): Code organization for the tools. - [install.md](https://github.com/onnx/turnkeyml/blob/main/docs/install.md): Installation instructions for the tools. -- [tools_user_guide.md](https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md): User guide for the tools: the `turnkey` CLI and the APIs. - [versioning.md](https://github.com/onnx/turnkeyml/blob/main/docs/versioning.md): Defines the semantic versioning rules for the `turnkey` package. - -There is more useful documentation available in: -- [examples/cli/readme.md](https://github.com/onnx/turnkeyml/blob/main/examples/cli/readme.md): Tutorial series for learning the `turnkey` CLI. -- [models/readme.md](https://github.com/onnx/turnkeyml/blob/main/models/readme.md): Tutorial for understanding the models and how to use `turnkey` to evaluate the models. \ No newline at end of file +- [contribute.md](https://github.com/onnx/turnkeyml/blob/main/docs/contribute.md): Contribution guidelines for the project. +- [coverage.md](https://github.com/onnx/turnkeyml/blob/main/docs/coverage.md): How to run code coverage metrics. diff --git a/docs/turnkey/getting_started.md b/docs/turnkey/getting_started.md index 52588c4..1598fe9 100644 --- a/docs/turnkey/getting_started.md +++ b/docs/turnkey/getting_started.md @@ -41,8 +41,8 @@ The easiest way to learn more about `turnkey` is to explore the help menu with ` We also provide the following resources: - [Installation guide](https://github.com/onnx/turnkeyml/blob/main/docs/install.md): how to install from source, set up Slurm, etc. -- [User guide](https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md): explains the concepts of `turnkey's`, including the syntax for making your own tool sequence. -- [Examples](https://github.com/onnx/turnkeyml/tree/main/examples/cli): PyTorch scripts and ONNX files that can be used to try out `turnkey` concepts. +- [User guide](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md): explains the concepts of `turnkey`, including the syntax for making your own tool sequence. +- [Examples](https://github.com/onnx/turnkeyml/tree/main/examples/turnkey/cli): PyTorch scripts and ONNX files that can be used to try out `turnkey` concepts. - [Code organization guide](https://github.com/onnx/turnkeyml/blob/main/docs/code.md): learn how this repository is structured. - [Models](https://github.com/onnx/turnkeyml/blob/main/models/readme.md): PyTorch model scripts that work with `turnkey`. @@ -101,4 +101,4 @@ The build tool has built-in support for a variety of interoperable `Tools`. If y > turnkey -i my_model.py discover export-pytorch my-custom-tool --my-args ``` -All of the built-in `Tools` are implemented against the plugin API. Check out the [example plugins](https://github.com/onnx/turnkeyml/tree/main/examples/cli/plugins) and the [plugin API guide](https://github.com/onnx/turnkeyml/blob/main/docs/contribute.md#contributing-a-plugin) to learn more about creating an installable plugin. \ No newline at end of file +All of the built-in `Tools` are implemented against the plugin API.
Check out the [example plugins](https://github.com/onnx/turnkeyml/tree/main/examples/turnkey/cli/plugins) and the [plugin API guide](https://github.com/onnx/turnkeyml/blob/main/docs/contribute.md#contributing-a-plugin) to learn more about creating an installable plugin. \ No newline at end of file diff --git a/docs/turnkey/tools_user_guide.md b/docs/turnkey/tools_user_guide.md index d59333a..a2191bd 100644 --- a/docs/turnkey/tools_user_guide.md +++ b/docs/turnkey/tools_user_guide.md @@ -108,7 +108,7 @@ Name of one or more script (.py), ONNX (.onnx), or cached build (_state.yaml) fi Examples: - `turnkey -i models/selftest/linear.py` - `turnkey -i models/selftest/linear.py models/selftest/twolayer.py` -- `turnkey -i examples/cli/onnx/sample.onnx` +- `turnkey -i examples/turnkey/cli/onnx/sample.onnx` You may also use [Bash regular expressions](https://tldp.org/LDP/Bash-Beginners-Guide/html/sect_04_01.html) to locate the files you want to benchmark. diff --git a/examples/lemonade/README.md b/examples/lemonade/README.md new file mode 100644 index 0000000..df67173 --- /dev/null +++ b/examples/lemonade/README.md @@ -0,0 +1,18 @@ +# Lemonade Examples + +This folder contains examples of how to use `lemonade` via the high-level LEAP APIs. These APIs make it easy to load a model, generate responses, and also show how to stream those responses. + +The `demos/` folder also contains some higher-level application demos of the LEAP APIs. Learn more in `demos/README.md`. + +## LEAP Examples + +This table shows which LEAP examples are available: + +| Framework | CPU | GPU | NPU | Hybrid | +|----------------------------|---------------------------|------------------|-----------------|--------------------| +| Huggingface | leap_basic.py | - | - | - | +| OGA | leap_oga_cpu.py | leap_oga_igpu.py | leap_oga_npu.py | leap_oga_hybrid.py | +| Huggingface with streaming | leap_streaming.py | - | - | - | +| OGA with streaming | leap_oga_cpu_streaming.py | leap_oga_igpu_streaming.py | leap_oga_npu_streaming.py | leap_oga_hybrid_streaming.py | + +To run a LEAP example, first set up a conda environment with the appropriate framework and backend support. Then run the scripts with a command like `python leap_basic.py`. \ No newline at end of file diff --git a/examples/lemonade/demos/README.md b/examples/lemonade/demos/README.md new file mode 100644 index 0000000..b229a57 --- /dev/null +++ b/examples/lemonade/demos/README.md @@ -0,0 +1,17 @@ +# Lemonade Demos + +The demo scripts in this folder show how `lemonade` can be used to integrate OnnxRuntime-GenAI (OGA) into higher-level applications such as chat and search. + +The format of each demo is to have two files which show the before-and-after of integrating OGA: + - `*_start.py`: a version of the application that uses regular software to try and handle a natural language task. + - `*_hybrid.py`: an upgrade of the application that integrates an LLM with Ryzen AI Hybrid to improve the natural language task. + +The demos available are: + - `chat/`: prompts the user for a message and then streams the LLM's response to the terminal. + - `search/`: demonstrates how a user can search an employee handbook in natural language using an LLM. + +To run a demo: +1. Set up a conda environment with the appropriate framework and backend support. +1. `cd` into the demo directory (e.g., `cd search/`) +1. Run the `*_start.py` script to see what the application is like without the LLM (e.g., `python search_start.py`) +1.
Run the `*_hybrid.py` script to see what the application is like with the LLM (e.g., `python search_hybrid.py`) diff --git a/examples/lemonade/demos/chat/chat_hybrid.py b/examples/lemonade/demos/chat/chat_hybrid.py new file mode 100644 index 0000000..8b770ff --- /dev/null +++ b/examples/lemonade/demos/chat/chat_hybrid.py @@ -0,0 +1,65 @@ +import sys +from threading import Thread, Event +from transformers import StoppingCriteria, StoppingCriteriaList +from lemonade.tools.chat import StopOnEvent +from lemonade import leap +from lemonade.tools.ort_genai.oga import OrtGenaiStreamer + + +def main(): + + model, tokenizer = leap.from_pretrained( + "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", + recipe="oga-hybrid", + ) + + while True: + # Enable sending a signal into the generator thread to stop + # the generation early + stop_event = Event() + stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) + + # Prompt the user for an input message + print() + user_message = input("User: ") + print() + + # Print a friendly message when we quit + if user_message == "quit": + print("System: Ok, bye!\n") + break + + # Generate the response in a thread and stream the result back + # to the main thread + input_ids = tokenizer(user_message, return_tensors="pt").input_ids + + streamer = OrtGenaiStreamer(tokenizer) + generation_kwargs = { + "input_ids": input_ids, + "streamer": streamer, + "max_new_tokens": 200, + "stopping_criteria": stopping_criteria, + } + + thread = Thread(target=model.generate, kwargs=generation_kwargs) + thread.start() + + # Print each word to the screen as it arrives from the streamer + # Allow the user to terminate the response with + # a keyboard interrupt (ctrl+c) + try: + print("LLM: ", end="") + for new_text in streamer: + print(new_text, end="") + sys.stdout.flush() + + except KeyboardInterrupt: + stop_event.set() + + print() + + thread.join() + + +if __name__ == "__main__": + main() diff --git a/examples/lemonade/demos/chat/chat_start.py b/examples/lemonade/demos/chat/chat_start.py new file mode 100644 index 0000000..22724f1 --- /dev/null +++ b/examples/lemonade/demos/chat/chat_start.py @@ -0,0 +1,105 @@ +import sys +from threading import Thread, Event +from transformers import StoppingCriteriaList +from lemonade.tools.chat import StopOnEvent +from queue import Queue +from time import sleep + + +class TextStreamer: + """ + Imitates a queue for streaming text from one thread to another. + + Not needed once we integrate with LEAP. + """ + + def __init__(self): + self.text_queue = Queue() + self.stop_signal = None + + def add_text(self, text: str): + self.text_queue.put(text) + + def done(self): + self.text_queue.put(self.stop_signal) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get() + if value == self.stop_signal: + raise StopIteration() + else: + return value + + +def generate_placeholder( + streamer: TextStreamer, stopping_criteria: StoppingCriteriaList +): + """ + Imitates an LLM's generate function by streaming text to a queue. + + Not needed once we integrate with LEAP. + """ + + response = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." + + for word in response.split(" "): + streamer.add_text(f"{word} ") + sleep(0.05) + + if stopping_criteria[0].stop_event.is_set(): + break + + streamer.done() + + +def main(): + + while True: + # Enable sending a signal into the generator thread to stop + # the generation early + stop_event = Event() + stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) + + # Prompt the user for an input message + print() + user_message = input("User: ") + print() + + # Print a friendly message when we quit + if user_message == "quit": + print("System: Ok, bye!\n") + break + + # Generate the response in a thread and stream the result back + # to the main thread + streamer = TextStreamer() + generation_kwargs = { + "streamer": streamer, + "stopping_criteria": stopping_criteria, + } + + thread = Thread(target=generate_placeholder, kwargs=generation_kwargs) + thread.start() + + # Print each word to the screen as it arrives + # Allow the user to terminate the response with + # a keyboard interrupt (ctrl+c) + try: + print("LLM: ", end="") + for new_text in streamer: + print(new_text, end="") + sys.stdout.flush() + + except KeyboardInterrupt: + stop_event.set() + + print() + + thread.join() + + +if __name__ == "__main__": + main() diff --git a/examples/lemonade/demos/search/search_hybrid.py b/examples/lemonade/demos/search/search_hybrid.py new file mode 100644 index 0000000..b8fe9fc --- /dev/null +++ b/examples/lemonade/demos/search/search_hybrid.py @@ -0,0 +1,87 @@ +import sys +from threading import Thread, Event +from transformers import StoppingCriteriaList +from lemonade import leap +from lemonade.tools.ort_genai.oga import OrtGenaiStreamer +from lemonade.tools.chat import StopOnEvent + +employee_handbook = """ +1. You will work very hard every day.\n +2. You are allowed to listen to music, but must wear headphones.\n +3. Remember, the break room fridge is not a science experiment. + Please label and remove your leftovers regularly!\n +""" + + +def system_prompt(user_prompt): + return f""" +<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +You are a helpful assistant who can only answer questions about this employee handbook: {employee_handbook}. +Don't make up information that isn't in the handbook already. 
+<|eot_id|><|start_header_id|>user<|end_header_id|> + +{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" + + +def main(): + + # Load LLaMA-3.2 1B model on Ryzen AI Hybrid + model, tokenizer = leap.from_pretrained( + "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", + recipe="oga-hybrid", + ) + + while True: + # Enable sending a signal into the generator thread to stop + # the generation early + stop_event = Event() + stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) + + # Prompt the user for an input message + print() + user_message = input("User: ") + print() + + # Print a friendly message when we quit + if user_message == "quit": + print("System: Ok, bye!\n") + break + + # Generate the response in a thread and stream the result back + # to the main thread + input_ids = tokenizer( + system_prompt(user_message), return_tensors="pt" + ).input_ids + + streamer = OrtGenaiStreamer(tokenizer) + generation_kwargs = { + "input_ids": input_ids, + "streamer": streamer, + "max_new_tokens": 200, + "stopping_criteria": stopping_criteria, + } + + thread = Thread(target=model.generate, kwargs=generation_kwargs) + thread.start() + + # Print each word to the screen as it arrives from the streamer + # Allow the user to terminate the response with + # a keyboard interrupt (ctrl+c) + try: + print("LLM: ", end="") + for new_text in streamer: + print(new_text, end="") + sys.stdout.flush() + + except KeyboardInterrupt: + stop_event.set() + + print() + + thread.join() + + +if __name__ == "__main__": + main() diff --git a/examples/lemonade/demos/search/search_start.py b/examples/lemonade/demos/search/search_start.py new file mode 100644 index 0000000..705cea7 --- /dev/null +++ b/examples/lemonade/demos/search/search_start.py @@ -0,0 +1,140 @@ +import sys +from threading import Thread, Event +from transformers import StoppingCriteriaList +from lemonade.tools.chat import StopOnEvent + +# These imports are not needed when we add the LLM +from queue import Queue +from time import sleep + +employee_handbook = """ +1. You will work very hard every day.\n +2. You are allowed to listen to music, but must wear headphones.\n +3. Remember, the break room fridge is not a science experiment. + Please label and remove your leftovers regularly!\n +""" + + +class TextStreamer: + """ + Imitates a queue for streaming text from one thread to another. + + Not needed once we integrate with LEAP. + """ + + def __init__(self): + self.text_queue = Queue() + self.stop_signal = None + + def add_text(self, text: str): + self.text_queue.put(text) + + def done(self): + self.text_queue.put(self.stop_signal) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get() + if value == self.stop_signal: + raise StopIteration() + else: + return value + + +def plain_text_search( + question: str, streamer: TextStreamer, stopping_criteria: StoppingCriteriaList +): + """ + Searches the employee handbook, looking for an exact match and + returns an answer if available. + + Imitates an LLM's generate function by streaming text to a queue. + + Not needed once we integrate with LEAP. 
+ """ + + # Turn the question into key words + # Remove punctuation and convert to lower-case + sanitized_question = question.replace("?", "").replace(".", "").lower() + # Get a list of important words (longer than length 3) + keywords = [word for word in sanitized_question.split(" ") if len(word) > 3] + + # Search for the key words in the employee handbook + result = None + for keyword in keywords: + for line in employee_handbook.lower().split("\n"): + if keyword in line: + result = line + + if result: + response = ( + f"This line of the employee handbook might be relevant to you: {result}" + ) + else: + response = ( + "I am sorry, I didn't find anything that is useful to you. Please " + "try again with another question or read the entire employee handbook " + "cover-to-cover to make sure that you didn't miss any rules." + ) + + for word in response.split(" "): + streamer.add_text(f"{word} ") + sleep(0.05) + + if stopping_criteria[0].stop_event.is_set(): + break + + streamer.done() + + +def main(): + + while True: + # Enable sending a signal into the generator thread to stop + # the generation early + stop_event = Event() + stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) + + # Prompt the user for an input message + print() + user_message = input("User: ") + print() + + # Print a friendly message when we quit + if user_message == "quit": + print("System: Ok, bye!\n") + break + + # Generate the response in a thread and stream the result back + # to the main thread + streamer = TextStreamer() + generation_kwargs = { + "question": user_message, + "streamer": streamer, + "stopping_criteria": stopping_criteria, + } + + thread = Thread(target=plain_text_search, kwargs=generation_kwargs) + thread.start() + + # Print each word to the screen as it arrives from the streamer + # Allow the user to terminate the response with + # a keyboard interrupt (ctrl+c) + try: + print("LLM: ", end="") + for new_text in streamer: + print(new_text, end="") + sys.stdout.flush() + + except KeyboardInterrupt: + stop_event.set() + + print() + + thread.join() + + +if __name__ == "__main__": + main() diff --git a/examples/lemonade/leap_basic.py b/examples/lemonade/leap_basic.py index 418cecc..330d271 100644 --- a/examples/lemonade/leap_basic.py +++ b/examples/lemonade/leap_basic.py @@ -10,7 +10,7 @@ from lemonade import leap -model, tokenizer = leap.from_pretrained("facebook/opt-125m", recipe="hf-cpu") +model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu") input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids response = model.generate(input_ids, max_new_tokens=30) diff --git a/examples/lemonade/leap_oga_cpu.py b/examples/lemonade/leap_oga_cpu.py new file mode 100644 index 0000000..9b9d539 --- /dev/null +++ b/examples/lemonade/leap_oga_cpu.py @@ -0,0 +1,18 @@ +""" +This example demonstrates how to use the LEAP API to load a model for +inference on CPU via OnnxRuntime-Genai (OGA) using the oga-cpu recipe, +and then use it to generate the response to a prompt. + +Make sure you have set up your OGA device in your Python environment. 
+See for details: +https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai +""" + +from lemonade import leap + +model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu") + +input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids +response = model.generate(input_ids, max_new_tokens=30) + +print(tokenizer.decode(response[0])) diff --git a/examples/lemonade/leap_oga_cpu_streaming.py b/examples/lemonade/leap_oga_cpu_streaming.py new file mode 100644 index 0000000..b88c05f --- /dev/null +++ b/examples/lemonade/leap_oga_cpu_streaming.py @@ -0,0 +1,36 @@ +""" +This example demonstrates how to use the LEAP API to load a model for +inference on CPU via OnnxRuntime-GenAI using the oga-cpu recipe, and then +use a thread to generate a streaming response to a prompt. + +Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer, +i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid. + +Make sure you have set up your OGA device in your Python environment. +See for details: +https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai +""" + +from threading import Thread +from lemonade import leap +from lemonade.tools.ort_genai.oga import OrtGenaiStreamer + +model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu") + +input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids + +streamer = OrtGenaiStreamer(tokenizer) +generation_kwargs = { + "input_ids": input_ids, + "streamer": streamer, + "max_new_tokens": 30, +} + +thread = Thread(target=model.generate, kwargs=generation_kwargs) +thread.start() + +# Generate the response using streaming +for new_text in streamer: + print(new_text) + +thread.join() diff --git a/examples/lemonade/leap_oga_hybrid.py b/examples/lemonade/leap_oga_hybrid.py new file mode 100644 index 0000000..da47e2c --- /dev/null +++ b/examples/lemonade/leap_oga_hybrid.py @@ -0,0 +1,20 @@ +""" +This example demonstrates how to use the LEAP API to load a model for +inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-Genai (OGA) +using the oga-hybrid recipe, and then use it to generate the response to a prompt. + +Make sure you have set up your OGA device in your Python environment. +See for details: +https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai +""" + +from lemonade import leap + +model, tokenizer = leap.from_pretrained( + "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid" +) + +input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids +response = model.generate(input_ids, max_new_tokens=30) + +print(tokenizer.decode(response[0])) diff --git a/examples/lemonade/leap_oga_hybrid_streaming.py b/examples/lemonade/leap_oga_hybrid_streaming.py new file mode 100644 index 0000000..0b133f1 --- /dev/null +++ b/examples/lemonade/leap_oga_hybrid_streaming.py @@ -0,0 +1,39 @@ +""" +This example demonstrates how to use the LEAP API to load a model for +inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-GenAI +using the oga-hybrid recipe, and then use a thread to generate a streaming +response to a prompt. + +Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer, +i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
+
+Make sure you have set up your OGA device in your Python environment.
+See for details:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai
+"""
+
+from threading import Thread
+from lemonade import leap
+from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
+
+model, tokenizer = leap.from_pretrained(
+    "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid"
+)
+
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
+
+streamer = OrtGenaiStreamer(tokenizer)
+generation_kwargs = {
+    "input_ids": input_ids,
+    "streamer": streamer,
+    "max_new_tokens": 30,
+}
+
+thread = Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()
+
+# Generate the response using streaming
+for new_text in streamer:
+    print(new_text)
+
+thread.join()
diff --git a/examples/lemonade/leap_oga_igpu.py b/examples/lemonade/leap_oga_igpu.py
new file mode 100644
index 0000000..5891e45
--- /dev/null
+++ b/examples/lemonade/leap_oga_igpu.py
@@ -0,0 +1,18 @@
+"""
+This example demonstrates how to use the LEAP API to load a model for
+inference on integrated GPUs (iGPUs) via OnnxRuntime-Genai (OGA)
+using the oga-igpu recipe, and then use it to generate the response to a prompt.
+
+Make sure you have set up your OGA device in your Python environment.
+See for details:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai
+"""
+
+from lemonade import leap
+
+model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-igpu")
+
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
+response = model.generate(input_ids, max_new_tokens=30)
+
+print(tokenizer.decode(response[0]))
diff --git a/examples/lemonade/leap_oga_igpu_streaming.py b/examples/lemonade/leap_oga_igpu_streaming.py
new file mode 100644
index 0000000..5841693
--- /dev/null
+++ b/examples/lemonade/leap_oga_igpu_streaming.py
@@ -0,0 +1,39 @@
+"""
+This example demonstrates how to use the LEAP API to load a model for
+inference on integrated GPUs (iGPUs) via OnnxRuntime-GenAI using the oga-igpu recipe,
+and then use a thread to generate a streaming response to a prompt.
+
+Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
+i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
+
+Make sure you have set up your OGA device in your Python environment.
+See for details:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai
+"""
+
+from threading import Thread
+from lemonade import leap
+from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
+
+model, tokenizer = leap.from_pretrained(
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    recipe="oga-igpu",
+)
+
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
+
+streamer = OrtGenaiStreamer(tokenizer)
+generation_kwargs = {
+    "input_ids": input_ids,
+    "streamer": streamer,
+    "max_new_tokens": 30,
+}
+
+thread = Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()
+
+# Generate the response using streaming
+for new_text in streamer:
+    print(new_text)
+
+thread.join()
diff --git a/examples/lemonade/leap_oga_npu.py b/examples/lemonade/leap_oga_npu.py
new file mode 100644
index 0000000..d162b8d
--- /dev/null
+++ b/examples/lemonade/leap_oga_npu.py
@@ -0,0 +1,21 @@
+"""
+This example demonstrates how to use the LEAP API to load a model for
+inference on Ryzen AI NPU via OnnxRuntime-Genai (OGA) using the oga-npu recipe,
+and then use it to generate the response to a prompt.
+
+Make sure you have set up your OGA device in your Python environment.
+See for details:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai
+"""
+
+from lemonade import leap
+
+model, tokenizer = leap.from_pretrained(
+    "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+    recipe="oga-npu",
+)
+
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
+response = model.generate(input_ids, max_new_tokens=30)
+
+print(tokenizer.decode(response[0]))
diff --git a/examples/lemonade/leap_oga_npu_streaming.py b/examples/lemonade/leap_oga_npu_streaming.py
new file mode 100644
index 0000000..1b5e396
--- /dev/null
+++ b/examples/lemonade/leap_oga_npu_streaming.py
@@ -0,0 +1,39 @@
+"""
+This example demonstrates how to use the LEAP API to load a model for
+inference on Ryzen AI NPU via OnnxRuntime-GenAI using the oga-npu recipe,
+and then use a thread to generate a streaming response to a prompt.
+
+Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
+i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
+
+Make sure you have set up your OGA device in your Python environment.
+See for details:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install-onnxruntime-genai
+"""
+
+from threading import Thread
+from lemonade import leap
+from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
+
+model, tokenizer = leap.from_pretrained(
+    "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+    recipe="oga-npu",
+)
+
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
+
+streamer = OrtGenaiStreamer(tokenizer)
+generation_kwargs = {
+    "input_ids": input_ids,
+    "streamer": streamer,
+    "max_new_tokens": 30,
+}
+
+thread = Thread(target=model.generate, kwargs=generation_kwargs)
+thread.start()
+
+# Generate the response using streaming
+for new_text in streamer:
+    print(new_text)
+
+thread.join()
diff --git a/examples/lemonade/leap_streaming.py b/examples/lemonade/leap_streaming.py
index e2951db..2c13b1b 100644
--- a/examples/lemonade/leap_streaming.py
+++ b/examples/lemonade/leap_streaming.py
@@ -4,17 +4,14 @@ generate a streaming the response to a prompt.
Note: this approach only works with recipes that support TextIteratorStreamer, -i.e., huggingface-based recipes such as hf-cpu and ryzenai-npu. +i.e., huggingface-based recipes such as hf-cpu and hf-dgpu. """ -from thread import Thread +from threading import Thread from transformers import TextIteratorStreamer from lemonade import leap -# Replace the recipe with "ryzenai-npu" to run on the RyzenAI NPU -model, tokenizer = leap.from_pretrained( - "meta-llama/Llama-2-7b-chat-hf", recipe="hf-cpu" -) +model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu") input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids diff --git a/examples/readme.md b/examples/readme.md index b9dcb05..4d21f92 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -1,6 +1,7 @@ # Tool Examples -This directory contains examples to help you learn how to use the tools. The examples are split up into two sub-directories: -1. `examples/cli`: a tutorial series for the `turnkey` CLI. This is the recommended starting point. -1. `examples/api`: scripts that demonstrate how to use the `turnkey.evaluate_files()` API. -1. `examples/llm`: scripts that demonstrate the `lemonade` CLI for LLMs. +This directory contains examples to help you learn how to use the tools. The examples are split up into these sub-directories: +1. `examples/lemonade`: scripts that demonstrate the `lemonade` CLI for LLMs. +1. `examples/turnkey/cli`: a tutorial series for the `turnkey` CLI. This is the recommended starting point. +1. `examples/turnkey/api`: scripts that demonstrate how to use the `turnkey.evaluate_files()` API. + diff --git a/examples/turnkey/cli/onnx/README.md b/examples/turnkey/cli/onnx/README.md index d8af31c..6f4c62c 100644 --- a/examples/turnkey/cli/onnx/README.md +++ b/examples/turnkey/cli/onnx/README.md @@ -3,7 +3,7 @@ You can try out `turnkey` with an ONNX file input like so: ```bash -cd turnkeyml/examples/cli/onnx +cd turnkeyml/examples/turnkey/cli/onnx turnkey -i hello_world.onnx load-onnx convert-fp16` ``` diff --git a/examples/turnkey/cli/readme.md b/examples/turnkey/cli/readme.md index 5c0e7cd..011f73f 100644 --- a/examples/turnkey/cli/readme.md +++ b/examples/turnkey/cli/readme.md @@ -2,6 +2,6 @@ This folder contains artifacts that can be used to help learn the `turnkey` CLI. See the docstring in each Python script for more information about how it can be used. -- [`onnx/`](https://github.com/onnx/turnkeyml/blob/main/examples/cli/onnx/README.md): example input for the `turnkey load-onnx` tool. -- [`plugins/`](https://github.com/onnx/turnkeyml/blob/main/examples/cli/plugins/README.md): example plugin implementation code. -- [`scripts/`](https://github.com/onnx/turnkeyml/blob/main/examples/cli/scripts/README.md): example scripts for use with the `turnkey discover` tool. +- [`onnx/`](https://github.com/onnx/turnkeyml/blob/main/examples/turnkey/cli/onnx/README.md): example input for the `turnkey load-onnx` tool. +- [`plugins/`](https://github.com/onnx/turnkeyml/blob/main/examples/turnkey/cli/plugins/README.md): example plugin implementation code. +- [`scripts/`](https://github.com/onnx/turnkeyml/blob/main/examples/turnkey/cli/scripts/README.md): example scripts for use with the `turnkey discover` tool. 
diff --git a/examples/turnkey/cli/scripts/README.md b/examples/turnkey/cli/scripts/README.md index 6f873f3..17d53fd 100644 --- a/examples/turnkey/cli/scripts/README.md +++ b/examples/turnkey/cli/scripts/README.md @@ -3,7 +3,7 @@ You can try out `turnkey` with PyTorch models, via the `discover` tool, like so: ```bash -cd turnkeyml/examples/cli/scripts +cd turnkeyml/examples/turnkey/cli/scripts turnkey -i hello_world.py discover export-pytorch ``` This will discover the model within `hello_world.py` and export it to ONNX. diff --git a/models/readme.md b/models/readme.md index 1ab740b..4bb4f14 100644 --- a/models/readme.md +++ b/models/readme.md @@ -1,6 +1,6 @@ # TurnkeyML Models -This directory contains the TurnkeyML models, which is a large collection of models that can be evaluated using the [`turnkey` CLI tool](https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md). +This directory contains the TurnkeyML models, which is a large collection of models that can be evaluated using the [`turnkey` CLI tool](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md). ## Table of Contents @@ -35,8 +35,8 @@ The corpora are: Before running the benchmark we suggest you: 1. Install the `turnkey` package by following the [install instructions](https://github.com/onnx/turnkeyml/tree/main/docs/install.md). -1. Go through some [`turnkey` CLI examples](https://github.com/onnx/turnkeyml/tree/main/examples/cli/readme.md). -1. Familiarize yourself with the [`turnkey` CLI tool](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey_user_guide.md) documentation. +1. Go through some [`turnkey` CLI examples](https://github.com/onnx/turnkeyml/tree/main/examples/turnkey/cli/readme.md). +1. Familiarize yourself with the [`turnkey` CLI tool](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md) documentation. You must also run the following command to install all of the models' dependencies into your Python environment. diff --git a/plugins/devices/README.md b/plugins/devices/README.md index 862dbcd..e8bab11 100644 --- a/plugins/devices/README.md +++ b/plugins/devices/README.md @@ -137,7 +137,7 @@ To add a runtime to this plugin: - Each supported part within a device family must be defined as a dictionary. - Each supported configuration within a device model must be defined as a list. - Example: `"supported_devices": {"family1":{"part1":["config1","config2"]}}`. - - See [example_combined](https://github.com/onnx/turnkeyml/tree/main/examples/cli/plugins/example_combined) for a plugin implementation example that leverages this feature. + - See [example_combined](https://github.com/onnx/turnkeyml/tree/main/examples/turnkey/cli/plugins/example_combined) for a plugin implementation example that leverages this feature. - Note: If a device is already supported by the tools, this simply adds support for another runtime to that device. If the device is _not_ already supported by the tools, this also adds support for that device and it will start to appear as an option for the `turnkey --device ` argument. - `"build_required": Bool`: indicates whether the `build_model()` API should be called on the `model` and `inputs`. - `"docker_required": Bool`: indicates whether benchmarking is implemented through a docker container. 
@@ -190,7 +190,7 @@ implements = { ### Runtime Class -A runtime class inherits the abstract base class [`BaseRT`](https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/run/basert.py) and implements a one or more [runtimes](#runtime) to provide benchmarking support for one or more [devices](https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md#devices). +A runtime class inherits the abstract base class [`BaseRT`](https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/run/basert.py) and implements a one or more [runtimes](#runtime) to provide benchmarking support for one or more [devices](https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md#devices). `BaseRT` has 4 methods that plugin developers must overload: - `_setup()`: any code that should be called prior to benchmarking as a one-time setup. Called automatically at the end of `BaseRT.__init__()`. diff --git a/setup.py b/setup.py index 199a984..346acb4 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ "turnkeyml_models.torch_hub", "turnkeyml_models.torchvision", "turnkeyml_models.transformers", + "lemonade_install", ], install_requires=[ "invoke>=2.0.0", @@ -47,6 +48,7 @@ "wmi", "pytz", "tqdm", + "matplotlib", # Conditional dependencies for ONNXRuntime backends "onnxruntime >=1.10.1;platform_system=='Linux' and extra != 'llm-oga-cuda'", "onnxruntime-directml >=1.19.0;platform_system=='Windows' and extra != 'llm-oga-cuda'", @@ -102,6 +104,7 @@ "turnkey=turnkeyml:turnkeycli", "turnkey-llm=lemonade:lemonadecli", "lemonade=lemonade:lemonadecli", + "lemonade-install=lemonade_install:installcli", ] }, python_requires=">=3.8, <3.12", diff --git a/src/lemonade/cache.py b/src/lemonade/cache.py index 614e7a7..cab7399 100644 --- a/src/lemonade/cache.py +++ b/src/lemonade/cache.py @@ -32,6 +32,8 @@ class Keys: PROMPT_TOKENS = "prompt_tokens" RESPONSE = "response" RESPONSE_TOKENS = "response_tokens" + RESPONSE_LENGTHS_HISTOGRAM = "response_lengths_histogram" CACHE_DIR = "cache_dir" DEVICE = "device" OGA_MODELS_SUBFOLDER = "oga_models_subfolder" + MEMORY_USAGE_PLOT = "memory_usage_plot" diff --git a/src/lemonade/cli.py b/src/lemonade/cli.py index a58377a..34d38bb 100644 --- a/src/lemonade/cli.py +++ b/src/lemonade/cli.py @@ -94,6 +94,19 @@ def main(): action="store_true", ) + parser.add_argument( + "-m", + "--memory", + nargs="?", + metavar="TRACK_INTERVAL", + type=float, + default=None, + const=0.25, + help="Track physical memory usage during the build and generate a plot when the " + "command completes. Optionally, specify the tracking interval (sec), " + "defaults to 0.25 sec.", + ) + global_args, tool_instances, evaluation_tools = cli.parse_tools(parser, tools) if len(evaluation_tools) > 0: @@ -119,6 +132,7 @@ def main(): sequence.launch( state, lean_cache=global_args["lean_cache"], + track_memory_interval=global_args["memory"], ) else: # Run the management tools diff --git a/src/lemonade/leap.py b/src/lemonade/leap.py index da9342f..dc68ace 100644 --- a/src/lemonade/leap.py +++ b/src/lemonade/leap.py @@ -28,34 +28,12 @@ def _make_state(recipe, checkpoint) -> Dict: return State(cache_dir=cache.DEFAULT_CACHE_DIR, build_name=f"{checkpoint}_{recipe}") -class HuggingfaceCudaTokenizer(TokenizerAdapter): - """ - Wrap the Huggingface tokenizer class by sending the encoded - tokenizer inputs to the dGPU. - - This allows LEAP recipes to be fungible by saving the user the - additional step of managing the input's device location. 
- """ - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def __call__(self, prompt, **kwargs): - return self.tokenizer(prompt, **kwargs).to(device="cuda") - - def decode(self, response, **kwargs): - return self.tokenizer.decode(response, **kwargs) - - def from_pretrained( checkpoint: str, recipe: str = "hf-cpu", ) -> Tuple[ModelAdapter, TokenizerAdapter]: """ - Load an LLM and the corresponding tokenizer using a bespoke lemonade recipe. - - Not all recipes are available with all checkpoints. A leap.NotSupported exception - will be raised in these cases. + Load an LLM and the corresponding tokenizer using a lemonade recipe. Args: - checkpoint: huggingface checkpoint that defines the LLM @@ -64,8 +42,7 @@ def from_pretrained( Recipe choices: - hf-cpu: Huggingface Transformers implementation for CPU with max-perf settings - hf-dgpu: Huggingface Transformers implementation on dGPU (via device="cuda") - - dml-og-igpu: DirectML implementation for iGPU based on onnxruntime-genai - - ryzenai-npu: RyzenAI implementation of huggingface transformers PyTorch model + - oga-dml: DirectML implementation for iGPU based on onnxruntime-genai Returns: - model: LLM instance with a generate() method that invokes the recipe @@ -105,42 +82,31 @@ def from_pretrained( device="cuda", ) - # Wrap the tokenizer to ensure that inputs are placed on the dGPU device - tokenizer = HuggingfaceCudaTokenizer(state.tokenizer) - - return state.model, tokenizer + return state.model, state.tokenizer - elif recipe == "oga-dml-igpu": + elif recipe.startswith("oga-"): import lemonade.tools.ort_genai.oga as oga + # Make sure the user chose a supported runtime, e.g., oga-cpu + user_backend = recipe.split("oga-")[1] + supported_backends = ["cpu", "igpu", "npu", "hybrid", "cuda"] + supported_recipes = [f"oga-{backend}" for backend in supported_backends] + if recipe not in supported_recipes: + raise NotSupported( + "Selected OGA recipe is not supported. 
" + f"The supported OGA recipes are: {supported_recipes}" + ) + state = _make_state(recipe, checkpoint) state = oga.OgaLoad().run( state, input=checkpoint, - device="igpu", + device=user_backend, dtype="int4", ) return state.model, state.tokenizer - elif recipe == "ryzenai-npu": - if ( - checkpoint != "TheBloke/Llama-2-7b-Chat-fp16" - and checkpoint != "meta-llama/Llama-2-7b-chat-hf" - and checkpoint != "microsoft/Phi-3-mini-4k-instruct" - and checkpoint != "meta-llama/Meta-Llama-3-8B-Instruct" - and checkpoint != "meta-llama/Meta-Llama-3-8B" - ): - _raise_not_supported(recipe, checkpoint) - - import lemonade.tools.ryzenai_npu.ryzenai_npu as ryzenai_npu - - state = _make_state(recipe, checkpoint) - - state = ryzenai_npu.RyzenAINPULoad().run(state, checkpoint, device="phx") - - return state.model, state.tokenizer - else: _raise_not_supported(recipe, checkpoint) diff --git a/src/lemonade/tools/chat.py b/src/lemonade/tools/chat.py index 87c855e..9519c78 100644 --- a/src/lemonade/tools/chat.py +++ b/src/lemonade/tools/chat.py @@ -1,4 +1,5 @@ import argparse +import os import time import statistics from threading import Thread, Event @@ -9,6 +10,8 @@ from pydantic import BaseModel from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList import uvicorn +import matplotlib.pyplot as plt +import turnkeyml.common.build as build from turnkeyml.state import State from turnkeyml.tools import Tool from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter @@ -27,7 +30,24 @@ def sanitize_string(input_string): - return input_string.encode("utf-8", "ignore").decode("utf-8") + return input_string.encode("charmap", "ignore").decode("charmap") + + +def sanitize_text(text): + if isinstance(text, str): + return sanitize_string(text) + elif isinstance(text, list): + return [sanitize_string(item) for item in text] + else: + raise TypeError("Input must be a string or a list of strings.") + + +def positive_int(x): + """Conversion function for argparse""" + i = int(x) + if i < 1: + raise ValueError("Non-positive values are not allowed") + return i class LLMPrompt(Tool): @@ -53,6 +73,7 @@ def __init__(self): Keys.PROMPT, Keys.RESPONSE_TOKENS, Keys.RESPONSE, + Keys.RESPONSE_LENGTHS_HISTOGRAM, ] @staticmethod @@ -62,7 +83,14 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: add_help=add_help, ) - parser.add_argument("--prompt", "-p", help="Prompt input to the LLM") + parser.add_argument( + "--prompt", + "-p", + help="Input prompt to the LLM. Two formats are supported. 
" + "1) str: use a user-provided prompt string " + "2) path/to/prompt.txt: load the prompt from a .txt file.", + required=True, + ) parser.add_argument( "--max-new-tokens", @@ -72,44 +100,115 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: help="Maximum number of new tokens in the response", ) + parser.add_argument( + "--n-trials", + "-n", + default=1, + type=positive_int, + help="Number of responses the LLM will generate for the prompt (useful for testing)", + ) + return parser + def parse(self, state: State, args, known_only=True) -> argparse.Namespace: + """ + Helper function to parse CLI arguments into the args expected + by run() + """ + + parsed_args = super().parse(state, args, known_only) + + # Decode prompt arg into a string prompt + if parsed_args.prompt.endswith(".txt") and os.path.exists(parsed_args.prompt): + with open(parsed_args.prompt, "r", encoding="utf-8") as f: + parsed_args.prompt = f.read() + else: + # No change to the prompt + pass + + if parsed_args.n_trials < 1: + raise ValueError("N_TRIALS should be a positive number") + + return parsed_args + def run( self, state: State, prompt: str = "Hello", max_new_tokens: int = 512, + n_trials: int = 1, ) -> State: model: ModelAdapter = state.model tokenizer: TokenizerAdapter = state.tokenizer input_ids = tokenizer(prompt, return_tensors="pt").input_ids - if isinstance(input_ids, list): + if isinstance(input_ids, (list, str)): # OGA models return a list of tokens + # Our llama.cpp adapter returns a string len_tokens_in = len(input_ids) else: # HF models return a 2-D tensor len_tokens_in = input_ids.shape[1] - response = model.generate( - input_ids, max_new_tokens=max_new_tokens, **DEFAULT_GENERATE_PARAMS - ) - len_tokens_out = len(response[0]) - len_tokens_in - input_ids = input_ids if isinstance(input_ids, list) else input_ids[0] - i = 0 - while i < len_tokens_in and input_ids[i] == response[0][i]: - i += 1 - response_text = tokenizer.decode( - response[0][i:], skip_special_tokens=True - ).strip() + len_tokens_out = [] + response_texts = [] + for trial in range(n_trials): + if n_trials > 1: + self.set_percent_progress(100.0 * trial / n_trials) + + # Get the response from the LLM, which may include the prompt in it + response = model.generate( + input_ids, max_new_tokens=max_new_tokens, **DEFAULT_GENERATE_PARAMS + ) + + # Flatten the input and response + input_ids_array = ( + input_ids if isinstance(input_ids, (list, str)) else input_ids[0] + ) + response_array = response if isinstance(response, str) else response[0] + + # Separate the prompt from the response + len_tokens_out.append(len(response_array) - len_tokens_in) + + input_token = 0 + while ( + input_token < len_tokens_in + and input_ids_array[input_token] == response_array[input_token] + ): + input_token += 1 + + # Only decode the actual response (not the prompt) + response_text = tokenizer.decode( + response_array[input_token:], skip_special_tokens=True + ).strip() + response_texts.append(response_text) - state.response = response_text + state.response = response_texts + + if n_trials == 1: + len_tokens_out = len_tokens_out[0] + response_texts = response_texts[0] + else: + self.set_percent_progress(None) + + # Plot data + plt.figure() + plt.hist(len_tokens_out, bins=20) + plt.xlabel("Response Length (tokens)") + plt.ylabel("Frequency") + plt.title(f"Histogram of Response Lengths\n{state.build_name}") + figure_path = os.path.join( + build.output_dir(state.cache_dir, state.build_name), + "response_lengths.png", + ) + plt.savefig(figure_path) + 
state.save_stat(Keys.RESPONSE_LENGTHS_HISTOGRAM, figure_path) state.save_stat(Keys.PROMPT_TOKENS, len_tokens_in) state.save_stat(Keys.PROMPT, prompt) state.save_stat(Keys.RESPONSE_TOKENS, len_tokens_out) - state.save_stat(Keys.RESPONSE, sanitize_string(response_text)) + state.save_stat(Keys.RESPONSE, sanitize_text(response_texts)) return state diff --git a/src/lemonade/tools/huggingface_load.py b/src/lemonade/tools/huggingface_load.py index 617a4b3..8b48ec6 100644 --- a/src/lemonade/tools/huggingface_load.py +++ b/src/lemonade/tools/huggingface_load.py @@ -39,7 +39,11 @@ def __init__(self, tokenizer: transformers.AutoTokenizer, device: str): self.device = device def __call__(self, prompt, **kwargs): - return self.tokenizer(prompt, **kwargs).to(self.device) + tokens = self.tokenizer(prompt, **kwargs) + if self.device: + return tokens.to(self.device) + else: + return tokens def decode(self, response, **kwargs): return self.tokenizer.decode(response, **kwargs) diff --git a/src/lemonade/tools/humaneval.py b/src/lemonade/tools/humaneval.py index c9433a3..01ce2f0 100644 --- a/src/lemonade/tools/humaneval.py +++ b/src/lemonade/tools/humaneval.py @@ -26,7 +26,7 @@ class AccuracyHumaneval(Tool): - pass@10: Percentage of problems solved within 10 generation attempts - pass@100: Percentage of problems solved within 100 generation attempts - See docs/humaneval_accuracy.md for more details + See docs/lemonade/humaneval_accuracy.md for more details """ unique_name = "accuracy-humaneval" diff --git a/src/lemonade/tools/mmlu.py b/src/lemonade/tools/mmlu.py index 33abfcb..94a75da 100644 --- a/src/lemonade/tools/mmlu.py +++ b/src/lemonade/tools/mmlu.py @@ -30,7 +30,7 @@ def min_handle_none(*args: int): class AccuracyMMLU(Tool): """ - See docs/mmlu_accuracy.md for more details + See docs/lemonade/mmlu_accuracy.md for more details """ unique_name = "accuracy-mmlu" diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py index db3d68e..818a340 100644 --- a/src/lemonade/tools/ort_genai/oga.py +++ b/src/lemonade/tools/ort_genai/oga.py @@ -29,6 +29,8 @@ PassthroughTokenizerResult, ) from lemonade.cache import Keys +from lemonade_install.install import DEFAULT_AMD_OGA_NPU_DIR, DEFAULT_AMD_OGA_HYBRID_DIR + # ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder oga_models_path = "oga_models" @@ -232,8 +234,8 @@ class OgaLoad(FirstTool): Input: path to a checkpoint. Supported choices for cpu and igpu from HF model repository: LLM models on Huggingface supported by model_builder. See documentation - (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported - models. + (https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md) + for supported models. Supported choices for npu from HF model repository: Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern Local models for cpu, igpu, or npu: @@ -402,15 +404,19 @@ def run( oga_models_subfolder = None if device == "hybrid": - # Locate the directory containing hybrid-llm-artifacts_1.3.0 in the system PATH - hybrid_artifacts_path = None - hybrid_artifacts_path = os.environ.get("AMD_OGA_HYBRID") - - if hybrid_artifacts_path is None: - raise RuntimeError( - "Could not find hybrid-llm-artifacts_1.3.0 in system PATH. " - "Please ensure it is added to your PATH environment variable." 
+ # Locate the directory containing hybrid-llm-artifacts_1.3.0 + if os.path.exists(DEFAULT_AMD_OGA_HYBRID_DIR): + hybrid_artifacts_path = os.path.join( + DEFAULT_AMD_OGA_HYBRID_DIR, "hybrid-llm-artifacts_1.3.0" ) + else: + if "AMD_OGA_HYBRID" not in os.environ: + raise RuntimeError( + "Could not find hybrid-llm-artifacts_1.3.0 in system PATH. " + "Please ensure it is added to your PATH environment variable." + ) + + hybrid_artifacts_path = os.environ.get("AMD_OGA_HYBRID") if hybrid_artifacts_path: # Construct the path to onnx_custom_ops.dll @@ -492,13 +498,17 @@ def run( if not download: # The download only flag is not set, so load model if device == "npu": - if "AMD_OGA" not in os.environ: - raise RuntimeError( - "Please set environment variable AMD_OGA to the path of the amd_oga files" - ) + if os.path.exists(DEFAULT_AMD_OGA_NPU_DIR): + oga_path = os.path.join(DEFAULT_AMD_OGA_NPU_DIR, "amd_oga") + else: + if "AMD_OGA" not in os.environ: + raise RuntimeError( + "Please set environment variable AMD_OGA " + "to the path of the amd_oga files" + ) - # Check AMD_OGA points to oga library files - oga_path = os.environ["AMD_OGA"] + # Check AMD_OGA points to oga library files + oga_path = os.environ["AMD_OGA"] if not os.path.exists( os.path.join(oga_path, "libs", "onnxruntime.dll") ): @@ -512,9 +522,7 @@ def run( # Change to the AMD_OGA distribution directory os.chdir(oga_path) - os.environ["PATH"] += os.pathsep + os.path.join( - os.environ["AMD_OGA"], "libs" - ) + os.environ["PATH"] += os.pathsep + os.path.join(oga_path, "libs") # Common environment variables for all NPU models os.environ["DD_ROOT"] = ".\\bins" diff --git a/src/lemonade/tools/perplexity.py b/src/lemonade/tools/perplexity.py index dd3ebf4..71fa8fa 100644 --- a/src/lemonade/tools/perplexity.py +++ b/src/lemonade/tools/perplexity.py @@ -13,7 +13,6 @@ class AccuracyPerplexity(Tool): """ Measure perplexity of an LLM using the wikitext dataset. - Refer to docs/perplexity.md for more details. Required input state: - state.model: instance that provides a __call__() method that returns @@ -22,7 +21,7 @@ class AccuracyPerplexity(Tool): Output state produced: None - See docs/perplexity.md for more details. + See docs/lemonade/perplexity.md for more details. """ unique_name = "accuracy-perplexity" diff --git a/src/lemonade_install/__init__.py b/src/lemonade_install/__init__.py new file mode 100644 index 0000000..39b1703 --- /dev/null +++ b/src/lemonade_install/__init__.py @@ -0,0 +1 @@ +from .install import main as installcli diff --git a/src/lemonade_install/install.py b/src/lemonade_install/install.py new file mode 100644 index 0000000..6578176 --- /dev/null +++ b/src/lemonade_install/install.py @@ -0,0 +1,211 @@ +""" +Utility that helps users install software. It is structured like a +ManagementTool, however it is not a ManagementTool because it cannot +import any lemonade or turnkey modules in order to avoid any installation +collisions on imported modules. 
+""" + +import argparse +import os +import subprocess +import sys +from typing import Optional +import zipfile +import requests +from pathlib import Path + + +lemonade_install_dir = Path(__file__).parent.parent.parent +DEFAULT_AMD_OGA_NPU_DIR = os.path.join( + lemonade_install_dir, "install", "ryzen_ai", "npu" +) +DEFAULT_AMD_OGA_HYBRID_DIR = os.path.join( + lemonade_install_dir, "install", "ryzen_ai", "hybrid" +) + + +def download_lfs_file(token, file, output_filename): + """Downloads a file from LFS""" + # Set up the headers for the request + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + + response = requests.get( + f"https://api.github.com/repos/aigdat/ryzenai-sw-ea/contents/{file}", + headers=headers, + ) + + # Check if the request was successful + if response.status_code == 200: + # Parse the JSON response to get the download URL + content = response.json() + download_url = content.get("download_url") + + if download_url: + # Download the file from the download URL + file_response = requests.get(download_url) + + # Write the content to a file + with open(output_filename, "wb") as file: + file.write(file_response.content) + else: + print("Download URL not found in the response.") + else: + raise ValueError( + "Failed to fetch the content from GitHub API. " + f"Status code: {response.status_code}, Response: {response.json()}" + ) + + if not os.path.isfile(output_filename): + raise ValueError(f"Error: {output_filename} does not exist.") + + +def unzip_file(zip_path, extract_to): + """Unzips the specified zip file to the given directory.""" + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(extract_to) + + +class LicenseRejected(Exception): + """ + Raise an exception if the user rejects the license prompt. + """ + + +class Install: + """ + Installs the necessary software for specific lemonade features. + """ + + @staticmethod + def parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Installs the necessary software for specific lemonade features", + ) + + parser.add_argument( + "--ryzenai", + help="Install Ryzen AI software for LLMs. Requires an authentication token.", + choices=["npu", "hybrid", None], + ) + + parser.add_argument( + "-y", + "--yes", + action="store_true", + help="Answer 'yes' to all questions. " + "Make sure to review all legal agreements before selecting this option.", + ) + + parser.add_argument( + "--token", + help="Some software requires an authentication token to download. " + "If this argument is not provided, the token can come from an environment " + "variable (e.g., Ryzen AI uses environment variable OGA_TOKEN).", + ) + + return parser + + def run( + self, + ryzenai: Optional[str] = None, + yes: bool = False, + token: Optional[str] = None, + ): + + if ryzenai is not None: + if yes: + print( + "\nYou have accepted the AMD Beta Software End User License Agreement for " + f"Ryzen AI {ryzenai} by providing the `--yes` option. " + "The license file is available for your review at " + # pylint: disable=line-too-long + "https://github.com/aigdat/ryzenai-sw-ea/blob/main/ryzen_ai_13_ga/llm-eula-beta-software.pdf\n" + ) + else: + print( + "\nYou must accept the AMD Beta Software End User License Agreement in " + "order to install this software. 
To continue, type the word yes " + "to assert that you agree and are authorized to agree " + "on behalf of your organization, to the terms and " + "conditions, in the Beta Software End User License Agreement, " + "which terms and conditions may be reviewed, downloaded and " + "printed from this link: " + # pylint: disable=line-too-long + "https://github.com/aigdat/ryzenai-sw-ea/blob/main/ryzen_ai_13_ga/llm-eula-beta-software.pdf\n" + ) + + response = input("Would you like to accept the license (yes/No)? ") + if response.lower() == "yes" or response.lower() == "y": + pass + else: + raise LicenseRejected( + "Exiting because the license was not accepted." + ) + + if ryzenai == "npu": + file = "ryzen_ai_13_ga/npu-llm-artifacts_1.3.0.zip" + install_dir = DEFAULT_AMD_OGA_NPU_DIR + wheels_full_path = os.path.join(install_dir, "amd_oga/wheels") + elif ryzenai == "hybrid": + file = "ryzen_ai_13_ga/hybrid-llm-artifacts_1.3.0.zip" + install_dir = DEFAULT_AMD_OGA_HYBRID_DIR + wheels_full_path = os.path.join( + install_dir, + "hybrid-llm-artifacts_1.3.0/hybrid-llm-artifacts/onnxruntime_genai/wheel", + ) + else: + raise ValueError( + f"Value passed to ryzenai argument is not supported: {ryzenai}" + ) + + archive_file_name = f"oga_{ryzenai}.zip" + archive_file_path = os.path.join(install_dir, archive_file_name) + + if token: + token_to_use = token + else: + token_to_use = os.environ.get("OGA_TOKEN") + + # Retrieve the installation artifacts + os.makedirs(install_dir, exist_ok=True) + print(f"\nDownloading {file} from GitHub LFS to {install_dir}\n") + download_lfs_file(token_to_use, file, archive_file_path) + + # Unzip the file + print(f"\nUnzipping archive {archive_file_path}\n") + unzip_file(archive_file_path, install_dir) + + # Install all whl files in the specified wheels folder + print(f"\nInstalling wheels from {wheels_full_path}\n") + for file in os.listdir(wheels_full_path): + if file.endswith(".whl"): + install_cmd = f"{sys.executable} -m pip install {os.path.join(wheels_full_path, file)}" + + print(f"\nInstalling {file} with command {install_cmd}\n") + + subprocess.run( + install_cmd, + check=True, + shell=True, + ) + + # Delete the zip file + print(f"\nCleaning up, removing {archive_file_path}\n") + os.remove(archive_file_path) + else: + raise ValueError( + "You must select something to install, for example `--ryzenai`" + ) + + +def main(): + installer = Install() + args = installer.parser().parse_args() + installer.run(**args.__dict__) + + +if __name__ == "__main__": + main() diff --git a/src/turnkeyml/cli/cli.py b/src/turnkeyml/cli/cli.py index cbecb25..b6d3232 100644 --- a/src/turnkeyml/cli/cli.py +++ b/src/turnkeyml/cli/cli.py @@ -180,7 +180,7 @@ def main(): description="This utility runs tools in a sequence. " "To use it, provide a list of tools and " "their arguments. 
See " - "https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md " + "https://github.com/onnx/turnkeyml/blob/main/docs/turnkey/tools_user_guide.md " "to learn the exact syntax.\n\nExample: turnkey -i my_model.py discover export-pytorch", formatter_class=NiceHelpFormatter, ) diff --git a/src/turnkeyml/common/filesystem.py b/src/turnkeyml/common/filesystem.py index 3ae640d..df4c3a6 100644 --- a/src/turnkeyml/common/filesystem.py +++ b/src/turnkeyml/common/filesystem.py @@ -190,7 +190,7 @@ def clean_output_dir(cache_dir: str, build_name: str) -> None: raise CacheError(f"No build found at {output_dir}") # Remove files that do not have an allowed extension - allowed_extensions = (".txt", ".out", ".yaml", ".json") + allowed_extensions = (".txt", ".out", ".yaml", ".json", ".png") all_paths = glob.glob(f"{output_dir}/**/*", recursive=True) for path in all_paths: if os.path.isfile(path) and not path.endswith(allowed_extensions): @@ -343,6 +343,9 @@ class Keys: # Prefix for reporting the execution duration of a tool # In the report this will look like tool_duration:TOOL_NAME TOOL_DURATION = "tool_duration" + # Prefix for reporting the peak working memory in the build through this tool + # In the report this will look like tool_memory:TOOL_NAME + TOOL_MEMORY = "tool_memory" # Prefix for reporting the execution status of a tool # In the report this will look like tool_status:TOOL_NAME TOOL_STATUS = "tool_status" @@ -371,6 +374,8 @@ class Keys: CACHE_DIR = "cache_dir" # Example inputs to the model INPUTS = "inputs" + # Path to the file containing the memory usage plot + MEMORY_USAGE_PLOT = "memory_usage_plot" def _clean_logfile(logfile_lines: List[str]) -> List[str]: diff --git a/src/turnkeyml/common/status.py b/src/turnkeyml/common/status.py index 735ff5a..06f58a7 100644 --- a/src/turnkeyml/common/status.py +++ b/src/turnkeyml/common/status.py @@ -1,8 +1,12 @@ import os +import platform +import shutil import sys import math import dataclasses from typing import Callable, List, Union, Dict, Optional +import textwrap +import psutil import torch from turnkeyml.common import printing from turnkeyml.state import State @@ -189,6 +193,13 @@ def _print_build_dir(self, cache_dir: str, build_name: str): self.skip.build_dir = True + def _print_peak_memory(self): + if platform.system() == "Windows": + print( + f"{self.indent}\tPeak memory:\t" + f"{psutil.Process().memory_info().peak_wset / 1024**3:,.3f} GB" + ) + def _print_status(self, cache_dir: str, build_name: str): stats = fs.Stats(cache_dir, build_name) if self.skip.previous_status_message: @@ -214,6 +225,9 @@ def _print_status(self, cache_dir: str, build_name: str): for key in self.stats_keys: max_key_len = max(len(_pretty_print_key(key)), max_key_len) + screen_width = shutil.get_terminal_size().columns + wrap_screen_width = screen_width - 2 + for key in self.stats_keys: nice_key = _pretty_print_key(key) try: @@ -230,9 +244,43 @@ def _print_status(self, cache_dir: str, build_name: str): value_tabs = " " * ( (max_key_len - len(_pretty_print_key(key))) + 1 ) - printing.logn( - f"{self.indent}\t{nice_key}:{value_tabs}{value} {units}" + hanging_indent = ( + len(self.indent) + 8 + len(nice_key) + 1 + len(value_tabs) ) + hanging_indent_str = " " * hanging_indent + if ( + isinstance(value, list) + and len(value) > 0 + and all(isinstance(item, str) for item in value) + ): + # Value is a list of strings, so output each one starting + # on its own line + printing.logn(f"{self.indent}\t{nice_key}:{value_tabs}[") + for line_counter, text in 
enumerate(value): + lines = textwrap.wrap( + "'" + text + "'", + width=wrap_screen_width, + initial_indent=hanging_indent_str, + subsequent_indent=hanging_indent_str, + ) + if line_counter + 1 < len(value): + # Not the last text item in the list, so add a comma + lines[-1] = lines[-1] + "," + for line in lines: + printing.logn(line) + printing.logn(f"{' ' * hanging_indent}] {units}") + else: + # Wrap value as needed + status_str = ( + f"{self.indent}\t{nice_key}:{value_tabs}{value} {units}" + ) + lines = textwrap.wrap( + status_str, + width=wrap_screen_width, + subsequent_indent=hanging_indent_str, + ) + for line in lines: + printing.logn(line) else: printing.logn( f"{self.indent}\t\t\t{nice_key}:\t{value} {units}" @@ -297,6 +345,7 @@ def print( ) self._print_input_shape() self._print_build_dir(cache_dir=cache_dir, build_name=build_name) + self._print_peak_memory() self._print_status(cache_dir=cache_dir, build_name=build_name) print() diff --git a/src/turnkeyml/sequence/sequence.py b/src/turnkeyml/sequence/sequence.py index bd5bd63..7696436 100644 --- a/src/turnkeyml/sequence/sequence.py +++ b/src/turnkeyml/sequence/sequence.py @@ -1,10 +1,16 @@ +import queue import sys import time import os +from multiprocessing import Process, Queue +import platform import copy from datetime import datetime from typing import List, Dict, Optional +import yaml import pytz +import matplotlib.pyplot as plt +import psutil import turnkeyml.common.printing as printing import turnkeyml.common.exceptions as exp import turnkeyml.common.build as build @@ -28,6 +34,95 @@ def _rewind_stdout(lines: int = 1): sys.stdout.flush() +def _get_time_mem_list(process): + """Returns a list containing current time and current process memory usage""" + return [time.time(), process.memory_info().rss] + + +def _memory_tracker(input_queue: Queue, yaml_path, track_memory_interval, track_names): + """ + Tracks memory usage during build and saves to yaml file + """ + memory_tracks = [] + current_track = [] + track_counter = 0 + + try: + parent_process = psutil.Process(pid=os.getppid()) + while ( + track_counter < len(track_names) + and parent_process.status() == psutil.STATUS_RUNNING + ): + + time.sleep(track_memory_interval) + + # Read any messages from the parent process + while track_counter < len(track_names) and not input_queue.empty(): + try: + message = input_queue.get(timeout=0.001) + if message is None: + # Current track is complete + memory_tracks.append( + [track_names[track_counter], current_track] + ) + current_track = [] + track_counter += 1 + else: + # Message is the output of _get_time_mem_list, so add to current track + current_track.append(message) + except queue.Empty: + # input_queue.empty had not been updated + pass + + # Save current time and memory usage + current_track.append(_get_time_mem_list(parent_process)) + + # Save the collected memory tracks + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.dump(memory_tracks, f) + + except psutil.NoSuchProcess: + # If the parent process stopped existing, we can + # safely assume that tracking is no longer needed + # NOTE: this only seems to be needed on Windows + pass + + +def _plot_memory_usage(state: State, memory_tracks): + + # Find final time in the startup track (before first tool) to subtract from all other times + _, track = memory_tracks[0] + t0 = track[-1][0] + + # last_t and last_y are used to draw a line between the last point of the prior + # track and the first point of the current track + last_t = None + last_y = None + + plt.figure() + for k, v 
in memory_tracks[1:]: + t = [x[0] - t0 for x in v] + y = [float(x[1]) / 1024**3 for x in v] + # draw new memory usage track + if last_t is not None: + plt.plot([last_t] + t, [last_y] + y, label=k, marker=".") + else: + plt.plot(t, y, label=k, marker=".") + last_t = t[-1] + last_y = y[-1] + plt.xlabel("Time (sec)") + plt.ylabel("GB") + plt.title(f"Physical Memory Usage\n{state.build_name}") + plt.legend() + plt.grid() + figure_path = os.path.join( + build.output_dir(state.cache_dir, state.build_name), "memory_usage.png" + ) + plt.savefig(figure_path) + printing.log_info(f"Saved plot of memory usage to {figure_path}") + state.save_stat(fs.Keys.MEMORY_USAGE_PLOT, figure_path) + + class Sequence: """ Helper class to launch and manage build tools. @@ -50,6 +145,9 @@ def __init__( """ raise ValueError(msg) + # Save the process (used to get memory usage) + self.process = psutil.Process() + def show_monitor(self, state: State, verbosity: bool): """ Displays the monitor on the terminal. The purpose of the monitor @@ -81,11 +179,26 @@ def _advance_cursor(self, current_tool_name: str): print(cursor_down) + def _get_mem_usage_str(self) -> str: + """ + Returns a string with memory usage for the current process + (non-swapped physical memory). In Windows OS, the peak memory used in the + process is also included. + + Example: '1.100 GB (1.638 GB peak)' + """ + mem_info = self.process.memory_info() + mem_info_str = f"{mem_info.rss / 1024 ** 3:,.3f} GB" + if platform.system() == "Windows": + mem_info_str += f" ({mem_info.peak_wset / 1024 ** 3:,.3f} GB peak)" + return mem_info_str + def launch( self, state: State, lean_cache: bool = False, monitor: Optional[bool] = None, + track_memory_interval: Optional[float] = None, stats_to_save: Optional[Dict] = None, ) -> State: """ @@ -101,6 +214,29 @@ def launch( else: monitor_setting = monitor + # Start tracking memory usage + if track_memory_interval is not None: + # Create queue for passing messages to the tracker + memory_tracker_queue = Queue() + # The yaml file where the memory usage data will be saved + yaml_path = os.path.join( + build.output_dir(state.cache_dir, state.build_name), "memory_usage.yaml" + ) + # The names of each memory track segment + track_names = ["start-up"] + [tool.unique_name for tool in self.tools] + # Create process to continuously update queue + memory_tracker_process = Process( + target=_memory_tracker, + args=( + memory_tracker_queue, + yaml_path, + track_memory_interval, + track_names, + ), + ) + memory_tracker_process.start() + memory_tracker_queue.put(_get_time_mem_list(self.process)) + # Create a build directory in the cache fs.make_build_dir(state.cache_dir, state.build_name) @@ -152,17 +288,26 @@ def launch( for tool in self.tools: state.save_stat(tool.status_key, build.FunctionStatus.NOT_STARTED) state.save_stat(tool.duration_key, "-") + state.save_stat(tool.memory_key, "-") # Save any additional stats passed in via arguments if stats_to_save: for stat_key, stat_value in stats_to_save.items(): state.save_stat(stat_key, stat_value) + # Save initial memory and create dict for tracking memory usage + state.save_stat(f"{fs.Keys.TOOL_MEMORY}:__init__", self._get_mem_usage_str()) + # Run the build saved_exception = None for tool, argv in self.tools.items(): + start_time = time.time() + # Insert None into memory tracker queue before new tool starts + if track_memory_interval is not None: + memory_tracker_queue.put(None) + try: # Set status as incomplete, since tool just started @@ -240,21 +385,41 @@ def launch( execution_time = 
time.time() - start_time state.save_stat(tool.duration_key, execution_time) + # Store current memory and peak working memory + state.save_stat(tool.memory_key, self._get_mem_usage_str()) + if track_memory_interval is not None: + # sample each tool at least once + memory_tracker_queue.put(_get_time_mem_list(self.process)) + + # Send final None to memory_tracker so that is stops ands saves data to file + if track_memory_interval is not None: + memory_tracker_queue.put(None) + if not saved_exception: state.build_status = build.FunctionStatus.SUCCESSFUL state.save_stat(fs.Keys.BUILD_STATUS, build.FunctionStatus.SUCCESSFUL) - if vars(state).get("invocation_info"): state.invocation_info.status_message = ( f"Successful build! {state.invocation_info.extra_status}" ) state.invocation_info.status_message_color = printing.Colors.OKGREEN + if track_memory_interval is not None: + # Wait for memory tracker to finish writing yaml data file + while memory_tracker_process.is_alive(): + memory_tracker_process.join(timeout=1.0) + if os.path.exists(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as f: + memory_tracks = yaml.safe_load(f) + _plot_memory_usage(state, memory_tracks) + if vars(state).get("models_found") and vars(state).get("invocation_info"): # Present status statistics from the tools for tool in self.tools: state.invocation_info.stats_keys += tool.status_stats + if track_memory_interval is not None: + state.invocation_info.stats_keys += [fs.Keys.MEMORY_USAGE_PLOT] print() diff --git a/src/turnkeyml/tools/discovery/discover.py b/src/turnkeyml/tools/discovery/discover.py index 45e5b30..9ae8c70 100644 --- a/src/turnkeyml/tools/discovery/discover.py +++ b/src/turnkeyml/tools/discovery/discover.py @@ -33,7 +33,7 @@ class Discover(FirstTool): e.g., model(**inputs) You can learn more about how discovery and its arguments work at - https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md + https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/tools_user_guide.md """ unique_name = "discover" diff --git a/src/turnkeyml/tools/tool.py b/src/turnkeyml/tools/tool.py index 85f80f6..3a1fa35 100644 --- a/src/turnkeyml/tools/tool.py +++ b/src/turnkeyml/tools/tool.py @@ -33,12 +33,12 @@ def _spinner(message, q: Queue): while parent_process.status() == psutil.STATUS_RUNNING: for cursor in [" ", ". ", ".. ", "..."]: time.sleep(sleep_time) - if not q.empty(): + while not q.empty(): percent_complete = q.get() if percent_complete is not None: status = f" {message} ({percent_complete:.1f}%){cursor}\r" else: - status = f" {message}{cursor}\r" + status = f" {message}{cursor} \r" sys.stdout.write(status) sys.stdout.flush() except psutil.NoSuchProcess: @@ -139,7 +139,7 @@ def helpful_parser(cls, short_description: str, **kwargs): "part of a sequence of Tools, for example: `turnkey -i INPUTS tool-one " "tool-two tool-three`. Tools communicate data to each other via State. 
" "You can learn more at " - "https://github.com/onnx/turnkeyml/blob/main/docs/tools_user_guide.md" + "https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/tools_user_guide.md" ) return ToolParser( @@ -174,11 +174,15 @@ def status_line(self, successful, verbosity): elif successful: # Print success message printing.log(f" {success_tick} ", c=printing.Colors.OKGREEN) - printing.logn(self.monitor_message + progress_indicator + " ") + printing.logn( + self.monitor_message + progress_indicator + " " + ) else: # successful == False, print failure message printing.log(f" {fail_tick} ", c=printing.Colors.FAIL) - printing.logn(self.monitor_message + progress_indicator + " ") + printing.logn( + self.monitor_message + progress_indicator + " " + ) def __init__( self, @@ -189,6 +193,7 @@ def __init__( self.status_key = f"{fs.Keys.TOOL_STATUS}:{self.__class__.unique_name}" self.duration_key = f"{fs.Keys.TOOL_DURATION}:{self.__class__.unique_name}" + self.memory_key = f"{fs.Keys.TOOL_MEMORY}:{self.__class__.unique_name}" self.monitor_message = monitor_message self.progress = None self.progress_queue = None @@ -222,8 +227,10 @@ def set_percent_progress(self, percent_progress: float): know how much progress the Tool has made. """ - if not isinstance(percent_progress, float): - raise ValueError(f"Input argument must be a float, got {percent_progress}") + if percent_progress is not None and not isinstance(percent_progress, float): + raise ValueError( + f"Input argument must be a float or None, got {percent_progress}" + ) if self.progress_queue: self.progress_queue.put(percent_progress) @@ -254,7 +261,11 @@ def parse(self, state: State, args, known_only=True) -> argparse.Namespace: return parsed_args def parse_and_run( - self, state: State, args, monitor: bool = False, known_only=True + self, + state: State, + args, + monitor: bool = False, + known_only=True, ) -> Dict: """ Helper function to parse CLI arguments into the args expected diff --git a/test/lemonade/llm_api.py b/test/lemonade/llm_api.py index 3f57b27..a8f4b17 100644 --- a/test/lemonade/llm_api.py +++ b/test/lemonade/llm_api.py @@ -1,6 +1,7 @@ import unittest import shutil import os +import sys import urllib3 import platform import zipfile @@ -20,44 +21,26 @@ # Configure logging logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) ci_mode = os.getenv("LEMONADE_CI_MODE", False) -# Get cache directory from environment or create a new one -cache_dir = os.getenv('LEMONADE_CACHE_DIR') -if not cache_dir: - cache_dir, _ = common.create_test_dir("lemonade_api") - os.environ['LEMONADE_CACHE_DIR'] = cache_dir - -logger.info(f"Using cache directory: {cache_dir}") - -try: - url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" - resp = urllib3.request("GET", url, preload_content=False) - if 200 <= resp.status < 400: - eecs_berkeley_edu_cannot_be_reached = False - else: - eecs_berkeley_edu_cannot_be_reached = True - resp.release_conn() -except urllib3.exceptions.HTTPError: - eecs_berkeley_edu_cannot_be_reached = True - def download_llamacpp_binary(): """Download the appropriate llama.cpp binary for the current platform""" logger.info("Starting llama.cpp binary download...") - + # Get latest release info releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest" try: response = requests.get(releases_url) response.raise_for_status() latest_release = response.json() - 
logger.info(f"Found latest release: {latest_release.get('tag_name', 'unknown')}") + logger.info( + f"Found latest release: {latest_release.get('tag_name', 'unknown')}" + ) except Exception as e: logger.error(f"Failed to fetch latest release info: {str(e)}") raise @@ -66,7 +49,7 @@ def download_llamacpp_binary(): system = platform.system().lower() machine = platform.machine().lower() logger.info(f"Detected platform: {system} {machine}") - + if system == "windows": # Windows uses AVX2 by default asset_pattern = "win-avx2-x64" @@ -79,13 +62,14 @@ def download_llamacpp_binary(): # Find matching asset matching_assets = [ - asset for asset in latest_release["assets"] + asset + for asset in latest_release["assets"] if ( - asset["name"].lower().startswith("llama-") and - asset_pattern in asset["name"].lower() + asset["name"].lower().startswith("llama-") + and asset_pattern in asset["name"].lower() ) ] - + if not matching_assets: error_msg = ( f"No matching binary found for {system} {machine}. " @@ -93,21 +77,21 @@ def download_llamacpp_binary(): ) logger.error(error_msg) raise RuntimeError(error_msg) - + asset = matching_assets[0] logger.info(f"Found matching asset: {asset['name']}") - + # Create binaries directory binary_dir = os.path.join(cache_dir, "llama_cpp_binary") os.makedirs(binary_dir, exist_ok=True) logger.info(f"Created binary directory: {binary_dir}") - + # Download and extract zip_path = os.path.join(binary_dir, asset["name"]) try: response = requests.get(asset["browser_download_url"]) response.raise_for_status() - + with open(zip_path, "wb") as f: f.write(response.content) logger.info(f"Downloaded binary to: {zip_path}") @@ -118,7 +102,7 @@ def download_llamacpp_binary(): except Exception as e: logger.error(f"Failed to download or extract binary: {str(e)}") raise - + # Find the executable if system == "windows": executable = os.path.join(binary_dir, "llama-cli.exe") @@ -126,7 +110,7 @@ def download_llamacpp_binary(): executable = os.path.join(binary_dir, "llama-cli") # Make executable on Linux os.chmod(executable, 0o755) - + if not os.path.exists(executable): error_msg = ( f"Expected executable not found at {executable} after extraction. 
" @@ -134,7 +118,7 @@ def download_llamacpp_binary(): ) logger.error(error_msg) raise RuntimeError(error_msg) - + logger.info(f"Successfully prepared executable at: {executable}") return executable @@ -150,17 +134,19 @@ def setUpClass(cls): error_msg = f"Failed to download llama.cpp binary: {str(e)}" logger.error(error_msg) raise unittest.SkipTest(error_msg) - + # Use a small GGUF model for testing cls.model_name = "Qwen/Qwen2.5-0.5B-Instruct-GGUF" cls.model_file = "qwen2.5-0.5b-instruct-fp16.gguf" logger.info(f"Using test model: {cls.model_name}/{cls.model_file}") - + # Download the model file try: - model_url = f"https://huggingface.co/{cls.model_name}/resolve/main/{cls.model_file}" + model_url = ( + f"https://huggingface.co/{cls.model_name}/resolve/main/{cls.model_file}" + ) cls.model_path = os.path.join(cache_dir, cls.model_file) - + if not os.path.exists(cls.model_path): logger.info(f"Downloading model from: {model_url}") response = requests.get(model_url) @@ -188,44 +174,40 @@ def test_001_load_model(self): executable=self.executable, model_binary=self.model_path, context_size=512, - threads=1 + threads=1, ) - + self.assertIsNotNone(state.model) def test_002_generate_text(self): """Test text generation with llama.cpp""" state = LoadLlamaCpp().run( - self.state, - executable=self.executable, - model_binary=self.model_path + self.state, executable=self.executable, model_binary=self.model_path ) - + prompt = "What is the capital of France?" state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=20) - + self.assertIsNotNone(state.response) - self.assertGreater(len(state.response), len(prompt)) + self.assertGreater(len(state.response), 0, state.response) def test_003_benchmark(self): """Test benchmarking with llama.cpp""" state = LoadLlamaCpp().run( - self.state, - executable=self.executable, - model_binary=self.model_path + self.state, executable=self.executable, model_binary=self.model_path ) - + # Use longer output tokens to ensure we get valid performance metrics state = LlamaCppBench().run( state, iterations=2, warmup_iterations=1, output_tokens=128, - prompt="Hello, I am a test prompt that is long enough to get meaningful metrics." 
+ prompt="Hello, I am a test prompt that is long enough to get meaningful metrics.", ) - + stats = fs.Stats(state.cache_dir, state.build_name).stats - + # Check if we got valid metrics self.assertIn(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, stats) self.assertIn(Keys.SECONDS_TO_FIRST_TOKEN, stats) @@ -251,9 +233,9 @@ def test_001_prompt(self): state = HuggingfaceLoad().run(state, input=checkpoint) state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=15) - assert len(state.response) > len(prompt), state.response + stats = fs.Stats(state.cache_dir, state.build_name).stats + assert len(stats["response"]) > 0, stats["response"] - @unittest.skipIf(eecs_berkeley_edu_cannot_be_reached, "eecs.berkeley.edu cannot be reached for dataset download") def test_002_accuracy_mmlu(self): # Test MMLU benchmarking with known model checkpoint = "facebook/opt-125m" @@ -286,16 +268,18 @@ def test_003_accuracy_humaneval(self): state = AccuracyHumaneval().run( state, first_n_samples=1, # Test only one problem for speed - k_samples=1, # Single attempt per problem - timeout=30.0 + k_samples=1, # Single attempt per problem + timeout=30.0, ) # Verify results stats = fs.Stats(state.cache_dir, state.build_name).stats assert "humaneval_pass@1" in stats, "HumanEval pass@1 metric not found" - assert isinstance(stats["humaneval_pass@1"], (int, float)), "HumanEval pass@1 metric should be numeric" + assert isinstance( + stats["humaneval_pass@1"], (int, float) + ), "HumanEval pass@1 metric should be numeric" - def test_001_huggingface_bench(self): + def test_004_huggingface_bench(self): # Benchmark OPT checkpoint = "facebook/opt-125m" @@ -311,15 +295,99 @@ def test_001_huggingface_bench(self): assert stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND] > 0 + def test_005_prompt_from_file(self): + """ + Test the LLM Prompt tool capability to load prompt from a file + """ + + checkpoint = "facebook/opt-125m" + prompt_str = "Who is Humpty Dumpty?" + + prompt_path = os.path.join(corpus_dir, "prompt.txt") + with open(prompt_path, "w", encoding="utf-8") as f: + f.write(prompt_str) + + llm_prompt_args = ["-p", prompt_path, "--max-new-tokens", "15"] + + state = State( + cache_dir=cache_dir, + build_name="test", + ) + + state = HuggingfaceLoad().run(state, input=checkpoint) + llm_prompt_kwargs = LLMPrompt().parse(state, llm_prompt_args).__dict__ + state = LLMPrompt().run(state, **llm_prompt_kwargs) + + stats = fs.Stats(state.cache_dir, state.build_name).stats + + assert len(stats["response"]) > 0, stats["response"] + assert stats["prompt"] == prompt_str, f"{stats['prompt']} {prompt_str}" + + def test_006_multiple_prompt_responses(self): + """ + Test the LLM Prompt tool capability to run multiple inferences on the same prompt + """ + + checkpoint = "facebook/opt-125m" + prompt_str = "Who is Humpty Dumpty?" 
+ n_trials = 2 + + state = State( + cache_dir=cache_dir, + build_name="test", + ) + + state = HuggingfaceLoad().run(state, input=checkpoint) + state = LLMPrompt().run( + state, prompt=prompt_str, max_new_tokens=15, n_trials=n_trials + ) + + stats = fs.Stats(state.cache_dir, state.build_name).stats + + # Check that two responses were generated + assert ( + isinstance(stats["response"], list) and len(stats["response"]) == n_trials + ), stats["response"] + assert ( + isinstance(stats["response_tokens"], list) + and len(stats["response_tokens"]) == n_trials + ), stats["response_tokens"] + # Check that histogram figure was generated + assert os.path.exists( + os.path.join(state.cache_dir, state.build_name, "response_lengths.png") + ) + if __name__ == "__main__": - cache_dir, _ = common.create_test_dir("lemonade_api") - + # Get cache directory from environment or create a new one + cache_dir = os.getenv("LEMONADE_CACHE_DIR") + if not cache_dir: + cache_dir, corpus_dir = common.create_test_dir("lemonade_api") + os.environ["LEMONADE_CACHE_DIR"] = cache_dir + + logger.info(f"Using cache directory: {cache_dir}") + + # Download mmlu + try: + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + resp = urllib3.request("GET", url, preload_content=False) + if 200 <= resp.status < 400: + eecs_berkeley_edu_cannot_be_reached = False + else: + eecs_berkeley_edu_cannot_be_reached = True + resp.release_conn() + except urllib3.exceptions.HTTPError: + eecs_berkeley_edu_cannot_be_reached = True + # Create test suite with all test classes suite = unittest.TestSuite() suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Testing)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestLlamaCpp)) - + # Run the test suite runner = unittest.TextTestRunner() - runner.run(suite) + result = runner.run(suite) + + # Set exit code based on test results + if not result.wasSuccessful(): + sys.exit(1) diff --git a/test/lemonade/oga_cpu_api.py b/test/lemonade/oga_cpu_api.py index def7b9c..0cbc6ad 100644 --- a/test/lemonade/oga_cpu_api.py +++ b/test/lemonade/oga_cpu_api.py @@ -18,17 +18,6 @@ force = False prompt = "Alice and Bob" -try: - url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" - resp = urllib3.request("GET", url, preload_content=False) - if 200 <= resp.status < 400: - eecs_berkeley_edu_cannot_be_reached = False - else: - eecs_berkeley_edu_cannot_be_reached = True - resp.release_conn() -except urllib3.exceptions.HTTPError: - eecs_berkeley_edu_cannot_be_reached = True - class Testing(unittest.TestCase): @@ -45,12 +34,8 @@ def test_001_ogaload(self): ) state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=5) - assert len(state.response) > len(prompt), state.response + assert len(state.response) > 0, state.response - @unittest.skipIf( - eecs_berkeley_edu_cannot_be_reached, - "eecs.berkeley.edu cannot be reached for dataset download", - ) def test_002_accuracy_mmlu(self): # Test MMLU benchmarking with known model subject = ["management"] @@ -97,4 +82,17 @@ def test_003_accuracy_humaneval(self): cache_dir, _ = common.create_test_dir( "lemonade_oga_cpu_api", base_dir=os.path.abspath(".") ) + + # Get MMLU data + try: + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + resp = urllib3.request("GET", url, preload_content=False) + if 200 <= resp.status < 400: + eecs_berkeley_edu_cannot_be_reached = False + else: + eecs_berkeley_edu_cannot_be_reached = True + resp.release_conn() + except urllib3.exceptions.HTTPError: + eecs_berkeley_edu_cannot_be_reached = True + unittest.main() 
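For context on the __main__ hunks above: both test entry points now run the same reachability probe, a GET against https://people.eecs.berkeley.edu/~hendrycks/data.tar, to decide whether the MMLU-dependent tests can fetch their dataset. As a rough sketch of that pattern only (the helper name below is hypothetical and not part of this patch), the probe boils down to:

import urllib3

MMLU_URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"

def dataset_host_reachable(url: str = MMLU_URL) -> bool:
    # Hypothetical helper, mirroring the probe in the hunks above:
    # True when a GET on the MMLU data host returns a 2xx/3xx status.
    try:
        resp = urllib3.request("GET", url, preload_content=False)
        ok = 200 <= resp.status < 400
        resp.release_conn()
        return ok
    except urllib3.exceptions.HTTPError:
        return False

eecs_berkeley_edu_cannot_be_reached = not dataset_host_reachable()

Running the probe inside __main__ rather than at import time keeps the network check out of module import, which appears to be the intent of moving it in both files.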
From 9d7576415f748d18f134a3d8dc3259a04ff875a7 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 28 Jan 2025 12:31:20 -0500 Subject: [PATCH 4/4] bug fixes, rev version number --- .github/workflows/test_lemonade.yml | 1 + examples/lemonade/demos/chat/chat_hybrid.py | 2 +- examples/lemonade/demos/chat/chat_start.py | 5 +++-- examples/lemonade/demos/search/search_start.py | 5 ++--- src/lemonade/tools/ort_genai/oga.py | 6 ++++-- src/lemonade/tools/ort_genai/oga_bench.py | 6 +++++- src/turnkeyml/version.py | 2 +- 7 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index 6073d04..b7bdbbe 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -45,6 +45,7 @@ jobs: shell: bash -el {0} run: | pylint src/lemonade --rcfile .pylintrc --disable E0401 + pylint examples --rcfile .pylintrc --disable E0401,E0611 --jobs=1 - name: Test HF+CPU server if: runner.os == 'Windows' timeout-minutes: 10 diff --git a/examples/lemonade/demos/chat/chat_hybrid.py b/examples/lemonade/demos/chat/chat_hybrid.py index 8b770ff..d4e3c8f 100644 --- a/examples/lemonade/demos/chat/chat_hybrid.py +++ b/examples/lemonade/demos/chat/chat_hybrid.py @@ -1,6 +1,6 @@ import sys from threading import Thread, Event -from transformers import StoppingCriteria, StoppingCriteriaList +from transformers import StoppingCriteriaList from lemonade.tools.chat import StopOnEvent from lemonade import leap from lemonade.tools.ort_genai.oga import OrtGenaiStreamer diff --git a/examples/lemonade/demos/chat/chat_start.py b/examples/lemonade/demos/chat/chat_start.py index 22724f1..a094c83 100644 --- a/examples/lemonade/demos/chat/chat_start.py +++ b/examples/lemonade/demos/chat/chat_start.py @@ -1,9 +1,9 @@ import sys from threading import Thread, Event -from transformers import StoppingCriteriaList -from lemonade.tools.chat import StopOnEvent from queue import Queue from time import sleep +from transformers import StoppingCriteriaList +from lemonade.tools.chat import StopOnEvent class TextStreamer: @@ -43,6 +43,7 @@ def generate_placeholder( Not needed once we integrate with LEAP. """ + # pylint: disable=line-too-long response = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." for word in response.split(" "): diff --git a/examples/lemonade/demos/search/search_start.py b/examples/lemonade/demos/search/search_start.py index 705cea7..8249e29 100644 --- a/examples/lemonade/demos/search/search_start.py +++ b/examples/lemonade/demos/search/search_start.py @@ -1,11 +1,10 @@ import sys from threading import Thread, Event +from queue import Queue +from time import sleep from transformers import StoppingCriteriaList from lemonade.tools.chat import StopOnEvent -# These imports are not needed when we add the LLM -from queue import Queue -from time import sleep employee_handbook = """ 1. 
You will work very hard every day.\n diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py index 818a340..e859788 100644 --- a/src/lemonade/tools/ort_genai/oga.py +++ b/src/lemonade/tools/ort_genai/oga.py @@ -113,6 +113,7 @@ def generate( self, input_ids, max_new_tokens=512, + min_new_tokens=0, do_sample=True, top_k=50, top_p=1.0, @@ -135,6 +136,7 @@ def generate( params.pad_token_id = pad_token_id max_length = len(input_ids) + max_new_tokens + min_length = len(input_ids) + min_new_tokens if use_oga_pre_6_api: params.input_ids = input_ids @@ -147,7 +149,7 @@ def generate( top_p=search_config.get("top_p", top_p), temperature=search_config.get("temperature", temperature), max_length=max_length, - min_length=0, + min_length=min_length, early_stopping=search_config.get("early_stopping", False), length_penalty=search_config.get("length_penalty", 1.0), num_beams=search_config.get("num_beams", 1), @@ -167,7 +169,7 @@ def generate( top_p=top_p, temperature=temperature, max_length=max_length, - min_length=max_length, + min_length=min_length, ) params.try_graph_capture_with_max_batch_size(1) diff --git a/src/lemonade/tools/ort_genai/oga_bench.py b/src/lemonade/tools/ort_genai/oga_bench.py index ba9d8a1..93ae746 100644 --- a/src/lemonade/tools/ort_genai/oga_bench.py +++ b/src/lemonade/tools/ort_genai/oga_bench.py @@ -161,7 +161,11 @@ def run( model.generate(input_ids, max_new_tokens=output_tokens) for _ in tqdm.tqdm(range(iterations), desc="iterations"): - outputs = model.generate(input_ids, max_new_tokens=output_tokens) + outputs = model.generate( + input_ids, + max_new_tokens=output_tokens, + min_new_tokens=output_tokens, + ) token_len = len(outputs[0]) - input_ids_len diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index 3a223dd..4682e61 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "5.0.2" +__version__ = "5.0.3"
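For context on the oga_bench.py hunk above: passing min_new_tokens=output_tokens together with max_new_tokens pins every benchmark iteration to the same completion length, so the tokens-per-second figure reflects a fixed amount of generation work rather than wherever the model happens to stop. The oga.py hunk plumbs this through by setting min_length = len(input_ids) + min_new_tokens for both the pre-6 and current OGA search APIs. A minimal timing sketch under that assumption follows; model and input_ids are placeholders and the function is illustrative, not the benchmark tool's implementation:

import time

def time_fixed_length_generation(model, input_ids, output_tokens, iterations=2):
    # Illustrative only: average tokens/second over fixed-length generations.
    rates = []
    for _ in range(iterations):
        start = time.perf_counter()
        # min_new_tokens == max_new_tokens forces exactly `output_tokens` new tokens
        outputs = model.generate(
            input_ids,
            max_new_tokens=output_tokens,
            min_new_tokens=output_tokens,
        )
        elapsed = time.perf_counter() - start
        new_tokens = len(outputs[0]) - len(input_ids)
        rates.append(new_tokens / elapsed)
    return sum(rates) / len(rates)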