From 0a582356ecc34da4d7fa7ef4ee34d8508b117b71 Mon Sep 17 00:00:00 2001 From: Ramakrishnan Sivakumar Date: Wed, 15 Jan 2025 12:42:52 -0800 Subject: [PATCH] Update version to 5.0.1 (#265) * update version * update workflow * update docs * Update notebook naming * put the notebook in the right folder * Fix llamacpp. Polish naming. --------- Co-authored-by: Jeremy Fowers --- .github/workflows/test_lemonade.yml | 2 +- README.md | 2 +- docs/lemonade_getting_started.md | 14 +-- docs/llamacpp.md | 1 + docs/tools_user_guide.md | 14 ++- ...y_llm.ipynb => getting_started_llms.ipynb} | 6 +- setup.py | 2 +- src/lemonade/cli.py | 2 +- src/lemonade/tools/llamacpp.py | 50 +++++--- src/lemonade/tools/llamacpp_bench.py | 113 ++++++------------ src/lemonade/tools/ort_genai/oga.py | 3 +- src/lemonade/tools/ort_genai/oga_bench.py | 24 ++++ src/turnkeyml/tools/tool.py | 7 +- src/turnkeyml/version.py | 2 +- 14 files changed, 128 insertions(+), 114 deletions(-) rename examples/llm/{turnkey_llm.ipynb => getting_started_llms.ipynb} (94%) diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index 4f18fb66..49de72cc 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -44,7 +44,7 @@ jobs: - name: Lint with PyLint shell: bash -el {0} run: | - pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401 + pylint src/lemonade --rcfile .pylintrc --disable E0401 - name: Test HF+CPU server if: runner.os == 'Windows' timeout-minutes: 10 diff --git a/README.md b/README.md index 9b6c5eea..2a6156e8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Welcome to ONNX TurnkeyML [![Turnkey tests](https://github.com/onnx/turnkeyml/actions/workflows/test_turnkey.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") -[![Turnkey-LLM tests](https://github.com/onnx/turnkeyml/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") +[![Lemonade tests](https://github.com/onnx/turnkeyml/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](https://github.com/onnx/turnkeyml/blob/main/docs/install.md "Check out our instructions") [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](https://github.com/onnx/turnkeyml/blob/main/docs/install.md "Check out our instructions") diff --git a/docs/lemonade_getting_started.md b/docs/lemonade_getting_started.md index 4501d6c6..924118b0 100644 --- a/docs/lemonade_getting_started.md +++ b/docs/lemonade_getting_started.md @@ -1,6 +1,6 @@ -# Turnkey-LLM +# Lemonade -Welcome to the project page for `turnkey-llm` (aka, "lemonade" the turnkey LLM Aide)! +Welcome to the project page for `lemonade` the Turnkey LLM Aide! Contents: 1. [Getting Started](#getting-started) @@ -12,7 +12,7 @@ Contents: # Getting Started -`turnkey-llm` introduces a brand new set of LLM-focused tools. +`lemonade` introduces a brand new set of LLM-focused tools. ## Install @@ -20,8 +20,8 @@ Contents: 1. `cd turnkeyml` (where `turnkeyml` is the repo root of your TurnkeyML clone) - Note: be sure to run these installation instructions from the repo root. 1. Create and activate a conda environment: - 1. `conda create -n tk-llm python=3.10` - 1. `conda activate tk-llm` + 1. `conda create -n lemon python=3.10` + 1. `conda activate lemon` 1. 
Install lemonade: `pip install -e .[llm]` - or `pip install -e .[llm-oga-dml]` if you want to use `onnxruntime-genai` (see [OGA](#install-onnxruntime-genai)) 1. `lemonade -h` to explore the LLM tools @@ -137,6 +137,6 @@ The best way to contribute is to add new tools to cover more devices and usage s To add a new tool: -1. (Optional) Create a new `.py` file under `src/turnkeyml/llm/tools` (or use an existing file if your tool fits into a pre-existing family of tools). +1. (Optional) Create a new `.py` file under `src/lemonade/tools` (or use an existing file if your tool fits into a pre-existing family of tools). 1. Define a new class that inherits the `Tool` class from `TurnkeyML`. -1. Register the class by adding it to the list of `tools` near the top of `src/turnkeyml/llm/cli.py`. +1. Register the class by adding it to the list of `tools` near the top of `src/lemonade/cli.py`. diff --git a/docs/llamacpp.md b/docs/llamacpp.md index ed947456..f104a7da 100644 --- a/docs/llamacpp.md +++ b/docs/llamacpp.md @@ -124,3 +124,4 @@ The integration provides: - Proper error handling with detailed messages - Performance metrics collection - Configurable generation parameters (temperature, top_p, top_k) +- 10-minute timeout for model generation to prevent indefinite hangs diff --git a/docs/tools_user_guide.md b/docs/tools_user_guide.md index 963bd839..d59333a3 100644 --- a/docs/tools_user_guide.md +++ b/docs/tools_user_guide.md @@ -245,4 +245,16 @@ For example: ``` export TURNKEY_BUILD_MONITOR="False" -``` \ No newline at end of file +``` + +### Adjust Build Monitor Update Frequency + +The build status monitor updates its display periodically to show progress. By default, it updates every 0.5 seconds, but you can adjust the update frequency by setting the `TURNKEY_BUILD_MONITOR_FREQUENCY` environment variable to the desired number of seconds between updates. + +For example: + +``` +export TURNKEY_BUILD_MONITOR_FREQUENCY="10.0" +``` + +This can be useful in long runs where frequent terminal updates might cause excessive terminal output. diff --git a/examples/llm/turnkey_llm.ipynb b/examples/llm/getting_started_llms.ipynb similarity index 94% rename from examples/llm/turnkey_llm.ipynb rename to examples/llm/getting_started_llms.ipynb index ffe6d90c..837661c5 100644 --- a/examples/llm/turnkey_llm.ipynb +++ b/examples/llm/getting_started_llms.ipynb @@ -6,7 +6,7 @@ "source": [ "# LLMs on RyzenAI with TurnkeyML\n", "\n", - "This notebook will demonstrate how to bring up an example application that uses a RyzenAI to perform LLM inference. We will use the `turnkeyml.llm` APIs in order to make this as quick as possible. This notebook makes use of both the `RyzenAI NPU`, as well as the `RyzenAI Radeon integrated GPU (iGPU)`." + "This notebook will demonstrate how to bring up an example application that uses a RyzenAI to perform LLM inference. We will use the `lemonade` APIs in order to make this as quick as possible. This notebook makes use of both the `RyzenAI NPU`, as well as the `RyzenAI Radeon integrated GPU (iGPU)`." 
] }, { @@ -84,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Import the turnkey APIs\n", + "# Import the lemonade APIs\n", "from lemonade import leap\n", "\n", "# Load the model on to RyzenAI NPU\n", @@ -121,7 +121,7 @@ "\n", "### Prequisites for iGPU\n", "\n", - "- `turnkeyml[llm-oga-dml]` is installed into an activated conda environment.\n", + "- `turnkeyml[oga-dml]` is installed into an activated conda environment.\n", "- Download a copy of `Phi-3-mini`\n", "- See https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm/README.md#install-onnxruntime-genai for details" ] diff --git a/setup.py b/setup.py index 199bc3bf..ac7ea403 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "datasets", # Install human-eval from a forked repo with Windows support until the # PR (https://github.com/openai/human-eval/pull/53) is merged - "human-eval @ git+https://github.com/ramkrishna2910/human-eval.git", + "human-eval-windows==1.0.4", "fastapi", "uvicorn[standard]", ], diff --git a/src/lemonade/cli.py b/src/lemonade/cli.py index 66768be1..a58377aa 100644 --- a/src/lemonade/cli.py +++ b/src/lemonade/cli.py @@ -101,7 +101,7 @@ def main(): parser.error( "The first tool in the sequence needs to be one " "of the 'tools that can start a sequence.' Use " - "`turnkey-llm -h` to see that list of tools." + "`lemonade -h` to see that list of tools." ) # Run the evaluation tools as a build sequence = Sequence(tools=tool_instances) diff --git a/src/lemonade/tools/llamacpp.py b/src/lemonade/tools/llamacpp.py index 2eb8dfc3..464e0522 100644 --- a/src/lemonade/tools/llamacpp.py +++ b/src/lemonade/tools/llamacpp.py @@ -26,6 +26,7 @@ def generate( temperature: float = 0.8, top_p: float = 0.95, top_k: int = 40, + return_raw: bool = False, **kwargs, # pylint: disable=unused-argument ): """ @@ -40,10 +41,12 @@ def generate( temperature: Temperature for sampling (0.0 = greedy) top_p: Top-p sampling threshold top_k: Top-k sampling threshold + return_raw: If True, returns the complete raw output including timing info **kwargs: Additional arguments (ignored) Returns: - List containing a single string with the generated text + List containing a single string with the generated text, or raw output if + return_raw=True """ prompt = input_ids @@ -68,6 +71,7 @@ def generate( "--top-k", str(top_k), "-e", + "-no-cnv", ] cmd = [str(m) for m in cmd] @@ -82,7 +86,7 @@ def generate( errors="replace", ) - raw_output, stderr = process.communicate() + raw_output, stderr = process.communicate(timeout=600) if process.returncode != 0: error_msg = f"llama.cpp failed with return code {process.returncode}.\n" error_msg += f"Command: {' '.join(cmd)}\n" @@ -107,28 +111,36 @@ def generate( time_to_first_token_ms = float(parts.split("ms")[0].strip()) self.time_to_first_token = time_to_first_token_ms / 1000 + if return_raw: + return [raw_output, stderr] + + # Find where the prompt ends and the generated text begins + prompt_found = False + output_text = "" + prompt_first_line = prompt.split("\n")[0] + for line in raw_output.splitlines(): + if prompt_first_line in line: + prompt_found = True + if prompt_found: + line = line.replace(" [end of text]", "") + output_text = output_text + line + + if not prompt_found: + raise Exception( + f"Could not find prompt '{prompt_first_line}' in llama.cpp output. 
" + "This usually means the model failed to process the prompt correctly.\n" + f"Raw output:\n{raw_output}\n" + f"Stderr:\n{stderr}" + ) + + # Return list containing the generated text + return [output_text] + except Exception as e: error_msg = f"Failed to run llama.cpp command: {str(e)}\n" error_msg += f"Command: {' '.join(cmd)}" raise Exception(error_msg) - # Find where the prompt ends and the generated text begins - prompt_found = False - output_text = "" - prompt_first_line = prompt.split("\n")[0] - for line in raw_output.splitlines(): - if prompt_first_line in line: - prompt_found = True - if prompt_found: - line = line.replace(" [end of text]", "") - output_text = output_text + line - - if not prompt_found: - raise Exception("Prompt not found in result, this is a bug in lemonade.") - - # Return list containing the generated text - return [output_text] - class LoadLlamaCpp(FirstTool): unique_name = "load-llama-cpp" diff --git a/src/lemonade/tools/llamacpp_bench.py b/src/lemonade/tools/llamacpp_bench.py index d3a08a6d..bfe8a470 100644 --- a/src/lemonade/tools/llamacpp_bench.py +++ b/src/lemonade/tools/llamacpp_bench.py @@ -1,6 +1,5 @@ import argparse import os -import subprocess import statistics import tqdm from turnkeyml.state import State @@ -137,91 +136,51 @@ def run( for iteration in tqdm.tqdm( range(iterations), desc="iterations", disable=iterations < 2 ): - cmd = [ - state.model.executable, - "-m", - state.model.model, - "--ctx-size", - str(context_size), - "-n", - str(output_tokens), - "-t", - str(state.model.threads), - "-p", - prompt, - "-e", - ] - - cmd = [str(m) for m in cmd] - try: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - encoding="utf-8", - errors="replace", - ) - - raw_output, stderr = process.communicate() - if process.returncode != 0: + # Use the adapter's generate method which already has the timeout and error handling + raw_output, stderr = state.model.generate(prompt, return_raw=True) + + # Parse the timing information from the output + ms_per_token = None + time_to_first_token_ms = None + + # Look for timing in both stdout and stderr + for output in [raw_output, stderr]: + for line in output.splitlines(): + if "llama_perf_context_print: eval time =" in line: + parts = line.split("(")[1].strip() + parts = parts.split(",") + ms_per_token = float( + parts[0].split("ms per token")[0].strip() + ) + if "llama_perf_context_print: prompt eval time =" in line: + parts = line.split("=")[1].split("/")[0] + time_to_first_token_ms = float(parts.split("ms")[0].strip()) + + if ms_per_token is None or time_to_first_token_ms is None: error_msg = ( - f"llama.cpp failed with return code {process.returncode}.\n" + "Could not find timing information in llama.cpp output.\n" ) - error_msg += f"Command: {' '.join(cmd)}\n" - error_msg += f"Error output:\n{stderr}\n" - error_msg += f"Standard output:\n{raw_output}" + error_msg += "Raw output:\n" + raw_output + "\n" + error_msg += "Stderr:\n" + stderr raise Exception(error_msg) - if raw_output is None: - raise Exception("No output received from llama.cpp process") + # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0 + # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases + # as performance data for generating a few tokens is not relevant. 
+ tokens_per_second = 0 + if output_tokens > 5 and ms_per_token > 0: + tokens_per_second = 1000 / ms_per_token + time_to_first_token = time_to_first_token_ms / 1000 - except Exception as e: - error_msg = f"Failed to run llama.cpp command: {str(e)}\n" - error_msg += f"Command: {' '.join(cmd)}" - raise Exception(error_msg) + if iteration > warmup_iterations - 1: + iteration_tokens_per_second.append(tokens_per_second) + iteration_time_to_first_token.append(time_to_first_token) - ms_per_token = None - time_to_first_token_ms = None - for line in raw_output.splitlines(): - if "llama_perf_context_print: eval time =" in line: - parts = line.split("(")[1].strip() - parts = parts.split(",") - ms_per_token = float(parts[0].split("ms per token")[0].strip()) - if "llama_perf_context_print: prompt eval time =" in line: - parts = line.split("=")[1].split("/")[0] - time_to_first_token_ms = float(parts.split("ms")[0].strip()) - - if ms_per_token is None or time_to_first_token_ms is None: - # Look in stderr as well since some versions of llama.cpp output timing there - for line in stderr.splitlines(): - if "llama_perf_context_print: eval time =" in line: - parts = line.split("(")[1].strip() - parts = parts.split(",") - ms_per_token = float(parts[0].split("ms per token")[0].strip()) - if "llama_perf_context_print: prompt eval time =" in line: - parts = line.split("=")[1].split("/")[0] - time_to_first_token_ms = float(parts.split("ms")[0].strip()) - - if ms_per_token is None or time_to_first_token_ms is None: - error_msg = "Could not find timing information in llama.cpp output.\n" - error_msg += "Raw output:\n" + raw_output + "\n" - error_msg += "Error output:\n" + stderr + except Exception as e: + error_msg = f"Failed to run benchmark: {str(e)}" raise Exception(error_msg) - # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0 - # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases - # as performance data for generating a few tokens is not relevant. - tokens_per_second = 0 - if output_tokens > 5 and ms_per_token > 0: - tokens_per_second = 1000 / ms_per_token - time_to_first_token = time_to_first_token_ms / 1000 - - if iteration > warmup_iterations - 1: - iteration_tokens_per_second.append(tokens_per_second) - iteration_time_to_first_token.append(time_to_first_token) - token_generation_tokens_per_second = statistics.mean( iteration_tokens_per_second ) diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py index 890e181e..8a1ce111 100644 --- a/src/lemonade/tools/ort_genai/oga.py +++ b/src/lemonade/tools/ort_genai/oga.py @@ -216,7 +216,8 @@ class OgaLoad(FirstTool): Input: path to a checkpoint. Supported choices for cpu and igpu from HF model repository: LLM models on Huggingface supported by model_builder. See documentation - (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported models. + (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported + models. 
Supported choices for npu from HF model repository: Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern Local models for cpu, igpu, or npu: diff --git a/src/lemonade/tools/ort_genai/oga_bench.py b/src/lemonade/tools/ort_genai/oga_bench.py index 0ae29756..ba9d8a1f 100644 --- a/src/lemonade/tools/ort_genai/oga_bench.py +++ b/src/lemonade/tools/ort_genai/oga_bench.py @@ -1,6 +1,7 @@ import argparse import os import statistics +from statistics import StatisticsError import tqdm from turnkeyml.state import State from turnkeyml.tools import Tool @@ -55,8 +56,10 @@ def __init__(self): self.status_stats = [ Keys.SECONDS_TO_FIRST_TOKEN, + Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN, Keys.PREFILL_TOKENS_PER_SECOND, Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, + Keys.STD_DEV_TOKENS_PER_SECOND, Keys.PROMPT_TOKENS, ] @@ -175,12 +178,33 @@ def run( token_generation_tokens_per_second = statistics.mean( per_iteration_tokens_per_second ) + try: + std_dev_time_to_first_token = statistics.stdev( + per_iteration_time_to_first_token + ) + except StatisticsError: + # Less than 2 measurements + std_dev_time_to_first_token = None + try: + std_dev_token_generation_tokens_per_second = statistics.stdev( + per_iteration_tokens_per_second + ) + except StatisticsError: + # Less than 2 measurements + std_dev_token_generation_tokens_per_second = None state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token) state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second) state.save_stat( Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second ) + state.save_stat( + Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN, std_dev_time_to_first_token + ) + state.save_stat( + Keys.STD_DEV_TOKENS_PER_SECOND, + std_dev_token_generation_tokens_per_second, + ) state.save_stat(Keys.PROMPT_TOKENS, input_ids_len) return state diff --git a/src/turnkeyml/tools/tool.py b/src/turnkeyml/tools/tool.py index 70b4f053..85f80f67 100644 --- a/src/turnkeyml/tools/tool.py +++ b/src/turnkeyml/tools/tool.py @@ -22,12 +22,17 @@ def _spinner(message, q: Queue): Queue to display the percent progress of the Tool. """ percent_complete = None + # Get sleep time from environment variable, default to 0.5s if not set + try: + sleep_time = float(os.getenv("TURNKEY_BUILD_MONITOR_FREQUENCY", "0.5")) + except ValueError: + sleep_time = 0.5 try: parent_process = psutil.Process(pid=os.getppid()) while parent_process.status() == psutil.STATUS_RUNNING: for cursor in [" ", ". ", ".. ", "..."]: - time.sleep(0.5) + time.sleep(sleep_time) if not q.empty(): percent_complete = q.get() if percent_complete is not None: diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index ba7be38e..2fe5fde1 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "5.0.0" +__version__ = "5.0.1"
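
Note on the new build-monitor setting documented in docs/tools_user_guide.md above: the sketch below is illustrative only and not part of the patch. It assumes nothing beyond the Python standard library, and the helper name `monitor_refresh_seconds` is invented here purely to mirror the environment-variable parsing that `_spinner()` in src/turnkeyml/tools/tool.py now performs.

```python
import os

def monitor_refresh_seconds(default: float = 0.5) -> float:
    """Illustrative helper mirroring the env-var parsing added to _spinner()."""
    # Unset or non-numeric values fall back to the 0.5 s default,
    # matching the ValueError guard introduced in the patch.
    try:
        return float(os.getenv("TURNKEY_BUILD_MONITOR_FREQUENCY", str(default)))
    except ValueError:
        return default

# Example: refresh the build status monitor every 10 seconds instead of every 0.5 s.
os.environ["TURNKEY_BUILD_MONITOR_FREQUENCY"] = "10.0"
assert monitor_refresh_seconds() == 10.0
```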
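
Similarly, a brief note on the new standard-deviation stats in src/lemonade/tools/ort_genai/oga_bench.py: the `StatisticsError` guard exists because `statistics.stdev()` requires at least two data points. A minimal sketch of that behaviour (again illustrative, not code taken from the patch):

```python
import statistics
from statistics import StatisticsError

measurements = [41.8]  # e.g. a run that leaves only one post-warmup iteration

# stdev() raises StatisticsError for fewer than two values, so the bench
# tool records None for the std-dev stats in that case instead of failing.
try:
    std_dev = statistics.stdev(measurements)
except StatisticsError:
    std_dev = None

assert std_dev is None
```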