From 0a582356ecc34da4d7fa7ef4ee34d8508b117b71 Mon Sep 17 00:00:00 2001 From: Ramakrishnan Sivakumar Date: Wed, 15 Jan 2025 12:42:52 -0800 Subject: [PATCH] Update version to 5.0.1 (#265) * update version * update workflow * update docs * Update notebook naming * put the notebook in the right folder * Fix llamacpp. Polish naming. --------- Co-authored-by: Jeremy Fowers --- .github/workflows/test_lemonade.yml | 2 +- README.md | 2 +- docs/lemonade_getting_started.md | 14 +-- docs/llamacpp.md | 1 + docs/tools_user_guide.md | 14 ++- ...y_llm.ipynb => getting_started_llms.ipynb} | 6 +- setup.py | 2 +- src/lemonade/cli.py | 2 +- src/lemonade/tools/llamacpp.py | 50 +++++--- src/lemonade/tools/llamacpp_bench.py | 113 ++++++------------ src/lemonade/tools/ort_genai/oga.py | 3 +- src/lemonade/tools/ort_genai/oga_bench.py | 24 ++++ src/turnkeyml/tools/tool.py | 7 +- src/turnkeyml/version.py | 2 +- 14 files changed, 128 insertions(+), 114 deletions(-) rename examples/llm/{turnkey_llm.ipynb => getting_started_llms.ipynb} (94%) diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index 4f18fb66..49de72cc 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -44,7 +44,7 @@ jobs: - name: Lint with PyLint shell: bash -el {0} run: | - pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401 + pylint src/lemonade --rcfile .pylintrc --disable E0401 - name: Test HF+CPU server if: runner.os == 'Windows' timeout-minutes: 10 diff --git a/README.md b/README.md index 9b6c5eea..2a6156e8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Welcome to ONNX TurnkeyML [![Turnkey tests](https://github.com/onnx/turnkeyml/actions/workflows/test_turnkey.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") -[![Turnkey-LLM tests](https://github.com/onnx/turnkeyml/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") +[![Lemonade tests](https://github.com/onnx/turnkeyml/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/onnx/turnkeyml/tree/main/test "Check out our tests") [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](https://github.com/onnx/turnkeyml/blob/main/docs/install.md "Check out our instructions") [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](https://github.com/onnx/turnkeyml/blob/main/docs/install.md "Check out our instructions") diff --git a/docs/lemonade_getting_started.md b/docs/lemonade_getting_started.md index 4501d6c6..924118b0 100644 --- a/docs/lemonade_getting_started.md +++ b/docs/lemonade_getting_started.md @@ -1,6 +1,6 @@ -# Turnkey-LLM +# Lemonade -Welcome to the project page for `turnkey-llm` (aka, "lemonade" the turnkey LLM Aide)! +Welcome to the project page for `lemonade` the Turnkey LLM Aide! Contents: 1. [Getting Started](#getting-started) @@ -12,7 +12,7 @@ Contents: # Getting Started -`turnkey-llm` introduces a brand new set of LLM-focused tools. +`lemonade` introduces a brand new set of LLM-focused tools. ## Install @@ -20,8 +20,8 @@ Contents: 1. `cd turnkeyml` (where `turnkeyml` is the repo root of your TurnkeyML clone) - Note: be sure to run these installation instructions from the repo root. 1. Create and activate a conda environment: - 1. `conda create -n tk-llm python=3.10` - 1. `conda activate tk-llm` + 1. `conda create -n lemon python=3.10` + 1. `conda activate lemon` 1. 
Install lemonade: `pip install -e .[llm]` - or `pip install -e .[llm-oga-dml]` if you want to use `onnxruntime-genai` (see [OGA](#install-onnxruntime-genai)) 1. `lemonade -h` to explore the LLM tools @@ -137,6 +137,6 @@ The best way to contribute is to add new tools to cover more devices and usage s To add a new tool: -1. (Optional) Create a new `.py` file under `src/turnkeyml/llm/tools` (or use an existing file if your tool fits into a pre-existing family of tools). +1. (Optional) Create a new `.py` file under `src/lemonade/tools` (or use an existing file if your tool fits into a pre-existing family of tools). 1. Define a new class that inherits the `Tool` class from `TurnkeyML`. -1. Register the class by adding it to the list of `tools` near the top of `src/turnkeyml/llm/cli.py`. +1. Register the class by adding it to the list of `tools` near the top of `src/lemonade/cli.py`. diff --git a/docs/llamacpp.md b/docs/llamacpp.md index ed947456..f104a7da 100644 --- a/docs/llamacpp.md +++ b/docs/llamacpp.md @@ -124,3 +124,4 @@ The integration provides: - Proper error handling with detailed messages - Performance metrics collection - Configurable generation parameters (temperature, top_p, top_k) +- 10-minute timeout for model generation to prevent indefinite hangs diff --git a/docs/tools_user_guide.md b/docs/tools_user_guide.md index 963bd839..d59333a3 100644 --- a/docs/tools_user_guide.md +++ b/docs/tools_user_guide.md @@ -245,4 +245,16 @@ For example: ``` export TURNKEY_BUILD_MONITOR="False" -``` \ No newline at end of file +``` + +### Adjust Build Monitor Update Frequency + +The build status monitor updates its display periodically to show progress. By default, it updates every 0.5 seconds, but you can adjust the update frequency by setting the `TURNKEY_BUILD_MONITOR_FREQUENCY` environment variable to the desired number of seconds between updates. + +For example: + +``` +export TURNKEY_BUILD_MONITOR_FREQUENCY="10.0" +``` + +This can be useful in long runs where frequent terminal updates might cause excessive terminal output. diff --git a/examples/llm/turnkey_llm.ipynb b/examples/llm/getting_started_llms.ipynb similarity index 94% rename from examples/llm/turnkey_llm.ipynb rename to examples/llm/getting_started_llms.ipynb index ffe6d90c..837661c5 100644 --- a/examples/llm/turnkey_llm.ipynb +++ b/examples/llm/getting_started_llms.ipynb @@ -6,7 +6,7 @@ "source": [ "# LLMs on RyzenAI with TurnkeyML\n", "\n", - "This notebook will demonstrate how to bring up an example application that uses a RyzenAI to perform LLM inference. We will use the `turnkeyml.llm` APIs in order to make this as quick as possible. This notebook makes use of both the `RyzenAI NPU`, as well as the `RyzenAI Radeon integrated GPU (iGPU)`." + "This notebook will demonstrate how to bring up an example application that uses a RyzenAI to perform LLM inference. We will use the `lemonade` APIs in order to make this as quick as possible. This notebook makes use of both the `RyzenAI NPU`, as well as the `RyzenAI Radeon integrated GPU (iGPU)`." 
] }, { @@ -84,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Import the turnkey APIs\n", + "# Import the lemonade APIs\n", "from lemonade import leap\n", "\n", "# Load the model on to RyzenAI NPU\n", @@ -121,7 +121,7 @@ "\n", "### Prequisites for iGPU\n", "\n", - "- `turnkeyml[llm-oga-dml]` is installed into an activated conda environment.\n", + "- `turnkeyml[oga-dml]` is installed into an activated conda environment.\n", "- Download a copy of `Phi-3-mini`\n", "- See https://github.com/onnx/turnkeyml/tree/main/src/turnkeyml/llm/README.md#install-onnxruntime-genai for details" ] diff --git a/setup.py b/setup.py index 199bc3bf..ac7ea403 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "datasets", # Install human-eval from a forked repo with Windows support until the # PR (https://github.com/openai/human-eval/pull/53) is merged - "human-eval @ git+https://github.com/ramkrishna2910/human-eval.git", + "human-eval-windows==1.0.4", "fastapi", "uvicorn[standard]", ], diff --git a/src/lemonade/cli.py b/src/lemonade/cli.py index 66768be1..a58377aa 100644 --- a/src/lemonade/cli.py +++ b/src/lemonade/cli.py @@ -101,7 +101,7 @@ def main(): parser.error( "The first tool in the sequence needs to be one " "of the 'tools that can start a sequence.' Use " - "`turnkey-llm -h` to see that list of tools." + "`lemonade -h` to see that list of tools." ) # Run the evaluation tools as a build sequence = Sequence(tools=tool_instances) diff --git a/src/lemonade/tools/llamacpp.py b/src/lemonade/tools/llamacpp.py index 2eb8dfc3..464e0522 100644 --- a/src/lemonade/tools/llamacpp.py +++ b/src/lemonade/tools/llamacpp.py @@ -26,6 +26,7 @@ def generate( temperature: float = 0.8, top_p: float = 0.95, top_k: int = 40, + return_raw: bool = False, **kwargs, # pylint: disable=unused-argument ): """ @@ -40,10 +41,12 @@ def generate( temperature: Temperature for sampling (0.0 = greedy) top_p: Top-p sampling threshold top_k: Top-k sampling threshold + return_raw: If True, returns the complete raw output including timing info **kwargs: Additional arguments (ignored) Returns: - List containing a single string with the generated text + List containing a single string with the generated text, or raw output if + return_raw=True """ prompt = input_ids @@ -68,6 +71,7 @@ def generate( "--top-k", str(top_k), "-e", + "-no-cnv", ] cmd = [str(m) for m in cmd] @@ -82,7 +86,7 @@ def generate( errors="replace", ) - raw_output, stderr = process.communicate() + raw_output, stderr = process.communicate(timeout=600) if process.returncode != 0: error_msg = f"llama.cpp failed with return code {process.returncode}.\n" error_msg += f"Command: {' '.join(cmd)}\n" @@ -107,28 +111,36 @@ def generate( time_to_first_token_ms = float(parts.split("ms")[0].strip()) self.time_to_first_token = time_to_first_token_ms / 1000 + if return_raw: + return [raw_output, stderr] + + # Find where the prompt ends and the generated text begins + prompt_found = False + output_text = "" + prompt_first_line = prompt.split("\n")[0] + for line in raw_output.splitlines(): + if prompt_first_line in line: + prompt_found = True + if prompt_found: + line = line.replace(" [end of text]", "") + output_text = output_text + line + + if not prompt_found: + raise Exception( + f"Could not find prompt '{prompt_first_line}' in llama.cpp output. 
" + "This usually means the model failed to process the prompt correctly.\n" + f"Raw output:\n{raw_output}\n" + f"Stderr:\n{stderr}" + ) + + # Return list containing the generated text + return [output_text] + except Exception as e: error_msg = f"Failed to run llama.cpp command: {str(e)}\n" error_msg += f"Command: {' '.join(cmd)}" raise Exception(error_msg) - # Find where the prompt ends and the generated text begins - prompt_found = False - output_text = "" - prompt_first_line = prompt.split("\n")[0] - for line in raw_output.splitlines(): - if prompt_first_line in line: - prompt_found = True - if prompt_found: - line = line.replace(" [end of text]", "") - output_text = output_text + line - - if not prompt_found: - raise Exception("Prompt not found in result, this is a bug in lemonade.") - - # Return list containing the generated text - return [output_text] - class LoadLlamaCpp(FirstTool): unique_name = "load-llama-cpp" diff --git a/src/lemonade/tools/llamacpp_bench.py b/src/lemonade/tools/llamacpp_bench.py index d3a08a6d..bfe8a470 100644 --- a/src/lemonade/tools/llamacpp_bench.py +++ b/src/lemonade/tools/llamacpp_bench.py @@ -1,6 +1,5 @@ import argparse import os -import subprocess import statistics import tqdm from turnkeyml.state import State @@ -137,91 +136,51 @@ def run( for iteration in tqdm.tqdm( range(iterations), desc="iterations", disable=iterations < 2 ): - cmd = [ - state.model.executable, - "-m", - state.model.model, - "--ctx-size", - str(context_size), - "-n", - str(output_tokens), - "-t", - str(state.model.threads), - "-p", - prompt, - "-e", - ] - - cmd = [str(m) for m in cmd] - try: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - encoding="utf-8", - errors="replace", - ) - - raw_output, stderr = process.communicate() - if process.returncode != 0: + # Use the adapter's generate method which already has the timeout and error handling + raw_output, stderr = state.model.generate(prompt, return_raw=True) + + # Parse the timing information from the output + ms_per_token = None + time_to_first_token_ms = None + + # Look for timing in both stdout and stderr + for output in [raw_output, stderr]: + for line in output.splitlines(): + if "llama_perf_context_print: eval time =" in line: + parts = line.split("(")[1].strip() + parts = parts.split(",") + ms_per_token = float( + parts[0].split("ms per token")[0].strip() + ) + if "llama_perf_context_print: prompt eval time =" in line: + parts = line.split("=")[1].split("/")[0] + time_to_first_token_ms = float(parts.split("ms")[0].strip()) + + if ms_per_token is None or time_to_first_token_ms is None: error_msg = ( - f"llama.cpp failed with return code {process.returncode}.\n" + "Could not find timing information in llama.cpp output.\n" ) - error_msg += f"Command: {' '.join(cmd)}\n" - error_msg += f"Error output:\n{stderr}\n" - error_msg += f"Standard output:\n{raw_output}" + error_msg += "Raw output:\n" + raw_output + "\n" + error_msg += "Stderr:\n" + stderr raise Exception(error_msg) - if raw_output is None: - raise Exception("No output received from llama.cpp process") + # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0 + # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases + # as performance data for generating a few tokens is not relevant. 
+ tokens_per_second = 0 + if output_tokens > 5 and ms_per_token > 0: + tokens_per_second = 1000 / ms_per_token + time_to_first_token = time_to_first_token_ms / 1000 - except Exception as e: - error_msg = f"Failed to run llama.cpp command: {str(e)}\n" - error_msg += f"Command: {' '.join(cmd)}" - raise Exception(error_msg) + if iteration > warmup_iterations - 1: + iteration_tokens_per_second.append(tokens_per_second) + iteration_time_to_first_token.append(time_to_first_token) - ms_per_token = None - time_to_first_token_ms = None - for line in raw_output.splitlines(): - if "llama_perf_context_print: eval time =" in line: - parts = line.split("(")[1].strip() - parts = parts.split(",") - ms_per_token = float(parts[0].split("ms per token")[0].strip()) - if "llama_perf_context_print: prompt eval time =" in line: - parts = line.split("=")[1].split("/")[0] - time_to_first_token_ms = float(parts.split("ms")[0].strip()) - - if ms_per_token is None or time_to_first_token_ms is None: - # Look in stderr as well since some versions of llama.cpp output timing there - for line in stderr.splitlines(): - if "llama_perf_context_print: eval time =" in line: - parts = line.split("(")[1].strip() - parts = parts.split(",") - ms_per_token = float(parts[0].split("ms per token")[0].strip()) - if "llama_perf_context_print: prompt eval time =" in line: - parts = line.split("=")[1].split("/")[0] - time_to_first_token_ms = float(parts.split("ms")[0].strip()) - - if ms_per_token is None or time_to_first_token_ms is None: - error_msg = "Could not find timing information in llama.cpp output.\n" - error_msg += "Raw output:\n" + raw_output + "\n" - error_msg += "Error output:\n" + stderr + except Exception as e: + error_msg = f"Failed to run benchmark: {str(e)}" raise Exception(error_msg) - # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0 - # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases - # as performance data for generating a few tokens is not relevant. - tokens_per_second = 0 - if output_tokens > 5 and ms_per_token > 0: - tokens_per_second = 1000 / ms_per_token - time_to_first_token = time_to_first_token_ms / 1000 - - if iteration > warmup_iterations - 1: - iteration_tokens_per_second.append(tokens_per_second) - iteration_time_to_first_token.append(time_to_first_token) - token_generation_tokens_per_second = statistics.mean( iteration_tokens_per_second ) diff --git a/src/lemonade/tools/ort_genai/oga.py b/src/lemonade/tools/ort_genai/oga.py index 890e181e..8a1ce111 100644 --- a/src/lemonade/tools/ort_genai/oga.py +++ b/src/lemonade/tools/ort_genai/oga.py @@ -216,7 +216,8 @@ class OgaLoad(FirstTool): Input: path to a checkpoint. Supported choices for cpu and igpu from HF model repository: LLM models on Huggingface supported by model_builder. See documentation - (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported models. + (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported + models. 
Supported choices for npu from HF model repository: Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern Local models for cpu, igpu, or npu: diff --git a/src/lemonade/tools/ort_genai/oga_bench.py b/src/lemonade/tools/ort_genai/oga_bench.py index 0ae29756..ba9d8a1f 100644 --- a/src/lemonade/tools/ort_genai/oga_bench.py +++ b/src/lemonade/tools/ort_genai/oga_bench.py @@ -1,6 +1,7 @@ import argparse import os import statistics +from statistics import StatisticsError import tqdm from turnkeyml.state import State from turnkeyml.tools import Tool @@ -55,8 +56,10 @@ def __init__(self): self.status_stats = [ Keys.SECONDS_TO_FIRST_TOKEN, + Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN, Keys.PREFILL_TOKENS_PER_SECOND, Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, + Keys.STD_DEV_TOKENS_PER_SECOND, Keys.PROMPT_TOKENS, ] @@ -175,12 +178,33 @@ def run( token_generation_tokens_per_second = statistics.mean( per_iteration_tokens_per_second ) + try: + std_dev_time_to_first_token = statistics.stdev( + per_iteration_time_to_first_token + ) + except StatisticsError: + # Less than 2 measurements + std_dev_time_to_first_token = None + try: + std_dev_token_generation_tokens_per_second = statistics.stdev( + per_iteration_tokens_per_second + ) + except StatisticsError: + # Less than 2 measurements + std_dev_token_generation_tokens_per_second = None state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token) state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second) state.save_stat( Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second ) + state.save_stat( + Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN, std_dev_time_to_first_token + ) + state.save_stat( + Keys.STD_DEV_TOKENS_PER_SECOND, + std_dev_token_generation_tokens_per_second, + ) state.save_stat(Keys.PROMPT_TOKENS, input_ids_len) return state diff --git a/src/turnkeyml/tools/tool.py b/src/turnkeyml/tools/tool.py index 70b4f053..85f80f67 100644 --- a/src/turnkeyml/tools/tool.py +++ b/src/turnkeyml/tools/tool.py @@ -22,12 +22,17 @@ def _spinner(message, q: Queue): Queue to display the percent progress of the Tool. """ percent_complete = None + # Get sleep time from environment variable, default to 0.5s if not set + try: + sleep_time = float(os.getenv("TURNKEY_BUILD_MONITOR_FREQUENCY", "0.5")) + except ValueError: + sleep_time = 0.5 try: parent_process = psutil.Process(pid=os.getppid()) while parent_process.status() == psutil.STATUS_RUNNING: for cursor in [" ", ". ", ".. ", "..."]: - time.sleep(0.5) + time.sleep(sleep_time) if not q.empty(): percent_complete = q.get() if percent_complete is not None: diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index ba7be38e..2fe5fde1 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "5.0.0" +__version__ = "5.0.1"
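
Note on the new build-monitor setting documented in docs/tools_user_guide.md above: the sketch below is illustrative only and not part of the patch. It assumes nothing beyond the Python standard library, and the helper name `monitor_refresh_seconds` is invented here purely to mirror the environment-variable parsing that `_spinner()` in src/turnkeyml/tools/tool.py now performs.

```python
import os

def monitor_refresh_seconds(default: float = 0.5) -> float:
    """Illustrative helper mirroring the env-var parsing added to _spinner()."""
    # Unset or non-numeric values fall back to the 0.5 s default,
    # matching the ValueError guard introduced in the patch.
    try:
        return float(os.getenv("TURNKEY_BUILD_MONITOR_FREQUENCY", str(default)))
    except ValueError:
        return default

# Example: refresh the build status monitor every 10 seconds instead of every 0.5 s.
os.environ["TURNKEY_BUILD_MONITOR_FREQUENCY"] = "10.0"
assert monitor_refresh_seconds() == 10.0
```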
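
Similarly, a brief note on the new standard-deviation stats in src/lemonade/tools/ort_genai/oga_bench.py: the `StatisticsError` guard exists because `statistics.stdev()` requires at least two data points. A minimal sketch of that behaviour (again illustrative, not code taken from the patch):

```python
import statistics
from statistics import StatisticsError

measurements = [41.8]  # e.g. a run that leaves only one post-warmup iteration

# stdev() raises StatisticsError for fewer than two values, so the bench
# tool records None for the std-dev stats in that case instead of failing.
try:
    std_dev = statistics.stdev(measurements)
except StatisticsError:
    std_dev = None

assert std_dev is None
```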