From 599ccdc10b29223a17cb604b10bb3f96a2baeac1 Mon Sep 17 00:00:00 2001 From: Kat Petrova Date: Thu, 14 Nov 2024 10:33:09 +0100 Subject: [PATCH 1/2] prompt example for inference --- .idea/workspace.xml | 49 ++ .../cmd_gen_finetuning-checkpoint.ipynb | 502 ++++++++++++++++++ cmd_gen_finetuning.ipynb | 14 +- 3 files changed, 562 insertions(+), 3 deletions(-) create mode 100644 .idea/workspace.xml create mode 100644 .ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..058b2fc --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,49 @@ + + + + + + + + + + { + "associatedIndex": 8 +} + + + + { + "keyToString": { + "RunOnceActivity.ShowReadmeOnStart": "true", + "git-widget-placeholder": "hyperparams", + "last_opened_file_path": "/Users/katpetrova/rasa/notebooks" + } +} + + + + + + + + + + + 1728984400605 + + + + \ No newline at end of file diff --git a/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb b/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb new file mode 100644 index 0000000..90b6859 --- /dev/null +++ b/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb @@ -0,0 +1,502 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-tuning an LLM for Command Generation in CALM\n", + "\n", + "This is a worked example of how to efficiently fine-tune a base language model from [Hugging Face Hub](https://huggingface.co/models) using the [Unsloth](https://docs.unsloth.ai) and [TRL](https://huggingface.co/docs/trl/en/index) libraries for the task of command generation within [CALM](https://rasa.com/docs/rasa-pro/calm).\n", + "\n", + "Unsloth integrates with TRL in order to reduce the time and GPU memory required to fine-tune LLMs, when compared to using TRL exclusively.\n", + "\n", + "To run fine-tuning, you must have first [generated the dataset](https://rasa.com/rasa-pro/docs/operating/fine-tuning-recipe) files `train.jsonl` and `val.jsonl`, which must be in the [TRL instruction format](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Configure fine-tuning environment\n",
    "\n",
    "To run this notebook, you first need to install Unsloth on a machine with the following minimum hardware requirements:\n",
    "- Single NVIDIA A100 GPU with 40GB VRAM\n",
    "- 12-core CPU with 85GB RAM\n",
    "- 250GB disk\n",
    "\n",
    "Please refer to the Unsloth installation instructions in the [official documentation](https://github.com/unslothai/unsloth/blob/main/README.md).\n",
    "\n",
    "Here is an example of how to set up the environment:\n",
    "\n",
    "First, we provisioned a Linux instance with the appropriate hardware and the following software installed:\n",
    "- Python 3.10\n",
    "- CUDA Toolkit 12.1\n",
    "- PyTorch 2.2\n",
    "\n",
    "Next, we installed Unsloth and other packages as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sh\n",
    "# install unsloth and other dependencies\n",
    "pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
    "pip install --no-deps \"xformers<=0.0.26\" \"trl<0.9.0\" peft accelerate bitsandbytes huggingface_hub[cli]\n",
    "# remove tpu-only package that is installed by default on gcp runtimes, even when only using gpu\n",
    "pip uninstall torch-xla -y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The Unsloth installation is very sensitive to the environment, in particular the CUDA and PyTorch versions, so follow the [official installation instructions](https://github.com/unslothai/unsloth/blob/main/README.md) appropriate for your setup."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Download base model\n",
    "\n",
    "You can download the model you want to fine-tune from Hugging Face Hub using the [official CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli) with an [API access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token) as per the code below. Make sure you first update the `HUGGINGFACE_TOKEN` and `BASE_MODEL` environment variables with your own values.\n",
    "\n",
    "When testing this notebook, the [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) model was used. Note that `meta-llama/Meta-Llama-3.1-8B-Instruct` is a [gated model](https://huggingface.co/docs/hub/en/models-gated) that you must request access to before using it.\n",
    "\n",
    "You can use any other PyTorch model available on [Hugging Face Hub](https://huggingface.co/models). It is recommended that you use a model that has been pre-trained on instructional tasks, such as the [CodeLlama 13B Instruct](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) model.\n",
    "\n",
    "Pre-trained models with more parameters will generally perform better at tasks than models with fewer parameters. However, the size of the model you can use is limited by how much memory your GPU has.\n",
    "\n",
    "Alternatively, if you already have a PyTorch model directory to hand, you can upload it to your notebook environment manually."
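,
    "\n",
    "If you prefer to stay in Python rather than use the CLI, the next cell is a roughly equivalent sketch using `snapshot_download` from the `huggingface_hub` library. It is only a sketch: the token and model ID are placeholders that you must replace with your own values, exactly as in the `huggingface-cli` cell further below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional Python alternative to the CLI download below (a sketch).\n",
    "# The token and model ID are placeholders; replace them with your own values.\n",
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "snapshot_download(\n",
    "    repo_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",  # or any other base model\n",
    "    token=\"CHANGEME\",  # your Hugging Face access token\n",
    "    local_dir=\"./base_model\",  # same path used by the rest of the notebook\n",
    ")"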
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%sh\n", + "# TODO: update with your values\n", + "export HUGGINGFACE_TOKEN=\"CHANGEME\"\n", + "export BASE_MODEL=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "\n", + "# download model\n", + "huggingface-cli download \"${BASE_MODEL}\" \\\n", + " --token \"${HUGGINGFACE_TOKEN}\" \\\n", + " --local-dir \"./base_model\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Load and quantize base model\n", + "\n", + "The [quantization of model parameters](https://huggingface.co/docs/optimum/en/concept_guides/quantization) can significantly reduce the GPU memory required to run model fine-tuning and inference, at the cost of model accuracy.\n", + "\n", + "Here, the base model is loaded from disk and quantized into an 4-bit representation on the fly using the [BitsAndBytes](https://huggingface.co/docs/transformers/main/en/quantization/bitsandbytes) library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from transformers import BitsAndBytesConfig\n", + "from unsloth import FastLanguageModel\n", + "\n", + "max_seq_length = 2048\n", + "random_seed = 42\n", + "\n", + "\n", + "# configure quantization method for base model\n", + "quantization_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + ")\n", + "\n", + "# load quantized model and tokenizer from disk\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\n", + " model_name=\"./base_model\",\n", + " max_seq_length=max_seq_length,\n", + " quantization_config=quantization_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Configure base model for PEFT\n", + "\n", + "[Parameter Efficient Fine-Tuning](https://huggingface.co/blog/peft) (PEFT) is a technique for adapting LLMs for specific tasks by freezing all of the base model parameters and only training a relatively small number of additional parameters. Compared to fine-tuning all parameters, PEFT can significantly reduce the amount of GPU memory required at the cost of the fine-tuned model accuracy.\n", + "\n", + "In the code below, the base model is configured for PEFT using the [Low-Rank Adaptation](https://arxiv.org/pdf/2106.09685) (LoRA) method. It is recommended that you read the [official documentation](https://docs.unsloth.ai/basics/lora-parameters-encyclopedia) and experiment with the arguments of the `get_peft_model` method. For example, you may get better model performance with different values for `r` and `lora_alpha`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from unsloth import FastLanguageModel\n", + "\n", + "# adapt model for peft\n", + "model = FastLanguageModel.get_peft_model(\n", + " model,\n", + " r=16,\n", + " target_modules=[\n", + " \"q_proj\",\n", + " \"k_proj\",\n", + " \"v_proj\",\n", + " \"o_proj\",\n", + " \"gate_proj\",\n", + " \"up_proj\",\n", + " \"down_proj\",\n", + " ],\n", + " lora_alpha=16,\n", + " lora_dropout=0,\n", + " bias=\"none\",\n", + " use_gradient_checkpointing=\"unsloth\",\n", + " random_state=random_seed,\n", + " use_rslora=False,\n", + " loftq_config=None,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Load training and validation datasets\n", + "\n", + "The following code loads the training and validation datasets from the `train.jsonl` and `val.jsonl` files, respectively\n", + "\n", + "As the files use the TRL instruction format, the TRL trainer used later will be able to [automatically parse](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support) the datasets and [generate the prompts from a template](https://huggingface.co/docs/transformers/en/chat_templating) configured in the tokenizer.\n", + "\n", + "Prompt templates vary between models and TRL will infer the correct template from your base model. If this is not available in your base model or if you wish to change it, you can set your own [template string](https://huggingface.co/docs/transformers/en/chat_templating#advanced-adding-and-editing-chat-templates) manually. Unsloth also provides a selection of [pre-defined chat templates](https://docs.unsloth.ai/basics/chat-templates) for popular language models that you can use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import datasets\n", + "from trl.extras.dataset_formatting import get_formatting_func_from_dataset\n", + "from unsloth.chat_templates import get_chat_template\n", + "\n", + "# Load the training and evaluation datasets from JSONL files on disk\n", + "train_dataset = datasets.load_dataset(\n", + " \"json\", data_files={\"train\": \"train.jsonl\"}, split=\"train\"\n", + ")\n", + "eval_dataset = datasets.load_dataset(\n", + " \"json\", data_files={\"eval\": \"val.jsonl\"}, split=\"eval\"\n", + ")\n", + "\n", + "# Uncomment the following line if you want to test prompt formatting on a single example from the eval dataset\n", + "# print(get_formatting_func_from_dataset(train_dataset, tokenizer)(eval_dataset[0]))\n", + "\n", + "# Get a tokenizer with a chat template to format conversations according to a specified structure\n", + "tokenizer = get_chat_template(\n", + " tokenizer,\n", + " chat_template=\"llama-3\", # Specifies the chat template format (options: zephyr, chatml, mistral, llama, alpaca, etc.)\n", + " mapping={\"role\": \"from\", \"content\": \"value\", \"user\": \"human\", \"assistant\": \"gpt\"}, # Maps dataset roles and messages to expected format\n", + ")\n", + "\n", + "# Define a function to format prompts for each example in the dataset\n", + "def formatting_prompts_func(examples):\n", + " # Extract conversation messages from each example\n", + " convos = examples[\"messages\"]\n", + " \n", + " # Apply the chat template to each conversation without tokenizing or adding generation prompts\n", + " texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]\n", + " \n", + " # Return the formatted texts in a new dictionary key\n", + " return {\"text\": texts}\n", + "\n", + "# Apply the formatting function to both the training and evaluation datasets in batches\n", + "train_dataset = train_dataset.map(formatting_prompts_func, batched=True)\n", + "eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Configure trainer\n", + "\n", + "Below, the arguments for the supervised fine-tuning (SFT) trainer are configured. 
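Before setting them, it can help to spot-check one formatted training example from the previous step. The cell below is a minimal sketch and assumes the `train_dataset`, `tokenizer`, and `max_seq_length` objects created earlier in this notebook.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (a sketch): print one formatted training example and its\n",
    "# token count to confirm the chat template and max_seq_length look sensible.\n",
    "example_text = train_dataset[0][\"text\"]  # produced by formatting_prompts_func above\n",
    "print(example_text)\n",
    "\n",
    "n_tokens = len(tokenizer(example_text)[\"input_ids\"])\n",
    "print(f\"token count: {n_tokens} (max_seq_length = {max_seq_length})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "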
The argument values below were chosen somewhat arbitrarily and produced satisfactory results during testing.\n",
    "\n",
    "It is recommended that you read the official documentation and experiment with the arguments passed to `TrainingArguments` and `SFTTrainer` (see [here](https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer)).\n",
    "\n",
    "For example:\n",
    "- If you get an OOM error when running fine-tuning, you can reduce `per_device_train_batch_size` to reduce the memory footprint. However, if your GPU has sufficient memory, you can try increasing it to reduce the total number of training steps.\n",
    "- Consider setting `max_steps`, as you may not need to perform all epochs to achieve optimal model accuracy. Conversely, you may see better model accuracy by increasing `num_train_epochs`.\n",
    "- If fine-tuning is taking too long, you can increase `eval_steps` to reduce how often validation is performed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import TrainingArguments\n",
    "from trl import SFTTrainer\n",
    "from unsloth import is_bfloat16_supported\n",
    "\n",
    "# configure training args\n",
    "args = TrainingArguments(\n",
    "    ###### training\n",
    "    seed = random_seed,\n",
    "    per_device_train_batch_size = 2,\n",
    "    gradient_accumulation_steps = 4,\n",
    "    warmup_steps = 5,\n",
    "    #max_steps = 60,\n",
    "    num_train_epochs = 2,\n",
    "    learning_rate = 2e-4,\n",
    "    lr_scheduler_type = \"linear\",\n",
    "    optim = \"adamw_8bit\",\n",
    "    weight_decay = 0.01,\n",
    "    ###### datatypes\n",
    "    fp16 = not is_bfloat16_supported(),\n",
    "    bf16 = is_bfloat16_supported(),\n",
    "    ###### evaluation\n",
    "    eval_strategy = \"steps\",\n",
    "    eval_steps = 50,\n",
    "    per_device_eval_batch_size = 8,\n",
    "    ###### outputs\n",
    "    logging_steps = 30,\n",
    "    output_dir = \"outputs\",\n",
    ")\n",
    "\n",
    "# setup trainer\n",
    "trainer = SFTTrainer(\n",
    "    model = model,\n",
    "    tokenizer = tokenizer,\n",
    "    train_dataset = train_dataset,\n",
    "    eval_dataset = eval_dataset,\n",
    "    max_seq_length = max_seq_length,\n",
    "    args = args,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Perform supervised fine-tuning\n",
    "\n",
    "In the code below, fine-tuning is performed using the previously configured trainer.\n",
    "\n",
    "When testing this step on an NVIDIA A100 using the configuration defined above, it took around 12 minutes to perform fine-tuning with a training dataset containing around 500 examples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# run fine-tuning\n",
    "finetune_metrics = trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "After fine-tuning, the base model and fine-tuned adapters are [merged together and saved to disk](https://docs.unsloth.ai/basics/saving-models/saving-to-vllm) in 16-bit for future compatibility with the [vLLM](https://github.com/vllm-project/vllm) model serving library."
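,
    "\n",
    "Once the merged model has been written to disk by the next cell, you can optionally smoke-test it with vLLM. The sketch below rests on assumptions: vLLM has been installed separately (for example with `pip install vllm`, ideally in a fresh environment), and the merged 16-bit model fits in your GPU memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional vLLM smoke test (a sketch). Run this only after the merged model has been\n",
    "# saved by the next cell and vLLM has been installed, ideally in a separate environment.\n",
    "from vllm import LLM, SamplingParams\n",
    "\n",
    "llm = LLM(model=\"./finetuned_model\")  # load the merged 16-bit checkpoint\n",
    "params = SamplingParams(temperature=0.0, max_tokens=64)  # deterministic, short output\n",
    "outputs = llm.generate([\"Hello, what can you help me with?\"], params)\n",
    "print(outputs[0].outputs[0].text)"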
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# save model to disk in 16-bit\n", + "model.save_pretrained_merged(\"./finetuned_model\", tokenizer, save_method=\"merged_16bit\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Visualize fine-tuning metrics\n", + "\n", + "Some of the metrics collected during fine-tuning are visualised below in order for you to diagnose any potential issues with the model.\n", + "\n", + "Specifically, the training and validation losses are plotted against the training step number. Please check the plot for the following:\n", + "- Ideally, as the fine-tuning steps increase, the training and validation losses should decrease and converge. \n", + "- If both loss curves do not converge, it may be worth performing more fine-tuning steps or epochs. This is known as [underfitting](https://www.ibm.com/topics/underfitting).\n", + "- If the validation loss suddenly starts to increase while the training loss continues to decrease or converge, you should decrease your total number of steps or epochs. This is known as [overfitting](https://www.ibm.com/topics/overfitting)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pandas matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# plot step against train and val losses\n", + "fig, ax = plt.subplots()\n", + "log_history = pd.DataFrame(trainer.state.log_history)\n", + "eval_loss = log_history[[\"step\", \"eval_loss\"]].dropna().plot(x=\"step\", ax=ax)\n", + "train_loss = log_history[[\"step\", \"loss\"]].dropna().plot(x=\"step\", ax=ax)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Run ad hoc inference\n", + "\n", + "You can load your fine-tuned model from disk using Unsloth and use it to run optimized inference on individual inputs of your choosing using the code below.\n", + "\n", + "Note that the inputs passed to model are in the [TRL convertsational format](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support) as the Hugging Face [chat template requires them to be](https://huggingface.co/docs/transformers/main/en/chat_templating#how-do-i-use-chat-templates). During training TRL will [automatically convert the instruction format to the conversational format](https://github.com/huggingface/trl/blob/main/trl/extras/dataset_formatting.py). However, you have to do this yourself when applying chat templates manually for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_example = '\\n\\nYour task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.\\n\\nThese are the flows that can be started, with their description and slots:\\n\\nbook_excursion: search and book an excursion\\n slot: trip_destination\\n slot: excursion_name (use the official name as stated by the assistant)\\n \\nsearch_hotel: search for hotels\\n slot: trip_destination\\n slot: hotel_price_range\\n slot: hotel_start_date (check in date for the hotel. 
Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_end_date (check out date for the hotel. Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_search_results_readable\\n slot: hotel_name (Name of the hotel which the user has selected. Must be an exact element of the slot \"hotel_search_results_readable\". If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nsearch_rental_car: search for rental cars\\n slot: trip_destination\\n slot: car_rental_start_date\\n slot: car_rental_end_date\\n slot: car_rental_search_results_readable\\n slot: car_rental_selection (Car rental option which the user has selected. Must be an exact element of the slot car_rental_search_results_readable. If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nchange_flight: change an existing booking to a new flight\\n slot: flight_search_start_date (start date for flight search. Refer to the current time available and always output this in the ISO format.)\\n slot: flight_search_end_date (end date for flight search. Refer to the current time available and always output this in the ISO format.)\\n slot: selected_flight_id (the selected id number from the search results presented to the user.)\\n \\nlist_bookings: list flight bookings - booked flights\\n \\ngoodbye: say goodbye to user\\n \\nlist_skills: tell the user what i can do\\n \\nout_of_scope: trigger this if the user asks for something we don\\'t have a flow for or an existing command does not apply for it. Conversation repair flow for off-topic interactions that won\\'t disrupt the main conversation. should not respond to greetings or anything else for which there is a flow defined or an existing command is apt for it.\\n \\nwelcome: greet the user and ask how you can help\\n \\n\\n===\\nHere is what happened previously in the conversation:\\n\\nUSER: I\\'d like to book an excursion\\n\\n===\\n\\nYou are currently not in any flow and so there are no active slots.\\nThis means you can only set a slot if you first start a flow that requires that slot.\\n\\nIf you start a flow, first start the flow and then optionally fill that flow\\'s slots with information the user provided in their message.\\n\\nThe user just said \"\"\"I\\'d like to book an excursion\"\"\".\\n\\n===\\nBased on this information generate a list of actions you want to take. Your job is to start flows and to fill slots where appropriate. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:\\n* Slot setting, described by \"SetSlot(slot_name, slot_value)\". An example would be \"SetSlot(recipient, Freddy)\"\\n* Starting another flow, described by \"StartFlow(flow_name)\". An example would be \"StartFlow(transfer_money)\"\\n* Cancelling the current flow, described by \"CancelFlow()\"\\n* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote \"contacts\" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).\\n* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by \"SkipQuestion()\". 
Examples of user skip phrases are: \"Go to the next question\", \"Ask me something else\".\\n* Responding to knowledge-oriented user messages, described by \"SearchAndReply()\"\\n* Responding to a casual, non-task-oriented user message, described by \"ChitChat()\".\\n* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by \"HumanHandoff()\".\\n\\n===\\nWrite out the actions you want to take, one per line, in the order they should take place.\\nDo not fill slots with abstract values or placeholders.\\nOnly use information provided by the user.\\nOnly start a flow if it\\'s completely clear what the user wants. Imagine you were a person reading this message. If it\\'s not 100% clear, clarify the next step.\\nDon\\'t be overly confident. Take a conservative approach and clarify before proceeding.\\nIf the user asks for two things which seem contradictory, clarify before starting a flow.\\nIf it\\'s not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.\\nStrictly adhere to the provided action types listed above.\\nFocus on the last message and take it one step at a time.\\nUse the previous conversation steps only to aid understanding.\\n\\nCurrent time is \"2024-04-27 18:13:47.329404 +0000\" and it is a Saturday today!\\n\\nYour action list:'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "from unsloth import FastLanguageModel\n", + "\n", + "model, tokenizer = FastLanguageModel.from_pretrained(\"./finetuned_model\")\n", + "FastLanguageModel.for_inference(model) # enable inference optimizations\n", + "streamer = TextStreamer(tokenizer) # stream model outputs as they are generated\n", + "\n", + "# the content to include in the input prompt\n", + "content = prompt_example\n", + "\n", + "# apply prompt template and tokenize\n", + "input_ids = tokenizer.apply_chat_template(\n", + " [{\"role\": \"user\", \"content\": content}], # in the TRL conversational format\n", + " tokenize=True,\n", + " add_generation_prompt=True,\n", + " return_tensors=\"pt\",\n", + ").to(\"cuda\")\n", + "\n", + "# generate model output from user input\n", + "_ = model.generate(\n", + " input_ids=input_ids,\n", + " streamer=streamer, # remove streamer if you want whole output at end\n", + " max_new_tokens=64, # set the limit on how many tokens are generated\n", + " do_sample=False, # disable random sampling for deterministic outputs\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Export fine-tuned model\n", + "\n", + "Lastly, export your fine-tuned model directory to an appropriate storage location that can be easily accessed later for [deployment](https://rasa.com/rasa-pro/docs/building-assistants/self-hosted-llm).\n", + "\n", + "It is recommended that you use a cloud object store, such as [Amazon S3](https://aws.amazon.com/s3/) or [Google Cloud Storage](https://cloud.google.com/storage).\n", + "\n", + "Uncomment and run the corresponding commands below for your cloud provider, making sure to first update the environment variables with your own values. 
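If you prefer to upload from Python instead of the CLI, the sketch below copies the model directory to Amazon S3 using `boto3`. It is only a sketch: the bucket name is a placeholder, and it assumes `boto3` is installed and your AWS credentials are already configured.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional Python alternative to the CLI upload below (a sketch).\n",
    "# Assumes boto3 is installed, AWS credentials are configured, and the bucket exists.\n",
    "import os\n",
    "import boto3\n",
    "\n",
    "local_dir = \"./finetuned_model\"\n",
    "bucket = \"CHANGEME\"  # placeholder: your S3 bucket name\n",
    "\n",
    "s3 = boto3.client(\"s3\")\n",
    "for root, _, files in os.walk(local_dir):\n",
    "    for name in files:\n",
    "        local_path = os.path.join(root, name)\n",
    "        key = os.path.join(\"finetuned_model\", os.path.relpath(local_path, local_dir))\n",
    "        s3.upload_file(local_path, bucket, key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "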
It is assumed that:\n", + "- your bucket already exists\n", + "- you have already installed the CLI tool for your cloud provider\n", + "- you have already authenticated with your cloud provider and have sufficient permissions to write to your bucket" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "export LOCAL_MODEL_PATH=\"./finetuned_model\"\n", + "\n", + "# if using amazon\n", + "# export S3_MODEL_URI=\"s3://CHANGEME\" # update with your value\n", + "# aws s3 cp \"${LOCAL_MODEL_PATH}\" \"${S3_MODEL_URI}\" --recursive\n", + "\n", + "# if using google\n", + "# export GCS_MODEL_URI=\"gs://CHANGEME\" # update with your value\n", + "# gsutil cp -r \"${LOCAL_MODEL_PATH}\" \"${GCS_MODEL_URI}\"" + ] + } + ], + "metadata": { + "environment": { + "kernel": "unsloth_env", + "name": ".m125", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/:m125" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cmd_gen_finetuning.ipynb b/cmd_gen_finetuning.ipynb index 16d030e..dc61608 100644 --- a/cmd_gen_finetuning.ipynb +++ b/cmd_gen_finetuning.ipynb @@ -395,6 +395,15 @@ "Note that the inputs passed to model are in the [TRL convertsational format](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support) as the Hugging Face [chat template requires them to be](https://huggingface.co/docs/transformers/main/en/chat_templating#how-do-i-use-chat-templates). During training TRL will [automatically convert the instruction format to the conversational format](https://github.com/huggingface/trl/blob/main/trl/extras/dataset_formatting.py). However, you have to do this yourself when applying chat templates manually for inference." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_example = '\\n\\nYour task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.\\n\\nThese are the flows that can be started, with their description and slots:\\n\\nbook_excursion: search and book an excursion\\n slot: trip_destination\\n slot: excursion_name (use the official name as stated by the assistant)\\n \\nsearch_hotel: search for hotels\\n slot: trip_destination\\n slot: hotel_price_range\\n slot: hotel_start_date (check in date for the hotel. Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_end_date (check out date for the hotel. Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_search_results_readable\\n slot: hotel_name (Name of the hotel which the user has selected. Must be an exact element of the slot \"hotel_search_results_readable\". 
If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nsearch_rental_car: search for rental cars\\n slot: trip_destination\\n slot: car_rental_start_date\\n slot: car_rental_end_date\\n slot: car_rental_search_results_readable\\n slot: car_rental_selection (Car rental option which the user has selected. Must be an exact element of the slot car_rental_search_results_readable. If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nchange_flight: change an existing booking to a new flight\\n slot: flight_search_start_date (start date for flight search. Refer to the current time available and always output this in the ISO format.)\\n slot: flight_search_end_date (end date for flight search. Refer to the current time available and always output this in the ISO format.)\\n slot: selected_flight_id (the selected id number from the search results presented to the user.)\\n \\nlist_bookings: list flight bookings - booked flights\\n \\ngoodbye: say goodbye to user\\n \\nlist_skills: tell the user what i can do\\n \\nout_of_scope: trigger this if the user asks for something we don\\'t have a flow for or an existing command does not apply for it. Conversation repair flow for off-topic interactions that won\\'t disrupt the main conversation. should not respond to greetings or anything else for which there is a flow defined or an existing command is apt for it.\\n \\nwelcome: greet the user and ask how you can help\\n \\n\\n===\\nHere is what happened previously in the conversation:\\n\\nUSER: I\\'d like to book an excursion\\n\\n===\\n\\nYou are currently not in any flow and so there are no active slots.\\nThis means you can only set a slot if you first start a flow that requires that slot.\\n\\nIf you start a flow, first start the flow and then optionally fill that flow\\'s slots with information the user provided in their message.\\n\\nThe user just said \"\"\"I\\'d like to book an excursion\"\"\".\\n\\n===\\nBased on this information generate a list of actions you want to take. Your job is to start flows and to fill slots where appropriate. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:\\n* Slot setting, described by \"SetSlot(slot_name, slot_value)\". An example would be \"SetSlot(recipient, Freddy)\"\\n* Starting another flow, described by \"StartFlow(flow_name)\". An example would be \"StartFlow(transfer_money)\"\\n* Cancelling the current flow, described by \"CancelFlow()\"\\n* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote \"contacts\" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).\\n* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by \"SkipQuestion()\". 
Examples of user skip phrases are: \"Go to the next question\", \"Ask me something else\".\\n* Responding to knowledge-oriented user messages, described by \"SearchAndReply()\"\\n* Responding to a casual, non-task-oriented user message, described by \"ChitChat()\".\\n* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by \"HumanHandoff()\".\\n\\n===\\nWrite out the actions you want to take, one per line, in the order they should take place.\\nDo not fill slots with abstract values or placeholders.\\nOnly use information provided by the user.\\nOnly start a flow if it\\'s completely clear what the user wants. Imagine you were a person reading this message. If it\\'s not 100% clear, clarify the next step.\\nDon\\'t be overly confident. Take a conservative approach and clarify before proceeding.\\nIf the user asks for two things which seem contradictory, clarify before starting a flow.\\nIf it\\'s not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.\\nStrictly adhere to the provided action types listed above.\\nFocus on the last message and take it one step at a time.\\nUse the previous conversation steps only to aid understanding.\\n\\nCurrent time is \"2024-04-27 18:13:47.329404 +0000\" and it is a Saturday today!\\n\\nYour action list:'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -409,8 +418,7 @@ "streamer = TextStreamer(tokenizer) # stream model outputs as they are generated\n", "\n", "# the content to include in the input prompt\n", - "# by default, a value from the validation dataset as example\n", - "content = eval_dataset[\"text\"][0]\n", + "content = prompt_example\n", "\n", "# apply prompt template and tokenize\n", "input_ids = tokenizer.apply_chat_template(\n", @@ -472,7 +480,7 @@ "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/:m125" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, From 4542e1efa9577cb0a5b9ef3b72d3aecfd162310d Mon Sep 17 00:00:00 2001 From: Kat Petrova Date: Thu, 14 Nov 2024 10:36:48 +0100 Subject: [PATCH 2/2] only notebook --- .idea/workspace.xml | 49 -- .../cmd_gen_finetuning-checkpoint.ipynb | 502 ------------------ 2 files changed, 551 deletions(-) delete mode 100644 .idea/workspace.xml delete mode 100644 .ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 058b2fc..0000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - { - "associatedIndex": 8 -} - - - - { - "keyToString": { - "RunOnceActivity.ShowReadmeOnStart": "true", - "git-widget-placeholder": "hyperparams", - "last_opened_file_path": "/Users/katpetrova/rasa/notebooks" - } -} - - - - - - - - - - - 1728984400605 - - - - \ No newline at end of file diff --git a/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb b/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb deleted file mode 100644 index 90b6859..0000000 --- a/.ipynb_checkpoints/cmd_gen_finetuning-checkpoint.ipynb +++ /dev/null @@ -1,502 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fine-tuning an LLM for Command Generation in CALM\n", - "\n", - "This is a worked example of how to efficiently fine-tune a base language model from [Hugging Face Hub](https://huggingface.co/models) using the [Unsloth](https://docs.unsloth.ai) and [TRL](https://huggingface.co/docs/trl/en/index) 
libraries for the task of command generation within [CALM](https://rasa.com/docs/rasa-pro/calm).\n", - "\n", - "Unsloth integrates with TRL in order to reduce the time and GPU memory required to fine-tune LLMs, when compared to using TRL exclusively.\n", - "\n", - "To run fine-tuning, you must have first [generated the dataset](https://rasa.com/rasa-pro/docs/operating/fine-tuning-recipe) files `train.jsonl` and `val.jsonl`, which must be in the [TRL instruction format](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Configure fine-tuning environment\n", - "\n", - "In order to run this notebook, you will need to first install Unsloth onto a machine with the following minimum hardware requirements:\n", - "- Single NVIDIA A100 GPU with 40GB VRAM\n", - "- 12 core CPU with 85GB RAM\n", - "- 250GB disk\n", - "\n", - "Please refer to the Unsloth installation instructions in the [official documentation](https://github.com/unslothai/unsloth/blob/main/README.md).\n", - "\n", - "Here is an example of how to set up the environment:\n", - "\n", - "First, we provisioned a Linux instance with the appropriate hardware and the following software installed:\n", - "- Python 3.10\n", - "- CUDA Toolkit 12.1\n", - "- PyTorch 2.2.\n", - "\n", - "Next, we installed Unsloth and other packages as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sh\n", - "# install unsloth and other dependencies\n", - "pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n", - "pip install --no-deps \"xformers<=0.0.26\" \"trl<0.9.0\" peft accelerate bitsandbytes huggingface_hub[cli]\n", - "# remove tpu-only package that is installed by default on gcp runtimes, even when only using gpu\n", - "pip uninstall torch-xla -y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Unsloth installation is very sensitive to the environment, in particular Cuda and PyTorch versions, so follow the [official installation instructions](https://github.com/unslothai/unsloth/blob/main/README.md) appropriate for your set-up." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Download base model\n", - "\n", - "You can download the model you want to fine-tune from Hugging Face Hub using the [official CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli) with an [API access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token) as per the code below. Make sure you first update the `HUGGINGFACE_TOKEN` and `BASE_MODEL` environment variables with your own values.\n", - "\n", - "When testing this notebook, the [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) model was used. Note that `meta-llama/Meta-Llama-3.1-8B-Instruct` is a [gated model](https://huggingface.co/docs/hub/en/models-gated) that you must first request access to before using. \n", - "\n", - "You can use any other PyTorch model available on [Hugging Face Hub](https://huggingface.co/models). It is recommended that you use a model that has been pre-trained on instructional tasks, such as the [CodeLlama 13B Instruct](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) model.\n", - "\n", - "Pre-trained models with more parameters will generally perform better at tasks than models with fewer parameters. 
However, the size of model you can use is limited by how much memory your GPU has.\n", - "\n", - "Alternatively, if you already have a PyTorch model directory to hand, you can upload it to your notebook environment manually." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%sh\n", - "# TODO: update with your values\n", - "export HUGGINGFACE_TOKEN=\"CHANGEME\"\n", - "export BASE_MODEL=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", - "\n", - "# download model\n", - "huggingface-cli download \"${BASE_MODEL}\" \\\n", - " --token \"${HUGGINGFACE_TOKEN}\" \\\n", - " --local-dir \"./base_model\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Load and quantize base model\n", - "\n", - "The [quantization of model parameters](https://huggingface.co/docs/optimum/en/concept_guides/quantization) can significantly reduce the GPU memory required to run model fine-tuning and inference, at the cost of model accuracy.\n", - "\n", - "Here, the base model is loaded from disk and quantized into an 4-bit representation on the fly using the [BitsAndBytes](https://huggingface.co/docs/transformers/main/en/quantization/bitsandbytes) library." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from transformers import BitsAndBytesConfig\n", - "from unsloth import FastLanguageModel\n", - "\n", - "max_seq_length = 2048\n", - "random_seed = 42\n", - "\n", - "\n", - "# configure quantization method for base model\n", - "quantization_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - ")\n", - "\n", - "# load quantized model and tokenizer from disk\n", - "model, tokenizer = FastLanguageModel.from_pretrained(\n", - " model_name=\"./base_model\",\n", - " max_seq_length=max_seq_length,\n", - " quantization_config=quantization_config,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Configure base model for PEFT\n", - "\n", - "[Parameter Efficient Fine-Tuning](https://huggingface.co/blog/peft) (PEFT) is a technique for adapting LLMs for specific tasks by freezing all of the base model parameters and only training a relatively small number of additional parameters. Compared to fine-tuning all parameters, PEFT can significantly reduce the amount of GPU memory required at the cost of the fine-tuned model accuracy.\n", - "\n", - "In the code below, the base model is configured for PEFT using the [Low-Rank Adaptation](https://arxiv.org/pdf/2106.09685) (LoRA) method. It is recommended that you read the [official documentation](https://docs.unsloth.ai/basics/lora-parameters-encyclopedia) and experiment with the arguments of the `get_peft_model` method. For example, you may get better model performance with different values for `r` and `lora_alpha`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from unsloth import FastLanguageModel\n", - "\n", - "# adapt model for peft\n", - "model = FastLanguageModel.get_peft_model(\n", - " model,\n", - " r=16,\n", - " target_modules=[\n", - " \"q_proj\",\n", - " \"k_proj\",\n", - " \"v_proj\",\n", - " \"o_proj\",\n", - " \"gate_proj\",\n", - " \"up_proj\",\n", - " \"down_proj\",\n", - " ],\n", - " lora_alpha=16,\n", - " lora_dropout=0,\n", - " bias=\"none\",\n", - " use_gradient_checkpointing=\"unsloth\",\n", - " random_state=random_seed,\n", - " use_rslora=False,\n", - " loftq_config=None,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Load training and validation datasets\n", - "\n", - "The following code loads the training and validation datasets from the `train.jsonl` and `val.jsonl` files, respectively\n", - "\n", - "As the files use the TRL instruction format, the TRL trainer used later will be able to [automatically parse](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support) the datasets and [generate the prompts from a template](https://huggingface.co/docs/transformers/en/chat_templating) configured in the tokenizer.\n", - "\n", - "Prompt templates vary between models and TRL will infer the correct template from your base model. If this is not available in your base model or if you wish to change it, you can set your own [template string](https://huggingface.co/docs/transformers/en/chat_templating#advanced-adding-and-editing-chat-templates) manually. Unsloth also provides a selection of [pre-defined chat templates](https://docs.unsloth.ai/basics/chat-templates) for popular language models that you can use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import datasets\n", - "from trl.extras.dataset_formatting import get_formatting_func_from_dataset\n", - "from unsloth.chat_templates import get_chat_template\n", - "\n", - "# Load the training and evaluation datasets from JSONL files on disk\n", - "train_dataset = datasets.load_dataset(\n", - " \"json\", data_files={\"train\": \"train.jsonl\"}, split=\"train\"\n", - ")\n", - "eval_dataset = datasets.load_dataset(\n", - " \"json\", data_files={\"eval\": \"val.jsonl\"}, split=\"eval\"\n", - ")\n", - "\n", - "# Uncomment the following line if you want to test prompt formatting on a single example from the eval dataset\n", - "# print(get_formatting_func_from_dataset(train_dataset, tokenizer)(eval_dataset[0]))\n", - "\n", - "# Get a tokenizer with a chat template to format conversations according to a specified structure\n", - "tokenizer = get_chat_template(\n", - " tokenizer,\n", - " chat_template=\"llama-3\", # Specifies the chat template format (options: zephyr, chatml, mistral, llama, alpaca, etc.)\n", - " mapping={\"role\": \"from\", \"content\": \"value\", \"user\": \"human\", \"assistant\": \"gpt\"}, # Maps dataset roles and messages to expected format\n", - ")\n", - "\n", - "# Define a function to format prompts for each example in the dataset\n", - "def formatting_prompts_func(examples):\n", - " # Extract conversation messages from each example\n", - " convos = examples[\"messages\"]\n", - " \n", - " # Apply the chat template to each conversation without tokenizing or adding generation prompts\n", - " texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]\n", - " \n", - " # Return the formatted texts in a new dictionary key\n", - " return {\"text\": texts}\n", - "\n", - "# Apply the formatting function to both the training and evaluation datasets in batches\n", - "train_dataset = train_dataset.map(formatting_prompts_func, batched=True)\n", - "eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Configure trainer\n", - "\n", - "Below, the arguments for the supervised fine-tuning (SFT) trainer are configured. Their values were chosen somewhat arbitrarily and resulted in satisfactory results during testing.\n", - "\n", - "It is recommended that you read the official documentation and experiment with the arguments passed to `SFTConfig` (see [here](https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer)) and `SFTTrainer` (see [here](https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer)).\n", - "\n", - "For example:\n", - "- If you get an OOM error when running fine-tuning, you can reduce `per_device_train_batch_size` in order to reduce the memory footprint. However, if your GPU has sufficient memory, you can try increasing it in order to reduce the total number of training steps.\n", - "- Consider setting `max_steps`, as you may not need to perform all epochs in order to achieve optimal model accuracy. Conversely, you may see better model accuracy by increasing `num_train_epochs`.\n", - "- If fine-tuning is taking too long, you can increase `eval_steps` in order to reduce how often validation is performed. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import torch\n", - "from transformers import TrainingArguments\n", - "from trl import SFTTrainer\n", - "from unsloth import is_bfloat16_supported\n", - "\n", - "# configure training args\n", - "args = TrainingArguments(\n", - " ###### training\n", - " seed = random_seed,\n", - " per_device_train_batch_size = 2,\n", - " gradient_accumulation_steps = 4,\n", - " warmup_steps = 5,\n", - " #max_steps = 60,\n", - " num_train_epochs = 2,\n", - " learning_rate = 2e-4,\n", - " lr_scheduler_type = \"linear\",\n", - " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", - " ###### datatypes\n", - " fp16 = not is_bfloat16_supported(),\n", - " bf16 = is_bfloat16_supported(),\n", - " ###### evaluation\n", - " eval_strategy = \"steps\",\n", - " eval_steps = 50,\n", - " per_device_eval_batch_size = 8,\n", - " ###### outputs\n", - " logging_steps = 30,\n", - " output_dir = \"outputs\",\n", - ")\n", - "\n", - "# setup trainer\n", - "trainer = SFTTrainer(\n", - " model = model,\n", - " tokenizer = tokenizer,\n", - " train_dataset = train_dataset,\n", - " eval_dataset = eval_dataset,\n", - " max_seq_length = max_seq_length,\n", - " args = args,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Perform supervised fine-tuning\n", - "\n", - "In the code below, fine-tuning is performed using the previously congfigured trainer.\n", - "\n", - "When testing this step on an NVIDIA A100 using the configuration defined above, it took around 12 minutes to perform fine-tuning with a training dataset containing around 500 examples." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# run fine-tuning\n", - "finetune_metrics = trainer.train()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "After fine-tuning, the base model and fine-tuned adapters are [merged together and saved to disk](https://docs.unsloth.ai/basics/saving-models/saving-to-vllm) in 16-bit for future compatibility with the [vLLM](https://github.com/vllm-project/vllm) model serving library." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# save model to disk in 16-bit\n", - "model.save_pretrained_merged(\"./finetuned_model\", tokenizer, save_method=\"merged_16bit\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. Visualize fine-tuning metrics\n", - "\n", - "Some of the metrics collected during fine-tuning are visualised below in order for you to diagnose any potential issues with the model.\n", - "\n", - "Specifically, the training and validation losses are plotted against the training step number. Please check the plot for the following:\n", - "- Ideally, as the fine-tuning steps increase, the training and validation losses should decrease and converge. \n", - "- If both loss curves do not converge, it may be worth performing more fine-tuning steps or epochs. This is known as [underfitting](https://www.ibm.com/topics/underfitting).\n", - "- If the validation loss suddenly starts to increase while the training loss continues to decrease or converge, you should decrease your total number of steps or epochs. This is known as [overfitting](https://www.ibm.com/topics/overfitting)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install pandas matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# plot step against train and val losses\n", - "fig, ax = plt.subplots()\n", - "log_history = pd.DataFrame(trainer.state.log_history)\n", - "eval_loss = log_history[[\"step\", \"eval_loss\"]].dropna().plot(x=\"step\", ax=ax)\n", - "train_loss = log_history[[\"step\", \"loss\"]].dropna().plot(x=\"step\", ax=ax)\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 9. Run ad hoc inference\n", - "\n", - "You can load your fine-tuned model from disk using Unsloth and use it to run optimized inference on individual inputs of your choosing using the code below.\n", - "\n", - "Note that the inputs passed to model are in the [TRL convertsational format](https://huggingface.co/docs/trl/en/sft_trainer#dataset-format-support) as the Hugging Face [chat template requires them to be](https://huggingface.co/docs/transformers/main/en/chat_templating#how-do-i-use-chat-templates). During training TRL will [automatically convert the instruction format to the conversational format](https://github.com/huggingface/trl/blob/main/trl/extras/dataset_formatting.py). However, you have to do this yourself when applying chat templates manually for inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prompt_example = '\\n\\nYour task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.\\n\\nThese are the flows that can be started, with their description and slots:\\n\\nbook_excursion: search and book an excursion\\n slot: trip_destination\\n slot: excursion_name (use the official name as stated by the assistant)\\n \\nsearch_hotel: search for hotels\\n slot: trip_destination\\n slot: hotel_price_range\\n slot: hotel_start_date (check in date for the hotel. Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_end_date (check out date for the hotel. Do not fill this slot unless user explicitly specifies it in the conversation.)\\n slot: hotel_search_results_readable\\n slot: hotel_name (Name of the hotel which the user has selected. Must be an exact element of the slot \"hotel_search_results_readable\". If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nsearch_rental_car: search for rental cars\\n slot: trip_destination\\n slot: car_rental_start_date\\n slot: car_rental_end_date\\n slot: car_rental_search_results_readable\\n slot: car_rental_selection (Car rental option which the user has selected. Must be an exact element of the slot car_rental_search_results_readable. If the user wants an option that isn\\'t listed, do not fill the slot with that value.)\\n \\nchange_flight: change an existing booking to a new flight\\n slot: flight_search_start_date (start date for flight search. Refer to the current time available and always output this in the ISO format.)\\n slot: flight_search_end_date (end date for flight search. 
Refer to the current time available and always output this in the ISO format.)\\n slot: selected_flight_id (the selected id number from the search results presented to the user.)\\n \\nlist_bookings: list flight bookings - booked flights\\n \\ngoodbye: say goodbye to user\\n \\nlist_skills: tell the user what i can do\\n \\nout_of_scope: trigger this if the user asks for something we don\\'t have a flow for or an existing command does not apply for it. Conversation repair flow for off-topic interactions that won\\'t disrupt the main conversation. should not respond to greetings or anything else for which there is a flow defined or an existing command is apt for it.\\n \\nwelcome: greet the user and ask how you can help\\n \\n\\n===\\nHere is what happened previously in the conversation:\\n\\nUSER: I\\'d like to book an excursion\\n\\n===\\n\\nYou are currently not in any flow and so there are no active slots.\\nThis means you can only set a slot if you first start a flow that requires that slot.\\n\\nIf you start a flow, first start the flow and then optionally fill that flow\\'s slots with information the user provided in their message.\\n\\nThe user just said \"\"\"I\\'d like to book an excursion\"\"\".\\n\\n===\\nBased on this information generate a list of actions you want to take. Your job is to start flows and to fill slots where appropriate. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:\\n* Slot setting, described by \"SetSlot(slot_name, slot_value)\". An example would be \"SetSlot(recipient, Freddy)\"\\n* Starting another flow, described by \"StartFlow(flow_name)\". An example would be \"StartFlow(transfer_money)\"\\n* Cancelling the current flow, described by \"CancelFlow()\"\\n* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote \"contacts\" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).\\n* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by \"SkipQuestion()\". Examples of user skip phrases are: \"Go to the next question\", \"Ask me something else\".\\n* Responding to knowledge-oriented user messages, described by \"SearchAndReply()\"\\n* Responding to a casual, non-task-oriented user message, described by \"ChitChat()\".\\n* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by \"HumanHandoff()\".\\n\\n===\\nWrite out the actions you want to take, one per line, in the order they should take place.\\nDo not fill slots with abstract values or placeholders.\\nOnly use information provided by the user.\\nOnly start a flow if it\\'s completely clear what the user wants. Imagine you were a person reading this message. If it\\'s not 100% clear, clarify the next step.\\nDon\\'t be overly confident. 
Take a conservative approach and clarify before proceeding.\\nIf the user asks for two things which seem contradictory, clarify before starting a flow.\\nIf it\\'s not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.\\nStrictly adhere to the provided action types listed above.\\nFocus on the last message and take it one step at a time.\\nUse the previous conversation steps only to aid understanding.\\n\\nCurrent time is \"2024-04-27 18:13:47.329404 +0000\" and it is a Saturday today!\\n\\nYour action list:'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import TextStreamer\n", - "from unsloth import FastLanguageModel\n", - "\n", - "model, tokenizer = FastLanguageModel.from_pretrained(\"./finetuned_model\")\n", - "FastLanguageModel.for_inference(model) # enable inference optimizations\n", - "streamer = TextStreamer(tokenizer) # stream model outputs as they are generated\n", - "\n", - "# the content to include in the input prompt\n", - "content = prompt_example\n", - "\n", - "# apply prompt template and tokenize\n", - "input_ids = tokenizer.apply_chat_template(\n", - " [{\"role\": \"user\", \"content\": content}], # in the TRL conversational format\n", - " tokenize=True,\n", - " add_generation_prompt=True,\n", - " return_tensors=\"pt\",\n", - ").to(\"cuda\")\n", - "\n", - "# generate model output from user input\n", - "_ = model.generate(\n", - " input_ids=input_ids,\n", - " streamer=streamer, # remove streamer if you want whole output at end\n", - " max_new_tokens=64, # set the limit on how many tokens are generated\n", - " do_sample=False, # disable random sampling for deterministic outputs\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 10. Export fine-tuned model\n", - "\n", - "Lastly, export your fine-tuned model directory to an appropriate storage location that can be easily accessed later for [deployment](https://rasa.com/rasa-pro/docs/building-assistants/self-hosted-llm).\n", - "\n", - "It is recommended that you use a cloud object store, such as [Amazon S3](https://aws.amazon.com/s3/) or [Google Cloud Storage](https://cloud.google.com/storage).\n", - "\n", - "Uncomment and run the corresponding commands below for your cloud provider, making sure to first update the environment variables with your own values. 
It is assumed that:\n", - "- your bucket already exists\n", - "- you have already installed the CLI tool for your cloud provider\n", - "- you have already authenticated with your cloud provider and have sufficient permissions to write to your bucket" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sh\n", - "export LOCAL_MODEL_PATH=\"./finetuned_model\"\n", - "\n", - "# if using amazon\n", - "# export S3_MODEL_URI=\"s3://CHANGEME\" # update with your value\n", - "# aws s3 cp \"${LOCAL_MODEL_PATH}\" \"${S3_MODEL_URI}\" --recursive\n", - "\n", - "# if using google\n", - "# export GCS_MODEL_URI=\"gs://CHANGEME\" # update with your value\n", - "# gsutil cp -r \"${LOCAL_MODEL_PATH}\" \"${GCS_MODEL_URI}\"" - ] - } - ], - "metadata": { - "environment": { - "kernel": "unsloth_env", - "name": ".m125", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/:m125" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.18" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}