From 9fbf6c73ce88fe5082cd4d645f83c62619ea5673 Mon Sep 17 00:00:00 2001
From: Yiren Lu
Date: Thu, 18 Apr 2024 18:57:31 -0400
Subject: [PATCH] Update llm-finetuning docs

---
 config/mistral.yml |   2 +-
 src/common.py      |   2 +-
 src/inference.py   | 117 ---------------------------------------------
 3 files changed, 2 insertions(+), 119 deletions(-)
 delete mode 100644 src/inference.py

diff --git a/config/mistral.yml b/config/mistral.yml
index 9dec966..c9d4e6f 100644
--- a/config/mistral.yml
+++ b/config/mistral.yml
@@ -21,7 +21,7 @@ datasets:
       format: |-
         [INST] Using the schema context below, generate a SQL query that answers the question.
         {input}
-        {instruction} [/INST]
+        {instruction} [/INST]
 
 dataset_prepared_path:
 val_set_size: 0.05
diff --git a/src/common.py b/src/common.py
index 627f045..dcfcd1b 100644
--- a/src/common.py
+++ b/src/common.py
@@ -15,7 +15,7 @@
         "git clone https://github.com/OpenAccess-AI-Collective/axolotl /root/axolotl",
         "cd /root/axolotl && git checkout v0.4.0",
     )
-    .pip_install("huggingface_hub==0.20.3", "hf-transfer==0.1.5")
+    .pip_install("huggingface_hub==0.20.3", "hf-transfer==0.1.5", "wandb==0.16.3")
     .env(
         dict(
             HUGGINGFACE_HUB_CACHE="/pretrained",
diff --git a/src/inference.py b/src/inference.py
deleted file mode 100644
index 084e624..0000000
--- a/src/inference.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import time
-import yaml
-from pathlib import Path
-
-import modal
-from fastapi.responses import StreamingResponse
-
-from .common import stub, vllm_image, VOLUME_CONFIG
-
-N_INFERENCE_GPU = 2
-
-with vllm_image.imports():
-    from vllm.engine.arg_utils import AsyncEngineArgs
-    from vllm.engine.async_llm_engine import AsyncLLMEngine
-    from vllm.sampling_params import SamplingParams
-    from vllm.utils import random_uuid
-
-
-def get_model_path_from_run(path: Path) -> Path:
-    with (path / "config.yml").open() as f:
-        return path / yaml.safe_load(f.read())["output_dir"] / "merged"
-
-
-@stub.cls(
-    gpu=modal.gpu.H100(count=N_INFERENCE_GPU),
-    image=vllm_image,
-    volumes=VOLUME_CONFIG,
-    allow_concurrent_inputs=30,
-    container_idle_timeout=900,
-)
-class Inference:
-    def __init__(self, run_name: str = "", run_dir: str = "/runs") -> None:
-        self.run_name = run_name
-        self.run_dir = run_dir
-
-    @modal.enter()
-    def init(self):
-        if self.run_name:
-            path = Path(self.run_dir) / self.run_name
-            model_path = get_model_path_from_run(path)
-        else:
-            # Pick the last run automatically
-            run_paths = list(Path(self.run_dir).iterdir())
-            for path in sorted(run_paths, reverse=True):
-                model_path = get_model_path_from_run(path)
-                if model_path.exists():
-                    break
-
-        print("Initializing vLLM engine on:", model_path)
-
-        engine_args = AsyncEngineArgs(
-            model=model_path,
-            gpu_memory_utilization=0.95,
-            tensor_parallel_size=N_INFERENCE_GPU,
-        )
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-
-    async def _stream(self, input: str):
-        if not input:
-            return
-
-        sampling_params = SamplingParams(
-            repetition_penalty=1.1,
-            temperature=0.2,
-            top_p=0.95,
-            top_k=50,
-            max_tokens=1024,
-        )
-        request_id = random_uuid()
-        results_generator = self.engine.generate(input, sampling_params, request_id)
-
-        t0 = time.time()
-        index, tokens = 0, 0
-        async for request_output in results_generator:
-            if (
-                request_output.outputs[0].text
-                and "\ufffd" == request_output.outputs[0].text[-1]
-            ):
-                continue
-            yield request_output.outputs[0].text[index:]
-            index = len(request_output.outputs[0].text)
-
-            # Token accounting
-            new_tokens = len(request_output.outputs[0].token_ids)
-            tokens = new_tokens
-
-        throughput = tokens / (time.time() - t0)
-        print(f"Request completed: {throughput:.4f} tokens/s")
-        print(request_output.outputs[0].text)
-
-    @modal.method()
-    async def completion(self, input: str):
-        async for text in self._stream(input):
-            yield text
-
-    @modal.method()
-    async def non_streaming(self, input: str):
-        output = [text async for text in self._stream(input)]
-        return "".join(output)
-
-    @modal.web_endpoint()
-    async def web(self, input: str):
-        return StreamingResponse(self._stream(input), media_type="text/event-stream")
-
-
-@stub.local_entrypoint()
-def inference_main(run_name: str = "", prompt: str = ""):
-    if prompt:
-        for chunk in Inference(run_name).completion.remote_gen(prompt):
-            print(chunk, end="")
-    else:
-        prompt = input(
-            "Enter a prompt (including the prompt template, e.g. [INST] ... [/INST]):\n"
-        )
-        print("Loading model ...")
-        for chunk in Inference(run_name).completion.remote_gen(prompt):
-            print(chunk, end="")