From cbe934025e44a1d6db623b5659d0271790f16435 Mon Sep 17 00:00:00 2001
From: Matthias Reso <13337103+mreso@users.noreply.github.com>
Date: Fri, 5 Jul 2024 10:31:40 -0700
Subject: [PATCH] Fix/llm launcher disable token (#3230)

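* Fix the disable_token_auth API: rename the launcher flag from `--disable_token-auth` to `--disable_token_auth` and read `args.disable_token_auth` instead of `args.disable_token`

* Move the vllm dependency from requirements/torch_linux.txt to requirements/common.txt

* Fix the LLM deployment docs: use `--disable_token_auth` and add `--shm-size 1g` to the `docker run` examples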
---
 README.md                    | 2 +-
 docs/llm_deployment.md       | 4 ++--
 requirements/common.txt      | 1 +
 requirements/torch_linux.txt | 1 -
 ts/llm_launcher.py           | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c797383b2d..33956318fc 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ Refer to [torchserve docker](docker/README.md) for details.
 #export token=<HUGGINGFACE_HUB_TOKEN>
 docker build . -f docker/Dockerfile.llm -t ts/llm
 
-docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token
+docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
 
 curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
 ```
diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md
index b413d02061..282dd558fe 100644
--- a/docs/llm_deployment.md
+++ b/docs/llm_deployment.md
@@ -22,7 +22,7 @@ export token=<HUGGINGFACE_HUB_TOKEN>
 
 You can then go ahead and launch a TorchServe instance serving your selected model:
 ```bash
-docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token
+docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
 ```
 
 To change the model you just need to exchange the identifier given to the `--model_id` parameter.
@@ -42,7 +42,7 @@ To rename the model endpoint from `predictions/model` to something else you can
 
 The launcher script can also be used outside a docker container by calling this after installing TorchServe following the [installation instruction](https://github.com/pytorch/serve/blob/feature/single_cmd_llm_deployment/README.md#-quick-start-with-torchserve).
 ```bash
-python -m ts.llm_launcher --disable_token
+python -m ts.llm_launcher --disable_token_auth
 ```
 
 Please note that the launcher script as well as the docker command will automatically run on all available GPUs so make sure to restrict the visible number of device by setting CUDA_VISIBLE_DEVICES.
diff --git a/requirements/common.txt b/requirements/common.txt
index b568981d3e..452a613e76 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -6,3 +6,4 @@ pynvml==11.5.0
 pyyaml==6.0.1
 ninja==1.11.1.1
 setuptools
+vllm==0.5.0; sys_platform == 'linux'
diff --git a/requirements/torch_linux.txt b/requirements/torch_linux.txt
index adc3278a63..df10c8a2cb 100644
--- a/requirements/torch_linux.txt
+++ b/requirements/torch_linux.txt
@@ -5,4 +5,3 @@ torch==2.3.0+cpu; sys_platform == 'linux'
 torchvision==0.18.0+cpu; sys_platform == 'linux'
 torchtext==0.18.0; sys_platform == 'linux'
 torchaudio==2.3.0+cpu; sys_platform == 'linux'
-vllm==0.5.0; sys_platform == 'linux'
diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py
index 66a29a5194..89248ce9f4 100644
--- a/ts/llm_launcher.py
+++ b/ts/llm_launcher.py
@@ -99,7 +99,7 @@ def main(args):
                 model_store=args.model_store,
                 no_config_snapshots=True,
                 models=args.model_name,
-                disable_token=args.disable_token,
+                disable_token=args.disable_token_auth,
             )
 
             pause()
@@ -134,7 +134,7 @@ def main(args):
     )
 
     parser.add_argument(
-        "--disable_token-auth",
+        "--disable_token_auth",
         action="store_true",
         help="Disable token authentication",
     )