33
33
# the weights from HuggingFace directly into a local directory when building the
34
34
# container image.
35
35
36
+
36
37
def download_model_to_image (model_dir , model_name , model_revision ):
37
38
from huggingface_hub import snapshot_download
38
39
from transformers .utils import move_cache
@@ -46,6 +47,7 @@ def download_model_to_image(model_dir, model_name, model_revision):
46
47
)
47
48
move_cache ()
48
49
50
+
49
51
# ## Set up the container image
50
52
51
53
# Our first order of business is to define the environment our server will run in
@@ -76,7 +78,7 @@ def download_model_to_image(model_dir, model_name, model_revision):
76
78
.env ({"HF_HUB_ENABLE_HF_TRANSFER" : "1" })
77
79
.run_function (
78
80
download_model_to_image ,
79
- timeout = 60 * MINUTES , # typically much faster but set high to be conservative
81
+ timeout = 60 * MINUTES , # typically much faster but set high to be conservative
80
82
kwargs = {
81
83
"model_dir" : MODEL_DIR ,
82
84
"model_name" : MODEL_NAME ,
@@ -100,14 +102,15 @@ def download_model_to_image(model_dir, model_name, model_revision):
100
102
#
101
103
app = modal .App (APP_NAME )
102
104
105
+
103
106
@app .function (
104
107
image = vllm_image ,
105
108
gpu = GPU_CONFIG ,
106
- keep_warm = 0 , # Spin down entirely when idle
109
+ keep_warm = 0 , # Spin down entirely when idle
107
110
container_idle_timeout = 5 * MINUTES ,
108
111
timeout = 24 * HOURS ,
109
112
allow_concurrent_inputs = 1000 ,
110
- secrets = [modal .Secret .from_name ("example-secret-token" )], # contains MODAL_TOKEN used below
113
+ secrets = [modal .Secret .from_name ("example-secret-token" )], # contains MODAL_TOKEN used below
111
114
)
112
115
@modal .asgi_app ()
113
116
def serve ():
@@ -144,7 +147,7 @@ def serve():
144
147
145
148
# This example uses a token defined in the Modal secret linked above,
146
149
# as described here: https://modal.com/docs/guide/secrets
147
- async def is_authenticated (api_key = fastapi .Security (http_bearer )):
150
+ async def is_authenticated (api_key = fastapi .Security (http_bearer )):
148
151
if api_key .credentials != os .getenv ("MODAL_TOKEN" ):
149
152
raise fastapi .HTTPException (
150
153
status_code = fastapi .status .HTTP_401_UNAUTHORIZED ,
@@ -167,17 +170,13 @@ async def is_authenticated(api_key = fastapi.Security(http_bearer)):
167
170
enforce_eager = False , # capture the graph for faster inference, at the cost of slower cold starts (~30s vs. ~20s)
168
171
)
169
172
170
- engine = AsyncLLMEngine .from_engine_args (
171
- engine_args , usage_context = UsageContext .OPENAI_API_SERVER
172
- )
173
+ engine = AsyncLLMEngine .from_engine_args (engine_args , usage_context = UsageContext .OPENAI_API_SERVER )
173
174
174
175
model_config = get_model_config (engine )
175
176
176
177
request_logger = RequestLogger (max_log_len = 2048 )
177
178
178
- base_model_paths = [
179
- BaseModelPath (name = MODEL_NAME .split ("/" )[1 ], model_path = MODEL_NAME )
180
- ]
179
+ base_model_paths = [BaseModelPath (name = MODEL_NAME .split ("/" )[1 ], model_path = MODEL_NAME )]
181
180
182
181
api_server .chat = lambda s : OpenAIServingChat (
183
182
engine ,
0 commit comments