llm.py
import modal.gpu

# Number of H100 GPUs to shard the model across with tensor parallelism.
GPU_COUNT = 4

# Container image with the inference dependencies preinstalled.
image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "pillow",
    "torch",
    "requests",
    "huggingface-hub",
    "vllm",
)

app = modal.App("browserman-llm", image=image)

# FP8-quantized Llama 3.2 90B Vision checkpoint, served via vLLM.
MODEL_NAME = "neuralmagic/Llama-3.2-90B-Vision-Instruct-FP8-dynamic"


@app.cls(gpu=modal.gpu.H100(count=GPU_COUNT), container_idle_timeout=20 * 60)
class Model:
    @modal.build()
    def build(self):
        # Download the weights at image-build time so containers don't
        # re-fetch the ~90B-parameter checkpoint on every cold start.
        import transformers.utils
        from huggingface_hub import snapshot_download

        snapshot_download(MODEL_NAME)
        transformers.utils.move_cache()

    @modal.enter()
    def enter(self):
        # Load the model once per container, sharded across all GPUs.
        from vllm import LLM

        self.llm = LLM(
            model=MODEL_NAME,
            max_num_seqs=1,
            enforce_eager=True,
            tensor_parallel_size=GPU_COUNT,
        )

    @modal.method()
    def inference(self, prompt, image, temperature=0.2):
        from vllm import SamplingParams

        # Set up sampling parameters.
        sampling_params = SamplingParams(temperature=temperature, max_tokens=300)

        # Attach the image only when one is supplied, so a text-only request
        # doesn't pass an empty multi_modal_data mapping to vLLM.
        inputs = {"prompt": prompt}
        if image:
            inputs["multi_modal_data"] = {"image": image}

        # Generate the response and return the text of the first completion.
        outputs = self.llm.generate(inputs, sampling_params=sampling_params)
        return outputs[0].outputs[0].text
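

# --- Usage sketch (not part of the original file) ---------------------------
# A minimal, hypothetical local entrypoint for smoke-testing the class with
# `modal run llm.py`. The entrypoint name, the prompt text, and the assumption
# that a text-only request (image=None) is acceptable here are illustrative,
# not taken from the source.
@app.local_entrypoint()
def main():
    # Instantiate the remote class and call the inference method; passing
    # None for the image sends a text-only prompt.
    response = Model().inference.remote("Describe what vLLM is in one sentence.", None)
    print(response)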