Add llama-cpp-python server
Changed default runtime from 'llama.cpp' to 'llama-cpp-python'.
Added 'llama-cpp-python' as a runtime option for better
flexibility with the `--runtime` flag.

Signed-off-by: Eric Curtin <[email protected]>
ericcurtin committed Nov 15, 2024
1 parent 41d0c21 commit fe9c0ca
Showing 5 changed files with 38 additions and 23 deletions.
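
For anyone trying the change out: the net effect is that `ramalama serve` now defaults to the llama-cpp-python server, while the previous behaviour stays reachable through the `--runtime` flag whose choices are extended in ramalama/cli.py below. A minimal usage sketch (the model name and the global flag placement are illustrative assumptions, not part of this commit):

    # serve with the new default runtime (llama-cpp-python)
    ramalama serve --port 1234 tinyllama

    # explicitly fall back to the previous runtime
    ramalama --runtime llama.cpp serve --port 1234 tinyllama
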
2 changes: 1 addition & 1 deletion docs/ramalama-serve.1.md
@@ -79,7 +79,7 @@ After=local-fs.target
 [Container]
 AddDevice=-/dev/dri
 AddDevice=-/dev/kfd
-Exec=llama-server --port 1234 -m $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
+Exec=python3 -m llama_cpp.server --port 1234 --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
 Image=quay.io/ramalama/ramalama:latest
 Mount=type=bind,src=/home/dwalsh/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf,target=/mnt/models/model.file,ro,Z
 ContainerName=MyGraniteServer
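
The updated Exec= line doubles as a manual smoke test: assuming llama-cpp-python is installed with its server component (an assumption about the image or host contents, not something this commit changes), the same command can be run directly against an already-pulled model:

    python3 -m llama_cpp.server --port 1234 \
        --model $HOME/.local/share/ramalama/models/huggingface/instructlab/granite-7b-lab-GGUF/granite-7b-lab-Q4_K_M.gguf
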
6 changes: 3 additions & 3 deletions ramalama/cli.py
@@ -105,7 +105,7 @@ def load_and_merge_config():
     )
 
     ramalama_config['carimage'] = ramalama_config.get('carimage', "registry.access.redhat.com/ubi9-micro:latest")
-    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama.cpp')
+    ramalama_config['runtime'] = ramalama_config.get('runtime', 'llama-cpp-python')
     ramalama_config['store'] = os.getenv('RAMALAMA_STORE', ramalama_config.get('store', get_store()))
     ramalama_config['transport'] = os.getenv('RAMALAMA_TRANSPORT', ramalama_config.get('transport', "ollama"))
 
@@ -207,8 +207,8 @@ def configure_arguments(parser):
     parser.add_argument(
         "--runtime",
         default=config.get("runtime"),
-        choices=["llama.cpp", "vllm"],
-        help="specify the runtime to use; valid options are 'llama.cpp' and 'vllm'",
+        choices=["llama-cpp-python", "llama.cpp", "vllm"],
+        help="specify the runtime to use; valid options are 'llama-cpp-python', 'llama.cpp' and 'vllm'",
     )
     parser.add_argument(
         "--store",
43 changes: 29 additions & 14 deletions ramalama/model.py
@@ -284,6 +284,19 @@ def run(self, args):
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
             raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
 
+    def execute_model(self, model_path, exec_args, args):
+        try:
+            if self.exec_model_in_container(model_path, exec_args, args):
+                return
+            if args.dryrun:
+                dry_run(exec_args)
+                return
+            exec_cmd(exec_args, debug=args.debug)
+        except FileNotFoundError as e:
+            if in_container():
+                raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+
     def serve(self, args):
         if hasattr(args, "name") and args.name:
             if not args.container and not args.generate:
@@ -300,13 +313,25 @@ def serve(self, args):
         if not args.container and not args.generate:
             exec_model_path = model_path
 
-        exec_args = ["llama-server", "--port", args.port, "-m", exec_model_path]
         if args.runtime == "vllm":
             exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
-        else:
+        elif args.runtime == "llama.cpp":
+            exec_args = ["llama-server", "--port", args.port, "-m", exec_model_path, "--host", args.host]
             if args.gpu:
                 exec_args.extend(self.gpu_args())
-            exec_args.extend(["--host", args.host])
 
+        else:
+            exec_args = [
+                "python3",
+                "-m",
+                "llama_cpp.server",
+                "--port",
+                args.port,
+                "--model",
+                exec_model_path,
+                "--host",
+                args.host,
+            ]
+
         if args.generate == "quadlet":
             return self.quadlet(model_path, args, exec_args)
@@ -317,17 +342,7 @@
         if args.generate == "quadlet/kube":
             return self.quadlet_kube(model_path, args, exec_args)
 
-        try:
-            if self.exec_model_in_container(model_path, exec_args, args):
-                return
-            if args.dryrun:
-                dry_run(exec_args)
-                return
-            exec_cmd(exec_args, debug=args.debug)
-        except FileNotFoundError as e:
-            if in_container():
-                raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
-            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+        self.execute_model(model_path, exec_args, args)
 
     def quadlet(self, model, args, exec_args):
         quadlet = Quadlet(model, args, exec_args)
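
A quick way to see which command the runtime branches above actually assemble is a dry run: serve() now hands exec_args to execute_model(), which prints the command instead of executing it when args.dryrun is set. The CLI spelling of that flag is an assumption here, not something shown in this diff:

    # should print the python3 -m llama_cpp.server ... command line (or the
    # container command wrapping it) instead of starting a server
    ramalama --dryrun serve --port 1234 tinyllama
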
8 changes: 4 additions & 4 deletions test/system/040-serve.bats
@@ -136,7 +136,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 
 run cat tinyllama.container
 is "$output" ".*PublishPort=1234" "PublishPort should match"
-is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
 is "$output" ".*Mount=type=bind,.*tinyllama" "Mount line should be correct"
 
 rm tinyllama.container
@@ -174,7 +174,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 run cat $name.container
 is "$output" ".*PublishPort=1234" "PublishPort should match"
 is "$output" ".*ContainerName=${name}" "Quadlet should have ContainerName field"
-is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+is "$output" ".*Exec=python3 -m llama_cpp.server --port 1234 --model .*" "Exec line should be correct"
 is "$output" ".*Mount=type=image,source=${ociimage},destination=/mnt/models,subpath=/models,readwrite=false" "Volume line should be correct"
 
 if is_container; then
Expand Down Expand Up @@ -226,7 +226,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name

run cat $name.yaml
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*command: \[\"python3\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"

run_ramalama serve --name=${name} --port 1234 --generate=quadlet/kube ${model}
@@ -235,7 +235,7 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
 
 run cat $name.yaml
 is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
-is "$output" ".*command: \[\"llama-server\"\]" "Should command"
+is "$output" ".*command: \[\"python3\"\]" "Should command"
 is "$output" ".*containerPort: 1234" "Should container container port"
 
 run cat $name.kube
2 changes: 1 addition & 1 deletion test/system/060-info.bats
@@ -17,7 +17,7 @@ load helpers
 # FIXME Engine (podman|docker|'')
 tests="
 Image | "quay.io/ramalama/ramalama:latest"
-Runtime | "llama.cpp"
+Runtime | "llama-cpp-python"
 Version | "${version}"
 Store | \\\("${HOME}/.local/share/ramalama"\\\|"/var/lib/ramalama"\\\)
 "
