Skip to content

Commit 3733f58

Browse files
authored
feat(backends): Python llama.cpp engine (#1925)
1 parent 6a1350c commit 3733f58

6 files changed

Lines changed: 154 additions & 1 deletion

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# llama.cpp engine for Dynamo
2+
3+
Usage:
4+
- `pip install -r requirements.txt` # Requires a recent pip; `uv pip` might be too old.
5+
- `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
6+
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
llama-cpp-python -C cmake.args="-DGGML_CUDA=on"
5+
uvloop

components/backends/llama_cpp/src/dynamo/llama_cpp/__init__.py

Whitespace-only changes.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from dynamo.llama_cpp.main import main
5+
6+
# Run the engine's main() only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Usage: `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
5+
6+
import argparse
7+
import logging
8+
import sys
9+
from typing import Optional
10+
11+
import uvloop
12+
from llama_cpp import Llama
13+
14+
from dynamo.llm import ModelType, register_llm
15+
from dynamo.runtime import DistributedRuntime, dynamo_worker
16+
from dynamo.runtime.logging import configure_dynamo_logging
17+
18+
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
19+
20+
configure_dynamo_logging()
21+
22+
23+
class Config:
    """Settings for one engine process, filled from the command line or defaults."""

    # The three dot-separated parts of the Dynamo endpoint string.
    namespace: str
    component: str
    endpoint: str
    # Location of the model file handed to llama.cpp.
    model_path: str
    # Name to serve the model under; None when no name was given.
    model_name: Optional[str]
    # Requested context length; a falsy value means no explicit limit is passed.
    context_length: int
33+
34+
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Register the model with Dynamo, load llama.cpp and serve requests.

    Reads its settings from the command line, creates the component service,
    registers the model on the endpoint, builds the `Llama` engine and then
    serves the endpoint with a `RequestHandler` wrapping that engine.
    """
    cfg = cmd_line_args()

    namespace = runtime.namespace(cfg.namespace)
    component = namespace.component(cfg.component)
    await component.create_service()

    # llama.cpp does the pre-processing, so register as a chat model.
    model_type = ModelType.Chat
    endpoint = component.endpoint(cfg.endpoint)
    await register_llm(model_type, endpoint, cfg.model_path, cfg.model_name)

    # Engine construction. More parameters are listed at:
    # https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#high-level-api
    engine_args = dict(
        model_path=cfg.model_path,
        n_gpu_layers=-1,  # GPU if we can
        n_threads=16,  # Otherwise give it some CPU
    )
    if cfg.context_length:
        # Only override the context size when one was requested.
        engine_args["n_ctx"] = cfg.context_length
    handler = RequestHandler(Llama(**engine_args))

    await endpoint.serve_endpoint(handler.generate)
58+
59+
60+
class RequestHandler:
    """Stream chat completions from a llama.cpp engine without blocking asyncio.

    The engine's `create_chat_completion(..., stream=True)` returns a
    synchronous generator. Every call into the engine is dispatched to a
    dedicated single-thread executor so the event loop stays responsive, and
    because that executor has exactly one worker the engine is never entered
    from two threads at once.
    """

    def __init__(self, engine):
        """Wrap `engine`, an object exposing `create_chat_completion`."""
        from concurrent.futures import ThreadPoolExecutor

        self.engine_client = engine
        # One worker only: engine calls are offloaded but remain serialized.
        self._executor = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="llama_cpp"
        )

    async def generate(self, request):
        """Yield streamed chat-completion chunks for `request`.

        Args:
            request: Mapping with a "messages" entry that is passed to the
                engine's `create_chat_completion`.

        Yields:
            Each item produced by the engine's streaming generator, in order.

        Raises:
            KeyError: If `request` has no "messages" entry.
        """
        import asyncio

        loop = asyncio.get_running_loop()
        messages = request["messages"]
        # StopIteration cannot travel through a Future, so exhaustion is
        # signalled with a private sentinel passed as next()'s default.
        exhausted = object()

        def start_stream():
            return iter(
                self.engine_client.create_chat_completion(messages, stream=True)
            )

        stream = await loop.run_in_executor(self._executor, start_stream)
        while True:
            res = await loop.run_in_executor(
                self._executor, next, stream, exhausted
            )
            if res is exhausted:
                break
            logging.debug("res: %s", res)
            yield res
73+
74+
75+
def cmd_line_args(argv=None):
    """Parse the command line into a `Config`.

    Args:
        argv: Optional list of arguments to parse. When None (the default),
            argparse reads `sys.argv[1:]`.

    Returns:
        A `Config` with `model_path`, `model_name` (None when not given),
        `namespace`, `component`, `endpoint` and `context_length` set.

    Exits:
        With status 1 if the endpoint is not exactly three non-empty,
        dot-separated parts (optionally prefixed with `dyn://`). argparse
        itself exits on unknown or missing arguments.
    """
    parser = argparse.ArgumentParser(
        description="llama.cpp server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--model-path",
        type=str,
        required=True,
        help="Path to a local GGUF file.",
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    args = parser.parse_args(argv)

    # Strip the scheme only when it is a true prefix; a "dyn://" appearing
    # later in the string is part of a malformed endpoint, not a scheme.
    scheme = "dyn://"
    endpoint_str = args.endpoint
    if endpoint_str.startswith(scheme):
        endpoint_str = endpoint_str[len(scheme):]

    endpoint_parts = endpoint_str.split(".")
    # Require exactly three parts and reject empty ones (e.g. "dyn://..").
    if len(endpoint_parts) != 3 or not all(endpoint_parts):
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)

    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts

    config = Config()
    config.model_path = args.model_path
    # An empty name becomes None, which is an `Option` on the Rust side.
    config.model_name = args.model_name or None
    config.namespace = parsed_namespace
    config.component = parsed_component_name
    config.endpoint = parsed_endpoint_name
    config.context_length = args.context_length
    return config
128+
129+
130+
def main():
    """Run the worker coroutine to completion on a uvloop event loop."""
    entry = worker()
    uvloop.run(entry)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ requires = ["hatchling"]
7979
build-backend = "hatchling.build"
8080

8181
[tool.hatch.build.targets.wheel]
82-
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo"]
82+
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo", "components/backends/llama_cpp/src/dynamo"]
8383

8484
# This section is for including the binaries in the wheel package
8585
# but doesn't make them executable scripts in the venv bin directory

0 commit comments

Comments
 (0)