Skip to content

Commit 41fe335

Browse files
committed
feat(backends): Added llama.cpp engine
Before: `dynamo-run in=dyn out=llamacpp --model-path <GGUF>` After: `python -m dynamo.llama_cpp --model-path <GGUF>` Setup: `pip install -r requirements.txt`. That's it! Builds for CUDA by default. Once this lands we can remove the Rust llama.cpp engine.
1 parent 6a1350c commit 41fe335

6 files changed

Lines changed: 152 additions & 1 deletion

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# llama.cpp engine for Dynamo
2+
3+
Usage:
4+
- `pip install -r requirements.txt` # Need a recent pip, `uv pip` might be too old.
5+
- `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
6+
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
llama-cpp-python -C cmake.args="-DGGML_CUDA=on"
5+
uvloop

components/backends/llama_cpp/src/dynamo/llama_cpp/__init__.py

Whitespace-only changes.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from dynamo.llama_cpp.main import main
5+
6+
if __name__ == "__main__":
7+
main()
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Usage: `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
5+
6+
import argparse
7+
import logging
8+
import sys
9+
from typing import Optional
10+
11+
import uvloop
12+
from llama_cpp import Llama
13+
14+
from dynamo.llm import ModelType, register_llm
15+
from dynamo.runtime import DistributedRuntime, dynamo_worker
16+
from dynamo.runtime.logging import configure_dynamo_logging
17+
18+
# Endpoint the worker registers under when --endpoint is not supplied.
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"

# Install Dynamo's logging configuration at import time so all module
# loggers (see cmd_line_args / RequestHandler) share one format.
configure_dynamo_logging()
23+
class Config:
    """Bag of command line parameters, populated by ``cmd_line_args``.

    Only class-level annotations are declared here; instances receive their
    values by attribute assignment after argument parsing, so reading an
    attribute before then raises AttributeError.
    """

    # Parsed from the 'dyn://namespace.component.endpoint' string.
    namespace: str
    component: str
    endpoint: str
    # Path to a local GGUF model file (required on the command line).
    model_path: str
    # Name to serve the model under; None means derive it from model_path.
    model_name: Optional[str]
    # Max context length; None means use the model's own maximum.
    context_length: Optional[int]
33+
34+
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Register a llama.cpp engine with Dynamo and serve requests forever."""
    cfg = cmd_line_args()

    component = runtime.namespace(cfg.namespace).component(cfg.component)
    await component.create_service()

    generate_endpoint = component.endpoint(cfg.endpoint)
    # llama.cpp does the pre-processing, so we register as a Chat model.
    await register_llm(
        ModelType.Chat, generate_endpoint, cfg.model_path, cfg.model_name
    )

    # Build the engine. For the full parameter list see:
    # https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#high-level-api
    engine_args = dict(
        model_path=cfg.model_path,
        n_gpu_layers=-1,  # GPU if we can
        n_threads=16,  # Otherwise give it some CPU
    )
    if cfg.context_length:
        engine_args["n_ctx"] = cfg.context_length

    handler = RequestHandler(Llama(**engine_args))
    await generate_endpoint.serve_endpoint(handler.generate)
59+
60+
class RequestHandler:
    """Adapts a llama_cpp.Llama engine to Dynamo's streaming endpoint API."""

    def __init__(self, engine):
        # Underlying llama_cpp.Llama instance, created by the worker.
        self.engine_client = engine

    async def generate(self, request):
        """Yield streamed chat-completion chunks for ``request["messages"]``."""
        stream = self.engine_client.create_chat_completion(
            request["messages"], stream=True
        )
        # NOTE(review): the generator is consumed synchronously, so the event
        # loop is blocked while each chunk is produced — confirm that is
        # acceptable for this single-engine worker.
        for res in stream:
            logging.debug(f"res: {res}")
            yield res
72+
73+
def cmd_line_args():
    """Parse command line flags and return a populated Config.

    Exits the process with status 1 when the endpoint string is not of the
    form 'namespace.component.endpoint' (with an optional 'dyn://' prefix).
    """
    parser = argparse.ArgumentParser(
        description="llama.cpp server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--model-path",
        type=str,
        required=True,
        help="Path to a local GGUF file.",
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    args = parser.parse_args()

    # Strip an optional scheme prefix, then expect exactly three dot-separated
    # parts: namespace, component, endpoint.
    endpoint_parts = args.endpoint.replace("dyn://", "", 1).split(".")
    if len(endpoint_parts) != 3:
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)

    config = Config()
    config.namespace, config.component, config.endpoint = endpoint_parts
    config.model_path = args.model_path
    # An empty --model-name becomes None, which is an `Option` on the Rust side.
    config.model_name = args.model_name or None
    config.context_length = args.context_length
    return config
126+
127+
128+
def main():
    # uvloop drives the async dynamo worker; blocks until the runtime
    # shuts down.
    uvloop.run(worker())


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ requires = ["hatchling"]
7979
build-backend = "hatchling.build"
8080

8181
[tool.hatch.build.targets.wheel]
82-
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo"]
82+
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo", "components/backends/llama_cpp/src/dynamo"]
8383

8484
# This section is for including the binaries in the wheel package
8585
# but doesn't make them executable scripts in the venv bin directory

0 commit comments

Comments
 (0)