I would like to be able to limit the model context window, even when the model offers a larger limit. On local models, this helps to fine-tune performance, and on cloud models like GLM-5, it helps to ensure that the context doesn't degrade (using a 95,000-token window instead of 200,000 on GLM-5 makes it rock-solid).
It is also helpful to set a common limit when switching between models (fallbacks via hermes itself or proxies like LiteLLM).
diff --git a/cli.py b/cli.py
index 24f12f38..3b833386 100755
--- a/cli.py
+++ b/cli.py
@@ -984,6 +984,17 @@ class HermesCLI:
Provides a REPL interface with rich formatting, command history,
and tool execution capabilities.
"""
+
+ @staticmethod
+ def _coerce_positive_int(val: Any) -> Optional[int]:
+ """Parse a strictly positive int from config/env-like values."""
+ if val is None or val == "":
+ return None
+ try:
+ i = int(val)
+ return i if i > 0 else None
+ except (TypeError, ValueError):
+ return None
def __init__(
self,
@@ -1147,6 +1158,20 @@ class HermesCLI:
self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
self._active_agent_route_signature = None
+ # Max output tokens: model.max_tokens (default) and model.max_tokens_per_model (overrides)
+ self._max_tokens_default: Optional[int] = None
+ self._max_tokens_per_model: Dict[str, int] = {}
+ if isinstance(_model_config, dict):
+ self._max_tokens_default = self._coerce_positive_int(_model_config.get("max_tokens"))
+ raw_per = _model_config.get("max_tokens_per_model")
+ if isinstance(raw_per, dict):
+ for k, v in raw_per.items():
+ if not isinstance(k, str) or not k.strip():
+ continue
+ iv = self._coerce_positive_int(v)
+ if iv is not None:
+ self._max_tokens_per_model[k.strip()] = iv
+
# Agent will be initialized on first use
self.agent: Optional[AIAgent] = None
self._app = None # prompt_toolkit Application (set in run())
@@ -1727,6 +1752,13 @@ class HermesCLI:
return True
+ def _resolved_max_tokens(self, model_name: Optional[str]) -> Optional[int]:
+ """Apply per-model max_tokens override from config, else global default."""
+ m = (model_name or "").strip()
+ if m and m in self._max_tokens_per_model:
+ return self._max_tokens_per_model[m]
+ return self._max_tokens_default
+
def _resolve_turn_agent_config(self, user_message: str) -> dict:
"""Resolve model/runtime overrides for a single user turn."""
from agent.smart_model_routing import resolve_turn_route
@@ -1852,6 +1884,7 @@ class HermesCLI:
pass_session_id=self.pass_session_id,
tool_progress_callback=self._on_tool_progress,
stream_delta_callback=self._stream_delta if self.streaming_enabled else None,
+ max_tokens=self._resolved_max_tokens(effective_model),
)
self._active_agent_route_signature = (
effective_model,
@@ -3806,6 +3839,7 @@ class HermesCLI:
provider_require_parameters=self._provider_require_params,
provider_data_collection=self._provider_data_collection,
fallback_model=self._fallback_model,
+ max_tokens=self._resolved_max_tokens(turn_route["model"]),
)
result = bg_agent.run_conversation(
diff --git a/gateway/run.py b/gateway/run.py
index 2887ee7a..7a08fc08 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -319,6 +319,9 @@ class GatewayRunner:
self._provider_routing = self._load_provider_routing()
self._fallback_model = self._load_fallback_model()
self._smart_model_routing = self._load_smart_model_routing()
+ self._model_max_tokens_default, self._model_max_tokens_by_model = (
+ self._load_model_max_tokens_config()
+ )
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
@@ -528,6 +531,7 @@ class GatewayRunner:
enabled_toolsets=["memory", "skills"],
session_id=old_session_id,
honcho_session_key=honcho_session_key,
+ max_tokens=self._resolve_max_tokens(model),
)
# Build conversation history from transcript
@@ -834,6 +838,49 @@ class GatewayRunner:
pass
return {}
+ @staticmethod
+ def _coerce_positive_int(val: Any) -> Optional[int]:
+ if val is None or val == "":
+ return None
+ try:
+ i = int(val)
+ return i if i > 0 else None
+ except (TypeError, ValueError):
+ return None
+
+ @staticmethod
+ def _load_model_max_tokens_config() -> tuple[Optional[int], Dict[str, int]]:
+ """Load model.max_tokens and model.max_tokens_per_model from config.yaml."""
+ try:
+ import yaml as _y
+ cfg_path = _hermes_home / "config.yaml"
+ if cfg_path.exists():
+ with open(cfg_path, encoding="utf-8") as _f:
+ cfg = _y.safe_load(_f) or {}
+ mc = cfg.get("model", {})
+ if not isinstance(mc, dict):
+ return None, {}
+ default = GatewayRunner._coerce_positive_int(mc.get("max_tokens"))
+ raw_per = mc.get("max_tokens_per_model")
+ per: Dict[str, int] = {}
+ if isinstance(raw_per, dict):
+ for k, v in raw_per.items():
+ if not isinstance(k, str) or not k.strip():
+ continue
+ iv = GatewayRunner._coerce_positive_int(v)
+ if iv is not None:
+ per[k.strip()] = iv
+ return default, per
+ except Exception:
+ pass
+ return None, {}
+
+ def _resolve_max_tokens(self, model_name: Optional[str]) -> Optional[int]:
+ m = (model_name or "").strip()
+ if m and m in self._model_max_tokens_by_model:
+ return self._model_max_tokens_by_model[m]
+ return self._model_max_tokens_default
+
async def start(self) -> bool:
"""
Start the gateway and all configured platform adapters.
@@ -1705,6 +1752,7 @@ class GatewayRunner:
quiet_mode=True,
enabled_toolsets=["memory"],
session_id=session_entry.session_id,
+ max_tokens=self._resolve_max_tokens(_hyg_model),
)
loop = asyncio.get_event_loop()
@@ -3210,6 +3258,7 @@ class GatewayRunner:
platform=platform_key,
session_db=self._session_db,
fallback_model=self._fallback_model,
+ max_tokens=self._resolve_max_tokens(turn_route["model"]),
)
return agent.run_conversation(
@@ -3403,6 +3452,7 @@ class GatewayRunner:
quiet_mode=True,
enabled_toolsets=["memory"],
session_id=session_entry.session_id,
+ max_tokens=self._resolve_max_tokens(model),
)
loop = asyncio.get_event_loop()
@@ -4577,6 +4627,7 @@ class GatewayRunner:
honcho_config=honcho_config,
session_db=self._session_db,
fallback_model=self._fallback_model,
+ max_tokens=self._resolve_max_tokens(turn_route["model"]),
)
# Store agent reference for interrupt support
Problem or Use Case
I would like to be able to limit the model context window, even when the model offers a larger limit. On local models, this helps to fine-tune performance, and on cloud models like GLM-5, it helps to ensure that the context doesn't degrade (using a 95,000-token window instead of 200,000 on GLM-5 makes it rock-solid).
It is also helpful to set a common limit when switching between models (fallbacks via hermes itself or proxies like LiteLLM).
Proposed Solution
A simple hack that works for me, offered as a hint:
Alternatives Considered
No response
Feature Type
Configuration option
Scope
Medium (few files, < 300 lines)
Contribution