Skip to content

[Feature]: custom context window limits per model #2057

@dan-and

Description

@dan-and

Problem or Use Case

I would like to be able to limit a model's context window, even when the model offers a larger limit. On local models, this helps to fine-tune performance; on cloud models such as GLM-5, it helps to ensure that the context doesn't degrade (using a 95,000-token window instead of 200,000 on GLM-5 makes it rock-solid).

It is also helpful to set a common limit when switching between models (fallbacks via Hermes itself or via proxies like LiteLLM).

Proposed Solution

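A simple hack that works for me, offered as a hint (note that it configures `model.max_tokens` and per-model overrides in `model.max_tokens_per_model`, which the patch's own comment describes as max output tokens):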

diff --git a/cli.py b/cli.py
index 24f12f38..3b833386 100755
--- a/cli.py
+++ b/cli.py
@@ -984,6 +984,17 @@ class HermesCLI:
     Provides a REPL interface with rich formatting, command history,
     and tool execution capabilities.
     """
+
+    @staticmethod
+    def _coerce_positive_int(val: Any) -> Optional[int]:
+        """Parse a strictly positive int from config/env-like values."""
+        if val is None or val == "":
+            return None
+        try:
+            i = int(val)
+            return i if i > 0 else None
+        except (TypeError, ValueError):
+            return None

     def __init__(
         self,
@@ -1147,6 +1158,20 @@ class HermesCLI:
         self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
         self._active_agent_route_signature = None

+        # Max output tokens: model.max_tokens (default) and model.max_tokens_per_model (overrides)
+        self._max_tokens_default: Optional[int] = None
+        self._max_tokens_per_model: Dict[str, int] = {}
+        if isinstance(_model_config, dict):
+            self._max_tokens_default = self._coerce_positive_int(_model_config.get("max_tokens"))
+            raw_per = _model_config.get("max_tokens_per_model")
+            if isinstance(raw_per, dict):
+                for k, v in raw_per.items():
+                    if not isinstance(k, str) or not k.strip():
+                        continue
+                    iv = self._coerce_positive_int(v)
+                    if iv is not None:
+                        self._max_tokens_per_model[k.strip()] = iv
+
         # Agent will be initialized on first use
         self.agent: Optional[AIAgent] = None
         self._app = None  # prompt_toolkit Application (set in run())
@@ -1727,6 +1752,13 @@ class HermesCLI:

         return True

+    def _resolved_max_tokens(self, model_name: Optional[str]) -> Optional[int]:
+        """Apply per-model max_tokens override from config, else global default."""
+        m = (model_name or "").strip()
+        if m and m in self._max_tokens_per_model:
+            return self._max_tokens_per_model[m]
+        return self._max_tokens_default
+
     def _resolve_turn_agent_config(self, user_message: str) -> dict:
         """Resolve model/runtime overrides for a single user turn."""
         from agent.smart_model_routing import resolve_turn_route
@@ -1852,6 +1884,7 @@ class HermesCLI:
                 pass_session_id=self.pass_session_id,
                 tool_progress_callback=self._on_tool_progress,
                 stream_delta_callback=self._stream_delta if self.streaming_enabled else None,
+                max_tokens=self._resolved_max_tokens(effective_model),
             )
             self._active_agent_route_signature = (
                 effective_model,
@@ -3806,6 +3839,7 @@ class HermesCLI:
                     provider_require_parameters=self._provider_require_params,
                     provider_data_collection=self._provider_data_collection,
                     fallback_model=self._fallback_model,
+                    max_tokens=self._resolved_max_tokens(turn_route["model"]),
                 )

                 result = bg_agent.run_conversation(
diff --git a/gateway/run.py b/gateway/run.py
index 2887ee7a..7a08fc08 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -319,6 +319,9 @@ class GatewayRunner:
         self._provider_routing = self._load_provider_routing()
         self._fallback_model = self._load_fallback_model()
         self._smart_model_routing = self._load_smart_model_routing()
+        self._model_max_tokens_default, self._model_max_tokens_by_model = (
+            self._load_model_max_tokens_config()
+        )

         # Wire process registry into session store for reset protection
         from tools.process_registry import process_registry
@@ -528,6 +531,7 @@ class GatewayRunner:
                 enabled_toolsets=["memory", "skills"],
                 session_id=old_session_id,
                 honcho_session_key=honcho_session_key,
+                max_tokens=self._resolve_max_tokens(model),
             )

             # Build conversation history from transcript
@@ -834,6 +838,49 @@ class GatewayRunner:
             pass
         return {}

+    @staticmethod
+    def _coerce_positive_int(val: Any) -> Optional[int]:
+        if val is None or val == "":
+            return None
+        try:
+            i = int(val)
+            return i if i > 0 else None
+        except (TypeError, ValueError):
+            return None
+
+    @staticmethod
+    def _load_model_max_tokens_config() -> tuple[Optional[int], Dict[str, int]]:
+        """Load model.max_tokens and model.max_tokens_per_model from config.yaml."""
+        try:
+            import yaml as _y
+            cfg_path = _hermes_home / "config.yaml"
+            if cfg_path.exists():
+                with open(cfg_path, encoding="utf-8") as _f:
+                    cfg = _y.safe_load(_f) or {}
+                mc = cfg.get("model", {})
+                if not isinstance(mc, dict):
+                    return None, {}
+                default = GatewayRunner._coerce_positive_int(mc.get("max_tokens"))
+                raw_per = mc.get("max_tokens_per_model")
+                per: Dict[str, int] = {}
+                if isinstance(raw_per, dict):
+                    for k, v in raw_per.items():
+                        if not isinstance(k, str) or not k.strip():
+                            continue
+                        iv = GatewayRunner._coerce_positive_int(v)
+                        if iv is not None:
+                            per[k.strip()] = iv
+                return default, per
+        except Exception:
+            pass
+        return None, {}
+
+    def _resolve_max_tokens(self, model_name: Optional[str]) -> Optional[int]:
+        m = (model_name or "").strip()
+        if m and m in self._model_max_tokens_by_model:
+            return self._model_max_tokens_by_model[m]
+        return self._model_max_tokens_default
+
     async def start(self) -> bool:
         """
         Start the gateway and all configured platform adapters.
@@ -1705,6 +1752,7 @@ class GatewayRunner:
                                     quiet_mode=True,
                                     enabled_toolsets=["memory"],
                                     session_id=session_entry.session_id,
+                                    max_tokens=self._resolve_max_tokens(_hyg_model),
                                 )

                                 loop = asyncio.get_event_loop()
@@ -3210,6 +3258,7 @@ class GatewayRunner:
                     platform=platform_key,
                     session_db=self._session_db,
                     fallback_model=self._fallback_model,
+                    max_tokens=self._resolve_max_tokens(turn_route["model"]),
                 )

                 return agent.run_conversation(
@@ -3403,6 +3452,7 @@ class GatewayRunner:
                 quiet_mode=True,
                 enabled_toolsets=["memory"],
                 session_id=session_entry.session_id,
+                max_tokens=self._resolve_max_tokens(model),
             )

             loop = asyncio.get_event_loop()
@@ -4577,6 +4627,7 @@ class GatewayRunner:
                 honcho_config=honcho_config,
                 session_db=self._session_db,
                 fallback_model=self._fallback_model,
+                max_tokens=self._resolve_max_tokens(turn_route["model"]),
             )

             # Store agent reference for interrupt support

Alternatives Considered

No response

Feature Type

Configuration option

Scope

Medium (few files, < 300 lines)

Contribution

  • I'd like to implement this myself and submit a PR

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions