fix(ollama): avoid implicit native num_ctx override

steipete · steipete · commit 755984559723 · 2026-04-27T07:42:14.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai
 - Image tool/media: honor `tools.media.image.timeoutSeconds` and matching per-model image timeouts in explicit image analysis, including the MiniMax VLM fallback path, so slow local vision models are not capped by hardcoded 30s/60s aborts. Fixes #67889; supersedes #67929. Thanks @AllenT22 and @alchip.
 - Providers/Ollama: read larger custom Modelfile `PARAMETER num_ctx` values from `/api/show` so auto-discovered Ollama models with expanded context no longer stay pinned to the base model context. Fixes #68344. Thanks @neeravmakwana.
 - Providers/Ollama: honor configured model `params.num_ctx` in native and OpenAI-compatible Ollama requests so local models can cap runtime context without rebuilding Modelfiles. Fixes #44550 and #52206; supersedes #69464. Thanks @taitruong, @armi0024, and @LokiCode404.
+- Providers/Ollama: stop forcing native Ollama requests to use the full configured `contextWindow` as `options.num_ctx` unless `params.num_ctx` is explicit, so local models can keep Ollama's VRAM/env default instead of looking hung on first turns. Fixes #49684 and #68662. Thanks @zhouZcong and @dshenster-byte.
 - Providers/Ollama: forward whitelisted native Ollama model params such as `temperature`, `top_p`, and top-level `think` so users can disable API-level thinking or tune local models from config without proxy shims. Fixes #48010. Thanks @tangzhi, @pandego, @maweibin, @Adam-Researchh, and @EmpireCreator.
 - Providers/Ollama: expose native Ollama thinking effort levels so `/think max` is accepted for reasoning-capable Ollama models and maps to Ollama's highest supported `think` effort. Fixes #71584. Thanks @g0st1n.
 - Providers/Ollama: strip the active custom Ollama provider prefix before native chat and embedding requests, so custom provider ids like `ollama-spark/qwen3:32b` reach Ollama as the real model name. Fixes #72353. Thanks @maximus-dss and @hclsys.
diff --git a/docs/providers/ollama.md b/docs/providers/ollama.md
@@ -758,7 +758,7 @@ For the full setup and behavior details, see [Ollama Web Search](/tools/ollama-s
   <Accordion title="Context windows">
     For auto-discovered models, OpenClaw uses the context window reported by Ollama when available, including larger `PARAMETER num_ctx` values from custom Modelfiles. Otherwise it falls back to the default Ollama context window used by OpenClaw.
 
-    You can set provider-level `contextWindow`, `contextTokens`, and `maxTokens` defaults for every model under that Ollama provider, then override them per model when needed. To cap Ollama's per-request runtime context without rebuilding a Modelfile, set `params.num_ctx`; OpenClaw sends it as `options.num_ctx` for both native Ollama and the OpenAI-compatible Ollama adapter. Invalid, zero, negative, and non-finite values are ignored and fall back to `contextWindow`.
+    You can set provider-level `contextWindow`, `contextTokens`, and `maxTokens` defaults for every model under that Ollama provider, then override them per model when needed. `contextWindow` is OpenClaw's prompt and compaction budget. Native Ollama requests leave `options.num_ctx` unset unless you explicitly configure `params.num_ctx`, so Ollama can apply its own model, `OLLAMA_CONTEXT_LENGTH`, or VRAM-based default. To cap or force Ollama's per-request runtime context without rebuilding a Modelfile, set `params.num_ctx`; invalid, zero, negative, and non-finite values are ignored. The OpenAI-compatible Ollama adapter still injects `options.num_ctx` by default from the configured `params.num_ctx` or `contextWindow`; disable that with `injectNumCtxForOpenAICompat: false` if your upstream rejects `options`.
 
     Native Ollama model entries also accept the common Ollama runtime options under `params`, including `temperature`, `top_p`, `top_k`, `min_p`, `num_predict`, `stop`, `repeat_penalty`, `num_batch`, `num_thread`, and `use_mmap`. OpenClaw forwards only Ollama request keys, so OpenClaw runtime params such as `streaming` are not leaked to Ollama. Use `params.think` or `params.thinking` to send top-level Ollama `think`; `false` disables API-level thinking for Qwen-style thinking models.
 
@@ -999,7 +999,7 @@ For the full setup and behavior details, see [Ollama Web Search](/tools/ollama-s
   </Accordion>
 
   <Accordion title="Large-context model is too slow or runs out of memory">
-    Many Ollama models advertise contexts that are larger than your hardware can run comfortably. Cap both OpenClaw's budget and Ollama's request context:
+    Many Ollama models advertise contexts that are larger than your hardware can run comfortably. Native Ollama uses Ollama's own runtime context default unless you set `params.num_ctx`. Cap both OpenClaw's budget and Ollama's request context when you want predictable first-token latency:
 
     ```json5
     {
@@ -1021,7 +1021,7 @@ For the full setup and behavior details, see [Ollama Web Search](/tools/ollama-s
     }
     ```
 
-    Lower `contextWindow` first if the prompt ingestion phase is slow. Lower `maxTokens` if generation runs too long.
+    Lower `contextWindow` first if OpenClaw is sending too much prompt. Lower `params.num_ctx` if Ollama is loading a runtime context that is too large for the machine. Lower `maxTokens` if generation runs too long.
 
   </Accordion>
 </AccordionGroup>
diff --git a/extensions/ollama/src/stream-runtime.test.ts b/extensions/ollama/src/stream-runtime.test.ts
@@ -207,7 +207,7 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
         };
         expect(requestBody.think).toBe(false);
         expect(requestBody.options?.think).toBeUndefined();
-        expect(requestBody.options?.num_ctx).toBe(131072);
+        expect(requestBody.options?.num_ctx).toBeUndefined();
       },
     );
   });
@@ -260,7 +260,7 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
         };
         expect(requestBody.think).toBe("low");
         expect(requestBody.options?.think).toBeUndefined();
-        expect(requestBody.options?.num_ctx).toBe(131072);
+        expect(requestBody.options?.num_ctx).toBeUndefined();
       },
     );
   });
@@ -332,7 +332,7 @@ describe("createConfiguredOllamaCompatStreamWrapper", () => {
         };
         expect(requestBody.think).toBe("high");
         expect(requestBody.options?.think).toBeUndefined();
-        expect(requestBody.options?.num_ctx).toBe(131072);
+        expect(requestBody.options?.num_ctx).toBeUndefined();
       },
     );
   });
@@ -1296,9 +1296,12 @@ describe("createOllamaStreamFn", () => {
         }
 
         const requestBody = JSON.parse(requestInit.body) as {
-          options: { num_ctx?: number; num_predict?: number };
+          options?: { num_ctx?: number; num_predict?: number };
         };
-        expect(requestBody.options.num_ctx).toBe(131072);
+        if (!requestBody.options) {
+          throw new Error("Expected Ollama request options");
+        }
+        expect(requestBody.options?.num_ctx).toBeUndefined();
         expect(requestBody.options.num_predict).toBe(123);
       },
     );
diff --git a/extensions/ollama/src/stream.ts b/extensions/ollama/src/stream.ts
@@ -247,12 +247,18 @@ function resolveOllamaModelOptions(model: ProviderRuntimeModel): Record<string,
   const params = model.params;
   if (params && typeof params === "object" && !Array.isArray(params)) {
     for (const [key, value] of Object.entries(params)) {
+      if (key === "num_ctx") {
+        continue;
+      }
       if (value !== undefined && OLLAMA_OPTION_PARAM_KEYS.has(key)) {
         options[key] = value;
       }
     }
   }
-  options.num_ctx = resolveOllamaNumCtx(model);
+  const numCtx = resolveOllamaConfiguredNumCtx(model);
+  if (numCtx !== undefined) {
+    options.num_ctx = numCtx;
+  }
   return options;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -247,12 +247,18 @@ function resolveOllamaModelOptions(model: ProviderRuntimeModel): Record<string,`
`247`	`247`	`const params = model.params;`
`248`	`248`	`if (params && typeof params === "object" && !Array.isArray(params)) {`
`249`	`249`	`for (const [key, value] of Object.entries(params)) {`
	`250`	`+ if (key === "num_ctx") {`
	`251`	`+ continue;`
	`252`	`+ }`
`250`	`253`	`if (value !== undefined && OLLAMA_OPTION_PARAM_KEYS.has(key)) {`
`251`	`254`	`options[key] = value;`
`252`	`255`	`}`
`253`	`256`	`}`
`254`	`257`	`}`
`255`		`- options.num_ctx = resolveOllamaNumCtx(model);`
	`258`	`+ const numCtx = resolveOllamaConfiguredNumCtx(model);`
	`259`	`+ if (numCtx !== undefined) {`
	`260`	`+ options.num_ctx = numCtx;`
	`261`	`+ }`
`256`	`262`	`return options;`
`257`	`263`	`}`
`258`	`264`