fix(e2e): retry Windows kitchen sink probes

vincentkoc · vincentkoc · commit 3e275a53dc9e · 2026-05-24T23:10:33.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,9 @@ Docs: https://docs.openclaw.ai
 - Installer: install Node.js through `apk` on Alpine Linux instead of falling through to the NodeSource package-manager path.
 - Installer: detect musl Linux shells such as Alpine as Linux instead of rejecting them before npm install.
 - Control UI: split large build-time runtime dependencies into stable chunks so Linux/Docker install and package builds stay below the app chunk warning threshold.
+- Tests: retry transient loopback HTTP resets in the kitchen-sink RPC walk so native Windows readiness probes do not fail on a connection reset once the gateway is ready.
+- Tests: run `test:serial` through a Node wrapper so targeted serial Vitest commands work on native Windows.
+- Tests: normalize path separators in the Vitest config path assertions so the infra config suite passes on native Windows.
 - Scripts: run the optional Discord native opus installer through the shared pnpm launcher and Windows CI coverage so native Windows installs avoid shell-mode package-manager shims.
 - Agents/MCP: bound bundled MCP `tools/list` catalog discovery so hung MCP servers do not block session tool materialization. (#85063) Thanks @nxmxbbd.
 - Scripts: run generated-module formatting through the shared pnpm launcher and Windows CI coverage so native Windows generator checks avoid shell-mode package-manager shims.
diff --git a/package.json b/package.json
@@ -1747,7 +1747,7 @@
     "test:plugins:kitchen-sink-live": "bash -lc 'if [ -x \"$HOME/.local/bin/openclaw-testbox-env\" ]; then exec \"$HOME/.local/bin/openclaw-testbox-env\" pnpm openclaw qa suite --provider-mode live-frontier --scenario kitchen-sink-live-openai; fi; exec pnpm openclaw qa suite --provider-mode live-frontier --scenario kitchen-sink-live-openai'",
     "test:plugins:kitchen-sink-rpc": "node --import tsx scripts/e2e/kitchen-sink-rpc-walk.mjs",
     "test:sectriage": "OPENCLAW_GATEWAY_PROJECT_SHARDS=1 node scripts/run-vitest.mjs run --config test/vitest/vitest.gateway.config.ts && node scripts/run-vitest.mjs run --config test/vitest/vitest.unit.config.ts --exclude src/daemon/launchd.integration.test.ts --exclude src/process/exec.test.ts",
-    "test:serial": "OPENCLAW_TEST_PROJECTS_SERIAL=1 OPENCLAW_VITEST_MAX_WORKERS=1 node scripts/test-projects.mjs",
+    "test:serial": "node scripts/test-projects-serial.mjs",
     "test:stability:gateway": "OPENCLAW_VITEST_MAX_WORKERS=1 node scripts/run-vitest.mjs run --config test/vitest/vitest.gateway.config.ts src/gateway/gateway-stability.test.ts && OPENCLAW_VITEST_MAX_WORKERS=1 node scripts/run-vitest.mjs run --config test/vitest/vitest.logging.config.ts src/logging/diagnostic-stability-bundle.test.ts && OPENCLAW_VITEST_MAX_WORKERS=1 node scripts/run-vitest.mjs run --config test/vitest/vitest.infra.config.ts src/infra/fatal-error-hooks.test.ts",
     "test:cli-response:contract": "node scripts/build-all.mjs cliStartup && node scripts/test-cli-startup-bench-budget.mjs --preset response --runs 1 --warmup 0 --timeout-ms 10000 --skip-baseline",
     "test:startup:bench": "node --import tsx scripts/bench-cli-startup.ts",
diff --git a/scripts/e2e/kitchen-sink-rpc-walk.mjs b/scripts/e2e/kitchen-sink-rpc-walk.mjs
@@ -292,25 +292,58 @@ async function retryRpcCall(method, params, options) {
 function isRetryableGatewayCallError(error) {
   const text = error instanceof Error ? error.message : String(error);
   return (
+    isRetryableTransientNetworkError(error) ||
     text.includes("gateway starting") ||
     text.includes("gateway closed") ||
     text.includes("handshake timeout") ||
-    text.includes("GatewayTransportError") ||
-    text.includes("ECONNREFUSED") ||
-    text.includes("fetch failed")
+    text.includes("GatewayTransportError")
   );
 }
 
-async function fetchJson(url) {
-  const response = await fetch(url);
-  const text = await response.text();
-  let body = null;
-  try {
-    body = text ? JSON.parse(text) : null;
-  } catch {
-    body = text;
+function isRetryableTransientNetworkError(error, seen = new Set()) {
+  if (!error || seen.has(error)) {
+    return false;
+  }
+  seen.add(error);
+  const candidate = error;
+  const message = candidate instanceof Error ? candidate.message : String(candidate);
+  const code = typeof candidate === "object" && candidate !== null ? candidate.code : undefined;
+  const text = `${String(code ?? "")} ${message}`;
+  if (
+    /\b(?:ECONNRESET|ECONNREFUSED|ETIMEDOUT|EPIPE|EHOSTUNREACH|ENETUNREACH)\b/iu.test(text) ||
+    /\b(?:fetch failed|socket hang up|connection reset)\b/iu.test(text)
+  ) {
+    return true;
+  }
+  if (typeof candidate === "object" && candidate !== null && "cause" in candidate) {
+    return isRetryableTransientNetworkError(candidate.cause, seen);
+  }
+  return false;
+}
+
+export async function fetchJson(url, options = {}) {
+  const attempts = Math.max(1, options.attempts ?? 3);
+  let lastError;
+  for (let attempt = 1; attempt <= attempts; attempt += 1) {
+    try {
+      const response = await (options.fetchImpl ?? fetch)(url);
+      const text = await response.text();
+      let body = null;
+      try {
+        body = text ? JSON.parse(text) : null;
+      } catch {
+        body = text;
+      }
+      return { ok: response.ok, status: response.status, body };
+    } catch (error) {
+      lastError = error;
+      if (attempt >= attempts || !isRetryableTransientNetworkError(error)) {
+        throw error;
+      }
+      await delay(options.retryDelayMs ?? 250);
+    }
   }
-  return { ok: response.ok, status: response.status, body };
+  throw lastError ?? new Error(`fetch ${url} failed`);
 }
 
 function configureKitchenSink(env, port) {
diff --git a/scripts/test-projects-serial.mjs b/scripts/test-projects-serial.mjs
@@ -0,0 +1,4 @@
+process.env.OPENCLAW_TEST_PROJECTS_SERIAL = "1";
+process.env.OPENCLAW_VITEST_MAX_WORKERS = "1";
+
+await import("./test-projects.mjs");
diff --git a/src/infra/vitest-config.test.ts b/src/infra/vitest-config.test.ts
@@ -7,6 +7,10 @@ import baseConfig, {
   resolveLocalVitestScheduling,
 } from "../../vitest.config.ts";
 
+function normalizeConfigPath(value: unknown): string {
+  return String(value).replaceAll("\\", "/");
+}
+
 describe("resolveLocalVitestMaxWorkers", () => {
   it("uses a moderate local worker cap on larger hosts", () => {
     expect(
@@ -204,12 +208,16 @@ describe("base vitest config", () => {
 
   it("keeps the base setup file minimal", () => {
     expect(baseConfig.test?.setupFiles).toHaveLength(1);
-    expect(baseConfig.test?.setupFiles?.[0]).toMatch(/(?:^|\/)test\/setup\.ts$/u);
+    expect(normalizeConfigPath(baseConfig.test?.setupFiles?.[0])).toMatch(
+      /(?:^|\/)test\/setup\.ts$/u,
+    );
   });
 
   it("keeps the base runner non-isolated by default", () => {
     expect(baseConfig.test?.isolate).toBe(false);
-    expect(baseConfig.test?.runner).toMatch(/(?:^|\/)test\/non-isolated-runner\.ts$/u);
+    expect(normalizeConfigPath(baseConfig.test?.runner)).toMatch(
+      /(?:^|\/)test\/non-isolated-runner\.ts$/u,
+    );
   });
 });
 
@@ -221,9 +229,7 @@ describe("test scripts", () => {
       scripts?: Record<string, string>;
     };
 
-    expect(pkg.scripts?.["test:serial"]).toBe(
-      "OPENCLAW_TEST_PROJECTS_SERIAL=1 OPENCLAW_VITEST_MAX_WORKERS=1 node scripts/test-projects.mjs",
-    );
+    expect(pkg.scripts?.["test:serial"]).toBe("node scripts/test-projects-serial.mjs");
     expect(pkg.scripts?.["test:fast"]).toBe(
       "node scripts/run-vitest.mjs run --config test/vitest/vitest.unit.config.ts",
     );
diff --git a/test/scripts/kitchen-sink-rpc-walk.test.ts b/test/scripts/kitchen-sink-rpc-walk.test.ts
@@ -1,5 +1,9 @@
-import { describe, expect, it } from "vitest";
-import { assertResourceCeiling, sampleProcess } from "../../scripts/e2e/kitchen-sink-rpc-walk.mjs";
+import { describe, expect, it, vi } from "vitest";
+import {
+  assertResourceCeiling,
+  fetchJson,
+  sampleProcess,
+} from "../../scripts/e2e/kitchen-sink-rpc-walk.mjs";
 
 describe("kitchen-sink RPC process sampling", () => {
   it("samples RSS on Windows instead of silently disabling the resource guard", async () => {
@@ -47,6 +51,25 @@ describe("kitchen-sink RPC process sampling", () => {
     expect(sample).toEqual({ cpuPercent: 12.5, rssMiB: 256 });
   });
 
+  it("retries transient loopback fetch resets from Windows HTTP probes", async () => {
+    const reset = new TypeError("fetch failed", {
+      cause: Object.assign(new Error("read ECONNRESET"), { code: "ECONNRESET" }),
+    });
+    const fetchImpl = vi
+      .fn()
+      .mockRejectedValueOnce(reset)
+      .mockResolvedValueOnce(new Response('{"status":"live"}', { status: 200 }));
+
+    await expect(
+      fetchJson("http://127.0.0.1:19680/healthz", {
+        attempts: 2,
+        fetchImpl,
+        retryDelayMs: 0,
+      }),
+    ).resolves.toEqual({ ok: true, status: 200, body: { status: "live" } });
+    expect(fetchImpl).toHaveBeenCalledTimes(2);
+  });
+
   it("fails when the sampled RSS exceeds the configured ceiling", () => {
     expect(() => assertResourceCeiling({ rssMiB: 2049 })).toThrow(
       "gateway RSS exceeded 2048 MiB: 2049 MiB",