fix: harden gateway install recovery paths

steipete · steipete · commit 9c37cfcbdbf7 · 2026-05-04T01:28:17.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,10 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Channels/WhatsApp: allow `@whiskeysockets/libsignal-node` in `onlyBuiltDependencies` so pnpm v9+ `blockExoticSubdeps` no longer rejects the baileys git-tarball subdep and silences all inbound agent replies. Fixes #76539. Thanks @ottodeng and @vincentkoc.
+- Gateway/install: keep managed values that come from the state-dir `.env` in the macOS LaunchAgent env file while still tracking their keys in `OPENCLAW_SERVICE_MANAGED_ENV_KEYS`, so regenerated services do not boot without managed auth/provider keys. Fixes #75374.
+- Gateway/restart: verify listener PIDs by argv when `lsof` reports only the Node process name, so stale gateway cleanup can find macOS `cnode` listeners. Fixes #70664.
+- Gateway/logging: expand leading `~` in `logging.file` before creating the file logger, preventing startup crash loops for home-relative log paths. Fixes #73587.
+- Channels/CLI: keep `openclaw channels list --json` usable when provider usage fetching fails, and report per-provider usage errors without aborting the channel list. Refs #67595.
 - Gateway/systemd: preserve operator-added secrets in the Gateway env file across re-stage while clearing OpenClaw-managed keys (such as `OPENCLAW_GATEWAY_TOKEN`) so a fresh staging value is never shadowed by a stale env-file copy; operator secrets are also retained when the state-dir `.env` is empty. Fixes #76860. Thanks @hclsys.
 - Plugin updates: do not short-circuit trusted official npm updates as unchanged when the default/latest spec still resolves to an already-installed prerelease that the installer should replace with a stable fallback. Thanks @vincentkoc.
 - Plugin tools: keep auth-unavailable optional tools hidden even when another default tool from the same plugin is available and `tools.alsoAllow` names the optional tool. Thanks @vincentkoc.
diff --git a/src/commands/channels.list.auth-profiles.test.ts b/src/commands/channels.list.auth-profiles.test.ts
@@ -13,6 +13,7 @@ const mocks = vi.hoisted(() => ({
   loadAuthProfileStoreWithoutExternalProfiles: vi.fn(),
   listReadOnlyChannelPluginsForConfig: vi.fn<() => ChannelPlugin[]>(() => []),
   buildChannelAccountSnapshot: vi.fn(),
+  loadProviderUsageSummary: vi.fn(),
 }));
 
 vi.mock("../config/config.js", () => ({
@@ -39,6 +40,11 @@ vi.mock("../channels/plugins/status.js", () => ({
   buildChannelAccountSnapshot: mocks.buildChannelAccountSnapshot,
 }));
 
+vi.mock("../infra/provider-usage.js", () => ({
+  formatUsageReportLines: () => [],
+  loadProviderUsageSummary: mocks.loadProviderUsageSummary,
+}));
+
 import { channelsListCommand } from "./channels/list.js";
 
 function createMockChannelPlugin(accountIds: string[]): ChannelPlugin {
@@ -64,6 +70,7 @@ describe("channels list auth profiles", () => {
     mocks.readConfigFileSnapshot.mockReset();
     mocks.resolveCommandConfigWithSecrets.mockClear();
     mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReset();
+    mocks.loadProviderUsageSummary.mockReset();
     mocks.listReadOnlyChannelPluginsForConfig.mockReset();
     mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([]);
     mocks.buildChannelAccountSnapshot.mockReset();
@@ -143,6 +150,27 @@ describe("channels list auth profiles", () => {
     expect(payload.chat?.telegram).toEqual(["alerts", "default"]);
   });
 
+  it("keeps JSON output valid when usage loading fails", async () => {
+    const runtime = createTestRuntime();
+    mocks.readConfigFileSnapshot.mockResolvedValue({
+      ...baseConfigSnapshot,
+      config: {},
+    });
+    mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReturnValue({
+      version: 1,
+      profiles: {},
+    });
+    mocks.loadProviderUsageSummary.mockRejectedValue(new Error("fetch failed"));
+
+    await channelsListCommand({ json: true }, runtime);
+
+    const payload = JSON.parse(runtime.log.mock.calls[0]?.[0] as string) as {
+      usage?: unknown;
+    };
+    expect(payload.usage).toBeUndefined();
+    expect(runtime.error).not.toHaveBeenCalled();
+  });
+
   it("prints configured chat channel accounts before auth providers", async () => {
     const runtime = createTestRuntime();
     mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([
diff --git a/src/commands/channels/list.ts b/src/commands/channels/list.ts
@@ -91,14 +91,17 @@ function formatAccountLine(params: {
 }
 async function loadUsageWithProgress(
   runtime: RuntimeEnv,
+  progress = true,
 ): Promise<Awaited<ReturnType<typeof loadProviderUsageSummary>> | null> {
   try {
     return await withProgress(
-      { label: "Fetching usage snapshot…", indeterminate: true, enabled: true },
+      { label: "Fetching usage snapshot…", indeterminate: true, enabled: progress },
       async () => await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true }),
     );
   } catch (err) {
-    runtime.error(String(err));
+    if (progress) {
+      runtime.error(String(err));
+    }
     return null;
   }
 }
@@ -125,9 +128,7 @@ export async function channelsListCommand(
     isExternal: false,
   }));
   if (opts.json) {
-    const usage = includeUsage
-      ? await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true })
-      : undefined;
+    const usage = includeUsage ? await loadUsageWithProgress(runtime, false) : undefined;
     const chat: Record<string, string[]> = {};
     for (const plugin of plugins) {
       chat[plugin.id] = plugin.config.listAccountIds(cfg);
diff --git a/src/commands/daemon-install-helpers.test.ts b/src/commands/daemon-install-helpers.test.ts
@@ -597,6 +597,67 @@ describe("buildGatewayInstallPlan — dotenv merge", () => {
     );
   });
 
+  it("retains managed .env values for macOS LaunchAgent env files", async () => {
+    await writeStateDirDotEnv("TAVILY_API_KEY=dotenv-tavily\nOPENROUTER_API_KEY=or-key\n", {
+      stateDir: path.join(tmpDir, ".openclaw"),
+    });
+    mockNodeGatewayPlanFixture({
+      serviceEnvironment: {
+        HOME: "/from-service",
+        OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
+        OPENCLAW_PORT: "3000",
+      },
+    });
+
+    const plan = await buildGatewayInstallPlan({
+      env: { HOME: tmpDir },
+      port: 3000,
+      runtime: "node",
+      platform: "darwin",
+    });
+
+    expect(plan.environment.TAVILY_API_KEY).toBe("dotenv-tavily");
+    expect(plan.environment.OPENROUTER_API_KEY).toBe("or-key");
+    expect(plan.environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe(
+      "OPENROUTER_API_KEY,TAVILY_API_KEY",
+    );
+  });
+
+  it("does not retain config env values for macOS LaunchAgent env files", async () => {
+    await writeStateDirDotEnv("OPENROUTER_API_KEY=or-dotenv\nTAVILY_API_KEY=dotenv-tavily\n", {
+      stateDir: path.join(tmpDir, ".openclaw"),
+    });
+    mockNodeGatewayPlanFixture({
+      serviceEnvironment: {
+        HOME: "/from-service",
+        OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
+        OPENCLAW_PORT: "3000",
+      },
+    });
+
+    const plan = await buildGatewayInstallPlan({
+      env: { HOME: tmpDir },
+      port: 3000,
+      runtime: "node",
+      platform: "darwin",
+      config: {
+        env: {
+          vars: {
+            BRAVE_API_KEY: "brave-config-key",
+            OPENROUTER_API_KEY: "or-config-key",
+          },
+        },
+      },
+    });
+
+    expect(plan.environment.BRAVE_API_KEY).toBeUndefined();
+    expect(plan.environment.OPENROUTER_API_KEY).toBeUndefined();
+    expect(plan.environment.TAVILY_API_KEY).toBe("dotenv-tavily");
+    expect(plan.environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe(
+      "BRAVE_API_KEY,OPENROUTER_API_KEY,TAVILY_API_KEY",
+    );
+  });
+
   it("works when .env file does not exist", async () => {
     mockNodeGatewayPlanFixture({ serviceEnvironment: { OPENCLAW_PORT: "3000" } });
 
diff --git a/src/commands/daemon-install-helpers.ts b/src/commands/daemon-install-helpers.ts
@@ -3,7 +3,10 @@ import os from "node:os";
 import path from "node:path";
 import type { AuthProfileStore } from "../agents/auth-profiles/types.js";
 import { formatCliCommand } from "../cli/command-format.js";
-import { collectDurableServiceEnvVars } from "../config/state-dir-dotenv.js";
+import {
+  collectDurableServiceEnvVars,
+  readStateDirDotEnvVars,
+} from "../config/state-dir-dotenv.js";
 import type { OpenClawConfig } from "../config/types.js";
 import { resolveSecretInputRef } from "../config/types.secrets.js";
 import { resolveGatewayLaunchAgentLabel } from "../daemon/constants.js";
@@ -392,6 +395,35 @@ function resolveGatewayInstallWorkingDirectory(params: {
   return resolveGatewayStateDir(params.env);
 }
 
+function retainLaunchAgentManagedServiceEnvValues(params: {
+  environment: Record<string, string | undefined>;
+  durableEnvironment: Record<string, string | undefined>;
+  managedServiceEnvKeys: string | undefined;
+  stateDirDotEnvEnvironment: Record<string, string | undefined>;
+  serviceEnvironment: Record<string, string | undefined>;
+  platform: NodeJS.Platform;
+}): void {
+  if (params.platform !== "darwin" || !params.serviceEnvironment.OPENCLAW_LAUNCHD_LABEL?.trim()) {
+    return;
+  }
+  const managedKeys = readManagedServiceEnvKeysFromEnvironment({
+    OPENCLAW_SERVICE_MANAGED_ENV_KEYS: params.managedServiceEnvKeys,
+  });
+  if (managedKeys.size === 0) {
+    return;
+  }
+  for (const [rawKey, value] of Object.entries(params.stateDirDotEnvEnvironment)) {
+    const key = normalizeEnvVarKey(rawKey, { portable: true })?.toUpperCase();
+    if (!key || !managedKeys.has(key) || typeof value !== "string" || !value.trim()) {
+      continue;
+    }
+    if (params.durableEnvironment[rawKey] !== value) {
+      continue;
+    }
+    params.environment[rawKey] = value;
+  }
+}
+
 async function buildGatewayInstallEnvironment(params: {
   env: Record<string, string | undefined>;
   config?: OpenClawConfig;
@@ -408,6 +440,7 @@ async function buildGatewayInstallEnvironment(params: {
   environment: Record<string, string | undefined>;
   environmentValueSources: Record<string, GatewayServiceEnvironmentValueSource | undefined>;
 }> {
+  const stateDirDotEnvEnvironment = readStateDirDotEnvVars(params.env);
   const durableEnvironment = collectDurableServiceEnvVars({
     env: params.env,
     config: params.config,
@@ -463,6 +496,14 @@ async function buildGatewayInstallEnvironment(params: {
     omitKeys: Object.keys(params.serviceEnvironment),
   });
   writeManagedServiceEnvKeysToEnvironment(environment, managedServiceEnvKeys);
+  retainLaunchAgentManagedServiceEnvValues({
+    environment,
+    durableEnvironment,
+    managedServiceEnvKeys,
+    stateDirDotEnvEnvironment,
+    serviceEnvironment: params.serviceEnvironment,
+    platform: params.platform,
+  });
   if (environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS) {
     environmentValueSources.OPENCLAW_SERVICE_MANAGED_ENV_KEYS = "inline";
   }
diff --git a/src/infra/provider-usage.load.test.ts b/src/infra/provider-usage.load.test.ts
@@ -133,6 +133,48 @@ describe("provider-usage.load", () => {
     }
   });
 
+  it("keeps usage summary available when one provider fetch rejects", async () => {
+    resolveProviderUsageSnapshotWithPluginMock.mockImplementation(
+      async ({ provider }): Promise<ProviderUsageSnapshot | null> => {
+        if (provider === "anthropic") {
+          throw new Error("fetch failed");
+        }
+        const usageProvider = provider as ProviderUsageSnapshot["provider"];
+        return {
+          provider: usageProvider,
+          displayName: "Codex",
+          windows: [{ label: "3h", usedPercent: 12 }],
+        };
+      },
+    );
+    const mockFetch = createProviderUsageFetch(async () => {
+      throw new Error("legacy fetch should not run");
+    });
+
+    const summary = await loadUsageWithAuth(
+      loadProviderUsageSummary,
+      [
+        { provider: "anthropic", token: "token-a" },
+        { provider: "openai-codex", token: "token-codex" },
+      ],
+      mockFetch,
+    );
+
+    expect(summary.providers).toEqual([
+      {
+        provider: "anthropic",
+        displayName: "Claude",
+        windows: [],
+        error: "fetch failed",
+      },
+      {
+        provider: "openai-codex",
+        displayName: "Codex",
+        windows: [{ label: "3h", usedPercent: 12 }],
+      },
+    ]);
+  });
+
   it("throws when fetch is unavailable", async () => {
     const previousFetch = globalThis.fetch;
     vi.stubGlobal("fetch", undefined);
diff --git a/src/infra/provider-usage.load.ts b/src/infra/provider-usage.load.ts
@@ -103,8 +103,14 @@ export async function loadProviderUsageSummary(
     return { updatedAt: now, providers: [] };
   }
 
-  const tasks = auths.map((auth) =>
-    withTimeout(
+  const tasks = auths.map((auth) => {
+    const failureSnapshot = (error: string): ProviderUsageSnapshot => ({
+      provider: auth.provider,
+      displayName: PROVIDER_LABELS[auth.provider] ?? auth.provider,
+      windows: [],
+      error,
+    });
+    return withTimeout(
       fetchProviderUsageSnapshot({
         auth,
         config,
@@ -121,8 +127,11 @@ export async function loadProviderUsageSummary(
         windows: [],
         error: "Timeout",
       },
-    ),
-  );
+    ).catch((error: unknown) => {
+      const message = error instanceof Error ? error.message : String(error);
+      return failureSnapshot(message.trim() || "Fetch failed");
+    });
+  });
 
   const snapshots = await Promise.all(tasks);
   const providers = snapshots.filter((entry) => {
diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts
@@ -256,6 +256,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
       expect(pids).not.toContain(process.pid);
     });
 
+    it("verifies argv when lsof reports the node process name instead of openclaw", () => {
+      const stalePid = process.pid + 101;
+      mockSpawnSync.mockImplementation((command: unknown) => {
+        if (command === "ps") {
+          return {
+            error: null,
+            status: 0,
+            stdout: "node /opt/openclaw/dist/entry.js gateway\n",
+            stderr: "",
+          };
+        }
+        return {
+          error: null,
+          status: 0,
+          stdout: lsofOutput([{ pid: stalePid, cmd: "cnode" }]),
+          stderr: "",
+        };
+      });
+
+      expect(findGatewayPidsOnPortSync(18789)).toEqual([stalePid]);
+      expect(mockSpawnSync).toHaveBeenCalledWith(
+        "ps",
+        ["-ww", "-p", String(stalePid), "-o", "command="],
+        expect.objectContaining({ timeout: 2000 }),
+      );
+    });
+
     it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => {
       // Regression: openclaw-weixin sidecar (child of the gateway) invoked
       // cleanStaleGatewayProcessesSync during init. lsof reported the parent
@@ -1174,8 +1201,9 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
       vi.spyOn(process, "kill").mockReturnValue(true);
       // Should complete cleanly — no openclaw pids in status-1 output → free
       expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
-      // Completed in exactly 2 calls (initial find + 1 free poll)
-      expect(getCallCount()).toBe(2);
+      // Completed with one argv verification after the status-1 poll output:
+      // initial lsof + poll lsof + ps argv check.
+      expect(getCallCount()).toBe(3);
     });
   });
 
diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts
diff --git a/src/logging/logger-redaction-behavior.test.ts b/src/logging/logger-redaction-behavior.test.ts
diff --git a/src/logging/logger.ts b/src/logging/logger.ts