Skip to content

Commit 9c37cfc

Browse files
committed
fix: harden gateway install recovery paths
1 parent 9799e41 commit 9c37cfc

11 files changed

Lines changed: 317 additions & 36 deletions

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ Docs: https://docs.openclaw.ai
3838
### Fixes
3939

4040
- Channels/WhatsApp: allow `@whiskeysockets/libsignal-node` in `onlyBuiltDependencies` so pnpm v9+ `blockExoticSubdeps` no longer rejects the baileys git-tarball subdep, a rejection that silenced all inbound agent replies. Fixes #76539. Thanks @ottodeng and @vincentkoc.
41+
- Gateway/install: keep `.env`-managed values in the macOS LaunchAgent env file while still tracking `OPENCLAW_SERVICE_MANAGED_ENV_KEYS`, so regenerated services do not boot without managed auth/provider keys. Fixes #75374.
42+
- Gateway/restart: verify listener PIDs by argv when `lsof` reports only the Node process name, so stale gateway cleanup can find macOS `cnode` listeners. Fixes #70664.
43+
- Gateway/logging: expand leading `~` in `logging.file` before creating the file logger, preventing startup crash loops for home-relative log paths. Fixes #73587.
44+
- Channels/CLI: keep `openclaw channels list --json` usable when provider usage fetching fails, and report per-provider usage errors without aborting the channel list. Refs #67595.
4145
- Gateway/systemd: preserve operator-added secrets in the Gateway env file across re-staging while clearing OpenClaw-managed keys (such as `OPENCLAW_GATEWAY_TOKEN`) so a fresh staging value is never shadowed by a stale env-file copy; operator secrets are also retained when the state-dir `.env` is empty. Fixes #76860. Thanks @hclsys.
4246
- Plugin updates: do not short-circuit trusted official npm updates as unchanged when the default/latest spec still resolves to an already-installed prerelease that the installer should replace with a stable fallback. Thanks @vincentkoc.
4347
- Plugin tools: keep auth-unavailable optional tools hidden even when another default tool from the same plugin is available and `tools.alsoAllow` names the optional tool. Thanks @vincentkoc.

src/commands/channels.list.auth-profiles.test.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ const mocks = vi.hoisted(() => ({
1313
loadAuthProfileStoreWithoutExternalProfiles: vi.fn(),
1414
listReadOnlyChannelPluginsForConfig: vi.fn<() => ChannelPlugin[]>(() => []),
1515
buildChannelAccountSnapshot: vi.fn(),
16+
loadProviderUsageSummary: vi.fn(),
1617
}));
1718

1819
vi.mock("../config/config.js", () => ({
@@ -39,6 +40,11 @@ vi.mock("../channels/plugins/status.js", () => ({
3940
buildChannelAccountSnapshot: mocks.buildChannelAccountSnapshot,
4041
}));
4142

43+
vi.mock("../infra/provider-usage.js", () => ({
44+
formatUsageReportLines: () => [],
45+
loadProviderUsageSummary: mocks.loadProviderUsageSummary,
46+
}));
47+
4248
import { channelsListCommand } from "./channels/list.js";
4349

4450
function createMockChannelPlugin(accountIds: string[]): ChannelPlugin {
@@ -64,6 +70,7 @@ describe("channels list auth profiles", () => {
6470
mocks.readConfigFileSnapshot.mockReset();
6571
mocks.resolveCommandConfigWithSecrets.mockClear();
6672
mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReset();
73+
mocks.loadProviderUsageSummary.mockReset();
6774
mocks.listReadOnlyChannelPluginsForConfig.mockReset();
6875
mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([]);
6976
mocks.buildChannelAccountSnapshot.mockReset();
@@ -143,6 +150,27 @@ describe("channels list auth profiles", () => {
143150
expect(payload.chat?.telegram).toEqual(["alerts", "default"]);
144151
});
145152

153+
it("keeps JSON output valid when usage loading fails", async () => {
154+
const runtime = createTestRuntime();
155+
mocks.readConfigFileSnapshot.mockResolvedValue({
156+
...baseConfigSnapshot,
157+
config: {},
158+
});
159+
mocks.loadAuthProfileStoreWithoutExternalProfiles.mockReturnValue({
160+
version: 1,
161+
profiles: {},
162+
});
163+
mocks.loadProviderUsageSummary.mockRejectedValue(new Error("fetch failed"));
164+
165+
await channelsListCommand({ json: true }, runtime);
166+
167+
const payload = JSON.parse(runtime.log.mock.calls[0]?.[0] as string) as {
168+
usage?: unknown;
169+
};
170+
expect(payload.usage).toBeUndefined();
171+
expect(runtime.error).not.toHaveBeenCalled();
172+
});
173+
146174
it("prints configured chat channel accounts before auth providers", async () => {
147175
const runtime = createTestRuntime();
148176
mocks.listReadOnlyChannelPluginsForConfig.mockReturnValue([

src/commands/channels/list.ts

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,14 +91,17 @@ function formatAccountLine(params: {
9191
}
9292
async function loadUsageWithProgress(
9393
runtime: RuntimeEnv,
94+
progress = true,
9495
): Promise<Awaited<ReturnType<typeof loadProviderUsageSummary>> | null> {
9596
try {
9697
return await withProgress(
97-
{ label: "Fetching usage snapshot…", indeterminate: true, enabled: true },
98+
{ label: "Fetching usage snapshot…", indeterminate: true, enabled: progress },
9899
async () => await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true }),
99100
);
100101
} catch (err) {
101-
runtime.error(String(err));
102+
if (progress) {
103+
runtime.error(String(err));
104+
}
102105
return null;
103106
}
104107
}
@@ -125,9 +128,7 @@ export async function channelsListCommand(
125128
isExternal: false,
126129
}));
127130
if (opts.json) {
128-
const usage = includeUsage
129-
? await loadProviderUsageSummary({ skipPluginAuthWithoutCredentialSource: true })
130-
: undefined;
131+
const usage = includeUsage ? await loadUsageWithProgress(runtime, false) : undefined;
131132
const chat: Record<string, string[]> = {};
132133
for (const plugin of plugins) {
133134
chat[plugin.id] = plugin.config.listAccountIds(cfg);

src/commands/daemon-install-helpers.test.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,67 @@ describe("buildGatewayInstallPlan — dotenv merge", () => {
597597
);
598598
});
599599

600+
it("retains managed .env values for macOS LaunchAgent env files", async () => {
601+
await writeStateDirDotEnv("TAVILY_API_KEY=dotenv-tavily\nOPENROUTER_API_KEY=or-key\n", {
602+
stateDir: path.join(tmpDir, ".openclaw"),
603+
});
604+
mockNodeGatewayPlanFixture({
605+
serviceEnvironment: {
606+
HOME: "/from-service",
607+
OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
608+
OPENCLAW_PORT: "3000",
609+
},
610+
});
611+
612+
const plan = await buildGatewayInstallPlan({
613+
env: { HOME: tmpDir },
614+
port: 3000,
615+
runtime: "node",
616+
platform: "darwin",
617+
});
618+
619+
expect(plan.environment.TAVILY_API_KEY).toBe("dotenv-tavily");
620+
expect(plan.environment.OPENROUTER_API_KEY).toBe("or-key");
621+
expect(plan.environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe(
622+
"OPENROUTER_API_KEY,TAVILY_API_KEY",
623+
);
624+
});
625+
626+
it("does not retain config env values for macOS LaunchAgent env files", async () => {
627+
await writeStateDirDotEnv("OPENROUTER_API_KEY=or-dotenv\nTAVILY_API_KEY=dotenv-tavily\n", {
628+
stateDir: path.join(tmpDir, ".openclaw"),
629+
});
630+
mockNodeGatewayPlanFixture({
631+
serviceEnvironment: {
632+
HOME: "/from-service",
633+
OPENCLAW_LAUNCHD_LABEL: "ai.openclaw.gateway",
634+
OPENCLAW_PORT: "3000",
635+
},
636+
});
637+
638+
const plan = await buildGatewayInstallPlan({
639+
env: { HOME: tmpDir },
640+
port: 3000,
641+
runtime: "node",
642+
platform: "darwin",
643+
config: {
644+
env: {
645+
vars: {
646+
BRAVE_API_KEY: "brave-config-key",
647+
OPENROUTER_API_KEY: "or-config-key",
648+
},
649+
},
650+
},
651+
});
652+
653+
expect(plan.environment.BRAVE_API_KEY).toBeUndefined();
654+
expect(plan.environment.OPENROUTER_API_KEY).toBeUndefined();
655+
expect(plan.environment.TAVILY_API_KEY).toBe("dotenv-tavily");
656+
expect(plan.environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS).toBe(
657+
"BRAVE_API_KEY,OPENROUTER_API_KEY,TAVILY_API_KEY",
658+
);
659+
});
660+
600661
it("works when .env file does not exist", async () => {
601662
mockNodeGatewayPlanFixture({ serviceEnvironment: { OPENCLAW_PORT: "3000" } });
602663

src/commands/daemon-install-helpers.ts

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ import os from "node:os";
33
import path from "node:path";
44
import type { AuthProfileStore } from "../agents/auth-profiles/types.js";
55
import { formatCliCommand } from "../cli/command-format.js";
6-
import { collectDurableServiceEnvVars } from "../config/state-dir-dotenv.js";
6+
import {
7+
collectDurableServiceEnvVars,
8+
readStateDirDotEnvVars,
9+
} from "../config/state-dir-dotenv.js";
710
import type { OpenClawConfig } from "../config/types.js";
811
import { resolveSecretInputRef } from "../config/types.secrets.js";
912
import { resolveGatewayLaunchAgentLabel } from "../daemon/constants.js";
@@ -392,6 +395,35 @@ function resolveGatewayInstallWorkingDirectory(params: {
392395
return resolveGatewayStateDir(params.env);
393396
}
394397

398+
function retainLaunchAgentManagedServiceEnvValues(params: {
399+
environment: Record<string, string | undefined>;
400+
durableEnvironment: Record<string, string | undefined>;
401+
managedServiceEnvKeys: string | undefined;
402+
stateDirDotEnvEnvironment: Record<string, string | undefined>;
403+
serviceEnvironment: Record<string, string | undefined>;
404+
platform: NodeJS.Platform;
405+
}): void {
406+
if (params.platform !== "darwin" || !params.serviceEnvironment.OPENCLAW_LAUNCHD_LABEL?.trim()) {
407+
return;
408+
}
409+
const managedKeys = readManagedServiceEnvKeysFromEnvironment({
410+
OPENCLAW_SERVICE_MANAGED_ENV_KEYS: params.managedServiceEnvKeys,
411+
});
412+
if (managedKeys.size === 0) {
413+
return;
414+
}
415+
for (const [rawKey, value] of Object.entries(params.stateDirDotEnvEnvironment)) {
416+
const key = normalizeEnvVarKey(rawKey, { portable: true })?.toUpperCase();
417+
if (!key || !managedKeys.has(key) || typeof value !== "string" || !value.trim()) {
418+
continue;
419+
}
420+
if (params.durableEnvironment[rawKey] !== value) {
421+
continue;
422+
}
423+
params.environment[rawKey] = value;
424+
}
425+
}
426+
395427
async function buildGatewayInstallEnvironment(params: {
396428
env: Record<string, string | undefined>;
397429
config?: OpenClawConfig;
@@ -408,6 +440,7 @@ async function buildGatewayInstallEnvironment(params: {
408440
environment: Record<string, string | undefined>;
409441
environmentValueSources: Record<string, GatewayServiceEnvironmentValueSource | undefined>;
410442
}> {
443+
const stateDirDotEnvEnvironment = readStateDirDotEnvVars(params.env);
411444
const durableEnvironment = collectDurableServiceEnvVars({
412445
env: params.env,
413446
config: params.config,
@@ -463,6 +496,14 @@ async function buildGatewayInstallEnvironment(params: {
463496
omitKeys: Object.keys(params.serviceEnvironment),
464497
});
465498
writeManagedServiceEnvKeysToEnvironment(environment, managedServiceEnvKeys);
499+
retainLaunchAgentManagedServiceEnvValues({
500+
environment,
501+
durableEnvironment,
502+
managedServiceEnvKeys,
503+
stateDirDotEnvEnvironment,
504+
serviceEnvironment: params.serviceEnvironment,
505+
platform: params.platform,
506+
});
466507
if (environment.OPENCLAW_SERVICE_MANAGED_ENV_KEYS) {
467508
environmentValueSources.OPENCLAW_SERVICE_MANAGED_ENV_KEYS = "inline";
468509
}

src/infra/provider-usage.load.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,48 @@ describe("provider-usage.load", () => {
133133
}
134134
});
135135

136+
it("keeps usage summary available when one provider fetch rejects", async () => {
137+
resolveProviderUsageSnapshotWithPluginMock.mockImplementation(
138+
async ({ provider }): Promise<ProviderUsageSnapshot | null> => {
139+
if (provider === "anthropic") {
140+
throw new Error("fetch failed");
141+
}
142+
const usageProvider = provider as ProviderUsageSnapshot["provider"];
143+
return {
144+
provider: usageProvider,
145+
displayName: "Codex",
146+
windows: [{ label: "3h", usedPercent: 12 }],
147+
};
148+
},
149+
);
150+
const mockFetch = createProviderUsageFetch(async () => {
151+
throw new Error("legacy fetch should not run");
152+
});
153+
154+
const summary = await loadUsageWithAuth(
155+
loadProviderUsageSummary,
156+
[
157+
{ provider: "anthropic", token: "token-a" },
158+
{ provider: "openai-codex", token: "token-codex" },
159+
],
160+
mockFetch,
161+
);
162+
163+
expect(summary.providers).toEqual([
164+
{
165+
provider: "anthropic",
166+
displayName: "Claude",
167+
windows: [],
168+
error: "fetch failed",
169+
},
170+
{
171+
provider: "openai-codex",
172+
displayName: "Codex",
173+
windows: [{ label: "3h", usedPercent: 12 }],
174+
},
175+
]);
176+
});
177+
136178
it("throws when fetch is unavailable", async () => {
137179
const previousFetch = globalThis.fetch;
138180
vi.stubGlobal("fetch", undefined);

src/infra/provider-usage.load.ts

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,14 @@ export async function loadProviderUsageSummary(
103103
return { updatedAt: now, providers: [] };
104104
}
105105

106-
const tasks = auths.map((auth) =>
107-
withTimeout(
106+
const tasks = auths.map((auth) => {
107+
const failureSnapshot = (error: string): ProviderUsageSnapshot => ({
108+
provider: auth.provider,
109+
displayName: PROVIDER_LABELS[auth.provider] ?? auth.provider,
110+
windows: [],
111+
error,
112+
});
113+
return withTimeout(
108114
fetchProviderUsageSnapshot({
109115
auth,
110116
config,
@@ -121,8 +127,11 @@ export async function loadProviderUsageSummary(
121127
windows: [],
122128
error: "Timeout",
123129
},
124-
),
125-
);
130+
).catch((error: unknown) => {
131+
const message = error instanceof Error ? error.message : String(error);
132+
return failureSnapshot(message.trim() || "Fetch failed");
133+
});
134+
});
126135

127136
const snapshots = await Promise.all(tasks);
128137
const providers = snapshots.filter((entry) => {

src/infra/restart-stale-pids.test.ts

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
256256
expect(pids).not.toContain(process.pid);
257257
});
258258

259+
it("verifies argv when lsof reports the node process name instead of openclaw", () => {
260+
const stalePid = process.pid + 101;
261+
mockSpawnSync.mockImplementation((command: unknown) => {
262+
if (command === "ps") {
263+
return {
264+
error: null,
265+
status: 0,
266+
stdout: "node /opt/openclaw/dist/entry.js gateway\n",
267+
stderr: "",
268+
};
269+
}
270+
return {
271+
error: null,
272+
status: 0,
273+
stdout: lsofOutput([{ pid: stalePid, cmd: "cnode" }]),
274+
stderr: "",
275+
};
276+
});
277+
278+
expect(findGatewayPidsOnPortSync(18789)).toEqual([stalePid]);
279+
expect(mockSpawnSync).toHaveBeenCalledWith(
280+
"ps",
281+
["-ww", "-p", String(stalePid), "-o", "command="],
282+
expect.objectContaining({ timeout: 2000 }),
283+
);
284+
});
285+
259286
it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => {
260287
// Regression: openclaw-weixin sidecar (child of the gateway) invoked
261288
// cleanStaleGatewayProcessesSync during init. lsof reported the parent
@@ -1174,8 +1201,9 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
11741201
vi.spyOn(process, "kill").mockReturnValue(true);
11751202
// Should complete cleanly — no openclaw pids in status-1 output → free
11761203
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
1177-
// Completed in exactly 2 calls (initial find + 1 free poll)
1178-
expect(getCallCount()).toBe(2);
1204+
// Completed with one argv verification after the status-1 poll output:
1205+
// initial lsof + poll lsof + ps argv check.
1206+
expect(getCallCount()).toBe(3);
11791207
});
11801208
});
11811209

0 commit comments

Comments
 (0)