Skip to content

Commit 9eaca28

Browse files
authored
fix(backup): retry tar EOF races and skip live volatile files
Fixes #72249.

Summary:
- retry live backup tar EOF races
- skip current live session, cron, log, and delivery-queue state files
- preserve workspace lock/temp files and keep backup --json parseable

Verification:
- Crabbox pre-fix repro: tbx_01kr5xt9vf5pas5ee4aefrp3am
- Crabbox post-fix proof: tbx_01kr5y3e1kbtt6chbypfdydbgs
- pnpm check:test-types
- pnpm lint:core
- pnpm test src/commands/backup.test.ts src/infra/backup-volatile-filter.test.ts src/infra/backup-create.test.ts
- CI on 3766457: green

Thanks @abnershang.
1 parent 7d91fcb commit 9eaca28

8 files changed

Lines changed: 668 additions & 18 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai
146146
### Fixes
147147

148148
- Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net.
149+
- Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang.
149150
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
150151
- OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race.
151152
- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions.

docs/cli/backup.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ skipped.
5353

5454
The archive payload stores file contents from those source trees, and the embedded `manifest.json` records the resolved absolute source paths plus the archive layout used for each asset.
5555

56+
During archive creation, OpenClaw skips known live-mutation files that do not have restoration value, including active agent session transcripts, cron run logs, rolling logs, delivery queues, socket/pid/temp files under the state directory, and related durable-queue temp files. The JSON result includes `skippedVolatileCount` so automation can see how many files were intentionally omitted.
57+
5658
Installed plugin source and manifest files under the state directory's
5759
`extensions/` tree are included, but their nested `node_modules/` dependency
5860
trees are skipped. Those dependencies are rebuildable install artifacts; after

src/commands/backup.test.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,50 @@ describe("backup commands", () => {
316316
}
317317
});
318318

319+
it("keeps volatile-skip notices out of json output", async () => {
320+
const stateDir = path.join(tempHome.home, ".openclaw");
321+
const backupDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-backups-json-"));
322+
try {
323+
const runtime = createBackupTestRuntime();
324+
await mockStateOnlyBackupPlan(stateDir);
325+
tarCreateMock.mockImplementationOnce(
326+
async (
327+
options: { file: string; filter?: (entryPath: string) => boolean },
328+
entryPaths: string[],
329+
) => {
330+
const manifestPath = entryPaths[0];
331+
const stateRoot = entryPaths[1];
332+
expect(manifestPath).toBeDefined();
333+
expect(stateRoot).toBeDefined();
334+
if (!manifestPath || !stateRoot) {
335+
throw new Error("backup test expected manifest and state entries");
336+
}
337+
expect(options.filter?.(manifestPath)).toBe(true);
338+
expect(
339+
options.filter?.(path.join(stateRoot, "agents", "main", "sessions", "s.jsonl")),
340+
).toBe(false);
341+
await fs.writeFile(options.file, "archive-bytes", "utf8");
342+
},
343+
);
344+
345+
const result = await backupCreateCommand(runtime, {
346+
output: backupDir,
347+
json: true,
348+
});
349+
350+
expect(result.skippedVolatileCount).toBe(1);
351+
expect(runtime.log).toHaveBeenCalledTimes(1);
352+
const payload = vi.mocked(runtime.log).mock.calls[0]?.[0];
353+
if (typeof payload !== "string") {
354+
throw new Error("backup test expected JSON string output");
355+
}
356+
expect(payload).not.toContain("Backup skipped");
357+
expect(JSON.parse(payload)).toMatchObject({ skippedVolatileCount: 1 });
358+
} finally {
359+
await fs.rm(backupDir, { recursive: true, force: true });
360+
}
361+
});
362+
319363
it("rejects output paths that would be created inside a backed-up directory", async () => {
320364
const stateDir = path.join(tempHome.home, ".openclaw");
321365
await fs.writeFile(path.join(stateDir, "openclaw.json"), JSON.stringify({}), "utf8");

src/commands/backup.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ export async function backupCreateCommand(
2222
runtime: RuntimeEnv,
2323
opts: BackupCreateOptions = {},
2424
): Promise<BackupCreateResult> {
25-
const result = await createBackupArchive(opts);
25+
const result = await createBackupArchive({
26+
...opts,
27+
log: opts.log ?? (opts.json ? undefined : (message: string) => runtime.log(message)),
28+
});
2629
if (opts.verify && !opts.dryRun) {
2730
const { backupVerifyCommand } = await loadBackupVerifyRuntime();
2831
await backupVerifyCommand(

src/infra/backup-create.test.ts

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { backupVerifyCommand } from "../commands/backup-verify.js";
66
import type { RuntimeEnv } from "../runtime.js";
77
import { withOpenClawTestState } from "../test-utils/openclaw-test-state.js";
88
import {
9+
__test as backupCreateInternals,
910
buildExtensionsNodeModulesFilter,
1011
createBackupArchive,
1112
formatBackupCreateSummary,
@@ -23,6 +24,7 @@ function makeResult(overrides: Partial<BackupCreateResult> = {}): BackupCreateRe
2324
verified: false,
2425
assets: [],
2526
skipped: [],
27+
skippedVolatileCount: 0,
2628
...overrides,
2729
};
2830
}
@@ -106,6 +108,159 @@ describe("formatBackupCreateSummary", () => {
106108
])("$name", ({ result, expected }) => {
107109
expect(formatBackupCreateSummary(result)).toEqual(expected);
108110
});
111+
112+
it("surfaces the volatile skip count in the summary", () => {
113+
expect(
114+
formatBackupCreateSummary(
115+
makeResult({
116+
assets: [
117+
{
118+
kind: "state",
119+
sourcePath: "/state",
120+
archivePath: "archive/state",
121+
displayPath: "~/.openclaw",
122+
},
123+
],
124+
skippedVolatileCount: 3,
125+
}),
126+
),
127+
).toEqual([
128+
"Backup archive: /tmp/openclaw-backup.tar.gz",
129+
"Included 1 path:",
130+
"- state: ~/.openclaw",
131+
"Created /tmp/openclaw-backup.tar.gz",
132+
"Skipped 3 volatile files (live sessions, cron logs, queues, sockets, pid/tmp).",
133+
]);
134+
});
135+
});
136+
137+
describe("isTarEofRaceError", () => {
138+
const { isTarEofRaceError } = backupCreateInternals;
139+
140+
it.each([
141+
"did not encounter expected EOF",
142+
"encountered unexpected EOF",
143+
"TAR_BAD_ARCHIVE: Unrecognized archive format",
144+
"Truncated input (needed 512 more bytes, only 0 available) (TAR_BAD_ARCHIVE)",
145+
])("matches tar-specific EOF-class error: %s", (message) => {
146+
expect(isTarEofRaceError(new Error(message))).toBe(true);
147+
});
148+
149+
it("matches errors by code even when the message is empty", () => {
150+
expect(isTarEofRaceError(Object.assign(new Error(""), { code: "EOF" }))).toBe(true);
151+
});
152+
153+
it.each([
154+
"EOF occurred in violation of protocol",
155+
"unexpected eof while reading",
156+
"ran out of EOF markers",
157+
"permission denied",
158+
"",
159+
])("does not match unrelated errors: %s", (message) => {
160+
expect(isTarEofRaceError(new Error(message))).toBe(false);
161+
});
162+
163+
it("rejects non-object inputs", () => {
164+
expect(isTarEofRaceError(null)).toBe(false);
165+
expect(isTarEofRaceError(undefined)).toBe(false);
166+
expect(isTarEofRaceError("did not encounter expected EOF")).toBe(false);
167+
});
168+
});
169+
170+
describe("writeTarArchiveWithRetry", () => {
171+
it("retries on EOF-class errors and eventually succeeds", async () => {
172+
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
173+
path: "/state/sessions/s-abc/transcript.jsonl",
174+
});
175+
const runTar = vi
176+
.fn<() => Promise<void>>()
177+
.mockRejectedValueOnce(eofErr)
178+
.mockRejectedValueOnce(eofErr)
179+
.mockResolvedValueOnce(undefined);
180+
const log = vi.fn();
181+
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
182+
183+
await backupCreateInternals.writeTarArchiveWithRetry({
184+
tempArchivePath: "/tmp/backup.tar.gz.tmp",
185+
runTar,
186+
log,
187+
sleepMs: sleep,
188+
});
189+
190+
expect(runTar).toHaveBeenCalledTimes(3);
191+
expect(sleep).toHaveBeenNthCalledWith(1, 10_000);
192+
expect(sleep).toHaveBeenNthCalledWith(2, 20_000);
193+
expect(log).toHaveBeenCalledTimes(2);
194+
});
195+
196+
it("surfaces the offending path and attempt count after exhausting retries", async () => {
197+
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
198+
path: "/state/logs/gateway.jsonl",
199+
});
200+
const runTar = vi.fn<() => Promise<void>>().mockRejectedValue(eofErr);
201+
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
202+
203+
await expect(
204+
backupCreateInternals.writeTarArchiveWithRetry({
205+
tempArchivePath: "/tmp/backup.tar.gz.tmp",
206+
runTar,
207+
sleepMs: sleep,
208+
}),
209+
).rejects.toThrow(/last offending path: \/state\/logs\/gateway\.jsonl, after 3 attempts/);
210+
expect(runTar).toHaveBeenCalledTimes(3);
211+
});
212+
213+
it("lets callers reset per-attempt counters so retries report the final attempt's count, not a running sum", async () => {
214+
// Simulate the caller's pattern: a closure counter populated by a filter
215+
// that tar.c invokes while walking the tree. Each attempt re-walks the
216+
// same tree, so the runTar closure must reset the counter before calling
217+
// tar.c -- otherwise the reported count accumulates across attempts.
218+
let skippedVolatileCount = 0;
219+
const volatileFilesSeenPerAttempt = 5;
220+
let attempt = 0;
221+
222+
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
223+
path: "/state/sessions/s-abc/transcript.jsonl",
224+
});
225+
226+
const runTar = vi.fn<() => Promise<void>>().mockImplementation(async () => {
227+
attempt += 1;
228+
skippedVolatileCount = 0;
229+
for (let i = 0; i < volatileFilesSeenPerAttempt; i += 1) {
230+
skippedVolatileCount += 1;
231+
}
232+
if (attempt < 3) {
233+
throw eofErr;
234+
}
235+
});
236+
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
237+
238+
await backupCreateInternals.writeTarArchiveWithRetry({
239+
tempArchivePath: "/tmp/backup.tar.gz.tmp",
240+
runTar,
241+
sleepMs: sleep,
242+
});
243+
244+
expect(runTar).toHaveBeenCalledTimes(3);
245+
// Without the reset, this would be 15 (5 * 3 attempts). With the reset,
246+
// it equals the count from the final (successful) attempt.
247+
expect(skippedVolatileCount).toBe(volatileFilesSeenPerAttempt);
248+
});
249+
250+
it("does not retry on non-EOF errors", async () => {
251+
const runTar = vi.fn<() => Promise<void>>().mockRejectedValue(new Error("permission denied"));
252+
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
253+
254+
await expect(
255+
backupCreateInternals.writeTarArchiveWithRetry({
256+
tempArchivePath: "/tmp/backup.tar.gz.tmp",
257+
runTar,
258+
sleepMs: sleep,
259+
}),
260+
).rejects.toThrow(/permission denied/);
261+
expect(runTar).toHaveBeenCalledTimes(1);
262+
expect(sleep).not.toHaveBeenCalled();
263+
});
109264
});
110265

111266
describe("buildExtensionsNodeModulesFilter", () => {
@@ -131,6 +286,65 @@ describe("buildExtensionsNodeModulesFilter", () => {
131286
});
132287

133288
describe("createBackupArchive", () => {
289+
it("skips current live volatile state files while preserving workspace locks", async () => {
290+
await withOpenClawTestState(
291+
{
292+
layout: "split",
293+
prefix: "openclaw-backup-volatile-",
294+
scenario: "minimal",
295+
},
296+
async (state) => {
297+
const outputDir = state.path("backups");
298+
await state.writeConfig({
299+
agents: {
300+
list: [{ id: "main", default: true, workspace: state.workspaceDir }],
301+
},
302+
});
303+
await fs.mkdir(outputDir, { recursive: true });
304+
await fs.writeFile(path.join(state.workspaceDir, "Cargo.lock"), "workspace lock\n", "utf8");
305+
await fs.writeFile(
306+
path.join(state.workspaceDir, "pending.tmp"),
307+
"workspace temp fixture\n",
308+
"utf8",
309+
);
310+
await state.writeText("agents/main/sessions/live-session.jsonl", "session\n");
311+
await state.writeText("sessions/legacy-session.jsonl", "legacy session\n");
312+
await state.writeText("cron/runs/nightly.jsonl", "cron\n");
313+
await state.writeText("logs/gateway.log", "log\n");
314+
await state.writeJson("delivery-queue/message.json", { id: "delivery" });
315+
await state.writeJson("session-delivery-queue/message.json", { id: "session-delivery" });
316+
await state.writeText("tmp/staged.tmp", "tmp\n");
317+
await state.writeText("gateway.pid", "123\n");
318+
319+
const result = await createBackupArchive({
320+
output: outputDir,
321+
includeWorkspace: true,
322+
nowMs: Date.UTC(2026, 4, 9, 8, 0, 0),
323+
});
324+
const entries = await listArchiveEntries(result.archivePath);
325+
326+
expect(entries.some((entry) => entry.endsWith("/workspace/Cargo.lock"))).toBe(true);
327+
expect(entries.some((entry) => entry.endsWith("/workspace/pending.tmp"))).toBe(true);
328+
for (const suffix of [
329+
"/state/agents/main/sessions/live-session.jsonl",
330+
"/state/sessions/legacy-session.jsonl",
331+
"/state/cron/runs/nightly.jsonl",
332+
"/state/logs/gateway.log",
333+
"/state/delivery-queue/message.json",
334+
"/state/session-delivery-queue/message.json",
335+
"/state/tmp/staged.tmp",
336+
"/state/gateway.pid",
337+
]) {
338+
expect(
339+
entries.some((entry) => entry.endsWith(suffix)),
340+
suffix,
341+
).toBe(false);
342+
}
343+
expect(result.skippedVolatileCount).toBe(8);
344+
},
345+
);
346+
});
347+
134348
it("omits installed plugin node_modules from the real archive while keeping plugin files", async () => {
135349
await withOpenClawTestState(
136350
{

0 commit comments

Comments
 (0)