Skip to content

Commit cf06578

Browse files
committed
feat(qa-lab): add jsonl replay harness
1 parent 66dcc4e commit cf06578

14 files changed

Lines changed: 641 additions & 0 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ Docs: https://docs.openclaw.ai
99
- Gateway/plugins: reuse a compatible Gateway startup plugin registry during dispatch so safe plugin dispatches avoid redundant registry loading. (#84324) Thanks @ai-hpc.
1010
- Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
1111
- Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
12+
- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
1213
- Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn.
1314

1415
### Fixes

extensions/qa-lab/src/cli.runtime.test.ts

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@ import {
7676
runQaDockerUpCommand,
7777
runQaCharacterEvalCommand,
7878
runQaCoverageReportCommand,
79+
runQaJsonlReplayCommand,
7980
runQaManualLaneCommand,
8081
runQaParityReportCommand,
8182
runQaSuiteCommand,
@@ -1084,6 +1085,44 @@ describe("qa cli runtime", () => {
10841085
expectWriteContains(stdoutWrite, "codex-native-workspace");
10851086
});
10861087

1088+
it("writes a curated mock JSONL replay report and summary", async () => {
1089+
const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-jsonl-replay-cli-"));
1090+
try {
1091+
await runQaJsonlReplayCommand({
1092+
repoRoot,
1093+
transcripts: path.resolve("qa/scenarios/jsonl-replay"),
1094+
outputDir: "jsonl-output",
1095+
runtimePair: "pi,codex",
1096+
});
1097+
1098+
const report = await fs.readFile(
1099+
path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-report.md"),
1100+
"utf8",
1101+
);
1102+
const summary = JSON.parse(
1103+
await fs.readFile(
1104+
path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-summary.json"),
1105+
"utf8",
1106+
),
1107+
) as { transcripts?: Array<{ userTurnCount?: number }> };
1108+
1109+
expect(report).toContain("# OpenClaw JSONL Replay Report - pi vs codex");
1110+
expect(report).toContain("| plan-mode-boundaries.jsonl | 3 | | none, none, none |");
1111+
expect(summary.transcripts).toHaveLength(7);
1112+
} finally {
1113+
await fs.rm(repoRoot, { recursive: true, force: true });
1114+
}
1115+
});
1116+
1117+
it("keeps JSONL replay mock-only until real runtime cell replay is wired", async () => {
1118+
await expect(
1119+
runQaJsonlReplayCommand({
1120+
repoRoot: process.cwd(),
1121+
providerMode: "live-frontier",
1122+
}),
1123+
).rejects.toThrow("qa jsonl-replay currently supports mock-openai curated fixtures only.");
1124+
});
1125+
10871126
it("exits nonzero when tool coverage summary is missing a required runtime tool call", async () => {
10881127
const priorExitCode = process.exitCode;
10891128
const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-tool-coverage-"));

extensions/qa-lab/src/cli.runtime.ts

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,12 @@ import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./cove
1616
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
1717
import { runQaDockerUp } from "./docker-up.runtime.js";
1818
import type { QaCliBackendAuthMode } from "./gateway-child.js";
19+
import {
20+
createMockJsonlReplayCellRunner,
21+
renderJsonlReplayMarkdownReport,
22+
runJsonlReplay,
23+
type JsonlReplayInput,
24+
} from "./jsonl-replay.js";
1925
import { startQaLabServer } from "./lab-server.js";
2026
import { runQaManualLane } from "./manual-lane.runtime.js";
2127
import { runQaMultipass } from "./multipass.runtime.js";
@@ -820,6 +826,50 @@ export async function runQaCoverageReportCommand(opts: {
820826
process.stdout.write(body);
821827
}
822828

829+
export async function runQaJsonlReplayCommand(opts: {
830+
repoRoot?: string;
831+
transcripts?: string;
832+
outputDir?: string;
833+
runtimePair?: string;
834+
providerMode?: QaProviderModeInput;
835+
}) {
836+
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
837+
const runtimePair = parseQaRuntimePair(opts.runtimePair) ?? ["pi", "codex"];
838+
if (runtimePair[0] !== "pi" || runtimePair[1] !== "codex") {
839+
throw new Error('--runtime-pair for jsonl-replay must be "pi,codex".');
840+
}
841+
const providerMode = normalizeQaProviderMode(opts.providerMode ?? "mock-openai");
842+
if (providerMode !== "mock-openai") {
843+
throw new Error("qa jsonl-replay currently supports mock-openai curated fixtures only.");
844+
}
845+
const transcriptDir = path.resolve(repoRoot, opts.transcripts ?? "qa/scenarios/jsonl-replay");
846+
const outputDir =
847+
resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
848+
path.join(repoRoot, ".artifacts", "qa-e2e", `jsonl-replay-${Date.now().toString(36)}`);
849+
await fs.mkdir(outputDir, { recursive: true });
850+
const result = await runJsonlReplay(
851+
{
852+
directory: transcriptDir,
853+
runtimePair: runtimePair as JsonlReplayInput["runtimePair"],
854+
providerMode,
855+
},
856+
{ runCell: createMockJsonlReplayCellRunner() },
857+
);
858+
const reportPayload = {
859+
generatedAt: new Date().toISOString(),
860+
providerMode,
861+
runtimePair: runtimePair as JsonlReplayInput["runtimePair"],
862+
transcripts: result.transcripts,
863+
};
864+
const report = renderJsonlReplayMarkdownReport(reportPayload);
865+
const reportPath = path.join(outputDir, "qa-jsonl-replay-report.md");
866+
const summaryPath = path.join(outputDir, "qa-jsonl-replay-summary.json");
867+
await fs.writeFile(reportPath, report, "utf8");
868+
await fs.writeFile(summaryPath, `${JSON.stringify(result, null, 2)}\n`, "utf8");
869+
process.stdout.write(`QA JSONL replay report: ${reportPath}\n`);
870+
process.stdout.write(`QA JSONL replay summary: ${summaryPath}\n`);
871+
}
872+
823873
export async function runQaCharacterEvalCommand(opts: {
824874
repoRoot?: string;
825875
outputDir?: string;

extensions/qa-lab/src/cli.test.ts

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ const {
4545
runQaCredentialsListCommand,
4646
runQaCredentialsRemoveCommand,
4747
runQaCoverageReportCommand,
48+
runQaJsonlReplayCommand,
4849
runQaProviderServerCommand,
4950
runQaSuiteCommand,
5051
runQaTelegramCommand,
@@ -58,6 +59,7 @@ const {
5859
runQaCredentialsListCommand: vi.fn(),
5960
runQaCredentialsRemoveCommand: vi.fn(),
6061
runQaCoverageReportCommand: vi.fn(),
62+
runQaJsonlReplayCommand: vi.fn(),
6163
runQaProviderServerCommand: vi.fn(),
6264
runQaSuiteCommand: vi.fn(),
6365
runQaTelegramCommand: vi.fn(),
@@ -113,6 +115,7 @@ vi.mock("./cli.runtime.js", () => ({
113115
runQaCredentialsListCommand,
114116
runQaCredentialsRemoveCommand,
115117
runQaCoverageReportCommand,
118+
runQaJsonlReplayCommand,
116119
runQaProviderServerCommand,
117120
runQaSuiteCommand,
118121
}));
@@ -128,6 +131,7 @@ describe("qa cli registration", () => {
128131
runQaCredentialsListCommand.mockReset();
129132
runQaCredentialsRemoveCommand.mockReset();
130133
runQaCoverageReportCommand.mockReset();
134+
runQaJsonlReplayCommand.mockReset();
131135
runQaProviderServerCommand.mockReset();
132136
runQaSuiteCommand.mockReset();
133137
runQaTelegramCommand.mockReset();
@@ -480,6 +484,33 @@ describe("qa cli registration", () => {
480484
});
481485
});
482486

487+
it("routes JSONL replay flags into the qa runtime command", async () => {
488+
await program.parseAsync([
489+
"node",
490+
"openclaw",
491+
"qa",
492+
"jsonl-replay",
493+
"--repo-root",
494+
"/tmp/openclaw-repo",
495+
"--transcripts",
496+
"qa/scenarios/jsonl-replay",
497+
"--runtime-pair",
498+
"pi,codex",
499+
"--provider-mode",
500+
"mock-openai",
501+
"--output-dir",
502+
".artifacts/qa-e2e/jsonl-replay-test",
503+
]);
504+
505+
expect(runQaJsonlReplayCommand).toHaveBeenCalledWith({
506+
repoRoot: "/tmp/openclaw-repo",
507+
transcripts: "qa/scenarios/jsonl-replay",
508+
runtimePair: "pi,codex",
509+
providerMode: "mock-openai",
510+
outputDir: ".artifacts/qa-e2e/jsonl-replay-test",
511+
});
512+
});
513+
483514
it("delegates discovered qa runner registration through the generic host seam", () => {
484515
const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
485516
expect(registration.register).toHaveBeenCalledTimes(1);

extensions/qa-lab/src/cli.ts

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,17 @@ async function runQaCoverageReport(opts: {
8383
await runtime.runQaCoverageReportCommand(opts);
8484
}
8585

86+
async function runQaJsonlReplay(opts: {
87+
repoRoot?: string;
88+
transcripts?: string;
89+
outputDir?: string;
90+
runtimePair?: string;
91+
providerMode?: QaProviderModeInput;
92+
}) {
93+
const runtime = await loadQaLabCliRuntime();
94+
await runtime.runQaJsonlReplayCommand(opts);
95+
}
96+
8697
async function runQaCharacterEval(opts: {
8798
repoRoot?: string;
8899
outputDir?: string;
@@ -402,6 +413,33 @@ export function registerQaLabCli(program: Command) {
402413
},
403414
);
404415

416+
qa.command("jsonl-replay")
417+
.description("Replay curated JSONL transcripts through the runtime parity replay harness")
418+
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
419+
.option(
420+
"--transcripts <path>",
421+
"Directory of curated JSONL transcripts",
422+
"qa/scenarios/jsonl-replay",
423+
)
424+
.option("--runtime-pair <pair>", "Runtime pair label, e.g. pi,codex", "pi,codex")
425+
.option(
426+
"--provider-mode <mode>",
427+
`Provider mode (${formatQaProviderModeHelp()})`,
428+
"mock-openai",
429+
)
430+
.option("--output-dir <path>", "Artifact directory for the JSONL replay report")
431+
.action(
432+
async (opts: {
433+
repoRoot?: string;
434+
transcripts?: string;
435+
runtimePair?: string;
436+
providerMode?: QaProviderModeInput;
437+
outputDir?: string;
438+
}) => {
439+
await runQaJsonlReplay(opts);
440+
},
441+
);
442+
405443
qa.command("character-eval")
406444
.description("Run the character QA scenario across live models and write a judged report")
407445
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")

0 commit comments

Comments
 (0)