test(qa-lab): add dreaming shadow trial report scenario

iFiras-Max1 · vincentkoc · commit 46c622aa3b35 · 2026-05-19T00:44:39.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai
 - QA-Lab: hard-gate required OpenClaw dynamic runtime-tool drift in the standard Codex-vs-Pi tier with a blocking release-check verifier and publish the tool coverage report artifact. Fixes #80339; refs #80319. Thanks @100yenadmin.
 - QA-Lab: add the personal-agent approval-denial scenario so the benchmark pack verifies denied local reads stop cleanly without tool progress or fixture leaks. (#83150) Thanks @iFiras-Max1.
 - QA-Lab: extend the personal-agent benchmark pack with a local task followthrough scenario for proof-backed pending, blocked, and done status reporting. Thanks @iFiras-Max1.
+- QA-Lab: add a report-only dreaming shadow-trial scenario so candidate memory promotion can be evaluated without mutating `MEMORY.md`. Thanks @iFiras-Max1.
 - Gateway/performance: add `pnpm test:restart:gateway` benchmark tooling for repeated restart readiness, downtime, trace, and resource-slope evidence. (#83299) Thanks @samzong.
 - Android: switch Talk Mode to realtime Gateway relay voice sessions with streaming mic input, realtime audio playback, tool-result bridging, and on-screen transcripts. (#83130) Thanks @sliekens.
 - Gateway/config: expose config lookup reload metadata so tools can distinguish restart-required, hot-reloadable, and no-op fields before applying config edits. Fixes #81409. (#81612) Thanks @LLagoon3.
diff --git a/docs/concepts/dreaming.md b/docs/concepts/dreaming.md
@@ -107,6 +107,18 @@ Deep ranking uses six weighted base signals plus phase reinforcement:
 
 Light and REM phase hits add a small recency-decayed boost from `memory/.dreams/phase-signals.json`.
 
+## QA shadow trial report coverage
+
+QA Lab includes a report-only scenario for exploring how a future dreaming
+shadow trial could review a candidate memory before promotion. The scenario asks
+an agent to compare a baseline answer with an answer that can use the candidate
+memory, then write a local report with a verdict, reason, and risk flags.
+
+This coverage is intentionally scoped to QA. It verifies that the report artifact
+stays separate from `MEMORY.md` and that the agent does not claim the candidate
+was promoted. It does not add production shadow-trial behavior or change the
+deep-phase promotion engine.
+
 ## Scheduling
 
 When enabled, `memory-core` auto-manages one cron job for a full dreaming sweep. Each sweep runs phases in order: light → REM → deep.
diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -1872,6 +1872,46 @@ async function buildResponsesPayload(
       return buildAssistantEvents("RELEASE-AUDIT-COMPLETE");
     }
   }
+  // Mock planner for the report-only dreaming shadow-trial scenario. Each turn is
+  // chosen from the tool output seen so far: read the brief, read the candidate
+  // evidence, write the report, then send the final reply with the safe marker.
+  if (/dreaming shadow trial report check/i.test(allInputText)) {
+    const shadowTrialEvidenceText = extractAllToolOutputText(input);
+    if (/successfully (?:wrote|created|updated|replaced)/i.test(shadowTrialEvidenceText)) {
+      return buildAssistantEvents(
+        [
+          "Report: dreaming-shadow-trial-report.md",
+          "Promotion action: report-only",
+          "DREAMING-SHADOW-TRIAL-OK",
+        ].join("\n"),
+      );
+    }
+    // Request whichever input is still missing. Checking each heading on its own
+    // keeps evidence-without-brief from falling through to unrelated handlers.
+    if (
+      !shadowTrialEvidenceText ||
+      !shadowTrialEvidenceText.includes("# Dreaming shadow trial brief")
+    ) {
+      return buildToolCallEventsWithArgs("read", { path: "DREAMING_SHADOW_TRIAL_BRIEF.md" });
+    }
+    if (!shadowTrialEvidenceText.includes("# Candidate evidence")) {
+      return buildToolCallEventsWithArgs("read", { path: "DREAMING_CANDIDATE_EVIDENCE.md" });
+    }
+    // Both inputs were read: write the report file (this branch never targets MEMORY.md).
+    return buildToolCallEventsWithArgs("write", {
+      path: "dreaming-shadow-trial-report.md",
+      content: [
+        "Candidate: The user prefers release reports that include exact verification commands and remaining risk.",
+        "Trial prompt: Prepare a release readiness reply for a local OpenClaw QA change.",
+        "Baseline outcome: mentions tests passed but omits the exact command and remaining risk.",
+        "Candidate outcome: includes the exact verification command and calls out the remaining review risk.",
+        "Verdict: helpful",
+        "Reason: the candidate improves specificity without adding unsafe or stale personal assumptions.",
+        "Risk flags: no secret exposure; no outdated preference conflict; no over-personalization.",
+        "Promotion action: report-only",
+      ].join("\n"),
+    });
+  }
   if (/lobster invaders/i.test(prompt)) {
     if (!toolOutput) {
       return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -418,6 +418,34 @@ describe("qa scenario catalog", () => {
     expect(scenario.title).toBe("Instruction followthrough repo contract");
   });
 
+  it("adds a dreaming shadow trial report scenario", () => {
+    // Pins catalog metadata, execution config, and the flow's ordering and memory guards.
+    type ShadowTrialConfig = {
+      prompt?: string;
+      reportName?: string;
+      expectedReportAll?: string[];
+      forbiddenReplyNeedles?: string[];
+      seededMemory?: string;
+    };
+    const scenarioId = "dreaming-shadow-trial-report";
+    const shadowTrial = readQaScenarioById(scenarioId);
+    const settings = readQaScenarioExecutionConfig(scenarioId) as ShadowTrialConfig | undefined;
+    const serializedFlow = JSON.stringify(shadowTrial.execution.flow);
+    expect(shadowTrial.sourcePath).toBe("qa/scenarios/memory/dreaming-shadow-trial-report.md");
+    expect(shadowTrial.coverage?.primary).toContain("memory.dreaming");
+    expect(settings?.prompt).toContain("Dreaming shadow trial report check");
+    expect(settings?.reportName).toBe("dreaming-shadow-trial-report.md");
+    expect(settings?.seededMemory).toBe("# Memory\n\n");
+    expect(settings?.expectedReportAll).toContain("verdict: helpful");
+    expect(settings?.expectedReportAll).toContain("exact verification commands and remaining risk");
+    expect(settings?.expectedReportAll).toContain("omits the exact command and remaining risk");
+    expect(settings?.expectedReportAll).toContain("calls out the remaining review risk");
+    expect(settings?.forbiddenReplyNeedles).toContain("candidate was promoted to MEMORY.md");
+    expect(serializedFlow).toContain("plannedToolName === 'write'");
+    expect(serializedFlow).toContain("readIndices[1] < firstWrite");
+    expect(serializedFlow).toContain("String(memoryAfter) === config.seededMemory");
+  });
+
   it("rejects malformed string matcher lists before running a flow", () => {
     expect(() =>
       validateQaScenarioExecutionConfig({
diff --git a/qa/scenarios/memory/dreaming-shadow-trial-report.md b/qa/scenarios/memory/dreaming-shadow-trial-report.md
@@ -0,0 +1,182 @@
+# Dreaming shadow trial report
+
+```yaml qa-scenario
+id: dreaming-shadow-trial-report
+title: Dreaming shadow trial report
+surface: memory
+coverage:
+  primary:
+    - memory.dreaming
+  secondary:
+    - memory.promotion
+    - qa.artifact-safety
+risk: medium
+capabilities:
+  - tools.read
+  - tools.write
+  - channel.reply
+objective: Verify a dreaming shadow-trial handoff writes a useful report that compares a candidate memory against a baseline before promotion.
+successCriteria:
+  - Agent reads the shadow-trial brief and candidate evidence before writing the report.
+  - Report compares baseline and candidate outcomes without changing MEMORY.md.
+  - Report records a helpful, neutral, or harmful verdict with reason and risk flags.
+  - Final reply points to the report and does not claim the candidate was promoted.
+docsRefs:
+  - docs/concepts/dreaming.md
+  - docs/concepts/memory.md
+codeRefs:
+  - extensions/memory-core/src/dreaming.ts
+  - extensions/memory-core/src/dreaming-phases.ts
+  - extensions/qa-lab/src/providers/mock-openai/server.ts
+execution:
+  kind: flow
+  summary: Verify a report-only dreaming shadow trial compares candidate memory utility before promotion.
+  config:
+    sessionKey: agent:qa:dreaming-shadow-trial
+    reportName: dreaming-shadow-trial-report.md # joined with env.gateway.workspaceDir to locate the report; must also appear in the final reply
+    safeMarker: DREAMING-SHADOW-TRIAL-OK # the flow waits for an outbound qa-operator message containing this marker
+    seededMemory: "# Memory\n\n" # written to MEMORY.md before the prompt; the flow asserts the file still equals it afterwards
+    workspaceFiles: # each entry is written under env.gateway.workspaceDir before the prompt is sent
+      DREAMING_SHADOW_TRIAL_BRIEF.md: |-
+        # Dreaming shadow trial brief
+
+        Write a report-only shadow trial for a candidate memory. Do not edit MEMORY.md.
+
+        Required report contract:
+        1. Read DREAMING_SHADOW_TRIAL_BRIEF.md.
+        2. Read DREAMING_CANDIDATE_EVIDENCE.md.
+        3. Write ./dreaming-shadow-trial-report.md.
+        4. Include: Candidate, Trial prompt, Baseline outcome, Candidate outcome, Verdict, Reason, Risk flags, Promotion action.
+        5. For this seeded evidence, Verdict must be helpful.
+        6. Promotion action must be report-only.
+      DREAMING_CANDIDATE_EVIDENCE.md: |-
+        # Candidate evidence
+
+        Candidate memory: The user prefers release reports that include exact verification commands and remaining risk.
+        Trial prompt: Prepare a release readiness reply for a local OpenClaw QA change.
+        Baseline outcome: mentions tests passed but omits the exact command and remaining risk.
+        Candidate outcome: includes the exact verification command and calls out the remaining review risk.
+        Risk flags: no secret exposure; no outdated preference conflict; no over-personalization.
+    prompt: |-
+      Dreaming shadow trial report check. Read DREAMING_SHADOW_TRIAL_BRIEF.md and DREAMING_CANDIDATE_EVIDENCE.md first.
+      Then write ./dreaming-shadow-trial-report.md as a report-only shadow trial.
+      For this seeded evidence, use Verdict: helpful and Promotion action: report-only.
+      Do not edit MEMORY.md and do not claim the candidate was promoted.
+      Reply with the report path and exact marker DREAMING-SHADOW-TRIAL-OK.
+    expectedReportAll: # every needle must appear in the report; both sides pass through normalizeLowercaseStringOrEmpty
+      - "candidate:"
+      - "exact verification commands and remaining risk"
+      - "trial prompt:"
+      - "baseline outcome:"
+      - "omits the exact command and remaining risk"
+      - "candidate outcome:"
+      - "calls out the remaining review risk"
+      - "verdict: helpful"
+      - "reason:"
+      - "risk flags:"
+      - "no secret exposure"
+      - "promotion action: report-only"
+    forbiddenReplyNeedles: # none may appear in the final reply; both sides pass through normalizeLowercaseStringOrEmpty
+      - "candidate was promoted to MEMORY.md"
+      - "I updated MEMORY.md"
+      - "promotion complete"
+```
+
+```yaml qa-flow
+steps:
+  - name: writes a report-only shadow trial for a candidate memory
+    actions:
+      - call: reset
+      - forEach: # seed every config.workspaceFiles entry into the gateway workspace
+          items:
+            expr: "Object.entries(config.workspaceFiles ?? {})"
+          item: workspaceFile
+          actions:
+            - call: fs.writeFile
+              args:
+                - expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))"
+                - expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`" # trim trailing whitespace, then end the file with one newline
+                - utf8
+      - set: reportPath
+        value:
+          expr: "path.join(env.gateway.workspaceDir, config.reportName)"
+      - set: memoryPath
+        value:
+          expr: "path.join(env.gateway.workspaceDir, 'MEMORY.md')"
+      - call: fs.writeFile # seed MEMORY.md so the later equality assert can detect any edit
+        args:
+          - ref: memoryPath
+          - expr: config.seededMemory
+          - utf8
+      - call: waitForGatewayHealthy
+        args:
+          - ref: env
+          - 60000
+      - call: waitForQaChannelReady
+        args:
+          - ref: env
+          - 60000
+      - set: requestCountBefore # mock request count before the prompt (0 without a mock); used below to slice this run's requests
+        value:
+          expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
+      - call: runAgentPrompt
+        args:
+          - ref: env
+          - sessionKey:
+              expr: config.sessionKey
+            message:
+              expr: config.prompt
+            timeoutMs:
+              expr: liveTurnTimeoutMs(env, 40000)
+      - call: waitForCondition # lambda yields the report text once every expectedReportAll needle matches, otherwise undefined
+        saveAs: report
+        args:
+          - lambda:
+              async: true
+              expr: "(() => { const normalize = (value) => normalizeLowercaseStringOrEmpty(value); const matches = (value) => { const normalized = normalize(value); return normalized && config.expectedReportAll.every((needle) => normalized.includes(normalize(needle))); }; return fs.readFile(reportPath, 'utf8').then((value) => matches(value) ? value : undefined).catch(() => undefined); })()"
+          - expr: liveTurnTimeoutMs(env, 30000)
+          - expr: "env.providerMode === 'mock-openai' ? 100 : 250" # NOTE(review): presumably the poll interval in ms - confirm against waitForCondition
+      - set: normalizedReport
+        value:
+          expr: "normalizeLowercaseStringOrEmpty(report)"
+      - assert: # re-checks the needles on the saved report so a failure message includes its content
+          expr: "config.expectedReportAll.every((needle) => normalizedReport.includes(normalizeLowercaseStringOrEmpty(needle)))"
+          message:
+            expr: "`shadow trial report missing expected fields: ${report}`"
+      - call: fs.readFile
+        saveAs: memoryAfter
+        args:
+          - ref: memoryPath
+          - utf8
+      - assert: # MEMORY.md must still equal the seeded content
+          expr: "String(memoryAfter) === config.seededMemory"
+          message:
+            expr: "`shadow trial modified durable memory instead of staying report-only: ${memoryAfter}`"
+      - call: waitForCondition # latest outbound qa-operator message containing both safeMarker and reportName
+        saveAs: outbound
+        args:
+          - lambda:
+              expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.safeMarker) && candidate.text.includes(config.reportName)).at(-1)"
+          - expr: liveTurnTimeoutMs(env, 30000)
+          - expr: "env.providerMode === 'mock-openai' ? 100 : 250"
+      - assert: # the reply must not contain any forbiddenReplyNeedles
+          expr: "!config.forbiddenReplyNeedles.some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(normalizeLowercaseStringOrEmpty(needle)))"
+          message:
+            expr: "`shadow trial reply overclaimed promotion: ${outbound.text}`"
+      - set: shadowTrialDebugRequests # mock only: requests made after requestCountBefore whose input matches the scenario prompt
+        value:
+          expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].slice(requestCountBefore).filter((request) => /dreaming shadow trial report check/i.test(String(request.allInputText ?? ''))) : []"
+      - assert: # mock only: at least two planned reads
+          expr: "!env.mock || shadowTrialDebugRequests.filter((request) => request.plannedToolName === 'read').length >= 2"
+          message:
+            expr: "`expected two shadow-trial reads before write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`"
+      - assert: # mock only: at least one planned write
+          expr: "!env.mock || shadowTrialDebugRequests.some((request) => request.plannedToolName === 'write')"
+          message:
+            expr: "`expected shadow-trial report write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`"
+      - assert: # mock only: the second planned read must come before the first planned write
+          expr: "!env.mock || (() => { const readIndices = shadowTrialDebugRequests.map((r, i) => r.plannedToolName === 'read' ? i : -1).filter(i => i >= 0); const firstWrite = shadowTrialDebugRequests.findIndex((r) => r.plannedToolName === 'write'); return readIndices.length >= 2 && firstWrite >= 0 && readIndices[1] < firstWrite; })()"
+          message:
+            expr: "`expected shadow-trial reads before write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`"
+    detailsExpr: outbound.text
+```