test(qa): require channel scenario markers

vincentkoc · vincentkoc · commit a9f099d279a6 · 2026-06-03T14:27:25.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -67,6 +67,7 @@ Docs: https://docs.openclaw.ai
 - Release/CI/E2E: require the Kitchen Sink RPC walk to prove every expected plugin tool is cataloged and effective before invoking tool fixtures.
 - Release/CI/E2E: stop tracked Docker build commands when centralized build wrappers receive shutdown signals.
 - Release/CI/E2E: cover MCP channel pairing reconnects by asserting the same temporary client state is reused across reconnects.
+- Release/CI/E2E: require QA channel baseline and reconnect scenarios to assert their scenario markers instead of accepting any outbound reply.
 - Release/CI/E2E: fail secret-provider proof runs when temporary state cleanup still fails after retries instead of hiding the cleanup error.
 - Release/CI/E2E: fail package-candidate ref proofs when temporary source worktree cleanup fails instead of leaving stale worktrees behind.
 - Release/CI/E2E: remove package tarball extract directories when tar extraction fails before validation can continue.
diff --git a/extensions/qa-lab/src/scenario-flow-runner.test.ts b/extensions/qa-lab/src/scenario-flow-runner.test.ts
@@ -1,7 +1,98 @@
 import { describe, expect, it } from "vitest";
 import { createQaBusState } from "./bus-state.js";
+import { readQaScenarioById } from "./scenario-catalog.js";
 import { runScenarioFlow } from "./scenario-flow-runner.js";
 
+type QaFlowStep = { // Shape of the step objects that the stubbed runScenario in this file iterates over.
+  name: string; // Copied into each step result as `name`.
+  run: () => Promise<string | void>; // Awaited once per step; a defined result is recorded as `details`.
+};
+
+function formatTestTranscript(state: ReturnType<typeof createQaBusState>) { // Renders every snapshot message as text; takes only `state`, so any options argument a flow passes is ignored here.
+  return state
+    .getSnapshot()
+    .messages.map((message) => `${message.direction}:${message.conversation.id}:${message.text}`) // One entry per message: direction:conversationId:text.
+    .join("\n"); // One message per line.
+}
+
+async function runLoadedScenarioFlow( // Loads a scenario by id and runs its flow through runScenarioFlow against the stubbed api object built below.
+  scenarioId: string,
+  params: {
+    onWaitForOutboundMessage?: (params: { // Optional hook called on every waitForOutboundMessage call, before the snapshot is searched.
+      waitCount: number; // 1-based count of waitForOutboundMessage calls so far.
+      state: ReturnType<typeof createQaBusState>; // The state object the flow passed to waitForOutboundMessage.
+    }) => void;
+  } = {},
+) {
+  const scenario = readQaScenarioById(scenarioId);
+  const flow = scenario.execution.flow;
+  if (!flow) { // Fail fast: a scenario without a flow cannot be exercised by this helper.
+    throw new Error(`scenario has no flow: ${scenarioId}`);
+  }
+
+  const state = createQaBusState(); // Fresh bus state per invocation.
+  let waitCount = 0; // Incremented at the start of each waitForOutboundMessage call.
+  const api = {
+    env: {},
+    state,
+    scenario,
+    config: scenario.execution.config ?? {}, // Falls back to an empty config when the scenario declares none.
+    randomUUID: () => "00000000-0000-4000-8000-000000000000", // Always returns the same fixed UUID string.
+    liveTurnTimeoutMs: (_env: unknown, timeoutMs: number) => timeoutMs, // Returns the requested timeout unchanged; env is ignored.
+    waitForGatewayHealthy: async () => undefined, // No-op stub.
+    waitForQaChannelReady: async () => undefined, // No-op stub.
+    waitForNoOutbound: async () => undefined, // No-op stub.
+    sleep: async () => undefined, // No-op stub; resolves without delaying.
+    reset: async () => {
+      state.reset();
+    },
+    resetBus: async () => { // Same effect as `reset` in this harness.
+      state.reset();
+    },
+    runAgentPrompt: async () => undefined, // No-op stub: no agent runs; the tests in this file add outbound messages through the onWaitForOutboundMessage hook.
+    formatTransportTranscript: formatTestTranscript,
+    waitForOutboundMessage: async ( // Single-pass stand-in for waiting: checks the snapshot once, with no polling or delay.
+      stateLocal: ReturnType<typeof createQaBusState>,
+      predicate: (candidate: unknown) => boolean,
+      timeoutMs: number, // Only used in the error text below.
+      options?: { sinceIndex?: number },
+    ) => {
+      waitCount += 1;
+      params.onWaitForOutboundMessage?.({ waitCount, state: stateLocal }); // Lets the test add messages before the search runs.
+      const match = stateLocal
+        .getSnapshot()
+        .messages.slice(options?.sinceIndex ?? 0) // Only messages at or after sinceIndex are considered; defaults to the whole list.
+        .find((candidate) => predicate(candidate));
+      if (match) {
+        return match;
+      }
+      throw new Error(`timed out after ${timeoutMs}ms waiting for outbound marker`); // Thrown immediately on no match; tests assert on this message.
+    },
+    runScenario: async (_name: string, steps: QaFlowStep[]) => { // Runs steps in order; there is no try/catch, so a throwing step rejects the whole run.
+      const stepResults = [];
+      for (const step of steps) {
+        const details = await step.run();
+        stepResults.push({
+          name: step.name,
+          status: "pass" as const,
+          ...(details !== undefined ? { details } : {}), // Omit `details` entirely when the step returned nothing.
+        });
+      }
+      return {
+        name: scenario.title, // Uses the loaded scenario title; the _name argument is ignored.
+        status: "pass" as const,
+        steps: stepResults,
+      };
+    },
+  };
+
+  return await runScenarioFlow({
+    api,
+    scenarioTitle: scenario.title,
+    flow,
+  });
+}
+
 describe("scenario-flow-runner", () => {
   it("supports qaImport inside flow expressions", async () => {
     const result = await runScenarioFlow({
@@ -221,4 +312,78 @@ describe("scenario-flow-runner", () => {
     expect(result.status).toBe("pass");
     expect(result.steps[0]?.details).toBe("QA_CODEX_PLUGIN_TURN_OK");
   });
+
+  it.each([ // One case per baseline scenario: an outbound reply without the expected marker must not satisfy the flow.
+    {
+      scenarioId: "channel-chat-baseline",
+      to: "channel:qa-room",
+      text: "generic shared-channel reply without the required marker",
+    },
+    {
+      scenarioId: "dm-chat-baseline",
+      to: "dm:alice",
+      text: "generic DM reply without the required marker",
+    },
+  ])("rejects unmarked outbound replies for $scenarioId", async ({ scenarioId, to, text }) => {
+    await expect(
+      runLoadedScenarioFlow(scenarioId, {
+        onWaitForOutboundMessage: ({ state }) => { // Adds the unmarked reply on every wait, just before the stub searches the snapshot.
+          state.addOutboundMessage({
+            accountId: "qa-channel",
+            to,
+            text,
+          });
+        },
+      }),
+    ).rejects.toThrow("waiting for outbound marker"); // Substring of the error thrown by the stubbed waitForOutboundMessage.
+  });
+
+  it("rejects reconnect follow-up replies that replay the first marker", async () => {
+    await expect(
+      runLoadedScenarioFlow("qa-channel-reconnect-dedupe", {
+        onWaitForOutboundMessage: ({ waitCount, state }) => {
+          if (waitCount === 1) { // First wait: deliver the first reply (text presumably equals config.firstMarker; verify against the scenario config).
+            state.addOutboundMessage({
+              accountId: "qa-channel",
+              to: "channel:qa-room",
+              text: "RECONNECT-FIRST-OK",
+            });
+            return;
+          }
+          state.addOutboundMessage({ // Later waits: the same first-reply text is added again; no second-marker reply is ever produced.
+            accountId: "qa-channel",
+            to: "channel:qa-room",
+            text: "RECONNECT-FIRST-OK",
+          });
+        },
+      }),
+    ).rejects.toThrow("waiting for outbound marker"); // Substring of the error thrown by the stubbed waitForOutboundMessage.
+  });
+
+  it("rejects reconnect follow-up turns with extra unmarked outbound replies", async () => {
+    await expect(
+      runLoadedScenarioFlow("qa-channel-reconnect-dedupe", {
+        onWaitForOutboundMessage: ({ waitCount, state }) => {
+          if (waitCount === 1) { // First wait: deliver only the first reply.
+            state.addOutboundMessage({
+              accountId: "qa-channel",
+              to: "channel:qa-room",
+              text: "RECONNECT-FIRST-OK",
+            });
+            return;
+          }
+          state.addOutboundMessage({ // Later waits: add the second-marker reply...
+            accountId: "qa-channel",
+            to: "channel:qa-room",
+            text: "RECONNECT-SECOND-OK",
+          });
+          state.addOutboundMessage({ // ...followed by an extra outbound message carrying neither marker text.
+            accountId: "qa-channel",
+            to: "channel:qa-room",
+            text: "unmarked duplicate delivery",
+          });
+        },
+      }),
+    ).rejects.toThrow("exactly one marked post-restart reply"); // Substring of the reconnect scenario's final assert message.
+  });
 });
diff --git a/qa/scenarios/channels/channel-chat-baseline.md b/qa/scenarios/channels/channel-chat-baseline.md
@@ -12,6 +12,7 @@ coverage:
 objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
 successCriteria:
   - Agent replies in the shared channel transcript.
+  - Agent visible reply contains the scenario marker.
   - Agent keeps the conversation scoped to the channel.
   - Agent respects mention-driven group routing semantics.
 docsRefs:
@@ -24,7 +25,8 @@ execution:
   kind: flow
   summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics.
   config:
-    mentionPrompt: "@openclaw explain the QA lab"
+    expectedMarker: QA-CHANNEL-BASELINE-OK
+    mentionPrompt: "@openclaw qa channel baseline marker check. Reply exactly: QA-CHANNEL-BASELINE-OK"
 ```
 
 ```yaml qa-flow
@@ -78,7 +80,14 @@ steps:
           - ref: state
           - lambda:
               params: [candidate]
-              expr: "candidate.conversation.id === 'qa-room' && !candidate.threadId"
+              expr: "candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && candidate.conversation.kind === 'channel' && !candidate.threadId && String(candidate.text ?? '').includes(config.expectedMarker)"
           - expr: liveTurnTimeoutMs(env, 180000)
+      - set: matchingOutbound
+        value:
+          expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && candidate.conversation.kind === 'channel' && String(candidate.text ?? '').includes(config.expectedMarker))"
+      - assert:
+          expr: matchingOutbound.length === 1
+          message:
+            expr: "`expected exactly one channel baseline marker reply, saw ${matchingOutbound.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`"
     detailsExpr: message.text
 ```
diff --git a/qa/scenarios/channels/dm-chat-baseline.md b/qa/scenarios/channels/dm-chat-baseline.md
@@ -12,6 +12,7 @@ coverage:
 objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
 successCriteria:
   - Agent replies in DM without channel routing mistakes.
+  - Agent visible reply contains the scenario marker.
   - Agent explains the QA lab and message bus correctly.
   - Agent keeps the dev C-3PO personality.
 docsRefs:
@@ -24,7 +25,8 @@ execution:
   kind: flow
   summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character.
   config:
-    prompt: "Hello there, who are you?"
+    expectedMarker: QA-DM-BASELINE-OK
+    prompt: "DM baseline marker check. Include exact marker: `QA-DM-BASELINE-OK` and briefly identify the QA lab message bus."
 ```
 
 ```yaml qa-flow
@@ -47,7 +49,14 @@ steps:
           - ref: state
           - lambda:
               params: [candidate]
-              expr: "candidate.conversation.id === 'alice'"
+              expr: "candidate.direction === 'outbound' && candidate.conversation.id === 'alice' && candidate.conversation.kind === 'direct' && String(candidate.text ?? '').includes(config.expectedMarker)"
           - expr: liveTurnTimeoutMs(env, 45000)
+      - set: matchingOutbound
+        value:
+          expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'alice' && candidate.conversation.kind === 'direct' && String(candidate.text ?? '').includes(config.expectedMarker))"
+      - assert:
+          expr: matchingOutbound.length === 1
+          message:
+            expr: "`expected exactly one DM baseline marker reply, saw ${matchingOutbound.length}; transcript=${formatTransportTranscript(state, { conversationId: 'alice' })}`"
     detailsExpr: outbound.text
 ```
diff --git a/qa/scenarios/channels/qa-channel-reconnect-dedupe.md b/qa/scenarios/channels/qa-channel-reconnect-dedupe.md
@@ -64,7 +64,7 @@ steps:
           - ref: state
           - lambda:
               params: [candidate]
-              expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound'"
+              expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound' && String(candidate.text ?? '').includes(config.firstMarker)"
           - expr: liveTurnTimeoutMs(env, 60000)
       - set: beforeRestartCursor
         value:
@@ -80,9 +80,9 @@ steps:
         value:
           expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')"
       - assert:
-          expr: "firstMatchesBeforeFollowup.length === 1"
+          expr: "firstMatchesBeforeFollowup.length === 1 && String(firstMatchesBeforeFollowup[0]?.text ?? '').includes(config.firstMarker)"
           message:
-            expr: "`readiness cycle replayed first reply ${firstMatchesBeforeFollowup.length} times; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`"
+            expr: "`readiness cycle should preserve exactly one marked first reply, saw ${firstMatchesBeforeFollowup.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`"
       - call: runAgentPrompt
         args:
           - ref: env
@@ -99,7 +99,7 @@ steps:
           - ref: state
           - lambda:
               params: [candidate]
-              expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound'"
+              expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound' && String(candidate.text ?? '').includes(config.secondMarker)"
           - expr: liveTurnTimeoutMs(env, 60000)
           - sinceIndex:
               ref: beforeRestartCursor
@@ -108,13 +108,16 @@ steps:
           expr: state.getSnapshot()
       - set: firstMatches
         value:
-          expr: "snapshot.messages.slice(0, beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')"
+          expr: "snapshot.messages.slice(0, beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && String(candidate.text ?? '').includes(config.firstMarker))"
       - set: secondMatches
+        value:
+          expr: "snapshot.messages.slice(beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && String(candidate.text ?? '').includes(config.secondMarker))"
+      - set: postRestartOutbounds
         value:
           expr: "snapshot.messages.slice(beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')"
       - assert:
-          expr: "firstMatches.length === 1 && secondMatches.length === 1"
+          expr: "firstMatches.length === 1 && secondMatches.length === 1 && postRestartOutbounds.length === 1 && !postRestartOutbounds.some((candidate) => String(candidate.text ?? '').includes(config.firstMarker))"
           message:
-            expr: "`expected one pre-restart and one post-restart reply; first=${firstMatches.length} second=${secondMatches.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`"
+            expr: "`expected one marked pre-restart reply and exactly one marked post-restart reply without replaying the first marker; first=${firstMatches.length} second=${secondMatches.length} post=${postRestartOutbounds.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`"
     detailsExpr: "`before=${firstOutbound.text}\\nafter=${secondOutbound.text}`"
 ```