Skip to content

Commit a931884

Browse files
committed
fix(qa): require runtime tool failure proof
1 parent 7a3d24e commit a931884

5 files changed

Lines changed: 409 additions & 3 deletions

File tree

extensions/qa-lab/src/providers/aimock/server.ts

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ type AimockRequestSnapshot = {
1919
plannedToolCallId?: string;
2020
plannedToolName?: string;
2121
toolOutputCallId?: string;
22+
toolOutputStructuredError?: true;
2223
};
2324

2425
function writeJson(res: ServerResponse, status: number, body: unknown) {
@@ -100,6 +101,21 @@ function extractToolOutputCallId(body: ChatCompletionRequest | null | undefined)
100101
return "";
101102
}
102103

104+
/**
 * Reports whether the most recent `role: "tool"` message in the request was
 * explicitly flagged as an error.
 *
 * Messages are walked newest-first; the first one whose role is `"tool"`
 * decides the result. Both the camelCase (`isError`) and snake_case
 * (`is_error`) spellings are accepted, and only a literal `true` counts.
 *
 * @param body - The captured chat-completion request, if any.
 * @returns `true` when the latest tool message carries `isError === true` or
 *   `is_error === true`; `false` otherwise, including when there is no tool
 *   message at all.
 */
function extractToolOutputStructuredError(body: ChatCompletionRequest | null | undefined) {
  // Minimal view of the fields this probe reads off each message.
  type ToolMessageProbe = {
    role?: unknown;
    isError?: unknown;
    is_error?: unknown;
  };

  const messages = requestMessages(body);
  for (let position = messages.length - 1; position >= 0; position -= 1) {
    const candidate = messages[position] as ToolMessageProbe;
    if (candidate?.role !== "tool") {
      continue;
    }
    // Only the newest tool message is consulted; older ones are never inspected.
    return candidate.isError === true || candidate.is_error === true;
  }
  return false;
}
118+
103119
function countImageInputs(value: unknown): number {
104120
if (Array.isArray(value)) {
105121
return value.reduce((sum, entry) => sum + countImageInputs(entry), 0);
@@ -170,6 +186,7 @@ function toRequestSnapshot(entry: JournalEntry): AimockRequestSnapshot {
170186
plannedToolCallId: extractPlannedToolCallId(entry),
171187
plannedToolName: extractPlannedToolName(entry),
172188
toolOutputCallId: extractToolOutputCallId(body) || undefined,
189+
...(extractToolOutputStructuredError(body) ? { toolOutputStructuredError: true } : {}),
173190
};
174191
}
175192

extensions/qa-lab/src/providers/mock-openai/server.test.ts

Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3752,6 +3752,59 @@ describe("qa mock openai server", () => {
37523752
expect(debug.allInputText).toContain("Delegate one bounded QA task");
37533753
});
37543754

3755+
it("exposes structured Anthropic tool_result errors in debug snapshots", async () => {
  // Start a mock server on port 0 and register its shutdown with the suite's cleanups.
  const server = await startQaMockOpenAiServer({ host: "127.0.0.1", port: 0 });
  cleanups.push(async () => {
    await server.stop();
  });

  // One id links the assistant's tool_use block to the failing tool_result.
  const failedToolCallId = "toolu_mock_read_error";
  const anthropicRequest = {
    model: "claude-opus-4-8",
    max_tokens: 256,
    messages: [
      {
        role: "assistant",
        content: [
          {
            type: "tool_use",
            id: failedToolCallId,
            name: "read",
            input: { path: "/missing" },
          },
        ],
      },
      {
        role: "user",
        content: [
          {
            type: "tool_result",
            tool_use_id: failedToolCallId,
            is_error: true,
            content: "ENOENT: no such file or directory",
          },
        ],
      },
    ],
  };

  const response = await fetch(`${server.baseUrl}/v1/messages`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(anthropicRequest),
  });
  expect(response.status).toBe(200);

  // The debug endpoint should report both the call id and the structured error flag.
  const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
  expect(debugResponse.status).toBe(200);
  const debug = (await debugResponse.json()) as {
    toolOutputCallId: string;
    toolOutputStructuredError?: boolean;
  };
  expect(debug.toolOutputCallId).toBe(failedToolCallId);
  expect(debug.toolOutputStructuredError).toBe(true);
});
3807+
37553808
it("streams Anthropic /v1/messages tool_use responses as SSE", async () => {
37563809
const server = await startQaMockOpenAiServer({
37573810
host: "127.0.0.1",

extensions/qa-lab/src/providers/mock-openai/server.ts

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@ type MockOpenAiRequestSnapshot = {
106106
plannedToolName?: string;
107107
plannedToolArgs?: Record<string, unknown>;
108108
toolOutputCallId?: string;
109+
toolOutputStructuredError?: true;
109110
};
110111

111112
// Anthropic /v1/messages request/response shapes the mock actually needs.
@@ -125,6 +126,7 @@ type AnthropicMessageContentBlock =
125126
| {
126127
type: "tool_result";
127128
tool_use_id: string;
129+
is_error?: boolean;
128130
content: string | Array<{ type: "text"; text: string }>;
129131
}
130132
| { type: "image"; source: Record<string, unknown> };
@@ -395,6 +397,13 @@ function extractFunctionCallOutputCallId(item: ResponsesInputItem) {
395397
);
396398
}
397399

400+
/**
 * Tells whether a Responses input item is a `function_call_output` that was
 * explicitly marked as an error.
 *
 * @param item - The input item to inspect.
 * @returns `true` only when `item.type` is `"function_call_output"` and either
 *   `is_error` or `isError` is the literal `true`; `false` for every other item.
 */
function functionCallOutputIsStructuredError(item: ResponsesInputItem) {
  return (
    item.type === "function_call_output" &&
    (item.is_error === true || item.isError === true)
  );
}
406+
398407
function extractToolOutput(input: ResponsesInputItem[]) {
399408
const lastUserIndex = findLastUserIndex(input);
400409
for (let index = input.length - 1; index > lastUserIndex; index -= 1) {
@@ -425,6 +434,35 @@ function extractToolOutput(input: ResponsesInputItem[]) {
425434
return "";
426435
}
427436

437+
/**
 * Reports whether the function-call output selected from `input` carries a
 * structured error flag.
 *
 * Selection happens in two newest-first passes:
 *  1. Items positioned after the index returned by `findLastUserIndex`: the
 *     first item with non-empty output text decides the result.
 *  2. All items: an item with non-empty output text decides the result only
 *     if at least one later user item yields non-empty text and every such
 *     text satisfies `isToolOutputContinuationText`.
 *
 * @param input - The Responses-style input items for the request.
 * @returns The value of `functionCallOutputIsStructuredError` for the selected
 *   item, or `false` when no item qualifies.
 */
function extractToolOutputStructuredError(input: ResponsesInputItem[]) {
  const userBoundary = findLastUserIndex(input);

  // Pass 1: only look at items that come after the last user item.
  let cursor = input.length - 1;
  while (cursor > userBoundary) {
    const candidate = input[cursor];
    if (extractFunctionCallOutputText(candidate)) {
      return functionCallOutputIsStructuredError(candidate);
    }
    cursor -= 1;
  }

  // Pass 2: fall back to any earlier output whose trailing user texts are all
  // tool-output continuations.
  for (cursor = input.length - 1; cursor >= 0; cursor -= 1) {
    const candidate = input[cursor];
    if (!extractFunctionCallOutputText(candidate)) {
      continue;
    }
    const trailingUserTexts = input
      .slice(cursor + 1)
      .filter((entry) => entry.role === "user" && Array.isArray(entry.content))
      .map((entry) => extractInputText(entry.content as unknown[]))
      .filter(Boolean);
    const onlyContinuations =
      trailingUserTexts.length > 0 &&
      trailingUserTexts.every((text) => isToolOutputContinuationText(text));
    if (onlyContinuations) {
      return functionCallOutputIsStructuredError(candidate);
    }
  }

  return false;
}
465+
428466
function extractToolOutputCallId(input: ResponsesInputItem[]) {
429467
const lastUserIndex = findLastUserIndex(input);
430468
for (let index = input.length - 1; index > lastUserIndex; index -= 1) {
@@ -2867,6 +2905,7 @@ function convertAnthropicMessagesToResponsesInput(params: {
28672905
type: "function_call_output",
28682906
call_id: block.tool_use_id,
28692907
output,
2908+
...(block.is_error === true ? { is_error: true } : {}),
28702909
});
28712910
}
28722911
continue;
@@ -3235,6 +3274,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
32353274
plannedToolName: extractPlannedToolName(events),
32363275
plannedToolArgs: extractPlannedToolArgs(events),
32373276
toolOutputCallId: extractToolOutputCallId(input) || undefined,
3277+
...(extractToolOutputStructuredError(input) ? { toolOutputStructuredError: true } : {}),
32383278
};
32393279
requests.push(lastRequest);
32403280
if (requests.length > MOCK_OPENAI_DEBUG_REQUEST_LIMIT) {
@@ -3293,6 +3333,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
32933333
plannedToolName: extractPlannedToolName(events),
32943334
plannedToolArgs: extractPlannedToolArgs(events),
32953335
toolOutputCallId: extractToolOutputCallId(input) || undefined,
3336+
...(extractToolOutputStructuredError(input) ? { toolOutputStructuredError: true } : {}),
32963337
};
32973338
requests.push(lastRequest);
32983339
if (requests.length > MOCK_OPENAI_DEBUG_REQUEST_LIMIT) {

0 commit comments

Comments
 (0)