Skip to content

Commit c848ebc

Browse files
authored
agents: split GPT-5 prompt and retry behavior (#65597)
* agents: split GPT-5 prompt and retry behavior
* agents: fix GPT-5 review follow-ups
* agents: address GPT-5 review follow-ups
* agents: avoid replaying side-effectful GPT retries
* agents: mark subagent control as mutating
* agents: fail closed on single-action retries
* commands: stabilize channel legacy doctor migration test
* agents: narrow single-action retry promise trigger
1 parent: d0c8377 · commit: c848ebc

8 files changed

Lines changed: 352 additions & 17 deletions

File tree

extensions/openai/index.test.ts

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ import {
1414
OPENAI_FRIENDLY_PROMPT_OVERLAY,
1515
OPENAI_GPT5_EXECUTION_BIAS,
1616
OPENAI_GPT5_OUTPUT_CONTRACT,
17+
OPENAI_GPT5_TOOL_CALL_STYLE,
1718
} from "./prompt-overlay.js";
1819

1920
const runtimeMocks = vi.hoisted(() => ({
@@ -365,7 +366,7 @@ describe("openai plugin", () => {
365366
};
366367

367368
expect(openaiProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
368-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
369+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
369370
sectionOverrides: {
370371
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
371372
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -382,7 +383,7 @@ describe("openai plugin", () => {
382383
"Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
383384
);
384385
expect(codexProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
385-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
386+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
386387
sectionOverrides: {
387388
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
388389
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -454,9 +455,22 @@ describe("openai plugin", () => {
454455
expect(OPENAI_FRIENDLY_PROMPT_OVERLAY).toContain(
455456
"Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
456457
);
458+
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
459+
"Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.",
460+
);
461+
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
462+
"If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.",
463+
);
457464
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
458465
"Do prerequisite lookup or discovery before dependent actions.",
459466
);
467+
expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
468+
"Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.",
469+
);
470+
expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
471+
"When a first-class tool exists for an action, use the tool instead of asking the user to run a command.",
472+
);
473+
expect(OPENAI_GPT5_TOOL_CALL_STYLE).not.toContain("/approve");
460474
expect(OPENAI_GPT5_OUTPUT_CONTRACT).toContain(
461475
"Return the requested sections only, in the requested order.",
462476
);
@@ -486,7 +500,7 @@ describe("openai plugin", () => {
486500
agentId: undefined,
487501
}),
488502
).toEqual({
489-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
503+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
490504
sectionOverrides: {
491505
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
492506
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -514,7 +528,7 @@ describe("openai plugin", () => {
514528
agentId: undefined,
515529
}),
516530
).toEqual({
517-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
531+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
518532
sectionOverrides: {
519533
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
520534
},
@@ -540,7 +554,7 @@ describe("openai plugin", () => {
540554
agentId: undefined,
541555
}),
542556
).toEqual({
543-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
557+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
544558
sectionOverrides: {
545559
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
546560
},
@@ -567,7 +581,7 @@ describe("openai plugin", () => {
567581
agentId: undefined,
568582
}),
569583
).toEqual({
570-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
584+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
571585
sectionOverrides: {
572586
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
573587
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -594,7 +608,7 @@ describe("openai plugin", () => {
594608
agentId: undefined,
595609
}),
596610
).toEqual({
597-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
611+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
598612
sectionOverrides: {
599613
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
600614
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,

extensions/openai/prompt-overlay.ts

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -64,11 +64,20 @@ Do not use em dashes unless the user explicitly asks for them or they are requir
6464

6565
export const OPENAI_GPT5_EXECUTION_BIAS = `## Execution Bias
6666
67-
Start the real work in the same turn when the next step is clear.
67+
Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.
68+
Commentary-only turns are incomplete when tools are available and the next action is clear.
69+
If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.
6870
Do prerequisite lookup or discovery before dependent actions.
69-
If another tool call would likely improve correctness or completeness, keep going instead of stopping at partial progress.
7071
Multi-part requests stay incomplete until every requested item is handled or clearly marked blocked.
71-
Before the final answer, quickly verify correctness, coverage, formatting, and obvious side effects.`;
72+
Act first, then verify if needed. Do not pause to summarize or verify before taking the next action.`;
73+
74+
export const OPENAI_GPT5_TOOL_CALL_STYLE = `## Tool Call Style
75+
76+
Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.
77+
When a first-class tool exists for an action, use the tool instead of asking the user to run a command.
78+
If multiple tool calls are needed, call them in sequence without stopping to explain between calls.
79+
Default: do not narrate routine, low-risk tool calls (just call the tool).
80+
Narrate only when it genuinely helps: complex multi-step work, sensitive actions like deletions, or when the user explicitly asks for commentary.`;
7281

7382
export type OpenAIPromptOverlayMode = "friendly" | "off";
7483

@@ -103,8 +112,14 @@ export function resolveOpenAISystemPromptContribution(params: {
103112
) {
104113
return undefined;
105114
}
115+
// tool_call_style is NOT overridden via sectionOverrides because the
116+
// default section includes dynamic channel-specific approval guidance
117+
// from buildExecApprovalPromptGuidance() that varies per runtime
118+
// channel. Overriding it with a static string would lose that dynamic
119+
// content. Instead, the tool-first reinforcement lives in stablePrefix
120+
// so it's always present alongside the default tool_call_style section.
106121
return {
107-
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
122+
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
108123
sectionOverrides: {
109124
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
110125
...(params.mode === "friendly" ? { interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY } : {}),

src/agents/pi-embedded-runner/run.incomplete-turn.test.ts

Lines changed: 159 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,10 @@ import {
1010
resetRunOverflowCompactionHarnessMocks,
1111
} from "./run.overflow-compaction.harness.js";
1212
import {
13+
buildAttemptReplayMetadata,
1314
extractPlanningOnlyPlanDetails,
1415
isLikelyExecutionAckPrompt,
16+
PLANNING_ONLY_RETRY_INSTRUCTION,
1517
resolveAckExecutionFastPathInstruction,
1618
resolvePlanningOnlyRetryLimit,
1719
resolvePlanningOnlyRetryInstruction,
@@ -281,7 +283,10 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
281283
timedOut: false,
282284
attempt: makeAttemptResult({
283285
assistantTexts: ["I'll inspect the code, make the change, and run the checks."],
284-
toolMetas: [{ toolName: "bash", meta: "ls" }],
286+
toolMetas: [
287+
{ toolName: "read", meta: "path=src/index.ts" },
288+
{ toolName: "search", meta: "pattern=runEmbeddedPiAgent" },
289+
],
285290
}),
286291
});
287292

@@ -436,3 +441,156 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
436441
).toBe("paused");
437442
});
438443
});
444+
445+
describe("resolvePlanningOnlyRetryInstruction single-action loophole", () => {
446+
const openaiParams = { provider: "openai", modelId: "gpt-5.4" } as const;
447+
448+
function makeAttemptWithTools(
449+
toolNames: string[],
450+
assistantText: string,
451+
): Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"] {
452+
const toolMetas = toolNames.map((toolName) => ({ toolName }));
453+
return {
454+
toolMetas,
455+
assistantTexts: [assistantText],
456+
lastAssistant: { stopReason: "stop" },
457+
itemLifecycle: { startedCount: toolNames.length },
458+
replayMetadata: buildAttemptReplayMetadata({
459+
toolMetas,
460+
didSendViaMessagingTool: false,
461+
}),
462+
clientToolCall: null,
463+
yieldDetected: false,
464+
didSendDeterministicApprovalPrompt: false,
465+
didSendViaMessagingTool: false,
466+
lastToolError: null,
467+
} as unknown as Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"];
468+
}
469+
470+
it("retries when exactly 1 non-plan tool call plus 'i can do that' prose is detected", () => {
471+
const result = resolvePlanningOnlyRetryInstruction({
472+
...openaiParams,
473+
aborted: false,
474+
timedOut: false,
475+
attempt: makeAttemptWithTools(["read"], "I can do that next."),
476+
});
477+
478+
expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
479+
});
480+
481+
it("retries when exactly 1 non-plan tool call plus planning prose is detected", () => {
482+
const result = resolvePlanningOnlyRetryInstruction({
483+
...openaiParams,
484+
aborted: false,
485+
timedOut: false,
486+
attempt: makeAttemptWithTools(["read"], "I'll analyze the structure next."),
487+
});
488+
489+
expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
490+
});
491+
492+
it("does not retry when 2+ non-plan tool calls are present", () => {
493+
const result = resolvePlanningOnlyRetryInstruction({
494+
...openaiParams,
495+
aborted: false,
496+
timedOut: false,
497+
attempt: makeAttemptWithTools(["read", "search"], "I'll verify the output."),
498+
});
499+
500+
expect(result).toBeNull();
501+
});
502+
503+
it("does not retry when 1 tool call plus completion language is present", () => {
504+
const result = resolvePlanningOnlyRetryInstruction({
505+
...openaiParams,
506+
aborted: false,
507+
timedOut: false,
508+
attempt: makeAttemptWithTools(["read"], "Done. The file looks correct."),
509+
});
510+
511+
expect(result).toBeNull();
512+
});
513+
514+
it("does not retry when 1 tool call plus 'let me know' handoff is present", () => {
515+
const result = resolvePlanningOnlyRetryInstruction({
516+
...openaiParams,
517+
aborted: false,
518+
timedOut: false,
519+
attempt: makeAttemptWithTools(["read"], "Let me know if you need anything else."),
520+
});
521+
522+
expect(result).toBeNull();
523+
});
524+
525+
it("does not retry when 1 tool call plus an answer-style summary is present", () => {
526+
const result = resolvePlanningOnlyRetryInstruction({
527+
...openaiParams,
528+
aborted: false,
529+
timedOut: false,
530+
attempt: makeAttemptWithTools(
531+
["read"],
532+
"I'll summarize the root cause: the provider auth scope is missing.",
533+
),
534+
});
535+
536+
expect(result).toBeNull();
537+
});
538+
539+
it("does not retry when 1 tool call plus a future-tense description is present", () => {
540+
const result = resolvePlanningOnlyRetryInstruction({
541+
...openaiParams,
542+
aborted: false,
543+
timedOut: false,
544+
attempt: makeAttemptWithTools(
545+
["read"],
546+
"I'll describe the issue: the provider auth scope is missing.",
547+
),
548+
});
549+
550+
expect(result).toBeNull();
551+
});
552+
553+
it("does not retry when 1 safe tool call is followed by answer prose joined with 'and'", () => {
554+
const result = resolvePlanningOnlyRetryInstruction({
555+
...openaiParams,
556+
aborted: false,
557+
timedOut: false,
558+
attempt: makeAttemptWithTools(["read"], "I'll explain and recommend a fix."),
559+
});
560+
561+
expect(result).toBeNull();
562+
});
563+
564+
it("does not retry when 1 tool call plus a bare 'i can do that' reply is present", () => {
565+
const result = resolvePlanningOnlyRetryInstruction({
566+
...openaiParams,
567+
aborted: false,
568+
timedOut: false,
569+
attempt: makeAttemptWithTools(["read"], "I can do that."),
570+
});
571+
572+
expect(result).toBeNull();
573+
});
574+
575+
it("does not retry when the lone tool call already had side effects", () => {
576+
const result = resolvePlanningOnlyRetryInstruction({
577+
...openaiParams,
578+
aborted: false,
579+
timedOut: false,
580+
attempt: makeAttemptWithTools(["sessions_spawn"], "I'll continue from there next."),
581+
});
582+
583+
expect(result).toBeNull();
584+
});
585+
586+
it("does not retry when the lone tool call is unclassified", () => {
587+
const result = resolvePlanningOnlyRetryInstruction({
588+
...openaiParams,
589+
aborted: false,
590+
timedOut: false,
591+
attempt: makeAttemptWithTools(["vendor_widget"], "I'll continue from there next."),
592+
});
593+
594+
expect(result).toBeNull();
595+
});
596+
});

0 commit comments

Comments
 (0)