Skip to content

Commit 2291047

Browse files
authored
fix(ai): fix missing support for image thought signatures (e.g. for Gemini image models) (#13064)
## Summary

This fixes a few problems:

* Image thought signatures were not supported by `ai` at all when streaming, including in UI / `useChat` transformations.
* Image thought signatures were therefore also not being processed when received in a Gemini API response.
* When using Gemini to produce multimodal output (e.g. text and images together), the order of output parts was not respected - images were always displayed after all text, even when they appeared between text paragraphs in the original response.

## Manual Verification

Use the new UI example `google-gemini-images` in `examples/ai-e2e-next` to verify. Try a prompt like:

```
Write a 3 verse kids poem about a dog meeting an owl, accompanied by illustrations
```

Without this PR, you will see all images appear after all text, even though this is not the correct order per the streaming chunks. With this PR, that's fixed.

Similarly, without this PR you will not be able to send any follow-up messages - they will error because of the missing image thought signatures. With this PR, it works.

## Related Issues

Fixes #12988
Fixes #11461
Fixes #10595
1 parent ef557a7 commit 2291047

File tree

14 files changed

+497
-7
lines changed

14 files changed

+497
-7
lines changed

.changeset/young-ducks-rush.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
'@ai-sdk/google': patch
3+
'ai': patch
4+
---
5+
6+
fix(ai): fix missing support for image thought signatures (e.g. for Gemini image models)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import { google, type GoogleLanguageModelOptions } from '@ai-sdk/google';
2+
import { streamText, convertToModelMessages } from 'ai';
3+
4+
export const maxDuration = 30;
5+
6+
export async function POST(req: Request) {
7+
const { messages } = await req.json();
8+
9+
const result = streamText({
10+
model: google('gemini-3.1-flash-image-preview'),
11+
messages: await convertToModelMessages(messages),
12+
providerOptions: {
13+
google: {
14+
responseModalities: ['TEXT', 'IMAGE'],
15+
thinkingConfig: {
16+
thinkingLevel: 'high',
17+
},
18+
} satisfies GoogleLanguageModelOptions,
19+
},
20+
includeRawChunks: true,
21+
});
22+
23+
return result.toUIMessageStreamResponse();
24+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
'use client';
2+
3+
import ChatInput from '@/components/chat-input';
4+
import { useChat } from '@ai-sdk/react';
5+
import { DefaultChatTransport } from 'ai';
6+
7+
export default function Chat() {
8+
const { status, sendMessage, messages, error, regenerate } = useChat({
9+
transport: new DefaultChatTransport({
10+
api: '/api/chat/google-gemini-image',
11+
}),
12+
});
13+
14+
return (
15+
<div className="flex flex-col w-full max-w-md py-24 mx-auto stretch">
16+
{messages.map(message => (
17+
<div key={message.id} className="whitespace-pre-wrap">
18+
{message.role === 'user' ? 'User: ' : 'AI: '}
19+
{message.parts.map((part, index) => {
20+
if (part.type === 'text') {
21+
return <div key={index}>{part.text}</div>;
22+
} else if (
23+
part.type === 'file' &&
24+
part.mediaType.startsWith('image/')
25+
) {
26+
return (
27+
// eslint-disable-next-line @next/next/no-img-element
28+
<img key={index} src={part.url} alt="Generated image" />
29+
);
30+
}
31+
})}
32+
</div>
33+
))}
34+
35+
{error && (
36+
<div className="mt-4">
37+
<div className="text-red-500">An error occurred.</div>
38+
<button
39+
type="button"
40+
className="px-4 py-2 mt-4 text-blue-500 rounded-md border border-blue-500"
41+
onClick={() => regenerate()}
42+
>
43+
Retry
44+
</button>
45+
</div>
46+
)}
47+
48+
<ChatInput status={status} onSubmit={text => sendMessage({ text })} />
49+
</div>
50+
);
51+
}

packages/ai/src/generate-text/run-tools-transformation.test.ts

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,150 @@ describe('runToolsTransformation', () => {
9999
`);
100100
});
101101

102+
it('should forward file parts', async () => {
103+
const inputStream: ReadableStream<LanguageModelV3StreamPart> =
104+
convertArrayToReadableStream([
105+
{
106+
type: 'file',
107+
data: 'Hello World',
108+
mediaType: 'text/plain',
109+
},
110+
{
111+
type: 'finish',
112+
finishReason: { unified: 'stop', raw: 'stop' },
113+
usage: testUsage,
114+
},
115+
]);
116+
117+
const transformedStream = runToolsTransformation({
118+
generateId: mockId({ prefix: 'id' }),
119+
tools: undefined,
120+
generatorStream: inputStream,
121+
tracer: new MockTracer(),
122+
telemetry: undefined,
123+
messages: [],
124+
system: undefined,
125+
abortSignal: undefined,
126+
repairToolCall: undefined,
127+
experimental_context: undefined,
128+
});
129+
130+
const result = await convertReadableStreamToArray(transformedStream);
131+
132+
expect(result).toMatchInlineSnapshot(`
133+
[
134+
{
135+
"file": DefaultGeneratedFileWithType {
136+
"base64Data": "Hello World",
137+
"mediaType": "text/plain",
138+
"type": "file",
139+
"uint8ArrayData": undefined,
140+
},
141+
"type": "file",
142+
},
143+
{
144+
"finishReason": "stop",
145+
"providerMetadata": undefined,
146+
"rawFinishReason": "stop",
147+
"type": "finish",
148+
"usage": {
149+
"cachedInputTokens": undefined,
150+
"inputTokenDetails": {
151+
"cacheReadTokens": undefined,
152+
"cacheWriteTokens": undefined,
153+
"noCacheTokens": 3,
154+
},
155+
"inputTokens": 3,
156+
"outputTokenDetails": {
157+
"reasoningTokens": undefined,
158+
"textTokens": 10,
159+
},
160+
"outputTokens": 10,
161+
"raw": undefined,
162+
"reasoningTokens": undefined,
163+
"totalTokens": 13,
164+
},
165+
},
166+
]
167+
`);
168+
});
169+
170+
it('should forward file parts with providerMetadata', async () => {
171+
const inputStream: ReadableStream<LanguageModelV3StreamPart> =
172+
convertArrayToReadableStream([
173+
{
174+
type: 'file',
175+
data: 'Hello World',
176+
mediaType: 'text/plain',
177+
providerMetadata: {
178+
testProvider: { signature: 'test-signature' },
179+
},
180+
},
181+
{
182+
type: 'finish',
183+
finishReason: { unified: 'stop', raw: 'stop' },
184+
usage: testUsage,
185+
},
186+
]);
187+
188+
const transformedStream = runToolsTransformation({
189+
generateId: mockId({ prefix: 'id' }),
190+
tools: undefined,
191+
generatorStream: inputStream,
192+
tracer: new MockTracer(),
193+
telemetry: undefined,
194+
messages: [],
195+
system: undefined,
196+
abortSignal: undefined,
197+
repairToolCall: undefined,
198+
experimental_context: undefined,
199+
});
200+
201+
const result = await convertReadableStreamToArray(transformedStream);
202+
203+
expect(result).toMatchInlineSnapshot(`
204+
[
205+
{
206+
"file": DefaultGeneratedFileWithType {
207+
"base64Data": "Hello World",
208+
"mediaType": "text/plain",
209+
"type": "file",
210+
"uint8ArrayData": undefined,
211+
},
212+
"providerMetadata": {
213+
"testProvider": {
214+
"signature": "test-signature",
215+
},
216+
},
217+
"type": "file",
218+
},
219+
{
220+
"finishReason": "stop",
221+
"providerMetadata": undefined,
222+
"rawFinishReason": "stop",
223+
"type": "finish",
224+
"usage": {
225+
"cachedInputTokens": undefined,
226+
"inputTokenDetails": {
227+
"cacheReadTokens": undefined,
228+
"cacheWriteTokens": undefined,
229+
"noCacheTokens": 3,
230+
},
231+
"inputTokens": 3,
232+
"outputTokenDetails": {
233+
"reasoningTokens": undefined,
234+
"textTokens": 10,
235+
},
236+
"outputTokens": 10,
237+
"raw": undefined,
238+
"reasoningTokens": undefined,
239+
"totalTokens": 13,
240+
},
241+
},
242+
]
243+
`);
244+
});
245+
102246
it('should handle async tool execution', async () => {
103247
const inputStream: ReadableStream<LanguageModelV3StreamPart> =
104248
convertArrayToReadableStream([

packages/ai/src/generate-text/run-tools-transformation.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,10 @@ export type SingleRequestTextStreamPart<TOOLS extends ToolSet> =
8787

8888
// Other types:
8989
| ({ type: 'source' } & Source)
90-
| { type: 'file'; file: GeneratedFile } // different because of GeneratedFile object
90+
| { type: 'file'; file: GeneratedFile; providerMetadata?: ProviderMetadata } // different because of GeneratedFile object
9191
| ({ type: 'tool-call' } & TypedToolCall<TOOLS>)
9292
| ({ type: 'tool-result' } & TypedToolResult<TOOLS>)
9393
| ({ type: 'tool-error' } & TypedToolError<TOOLS>)
94-
| { type: 'file'; file: GeneratedFile } // different because of GeneratedFile object
9594
| { type: 'stream-start'; warnings: SharedV3Warning[] }
9695
| {
9796
type: 'response-metadata';
@@ -224,6 +223,9 @@ export function runToolsTransformation<TOOLS extends ToolSet>({
224223
data: chunk.data,
225224
mediaType: chunk.mediaType,
226225
}),
226+
...(chunk.providerMetadata != null
227+
? { providerMetadata: chunk.providerMetadata }
228+
: {}),
227229
});
228230
break;
229231
}

packages/ai/src/generate-text/stream-text-result.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ export type TextStreamPart<TOOLS extends ToolSet> =
421421
providerMetadata?: ProviderMetadata;
422422
}
423423
| ({ type: 'source' } & Source)
424-
| { type: 'file'; file: GeneratedFile } // different because of GeneratedFile object
424+
| { type: 'file'; file: GeneratedFile; providerMetadata?: ProviderMetadata } // different because of GeneratedFile object
425425
| ({ type: 'tool-call' } & TypedToolCall<TOOLS>)
426426
| ({ type: 'tool-result' } & TypedToolResult<TOOLS>)
427427
| ({ type: 'tool-error' } & TypedToolError<TOOLS>)

packages/ai/src/generate-text/stream-text.test.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,34 @@ const modelWithFiles = new MockLanguageModelV3({
226226
}),
227227
});
228228

229+
const modelWithFilesAndProviderMetadata = new MockLanguageModelV3({
230+
doStream: async () => ({
231+
stream: convertArrayToReadableStream([
232+
{
233+
type: 'file',
234+
data: 'Hello World',
235+
mediaType: 'text/plain',
236+
providerMetadata: {
237+
testProvider: { signature: 'sig-1' },
238+
},
239+
},
240+
{ type: 'text-start', id: '1' },
241+
{ type: 'text-delta', id: '1', delta: 'Hello!' },
242+
{ type: 'text-end', id: '1' },
243+
{
244+
type: 'file',
245+
data: 'QkFVRw==',
246+
mediaType: 'image/jpeg',
247+
},
248+
{
249+
type: 'finish',
250+
finishReason: { unified: 'stop', raw: 'stop' },
251+
usage: testUsage,
252+
},
253+
]),
254+
}),
255+
});
256+
229257
const modelWithReasoning = new MockLanguageModelV3({
230258
doStream: async () => ({
231259
stream: convertArrayToReadableStream([
@@ -1017,6 +1045,44 @@ describe('streamText', () => {
10171045
`);
10181046
});
10191047

1048+
it('should send files with providerMetadata', async () => {
1049+
const result = streamText({
1050+
model: modelWithFilesAndProviderMetadata,
1051+
...defaultSettings(),
1052+
});
1053+
1054+
const parts = await convertAsyncIterableToArray(result.fullStream);
1055+
const fileParts = parts.filter(p => p.type === 'file');
1056+
1057+
expect(fileParts).toMatchInlineSnapshot(`
1058+
[
1059+
{
1060+
"file": DefaultGeneratedFileWithType {
1061+
"base64Data": "Hello World",
1062+
"mediaType": "text/plain",
1063+
"type": "file",
1064+
"uint8ArrayData": undefined,
1065+
},
1066+
"providerMetadata": {
1067+
"testProvider": {
1068+
"signature": "sig-1",
1069+
},
1070+
},
1071+
"type": "file",
1072+
},
1073+
{
1074+
"file": DefaultGeneratedFileWithType {
1075+
"base64Data": "QkFVRw==",
1076+
"mediaType": "image/jpeg",
1077+
"type": "file",
1078+
"uint8ArrayData": undefined,
1079+
},
1080+
"type": "file",
1081+
},
1082+
]
1083+
`);
1084+
});
1085+
10201086
it('should use fallback response metadata when response metadata is not provided', async () => {
10211087
const result = streamText({
10221088
model: new MockLanguageModelV3({
@@ -3223,6 +3289,39 @@ describe('streamText', () => {
32233289
`);
32243290
});
32253291

3292+
it('should send file content with providerMetadata', async () => {
3293+
const result = streamText({
3294+
model: modelWithFilesAndProviderMetadata,
3295+
...defaultSettings(),
3296+
});
3297+
3298+
const uiMessageStream = result.toUIMessageStream();
3299+
const parts = await convertReadableStreamToArray(uiMessageStream);
3300+
const fileParts = parts.filter(
3301+
(p: Record<string, unknown>) => p.type === 'file',
3302+
);
3303+
3304+
expect(fileParts).toMatchInlineSnapshot(`
3305+
[
3306+
{
3307+
"mediaType": "text/plain",
3308+
"providerMetadata": {
3309+
"testProvider": {
3310+
"signature": "sig-1",
3311+
},
3312+
},
3313+
"type": "file",
3314+
"url": "data:text/plain;base64,Hello World",
3315+
},
3316+
{
3317+
"mediaType": "image/jpeg",
3318+
"type": "file",
3319+
"url": "data:image/jpeg;base64,QkFVRw==",
3320+
},
3321+
]
3322+
`);
3323+
});
3324+
32263325
it('should not generate a new message id when onFinish is provided and generateMessageId is not provided', async () => {
32273326
const result = streamText({
32283327
model: createTestModel(),

0 commit comments

Comments
 (0)