Skip to content

Commit 6dd0262

Browse files
committed
fix: spool large mcp media to disk
1 parent 4f72de4 commit 6dd0262

4 files changed

Lines changed: 200 additions & 20 deletions

File tree

pkg/runtime/toolexec/dispatcher.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -724,13 +724,21 @@ func buildMultiContent(text string, images []tools.MediaContent) []chat.MessageP
724724
parts := make([]chat.MessagePart, 0, 1+len(images))
725725
parts = append(parts, chat.MessagePart{Type: chat.MessagePartTypeText, Text: text})
726726
for _, img := range images {
727-
parts = append(parts, chat.MessagePart{
728-
Type: chat.MessagePartTypeImageURL,
729-
ImageURL: &chat.MessageImageURL{
730-
URL: "data:" + img.MimeType + ";base64," + img.Data,
731-
Detail: chat.ImageURLDetailAuto,
732-
},
733-
})
727+
switch {
728+
case img.FilePath != "":
729+
parts = append(parts, chat.MessagePart{
730+
Type: chat.MessagePartTypeText,
731+
Text: fmt.Sprintf("[image saved to %s (%s)]", img.FilePath, img.MimeType),
732+
})
733+
case img.Data != "":
734+
parts = append(parts, chat.MessagePart{
735+
Type: chat.MessagePartTypeImageURL,
736+
ImageURL: &chat.MessageImageURL{
737+
URL: "data:" + img.MimeType + ";base64," + img.Data,
738+
Detail: chat.ImageURLDetailAuto,
739+
},
740+
})
741+
}
734742
}
735743
return parts
736744
}

pkg/tools/mcp/mcp.go

Lines changed: 104 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ type Toolset struct {
160160

161161
supervisor *lifecycle.Supervisor
162162

163+
// mediaDir is the toolset-scoped temp dir holding spooled media
164+
// payloads. Created lazily on first spool, removed by Stop.
165+
mediaMu sync.Mutex
166+
mediaDir string
167+
163168
mu sync.Mutex
164169

165170
// Cached tools and prompts, invalidated via MCP notifications and
@@ -426,6 +431,7 @@ func (ts *Toolset) Start(ctx context.Context) error {
426431
// Stop tears the supervisor down. Idempotent.
427432
func (ts *Toolset) Stop(ctx context.Context) error {
428433
slog.DebugContext(ctx, "Stopping MCP toolset", "server", ts.logID)
434+
defer ts.cleanupMediaDir()
429435
if ts.supervisor == nil {
430436
return nil
431437
}
@@ -694,7 +700,7 @@ func (ts *Toolset) callTool(ctx context.Context, toolCall tools.ToolCall) (*tool
694700
return nil, fmt.Errorf("failed to call tool: %w", err)
695701
}
696702

697-
result := processMCPContent(resp)
703+
result := ts.processMCPContent(resp)
698704
slog.DebugContext(ctx, "MCP tool call completed", "tool", toolCall.Function.Name, "output_length", len(result.Output))
699705
slog.DebugContext(ctx, result.Output)
700706
return result, nil
@@ -714,7 +720,13 @@ func isInitNotificationSendError(err error) bool {
714720
return false
715721
}
716722

717-
func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
723+
const maxInlineMediaBytes = 256 * 1024
724+
725+
// writeMediaFile is a package-level indirection so tests can simulate
726+
// disk failures without manipulating the filesystem.
727+
var writeMediaFile = defaultWriteMediaFile
728+
729+
func (ts *Toolset) processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
718730
var text strings.Builder
719731
var images, audios []tools.MediaContent
720732

@@ -723,9 +735,9 @@ func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
723735
case *mcp.TextContent:
724736
text.WriteString(c.Text)
725737
case *mcp.ImageContent:
726-
images = append(images, encodeMedia(c.Data, c.MIMEType))
738+
images = append(images, ts.encodeMedia(c.Data, c.MIMEType))
727739
case *mcp.AudioContent:
728-
audios = append(audios, encodeMedia(c.Data, c.MIMEType))
740+
audios = append(audios, ts.encodeMedia(c.Data, c.MIMEType))
729741
case *mcp.ResourceLink:
730742
if c.Name != "" {
731743
// Escape ] in name and ) in URI to prevent broken markdown links.
@@ -760,12 +772,94 @@ func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
760772
}
761773
}
762774

763-
// encodeMedia re-encodes raw bytes (as decoded by the MCP SDK) back to base64
764-
// for our internal MediaContent representation.
765-
func encodeMedia(data []byte, mimeType string) tools.MediaContent {
766-
return tools.MediaContent{
767-
Data: base64.StdEncoding.EncodeToString(data),
768-
MimeType: mimeType,
775+
// encodeMedia keeps small payloads inline and spools larger ones to disk so the
776+
// session and TUI do not retain duplicate base64 copies. Spooled files live
777+
// under a toolset-scoped temp directory removed by Stop.
778+
func (ts *Toolset) encodeMedia(data []byte, mimeType string) tools.MediaContent {
779+
media := tools.MediaContent{MimeType: mimeType}
780+
if len(data) <= maxInlineMediaBytes {
781+
media.Data = base64.StdEncoding.EncodeToString(data)
782+
return media
783+
}
784+
785+
dir, err := ts.ensureMediaDir()
786+
if err == nil {
787+
var path string
788+
path, err = writeMediaFile(dir, data, mimeType)
789+
if err == nil {
790+
media.FilePath = path
791+
return media
792+
}
793+
}
794+
slog.Warn("failed to spool MCP media to disk", "mime_type", mimeType, "bytes", len(data), "error", err)
795+
media.Data = base64.StdEncoding.EncodeToString(data)
796+
return media
797+
}
798+
799+
// ensureMediaDir lazily creates the toolset-scoped temp dir for spooled
800+
// media payloads. The directory is removed by Stop.
801+
func (ts *Toolset) ensureMediaDir() (string, error) {
802+
ts.mediaMu.Lock()
803+
defer ts.mediaMu.Unlock()
804+
if ts.mediaDir != "" {
805+
return ts.mediaDir, nil
806+
}
807+
dir, err := os.MkdirTemp("", "docker-agent-mcp-media-*")
808+
if err != nil {
809+
return "", err
810+
}
811+
ts.mediaDir = dir
812+
return dir, nil
813+
}
814+
815+
// cleanupMediaDir removes the toolset-scoped media spool directory, if any.
816+
func (ts *Toolset) cleanupMediaDir() {
817+
ts.mediaMu.Lock()
818+
dir := ts.mediaDir
819+
ts.mediaDir = ""
820+
ts.mediaMu.Unlock()
821+
if dir == "" {
822+
return
823+
}
824+
if err := os.RemoveAll(dir); err != nil {
825+
slog.Warn("failed to remove MCP media spool directory", "dir", dir, "error", err)
826+
}
827+
}
828+
829+
func defaultWriteMediaFile(dir string, data []byte, mimeType string) (string, error) {
830+
f, err := os.CreateTemp(dir, "media-*"+mediaExtension(mimeType))
831+
if err != nil {
832+
return "", err
833+
}
834+
path := f.Name()
835+
if _, err := f.Write(data); err != nil {
836+
_ = f.Close()
837+
_ = os.Remove(path)
838+
return "", err
839+
}
840+
if err := f.Close(); err != nil {
841+
_ = os.Remove(path)
842+
return "", err
843+
}
844+
return path, nil
845+
}
846+
847+
func mediaExtension(mimeType string) string {
848+
switch mimeType {
849+
case "image/png":
850+
return ".png"
851+
case "image/jpeg":
852+
return ".jpg"
853+
case "image/gif":
854+
return ".gif"
855+
case "image/webp":
856+
return ".webp"
857+
case "audio/wav", "audio/wave", "audio/x-wav":
858+
return ".wav"
859+
case "audio/mpeg", "audio/mp3":
860+
return ".mp3"
861+
default:
862+
return ".bin"
769863
}
770864
}
771865

pkg/tools/mcp/mcp_test.go

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package mcp
22

33
import (
4+
"bytes"
45
"context"
6+
"errors"
57
"fmt"
68
"iter"
9+
"os"
710
"sync"
811
"sync/atomic"
912
"testing"
@@ -478,7 +481,7 @@ func TestProcessMCPContent(t *testing.T) {
478481
t.Run(tt.name, func(t *testing.T) {
479482
t.Parallel()
480483

481-
result := processMCPContent(tt.input)
484+
result := (&Toolset{}).processMCPContent(tt.input)
482485

483486
assert.Equal(t, tt.wantOutput, result.Output)
484487
assert.Equal(t, tt.wantIsError, result.IsError)
@@ -536,3 +539,75 @@ func TestCallToolRecoversFromErrSessionMissing(t *testing.T) {
536539
assert.Equal(t, "recovered", result.Output)
537540
assert.Equal(t, int32(2), callCount.Load(), "expected exactly 2 CallTool invocations (1 failed + 1 retry)")
538541
}
542+
543+
func TestProcessMCPContentSpoolsLargeMedia(t *testing.T) {
544+
ts := &Toolset{}
545+
t.Cleanup(ts.cleanupMediaDir)
546+
547+
tests := []struct {
548+
name string
549+
size int
550+
wantInline bool
551+
}{
552+
{"at threshold stays inline", maxInlineMediaBytes, true},
553+
{"above threshold spools", maxInlineMediaBytes + 1, false},
554+
}
555+
556+
for _, tt := range tests {
557+
t.Run(tt.name, func(t *testing.T) {
558+
data := bytes.Repeat([]byte("x"), tt.size)
559+
result := ts.processMCPContent(callToolResult(&mcp.ImageContent{Data: data, MIMEType: "image/png"}))
560+
561+
require.Len(t, result.Images, 1)
562+
img := result.Images[0]
563+
assert.Equal(t, "image/png", img.MimeType)
564+
565+
if tt.wantInline {
566+
assert.NotEmpty(t, img.Data)
567+
assert.Empty(t, img.FilePath)
568+
return
569+
}
570+
571+
assert.Empty(t, img.Data)
572+
require.NotEmpty(t, img.FilePath)
573+
574+
got, err := os.ReadFile(img.FilePath)
575+
require.NoError(t, err)
576+
assert.Equal(t, data, got)
577+
})
578+
}
579+
}
580+
581+
func TestEncodeMediaFallsBackToInlineOnDiskFailure(t *testing.T) {
582+
original := writeMediaFile
583+
t.Cleanup(func() { writeMediaFile = original })
584+
writeMediaFile = func(string, []byte, string) (string, error) {
585+
return "", errors.New("disk full")
586+
}
587+
588+
ts := &Toolset{}
589+
t.Cleanup(ts.cleanupMediaDir)
590+
591+
data := bytes.Repeat([]byte("x"), maxInlineMediaBytes+1)
592+
result := ts.processMCPContent(callToolResult(&mcp.ImageContent{Data: data, MIMEType: "image/png"}))
593+
594+
require.Len(t, result.Images, 1)
595+
img := result.Images[0]
596+
assert.Empty(t, img.FilePath)
597+
assert.NotEmpty(t, img.Data, "falls back to inline base64 when disk write fails")
598+
}
599+
600+
func TestToolsetStopRemovesMediaDir(t *testing.T) {
601+
ts := &Toolset{}
602+
data := bytes.Repeat([]byte("x"), maxInlineMediaBytes+1)
603+
media := ts.encodeMedia(data, "image/png")
604+
require.NotEmpty(t, media.FilePath)
605+
606+
_, err := os.Stat(media.FilePath)
607+
require.NoError(t, err)
608+
609+
require.NoError(t, ts.Stop(t.Context()))
610+
611+
_, err = os.Stat(media.FilePath)
612+
assert.True(t, os.IsNotExist(err), "spooled media file should be removed by Stop")
613+
}

pkg/tools/tools.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,11 @@ type FunctionCall struct {
7373
// MediaContent represents base64-encoded binary data (image, audio, etc.)
7474
// returned by a tool.
7575
type MediaContent struct {
76-
// Data is the base64-encoded payload.
77-
Data string `json:"data"`
76+
// Data is the base64-encoded payload. It is kept only for small media; large
77+
// MCP payloads are spooled to FilePath to avoid retaining duplicate base64.
78+
Data string `json:"data,omitempty"`
79+
// FilePath is an optional local file containing the decoded media payload.
80+
FilePath string `json:"filePath,omitempty"`
7881
// MimeType identifies the content type (e.g. "image/png", "audio/wav").
7982
MimeType string `json:"mimeType"`
8083
}

0 commit comments

Comments
 (0)