@@ -302,10 +302,17 @@ func convertResponsesInputMap(item map[string]interface{}, index int) (core.Mess
302302 if callID == "" {
303303 return core.Message {}, "" , core .NewInvalidRequestError (fmt .Sprintf ("invalid responses input item at index %d: function_call_output call_id is required" , index ), nil )
304304 }
305+ content , err := stringifyResponsesInputValueWithError (item ["output" ])
306+ if err != nil {
307+ return core.Message {}, "" , core .NewInvalidRequestError (
308+ fmt .Sprintf ("invalid responses input item at index %d: function_call_output.output must be JSON-serializable" , index ),
309+ err ,
310+ )
311+ }
305312 return core.Message {
306313 Role : "tool" ,
307314 ToolCallID : callID ,
308- Content : stringifyResponsesInputValue ( item [ "output" ]) ,
315+ Content : content ,
309316 ExtraFields : rawJSONMapFromUnknownKeys (item , "type" , "call_id" , "status" , "output" ),
310317 }, "function_call_output" , nil
311318 }
@@ -488,28 +495,37 @@ func normalizeTypedResponsesContentPart(part core.ContentPart) (core.ContentPart
488495 ExtraFields : core .CloneRawJSONMap (part .ExtraFields ),
489496 }, true
490497 case "image_url" , "input_image" :
491- if part .ImageURL == nil || part .ImageURL .URL == "" {
498+ if part .ImageURL == nil {
499+ return core.ContentPart {}, false
500+ }
501+ url := strings .TrimSpace (part .ImageURL .URL )
502+ if url == "" {
492503 return core.ContentPart {}, false
493504 }
494505 return core.ContentPart {
495506 Type : "image_url" ,
496507 ImageURL : & core.ImageURLContent {
497- URL : part . ImageURL . URL ,
498- Detail : part .ImageURL .Detail ,
499- MediaType : part .ImageURL .MediaType ,
508+ URL : url ,
509+ Detail : strings . TrimSpace ( part .ImageURL .Detail ) ,
510+ MediaType : strings . TrimSpace ( part .ImageURL .MediaType ) ,
500511 ExtraFields : core .CloneRawJSONMap (part .ImageURL .ExtraFields ),
501512 },
502513 ExtraFields : core .CloneRawJSONMap (part .ExtraFields ),
503514 }, true
504515 case "input_audio" :
505- if part .InputAudio == nil || part .InputAudio .Data == "" || part .InputAudio .Format == "" {
516+ if part .InputAudio == nil {
517+ return core.ContentPart {}, false
518+ }
519+ data := strings .TrimSpace (part .InputAudio .Data )
520+ format := strings .TrimSpace (part .InputAudio .Format )
521+ if data == "" || format == "" {
506522 return core.ContentPart {}, false
507523 }
508524 return core.ContentPart {
509525 Type : "input_audio" ,
510526 InputAudio : & core.InputAudioContent {
511- Data : part . InputAudio . Data ,
512- Format : part . InputAudio . Format ,
527+ Data : data ,
528+ Format : format ,
513529 ExtraFields : core .CloneRawJSONMap (part .InputAudio .ExtraFields ),
514530 },
515531 ExtraFields : core .CloneRawJSONMap (part .ExtraFields ),
@@ -550,32 +566,34 @@ func canFlattenResponsesPartsToText(parts []core.ContentPart) bool {
550566func normalizeResponsesImageURLForChat (value interface {}) (* core.ImageURLContent , bool ) {
551567 switch v := value .(type ) {
552568 case string :
553- if v == "" {
569+ url := strings .TrimSpace (v )
570+ if url == "" {
554571 return nil , false
555572 }
556- return & core.ImageURLContent {URL : v }, true
573+ return & core.ImageURLContent {URL : url }, true
557574 case map [string ]string :
558- url := v ["url" ]
575+ url := strings . TrimSpace ( v ["url" ])
559576 if url == "" {
560577 return nil , false
561578 }
562579 return & core.ImageURLContent {
563580 URL : url ,
564- Detail : v ["detail" ],
565- MediaType : v ["media_type" ],
581+ Detail : strings . TrimSpace ( v ["detail" ]) ,
582+ MediaType : strings . TrimSpace ( v ["media_type" ]) ,
566583 ExtraFields : rawJSONMapFromUnknownStringKeys (v , "url" , "detail" , "media_type" ),
567584 }, true
568585 case map [string ]interface {}:
569586 url , _ := v ["url" ].(string )
587+ url = strings .TrimSpace (url )
570588 if url == "" {
571589 return nil , false
572590 }
573591 detail , _ := v ["detail" ].(string )
574592 mediaType , _ := v ["media_type" ].(string )
575593 return & core.ImageURLContent {
576594 URL : url ,
577- Detail : detail ,
578- MediaType : mediaType ,
595+ Detail : strings . TrimSpace ( detail ) ,
596+ MediaType : strings . TrimSpace ( mediaType ) ,
579597 ExtraFields : rawJSONMapFromUnknownKeys (v , "url" , "detail" , "media_type" ),
580598 }, true
581599 default :
@@ -586,8 +604,8 @@ func normalizeResponsesImageURLForChat(value interface{}) (*core.ImageURLContent
586604func normalizeResponsesInputAudioForChat (value interface {}) (* core.InputAudioContent , bool ) {
587605 switch v := value .(type ) {
588606 case map [string ]string :
589- data := v ["data" ]
590- format := v ["format" ]
607+ data := strings . TrimSpace ( v ["data" ])
608+ format := strings . TrimSpace ( v ["format" ])
591609 if data == "" || format == "" {
592610 return nil , false
593611 }
@@ -599,6 +617,8 @@ func normalizeResponsesInputAudioForChat(value interface{}) (*core.InputAudioCon
599617 case map [string ]interface {}:
600618 data , _ := v ["data" ].(string )
601619 format , _ := v ["format" ].(string )
620+ data = strings .TrimSpace (data )
621+ format = strings .TrimSpace (format )
602622 if data == "" || format == "" {
603623 return nil , false
604624 }
@@ -684,17 +704,25 @@ func firstNonEmptyString(item map[string]interface{}, keys ...string) string {
684704}
685705
686706func stringifyResponsesInputValue (value interface {}) string {
707+ encoded , err := stringifyResponsesInputValueWithError (value )
708+ if err != nil {
709+ return ""
710+ }
711+ return encoded
712+ }
713+
// stringifyResponsesInputValueWithError converts an arbitrary input value
// into its string form.
//
//   - nil yields "" with no error.
//   - A string is returned unchanged (it is not re-quoted as JSON).
//   - Any other value is encoded with json.Marshal and the resulting JSON
//     text is returned.
//
// When json.Marshal fails, the result is "" together with the marshal error
// so the caller can decide how to report it.
func stringifyResponsesInputValueWithError(value interface{}) (string, error) {
	switch v := value.(type) {
	case nil:
		return "", nil
	case string:
		return v, nil
	default:
		encoded, err := json.Marshal(v)
		if err != nil {
			return "", err
		}
		return string(encoded), nil
	}
}
700728
@@ -812,26 +840,37 @@ func buildResponsesContentItemsFromParts(parts []core.ContentPart) []core.Respon
812840 Annotations : []string {},
813841 })
814842 case "image_url" :
815- if part .ImageURL == nil || part .ImageURL .URL == "" {
843+ if part .ImageURL == nil {
844+ continue
845+ }
846+ url := strings .TrimSpace (part .ImageURL .URL )
847+ if url == "" {
816848 continue
817849 }
818850 items = append (items , core.ResponsesContentItem {
819851 Type : "input_image" ,
820852 ImageURL : & core.ImageURLContent {
821- URL : part .ImageURL .URL ,
822- Detail : part .ImageURL .Detail ,
823- MediaType : part .ImageURL .MediaType ,
853+ URL : url ,
854+ Detail : strings .TrimSpace (part .ImageURL .Detail ),
855+ MediaType : strings .TrimSpace (part .ImageURL .MediaType ),
856+ ExtraFields : core .CloneRawJSONMap (part .ImageURL .ExtraFields ),
824857 },
825858 })
826859 case "input_audio" :
827- if part .InputAudio == nil || part .InputAudio .Data == "" || part .InputAudio .Format == "" {
860+ if part .InputAudio == nil {
861+ continue
862+ }
863+ data := strings .TrimSpace (part .InputAudio .Data )
864+ format := strings .TrimSpace (part .InputAudio .Format )
865+ if data == "" || format == "" {
828866 continue
829867 }
830868 items = append (items , core.ResponsesContentItem {
831869 Type : "input_audio" ,
832870 InputAudio : & core.InputAudioContent {
833- Data : part .InputAudio .Data ,
834- Format : part .InputAudio .Format ,
871+ Data : data ,
872+ Format : format ,
873+ ExtraFields : core .CloneRawJSONMap (part .InputAudio .ExtraFields ),
835874 },
836875 })
837876 }
0 commit comments