@@ -184,6 +184,69 @@ message VideoMetadata {
184184 [(google.api.field_behavior ) = OPTIONAL ];
185185}
186186
// Configuration for a prebuilt voice.
message PrebuiltVoiceConfig {
  // The name of the prebuilt voice to use.
  //
  // Declared `optional`, so the field tracks explicit presence: an unset name
  // can be told apart from an empty string.
  optional string voice_name = 1;
}
192+
// Configuration for a replicated voice, i.e. a voice described by an audio
// sample supplied in the request.
message ReplicatedVoiceConfig {
  // Optional. The MIME type of the voice sample. Currently only `audio/pcm`
  // is supported, which is raw mono 16-bit signed little-endian PCM data at a
  // 24 kHz sampling rate.
  string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The audio sample of the custom voice, encoded as described by
  // `mime_type`.
  bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
}
203+
204+
// Configuration for a voice.
message VoiceConfig {
  // The configuration for the speaker to use. At most one of the members
  // below can be set; setting one clears the other.
  //
  // NOTE(review): field number 2 is not used by this oneof. Confirm whether it
  // is held back for another member before assigning it.
  oneof voice_config {
    // The configuration for a prebuilt voice.
    PrebuiltVoiceConfig prebuilt_voice_config = 1;

    // Optional. The configuration for a replicated voice. This enables users
    // to replicate a voice from an audio sample.
    ReplicatedVoiceConfig replicated_voice_config = 3
        [(google.api.field_behavior) = OPTIONAL];
  }
}
218+
// Configuration for a single speaker in a multi-speaker setup. Pairs a
// speaker name with the voice that should be used for that speaker.
message SpeakerVoiceConfig {
  // Required. The name of the speaker. This should be the same as the speaker
  // name used in the prompt.
  string speaker = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The configuration for the voice of this speaker.
  VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
}
228+
// Configuration for a multi-speaker text-to-speech request.
message MultiSpeakerVoiceConfig {
  // Required. A list of configurations for the voices of the speakers, one
  // entry per speaker. Exactly two speaker voice configurations must be
  // provided.
  //
  // The "exactly two" rule is a documented constraint only; a `repeated` field
  // cannot express it in the schema.
  repeated SpeakerVoiceConfig speaker_voice_configs = 2
      [(google.api.field_behavior) = REQUIRED];
}
236+
// Configuration for speech generation.
//
// `voice_config` and `multi_speaker_voice_config` are documented as mutually
// exclusive, but they are separate fields rather than members of a `oneof`,
// so the schema itself does not prevent both from being set.
message SpeechConfig {
  // The configuration for the voice to use.
  // This field is mutually exclusive with `multi_speaker_voice_config`.
  VoiceConfig voice_config = 1;

  // Optional. The language code (ISO 639-1) for the speech synthesis.
  string language_code = 2 [(google.api.field_behavior) = OPTIONAL];

  // The configuration for a multi-speaker text-to-speech request.
  // This field is mutually exclusive with `voice_config`.
  MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
}
249+
187250// Config for image generation features.
188251message ImageConfig {
189252 // Optional. The desired aspect ratio for the generated images. The following
@@ -347,6 +410,10 @@ message GenerationConfig {
347410 optional RoutingConfig routing_config = 17
348411 [(google.api.field_behavior ) = OPTIONAL ];
349412
413+ // Optional. The speech generation config.
414+ optional SpeechConfig speech_config = 23
415+ [(google.api.field_behavior ) = OPTIONAL ];
416+
350417 // Optional. Config for thinking features.
351418 // An error will be returned if this field is set for models that don't
352419 // support thinking.
0 commit comments