@@ -190,19 +190,61 @@ message PrebuiltVoiceConfig {
190190 optional string voice_name = 1 ;
191191}
192192
193- // The configuration for the voice to use.
// The configuration for the replicated voice to use. A replicated voice is
// described by an audio sample of the voice to reproduce.
message ReplicatedVoiceConfig {
  // Optional. The MIME type of the voice sample. Currently only
  // `audio/pcm` is supported, which is raw mono 16-bit signed little-endian
  // PCM data with a 24 kHz sampling rate.
  string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The audio sample of the custom voice.
  bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
}
203+
// Configuration for a voice.
message VoiceConfig {
  // The configuration for the speaker to use. The members below share a
  // oneof, so at most one of them can be set at a time.
  oneof voice_config {
    // The configuration for a prebuilt voice.
    PrebuiltVoiceConfig prebuilt_voice_config = 1;

    // NOTE(review): field number 2 is not used in this message; confirm it
    // is intentionally skipped (and reserved if it was ever used).

    // Optional. The configuration for a replicated voice. This enables users
    // to replicate a voice from an audio sample.
    ReplicatedVoiceConfig replicated_voice_config = 3
        [(google.api.field_behavior) = OPTIONAL];
  }
}
201217
202- // The speech generation config.
// Configuration for a single speaker in a multi-speaker setup. Pairs a
// speaker name with the voice that speaker should use.
message SpeakerVoiceConfig {
  // Required. The name of the speaker. This should be the same as the
  // speaker name used in the prompt.
  string speaker = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The configuration for the voice of this speaker.
  VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
}
227+
// Configuration for a multi-speaker text-to-speech request.
message MultiSpeakerVoiceConfig {
  // Required. A list of configurations for the voices of the speakers.
  // Exactly two speaker voice configurations must be provided. The schema
  // itself does not limit the length of this repeated field, so the count is
  // a documented constraint only.
  repeated SpeakerVoiceConfig speaker_voice_configs = 2
      [(google.api.field_behavior) = REQUIRED];
}
235+
236+
// Configuration for speech generation.
message SpeechConfig {
  // The configuration for the voice to use.
  VoiceConfig voice_config = 1;

  // Optional. The language code (ISO 639-1) for the speech synthesis.
  string language_code = 2 [(google.api.field_behavior) = OPTIONAL];

  // The configuration for a multi-speaker text-to-speech request.
  // This field is mutually exclusive with `voice_config`. The two fields are
  // plain siblings rather than members of a oneof, so the schema does not
  // enforce the exclusivity.
  MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
}
207249
208250// Config for image generation features.
0 commit comments