Skip to content

Commit 136201b

Browse files
Google APIs authored and
copybara-github committed
feat: add ReplicatedVoiceConfig to VoiceConfig to enable Gemini TTS voice replication
PiperOrigin-RevId: 833480721
1 parent 7418c8b commit 136201b

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

google/cloud/aiplatform/v1/content.proto

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,69 @@ message VideoMetadata {
184184
[(google.api.field_behavior) = OPTIONAL];
185185
}
186186

187+
// Configuration for a prebuilt voice, selected by name.
188+
message PrebuiltVoiceConfig {
189+
// The name of the prebuilt voice to use.
// Declared `optional`, so the field has explicit presence: an unset name can
// be distinguished from an empty string.
190+
optional string voice_name = 1;
191+
}
192+
193+
// Configuration for a replicated voice, i.e. a voice reproduced from a
// caller-supplied audio sample.
194+
message ReplicatedVoiceConfig {
195+
// Optional. The MIME type of the voice sample. Currently only
196+
// `audio/pcm` is supported, which is raw mono 16-bit signed
197+
// little-endian PCM data with a 24 kHz sampling rate.
198+
string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
199+
200+
// Optional. The audio sample of the voice to replicate, as raw bytes in the
// format described by `mime_type`.
201+
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
202+
}
203+
204+
205+
// Configuration for a voice: either a prebuilt voice or a replicated voice.
206+
message VoiceConfig {
207+
// The configuration for the speaker to use. At most one member of this oneof
// can be set; setting one clears the other.
// NOTE(review): field number 2 is not used in this message. If it belonged
// to a removed field it should be declared `reserved` - confirm.
208+
oneof voice_config {
209+
// The configuration for a prebuilt voice.
210+
PrebuiltVoiceConfig prebuilt_voice_config = 1;
211+
212+
// Optional. The configuration for a replicated voice. This enables users to
213+
// replicate a voice from an audio sample.
214+
ReplicatedVoiceConfig replicated_voice_config = 3
215+
[(google.api.field_behavior) = OPTIONAL];
216+
}
217+
}
218+
219+
// Configuration for a single speaker in a multi-speaker setup: pairs a speaker
// name with the voice used for that speaker.
220+
message SpeakerVoiceConfig {
221+
// Required. The name of the speaker. This should match the speaker name
222+
// used in the prompt.
223+
string speaker = 1 [(google.api.field_behavior) = REQUIRED];
224+
225+
// Required. The configuration for the voice of this speaker.
226+
VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
227+
}
228+
229+
// Configuration for a multi-speaker text-to-speech request.
// NOTE(review): the only field uses number 2 and number 1 is unused. If 1
// belonged to a removed field it should be declared `reserved` - confirm.
230+
message MultiSpeakerVoiceConfig {
231+
// Required. A list of configurations for the voices of the speakers. Exactly
232+
// two speaker voice configurations must be provided.
// The "exactly two" constraint is not expressed in the schema (the field is a
// plain `repeated`), so it has to be validated outside this definition.
233+
repeated SpeakerVoiceConfig speaker_voice_configs = 2
234+
[(google.api.field_behavior) = REQUIRED];
235+
}
236+
237+
// Configuration for speech generation: a single voice or a multi-speaker
// setup, plus an optional language code.
238+
message SpeechConfig {
239+
// The configuration for the voice to use.
// This field is mutually exclusive with `multi_speaker_voice_config`.
240+
VoiceConfig voice_config = 1;
241+
242+
// Optional. The language code (ISO 639-1) for the speech synthesis.
243+
string language_code = 2 [(google.api.field_behavior) = OPTIONAL];
244+
245+
// The configuration for a multi-speaker text-to-speech request.
246+
// This field is mutually exclusive with `voice_config`.
// The two fields are not members of a oneof, so the schema itself does not
// enforce this exclusivity.
247+
MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
248+
}
249+
187250
// Config for image generation features.
188251
message ImageConfig {
189252
// Optional. The desired aspect ratio for the generated images. The following
@@ -347,6 +410,10 @@ message GenerationConfig {
347410
optional RoutingConfig routing_config = 17
348411
[(google.api.field_behavior) = OPTIONAL];
349412

413+
// Optional. The speech generation config.
414+
optional SpeechConfig speech_config = 23
415+
[(google.api.field_behavior) = OPTIONAL];
416+
350417
// Optional. Config for thinking features.
351418
// An error will be returned if this field is set for models that don't
352419
// support thinking.

0 commit comments

Comments
 (0)