Skip to content

Commit 15fa97d

Browse files
Google APIs authored and copybara-github committed
feat: add ReplicatedVoiceConfig to VoiceConfig to enable Gemini TTS voice replication
PiperOrigin-RevId: 833560482
1 parent 136201b commit 15fa97d

File tree

1 file changed

+46
-4
lines changed

1 file changed

+46
-4
lines changed

google/cloud/aiplatform/v1beta1/content.proto

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,61 @@ message PrebuiltVoiceConfig {
190190
optional string voice_name = 1;
191191
}
192192

193-
// The configuration for the voice to use.
193+
// The configuration for the replicated voice to use.
194+
message ReplicatedVoiceConfig {
195+
// Optional. The MIME type of the voice sample. Currently only
196+
// `audio/pcm` is supported: raw mono 16-bit signed little-endian PCM
197+
// data at a 24 kHz sampling rate.
198+
string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
199+

200+
// Optional. The audio sample of the custom voice, in the `mime_type` format.
201+
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
202+
}
203+
204+
// Configuration for a voice.
194205
message VoiceConfig {
195206
// The voice source to use. At most one of the following may be set.
196207
oneof voice_config {
197-
// The configuration for the prebuilt voice to use.
208+
// The configuration for a prebuilt voice.
198209
PrebuiltVoiceConfig prebuilt_voice_config = 1;
210+

211+
// Optional. The configuration for a replicated voice, which lets users
212+
// replicate a voice from an audio sample.
213+
ReplicatedVoiceConfig replicated_voice_config = 3
214+
[(google.api.field_behavior) = OPTIONAL];
199215
}
200216
}
201217

202-
// The speech generation config.
218+
// Configuration for a single speaker in a multi-speaker setup.
219+
message SpeakerVoiceConfig {
220+
// Required. The name of the speaker. This should match the speaker name
221+
// used in the prompt.
222+
string speaker = 1 [(google.api.field_behavior) = REQUIRED];
223+

224+
// Required. The voice configuration to use for this speaker.
225+
VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
226+
}
227+
228+
// Configuration for a multi-speaker text-to-speech request.
229+
message MultiSpeakerVoiceConfig {
230+
// Required. The voice configurations, one entry per speaker. Exactly two
231+
// speaker voice configurations must be provided.
232+
repeated SpeakerVoiceConfig speaker_voice_configs = 2
233+
[(google.api.field_behavior) = REQUIRED];
234+
}
235+
236+
237+
// Configuration for speech generation.
203238
message SpeechConfig {
204-
// The configuration for the speaker to use.
239+
// Voice config to use. Mutually exclusive with `multi_speaker_voice_config`.
205240
VoiceConfig voice_config = 1;
241+

242+
// Optional. The ISO 639-1 language code to use for speech synthesis.
243+
string language_code = 2 [(google.api.field_behavior) = OPTIONAL];
244+

245+
// The configuration for a multi-speaker text-to-speech request.
246+
// This field is mutually exclusive with `voice_config`.
247+
MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
206248
}
207249

208250
// Config for image generation features.

0 commit comments

Comments
 (0)