feat: add ReplicatedVoiceConfig to VoiceConfig to enable Gemini TTS voice replication

Google APIs · copybara-github · commit 15fa97d38ad2 · 2025-11-17T17:21:31.000-08:00
PiperOrigin-RevId: 833560482
diff --git a/google/cloud/aiplatform/v1beta1/content.proto b/google/cloud/aiplatform/v1beta1/content.proto
@@ -190,19 +190,61 @@ message PrebuiltVoiceConfig {
   optional string voice_name = 1;
 }
 
-// The configuration for the voice to use.
+// The configuration for the replicated voice to use.
+message ReplicatedVoiceConfig {
+  // Optional. The MIME type of the voice sample. Currently only
+  // mime_type=audio/pcm is supported: raw mono 16-bit signed
+  // little-endian PCM data with a 24 kHz sampling rate.
+  string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The voice sample audio, in the format given by `mime_type`.
+  bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
+}
+
+// Configuration for a voice.
 message VoiceConfig {
   // The configuration for the speaker to use.
   oneof voice_config {
-    // The configuration for the prebuilt voice to use.
+    // The configuration for a prebuilt voice.
     PrebuiltVoiceConfig prebuilt_voice_config = 1;
+
+    // Optional. The configuration for a replicated voice. This enables users to
+    // replicate a voice from an audio sample.
+    ReplicatedVoiceConfig replicated_voice_config = 3
+        [(google.api.field_behavior) = OPTIONAL];
   }
 }
 
-// The speech generation config.
+// Configuration for a single speaker in a multi-speaker setup.
+message SpeakerVoiceConfig {
+  // Required. The name of the speaker. This should be the same as the speaker
+  // name used in the prompt.
+  string speaker = 1 [(google.api.field_behavior) = REQUIRED];
+
+  // Required. The configuration for the voice of this speaker.
+  VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
+}
+
+// Configuration for a multi-speaker text-to-speech request.
+message MultiSpeakerVoiceConfig {
+  // Required. A list of configurations for the voices of the speakers. Exactly
+  // two speaker voice configurations must be provided.
+  repeated SpeakerVoiceConfig speaker_voice_configs = 2
+      [(google.api.field_behavior) = REQUIRED];
+}
+
+
+// Configuration for speech generation.
 message SpeechConfig {
-  // The configuration for the speaker to use.
+  // The configuration for the voice to use.
   VoiceConfig voice_config = 1;
+
+  // Optional. The language code (ISO 639-1) for the speech synthesis.
+  string language_code = 2 [(google.api.field_behavior) = OPTIONAL];
+
+  // The configuration for a multi-speaker text-to-speech request.
+  // This field is mutually exclusive with `voice_config`.
+  MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
 }
 
 // Config for image generation features.