feat: add ReplicatedVoiceConfig to VoiceConfig to enable Gemini TTS voice replication

Google APIs · copybara-github · commit 136201b66f70 · 2025-11-17T13:47:57.000-08:00
PiperOrigin-RevId: 833480721
diff --git a/google/cloud/aiplatform/v1/content.proto b/google/cloud/aiplatform/v1/content.proto
@@ -184,6 +184,69 @@ message VideoMetadata {
       [(google.api.field_behavior) = OPTIONAL];
 }
 
+// Configuration that selects a prebuilt voice by its name.
+message PrebuiltVoiceConfig {
+  // The name of the prebuilt voice to use. `optional` gives explicit presence.
+  optional string voice_name = 1;
+}
+
+// Configuration for replicating a voice from a supplied audio sample.
+message ReplicatedVoiceConfig {
+  // Optional. The MIME type of the voice sample. Currently only
+  // `audio/pcm` is supported, which is raw mono 16-bit signed
+  // little-endian PCM data at a 24 kHz sampling rate.
+  string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The audio sample of the voice to replicate, as raw bytes.
+  bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
+}
+
+
+// Configuration for a voice. At most one voice source is set at a time.
+message VoiceConfig {
+  // The voice source to use. Setting one member clears the other.
+  oneof voice_config {
+    // The configuration for a prebuilt voice, selected by name.
+    PrebuiltVoiceConfig prebuilt_voice_config = 1;
+
+    // Optional. The configuration for a replicated voice. This lets users
+    // replicate a voice from an audio sample.
+    ReplicatedVoiceConfig replicated_voice_config = 3
+        [(google.api.field_behavior) = OPTIONAL];
+  }
+}
+
+// Pairs a speaker name with its voice in a multi-speaker setup.
+message SpeakerVoiceConfig {
+  // Required. The name of the speaker. This should match the speaker name
+  // used in the prompt.
+  string speaker = 1 [(google.api.field_behavior) = REQUIRED];
+
+  // Required. The voice configuration to use for this speaker.
+  VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
+}
+
+// Voice configuration for a multi-speaker text-to-speech request.
+message MultiSpeakerVoiceConfig {
+  // Required. The voice configurations of the speakers, one entry per
+  // speaker. Exactly two speaker voice configurations must be provided.
+  repeated SpeakerVoiceConfig speaker_voice_configs = 2
+      [(google.api.field_behavior) = REQUIRED];
+}
+
+// Configuration for speech generation.
+message SpeechConfig {
+  // The voice to use. Mutually exclusive with `multi_speaker_voice_config`.
+  VoiceConfig voice_config = 1;
+
+  // Optional. The language code (ISO 639-1) for the speech synthesis.
+  string language_code = 2 [(google.api.field_behavior) = OPTIONAL];
+
+  // The configuration for a multi-speaker text-to-speech request.
+  // Mutually exclusive with `voice_config` (not enforced by a `oneof`).
+  MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
+}
+
 // Config for image generation features.
 message ImageConfig {
   // Optional. The desired aspect ratio for the generated images. The following
@@ -347,6 +410,10 @@ message GenerationConfig {
   optional RoutingConfig routing_config = 17
       [(google.api.field_behavior) = OPTIONAL];
 
+  // Optional. The speech generation config.
+  optional SpeechConfig speech_config = 23
+      [(google.api.field_behavior) = OPTIONAL];
+
   // Optional. Config for thinking features.
   // An error will be returned if this field is set for models that don't
   // support thinking.