feat: Support HD voice custom pronunciations

Google APIs · copybara-github · commit 2059f0f65463 · 2025-03-31T08:51:49.000-07:00
docs: A comment for method `StreamingSynthesize` in service `TextToSpeech` is changed
docs: A comment for enum value `OGG_OPUS` in enum `AudioEncoding` is changed
docs: A comment for enum value `PCM` in enum `AudioEncoding` is changed
docs: A comment for field `low_latency_journey_synthesis` in message `.google.cloud.texttospeech.v1.AdvancedVoiceOptions` is changed
docs: A comment for enum value `PHONETIC_ENCODING_IPA` in enum `PhoneticEncoding` is changed
docs: A comment for enum value `PHONETIC_ENCODING_X_SAMPA` in enum `PhoneticEncoding` is changed
docs: A comment for field `phrase` in message `.google.cloud.texttospeech.v1.CustomPronunciationParams` is changed
docs: A comment for field `pronunciations` in message `.google.cloud.texttospeech.v1.CustomPronunciations` is changed
docs: A comment for message `MultiSpeakerMarkup` is changed
docs: A comment for field `custom_pronunciations` in message `.google.cloud.texttospeech.v1.SynthesisInput` is changed
docs: A comment for field `voice_clone` in message `.google.cloud.texttospeech.v1.VoiceSelectionParams` is changed
docs: A comment for field `audio_encoding` in message `.google.cloud.texttospeech.v1.StreamingAudioConfig` is changed
docs: A comment for field `text` in message `.google.cloud.texttospeech.v1.StreamingSynthesisInput` is changed

PiperOrigin-RevId: 742280480
diff --git a/google/cloud/texttospeech/v1/cloud_tts.proto b/google/cloud/texttospeech/v1/cloud_tts.proto
@@ -59,7 +59,7 @@ service TextToSpeech {
     option (google.api.method_signature) = "input,voice,audio_config";
   }
 
-  // Performs bidirectional streaming speech synthesis: receive audio while
+  // Performs bidirectional streaming speech synthesis: receives audio while
   // sending text.
   rpc StreamingSynthesize(stream StreamingSynthesizeRequest)
       returns (stream StreamingSynthesizeResponse) {}
@@ -99,7 +99,7 @@ enum AudioEncoding {
   // MP3 audio at 32kbps.
   MP3 = 2;
 
-  // Opus encoded audio wrapped in an ogg container. The result will be a
+  // Opus encoded audio wrapped in an ogg container. The result is a
   // file which can be played natively on Android, and in browsers (at least
   // Chrome and Firefox). The quality of the encoding is considerably higher
   // than MP3 while using approximately the same bitrate.
@@ -114,7 +114,7 @@ enum AudioEncoding {
   ALAW = 6;
 
   // Uncompressed 16-bit signed little-endian samples (Linear PCM).
-  // Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
+  // Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
   // any other) header.
   PCM = 7;
 }
@@ -157,8 +157,8 @@ message Voice {
 
 // Used for advanced voice options.
 message AdvancedVoiceOptions {
-  // Only for Journey voices. If false, the synthesis will be context aware
-  // and have higher latency.
+  // Only for Journey voices. If false, the synthesis is context aware
+  // and has a higher latency.
   optional bool low_latency_journey_synthesis = 1;
 }
 
@@ -184,18 +184,18 @@ message CustomPronunciationParams {
     // Not specified.
     PHONETIC_ENCODING_UNSPECIFIED = 0;
 
-    // IPA. (e.g. apple -> ˈæpəl )
+    // IPA, such as apple -> ˈæpəl.
     // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
     PHONETIC_ENCODING_IPA = 1;
 
-    // X-SAMPA (e.g. apple -> "{p@l" )
+    // X-SAMPA, such as apple -> "{p@l".
     // https://en.wikipedia.org/wiki/X-SAMPA
     PHONETIC_ENCODING_X_SAMPA = 2;
   }
 
-  // The phrase to which the customization will be applied.
-  // The phrase can be multiple words (in the case of proper nouns etc), but
-  // should not span to a whole sentence.
+  // The phrase to which the customization is applied.
+  // The phrase can be multiple words, such as proper nouns, but shouldn't span
+  // an entire sentence.
   optional string phrase = 1;
 
   // The phonetic encoding of the phrase.
@@ -208,13 +208,13 @@ message CustomPronunciationParams {
 
 // A collection of pronunciation customizations.
 message CustomPronunciations {
-  // The pronunciation customizations to be applied.
+  // The pronunciation customizations to apply.
   repeated CustomPronunciationParams pronunciations = 1;
 }
 
 // A collection of turns for multi-speaker synthesis.
 message MultiSpeakerMarkup {
-  // A Multi-speaker turn.
+  // A multi-speaker turn.
   message Turn {
     // Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
     // to documentation for available speakers.
@@ -250,18 +250,16 @@ message SynthesisInput {
     MultiSpeakerMarkup multi_speaker_markup = 4;
   }
 
-  // Optional. The pronunciation customizations to be applied to the input. If
-  // this is set, the input will be synthesized using the given pronunciation
+  // Optional. The pronunciation customizations to apply to the input. If
+  // this is set, the input is synthesized using the given pronunciation
   // customizations.
   //
-  // The initial support will be for EFIGS (English, French,
-  // Italian, German, Spanish) languages, as provided in
-  // VoiceSelectionParams. Journey and Instant Clone voices are
-  // not supported yet.
+  // The initial support is for en-us, with plans to expand to other locales in
+  // the future. Instant Clone voices aren't supported.
   //
   // In order to customize the pronunciation of a phrase, there must be an exact
   // match of the phrase in the input types. If using SSML, the phrase must not
-  // be inside a phoneme tag (entirely or partially).
+  // be inside a phoneme tag.
   CustomPronunciations custom_pronunciations = 3
       [(google.api.field_behavior) = OPTIONAL];
 }
@@ -299,8 +297,8 @@ message VoiceSelectionParams {
   CustomVoiceParams custom_voice = 4;
 
   // Optional. The configuration for a voice clone. If
-  // [VoiceCloneParams.voice_clone_key] is set, the service will choose the
-  // voice clone matching the specified configuration.
+  // [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
+  // clone matching the specified configuration.
   VoiceCloneParams voice_clone = 5 [(google.api.field_behavior) = OPTIONAL];
 }
 
@@ -309,10 +307,10 @@ message AudioConfig {
   // Required. The format of the audio byte stream.
   AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
 
-  // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
+  // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
   // the normal native speed supported by the specific voice. 2.0 is twice as
   // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
-  // speed. Any other values < 0.25 or > 4.0 will return an error.
+  // speed. Any other values < 0.25 or > 2.0 return an error.
   double speaking_rate = 2 [
     (google.api.field_behavior) = INPUT_ONLY,
     (google.api.field_behavior) = OPTIONAL
@@ -408,12 +406,21 @@ message SynthesizeSpeechResponse {
 // Description of the desired output audio data.
 message StreamingAudioConfig {
   // Required. The format of the audio byte stream.
-  // For now, streaming only supports PCM and OGG_OPUS. All other encodings
-  // will return an error.
+  // Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
+  // return an error.
   AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
 
   // Optional. The synthesis sample rate (in hertz) for this audio.
   int32 sample_rate_hertz = 2 [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
+  // the normal native speed supported by the specific voice. 2.0 is twice as
+  // fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
+  // speed. Any other values < 0.25 or > 2.0 return an error.
+  double speaking_rate = 3 [
+    (google.api.field_behavior) = INPUT_ONLY,
+    (google.api.field_behavior) = OPTIONAL
+  ];
 }
 
 // Provides configuration information for the StreamingSynthesize request.
@@ -424,15 +431,27 @@ message StreamingSynthesizeConfig {
   // Optional. The configuration of the synthesized audio.
   StreamingAudioConfig streaming_audio_config = 4
       [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The pronunciation customizations to apply to the input. If
+  // this is set, the input is synthesized using the given pronunciation
+  // customizations.
+  //
+  // The initial support is for en-us, with plans to expand to other locales in
+  // the future. Instant Clone voices aren't supported.
+  //
+  // In order to customize the pronunciation of a phrase, there must be an exact
+  // match of the phrase in the input types. If using SSML, the phrase must not
+  // be inside a phoneme tag.
+  CustomPronunciations custom_pronunciations = 5
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Input to be synthesized.
 message StreamingSynthesisInput {
   oneof input_source {
     // The raw text to be synthesized. It is recommended that each input
-    // contains complete, terminating sentences, as this will likely result in
-    // better prosody in the output audio. That being said, users are free to
-    // input text however they please.
+    // contains complete, terminating sentences, which results in better prosody
+    // in the output audio.
     string text = 1;
   }
 }