@@ -59,7 +59,7 @@ service TextToSpeech {
5959 option (google.api.method_signature ) = "input,voice,audio_config" ;
6060 }
6161
62- // Performs bidirectional streaming speech synthesis: receive audio while
62+ // Performs bidirectional streaming speech synthesis: receives audio while
6363 // sending text.
6464 rpc StreamingSynthesize (stream StreamingSynthesizeRequest )
6565 returns (stream StreamingSynthesizeResponse ) {}
@@ -99,7 +99,7 @@ enum AudioEncoding {
9999 // MP3 audio at 32kbps.
100100 MP3 = 2 ;
101101
102- // Opus encoded audio wrapped in an ogg container. The result will be a
102+ // Opus encoded audio wrapped in an ogg container. The result is a
103103 // file which can be played natively on Android, and in browsers (at least
104104 // Chrome and Firefox). The quality of the encoding is considerably higher
105105 // than MP3 while using approximately the same bitrate.
@@ -114,7 +114,7 @@ enum AudioEncoding {
114114 ALAW = 6 ;
115115
116116 // Uncompressed 16-bit signed little-endian samples (Linear PCM).
117- // Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
117+ // Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
118118 // any other) header.
119119 PCM = 7 ;
120120}
@@ -157,8 +157,8 @@ message Voice {
157157
158158// Used for advanced voice options.
159159message AdvancedVoiceOptions {
160- // Only for Journey voices. If false, the synthesis will be context aware
161- // and have higher latency.
160+ // Only for Journey voices. If false, the synthesis is context aware
161+ // and has a higher latency.
162162 optional bool low_latency_journey_synthesis = 1 ;
163163}
164164
@@ -184,18 +184,18 @@ message CustomPronunciationParams {
184184 // Not specified.
185185 PHONETIC_ENCODING_UNSPECIFIED = 0 ;
186186
187- // IPA. (e.g. apple -> ˈæpəl)
187+ // IPA, such as apple -> ˈæpəl.
188188 // https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
189189 PHONETIC_ENCODING_IPA = 1 ;
190190
191- // X-SAMPA (e.g. apple -> "{p@l")
191+ // X-SAMPA, such as apple -> "{p@l".
192192 // https://en.wikipedia.org/wiki/X-SAMPA
193193 PHONETIC_ENCODING_X_SAMPA = 2 ;
194194 }
195195
196- // The phrase to which the customization will be applied.
197- // The phrase can be multiple words (in the case of proper nouns etc), but
198- // should not span to a whole sentence.
196+ // The phrase to which the customization is applied.
197+ // The phrase can be multiple words, such as proper nouns, but shouldn't span
198+ // an entire sentence.
199199 optional string phrase = 1 ;
200200
201201 // The phonetic encoding of the phrase.
@@ -208,13 +208,13 @@ message CustomPronunciationParams {
208208
209209// A collection of pronunciation customizations.
210210message CustomPronunciations {
211- // The pronunciation customizations to be applied.
211+ // The pronunciation customizations to apply.
212212 repeated CustomPronunciationParams pronunciations = 1 ;
213213}
214214
215215// A collection of turns for multi-speaker synthesis.
216216message MultiSpeakerMarkup {
217- // A Multi-speaker turn.
217+ // A multi-speaker turn.
218218 message Turn {
219219 // Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
220220 // to documentation for available speakers.
@@ -250,18 +250,16 @@ message SynthesisInput {
250250 MultiSpeakerMarkup multi_speaker_markup = 4 ;
251251 }
252252
253- // Optional. The pronunciation customizations to be applied to the input. If
254- // this is set, the input will be synthesized using the given pronunciation
253+ // Optional. The pronunciation customizations are applied to the input. If
254+ // this is set, the input is synthesized using the given pronunciation
255255 // customizations.
256256 //
257- // The initial support will be for EFIGS (English, French,
258- // Italian, German, Spanish) languages, as provided in
259- // VoiceSelectionParams. Journey and Instant Clone voices are
260- // not supported yet.
257+ // The initial support is for en-us, with plans to expand to other locales in
258+ // the future. Instant Clone voices aren't supported.
261259 //
262260 // In order to customize the pronunciation of a phrase, there must be an exact
263261 // match of the phrase in the input types. If using SSML, the phrase must not
264- // be inside a phoneme tag (entirely or partially).
262+ // be inside a phoneme tag.
265263 CustomPronunciations custom_pronunciations = 3
266264 [(google.api.field_behavior ) = OPTIONAL ];
267265}
@@ -299,8 +297,8 @@ message VoiceSelectionParams {
299297 CustomVoiceParams custom_voice = 4 ;
300298
301299 // Optional. The configuration for a voice clone. If
302- // [VoiceCloneParams.voice_clone_key] is set, the service will choose the
303- // voice clone matching the specified configuration.
300+ // [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
301+ // clone matching the specified configuration.
304302 VoiceCloneParams voice_clone = 5 [(google.api.field_behavior ) = OPTIONAL ];
305303}
306304
@@ -309,10 +307,10 @@ message AudioConfig {
309307 // Required. The format of the audio byte stream.
310308 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
311309
312- // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
310+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
313311 // the normal native speed supported by the specific voice. 2.0 is twice as
314312 // fast, and 0.5 is half as fast. If unset (0.0), defaults to the native 1.0
315- // speed. Any other values < 0.25 or > 4.0 will return an error.
313+ // speed. Any other values < 0.25 or > 2.0 will return an error.
316314 double speaking_rate = 2 [
317315 (google.api.field_behavior ) = INPUT_ONLY ,
318316 (google.api.field_behavior ) = OPTIONAL
@@ -408,12 +406,21 @@ message SynthesizeSpeechResponse {
408406// Description of the desired output audio data.
409407message StreamingAudioConfig {
410408 // Required. The format of the audio byte stream.
411- // For now, streaming only supports PCM and OGG_OPUS. All other encodings
412- // will return an error.
409+ // Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
410+ // return an error.
413411 AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
414412
415413 // Optional. The synthesis sample rate (in hertz) for this audio.
416414 int32 sample_rate_hertz = 2 [(google.api.field_behavior ) = OPTIONAL ];
415+
416+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
417+ // the normal native speed supported by the specific voice. 2.0 is twice as
418+ // fast, and 0.5 is half as fast. If unset (0.0), defaults to the native 1.0
419+ // speed. Any other values < 0.25 or > 2.0 will return an error.
420+ double speaking_rate = 3 [
421+ (google.api.field_behavior ) = INPUT_ONLY ,
422+ (google.api.field_behavior ) = OPTIONAL
423+ ];
417424}
418425
419426// Provides configuration information for the StreamingSynthesize request.
@@ -424,15 +431,27 @@ message StreamingSynthesizeConfig {
424431 // Optional. The configuration of the synthesized audio.
425432 StreamingAudioConfig streaming_audio_config = 4
426433 [(google.api.field_behavior ) = OPTIONAL ];
434+
435+ // Optional. The pronunciation customizations are applied to the input. If
436+ // this is set, the input is synthesized using the given pronunciation
437+ // customizations.
438+ //
439+ // The initial support is for en-us, with plans to expand to other locales in
440+ // the future. Instant Clone voices aren't supported.
441+ //
442+ // In order to customize the pronunciation of a phrase, there must be an exact
443+ // match of the phrase in the input types. If using SSML, the phrase must not
444+ // be inside a phoneme tag.
445+ CustomPronunciations custom_pronunciations = 5
446+ [(google.api.field_behavior ) = OPTIONAL ];
427447}
428448
429449// Input to be synthesized.
430450message StreamingSynthesisInput {
431451 oneof input_source {
432452 // The raw text to be synthesized. It is recommended that each input
433- // contains complete, terminating sentences, as this will likely result in
434- // better prosody in the output audio. That being said, users are free to
435- // input text however they please.
453+ // contains complete, terminating sentences, which results in better prosody
454+ // in the output audio.
436455 string text = 1 ;
437456 }
438457}
0 commit comments