Skip to content

Commit 2059f0f

Browse files
Google APIs authored and copybara-github committed
feat: Support HD voice custom pronunciations
docs: A comment for method `StreamingSynthesize` in service `TextToSpeech` is changed
docs: A comment for enum value `OGG_OPUS` in enum `AudioEncoding` is changed
docs: A comment for enum value `PCM` in enum `AudioEncoding` is changed
docs: A comment for field `low_latency_journey_synthesis` in message `.google.cloud.texttospeech.v1.AdvancedVoiceOptions` is changed
docs: A comment for enum value `PHONETIC_ENCODING_IPA` in enum `PhoneticEncoding` is changed
docs: A comment for enum value `PHONETIC_ENCODING_X_SAMPA` in enum `PhoneticEncoding` is changed
docs: A comment for field `phrase` in message `.google.cloud.texttospeech.v1.CustomPronunciationParams` is changed
docs: A comment for field `pronunciations` in message `.google.cloud.texttospeech.v1.CustomPronunciations` is changed
docs: A comment for message `MultiSpeakerMarkup` is changed
docs: A comment for field `custom_pronunciations` in message `.google.cloud.texttospeech.v1.SynthesisInput` is changed
docs: A comment for field `voice_clone` in message `.google.cloud.texttospeech.v1.VoiceSelectionParams` is changed
docs: A comment for field `audio_encoding` in message `.google.cloud.texttospeech.v1.StreamingAudioConfig` is changed
docs: A comment for field `text` in message `.google.cloud.texttospeech.v1.StreamingSynthesisInput` is changed

PiperOrigin-RevId: 742280480
1 parent 2b63b92 commit 2059f0f

1 file changed

Lines changed: 47 additions & 28 deletions

File tree

google/cloud/texttospeech/v1/cloud_tts.proto

Lines changed: 47 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ service TextToSpeech {
5959
option (google.api.method_signature) = "input,voice,audio_config";
6060
}
6161

62-
// Performs bidirectional streaming speech synthesis: receive audio while
62+
// Performs bidirectional streaming speech synthesis: receives audio while
6363
// sending text.
6464
rpc StreamingSynthesize(stream StreamingSynthesizeRequest)
6565
returns (stream StreamingSynthesizeResponse) {}
@@ -99,7 +99,7 @@ enum AudioEncoding {
9999
// MP3 audio at 32kbps.
100100
MP3 = 2;
101101

102-
// Opus encoded audio wrapped in an ogg container. The result will be a
102+
// Opus encoded audio wrapped in an ogg container. The result is a
103103
// file which can be played natively on Android, and in browsers (at least
104104
// Chrome and Firefox). The quality of the encoding is considerably higher
105105
// than MP3 while using approximately the same bitrate.
@@ -114,7 +114,7 @@ enum AudioEncoding {
114114
ALAW = 6;
115115

116116
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
117-
// Note that as opposed to LINEAR16, audio will not be wrapped in a WAV (or
117+
// Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or
118118
// any other) header.
119119
PCM = 7;
120120
}
@@ -157,8 +157,8 @@ message Voice {
157157

158158
// Used for advanced voice options.
159159
message AdvancedVoiceOptions {
160-
// Only for Journey voices. If false, the synthesis will be context aware
161-
// and have higher latency.
160+
// Only for Journey voices. If false, the synthesis is context aware
161+
// and has a higher latency.
162162
optional bool low_latency_journey_synthesis = 1;
163163
}
164164

@@ -184,18 +184,18 @@ message CustomPronunciationParams {
184184
// Not specified.
185185
PHONETIC_ENCODING_UNSPECIFIED = 0;
186186

187-
// IPA. (e.g. apple -> ˈæpəl )
187+
// IPA, such as apple -> ˈæpəl.
188188
// https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
189189
PHONETIC_ENCODING_IPA = 1;
190190

191-
// X-SAMPA (e.g. apple -> "{p@l" )
191+
// X-SAMPA, such as apple -> "{p@l".
192192
// https://en.wikipedia.org/wiki/X-SAMPA
193193
PHONETIC_ENCODING_X_SAMPA = 2;
194194
}
195195

196-
// The phrase to which the customization will be applied.
197-
// The phrase can be multiple words (in the case of proper nouns etc), but
198-
// should not span to a whole sentence.
196+
// The phrase to which the customization is applied.
197+
// The phrase can be multiple words, such as proper nouns, but shouldn't span
198+
// the length of the sentence.
199199
optional string phrase = 1;
200200

201201
// The phonetic encoding of the phrase.
@@ -208,13 +208,13 @@ message CustomPronunciationParams {
208208

209209
// A collection of pronunciation customizations.
210210
message CustomPronunciations {
211-
// The pronunciation customizations to be applied.
211+
// The pronunciation customizations to apply.
212212
repeated CustomPronunciationParams pronunciations = 1;
213213
}
214214

215215
// A collection of turns for multi-speaker synthesis.
216216
message MultiSpeakerMarkup {
217-
// A Multi-speaker turn.
217+
// A multi-speaker turn.
218218
message Turn {
219219
// Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer
220220
// to documentation for available speakers.
@@ -250,18 +250,16 @@ message SynthesisInput {
250250
MultiSpeakerMarkup multi_speaker_markup = 4;
251251
}
252252

253-
// Optional. The pronunciation customizations to be applied to the input. If
254-
// this is set, the input will be synthesized using the given pronunciation
253+
// Optional. The pronunciation customizations are applied to the input. If
254+
// this is set, the input is synthesized using the given pronunciation
255255
// customizations.
256256
//
257-
// The initial support will be for EFIGS (English, French,
258-
// Italian, German, Spanish) languages, as provided in
259-
// VoiceSelectionParams. Journey and Instant Clone voices are
260-
// not supported yet.
257+
// The initial support is for en-us, with plans to expand to other locales in
258+
// the future. Instant Clone voices aren't supported.
261259
//
262260
// In order to customize the pronunciation of a phrase, there must be an exact
263261
// match of the phrase in the input types. If using SSML, the phrase must not
264-
// be inside a phoneme tag (entirely or partially).
262+
// be inside a phoneme tag.
265263
CustomPronunciations custom_pronunciations = 3
266264
[(google.api.field_behavior) = OPTIONAL];
267265
}
@@ -299,8 +297,8 @@ message VoiceSelectionParams {
299297
CustomVoiceParams custom_voice = 4;
300298

301299
// Optional. The configuration for a voice clone. If
302-
// [VoiceCloneParams.voice_clone_key] is set, the service will choose the
303-
// voice clone matching the specified configuration.
300+
// [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice
301+
// clone matching the specified configuration.
304302
VoiceCloneParams voice_clone = 5 [(google.api.field_behavior) = OPTIONAL];
305303
}
306304

@@ -309,10 +307,10 @@ message AudioConfig {
309307
// Required. The format of the audio byte stream.
310308
AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
311309

312-
// Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
310+
// Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
313311
// the normal native speed supported by the specific voice. 2.0 is twice as
314312
// fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
315-
// speed. Any other values < 0.25 or > 4.0 will return an error.
313+
// speed. Any other values < 0.25 or > 2.0 will return an error.
316314
double speaking_rate = 2 [
317315
(google.api.field_behavior) = INPUT_ONLY,
318316
(google.api.field_behavior) = OPTIONAL
@@ -408,12 +406,21 @@ message SynthesizeSpeechResponse {
408406
// Description of the desired output audio data.
409407
message StreamingAudioConfig {
410408
// Required. The format of the audio byte stream.
411-
// For now, streaming only supports PCM and OGG_OPUS. All other encodings
412-
// will return an error.
409+
// Streaming supports PCM, ALAW, MULAW and OGG_OPUS. All other encodings
410+
// return an error.
413411
AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
414412

415413
// Optional. The synthesis sample rate (in hertz) for this audio.
416414
int32 sample_rate_hertz = 2 [(google.api.field_behavior) = OPTIONAL];
415+
416+
// Optional. Input only. Speaking rate/speed, in the range [0.25, 2.0]. 1.0 is
417+
// the normal native speed supported by the specific voice. 2.0 is twice as
418+
// fast, and 0.5 is half as fast. If unset(0.0), defaults to the native 1.0
419+
// speed. Any other values < 0.25 or > 2.0 will return an error.
420+
double speaking_rate = 3 [
421+
(google.api.field_behavior) = INPUT_ONLY,
422+
(google.api.field_behavior) = OPTIONAL
423+
];
417424
}
418425

419426
// Provides configuration information for the StreamingSynthesize request.
@@ -424,15 +431,27 @@ message StreamingSynthesizeConfig {
424431
// Optional. The configuration of the synthesized audio.
425432
StreamingAudioConfig streaming_audio_config = 4
426433
[(google.api.field_behavior) = OPTIONAL];
434+
435+
// Optional. The pronunciation customizations are applied to the input. If
436+
// this is set, the input is synthesized using the given pronunciation
437+
// customizations.
438+
//
439+
// The initial support is for en-us, with plans to expand to other locales in
440+
// the future. Instant Clone voices aren't supported.
441+
//
442+
// In order to customize the pronunciation of a phrase, there must be an exact
443+
// match of the phrase in the input types. If using SSML, the phrase must not
444+
// be inside a phoneme tag.
445+
CustomPronunciations custom_pronunciations = 5
446+
[(google.api.field_behavior) = OPTIONAL];
427447
}
428448

429449
// Input to be synthesized.
430450
message StreamingSynthesisInput {
431451
oneof input_source {
432452
// The raw text to be synthesized. It is recommended that each input
433-
// contains complete, terminating sentences, as this will likely result in
434-
// better prosody in the output audio. That being said, users are free to
435-
// input text however they please.
453+
// contains complete, terminating sentences, which results in better prosody
454+
// in the output audio.
436455
string text = 1;
437456
}
438457
}

0 commit comments

Comments
 (0)