Skip to content

Commit 1a14153

Browse files
Google APIs authored and copybara-github committed
feat: add custom prompt config in the request and return prompt in the response
feat: update min_speaker_count and max_speaker_count to be optional
docs: Clarify that min_speaker_count and max_speaker_count in SpeakerDiarizationConfig are not currently used
docs: Update guidance on how to enable speaker diarization; to enable, set the diarization_config field to an empty SpeakerDiarizationConfig message
PiperOrigin-RevId: 852383212
1 parent 9a477cd commit 1a14153

File tree

2 files changed

+27
-22
lines changed

2 files changed

+27
-22
lines changed

google/cloud/speech/v2/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,6 @@ load(
347347

348348
csharp_proto_library(
349349
name = "speech_csharp_proto",
350-
extra_opts = [],
351350
deps = [":speech_proto"],
352351
)
353352

google/cloud/speech/v2/cloud_speech.proto

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -779,19 +779,20 @@ message ExplicitDecodingConfig {
779779

780780
// Configuration to enable speaker diarization.
781781
message SpeakerDiarizationConfig {
782-
// Required. Minimum number of speakers in the conversation. This range gives
783-
// you more flexibility by allowing the system to automatically determine the
784-
// correct number of speakers.
785-
//
786-
// To fix the number of speakers detected in the audio, set
787-
// `min_speaker_count` = `max_speaker_count`.
788-
int32 min_speaker_count = 2 [(google.api.field_behavior) = REQUIRED];
789-
790-
// Required. Maximum number of speakers in the conversation. Valid values are:
791-
// 1-6. Must be >= `min_speaker_count`. This range gives you more flexibility
792-
// by allowing the system to automatically determine the correct number of
793-
// speakers.
794-
int32 max_speaker_count = 3 [(google.api.field_behavior) = REQUIRED];
782+
// Optional. The system automatically determines the number of speakers. This
783+
// value is not currently used.
784+
int32 min_speaker_count = 2 [(google.api.field_behavior) = OPTIONAL];
785+
786+
// Optional. The system automatically determines the number of speakers. This
787+
// value is not currently used.
788+
int32 max_speaker_count = 3 [(google.api.field_behavior) = OPTIONAL];
789+
}
790+
791+
// Configuration to enable custom prompt in chirp3.
792+
message CustomPromptConfig {
793+
// Optional. The custom instructions to override the existing instructions for
794+
// chirp3.
795+
string custom_prompt = 1 [(google.api.field_behavior) = OPTIONAL];
795796
}
796797

797798
// Available recognition features.
@@ -846,21 +847,19 @@ message RecognitionFeatures {
846847
// Mode for recognizing multi-channel audio.
847848
MultiChannelMode multi_channel_mode = 17;
848849

849-
// Configuration to enable speaker diarization and set additional
850-
// parameters to make diarization better suited for your application.
851-
// When this is enabled, we send all the words from the beginning of the
852-
// audio for the top alternative in every consecutive STREAMING responses.
853-
// This is done in order to improve our speaker tags as our models learn to
854-
// identify the speakers in the conversation over time.
855-
// For non-streaming requests, the diarization results will be provided only
856-
// in the top alternative of the FINAL SpeechRecognitionResult.
850+
// Configuration to enable speaker diarization. To enable diarization, set
851+
// this field to an empty SpeakerDiarizationConfig message.
857852
SpeakerDiarizationConfig diarization_config = 9;
858853

859854
// Maximum number of recognition hypotheses to be returned.
860855
// The server may return fewer than `max_alternatives`.
861856
// Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
862857
// one. If omitted, will return a maximum of one.
863858
int32 max_alternatives = 16;
859+
860+
// Optional. Configuration to enable custom prompt for chirp3.
861+
CustomPromptConfig custom_prompt_config = 18
862+
[(google.api.field_behavior) = OPTIONAL];
864863
}
865864

866865
// Transcription normalization configuration. Use transcription normalization
@@ -1066,6 +1065,13 @@ message RecognitionResponseMetadata {
10661065

10671066
// When available, billed audio seconds for the corresponding request.
10681067
google.protobuf.Duration total_billed_duration = 6;
1068+
1069+
// Optional. Output only. Provides the prompt used for the recognition
1070+
// request.
1071+
optional string prompt = 10 [
1072+
(google.api.field_behavior) = OUTPUT_ONLY,
1073+
(google.api.field_behavior) = OPTIONAL
1074+
];
10691075
}
10701076

10711077
// Alternative hypotheses (a.k.a. n-best list).

0 commit comments

Comments
 (0)