// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

// Service that implements Google Cloud Speech API.
service Speech {
  // Performs synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) = { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Performs asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. `Operation.response` returns
  // `AsyncRecognizeResponse`.
  rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Performs bidirectional streaming speech recognition: receive results
  // while sending audio. This method is only available via the gRPC API
  // (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse);
}
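
// Illustrative sketch (not part of this file's API surface): using the REST
// mapping above, a `SyncRecognize` call might look like the following JSON
// request. Field names follow the proto3 JSON mapping; the bucket and object
// names are hypothetical.
//
//   POST https://speech.googleapis.com/v1beta1/speech:syncrecognize
//   {
//     "config": {
//       "encoding": "FLAC",
//       "sampleRate": 16000,
//       "languageCode": "en-US"
//     },
//     "audio": { "uri": "gs://my-bucket/my-audio.flac" }
//   }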

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content`
    // data and all subsequent `StreamingRecognizeRequest` messages must
    // contain `audio_content` data. The audio bytes must be encoded as
    // specified in `RecognitionConfig`. Note: as with all bytes fields,
    // protobuffers use a pure binary representation (not base64).
    bytes audio_content = 2;
  }
}
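
// Illustrative request sequence (textproto-style sketch; the chunk contents
// are placeholders): the first message carries only `streaming_config`, and
// every subsequent message carries only `audio_content`.
//
//   { streaming_config: { config: { encoding: LINEAR16 sample_rate: 16000 } } }
//   { audio_content: "<binary audio chunk 1>" }
//   { audio_content: "<binary audio chunk 2>" }
//   ...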

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses
  // speaking) until the client closes the output stream (gRPC API) or the
  // maximum time limit has been reached. Multiple
  // `StreamingRecognitionResult`s with the `is_final` flag set to `true` may
  // be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more
  // than one `StreamingRecognitionResult` with the `is_final` flag set to
  // `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated
  // with the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}
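
// Illustrative `StreamingRecognitionConfig` (textproto sketch; values are
// hypothetical): continuous recognition with interim hypotheses enabled.
//
//   config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//   single_utterance: false
//   interim_results: true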

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings
  // support only 1 channel (mono) audio. Only `FLAC` includes a header that
  // describes the bytes of audio that follow the header. The other encodings
  // are raw audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted
  // using a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy
  // may be reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used
  // to capture or transmit the audio, particularly if background noise is
  // present.
  enum AudioEncoding {
    // Not specified. Will return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are 8000-48000. 16000 is
  // optimal. For best results, set the sampling rate of the audio source to
  // 16000 Hz. If that's not possible, use the native sample rate of the
  // audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechRecognitionResult`. The server may return
  // fewer than `max_alternatives`. Valid values are `0`-`30`. A value of `0`
  // or `1` will return a maximum of `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered
  // word with asterisks, e.g. "f***". If set to `false` or omitted,
  // profanities won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}
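
// Illustrative `RecognitionConfig` (textproto sketch; values are
// hypothetical). Note the coupling documented above: `AMR` requires a
// `sample_rate` of 8000 Hz and `AMR_WB` requires 16000 Hz.
//
//   encoding: AMR
//   sample_rate: 8000
//   language_code: "en-US"
//   max_alternatives: 2
//   profanity_filter: true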

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each to
  // provide word and phrase "hints" to the speech recognizer so that it is
  // more likely to recognize them.
  repeated string phrases = 1;
}
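
// Illustrative `SpeechContext` (textproto sketch; the phrases are
// hypothetical domain terms the recognizer might otherwise miss):
//
//   phrases: "Cloud Speech API"
//   phrases: "endpointer"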

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in `RecognitionConfig`.
    // Note: as with all bytes fields, protobuffers use a pure binary
    // representation, whereas JSON representations use base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified
    // in `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
    // For more information, see [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}
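
// Illustrative `RecognitionAudio` values (JSON sketch; the bucket and object
// names are hypothetical). Exactly one of the two fields may be set; in the
// JSON/REST mapping `content` carries base64, while gRPC sends raw bytes.
//
//   { "content": "<base64-encoded audio bytes>" }
//   { "uri": "gs://my-bucket/my-audio.flac" }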

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. It contains the result as zero or more sequential
// `StreamingRecognitionResult` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached and it is being processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being
  // processed. It contains zero or one `is_final=true` result (the newly
  // settled portion), followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}
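
// Illustrative sequence of responses for audio containing the phrase
// "to be or not to be" (textproto sketch; the transcripts, stability and
// confidence values are hypothetical). Interim hypotheses arrive with low
// stability, firm up, and are eventually replaced by a single
// `is_final=true` result:
//
//   results { alternatives { transcript: "tube" } stability: 0.01 }
//   result_index: 0
//
//   results { alternatives { transcript: "to be" } stability: 0.9 }
//   results { alternatives { transcript: " or not" } stability: 0.01 }
//   result_index: 0
//
//   results {
//     alternatives { transcript: "to be or not to be" confidence: 0.92 }
//     is_final: true
//   }
//   result_index: 0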

// A speech recognition result corresponding to a portion of the audio that is
// currently being processed.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not
  // the same as `confidence`, which estimates the probability that a
  // recognition result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher
  // number means the system is more confident that the recognition is
  // correct. This field is typically provided only for the top hypothesis,
  // and only for `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}