// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

// Service that implements the Google Cloud Speech API.
service Speech {
  // Performs synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) = {
      post: "/v1beta1/speech:syncrecognize"
      body: "*"
    };
  }

  // Performs asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. `Operation.response` returns
  // `AsyncRecognizeResponse`.
  rpc AsyncRecognize(AsyncRecognizeRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta1/speech:asyncrecognize"
      body: "*"
    };
  }

  // Performs bidirectional streaming speech recognition: receive results
  // while sending audio. This method is only available via the gRPC API
  // (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse);
}

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content`
    // data and all subsequent `StreamingRecognizeRequest` messages must
    // contain `audio_content` data. The audio bytes must be encoded as
    // specified in `RecognitionConfig`. Note: as with all bytes fields,
    // protobuffers use a pure binary representation (not base64).
    bytes audio_content = 2;
  }
}
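
// Illustrative sketch (not part of the API definition): one possible sequence
// of `StreamingRecognizeRequest` messages, shown in proto text format. The
// configuration values and audio payload below are assumptions chosen for the
// example, not requirements.
//
//   # Message 1: configuration only; no audio.
//   streaming_config {
//     config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//     interim_results: true
//   }
//
//   # Message 2 (and every later message): one audio chunk; no configuration.
//   audio_content: "<raw LINEAR16 bytes>"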

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses
  // speaking) until the client closes the output stream (gRPC API) or until
  // the maximum time limit has been reached. Multiple
  // `StreamingRecognitionResult`s with the `is_final` flag set to `true` may
  // be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more
  // than one `StreamingRecognitionResult` with the `is_final` flag set to
  // `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated
  // with the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings
  // support only 1 channel (mono) audio. Only `FLAC` includes a header that
  // describes the bytes of audio that follow the header. The other encodings
  // are raw audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted
  // using a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy
  // may be reduced if lossy codecs (such as AMR, AMR_WB, and MULAW) are used
  // to capture or transmit the audio, particularly if background noise is
  // present.
  enum AudioEncoding {
    // Not specified. Will return result
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio`
  // messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered
  // word with asterisks, e.g. "f***". If set to `false` or omitted,
  // profanities won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}
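
// Illustrative sketch (not part of the API definition): a minimal
// `RecognitionConfig` in proto text format for a mono, 16 kHz FLAC source.
// The values, including the hint phrase, are hypothetical.
//
//   encoding: FLAC
//   sample_rate: 16000
//   language_code: "en-US"
//   max_alternatives: 2
//   speech_context {
//     phrases: "Cloud Speech API"
//   }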

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each to
  // provide word and phrase "hints" to the speech recognizer so that it is
  // more likely to recognize them.
  repeated string phrases = 1;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in `RecognitionConfig`.
    // Note: as with all bytes fields, protobuffers use a pure binary
    // representation, whereas JSON representations use base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified
    // in `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
    // For more information, see [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more sequential
// `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}
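
// Illustrative sketch (not part of the API definition): a complete
// `SyncRecognizeRequest` in proto text format. The bucket and object names
// are hypothetical; per `RecognitionAudio`, exactly one of `content` or `uri`
// may be set.
//
//   config {
//     encoding: LINEAR16
//     sample_rate: 16000
//   }
//   audio {
//     uri: "gs://example-bucket/example-audio.raw"
//   }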

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. It contains the result as zero or more sequential
// `StreamingRecognitionResult` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached and it is being processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being
  // processed. It contains zero or one `is_final=true` result (the newly
  // settled portion), followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}

// A streaming speech recognition result corresponding to a portion of the
// audio that is currently being processed.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`:
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not
  // the same as `confidence`, which estimates the probability that a
  // recognition result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher
  // number means the system is more confident that the recognition is
  // correct. This field is typically provided only for the top hypothesis,
  // and only for `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}
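
// Illustrative sketch (not part of the API definition): a possible sequence
// of `StreamingRecognizeResponse` messages in proto text format. The
// transcripts and the `stability` and `confidence` values are invented for
// the example.
//
//   # Interim hypothesis for the first portion of audio; may still change.
//   results {
//     alternatives { transcript: "to be" }
//     stability: 0.9
//   }
//   result_index: 0
//
//   # Final result for the same portion; it will not be updated again.
//   results {
//     alternatives { transcript: "to be or not to be" confidence: 0.92 }
//     is_final: true
//   }
//   result_index: 0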