// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

// Service that implements Google Cloud Speech API.
service Speech {
  // Perform synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse);

  // Perform asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. When the operation completes,
  // `Operation.response` contains an `AsyncRecognizeResponse` message.
  rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation);

  // Perform bidirectional streaming speech recognition: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
}
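
// Illustrative sketch of the `AsyncRecognize` flow under the standard
// google.longrunning semantics (the step sequence below is an orientation
// aid, not part of the service definition):
//
//   1. The client calls AsyncRecognize(AsyncRecognizeRequest) and receives an
//      Operation whose `name` identifies the long-running job.
//   2. The client polls GetOperation on the google.longrunning.Operations
//      service until `done` is true.
//   3. On success, `Operation.response` unpacks to an AsyncRecognizeResponse;
//      on failure, `Operation.error` holds a google.rpc.Status.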

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}
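
// Illustrative example (field values are hypothetical): a minimal
// `SyncRecognizeRequest` in proto text format, using a Google Cloud Storage
// URI as the audio source:
//
//   config {
//     encoding: FLAC
//     sample_rate: 16000
//     language_code: "en-US"
//   }
//   audio {
//     uri: "gs://bucket_name/audio.flac"
//   }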

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content`
    // data and all subsequent `StreamingRecognizeRequest` messages must
    // contain `audio_content` data. The audio bytes must be encoded as
    // specified in `RecognitionConfig`. Note: as with all bytes fields,
    // protocol buffers use a pure binary representation (not base64).
    bytes audio_content = 2;
  }
}
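
// Illustrative example (values are hypothetical): a dictation-style request
// stream carries one configuration message followed by audio chunks, roughly:
//
//   // 1st StreamingRecognizeRequest:
//   streaming_config {
//     config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//     interim_results: true
//   }
//   // 2nd..Nth StreamingRecognizeRequest (one chunk of raw audio bytes each):
//   audio_content: "<binary audio chunk>"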

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses speaking)
  // until the client closes the output stream (gRPC API) or until the maximum
  // time limit has been reached. Multiple `StreamingRecognitionResult`s with
  // the `is_final` flag set to `true` may be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more
  // than one `StreamingRecognitionResult` with the `is_final` flag set to
  // `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated
  // with the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}
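
// Illustrative example (values are hypothetical): a voice-command scenario
// would typically request a single utterance and skip interim hypotheses:
//
//   streaming_config {
//     config { encoding: LINEAR16 sample_rate: 16000 }
//     single_utterance: true
//     interim_results: false
//   }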

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings
  // support only 1 channel (mono) audio. Only `FLAC` includes a header that
  // describes the bytes of audio that follow the header. The other encodings
  // are raw audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted
  // using a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy
  // may be reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used
  // to capture or transmit the audio, particularly if background noise is
  // present.
  enum AudioEncoding {
    // Not specified. Will return [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The FLAC (Free Lossless Audio Codec) stream encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB". See https://www.rfc-editor.org/rfc/bcp/bcp47.txt.
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative`
  // messages within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered
  // word with asterisks, e.g. "f***". If set to `false` or omitted,
  // profanities won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}

// Provides "hints" to the speech recognizer to favor specific words and
// phrases in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each.
  // These phrases serve as "hints" to the speech recognizer, making it more
  // likely to recognize the given words and phrases.
  repeated string phrases = 1;
}
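
// Illustrative example (phrase values are hypothetical): biasing the
// recognizer toward domain vocabulary in a weather application:
//
//   config {
//     encoding: LINEAR16
//     sample_rate: 16000
//     speech_context {
//       phrases: "weather forecast"
//       phrases: "barometric pressure"
//     }
//   }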

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protocol buffers
    // use a pure binary representation, whereas JSON representations use
    // base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified
    // in `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
    // For more information, see [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}
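
// Illustrative example of the two mutually exclusive audio sources, in proto
// text format. Note that `content` carries raw bytes here; only the JSON
// mapping base64-encodes them.
//
//   audio { content: "<binary audio bytes>" }
//   // -- or --
//   audio { uri: "gs://bucket_name/object_name" }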

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. It contains the result as zero or more
// sequential `StreamingRecognitionResult` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached and it is being processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being processed.
  // It contains zero or one `is_final=true` result (the newly settled
  // portion), followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}
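
// Illustrative example (transcripts and values are hypothetical): with
// `interim_results: true`, a client might receive a series of responses
// such as:
//
//   results { alternatives { transcript: "tube" } stability: 0.01 }
//   result_index: 0
//
//   results { alternatives { transcript: "to be" } stability: 0.9 }
//   results { alternatives { transcript: " or not" } stability: 0.01 }
//   result_index: 0
//
//   results { alternatives { transcript: "to be or not to be"
//                            confidence: 0.92 }
//             is_final: true }
//   result_index: 0
//
// Each response overwrites previously returned `results` starting at
// `result_index`; once a result arrives with `is_final: true`, that portion
// of the transcript is settled and will not be returned again.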

// A speech recognition result corresponding to a portion of the audio that is
// currently being processed.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not
  // the same as `confidence`, which estimates the probability that a
  // recognition result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher
  // number means the system is more confident that the recognition is
  // correct. This field is typically provided only for the top hypothesis,
  // and only for `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}
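
// Illustrative example (transcripts and confidences are hypothetical): a
// `SyncRecognizeResponse` for a request with `max_alternatives: 2` might
// look like:
//
//   results {
//     alternatives { transcript: "how old is the Brooklyn Bridge"
//                    confidence: 0.98 }
//     alternatives { transcript: "how old is the Brooklyn bridge"
//                    confidence: 0.95 }
//   }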