// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1beta1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/rpc/status.proto";

option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1beta1";

// Service that implements Google Cloud Speech API.
service Speech {
  // Perform synchronous speech recognition: receive results after all audio
  // has been sent and processed.
  rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) {
    option (google.api.http) = { post: "/v1beta1/speech:syncrecognize" body: "*" };
  }

  // Perform asynchronous speech recognition: receive results via the
  // google.longrunning.Operations interface. `Operation.response` returns
  // `AsyncRecognizeResponse`.
  rpc AsyncRecognize(AsyncRecognizeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = { post: "/v1beta1/speech:asyncrecognize" body: "*" };
  }

  // Perform bidirectional streaming speech recognition: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
}
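
// NOTE (illustrative, not part of the original file): a REST client would invoke
// the `SyncRecognize` binding above by POSTing a JSON-encoded
// `SyncRecognizeRequest` to the path declared in its `google.api.http` option,
// for example (hypothetical bucket/object name, proto3 JSON field names):
//
//   POST /v1beta1/speech:syncrecognize
//   {
//     "config": { "encoding": "LINEAR16", "sampleRate": 16000, "languageCode": "en-US" },
//     "audio":  { "uri": "gs://my-bucket/my-audio.raw" }
//   }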

// `SyncRecognizeRequest` is the top-level message sent by the client for
// the `SyncRecognize` method.
message SyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `AsyncRecognizeRequest` is the top-level message sent by the client for
// the `AsyncRecognize` method.
message AsyncRecognizeRequest {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Required] The audio data to be recognized.
  RecognitionAudio audio = 2;
}

// `StreamingRecognizeRequest` is the top-level message sent by the client for
// the `StreamingRecognize` method. Multiple `StreamingRecognizeRequest`
// messages are sent. The first message must contain a `streaming_config`
// message and must not contain `audio` data. All subsequent messages must
// contain `audio` data and must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
  oneof streaming_request {
    // The `streaming_config` message provides information to the recognizer
    // that specifies how to process the request.
    //
    // The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content` data
    // and all subsequent `StreamingRecognizeRequest` messages must contain
    // `audio_content` data. The audio bytes must be encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
    // pure binary representation (not base64).
    bytes audio_content = 2;
  }
}
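
// NOTE (illustrative example, not part of the original file): a streaming
// session therefore sends a sequence of requests shaped like the following,
// shown in protobuf text format with hypothetical values:
//
//   request 1: streaming_config {
//                config { encoding: LINEAR16 sample_rate: 16000 language_code: "en-US" }
//                interim_results: true
//              }
//   request 2: audio_content: "<first chunk of raw LINEAR16 bytes>"
//   request 3: audio_content: "<next chunk of raw LINEAR16 bytes>"
//   ...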

// The `StreamingRecognitionConfig` message provides information to the
// recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // [Required] The `config` message provides information to the recognizer
  // that specifies how to process the request.
  RecognitionConfig config = 1;

  // [Optional] If `false` or omitted, the recognizer will perform continuous
  // recognition (continuing to process audio even if the user pauses speaking)
  // until the client closes the output stream (gRPC API) or the maximum time
  // limit has been reached. Multiple `StreamingRecognitionResult`s with the
  // `is_final` flag set to `true` may be returned.
  //
  // If `true`, the recognizer will detect a single spoken utterance. When it
  // detects that the user has paused or stopped speaking, it will return an
  // `END_OF_UTTERANCE` event and cease recognition. It will return no more than
  // one `StreamingRecognitionResult` with the `is_final` flag set to `true`.
  bool single_utterance = 2;

  // [Optional] If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated with
  // the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 3;
}

// The `RecognitionConfig` message provides information to the recognizer
// that specifies how to process the request.
message RecognitionConfig {
  // Audio encoding of the data sent in the audio message. All encodings support
  // only 1 channel (mono) audio. Only `FLAC` includes a header that describes
  // the bytes of audio that follow the header. The other encodings are raw
  // audio bytes with no header.
  //
  // For best results, the audio source should be captured and transmitted using
  // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
  // reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used to capture
  // or transmit the audio, particularly if background noise is present.
  enum AudioEncoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples.
    // This is the only encoding that may be used by `AsyncRecognize`.
    LINEAR16 = 1;

    // This is the recommended encoding for `SyncRecognize` and
    // `StreamingRecognize` because it uses lossless compression; therefore
    // recognition accuracy is not compromised by a lossy codec.
    //
    // The stream FLAC (Free Lossless Audio Codec) encoding is specified at:
    // http://flac.sourceforge.net/documentation.html.
    // Only 16-bit samples are supported.
    // Not all fields in STREAMINFO are supported.
    FLAC = 2;

    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
    MULAW = 3;

    // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz.
    AMR = 4;

    // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz.
    AMR_WB = 5;
  }

  // [Required] Encoding of audio data sent in all `RecognitionAudio` messages.
  AudioEncoding encoding = 1;

  // [Required] Sample rate in Hertz of the audio data sent in all
  // `RecognitionAudio` messages. Valid values are: 8000-48000.
  // 16000 is optimal. For best results, set the sampling rate of the audio
  // source to 16000 Hz. If that's not possible, use the native sample rate of
  // the audio source (instead of re-sampling).
  int32 sample_rate = 2;

  // [Optional] The language of the supplied audio as a BCP-47 language tag.
  // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt
  // If omitted, defaults to "en-US". See
  // [Language Support](/speech/docs/best-practices#language_support) for
  // a list of the currently supported language codes.
  string language_code = 3;

  // [Optional] Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
  // `1`. If omitted, defaults to `1`.
  int32 max_alternatives = 4;

  // [Optional] If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool profanity_filter = 5;

  // [Optional] A means to provide context to assist the speech recognition.
  SpeechContext speech_context = 6;
}
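
// NOTE (illustrative example, not part of the original file): a typical
// `RecognitionConfig` for 16 kHz FLAC audio, in protobuf text format with
// hypothetical values:
//
//   config {
//     encoding: FLAC
//     sample_rate: 16000
//     language_code: "en-US"
//     max_alternatives: 1
//   }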

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // [Optional] A list of up to 50 phrases of up to 100 characters each that
  // provide word and phrase "hints" to the speech recognizer so that it is
  // more likely to recognize them.
  repeated string phrases = 1;
}
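
// NOTE (illustrative example, not part of the original file): a context that
// biases recognition toward a few domain-specific terms, in protobuf text
// format with hypothetical phrases:
//
//   speech_context {
//     phrases: "Brooklyn Bridge"
//     phrases: "borough of Brooklyn"
//   }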

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
message RecognitionAudio {
  oneof audio_source {
    // The audio data bytes encoded as specified in
    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
    // pure binary representation, whereas JSON representations use base64.
    bytes content = 1;

    // URI that points to a file that contains audio data bytes as specified in
    // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
    // supported, which must be specified in the following format:
    // `gs://bucket_name/object_name` (other URI formats return
    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
    // [Request URIs](/storage/docs/reference-uris).
    string uri = 2;
  }
}
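
// NOTE (illustrative example, not part of the original file): the two mutually
// exclusive ways to supply audio, in protobuf text format with a hypothetical
// bucket and object name:
//
//   audio { uri: "gs://my-bucket/recording.flac" }
//
//   audio { content: "<raw audio bytes; base64-encoded in the JSON mapping>" }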

// `SyncRecognizeResponse` is the only message returned to the client by
// `SyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message SyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}
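
// NOTE (illustrative example, not part of the original file): a possible
// `SyncRecognizeResponse`, in protobuf text format with hypothetical values:
//
//   results {
//     alternatives {
//       transcript: "how old is the Brooklyn Bridge"
//       confidence: 0.98
//     }
//   }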

// `AsyncRecognizeResponse` is the only message returned to the client by
// `AsyncRecognize`. It contains the result as zero or more
// sequential `SpeechRecognitionResult` messages.
message AsyncRecognizeResponse {
  // [Output-only] Sequential list of transcription results corresponding to
  // sequential portions of audio.
  repeated SpeechRecognitionResult results = 2;
}

// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. It contains the result as zero or more
// sequential `StreamingRecognitionResult` messages.
message StreamingRecognizeResponse {
  // Indicates the type of endpointer event.
  enum EndpointerType {
    // No endpointer event specified.
    ENDPOINTER_EVENT_UNSPECIFIED = 0;

    // Speech has been detected in the audio stream.
    START_OF_SPEECH = 1;

    // Speech has ceased to be detected in the audio stream.
    END_OF_SPEECH = 2;

    // The end of the audio stream has been reached and it is being processed.
    END_OF_AUDIO = 3;

    // This event is only sent when `single_utterance` is `true`. It indicates
    // that the server has detected the end of the user's speech utterance and
    // expects no additional speech. Therefore, the server will not process
    // additional audio. The client should stop sending additional audio data.
    END_OF_UTTERANCE = 4;
  }

  // [Output-only] If set, returns a [google.rpc.Status][] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;

  // [Output-only] This repeated list contains zero or more results that
  // correspond to consecutive portions of the audio currently being processed.
  // It contains zero or one `is_final=true` result (the newly settled portion),
  // followed by zero or more `is_final=false` results.
  repeated StreamingRecognitionResult results = 2;

  // [Output-only] Indicates the lowest index in the `results` array that has
  // changed. The repeated `StreamingRecognitionResult` results overwrite past
  // results at this index and higher.
  int32 result_index = 3;

  // [Output-only] Indicates the type of endpointer event.
  EndpointerType endpointer_type = 4;
}
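
// NOTE (illustrative example, not part of the original file): a possible series
// of responses for the spoken phrase "to be or not to be", in protobuf text
// format with hypothetical values, showing how interim (`is_final=false`)
// results at `result_index` and above are overwritten until an `is_final=true`
// result settles that portion of the transcript:
//
//   response 1: results { alternatives { transcript: "to be" } stability: 0.3 }
//               result_index: 0
//   response 2: results { alternatives { transcript: "to be or not to" } stability: 0.8 }
//               result_index: 0
//   response 3: results { alternatives { transcript: "to be or not to be" confidence: 0.92 }
//                         is_final: true }
//               result_index: 0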

// A speech recognition result corresponding to a portion of the audio that is
// currently being processed.
message StreamingRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;

  // [Output-only] If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // [Output-only] An estimate of the probability that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable). Note that this is not the
  // same as `confidence`, which estimates the probability that a recognition
  // result is correct.
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating stability was not set.
  float stability = 3;
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
  // [Output-only] May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  repeated SpeechRecognitionAlternative alternatives = 1;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // [Output-only] Transcript text representing the words that the user spoke.
  string transcript = 1;

  // [Output-only] The confidence estimate between 0.0 and 1.0. A higher number
  // means the system is more confident that the recognition is correct.
  // This field is typically provided only for the top hypothesis, and only for
  // `is_final=true` results.
  // The default of 0.0 is a sentinel value indicating confidence was not set.
  float confidence = 2;
}