From fb2ede546b13739e3de1dc19ed7e8c776e260daf Mon Sep 17 00:00:00 2001
From: James Stout
Date: Fri, 14 Jul 2017 23:17:34 -0700
Subject: [PATCH 1/2] Added custom microphone integration.

---
 speech/cloud-client/transcribe_microphone.py | 312 +++++++++++++++++++
 1 file changed, 312 insertions(+)
 create mode 100644 speech/cloud-client/transcribe_microphone.py

diff --git a/speech/cloud-client/transcribe_microphone.py b/speech/cloud-client/transcribe_microphone.py
new file mode 100644
index 00000000000..aa7a7a44057
--- /dev/null
+++ b/speech/cloud-client/transcribe_microphone.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python
+
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Google Cloud Speech API sample application that streams audio from the
+microphone using the streaming API.
+
+NOTE: This module requires the additional dependency `pyaudio`.
+
+Example usage:
+    python transcribe_microphone.py
+"""
+
+from __future__ import division
+
+import contextlib
+import functools
+import re
+import signal
+import sys
+
+import google.auth
+import google.auth.transport.grpc
+import google.auth.transport.requests
+from google.cloud.proto.speech.v1beta1 import cloud_speech_pb2
+from google.rpc import code_pb2
+import grpc
+import pyaudio
+from six.moves import queue
+
+# [START import_libraries]
+import argparse
+import io
+# [END import_libraries]
+
+
+# Audio recording parameters
+RATE = 16000
+CHUNK = int(RATE / 10)  # 100ms
+
+# The Speech API has a streaming limit of 60 seconds of audio*, so keep the
+# connection alive for that long, plus some more to give the API time to
+# figure out the transcription.
+# * https://g.co/cloud/speech/limits#content
+DEADLINE_SECS = 60 * 3 + 5
+SPEECH_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
+
+
+def make_channel(host, port):
+    """Creates a secure channel with auth credentials from the environment."""
+    # Grab application default credentials from the environment
+    credentials, _ = google.auth.default(scopes=[SPEECH_SCOPE])
+
+    # Create a secure channel using the credentials.
+    http_request = google.auth.transport.requests.Request()
+    target = '{}:{}'.format(host, port)
+
+    return google.auth.transport.grpc.secure_authorized_channel(
+        credentials, http_request, target)
+
+
+def buffer_to_file(buffer):
+    """Wraps a Queue of audio chunks in a minimal file-like object.
+
+    read() blocks until at least one chunk of audio is available, then also
+    drains any other chunks that are already buffered before returning.
+    """
+    class BufferFile(object):
+        def __init__(self, buffer):
+            self.buffer = buffer
+            self.closed = False
+            self.data = b''
+
+        def read(self, n=-1):
+            if self.closed:
+                return b''
+            if len(self.data) == 0:
+                # Block until at least one chunk of audio arrives.
+                piece = self.buffer.get()
+                if piece:
+                    self.data += piece
+            # Consume whatever other chunks are already buffered.
+            while True:
+                try:
+                    piece = self.buffer.get(block=False)
+                    if piece:
+                        self.data += piece
+                except queue.Empty:
+                    break
+            result = self.data[:n]
+            self.data = self.data[n:]
+            return result
+
+    return BufferFile(buffer)
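+
+
+# Editorial sketch (not part of the original patch): BufferFile is consumed
+# like a plain binary file object, the same kind of object that main() below
+# passes as the `stream` argument to speech_client.sample(). For illustration
+# only:
+#
+#     buff = queue.Queue()
+#     buff.put(b'\x00\x01\x02\x03')
+#     audio_file = buffer_to_file(buff)
+#     audio_file.read(2)  # returns b'\x00\x01'
+#
+# read() only blocks when no audio has been buffered yet.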
+
+
+def _audio_data_generator(buff):
+    """A generator that yields all available data in the given buffer.
+
+    Args:
+        buff - a Queue object, where each element is a chunk of data.
+    Yields:
+        A chunk of data that is the aggregate of all chunks of data in
+        `buff`. The function will block until at least one data chunk is
+        available.
+    """
+    stop = False
+    while not stop:
+        # Use a blocking get() to ensure there's at least one chunk of data.
+        data = [buff.get()]
+
+        # Now consume whatever other data's still buffered.
+        while True:
+            try:
+                data.append(buff.get(block=False))
+            except queue.Empty:
+                break
+
+        # `None` in the buffer signals that the audio stream is closed. Yield
+        # the final bit of the buffer and exit the loop.
+        if None in data:
+            stop = True
+            data.remove(None)
+
+        yield b''.join(data)
+
+
+def _fill_buffer(buff, in_data, frame_count, time_info, status_flags):
+    """Continuously collect data from the audio stream, into the buffer."""
+    buff.put(in_data)
+    return None, pyaudio.paContinue
+
+
+# [START audio_stream]
+@contextlib.contextmanager
+def record_audio(rate, chunk):
+    """Opens a recording stream in a context manager."""
+    # Create a thread-safe buffer of audio data
+    buff = queue.Queue()
+
+    audio_interface = pyaudio.PyAudio()
+    audio_stream = audio_interface.open(
+        format=pyaudio.paInt16,
+        # The API currently only supports 1-channel (mono) audio
+        # https://goo.gl/z757pE
+        channels=1, rate=rate,
+        input=True, frames_per_buffer=chunk,
+        # Run the audio stream asynchronously to fill the buffer object.
+        # This is necessary so that the input device's buffer doesn't overflow
+        # while the calling thread makes network requests, etc.
+        stream_callback=functools.partial(_fill_buffer, buff),
+    )
+
+    yield buffer_to_file(buff)
+
+    audio_stream.stop_stream()
+    audio_stream.close()
+    # Signal the _audio_data_generator to finish
+    buff.put(None)
+    audio_interface.terminate()
+# [END audio_stream]
+
+
+def request_stream(data_stream, rate, interim_results=True):
+    """Yields `StreamingRecognizeRequest`s constructed from a recording audio
+    stream.
+
+    Args:
+        data_stream: A generator that yields raw audio data to send.
+        rate: The sampling rate in hertz.
+        interim_results: Whether to return intermediate results, before the
+            transcription is finalized.
+    """
+    # The initial request must contain metadata about the stream, so the
+    # server knows how to interpret it.
+    recognition_config = cloud_speech_pb2.RecognitionConfig(
+        # There are a bunch of config options you can specify. See
+        # https://goo.gl/KPZn97 for the full list.
+        encoding='LINEAR16',  # raw 16-bit signed LE samples
+        sample_rate=rate,  # the rate in hertz
+        # See http://g.co/cloud/speech/docs/languages
+        # for a list of supported languages.
+        language_code='en-US',  # a BCP-47 language tag
+    )
+    streaming_config = cloud_speech_pb2.StreamingRecognitionConfig(
+        interim_results=interim_results,
+        config=recognition_config,
+    )
+
+    yield cloud_speech_pb2.StreamingRecognizeRequest(
+        streaming_config=streaming_config)
+
+    for data in data_stream:
+        # Subsequent requests can all just have the content
+        yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=data)
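+
+
+# Editorial sketch (not part of the original patch): the generator above
+# emits exactly one configuration-only request, followed by requests that
+# carry only audio. For illustration only:
+#
+#     requests = request_stream(_audio_data_generator(buff), RATE)
+#     next(requests)  # StreamingRecognizeRequest with streaming_config set
+#     next(requests)  # StreamingRecognizeRequest with audio_content set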
+ """ + num_chars_printed = 0 + for result in results: + transcript = result.alternatives[0].transcript + + # Display interim results, but with a carriage return at the end of the + # line, so subsequent lines will overwrite them. + # + # If the previous result was longer than this one, we need to print + # some extra spaces to overwrite the previous result + overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript)) + + if not result.is_final: + sys.stdout.write(transcript + overwrite_chars + '\r') + sys.stdout.flush() + + num_chars_printed = len(transcript) + + else: + print(transcript + overwrite_chars) + + # Exit recognition if any of the transcribed phrases could be + # one of our keywords. + if re.search(r'\b(exit|quit)\b', transcript, re.I): + print('Exiting..') + break + + num_chars_printed = 0 + + +def old_main(): + service = cloud_speech_pb2.SpeechStub( + make_channel('speech.googleapis.com', 443)) + + # For streaming audio from the microphone, there are three threads. + # First, a thread that collects audio data as it comes in + with record_audio(RATE, CHUNK) as buffered_audio_data: + # Second, a thread that sends requests with that data + requests = request_stream(buffered_audio_data, RATE) + # Third, a thread that listens for transcription responses + recognize_stream = service.StreamingRecognize( + requests, DEADLINE_SECS) + + # Exit things cleanly on interrupt + signal.signal(signal.SIGINT, lambda *_: recognize_stream.cancel()) + + # Now, put the transcription responses to use. + try: + listen_print_loop(recognize_stream) + + recognize_stream.cancel() + except grpc.RpcError as e: + code = e.code() + # CANCELLED is caused by the interrupt handler, which is expected. + if code is not code.CANCELLED: + raise + + +def main(): + from google.cloud import speech + speech_client = speech.Client() + + # For streaming audio from the microphone, there are three threads. + # First, a thread that collects audio data as it comes in + with record_audio(RATE, CHUNK) as audio_file: + audio_sample = speech_client.sample( + stream=audio_file, + encoding=speech.encoding.Encoding.LINEAR16, + sample_rate_hertz=16000) + results = audio_sample.streaming_recognize('en-US', interim_results=True) + + # Exit things cleanly on interrupt + # signal.signal(signal.SIGINT, lambda *_: recognize_stream.cancel()) + + # Now, put the transcription responses to use. + try: + listen_print_loop(results) + + # recognize_stream.cancel() + except grpc.RpcError as e: + code = e.code() + # CANCELLED is caused by the interrupt handler, which is expected. 
+
+
+if __name__ == '__main__':
+    main()
+
+
+def transcribe_streaming(stream_file):
+    """Streams transcription of the given audio file."""
+    from google.cloud import speech
+    speech_client = speech.Client()
+
+    with io.open(stream_file, 'rb') as audio_file:
+        audio_sample = speech_client.sample(
+            stream=audio_file,
+            encoding=speech.encoding.Encoding.LINEAR16,
+            sample_rate_hertz=16000)
+        alternatives = audio_sample.streaming_recognize('en-US')
+
+        for alternative in alternatives:
+            print('Finished: {}'.format(alternative.is_final))
+            print('Stability: {}'.format(alternative.stability))
+            print('Confidence: {}'.format(alternative.confidence))
+            print('Transcript: {}'.format(alternative.transcript))

From a9f69bbe8ca4eea143b45e25a88a7e971c05dcc2 Mon Sep 17 00:00:00 2001
From: James Stout
Date: Sat, 15 Jul 2017 00:08:29 -0700
Subject: [PATCH 2/2] Added missing requirements

---
 speech/cloud-client/requirements.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/speech/cloud-client/requirements.txt b/speech/cloud-client/requirements.txt
index 92970530560..cf759659057 100644
--- a/speech/cloud-client/requirements.txt
+++ b/speech/cloud-client/requirements.txt
@@ -1 +1,7 @@
 google-cloud-speech==0.27.0
+grpcio==1.1.0
+PyAudio==0.2.10
+proto-google-cloud-speech-v1beta1==0.15.1
+six==1.10.0
+requests==2.13.0
+google-auth==0.8.0