2828import datetime
2929import io
3030import os
31+ import subprocess
3132
3233from google .cloud import speech_v1p1beta1 as speech
34+ # from google.cloud import speech
3335from google .cloud import storage
3436
# Google Cloud Storage bucket that transcribe_file uploads audio to before
# starting recognition.
UPLOAD_BUCKET_NAME = 'bjoeris-temp-audio'
38+
def _safe_filename(filename):
    """
    Generates a safe filename that is unlikely to collide with existing objects
    in Google Cloud Storage.

    Any directory part of ``filename`` is dropped and a UTC timestamp is
    inserted before the extension:
    ``path/to/filename.ext`` is transformed into
    ``filename-YYYY-MM-DD-HHMMSS.ext``. A name without an extension becomes
    ``filename-YYYY-MM-DD-HHMMSS``.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    date = now_utc.strftime("%Y-%m-%d-%H%M%S")
    basename, extension = os.path.splitext(os.path.basename(filename))
    # os.path.splitext keeps the leading dot on the extension (".flac") and
    # returns "" when there is none, so the extension is appended as-is;
    # adding another "." here would produce "name-date..flac".
    return "{0}-{1}{2}".format(basename, date, extension)
4448
4549# [START def_transcribe_file]
def transcribe_file(filename, output):
    """Transcribe the given local audio file asynchronously.

    The file is first converted with ``transcode_file``, then uploaded to the
    ``UPLOAD_BUCKET_NAME`` bucket under a timestamped object name, and the
    resulting ``gs://`` URI is handed to ``transcribe_gcs``. The uploaded
    object is deleted by a callback once the recognition operation is done.

    Args:
        filename: Path of the local audio file.
        output: Path of the CSV file the transcript is written to (passed
            through to ``transcribe_gcs``).

    Returns:
        The operation object returned by ``transcribe_gcs``; the caller can
        wait on it with ``operation.result()``.
    """
    client = storage.Client()

    print("Converting file...")
    filename = transcode_file(filename)

    bucket_name = UPLOAD_BUCKET_NAME
    bucket = client.bucket(bucket_name)
    blob_name = _safe_filename(filename)
    blob = bucket.blob(blob_name)
    uri = "gs://{}/{}".format(bucket_name, blob_name)
    print("Uploading file...", uri)
    with io.open(filename, 'rb') as audio_file:
        blob.upload_from_file(audio_file)

    operation = transcribe_gcs(uri, output)

    def callback(operation_future):
        # The uploaded copy is only needed while recognition is running.
        print("Deleting file...")
        blob.delete()

    operation.add_done_callback(callback)
    return operation
6272# [END def_transcribe_file]
6373
def transcode_file(filename):
    """Convert an audio file to single-channel 48 kHz FLAC using ``ffmpeg``.

    The converted file is written next to the input, with the input's
    extension replaced by ``-transcode.flac``.

    Args:
        filename: Path of the audio file to convert.

    Returns:
        Path of the converted FLAC file.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started.
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero
            status (for example, the input cannot be read).
    """
    stripped_name, _ = os.path.splitext(filename)
    output = '{}-transcode.flac'.format(stripped_name)
    command = [
        'ffmpeg', '-i', filename,
        '-ac', '1',
        '-ar', '48000',
        '-acodec', 'flac',
        output,
    ]
    # check=True: fail here instead of returning a path that ffmpeg never
    # (re)wrote, which would otherwise surface later as a confusing upload
    # error or a stale transcript.
    subprocess.run(command, check=True)
    print("transcoded: ", output)
    return output
80+
6481
65- # [START def_transcribe_gcs]
6682def transcribe_gcs (gcs_uri , output ):
6783 """Asynchronously transcribes the audio file specified by the gcs_uri."""
6884 client = speech .SpeechClient ()
@@ -73,40 +89,45 @@ def transcribe_gcs(gcs_uri, output):
7389 metadata .interaction_type = speech .enums .RecognitionMetadata .InteractionType .DISCUSSION
7490 metadata .microphone_distance = speech .enums .RecognitionMetadata .MicrophoneDistance .NEARFIELD
7591 metadata .recording_device_type = speech .enums .RecognitionMetadata .RecordingDeviceType .PC
92+
7693 config = speech .types .RecognitionConfig (
7794 encoding = speech .enums .RecognitionConfig .AudioEncoding .FLAC ,
78- sample_rate_hertz = 16000 ,
95+ sample_rate_hertz = 48000 ,
7996 language_code = 'en-US' ,
8097 metadata = metadata ,
8198 enable_automatic_punctuation = True ,
82- enable_word_time_offsets = True )
99+ enable_word_time_offsets = True ,
100+ )
83101
102+ print ('Transcribing... {}' .format (gcs_uri ))
84103 operation = client .long_running_recognize (config , audio )
104+ operation .add_done_callback (lambda operation_future : save_results (operation_future .result ().results , output ))
105+ return operation
85106
86- print ('Transcribing...' )
87- response = operation .result (timeout = 90 )
88-
def _format_timestamp(total_seconds):
    """Format a non-negative number of seconds as zero-padded ``HH:MM:SS``."""
    hours, remainder = divmod(int(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:0>2d}:{:0>2d}:{:0>2d}'.format(hours, minutes, seconds)


def save_results(results, output):
    """Write recognition results to a CSV file and echo them to stdout.

    Each result is for a consecutive portion of the audio; one row is written
    per result, using its first alternative. Columns are ``timestamp``
    (``HH:MM:SS`` start time of the first word, or empty when the alternative
    has no word timings), ``confidence`` (two decimals) and ``transcript``.

    Args:
        results: Iterable of result objects, each with an ``alternatives``
            sequence whose items expose ``words``, ``confidence`` and
            ``transcript``.
        output: Path of the CSV file to create or overwrite.
    """
    with open(output, 'w', newline='') as csvfile:
        fieldnames = ['timestamp', 'confidence', 'transcript']
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for result in results:
            # A result without alternatives has nothing to report; skip it
            # rather than fail on alternatives[0].
            if not result.alternatives:
                continue
            alternative = result.alternatives[0]
            if alternative.words:
                start_time = alternative.words[0].start_time
                timestamp_str = _format_timestamp(
                    start_time.seconds + 1e-9 * start_time.nanos)
            else:
                timestamp_str = ''
            confidence_str = '{:.2f}'.format(alternative.confidence)
            csvwriter.writerow({
                'timestamp': timestamp_str,
                'confidence': confidence_str,
                'transcript': alternative.transcript,
            })
            print(u'{} | {} | {}'.format(
                timestamp_str, confidence_str, alternative.transcript))
110131
111132
112133if __name__ == '__main__' :
@@ -120,7 +141,9 @@ def transcribe_gcs(gcs_uri, output):
120141 args = parser .parse_args ()
121142 if args .out is None :
122143 args .out = os .path .splitext (args .audio_file )[0 ] + ".csv"
144+ operation = None
123145 if args .audio_file .startswith ('gs://' ):
124- transcribe_gcs (args .audio_file , args .out )
146+ operation = transcribe_gcs (args .audio_file , args .out )
125147 else :
126- transcribe_file (args .audio_file , args .out )
148+ operation = transcribe_file (args .audio_file , args .out )
149+ operation .result ()
0 commit comments