import argparse
import io
import sys

import numpy as np
import torch

# Imports the Google Cloud client library (pre-2.0 API; see requirements below)
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

from trans_to_phon import *
from ctcmodel_main import *
from viterbi_align import *
from phoneme_list import *

client = speech.SpeechClient()
## Requirements
# (1) pip install "google-cloud-speech<2.0"
#     (this script uses the `enums`/`types` modules, which were removed in
#      google-cloud-speech 2.0)
# (2) git clone --recursive https://github.com/parlance/ctcdecode.git
#     cd ctcdecode
#     pip install .
# (3) Set up gcloud credentials:
#     https://cloud.google.com/speech-to-text/docs/quickstart-gcloud
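# For example, credentials are typically supplied through a service-account
# key file via the standard environment variable (the path below is
# illustrative, not part of this repo):
#   export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json"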
def get_gcloud_transcript(file_name):
    """Send a local WAV file to the Google Cloud Speech API and return the
    transcript of the top alternative of the first result."""
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US')
    # Detects speech in the audio file
    response = client.recognize(config, audio)
    result = response.results[0]
    transcript = result.alternatives[0].transcript
    return transcript
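# Note: on google-cloud-speech >= 2.0 the `enums`/`types` modules no longer
# exist. A sketch of the equivalent call against the 2.x API (untested here,
# since this script targets the 1.x client) would be:
#
#   config = speech.RecognitionConfig(
#       encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
#       language_code='en-US')
#   audio = speech.RecognitionAudio(content=content)
#   response = client.recognize(config=config, audio=audio)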
def run_main(file):
    """Transcribe `file` with Google Cloud Speech, convert the transcript to a
    phoneme sequence, and Viterbi-align it against the CTC model's frame-wise
    log-probabilities. Returns (predicted phonemes, end timestamps in seconds),
    or None if the audio is too short to align."""
    phoneme_list = PHONEME_LIST + [' ']
    transcript = get_gcloud_transcript(file)
    # dictionary = word_phon_dict("/home/hyd/fisher_complete_trans/f1+f2.complete.fordecode.dict")
    dictionary = word_phon_dict("cmudict.0.6d.wsj0.forfalignment")
    phonemes = convert_sentence_trans_phones(transcript, dictionary)
    phonemes_id = [PHONEME_LIST.index(l) for l in phonemes]
    feature_input = get_feature(file)
    # Skip clips that are too short or that could not be broken into phonemes.
    if feature_input.shape[0] < 50 or len(phonemes_id) < 2:
        return None
    # 47 output classes (the phoneme inventory plus the CTC blank),
    # 512 hidden units, 3 layers.
    model = CTCPhonemeSegmentor(47, 512, 3)
    # Load the pretrained checkpoint on CPU (no .cuda() call is made below).
    checkpoint = torch.load('18.model', map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    log_softmax_logits, softmax_logits, lens = model([feature_input])
    # (time, classes) -> (classes, time)
    logits = log_softmax_logits.detach().cpu().squeeze().transpose(1, 0)
    output_sequence = viterbi_align(logits, phonemes_id)
    blank_id = logits.shape[0] - 1  # the CTC blank is the last class
    # Keep the frame indices where the aligner emitted a real phoneme, and
    # convert them to seconds (frames are 10 ms apart).
    time_stamps_not_blank = np.array(
        [i for i in range(len(output_sequence)) if output_sequence[i] != blank_id]) / 100.
    output_sequence_not_blank = [j for j in output_sequence if j != blank_id]
    phone_pred = [phoneme_list[j] for j in output_sequence_not_blank]
    return phone_pred, time_stamps_not_blank
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='gcloud_main.py',
        description='Takes in an input WAV file and outputs (1) a list of '
                    'phonemes and (2) time stamps for each phoneme')
    parser.add_argument('-infile', required=True, help='The input file to process')
    args = parser.parse_args()
    result = run_main(args.infile)
    if result is None:
        sys.exit('Audio too short (or no phonemes found); nothing to align.')
    output_sequence, time_stamps = result
    print("Predicted phonemes: ")
    print(output_sequence)
    print("At timestamps (in seconds): ")
    print(time_stamps)
    print(len(output_sequence), len(time_stamps))
## time_stamps[0] is when the first phoneme ends. Duration (0 -> time_stamps[0])
## time_stamps[1] is when the second phoneme ends. Duration (time_stamps[0] -> time_stamps[1])
## and so on ...
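
# As a worked example of the timestamp convention above: each phoneme's
# interval runs from the previous phoneme's end time to its own end time.
# The helper below is an illustrative sketch, not part of the original
# pipeline.
def to_intervals(phones, end_times):
    """Pair each phoneme with its (start, end) interval in seconds."""
    intervals = []
    start = 0.0
    for phone, end in zip(phones, end_times):
        intervals.append((phone, start, end))
        start = end
    return intervals

# Example invocation of the script (the file name is illustrative):
#   python gcloud_main.py -infile sample.wav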