-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_time_distribution.py
88 lines (78 loc) · 3.13 KB
/
word_time_distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
# coding=utf-8
import re
import sys
import os
import json
from raw_conversion import raw
def distribute_words(wordlist):
    """Distribute time codes across a list of word tokens.

    Takes a list of word tokens in the Google Cloud StT format. Keeps the
    start time code of the first token and the end time code of the last
    token, and distributes the timecodes of the words in between according
    to a syllable-count heuristic: every word gets at least one share of
    the paragraph timespan, plus one extra share per additional vowel
    nucleus; tokens flagged ``deleted`` get zero duration.

    The token dicts are mutated in place and returned in original order.
    """
    returnlist = []
    start = wordlist[0]['startTime']
    end = wordlist[-1]['endTime']
    timespan = end - start
    # Strip a literal backslash and everything following it from the word
    # text before counting syllables.
    replacements = (r'(.+)(\\.+)', r'\1')
    # One or two adjacent vowel letters count as a single syllable nucleus
    # (Norwegian vowels included). NOTE(review): the original pattern ended
    # the uppercase set with 'Ť' — an encoding corruption of 'Å', fixed here.
    nuclei = re.compile(r'[aeiouyæøåAEIOUYÆØÅ]{1,2}')
    pairlist = []
    for word in wordlist:
        myword = re.sub(replacements[0], replacements[1], raw(word['text']))
        syllables = nuclei.findall(myword)
        if word.get('deleted'):
            # Deleted tokens take no share of the timespan.
            pairlist.append((word, 0))
        else:
            # At least one share even for words with no detected nucleus.
            pairlist.append((word, max(1, len(syllables))))
    syllength = sum(score for _, score in pairlist)
    average_syll_length = timespan / syllength
    last = len(pairlist) - 1
    for n, (token, syllcount) in enumerate(pairlist):
        wordlength = syllcount * average_syll_length
        if last == 0:
            # Single word: keep both original timecodes untouched
            # (the original code clobbered endTime here).
            pass
        elif n == 0:
            # First token keeps its original startTime.
            token['endTime'] = int(token['startTime'] + wordlength)
        elif n == last:
            # Last token keeps its original endTime.
            token['startTime'] = int(returnlist[-1]['endTime'])
        else:
            token['startTime'] = int(returnlist[-1]['endTime'])
            token['endTime'] = int(token['startTime'] + wordlength)
        returnlist.append(token)
    return returnlist
def redistribute_words(googledict):
    """Redistribute the word timecodes of a Google Cloud StT transcription.

    Takes as input a Google Cloud StT transcription dict. Returns a dict,
    compatible with Google Cloud StT and Språklabben, containing the same
    paragraphs with their original start and end timecodes, but with the
    timecodes of all intermediate words regenerated by the heuristic in
    ``distribute_words``.
    """
    returndict = {'paragraphs': []}
    for par in googledict['paragraphs']:
        mydict = {'id': par['id']}
        # 'speaker' is optional in the input; copy it only when present.
        if par.get('speaker') is not None:
            mydict['speaker'] = par['speaker']
        mydict['startTime'] = par['startTime']
        # distribute_words keeps the first startTime and last endTime of the
        # paragraph, so no pre-copying of boundary timecodes is needed
        # (the original code's self-assignments were no-ops).
        mydict['words'] = distribute_words(par['words'])
        returndict['paragraphs'].append(mydict)
    return returndict
if __name__ == "__main__":
    # CLI entry point: read a Google Cloud StT JSON file, redistribute the
    # word timecodes, and write the result to the output file as JSON.
    try:
        googlejson = sys.argv[1]
        outfile = sys.argv[2]
    except IndexError:
        sys.exit("Please provide filenames: python word_time_distribution.py googlejson outfile")
    # Explicit UTF-8 on both ends: with ensure_ascii=False the output may
    # contain non-ASCII characters (æøå), which would crash under a
    # non-UTF-8 platform default encoding.
    with open(googlejson, 'r', encoding='utf-8') as google:
        googledict = json.load(google)
    newdict = redistribute_words(googledict)
    with open(outfile, 'w', encoding='utf-8') as out:
        json.dump(newdict, out, ensure_ascii=False)