-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign_texts.py
149 lines (142 loc) · 7.82 KB
/
align_texts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# coding=utf-8
import nltk
import sys
import os
import re
import jiwer
import json
import csv
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance
#converting transcription json
def get_paragraphs(myjson):
    """Convert a Google StT json paragraphs structure into a list of dicts.

    For each paragraph the returned dict holds: startTime, fulltextlist (the
    original word dicts with timecodes), textlist (just the word strings),
    textstring (textlist joined with spaces) and length (word count).
    """
    paragraphs = []
    for paragraph in myjson['paragraphs']:
        word_dicts = paragraph['words']
        words = [word['text'] for word in word_dicts]
        paragraphs.append({
            'startTime': paragraph['startTime'],
            'textlist': words,
            'textstring': ' '.join(words),
            'length': len(words),
            'fulltextlist': word_dicts,
        })
    return paragraphs
# Search for paragraphs in ref algorithm
def align_words(transdictlist, reflist, leven=False, threshold=0.5):
    """Align transcribed words with a reference (written) text.

    Takes a list of word dicts from Google StT (such as the fulltextlist
    generated by get_paragraphs()) and a tokenized reference text presumed to
    be the written version of the transcribed text. Returns one dict per
    transcription word with the keys: transword, confidence, startTime,
    endTime, transindex (from the transcription) and refword/refindex (the
    matched reference word and its index, or ''/None when unmatched).

    With leven=False only case-insensitively identical words are aligned;
    with leven=True words within a normalized Damerau-Levenshtein distance of
    `threshold` are also aligned. A match is only committed once it is part of
    a bigram match (two adjacent words matching in both texts) -- except for
    the very last transcription word, where a lone pending match is accepted.
    Confirmed matches move the reference search window forward, so matches
    cannot cross.

    Fixes vs. the previous version: stoprefindex starts at -1 (starting at 0
    made reference index 0 permanently unmatchable because candidates must lie
    strictly after it), and the duplicated exact/Levenshtein handling blocks
    are unified behind a single match predicate.
    """
    indexed_reflist = list(enumerate(reflist))
    returnlist = []
    bigram = []  # pending match awaiting confirmation by an adjacent second match
    # Highest reference index consumed so far; candidates must lie strictly
    # after it. -1 so that index 0 is matchable (off-by-one fix, see docstring).
    stoprefindex = -1
    last_n = len(transdictlist) - 1
    for n, w in enumerate(transdictlist):
        base = {'transword': w['text'], 'confidence': w['confidence'],
                'startTime': w['startTime'], 'endTime': w['endTime'],
                'transindex': n, 'refword': '', 'refindex': None}
        returnlist.append(base)  # placeholder; replaced if a match is confirmed
        for refindex, refword in indexed_reflist:
            if refindex <= stoprefindex:
                continue  # already consumed by an earlier confirmed match
            exact = w['text'].lower() == refword.lower()
            # Distance is only computed when requested and not an exact match.
            fuzzy = (not exact and leven and
                     normalized_damerau_levenshtein_distance(w['text'], refword)
                     <= threshold)
            if not (exact or fuzzy):
                continue
            # Copy of base with the reference side filled in.
            mymatch = dict(base, refword=refword, refindex=refindex)
            if not bigram:
                if n == last_n:
                    # End of the transcription: accept a lone match outright.
                    returnlist[-1] = mymatch
                    stoprefindex = refindex
                else:
                    bigram.append(mymatch)  # await an adjacent second match
            elif (bigram[0]['refindex'] == refindex - 1 and
                  bigram[0]['transindex'] == n - 1):
                # Adjacent in both texts: replace both placeholders with the
                # confirmed matches (safe because adjacency guarantees the
                # pending match belongs to the previous transcription word).
                returnlist[-2:] = [bigram[0], mymatch]
                stoprefindex = refindex
                bigram = []
            else:
                # Not adjacent: drop the stale pending match, keep this one.
                bigram = [mymatch]
            break  # at most one reference candidate per transcription word
    return returnlist
def find_paragraph_pair(trans_json, ref_lst, threshold=0.7):
    """Pair each transcription paragraph with its best-matching reference window.

    Takes a Google StT json dict and a written version of the text (tokenized
    as a list), slides a window of each paragraph's length over the reference
    list and keeps the window with the lowest normalized Damerau-Levenshtein
    distance below `threshold`. When a window qualifies, align_words() is run
    once on the best pair and the aligned words are stored under 'matches'.

    Returns one dict per paragraph, in paragraph order, with keys: trans, ref,
    startTime, dist, ref_start_index, ref_end_index and (only when a window
    beat the threshold) matches.

    Fixes vs. the previous version: align_words() is now called once after the
    window search instead of on every improvement inside the loop (the same
    expensive alignment was recomputed and all but the last result discarded),
    and ref_start_index/ref_end_index are now populated (they were created but
    never updated).
    """
    trans_dict_list = get_paragraphs(trans_json)
    result_dict_list = []  # output list
    for counter_par, elem in enumerate(trans_dict_list):
        print('Reading paragraph %s' % counter_par)
        textlist = elem['textlist']
        fulltextlist = elem['fulltextlist']
        parlength = elem['length']
        # Intermediate storage of the best window found so far; dist starts at
        # the maximum possible normalized distance (1).
        intermediate_results = {'trans': fulltextlist, 'ref': '',
                                'startTime': elem['startTime'], 'dist': 1,
                                'ref_start_index': 0, 'ref_end_index': 0}
        # TODO: limit the search space to increase efficiency
        for n in range(len(ref_lst)):
            # Search window of same length as the transcription paragraph.
            window = ref_lst[n:n + parlength]
            dist = normalized_damerau_levenshtein_distance(textlist, window)
            # Keep only windows below the threshold, and only when they beat
            # the best distance seen so far.
            if dist < threshold and dist < intermediate_results['dist']:
                intermediate_results['ref'] = window
                intermediate_results['dist'] = dist
                intermediate_results['ref_start_index'] = n
                intermediate_results['ref_end_index'] = n + parlength
        if intermediate_results['ref'] != '':
            # Align the words of the best matching pair (once).
            intermediate_results['matches'] = align_words(
                fulltextlist, intermediate_results['ref'], leven=True)
        result_dict_list.append(intermediate_results)
    return result_dict_list
def swap_word_pairs(myjson, ref_txt, parthreshold=0.7):
    """Replace transcribed words with their aligned reference words.

    Takes a paragraphs dict from a Google StT json file and a written version
    of the transcribed text (tokenized as a list), and returns a new
    paragraphs dict where each word is replaced by the aligned word from the
    written version (confidence set to 1) when an alignment exists, and kept
    as the original transcribed word (original confidence) otherwise.

    Fix vs. the previous version: paragraphs for which find_paragraph_pair()
    found no reference window below the threshold carry no 'matches' key; that
    used to raise KeyError here. Such paragraphs now fall back to their
    original transcription words unchanged.
    """
    returndict = {'paragraphs': []}
    pairslist = find_paragraph_pair(myjson, ref_txt, threshold=parthreshold)
    # find_paragraph_pair returns one entry per paragraph, in order.
    for par, pair in zip(myjson['paragraphs'], pairslist):
        mydict = {'startTime': par['startTime'], 'words': [], 'id': par['id']}
        matches = pair.get('matches')
        if matches is None:
            # No paragraph-level match: keep the transcription words as-is.
            mydict['words'] = [dict(word) for word in par['words']]
        else:
            for match in matches:
                mytoken = {'endTime': match['endTime'],
                           'startTime': match['startTime']}
                if match['refword'] == '':
                    # Unaligned word: keep transcription text and confidence.
                    mytoken['text'] = match['transword']
                    mytoken['confidence'] = match['confidence']
                else:
                    # Aligned word: take the written form, full confidence.
                    mytoken['text'] = match['refword']
                    mytoken['confidence'] = 1
                mydict['words'].append(mytoken)
        returndict['paragraphs'].append(mydict)
    return returndict