language_comparison.py
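"""Compare five Universal Dependencies treebanks (English, Spanish, Dutch,
Arabic, French) in two ways: the unique-word-to-unique-lemma ratio as a rough
signal of morphological richness, and pairwise agreement between the languages'
most likely tag-to-tag transitions under Witten-Bell smoothing."""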

from nltk import FreqDist
from nltk import WittenBellProbDist
from nltk.util import bigrams
import numpy as np
from conllu import parse_incr
from sklearn.metrics import accuracy_score
import pandas as pd

corpora = {'en': 'UD_English-EWT/en_ewt',
           'es': 'UD_Spanish-GSD/es_gsd',
           'nl': 'UD_Dutch-Alpino/nl_alpino',
           'ar': 'UD_Arabic-PADT/ar_padt',
           'fr': 'UD_French-Sequoia/fr_sequoia'}


def train_corpus(lang):
    return corpora[lang] + '-ud-train.conllu'


def test_corpus(lang):
    return corpora[lang] + '-ud-test.conllu'
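
# The paths above assume the UD treebank directories (e.g. UD_English-EWT)
# sit next to this script; train_corpus('en') then resolves to
# 'UD_English-EWT/en_ewt-ud-train.conllu'.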


# Drop multiword-token lines such as the contraction "isn't": conllu parses
# their ids as ranges (tuples), so keeping only integer ids leaves just the
# syntactic words.
def prune_sentence(sent):
    return [token for token in sent if type(token['id']) is int]


def conllu_corpus(path):
    with open(path, 'r', encoding='utf-8') as data_file:
        sents = list(parse_incr(data_file))
    return [prune_sentence(sent) for sent in sents]


# Generate (previous_tag, next_tag) bigram pairs for the transition model,
# with <s> and </s> marking sentence start and end.
def generate_bigram_list(train_sentences):
    tags_outer = []
    for sent in train_sentences:
        tags = ['<s>']
        for token in sent:
            tags.append(token['upos'])
        tags.append('</s>')
        tags_outer.extend(bigrams(tags))
    return tags_outer
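
# Illustrative example: a sentence tagged DET NOUN VERB contributes the pairs
# [('<s>', 'DET'), ('DET', 'NOUN'), ('NOUN', 'VERB'), ('VERB', '</s>')]
# to the list returned by generate_bigram_list.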


# Build the smoothed emission distributions: one Witten-Bell-smoothed
# distribution over words per tag.
def emission_using_witten_bell_smoothing(word_tag_tuple):
    smoothed = {}
    tags = set([t for (_, t) in word_tag_tuple])
    for tag in tags:
        words = [w for (w, t) in word_tag_tuple if t == tag]
        smoothed[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)
    return smoothed
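
# Illustrative use (hypothetical word): with smoothed built from (word, tag)
# pairs, smoothed['NOUN'].prob('dog') is the smoothed P(word='dog' | tag='NOUN');
# Witten-Bell reserves a small non-zero mass for unseen words.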


# Look up the smoothed emission probability P(word | tag).
def calculate_emission_probability(witten_emission_smooth, word_to_check, tag_to_check):
    return witten_emission_smooth[tag_to_check].prob(word_to_check)


# Build the smoothed transition distributions: for each preceding tag tag1,
# a Witten-Bell-smoothed distribution over the following tag tag2.
def transition_using_witten_bell_smoothing(tags_bigram):
    smoothed = {}
    distinct_tags = set([t for (t, _) in tags_bigram])
    for tag1 in distinct_tags:
        tag2 = [t2 for (t1, t2) in tags_bigram if t1 == tag1]
        smoothed[tag1] = WittenBellProbDist(FreqDist(tag2), bins=1e5)
    return smoothed


# Look up the smoothed transition probability P(tag2 | tag1).
def calculate_transition_prob(witten_trans_smooth, tag1, tag2):
    return witten_trans_smooth[tag1].prob(tag2)
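
# Illustrative use: for a bigram list built by generate_bigram_list, e.g.
# smoothed = transition_using_witten_bell_smoothing(bigram_list),
# calculate_transition_prob(smoothed, '<s>', 'DET') gives the smoothed
# probability that a sentence starts with a determiner.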


# Compare two lists of [tag, predicted_next_tag] pairs (aligned by tag order)
# and return the fraction of tags whose predicted next tag agrees.
def calculate_accuracy(predicted_tags, actual_tags):
    predicted_tags = [predicted_tag[1] for predicted_tag in predicted_tags]
    actual_tags = [actual_tag[1] for actual_tag in actual_tags]
    return accuracy_score(actual_tags, predicted_tags)


# Collect up to max_word_count (form, lemma, upos) triples into a data frame,
# so each language contributes a comparably sized sample.
def generate_word_tag_lemma_data_frame(sentence_list):
    word_tag_lemma = []
    max_word_count = 18000
    for sent in sentence_list:
        if len(word_tag_lemma) >= max_word_count:
            break
        for token in sent:
            if len(word_tag_lemma) >= max_word_count:
                break
            word_tag_lemma.append([token['form'], token['lemma'], token['upos']])
    return pd.DataFrame(word_tag_lemma, columns=['word', 'lemma', 'pos_tag'])


def calculate_word_to_lemma_ratio(word_tag_lemma_df, lang):
    # A higher ratio of unique words to unique lemmas indicates richer
    # morphology: many inflected forms map to the same lemma.
    print(lang)
    print("Total data frame size: {}".format(word_tag_lemma_df.shape[0]))
    print("Unique words to unique lemma ratio: {}".format(word_tag_lemma_df.loc[:, 'word'].nunique() /
                                                          word_tag_lemma_df.loc[:, 'lemma'].nunique()))


def generate_transition_matrix(lang_bigram, lang, pos_tags):
    smoothed = transition_using_witten_bell_smoothing(lang_bigram)
    trans_prob_matrix = np.zeros((len(pos_tags), len(pos_tags)))
    # Row = preceding tag, column = following tag, so row i holds the smoothed
    # distribution P(next tag | pos_tags[i]).
    for prev_tag_pos, prev_tag in enumerate(pos_tags):
        for curr_tag_pos, curr_tag in enumerate(pos_tags):
            trans_prob_matrix[prev_tag_pos, curr_tag_pos] = calculate_transition_prob(smoothed,
                                                                                      prev_tag,
                                                                                      curr_tag)
    predicted_bigram = get_tag_probability(trans_prob_matrix, pos_tags)
    return [lang, predicted_bigram]


# For each row (preceding tag), the column with the largest probability is the
# most likely following tag; return [tag, most_likely_next_tag] pairs.
def get_tag_probability(trans_prob_matrix, pos_tags):
    tag_prediction = []
    for prev_tag_pos in range(len(pos_tags)):
        next_tag_pos = int(np.argmax(trans_prob_matrix[prev_tag_pos, :]))
        tag_prediction.append([pos_tags[prev_tag_pos], pos_tags[next_tag_pos]])
    return tag_prediction
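
# Illustrative example (hypothetical values): if pos_tags = ['DET', 'NOUN'] and
# trans_prob_matrix = [[0.1, 0.7], [0.3, 0.2]], the result is
# [['DET', 'NOUN'], ['NOUN', 'DET']]: DET is most often followed by NOUN,
# and NOUN by DET.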


if __name__ == '__main__':
    fr_df = generate_word_tag_lemma_data_frame(conllu_corpus(train_corpus('fr')))
    calculate_word_to_lemma_ratio(fr_df, 'French')
    print("\n")

    en_df = generate_word_tag_lemma_data_frame(conllu_corpus(train_corpus('en')))
    calculate_word_to_lemma_ratio(en_df, 'English')
    print("\n")

    ar_df = generate_word_tag_lemma_data_frame(conllu_corpus(train_corpus('ar')))
    calculate_word_to_lemma_ratio(ar_df, 'Arabic')
    print("\n")

    es_df = generate_word_tag_lemma_data_frame(conllu_corpus(train_corpus('es')))
    calculate_word_to_lemma_ratio(es_df, 'Spanish')
    print("\n")

    nl_df = generate_word_tag_lemma_data_frame(conllu_corpus(train_corpus('nl')))
    calculate_word_to_lemma_ratio(nl_df, 'Dutch')
    print("\n")

    fr_bigrams = list(bigrams(fr_df.loc[:, 'pos_tag']))
    en_bigrams = list(bigrams(en_df.loc[:, 'pos_tag']))
    ar_bigrams = list(bigrams(ar_df.loc[:, 'pos_tag']))
    es_bigrams = list(bigrams(es_df.loc[:, 'pos_tag']))
    nl_bigrams = list(bigrams(nl_df.loc[:, 'pos_tag']))

    # Only keep the POS tags common to all five languages (sorted so that the
    # row order, and hence the comparison, is reproducible across runs).
    tags = sorted(set.intersection(*map(set, [fr_df.loc[:, 'pos_tag'],
                                              en_df.loc[:, 'pos_tag'],
                                              ar_df.loc[:, 'pos_tag'],
                                              es_df.loc[:, 'pos_tag'],
                                              nl_df.loc[:, 'pos_tag']])))

    list_trans_matrix = [generate_transition_matrix(fr_bigrams, 'French', tags),
                         generate_transition_matrix(en_bigrams, 'English', tags),
                         generate_transition_matrix(ar_bigrams, 'Arabic', tags),
                         generate_transition_matrix(es_bigrams, 'Spanish', tags),
                         generate_transition_matrix(nl_bigrams, 'Dutch', tags)]

    # Compare each pair of languages on how often they agree about the most
    # likely next POS tag after a given tag.
    for first_trans_matrix in list_trans_matrix:
        for second_trans_matrix in list_trans_matrix:
            if first_trans_matrix[0] != second_trans_matrix[0]:
                print("\n{} vs {}".format(first_trans_matrix[0], second_trans_matrix[0]))
                print(calculate_accuracy(first_trans_matrix[1], second_trans_matrix[1]))
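    # Note that each pair is printed twice (A vs B and B vs A) with the same
    # score, since positionwise agreement between the two lists is symmetric.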