forked from Timilehin/Yoruba-Intonator
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsemantic_bigram.py
48 lines (33 loc) · 1.1 KB
/
semantic_bigram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import collections
import utils
import nltk
def generate_contextual_bigram(filename):
    """Count unordered word pairs per line of *filename* and write them out.

    Each input line is split on whitespace and handed to
    ``utils.get_all_pairs``; every pair it returns is sorted before being
    counted, so ``(a, b)`` and ``(b, a)`` share a single counter entry.
    The totals are written to ``contextual_bigram_data.txt`` in the current
    working directory, one ``first,second,count`` record per line,
    replacing any existing file.

    Tokens are written verbatim, so a token that itself contains a comma is
    not escaped in the output.

    Args:
        filename: Path to a text file with one whitespace-tokenized
            sentence per line.
    """
    semantic_bigram_freq = collections.Counter()

    # Context managers guarantee both handles are closed even when counting
    # or writing raises. UTF-8 is explicit so diacritic-bearing text does
    # not depend on the platform's locale encoding.
    with open(filename, "r", encoding="utf-8") as source:
        for f_line in source:
            tokens = f_line.split()
            # NOTE(review): presumably get_all_pairs yields every two-token
            # combination on the line -- confirm against utils.
            for pair in utils.get_all_pairs(tokens):
                # sorted() leaves the helper's return value untouched and
                # also accepts tuple pairs, unlike the in-place list.sort().
                semantic_bigram_freq[tuple(sorted(pair))] += 1

    with open("contextual_bigram_data.txt", "w", encoding="utf-8") as fh:
        for (first, second), freq in semantic_bigram_freq.items():
            fh.write("{},{},{}\n".format(first, second, freq))
def generate_bigram(filename):
    """Count adjacent-token bigrams in *filename* and write them to disk.

    Each input line is split on whitespace and every pair of neighbouring
    tokens on that line is counted; pairs never span a line break. The
    totals are written to ``bigram_data.txt`` in the current working
    directory, one ``first,second,count`` record per line, replacing any
    existing file. Records appear in order of first occurrence.

    Tokens are written verbatim, so a token that itself contains a comma is
    not escaped in the output.

    Args:
        filename: Path to a text file with one whitespace-tokenized
            sentence per line.
    """
    bigram_freq = collections.Counter()

    # Context managers guarantee both handles are closed even when counting
    # or writing raises. UTF-8 is explicit so diacritic-bearing text does
    # not depend on the platform's locale encoding.
    with open(filename, "r", encoding="utf-8") as source:
        for f_line in source:
            tokens = f_line.split()
            # Neighbouring pairs, the same tuples nltk.ngrams(tokens, 2)
            # yields; lines with fewer than two tokens contribute nothing.
            bigram_freq.update(zip(tokens, tokens[1:]))

    with open("bigram_data.txt", "w", encoding="utf-8") as bigram_file:
        for (first, second), freq in bigram_freq.items():
            bigram_file.write("{},{},{}\n".format(first, second, freq))