old_kl.py · executable file · 156 lines (121 loc) · 6.02 KB
#!/usr/bin/env python
import sys
import math
from collections import defaultdict

import nltk
#from unidecode import unidecode ## Trouble??

PRECLUDE_STOPWORDS = False  # drop stopwords from the token stream entirely
OVERLOOK_STOPWORDS = True   # keep stopwords in the stream but skip them as co-occurrence terms
DOWNCASE = True
BAD_CHARS = "! @ # $ % ^ & * ( ) _ + - = { } [ ] | . \\ : ; ' < > ? , / ~ ` \"".split()
VERY_BAD_CHARS = "' \"".split()
CO_OCCURRENCE_WINDOW = 9    # co-occurrence window: this many tokens on either side

# Closed-class (stop) words, treated as non-content terms.
CLOSED_WORDS = "non those thus with one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty were which was am what you yeah go who where when why 's is we i will this may until would should could a about above according across actually adj after afterwards again against all almost alone along already also although always among amongst an and another any anyhow anyone anything anywhere are around as at b be became because become becomes becoming been before beforehand begin behind being below beside besides between beyond both but by c can cannot caption co co. could d did do does down during e each eg eight eighty either else elsewhere end ending enough etc even ever every everyone everything everywhere except f few first for found from further g h had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie if in inc. indeed instead into is it its itself j k l last later latter latterly least less let like likely ltd m made make makes many maybe me meantime meanwhile might miss more moreover most mostly mr mrs much must my myself n namely neither never nevertheless next nine ninety no nobody none nonetheless noone nor not nothing now nowhere o of off often on once one only onto or other others otherwise our ours ourselves out over overall own p per perhaps q r rather recent recently s same seem seemed seeming seems seven several she should since so some somehow someone something sometime sometimes somewhere still such t to taking than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they where your yours yourself yourselves z".split()
def get_co_occurrences(s, exact_match=False):
    # Returns (per-token counts, windowed co-occurrence counts) for the string s.
    # exact_match is accepted for interface compatibility but is unused here.
    r = defaultdict(float)
    count = defaultdict(int)
    if DOWNCASE: s = s.lower()
    for l in BAD_CHARS:
        s = s.replace(l, "")
    words = s.split()
    # Preclude stopwords?
    if PRECLUDE_STOPWORDS:
        for c in CLOSED_WORDS:
            if DOWNCASE:
                if c in words:
                    sys.stderr.write(" Precluded stopword " + c + "\n")
                    words = [x for x in words if x != c]
            else:  # To maintain capitalisation, only if not DOWNCASE:
                for w in words:
                    if w.lower() == c.lower():
                        sys.stderr.write(" Precluded stopword " + c + "\n")
                        words = [x for x in words if x != c]
    if OVERLOOK_STOPWORDS:
        for i, t in enumerate(words):
            # Counted up here because `count` is sometimes used for things that
            # do not use OVERLOOK_STOPWORDS (i.e. syntactic graphs).
            count[t] += 1
            if t in CLOSED_WORDS: continue
            start = max(i - CO_OCCURRENCE_WINDOW, 0)
            end = min(i + CO_OCCURRENCE_WINDOW + 1, len(words))  # +1: a slice excludes its endpoint
            for w in words[start:end]:
                if w != t and w not in CLOSED_WORDS: r[t + " " + w] += 1.0
    else:
        for i, t in enumerate(words):
            start = max(i - CO_OCCURRENCE_WINDOW, 0)
            end = min(i + CO_OCCURRENCE_WINDOW + 1, len(words))
            count[t] += 1
            for w in words[start:end]:
                if w != t: r[t + " " + w] += 1.0
    return count, r  # r is an empty defaultdict when no pairs were found
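# Illustrative sketch (not part of the original script): on a toy string the
# function returns per-token counts and windowed co-occurrence counts keyed by
# "term context" strings. With OVERLOOK_STOPWORDS, stopwords such as "like"
# are still counted but never appear in a pair.
#   >>> count, co = get_co_occurrences("red cats like red dogs")
#   >>> count["red"]
#   2
#   >>> co["red cats"]
#   2.0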
# Normalise a discrete frequency distribution (a dict of counts) in place so it:
#   1) is smoothed with the Krichevsky-Trofimov estimator (add 0.5 to every count), and
#   2) sums to 1.
# http://en.wikipedia.org/wiki/Krichevsky%E2%80%93Trofimov_estimator
def KT_norm(d):
    for k in d: d[k] += 0.5  # K-T smoothing
    s = sum(d.values())
    for k in d: d[k] /= s    # normalise so the values sum to 1
    return d
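# Illustrative sketch (not part of the original script): with counts
# {"a": 2.0, "b": 0.0}, smoothing gives {"a": 2.5, "b": 0.5}, which normalises
# to {"a": 2.5/3, "b": 0.5/3}. Zero-count events keep a non-zero probability,
# which kl_div below relies on to avoid log(0) and division by zero.
#   >>> KT_norm({"a": 2.0, "b": 0.0})
#   {'a': 0.8333333333333334, 'b': 0.16666666666666666}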
# Get the K-L divergence from distribution P to Q (P should be a baseline, as
# the function is not symmetric).
def kl_div(a, b):
    p = a.copy()
    q = b.copy()
    keys = set(p.keys()) | set(q.keys())
    for k in keys:
        if k not in p: p[k] = 0.0
        if k not in q: q[k] = 0.0
    p = KT_norm(p)
    q = KT_norm(q)
    r = 0.0
    for k in keys:
        r += p[k] * math.log(p[k] / q[k])  # definition of K-L divergence
    return r
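# The quantity computed above is D(P || Q) = sum_k p_k * ln(p_k / q_k), in nats.
# Illustrative sketch (not part of the original script): identical inputs give
# zero divergence, since after K-T smoothing both normalise to {a: 0.5, b: 0.5}
# and 0.5*ln(1) + 0.5*ln(1) = 0.
#   >>> kl_div({"a": 1.0, "b": 1.0}, {"a": 1.0, "b": 1.0})
#   0.0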
# Usage: old_kl.py <corpus_file> <output_csv>
corpus_file = sys.argv[1]
output_file = sys.argv[2]
sys.stderr.write("----------------------------------\nReading in corpus (" + corpus_file + ")...\n")
with open(corpus_file, "r") as fin:
    corpus = fin.read()
sys.stderr.write("Tokenizing...")
#corpus = unidecode(corpus)
#sys.stderr.write(".")
toks = nltk.word_tokenize(corpus)
sys.stderr.write(".")
text = nltk.Text(toks)
sys.stderr.write(".")
# Keep alphanumeric tokens longer than two characters that are not stopwords.
words = [w.lower() for w in text
         if w.isalnum() and w.lower() not in CLOSED_WORDS and len(w) > 2]
sys.stderr.write(".\n")
sys.stderr.write("Counting TF...\n")
TF = defaultdict(float)
for token in words:
    TF[token] += 1.0
sys.stderr.write("Retrieving co-occurrences...\n")
count,co_occurrences = get_co_occurrences(corpus)
CO = {}
sys.stderr.write("Calculating K-L divergences...\n")
fout = open(output_file,'w')
fout.write("term,KL(tf,co),KL(co,tf),sym_KL_div\n")
for target in TF:
if (TF[target] < 3): continue
CO.clear()
for co,f in co_occurrences.iteritems():
w,t = co.split()
if (w == target):
if w in CO: CO[t] += f
else: CO[t] = f
kl1 = kl_div(TF,CO)
kl2 = kl_div(CO,TF)
fout.write(target + "," + str(kl1) + "," + str(kl2) + "," + str((kl1 + kl2) / 2.0) + "\n")
fout.close()
exit(1)
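# Illustrative usage (file names are hypothetical, not from the original repo):
#   $ python old_kl.py corpus.txt divergences.csv
# The output CSV has one row per term seen at least three times, with the
# columns term,KL(tf,co),KL(co,tf),sym_KL_div written by the header above.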