# unique_top_words.py
# Forked from noah14noah/Gene_Editing_Study.
from ModelEstimation import LDAModel
import gensim
import ast
import sys
import numpy
# Number of topics in the trained model; also used to build the name of the
# top-words file read further down.
num_topics = 50

## Load model, dictionary and set to top topics
ldam = LDAModel()
# get_model() comes from ModelEstimation; presumably it populates
# ldam.ldamodel, which is queried later in this script -- TODO confirm.
ldam.get_model()
dictionary = gensim.corpora.Dictionary.load("RC_LDA_Dict_True.dict")

# "top_topic_ids" should be copied manually from the output path. Every
# non-blank line is parsed as one integer topic id.
with open("top_topic_ids", "r") as ids_file:
    stripped_entries = (raw_line.strip() for raw_line in ids_file)
    top_topics = [int(entry) for entry in stripped_entries if entry != ""]
## Extract the top words of top topics
# NOTE: Requires top_words_all_[num_topics] in the model's directory.
# Please manually copy from [output_path]
#
# The file is consumed as pairs of lines, one pair per topic: the
# even-numbered line (0-based) is skipped and the odd-numbered line is parsed
# with ast.literal_eval and stored under topic id idx // 2. Later code indexes
# each parsed entry with [0] (the word) and [1] (its weight).
# Topics for which no line is read keep the placeholder value 0.
top_word = {key: 0 for key in range(num_topics)}
with open("top_words_all_" + str(num_topics), "r") as f:
    for idx, line in enumerate(f):
        # Two lines per topic. The cutoff is derived from num_topics (it used
        # to be a hard-coded 100) so that changing num_topics cannot silently
        # truncate the read or create keys outside range(num_topics).
        if idx == 2 * num_topics:
            break
        if idx % 2 != 0:
            topic_id = idx // 2
            words = ast.literal_eval(line)
            # comment out the next line if you want [topn] top words, not
            # [topn]/2
            # NOTE(review): the slice keeps len/2 - 1 entries, i.e. one fewer
            # than [topn]/2 -- confirm that this is intended.
            words = words[:int((len(words) / 2) - 1)]
            top_word[topic_id] = words
## Identify unique and almost unique top words of top topics
# uniques[topic]         -> words listed as unique for that topic
# twos[topic]            -> words that occur in exactly two topics overall
# other_top_words[topic] -> words of every *other* top topic (top topics only)
uniques = {key: [] for key in top_word.keys()}
twos = {key: [] for key in top_word.keys()}
other_top_words = {key: [] for key in top_word.keys()}

# Pass 1 (top topics only): a word is unique for a top topic when no other top
# topic lists it among its top words.
for topic in top_word.keys():
    if topic in top_topics:
        print(topic)
        for other_topic in top_word.keys():
            if other_topic in top_topics:
                print(other_topic)
                print(other_topic == topic)
                if other_topic != topic:
                    for word in [x[0] for x in top_word[other_topic]]:
                        other_top_words[topic].append(word)
        # Entries of top_word[topic] are (word, weight) pairs while
        # other_top_words holds bare words. Comparing the whole pair against
        # that list could never match, so every pair used to be reported as
        # unique. Compare (and store) the word itself instead.
        words_elsewhere = set(other_top_words[topic])
        for word in top_word[topic]:
            if word[0] not in words_elsewhere:
                uniques[topic].append(word[0])

# Count in how many topics' top-word lists (all topics, not only top topics)
# each word occurs.
counts = {}
for topic in top_word.keys():
    for word in top_word[topic]:
        counts[word[0]] = counts.get(word[0], 0) + 1

# Pass 2 (all topics): words occurring in exactly one topic are unique, words
# occurring in exactly two topics are "almost unique".
for topic in top_word.keys():
    for word in top_word[topic]:
        token = word[0]
        if counts[token] == 1:
            # Pass 1 may already have listed this word; do not list it twice.
            if token not in uniques[topic]:
                uniques[topic].append(token)
        elif counts[token] == 2:
            twos[topic].append(token)
## Calculate summary statistics for a unique or almost unique word being
# associated with its assigned top topic or a different topic
#
# Collect every (word, weight) entry across all topics, keeping only the first
# entry seen for each word. De-duplicating on the word (rather than on the
# whole pair, whose weight differs from topic to topic) keeps a word shared by
# several topics from being looked up and counted more than once.
all_top_words = []
seen_words = set()
for topic in uniques.keys():
    for word in top_word[topic]:
        if word[0] not in seen_words:
            seen_words.add(word[0])
            all_top_words.append(word)

maximum_non_assigned = 0  # largest second-ranked probability seen so far
all_assigned = []         # highest topic probability of each word
all_non_assigned = []     # second-highest topic probability of each word
for word in all_top_words:
    # Raises KeyError if the word is missing from the dictionary.
    term_id = dictionary.token2id[str(word[0])]
    # Sort the model's topic entries for this term by their second field
    # (per the gensim documentation, the probability), highest first.
    all_of_them = ldam.ldamodel.get_term_topics(term_id)
    all_of_them = sorted(all_of_them, key=lambda x: x[1], reverse=True)
    if len(all_of_them) > 0:
        # NOTE(review): the best-ranked topic is taken as the "assigned" one
        # without checking that it is the topic the word was listed under.
        all_assigned.append(all_of_them[0][1])
    if len(all_of_them) > 1:
        all_non_assigned.append(all_of_them[1][1])
        if all_of_them[1][1] > maximum_non_assigned:
            maximum_non_assigned = all_of_them[1][1]
            # Show which word set the new maximum, with its topic entries.
            print(word[0])
            print(all_of_them)
## Descriptive statistics for unique top words
# First the second-ranked ("different topic") probabilities, then a "***"
# divider, then the top-ranked ("assigned top topic") probabilities.
# The "different topic" maximum is the running maximum tracked while the
# lists were filled; the other figures are computed by numpy from the lists.
print("mean (different topic): ", numpy.mean(all_non_assigned), sep="")
print("max (different topic): ", maximum_non_assigned, sep="")
print("min (different topic): ", numpy.amin(all_non_assigned), sep="")
print(
    "standard deviation (different topic): ",
    numpy.std(all_non_assigned),
    sep="",
)
print("***")
print("mean (assigned top topic): ", numpy.mean(all_assigned), sep="")
print("max (assigned top topic): ", numpy.amax(all_assigned), sep="")
print("min (assigned top topic): ", numpy.amin(all_assigned), sep="")
print(
    "standard deviation (assigned top topic): ",
    numpy.std(all_assigned),
    sep="",
)
## Write unique terms for top topics to file. Change path if need be
# The file is opened in append mode, so repeated runs add to it instead of
# overwriting earlier output. The with-block guarantees the handle is flushed
# and closed (it was previously left open for the rest of the script).
with open("top_uniques.txt", "a+") as fout:
    for topic in top_word.keys():
        if topic in top_topics:
            # One record per top topic: the topic id, then the labelled
            # "unique" and "almost unique" word lists.
            print(topic, file=fout)
            print("unique", file=fout)
            print(uniques[topic], file=fout)
            print("almost unique", file=fout)
            print(twos[topic], file=fout)
## Used in development of figures for publication. For developers' use only
# For every topic, write at most its first 20 entries to "top_twenty_50.txt":
# the entry's first field on one line, its second field on the next.
with open("top_twenty_50.txt", "w") as fout_new:
    for topic_words in top_word.values():
        for entry in topic_words[:20]:
            print(entry[0], file=fout_new)
            print(entry[1], file=fout_new)