-
Notifications
You must be signed in to change notification settings - Fork 8
/
create_T.py
74 lines (53 loc) · 1.89 KB
/
create_T.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# %% Imports
import pickle # for saving and loading lists to and from files
from collections import Counter # to fast count occurrences
import untangle # to convert XML to python objects
# %% number of posts with tag t in P
# Load the pickled object stored in alltags.dmp and count how often each
# entry occurs; Counter yields a "tag : occurrence" mapping for P.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a dump produced by this project itself.
alltags_path = "../data/processed/alltags.dmp"
with open(alltags_path, "rb") as dump_file:  # open the dump for binary reading
    tag_set = pickle.load(dump_file)  # presumably a list of tags, despite the name
tag_in_P = Counter(tag_set)
# %% number of posts with tag t in S
# Build the "tag : occurrence" mapping for S from the <row> entries of
# Tags.xml: each row's TagName attribute becomes the key and its Count
# attribute the value. Values are stored exactly as untangle returns them;
# no numeric conversion happens here.
input_file = "../data/raw/Tags.xml"  # input XML file
obj = untangle.parse(input_file)  # untangle XML file
# Iterate the rows directly instead of indexing through range(len(...)).
# If a TagName occurs more than once, the last row wins (same as the
# previous dict.update() behaviour).
# NOTE(review): untangle returns a bare Element rather than a list when the
# file has exactly one <row>; direct iteration is expected to handle that
# case too — confirm against the installed untangle version.
tag_in_S = {row['TagName']: row['Count'] for row in obj.tags.row}
# %% calculate alpha
# alpha(t) = (occurrences of t in P) / (count of t in S).
# A tag is kept when alpha reaches ALPHA_THRESHOLD. Tags that have no entry
# in tag_in_S are skipped.
ALPHA_THRESHOLD = 0.1
alpha_tags = []
for tag, count_in_P in tag_in_P.items():
    if tag not in tag_in_S:
        continue
    count_in_S = int(tag_in_S[tag])  # value may still be text; convert before dividing
    if count_in_S == 0:
        # The ratio is undefined for a zero count. Skip the tag instead of
        # letting ZeroDivisionError abort the whole run.
        continue
    if int(count_in_P) / count_in_S >= ALPHA_THRESHOLD:
        alpha_tags.append(tag)
# %% write alpha tag list
# Persist the alpha-selected tags as plain text, one tag per line.
with open('../data/processed/alphatags.txt', 'w') as alpha_out:
    alpha_out.writelines("%s\n" % tag for tag in alpha_tags)
# %% calculate beta
# beta(t) = (occurrences of t in P) / (number of distinct tags in P).
# A tag is kept when beta is at least 0.01.
# NOTE(review): the divisor is len(tag_in_P), i.e. the number of distinct
# tags rather than a post count — confirm this is the intended definition.
distinct_tag_total = len(tag_in_P)  # identical for every tag, so computed once
beta_tags = [
    tag
    for tag, occurrences in tag_in_P.items()
    if int(occurrences) / distinct_tag_total >= 0.01
]
# %% write beta tag list
# Persist the beta-selected tags as plain text, one tag per line.
with open('../data/processed/betatags.txt', 'w') as beta_out:
    beta_out.writelines("%s\n" % tag for tag in beta_tags)
# %% create T
# T holds the tags present in both alpha_tags and beta_tags, in the order
# they appear in alpha_tags.
# The set is built once so each membership test is a hash lookup instead of
# a scan over the whole beta_tags list for every alpha tag.
beta_lookup = set(beta_tags)
T = [tag for tag in alpha_tags if tag in beta_lookup]
# %% write T tag list
# Store the final tag list T as plain text, one tag per line, then echo the
# list to stdout.
with open('../data/processed/T.txt', 'w') as t_out:
    t_out.writelines("%s\n" % tag for tag in T)
print(T)
# %%