forked from DongboShi/opendac2018
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools.py
135 lines (117 loc) · 3.78 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
import pickle as pkl
import json
from XMeans import XMeans
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
# from opendac2018 import clustering
from settings import assignments_train_path, pubs_validate_path, \
local_output_path, global_output_path, pos_pair_path
# todo:
# from RNN_estimate import get_clusters
# from local import make_input
# NOTE: these two assignments rebind names that are also imported from
# `settings` above, so the hard-coded relative paths below are the ones in
# effect for the rest of this module.
assignments_train_path = './data/assignment_train.json'
pubs_validate_path = './data/pubs_validate.json'
def label2assign(id, y_pred):
    """
    Group paper ids by their predicted cluster label.

    id: sequence of paper ids.
    y_pred: predicted cluster label of each paper, indexed in parallel
        with `id`.

    Returns the assignment form [[id1, id2, ...], [id1, id2, ...]], i.e.
    one list of paper ids per distinct label.
    """
    groups = {}
    for pos, paper_id in enumerate(id):
        # Papers sharing a predicted label end up in the same bucket.
        groups.setdefault(y_pred[pos], []).append(paper_id)
    return [members for members in groups.values()]
def assign2label(lst):
    """
    Flatten one author's assignment into parallel id / label lists.

    lst: [[id1, id2, ...], [id1, id2, ...]] -- one inner list per cluster.

    Returns two lists: the paper ids, and for each id the list of cluster
    indices (0-based position of the cluster in `lst`) in which it appears.
    A paper that belongs to exactly one cluster therefore gets a
    single-element list such as [0]; papers of the same cluster get equal
    label lists.
    """
    labels_by_id = {}
    for cluster_idx, cluster in enumerate(lst):
        for paper_id in cluster:
            labels_by_id.setdefault(paper_id, []).append(cluster_idx)
    return list(labels_by_id), list(labels_by_id.values())
def prec_based_on_rule(ass_path):
    """
    Calculate the precision of an assignment with the pos_pair info.

    ass_path: path to a JSON file mapping each name to its assignment
        ([[id1, id2, ...], [id1, id2, ...]]).

    The positive pairs are loaded from `pos_pair_path`. For every name,
    a pair (pida, pidb) counts as correct when both papers carry the same
    label in that name's assignment.

    Returns a dict mapping each name to the fraction of its pairs that are
    correct. A name whose pair list is empty gets 0.0 rather than raising
    ZeroDivisionError.

    Raises KeyError if a name has no entry in the pos_pair file, or if a
    paired paper id does not occur in the assignment.
    """
    # Context managers make sure both file handles are closed.
    with open(ass_path, encoding="utf-8") as ass_file:
        assigns = json.load(ass_file)
    with open(pos_pair_path, encoding="utf-8") as pair_file:
        pos_pair = json.load(pair_file)

    prec = {}
    for name, assign in assigns.items():
        pairs = pos_pair[name]
        ids, labels = assign2label(assign)
        pid2label = dict(zip(ids, labels))
        total_num = len(pairs)
        corr_num = sum(
            1 for pida, pidb in pairs if pid2label[pida] == pid2label[pidb]
        )
        # No pairs means precision is undefined; report 0.0 instead of crashing.
        prec[name] = corr_num / total_num if total_num else 0.0
    return prec
def cal_f1(prec, rec):
    """
    Return the F1 score (harmonic mean) of precision `prec` and recall `rec`.

    When the division is by zero (both values are 0) the result is 0.0
    instead of a ZeroDivisionError.
    """
    try:
        return 2 * prec * rec / (prec + rec)
    except ZeroDivisionError:
        # prec + rec == 0: define F1 as 0 rather than crashing.
        return 0.0
def pairwise_precision_recall_f1(preds, truths):
    """
    Compute pairwise precision, recall and F1 of a predicted clustering.

    preds: predicted label of each sample.
    truths: ground-truth label of each sample, indexed in parallel with
        `preds`.

    Every unordered pair of samples (i, j), i < j, is classified as:
      - true positive:  same predicted label and same true label;
      - false positive: same predicted label, different true label;
      - false negative: different predicted label, same true label.

    Returns (precision, recall, f1). Each value is 0. when its denominator
    would be zero; f1 is 0. when precision or recall is zero.
    """
    true_pos = false_pos = false_neg = 0
    total = len(preds)

    for first in range(total - 1):
        for second in range(first + 1, total):
            same_pred = preds[first] == preds[second]
            same_truth = truths[first] == truths[second]
            if same_pred and same_truth:
                true_pos += 1
            elif same_pred:
                false_pos += 1
            elif same_truth:
                false_neg += 1

    predicted_pairs = true_pos + false_pos
    actual_pairs = true_pos + false_neg

    precision = true_pos / predicted_pairs if predicted_pairs != 0 else 0.
    recall = true_pos / actual_pairs if actual_pairs != 0 else 0.

    if precision and recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0.
    return precision, recall, f1
# def offline_score():
# assignments_train = json.load(open(assignments_train_path, 'r'))
# Z = pkl.load(open(local_output_path, 'rb'))
# metric = []
# for k, v in assignments_train.items():
# ids = [p['id'] for p in v]
# X = [Z.get(x) for x in ids]
# ids, labs = assign2label(v)
# labels = clustering(X)
# f1 = pairwise_precision_recall_f1(labels, labs)
# print(f1)
# metric.append(f1)
# print('offline f1:', np.mean(metric))
# if __name__ == "__main__":
# # Offline validation:
# assignments_train = json.load(open(assignments_train_path, 'r'))
# metric = []
# for k, v in assignments_train.items():
# ids, labs = assign2label(v)
# model = AgglomerativeClustering(n_clusters=get_clusters(k))
# X = make_input(ids)
# model.fit(X)
# f1 = pairwise_precision_recall_f1(model.label_, labs)
# print(f1)
# metric.append(f1)
# print('offline f1:', np.mean(metric))