-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerate_neighbor_wmd.py
125 lines (95 loc) · 3.11 KB
/
generate_neighbor_wmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import pandas as pd
from tqdm import tqdm
from .utils import dist_utils,split_data,nlp_utils
import scipy.stats as sps
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
import wmd
from wmd import WMD
seed = 1024
np.random.seed(seed)
path = '../data/'
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
data_all = pd.concat([train, test])[['question1','question2']]
#dup index
# Assign every distinct question (across both columns of train and test) an
# integer id, ordered by descending occurrence count via value_counts.
q_all = pd.DataFrame(np.hstack([train['question1'], test['question1'],
            train['question2'], test['question2']]), columns=['question'])
q_all = pd.DataFrame(q_all.question.value_counts()).reset_index()
q_num = dict(q_all.values)  # question text -> occurrence count
# question text -> id, and the inverse id -> question text.
# (The original built these with two redundant explicit loops; the inverse
# is just q_index flipped, so build it directly.)
q_index = {key: i for i, key in enumerate(q_num)}
index_q = {i: key for key, i in q_index.items()}
data_all['q1_index'] = data_all['question1'].map(q_index)
data_all['q2_index'] = data_all['question2'].map(q_index)
#link edges
# Adjacency lists over question ids: q_list[q] collects every question that
# appears paired with q anywhere in the data.  All consumers wrap these lists
# in set(), so duplicate entries are harmless.  The original guarded appends
# with `setdefault(q1,[q2]) != [q2]`, an accidental partial dedup that only
# worked when the list had exactly one element — plain appends are equivalent
# for the downstream features and much clearer.
q_list = {}
dd = data_all[['q1_index','q2_index']].values
for i in tqdm(np.arange(data_all.shape[0])):
    q1, q2 = dd[i]
    q_list.setdefault(q1, []).append(q2)
    q_list.setdefault(q2, []).append(q1)
# spaCy 1.x-era API: the `create_pipeline` hook wires the wmd-relax pipeline
# into the spaCy Doc, presumably making Doc.similarity compute Word Mover's
# Distance.  NOTE(review): spaCy >= 2 removed this signature — this script
# requires a pinned spacy 1.x; confirm before upgrading.
nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)
def _get_wmd_dis(q1, q2):
    """Return the WMD-based similarity between two question strings.

    Uses the module-level spaCy `nlp` pipeline; returns -1 if the
    similarity computation fails (e.g. empty or out-of-vocabulary text).
    """
    doc1 = nlp(q1)
    doc2 = nlp(q2)
    try:
        sim = doc1.similarity(doc2)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer silently swallowed.
    except Exception:
        sim = -1
    return sim
def calc_neigh_wmd(neighs, qind):
    """Aggregate WMD similarities between question `qind` and its neighbors.

    Parameters
    ----------
    neighs : iterable of neighbor question ids
    qind : int, id of the anchor question (key into module-level `index_q`)

    Returns
    -------
    list of 5 values: [mean, std, max, min, median] of the per-neighbor
    similarities, or 5*[-1] when the anchor id is unknown or no neighbor
    id maps back to a question string.
    """
    if qind not in index_q:
        return 5*[-1]
    q_str = index_q[qind]
    # Score only neighbors that map back to an actual question string
    # (ids can be NaN when the source question was missing).
    sim_fea = [_get_wmd_dis(q_str, index_q[i]) for i in neighs if i in index_q]
    # BUG FIX: the original set s = -1 for an empty sim_fea but then still
    # called the aggregator, so np.mean/np.std/np.median returned nan
    # (only np.max/np.min raised and fell back to -1).  Return -1s outright.
    if not sim_fea:
        return 5*[-1]
    score = []
    # The original's `None if m == ""` branch was dead code — no mode is "".
    for agg in (np.mean, np.std, np.max, np.min, np.median):
        try:
            s = agg(sim_fea)
        except Exception:
            s = -1
        score.append(s)
    return score
# For each pair row, compute the 5 neighbor-WMD aggregates for each side.
fea_q1 = []
fea_q2 = []
for i in tqdm(np.arange(data_all.shape[0])):
    q1, q2 = dd[i]
    # Idiom fix: boolean `or` instead of bitwise `|` on bools.
    # Pairs with a side missing from the graph get neutral 0-vectors.
    if (q1 not in q_list) or (q2 not in q_list):
        fea_q1.append(5*[0])
        fea_q2.append(5*[0])
        continue
    nei_q1 = set(q_list[q1])
    fea_q1.append(calc_neigh_wmd(nei_q1, q1))
    nei_q2 = set(q_list[q2])
    fea_q2.append(calc_neigh_wmd(nei_q2, q2))
fea_q1 = np.array(fea_q1)
fea_q2 = np.array(fea_q2)
all_fea = np.hstack([fea_q1, fea_q2])
# Split back into train/test rows (pd.concat preserved row order).
train_fea = all_fea[:train.shape[0]]
test_fea = all_fea[train.shape[0]:]
# Row-wise summary statistics over the 10 raw features, appended as 4 extras.
train_stats = np.vstack([train_fea.mean(axis=1), train_fea.max(axis=1),
                         train_fea.min(axis=1), train_fea.std(axis=1)]).T
test_stats = np.vstack([test_fea.mean(axis=1), test_fea.max(axis=1),
                        test_fea.min(axis=1), test_fea.std(axis=1)]).T
train_fea = np.hstack([train_fea, train_stats])
test_fea = np.hstack([test_fea, test_stats])
# sps.spearmanr(train_fea,train['is_duplicate'])[0]
pd.to_pickle(train_fea,'../X_v2/train_neigh_wmd.pkl')
pd.to_pickle(test_fea,'../X_v2/test_neigh_wmd.pkl')