-
Notifications
You must be signed in to change notification settings - Fork 3
/
genereate_len_extra_stats.py
49 lines (36 loc) · 1.93 KB
/
genereate_len_extra_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
import pandas as pd
import datetime
import operator
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats as sps
# Fix the numpy RNG so any numpy-based randomness in this script is repeatable.
seed = 1024
np.random.seed(seed)

# Read both splits from the data directory.
path = '../data/'
train, test = (pd.read_csv(path + name) for name in ('train.csv', 'test.csv'))

# Stack train on top of test and keep only the two question-text columns.
data_all = pd.concat([train, test])
data_all = data_all[['question1', 'question2']]
def generate_len_stats(data_all):
    """Compute capitalisation and length features for question pairs.

    Each value in ``question1`` / ``question2`` is coerced with ``str()``
    before measuring, so missing values are measured as their string form
    rather than raising.

    Args:
        data_all: DataFrame with ``question1`` and ``question2`` columns.
            It is not modified.

    Returns:
        A new DataFrame on the same index as ``data_all`` with the columns
        ``caps_count_q1``, ``caps_count_q2``, ``diff_caps``,
        ``diff_len_char``, ``avg_world_len1``, ``avg_world_len2`` and
        ``diff_avg_word`` (in that order). The average word length is
        characters-excluding-spaces divided by whitespace-separated word
        count; a text with zero words therefore yields NaN or inf.
    """
    def caps_count(text):
        # Number of upper-case characters.
        return sum(1 for ch in str(text) if ch.isupper())

    def char_len(text):
        # Character count with plain spaces removed.
        return len(str(text).replace(' ', ''))

    def word_len(text):
        # Number of whitespace-separated tokens.
        return len(str(text).split())

    q1 = data_all['question1']
    q2 = data_all['question2']

    # Intermediate lengths are kept as local Series; they are needed for the
    # derived features but are not part of the returned frame.
    len_char_q1 = q1.apply(char_len)
    len_char_q2 = q2.apply(char_len)
    len_word_q1 = q1.apply(word_len)
    len_word_q2 = q2.apply(word_len)

    # Build the result in a fresh frame so the caller's frame is untouched
    # (the previous in-place drop also passed a non-bool ``inplace`` value,
    # which current pandas rejects).
    fea = pd.DataFrame(index=data_all.index)
    fea['caps_count_q1'] = q1.apply(caps_count)
    fea['caps_count_q2'] = q2.apply(caps_count)
    fea['diff_caps'] = fea['caps_count_q1'] - fea['caps_count_q2']
    fea['diff_len_char'] = len_char_q1 - len_char_q2
    fea['avg_world_len1'] = len_char_q1 / len_word_q1
    fea['avg_world_len2'] = len_char_q2 / len_word_q2
    fea['diff_avg_word'] = fea['avg_world_len1'] - fea['avg_world_len2']

    return fea[['caps_count_q1', 'caps_count_q2', 'diff_caps', 'diff_len_char',
                'avg_world_len1', 'avg_world_len2', 'diff_avg_word']]
# Compute the features once on the stacked frame; a copy is handed over so
# the stacked frame itself is left untouched.
n_train = train.shape[0]
fea = generate_len_stats(data_all.copy())

# Rows were stacked train-first, so split them back apart by position.
train_len = fea.iloc[:n_train]
test_len = fea.iloc[n_train:]
# sps.spearmanr(train_len,train['is_duplicate'])[0]

# Persist each split's features as a pickle.
train_len.to_pickle('../X_v2/train_len.pkl')
test_len.to_pickle('../X_v2/test_len.pkl')