-
Notifications
You must be signed in to change notification settings - Fork 0
/
explore_data.py
155 lines (134 loc) · 5.19 KB
/
explore_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Module metadata.
__author__ = 'lucabasa'
__version__ = '1.1.1'
__status__ = 'development'
# NOTE: this string comes after the assignments above, so it is a bare
# expression statement and not the module docstring.
'''
Source: https://github.com/lucabasa
'''
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# import seaborn as sns
def list_missing(data, verbose=True):
    '''
    Returns the list of the columns of a dataframe that contain at least one missing value.
    If verbose is True, the percentage of missing entries of each of those columns is printed as well.
    '''
    mis_cols = []
    for name in data.columns:
        if data[name].isna().any():
            mis_cols.append(name)
    if verbose:
        n_rows = len(data)
        for name in mis_cols:
            # share of missing entries, in percent, rounded to 2 decimals
            share = round(data[name].isna().sum()*100/n_rows, 2)
            print(f'Column {name}: {share}% missing')
    return mis_cols
# def plot_correlations(data, target=None, limit=50, figsize=(12,10), **kwargs):
# '''
# This function plots the correlation matrix of a dataframe
# If a target feature is provided, it will display only a certain amount of features, the ones correlated the most
# with the target. The number of features displayed is controlled by the parameter limit
# '''
# corr = data.corr()
# if target:
# corr['abs'] = abs(corr[target])
# cor_target = corr.sort_values(by='abs', ascending=False)[target]
# cor_target = cor_target[:limit]
# del corr['abs']
# corr = corr.loc[cor_target.index, cor_target.index]
# plt.figure(figsize=figsize)
# ax = sns.heatmap(corr, cmap='RdBu_r', **kwargs)
# ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
# return cor_target
def plot_distribution(data, column, bins=50, correlation=None):
    '''
    Plots a histogram of a given column
    If a Pandas Series is provided with the correlation values, it will be displayed in the title.

    data: dataframe holding the column to plot
    column: label of the column to plot (it does not need to be a string)
    bins: number of bins, passed to the hist method of the column
    correlation: optional object indexed by column label; the value found at `column`,
                 rounded to 2 decimals, is appended to the title
    '''
    plt.figure(figsize=(12,8))
    data[column].hist(bins=bins)
    title = f'Distribution of {column}'
    if correlation is not None:
        # the title is formatted rather than concatenated, so that a non-string
        # column label (e.g. an integer) does not raise a TypeError
        title += f' - {round(correlation[column],2)}'
    plt.title(title, fontsize=18)
    plt.grid(False)
# def plot_bivariate(data, x, y, hue=None, **kwargs):
# '''
# Scatterplot of the feature x vs the feature y with the possibility of adding a hue
# '''
# plt.figure(figsize=(12,8))
# sns.scatterplot(data=data, x=x, y=y, hue=hue, **kwargs)
# if hue:
# plt.title(f'{x} vs {y}, by {hue}', fontsize=18)
# else:
# plt.title(f'{x} vs {y}', fontsize=18)
# def corr_target(data, target, cols, x_estimator=None):
# '''
# Scatterplot + linear regression of a list of columns against the target.
# A correlation matrix is also printed.
# It is possible to pass an estimator.
# '''
# print(data[cols+[target]].corr())
# num = len(cols)
# rows = int(num/2) + (num % 2 > 0)
# cols = list(cols)
# y = data[target]
# fig, ax = plt.subplots(rows, 2, figsize=(12, 5 * (rows)))
# i = 0
# j = 0
# for feat in cols:
# x = data[feat]
# if (rows > 1):
# sns.regplot(x=x, y=y, ax=ax[i][j], x_estimator=x_estimator)
# j = (j+1)%2
# i = i + 1 - j
# else:
# sns.regplot(x=x, y=y, ax=ax[i], x_estimator=x_estimator)
# i = i+1
def ks_test(data, col, target, critical=0.05):
    '''
    It takes a categorical feature and makes dummies.
    For each dummy, it performs a Kolmogorov-Smirnov test between the distribution of the target
    of that subset vs the rest of the population.

    data: dataframe containing both col and target
    col: name of the categorical feature
    target: name of the target column
    critical: threshold on the p-value of the test

    Returns True as soon as one dummy yields a p-value below critical, False otherwise.
    A dummy that selects all the rows or none of them is skipped: one of the two samples
    would be empty and there is nothing to compare it with.
    '''
    df = pd.get_dummies(data[[col, target]], columns=[col])
    for dummy in df.columns:
        if dummy == target:
            continue
        in_group = df.loc[df[dummy] == 1, target]
        rest = df.loc[df[dummy] == 0, target]
        if in_group.empty or rest.empty:
            # happens, for instance, when col has only one value left in data;
            # ks_2samp cannot give a meaningful answer on an empty sample
            continue
        _, p = stats.ks_2samp(in_group, rest)
        if p < critical:
            return True
    return False
def find_cats(data, target, thrs=0.1, agg_func='mean', critical=0.05, ks=True, frac=1):
    '''
    Looks for interesting categorical features, either by performing a Kolmogorov-Smirnov test or
    simply by comparing a descriptive statistic of the full population with the one obtained on the
    various subsets.

    Only the values of a feature whose relative frequency is above thrs are considered.
    With ks=True a feature is kept if ks_test is positive at the given critical level.
    Otherwise it is kept if the standard deviation of the target aggregated (agg_func) by category
    is at least frac times the standard deviation of the target.
    '''
    selected = []
    target_std = data[target].std()
    # select_dtypes(include=['object']) picks out all the categorical features
    object_cols = data.select_dtypes(include=['object']).columns
    for feat in object_cols:
        # relative frequency of each value, missing values included
        freqs = data[feat].value_counts(dropna=False, normalize=True)
        frequent = freqs[freqs > thrs].index
        subset = data.loc[data[feat].isin(frequent), :]
        if ks:
            keep = ks_test(subset, feat, target, critical=critical)
        else:
            spread = subset.groupby(feat)[target].agg(agg_func).std()
            keep = spread >= target_std*frac
        if keep:
            selected.append(feat)
    return selected
# def segm_target(data, cat, target):
# '''
# Studies the target segmented by a categorical feature.
# It plots both a boxplot and a distplot for visual support
# '''
# df = data.groupby(cat)[target].agg(['count', 'mean', 'max',
# 'min', 'median', 'std'])
# fig, ax = plt.subplots(1,2, figsize=(12, 5))
# sns.boxplot(cat, target, data=data, ax=ax[0])
# for val in data[cat].unique():
# tmp = data[data[cat] == val]
# sns.distplot(tmp[target], hist=False, kde=True,
# kde_kws = {'linewidth': 3},
# label = val, ax=ax[1])
# return df