-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresampling.py
113 lines (92 loc) · 4.05 KB
/
resampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pandas as pd
from parser import parse_data
from sklearn.utils import resample
def upsample_minority(df):
    """
    Balance the classes by randomly duplicating rows of every class except the most frequent one.

    The most frequent value of the ``type`` column is the majority class. Its rows are kept unchanged; every other
    class is re-drawn with replacement (fixed seed 123) until it has as many rows as the majority class.

    :param df: DataFrame with a ``type`` column holding the class label of each row.
    :return: a new DataFrame: the majority rows first, then the resampled rows of each remaining class in the order
        the classes first appear in ``df``. The index is not reset by the concatenation.
    """
    labels = df.type
    biggest = labels.value_counts().idxmax()

    majority_rows = df[labels == biggest]
    target_size = len(majority_rows)

    # Collect every piece first and concatenate once at the end.
    parts = [majority_rows.copy()]
    for name in labels.unique():
        if name == biggest:
            continue
        grown = resample(
            df[labels == name],
            replace=True,  # duplicates are required to grow a smaller class
            n_samples=target_size,  # same size as the majority class
            random_state=123,  # fixed seed keeps the draw reproducible
        )
        parts.append(grown)

    return pd.concat(parts)
def downsample_majority(df):
    """
    Down-sampling involves randomly removing observations from the larger classes to prevent their signal from
    dominating the learning algorithm.

    The least frequent value of the ``type`` column is the minority class. Its rows are kept unchanged; every other
    class is reduced to the same number of rows by drawing WITHOUT replacement (fixed seed 123), so no observation
    appears twice in a reduced class.

    :param df: DataFrame with a ``type`` column holding the class label of each row.
    :return: a new DataFrame: the minority rows first, then the reduced rows of each remaining class in the order
        the classes first appear in ``df``. The index is not reset by the concatenation.
    """
    labels = df["type"]
    minority_class = labels.value_counts().idxmin()

    df_minority = df[labels == minority_class]
    n_target = df_minority.shape[0]

    # Down-sample every other class to the minority size.
    parts = [df_minority.copy()]
    for label in labels.unique():
        if label == minority_class:
            continue
        df_reduced = resample(
            df[labels == label],
            # Drawing with replacement here would duplicate some rows and discard more distinct
            # observations than necessary. Every remaining class has at least n_target rows, so a
            # draw without replacement is always possible.
            replace=False,
            n_samples=n_target,  # to match minority class
            random_state=123,  # reproducible results
        )
        parts.append(df_reduced)

    # Combine minority class with the down-sampled classes
    return pd.concat(parts)
def midsample(df):
    """
    Mid-sampling: up-sample the smaller classes and down-sample the larger classes to a middle number of rows.

    Classes are ranked by descending frequency of the ``type`` column. The class at position ``n_classes // 2 - 1``
    of that ranking (for example the 8th most frequent of 16) is the middle class. Its rows are kept unchanged and
    every other class is resampled (fixed seed 123) to the same number of rows.

    :param df: DataFrame with a ``type`` column holding the class label of each row.
    :return: a new DataFrame: the middle-class rows first, then the resampled rows of each remaining class in the
        order the classes first appear in ``df``. The index is not reset by the concatenation.
    :raises IndexError: if ``df`` has no rows, since there is then no class to pick as the middle one.
    """
    labels = df["type"]
    class_names = labels.unique()
    n_classes = len(class_names)
    type_count = labels.value_counts()  # sorted from most to least frequent

    # Floor division is required: "/" yields a float on Python 3, which is not a valid positional index.
    # With a single class this evaluates to -1, i.e. that one class.
    middle_class = type_count.index[n_classes // 2 - 1]

    df_middle = df[labels == middle_class]
    n_target = df_middle.shape[0]

    parts = [df_middle.copy()]
    for label in class_names:
        if label == middle_class:
            continue
        df_label = df[labels == label]
        df_label_midsampled = resample(
            df_label,
            # Only a class smaller than the target needs duplicated rows; larger (or equal) classes
            # are reduced without replacement so no observation is repeated.
            replace=len(df_label) < n_target,
            n_samples=n_target,  # to match middle class
            random_state=123,  # reproducible results
        )
        parts.append(df_label_midsampled)

    # Combine middle class with the mid-sampled classes
    return pd.concat(parts)
if __name__ == "__main__":
    # Manual run: read the CSV (first row is the header), pass it through parse_data, then build
    # both rebalanced variants. Nothing is printed or written; the results stay in these variables.
    data = pd.read_csv("./data/mbti_1.csv", header=0)
    data_parsed = parse_data(data)

    # The two resampling calls are independent of each other: each one builds a new frame from
    # data_parsed.
    data_downsampled = downsample_majority(data_parsed)
    data_upsampled = upsample_minority(data_parsed)