oversample, exhaust, stratify #1

Merged: 1 commit, Jun 28, 2022
4 changes: 4 additions & 0 deletions skl_emeralds/print.py
@@ -0,0 +1,4 @@
import pandas as pd

def print_label_info(series, message):
    # Summarize class balance: absolute counts and fractions, side by side
    class_counts = series.value_counts()
    class_fractions = class_counts / series.size
    print(message, '\nClasses in the dataset, counts, fraction:\n',
          pd.concat((class_counts, class_fractions), axis=1))
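
For reviewers, a minimal usage sketch of the new helper (the toy labels below are invented for illustration):

```python
import pandas as pd
from skl_emeralds.print import print_label_info

# Hypothetical label series, just to show the printed summary
labels = pd.Series(['brittle', 'brittle', 'brittle', 'non-brittle'])
print_label_info(labels, 'Toy labels')
# Prints a two-column table: counts (3, 1) and fractions (0.75, 0.25)
```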
52 changes: 52 additions & 0 deletions skl_emeralds/test_train_splitters/exhaust.py
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import math
from skl_emeralds.print import *

def exhaust_class_(pointcloud, new_label, test_size, classes):

    class_counts = new_label.label.value_counts()

    # Aim for an even number of training points per class
    n_train = (1 - test_size) * new_label.label.shape[0]
    n_even = n_train / classes.size

    label_train_deficits = np.round(n_even - class_counts)

    # Accumulate per-class index selections instead of overwriting the masks
    # on every loop iteration
    train_indices = []
    test_indices = []
    for c in classes:
        # Pandas series holding the labels of a single class, e.g. brittle / non-brittle
        label_c = new_label.label.loc[new_label.label == c]

        if n_even <= class_counts.loc[c]:
            # Majority class: draw an even share for training, the remainder goes to testing
            new_indices_c = np.random.choice(label_c.index, round(n_even), replace=False)
            train_indices.append(new_indices_c)
            test_indices.append(label_c.index.difference(new_indices_c).to_numpy())
        else:
            # Minority class: exhaust it by repeating every index, topped up with a random draw
            n_concats = max(math.floor(label_train_deficits.loc[c] / class_counts.loc[c]), 0)
            n_random_sel = int(label_train_deficits.loc[c] % class_counts.loc[c])
            train_indices.append(np.concatenate((
                np.tile(label_c.index.to_numpy(), n_concats + 1),
                np.random.choice(label_c.index.to_numpy(), n_random_sel, replace=False))))

    # Boolean masks over the full point cloud
    loc_label_ids_train = pointcloud.index.isin(np.concatenate(train_indices))
    loc_label_ids_test = pointcloud.index.isin(np.concatenate(test_indices))

    print('...Classes exhausted')
    return loc_label_ids_train, loc_label_ids_test

def test_train_split_balance_oversample_minority_exhaust(arr, filt, test_size=0.2, random_state=None, verbose=False):
training_wlabel = arr.loc[filt]

if random_state is not None:
np.random.seed(int(random_state))
classes = np.unique(training_wlabel.label)
    if verbose:
        print('Classes in dataset are:', classes)

train, test = exhaust_class_(arr, training_wlabel, test_size, classes)
return train, test
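
A hedged usage sketch of the exhaust splitter (the `label` column, the all-True `filt` mask, and the toy data are assumptions for illustration; the function expects a DataFrame with a `label` column and a boolean row filter):

```python
import numpy as np
import pandas as pd
from skl_emeralds.test_train_splitters.exhaust import \
    test_train_split_balance_oversample_minority_exhaust

# Imbalanced toy point cloud: 8 points of class 'a', 2 of class 'b'
arr = pd.DataFrame({'label': ['a'] * 8 + ['b'] * 2, 'x': np.arange(10)})
filt = pd.Series(True, index=arr.index)

train_mask, test_mask = test_train_split_balance_oversample_minority_exhaust(
    arr, filt, test_size=0.2, random_state=0)
# 'b' is exhausted into training; leftover 'a' points form the test set
print(arr.loc[train_mask, 'label'].value_counts())
print(arr.loc[test_mask, 'label'].value_counts())
```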
104 changes: 104 additions & 0 deletions skl_emeralds/test_train_splitters/oversample.py
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import math
from skl_emeralds.print import *

def test_train_split_balance_oversample_minority_byhole(arr, filt, test_size=0.2, random_state=None, verbose=False):
    training_wlabel = arr.loc[filt]

    if random_state is not None:
        np.random.seed(int(random_state))
    classes = np.unique(training_wlabel.label)
    print('Training classes: ', classes)

    if verbose:
        print('Splitting by hole, then balancing')

    # All points from a borehole should either be in test OR in train;
    # split the filtered rows so that data and labels stay aligned
    data_train, data_test, label_train, label_test = train_test_split_byhole(training_wlabel,
                                                                             training_wlabel.label,
                                                                             test_size=test_size,
                                                                             random_state=random_state,
                                                                             test_size_byData=False,
                                                                             hole_id_name="title")

    for ID in data_train.title.unique():
        if ID in data_test.title.unique():
            print('Duplicate borehole in train / test split:', ID)

    label_train_counts = label_train.value_counts()
    label_train_deficits = label_train_counts.max() - label_train_counts

    # Accumulate the oversampled indices of every deficit class instead of
    # overwriting them on each loop iteration
    oversample_indices = []
    for c in classes:
        if label_train_deficits.loc[c] == 0:
            continue

        index_train_c = label_train.loc[label_train == c].index
        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
        n_random_sel = int(label_train_deficits.loc[c] % label_train_counts.loc[c])
        oversample_indices.append(np.concatenate((
            np.tile(index_train_c.to_numpy(), n_concats),
            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False))))

    if oversample_indices:
        new_indices = np.concatenate(oversample_indices)
        data_train = pd.concat((data_train, data_train.loc[new_indices, :]))
        label_train = pd.concat((label_train, label_train.loc[new_indices]))

    # Boolean masks over the input frame; note that a mask cannot encode the duplicated rows
    loc_label_ids_train = arr.index.isin(data_train.index)
    loc_label_ids_test = arr.index.isin(data_test.index)
    if verbose:
        print('----------------------------')
        print_label_info(label_train, 'Training labels')
        print('----------------------------')
        print_label_info(label_test, 'Testing labels')

    return loc_label_ids_train, loc_label_ids_test


def train_test_split_byhole(arr, label, test_size=0.2, hole_id_name='title', random_state=None, test_size_byData=None):
    IDs = arr.loc[:, hole_id_name].unique()
    if random_state is not None:
        np.random.seed(random_state)
    np.random.shuffle(IDs)
    n_IDs = IDs.size
    index_IDs = np.arange(n_IDs)

    if not test_size_byData:  # default setting, where test_size is interpreted as fraction of holes

        n_IDs_train = int(round((1 - test_size) * n_IDs))

        IDs_train = IDs[:n_IDs_train]
        IDs_test = IDs[n_IDs_train:]

        data_train = arr[arr.loc[:, hole_id_name].isin(IDs_train)]
        data_test = arr[arr.loc[:, hole_id_name].isin(IDs_test)]

        label_train = label[arr.loc[:, hole_id_name].isin(IDs_train)]
        label_test = label[arr.loc[:, hole_id_name].isin(IDs_test)]

    else:  # test_size is interpreted as fraction of data points
        data_wLabel = arr.copy()
        data_wLabel['Label'] = label

        # Lookup table mapping each hole ID to its position in the shuffled order
        lookup_ds = pd.Series(index_IDs, index=IDs)
        data_wLabel['ID_order'] = data_wLabel.loc[:, hole_id_name].map(lookup_ds).astype(int)
        data_wLabel = data_wLabel.sort_values(axis=0, by='ID_order')

        n_rows = data_wLabel.shape[0]
        n_train = round((1 - test_size) * n_rows)

        data_train = data_wLabel.iloc[:n_train, :]
        data_test = data_wLabel.iloc[n_train:, :]

        label_train = data_wLabel.iloc[:n_train, :].loc[:, 'Label']
        label_test = data_wLabel.iloc[n_train:, :].loc[:, 'Label']

    return data_train, data_test, label_train, label_test
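
A hedged usage sketch of the by-hole splitter (the `title` hole IDs and toy values are invented for illustration):

```python
import numpy as np
import pandas as pd
from skl_emeralds.test_train_splitters.oversample import train_test_split_byhole

# Three boreholes identified by the 'title' column
arr = pd.DataFrame({
    'title': ['h1'] * 4 + ['h2'] * 4 + ['h3'] * 2,
    'depth': np.arange(10),
})
label = pd.Series(['a'] * 5 + ['b'] * 5, index=arr.index)

data_train, data_test, label_train, label_test = train_test_split_byhole(
    arr, label, test_size=0.34, random_state=0)
# Every hole lands wholly in train or wholly in test
assert not set(data_train.title) & set(data_test.title)
```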




49 changes: 49 additions & 0 deletions skl_emeralds/test_train_splitters/stratify.py
@@ -0,0 +1,49 @@
import numpy as np
import pandas as pd
import sklearn.model_selection
import math
from skl_emeralds.print import *

def test_train_split_balance_oversample_minority_stratify(arr, filt, test_size, random_state=None, verbose=False):
    print('Stratifying by label, then balancing')

    if verbose:
        print_label_info(filt, 'Input labels')

    if random_state is not None:
        np.random.seed(int(random_state))
    classes = np.unique(filt)

    data_train, data_test, label_train, label_test = sklearn.model_selection.train_test_split(
        arr, filt, test_size=test_size,
        random_state=random_state, stratify=filt)

    label_train_counts = label_train.value_counts()
    label_train_deficits = label_train_counts.max() - label_train_counts

    # Accumulate the oversampled indices of every deficit class instead of
    # overwriting them on each loop iteration
    oversample_indices = []
    for c in classes:
        if label_train_deficits.loc[c] == 0:
            continue

        index_train_c = label_train.loc[label_train == c].index
        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
        n_random_sel = int(label_train_deficits.loc[c] % label_train_counts.loc[c])
        oversample_indices.append(np.concatenate((
            np.tile(index_train_c.to_numpy(), n_concats),
            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False))))

    if oversample_indices:
        new_indices = np.concatenate(oversample_indices)
        data_train = pd.concat((data_train, data_train.loc[new_indices, :]))
        label_train = pd.concat((label_train, label_train.loc[new_indices]))

    # Boolean masks over the input frame; note that a mask cannot encode the duplicated rows
    loc_label_ids_train = arr.index.isin(data_train.index)
    loc_label_ids_test = arr.index.isin(data_test.index)

    if verbose:
        print('---------------------')
        print_label_info(label_train, 'Training labels')
        print('---------------------')
        print_label_info(label_test, 'Testing labels')

    return loc_label_ids_train, loc_label_ids_test
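
A hedged usage sketch of the stratified splitter (note that `filt` is the label series here, matching how the function stratifies on it; the toy data is invented):

```python
import numpy as np
import pandas as pd
from skl_emeralds.test_train_splitters.stratify import \
    test_train_split_balance_oversample_minority_stratify

arr = pd.DataFrame({'x': np.arange(12)})
labels = pd.Series(['a'] * 9 + ['b'] * 3, index=arr.index)

train_mask, test_mask = test_train_split_balance_oversample_minority_stratify(
    arr, labels, test_size=0.25, random_state=0, verbose=True)
# Stratification keeps the 3:1 ratio in the raw split; the minority class is
# then oversampled into data_train (the boolean masks cannot show the duplicates)
```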