-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from emerald-geomodelling/train-test
oversample, exhaust, stratify
- Loading branch information
Showing
4 changed files
with
209 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
def print_label_info(series, message): | ||
class_counts = series.value_counts() | ||
class_fractions = class_counts / series.size | ||
print(message,'\nClasses in the dataset, counts, fraction: \n', pd.concat((class_counts, class_fractions), axis=1)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import sklearn | ||
import pickle | ||
import elnes | ||
import math | ||
from skl_emeralds.print import * | ||
|
||
def exhaust_class_(pointcloud, new_label, test_size, classes): | ||
|
||
class_counts = new_label.label.value_counts() | ||
|
||
n_train = (1 - test_size) * new_label.label.shape[0] | ||
n_even = n_train / classes.size | ||
|
||
label_train_deficits = np.round(n_even - class_counts) | ||
|
||
for c in classes: | ||
# Pandas series, single class e.g. Brittle, non-brittle | ||
label_c = new_label.label.loc[new_label.label == c] | ||
# Dataframe of above indexes | ||
data_c = pointcloud.loc[label_c.index] | ||
|
||
if n_even <= class_counts[c]: | ||
# Creating indices, array of indices | ||
new_indices_c = np.random.choice(label_c.index, round(n_even), replace=False) | ||
# 2 masks | ||
loc_label_ids_train = data_c.index.isin(new_indices_c) | ||
loc_label_ids_test = ~data_c.index.isin(new_indices_c) | ||
|
||
if n_even > class_counts[c]: | ||
n_concats = max(math.floor(label_train_deficits.loc[c] / class_counts.loc[c]), 0) | ||
n_random_sel = int(label_train_deficits.loc[c] % class_counts.loc[c]) | ||
new_indices_c = np.concatenate(( | ||
np.array(label_c.index.to_list() * (n_concats + 1)), | ||
np.random.choice(label_c.index.to_numpy(), n_random_sel, replace=False))) | ||
# 1 mask | ||
loc_label_ids_train = data_c.index.isin(new_indices_c) | ||
|
||
print('...Classes exhausted') | ||
return loc_label_ids_train, loc_label_ids_test | ||
|
||
def test_train_split_balance_oversample_minority_exhaust(arr, filt, test_size=0.2, random_state=None, verbose=False): | ||
training_wlabel = arr.loc[filt] | ||
|
||
if random_state is not None: | ||
np.random.seed(int(random_state)) | ||
classes = np.unique(training_wlabel.label) | ||
print('Classes in dataset are: ,', classes) | ||
|
||
train, test = exhaust_class_(arr, training_wlabel, test_size, classes) | ||
return train, test |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import sklearn | ||
import pickle | ||
import elnes | ||
import math | ||
from skl_emeralds.print import * | ||
|
||
def test_train_split_balance_oversample_minority_byhole(arr, filt, test_size=0.2, random_state=None, verbose=False): | ||
training_wlabel = arr.loc[filt] | ||
|
||
if random_state is not None: | ||
np.random.seed(int(random_state)) | ||
classes = np.unique(training_wlabel.label) | ||
print('Training classes: ', classes) | ||
|
||
if verbose: | ||
print('splitting by hole, then balancing') | ||
|
||
# All points from a borehole should either be in test OR in train | ||
data_train, data_test, label_train, label_test = train_test_split_byhole(arr, | ||
training_wlabel.label, | ||
test_size=test_size, | ||
random_state=random_state, | ||
test_size_byData=False, | ||
hole_id_name="title") | ||
|
||
for ID in data_train.title.unique(): | ||
if ID in data_test.title.unique(): | ||
print('Duplicate borehole in train / test split:', ID) | ||
|
||
label_train_counts = label_train.value_counts() | ||
label_train_deficits = label_train_counts.max() - label_train_counts | ||
for c in classes: | ||
if label_train_deficits.loc[c] == 0: | ||
continue | ||
|
||
index_train_c = label_train.loc[label_train == c].index | ||
n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c]) | ||
n_random_sel = label_train_deficits.loc[c] % label_train_counts.loc[c] | ||
new_indices_c = np.concatenate(( | ||
np.array(index_train_c.to_list() * (n_concats)), | ||
np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False))) | ||
|
||
data_train = pd.concat((data_train, data_train.loc[new_indices_c, :])) | ||
|
||
|
||
loc_label_ids_train = data_train.index.isin(new_indices_c) | ||
|
||
loc_label_ids_test = ~data_test.index.isin(new_indices_c) | ||
if verbose: | ||
print('----------------------------') | ||
print_label_info(label_train, 'Training labels') | ||
print('----------------------------') | ||
print_label_info(label_test, 'Testing labels') | ||
|
||
return loc_label_ids_train, loc_label_ids_test | ||
|
||
|
||
def train_test_split_byhole(arr, label, test_size=0.2, hole_id_name='title', random_state=None, test_size_byData=None): | ||
IDs = arr.loc[:, hole_id_name].unique() | ||
if random_state is not None: | ||
np.random.seed(random_state) | ||
np.random.shuffle(IDs) | ||
n_IDs = IDs.size | ||
index_IDs = np.arange(n_IDs) | ||
|
||
if not test_size_byData: # default setting, where test_size is interpreted as fraction of holes | ||
|
||
n_IDs_train = int(round((test_size) * n_IDs)) | ||
|
||
IDs_train = IDs[:n_IDs_train] | ||
IDs_test = IDs[n_IDs_train:] | ||
|
||
data_train = arr[arr.loc[:, hole_id_name].isin(IDs_train)] | ||
data_test = arr[arr.loc[:, hole_id_name].isin(IDs_test)] | ||
|
||
label_train = label[arr.loc[:, hole_id_name].isin(IDs_train)] | ||
label_test = label[arr.loc[:, hole_id_name].isin(IDs_test)] | ||
|
||
elif test_size_byData: # test_size is interpreted as fraction of data points | ||
data_wLabel = arr | ||
data_wLabel.at[:, 'Label'] = label | ||
|
||
# make new dataframe, pointID shuffled as index, order index as one column, use this as lookup table | ||
lookup_ds = pd.Series(index=IDs, arr=index_IDs) | ||
for row_index, value in data.loc[:, hole_id_name].items(): | ||
data_wLabel.at[row_index, 'ID_order'] = lookup_ds.loc[value].astype(np.int) | ||
data_wLabel = data_wLabel.sort_values(axis=0, by='ID_order') | ||
|
||
n_rows = data_wLabel.shape[0] | ||
n_train = round((1 - test_size) * n_rows) | ||
|
||
data_train = data_wLabel.iloc[:n_train, :] | ||
data_test = data_wLabel.iloc[n_train:, :] | ||
|
||
label_train = data_wLabel.iloc[:n_train, :].loc[:, 'Label'] | ||
label_test = data_wLabel.iloc[n_train:, :].loc[:, 'Label'] | ||
|
||
return data_train, data_test, label_train, label_test | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import sklearn | ||
import pickle | ||
import elnes | ||
import math | ||
from skl_emeralds.print import * | ||
|
||
def test_train_split_balance_oversample_minority_stratify(arr, filt, test_size, verbose=None): | ||
print('Stratifying by label, then balancing') | ||
|
||
if verbose: | ||
print_label_info(filt, 'Input labels') | ||
|
||
if random_state is not None: | ||
np.random.seed(int(random_state)) | ||
classes = np.unique(filt) | ||
|
||
data_train, data_test, label_train, label_test = sklearn.model_selection.train_test_split( | ||
arr, filt, test_size=test_size, | ||
random_state=random_state, stratify=filt) | ||
|
||
label_train_counts = label_train.value_counts() | ||
label_train_deficits = label_train_counts.max() - label_train_counts | ||
|
||
for c in classes: | ||
if label_train_deficits.loc[c] == 0: | ||
continue | ||
|
||
index_train_c = label_train.loc[label_train == c].index | ||
n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c]) | ||
n_random_sel = label_train_deficits.loc[c] % label_train_counts.loc[c] | ||
new_indices_c = np.concatenate(( | ||
np.array(index_train_c.to_list() * (n_concats)), | ||
np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False))) | ||
|
||
data_train = pd.concat((data_train, data_train.loc[new_indices_c, :])) | ||
|
||
loc_label_ids_train = data_train.index.isin(new_indices_c) | ||
|
||
loc_label_ids_test = ~data_test.index.isin(new_indices_c) | ||
|
||
if verbose: | ||
print('---------------------') | ||
print_label_info(label_train, 'Training labels') | ||
print('---------------------') | ||
print_label_info(label_test, 'Testing labels') | ||
|
||
return loc_label_ids_train, loc_label_ids_test |