Skip to content

Commit

Permalink
Merge pull request #1 from emerald-geomodelling/train-test
Browse files Browse the repository at this point in the history
oversample, exhaust, stratify
  • Loading branch information
Egil Möller authored Jun 28, 2022
2 parents 4767e2a + 0557040 commit 6c14ca1
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 0 deletions.
4 changes: 4 additions & 0 deletions skl_emeralds/print.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
def print_label_info(series, message):
    """Print per-class counts and fractions of a label series.

    Parameters
    ----------
    series : pd.Series
        Label series whose class distribution is reported.
    message : str
        Heading printed before the distribution table.
    """
    # BUG FIX: this module had no pandas import, so `pd` raised NameError.
    # Imported locally to keep the module's (empty) top level unchanged.
    import pandas as pd
    class_counts = series.value_counts()
    class_fractions = class_counts / series.size
    print(message,'\nClasses in the dataset, counts, fraction: \n', pd.concat((class_counts, class_fractions), axis=1))
52 changes: 52 additions & 0 deletions skl_emeralds/test_train_splitters/exhaust.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import sklearn
import pickle
import elnes
import math
from skl_emeralds.print import *

def exhaust_class_(pointcloud, new_label, test_size, classes):
    """Build boolean train/test masks over ``pointcloud.index``, balancing classes.

    Each class receives an even share (``n_even``) of the training budget.
    Abundant classes are randomly subsampled (their remainder goes to test);
    deficient classes are oversampled by whole-class repetition plus a random
    remainder, and contribute nothing to the test set.

    Parameters
    ----------
    pointcloud : pd.DataFrame
        Full data; masks are aligned to its index.
    new_label : object with a ``label`` Series (e.g. a DataFrame with a
        'label' column) indexed like the labelled subset of ``pointcloud``.
    test_size : float
        Fraction of labelled points reserved for testing.
    classes : np.ndarray
        The class values to balance over.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Boolean train and test masks, both of length ``len(pointcloud)``.

    BUG FIXES vs. original:
    - The masks were rebuilt per class over only that class's rows, so only
      the LAST class's masks were returned; now indices are accumulated over
      all classes and the masks cover the whole pointcloud.
    - ``loc_label_ids_test`` was never assigned in the oversampling branch,
      raising NameError whenever the final class was deficient.
    """
    class_counts = new_label.label.value_counts()

    # Training budget, split evenly across classes.
    n_train = (1 - test_size) * new_label.label.shape[0]
    n_even = n_train / classes.size

    # Per-class shortfall relative to the even share (negative for abundant classes).
    label_train_deficits = np.round(n_even - class_counts)

    train_indices = []
    test_indices = []
    for c in classes:
        # Pandas series, single class e.g. Brittle, non-brittle
        label_c = new_label.label.loc[new_label.label == c]

        if n_even <= class_counts[c]:
            # Abundant class: draw an even share for training; the rest is test.
            chosen = np.random.choice(label_c.index, int(round(n_even)), replace=False)
            train_indices.append(np.asarray(chosen))
            test_indices.append(label_c.index.difference(chosen).to_numpy())
        else:
            # Deficient class: repeat the whole class n_concats extra times,
            # then top up with a random selection for the remainder.
            n_concats = max(math.floor(label_train_deficits.loc[c] / class_counts.loc[c]), 0)
            n_random_sel = int(label_train_deficits.loc[c] % class_counts.loc[c])
            chosen = np.concatenate((
                np.array(label_c.index.to_list() * (n_concats + 1)),
                np.random.choice(label_c.index.to_numpy(), n_random_sel, replace=False)))
            train_indices.append(chosen)

    all_train = np.concatenate(train_indices) if train_indices else np.array([])
    all_test = np.concatenate(test_indices) if test_indices else np.array([])

    loc_label_ids_train = pointcloud.index.isin(all_train)
    loc_label_ids_test = pointcloud.index.isin(all_test)

    print('...Classes exhausted')
    return loc_label_ids_train, loc_label_ids_test

def test_train_split_balance_oversample_minority_exhaust(arr, filt, test_size=0.2, random_state=None, verbose=False):
    """Train/test split that exhausts every class to an even training share.

    Parameters
    ----------
    arr : pd.DataFrame
        Full point cloud; must expose a ``label`` column on the filtered rows.
    filt : boolean indexer
        Selects the labelled rows of ``arr`` used to derive the split.
    test_size : float
        Fraction of labelled points reserved for testing.
    random_state : int or None
        Seeds numpy's global RNG for reproducible sampling.
    verbose : bool
        When True, print the detected classes.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Boolean train and test masks aligned to ``arr``.
    """
    training_wlabel = arr.loc[filt]

    if random_state is not None:
        np.random.seed(int(random_state))
    classes = np.unique(training_wlabel.label)
    # BUG FIX: `verbose` was accepted but ignored (message printed
    # unconditionally, with a stray trailing comma in the text).
    if verbose:
        print('Classes in dataset are:', classes)

    train, test = exhaust_class_(arr, training_wlabel, test_size, classes)
    return train, test
104 changes: 104 additions & 0 deletions skl_emeralds/test_train_splitters/oversample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import sklearn
import pickle
import elnes
import math
from skl_emeralds.print import *

def test_train_split_balance_oversample_minority_byhole(arr, filt, test_size=0.2, random_state=None, verbose=False):
    """Split by borehole, then oversample minority classes in the train half.

    All points sharing a borehole ID (column 'title') land on the same side
    of the split; minority classes in the training half are then duplicated
    toward the majority count.

    Returns two boolean masks (see NOTE(review) below on their semantics).
    """
    # Labelled subset used to enumerate the classes present.
    training_wlabel = arr.loc[filt]

    if random_state is not None:
        np.random.seed(int(random_state))
    classes = np.unique(training_wlabel.label)
    print('Training classes: ', classes)

    if verbose:
        print('splitting by hole, then balancing')

    # All points from a borehole should either be in test OR in train
    data_train, data_test, label_train, label_test = train_test_split_byhole(arr,
                                                                             training_wlabel.label,
                                                                             test_size=test_size,
                                                                             random_state=random_state,
                                                                             test_size_byData=False,
                                                                             hole_id_name="title")

    # Sanity check: no borehole may appear on both sides of the split.
    for ID in data_train.title.unique():
        if ID in data_test.title.unique():
            print('Duplicate borehole in train / test split:', ID)

    label_train_counts = label_train.value_counts()
    # Per-class gap to the majority class count.
    label_train_deficits = label_train_counts.max() - label_train_counts
    for c in classes:
        if label_train_deficits.loc[c] == 0:
            continue

        # Oversample class c: repeat the whole class floor(deficit/count)
        # times, then add a random remainder selection.
        index_train_c = label_train.loc[label_train == c].index
        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
        n_random_sel = label_train_deficits.loc[c] % label_train_counts.loc[c]
        new_indices_c = np.concatenate((
            np.array(index_train_c.to_list() * (n_concats)),
            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False)))

        # NOTE(review): label_train is never extended to match, so the
        # verbose report below understates the oversampled distribution.
        data_train = pd.concat((data_train, data_train.loc[new_indices_c, :]))


    # NOTE(review): new_indices_c here is only the LAST deficit class's
    # indices, and is undefined (NameError) when no class had a deficit.
    # The masks therefore flag only the last class's duplicated rows —
    # confirm intended semantics before relying on these return values.
    loc_label_ids_train = data_train.index.isin(new_indices_c)

    loc_label_ids_test = ~data_test.index.isin(new_indices_c)
    if verbose:
        print('----------------------------')
        print_label_info(label_train, 'Training labels')
        print('----------------------------')
        print_label_info(label_test, 'Testing labels')

    return loc_label_ids_train, loc_label_ids_test


def train_test_split_byhole(arr, label, test_size=0.2, hole_id_name='title', random_state=None, test_size_byData=None):
    """Train/test split keeping all rows of a borehole on the same side.

    Parameters
    ----------
    arr : pd.DataFrame
        Data with a borehole-ID column named ``hole_id_name``.
    label : pd.Series
        Labels indexed like ``arr``.
    test_size : float
        Test fraction — of boreholes by default, of data rows when
        ``test_size_byData`` is truthy.
    hole_id_name : str
        Name of the borehole-ID column.
    random_state : int or None
        Seeds numpy's global RNG before shuffling borehole IDs.
    test_size_byData : bool or None
        Interpret ``test_size`` as a fraction of data points instead of holes.

    Returns
    -------
    (data_train, data_test, label_train, label_test)

    BUG FIXES vs. original:
    - The hole split was inverted: ``test_size`` fraction of holes was
      assigned to TRAIN instead of test.
    - byData branch: referenced undefined name ``data``; invalid
      ``pd.Series(..., arr=...)`` kwarg (must be ``data=``); removed
      ``np.int`` alias; mutated the caller's ``arr`` in place.
    """
    IDs = arr.loc[:, hole_id_name].unique()
    if random_state is not None:
        np.random.seed(random_state)
    np.random.shuffle(IDs)
    n_IDs = IDs.size
    index_IDs = np.arange(n_IDs)

    if not test_size_byData:  # default setting, where test_size is interpreted as fraction of holes

        # BUG FIX: train gets the (1 - test_size) share of holes.
        n_IDs_train = int(round((1 - test_size) * n_IDs))

        IDs_train = IDs[:n_IDs_train]

        in_train = arr.loc[:, hole_id_name].isin(IDs_train)
        data_train = arr[in_train]
        data_test = arr[~in_train]

        label_train = label[in_train]
        label_test = label[~in_train]

    else:  # test_size is interpreted as fraction of data points
        # Work on a copy so the caller's frame is not mutated.
        data_wLabel = arr.copy()
        data_wLabel['Label'] = label

        # Map each row's hole ID to its shuffled position, then sort so rows
        # of the same hole are contiguous in shuffled-hole order.
        lookup_ds = pd.Series(data=index_IDs, index=IDs)
        data_wLabel['ID_order'] = data_wLabel.loc[:, hole_id_name].map(lookup_ds).astype(int)
        data_wLabel = data_wLabel.sort_values(axis=0, by='ID_order')

        n_rows = data_wLabel.shape[0]
        n_train = round((1 - test_size) * n_rows)

        data_train = data_wLabel.iloc[:n_train, :]
        data_test = data_wLabel.iloc[n_train:, :]

        label_train = data_train.loc[:, 'Label']
        label_test = data_test.loc[:, 'Label']

    return data_train, data_test, label_train, label_test




49 changes: 49 additions & 0 deletions skl_emeralds/test_train_splitters/stratify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import numpy as np
import pandas as pd
import sklearn
import pickle
import elnes
import math
from skl_emeralds.print import *

def test_train_split_balance_oversample_minority_stratify(arr, filt, test_size, verbose=None, random_state=None):
    """Stratified train/test split, then oversample minority classes in train.

    Parameters
    ----------
    arr : pd.DataFrame
        Data to split.
    filt : pd.Series
        Label series aligned to ``arr``; used both to stratify and to balance.
    test_size : float
        Fraction of rows reserved for testing.
    verbose : bool or None
        When truthy, print label distributions.
    random_state : int or None
        Seeds numpy's RNG and the sklearn split. (BUG FIX: the original body
        referenced ``random_state`` without declaring it as a parameter,
        raising NameError on every call; added as a trailing keyword so
        existing positional callers are unaffected.)

    Returns
    -------
    (np.ndarray, np.ndarray)
        Boolean masks flagging the oversampled (duplicated) rows in the
        augmented train set, and the complement over the test set.
    """
    print('Stratifying by label, then balancing')

    if verbose:
        print_label_info(filt, 'Input labels')

    if random_state is not None:
        np.random.seed(int(random_state))
    classes = np.unique(filt)

    data_train, data_test, label_train, label_test = sklearn.model_selection.train_test_split(
        arr, filt, test_size=test_size,
        random_state=random_state, stratify=filt)

    label_train_counts = label_train.value_counts()
    # Per-class gap to the majority class count.
    label_train_deficits = label_train_counts.max() - label_train_counts

    # BUG FIX: indices are accumulated across ALL deficit classes (the
    # original used only the last iteration's new_indices_c, and raised
    # NameError when no class had a deficit).
    oversampled = []
    for c in classes:
        if label_train_deficits.loc[c] == 0:
            continue

        # Oversample class c: repeat the whole class floor(deficit/count)
        # times, then add a random remainder selection.
        index_train_c = label_train.loc[label_train == c].index
        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
        n_random_sel = int(label_train_deficits.loc[c] % label_train_counts.loc[c])
        oversampled.append(np.concatenate((
            np.array(index_train_c.to_list() * (n_concats)),
            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False))))

    if oversampled:
        new_indices = np.concatenate(oversampled)
        data_train = pd.concat((data_train, data_train.loc[new_indices, :]))
        # Keep labels in step with the duplicated rows so the verbose
        # report reflects the balanced distribution.
        label_train = pd.concat((label_train, label_train.loc[new_indices]))
    else:
        new_indices = np.array([])

    loc_label_ids_train = data_train.index.isin(new_indices)

    loc_label_ids_test = ~data_test.index.isin(new_indices)

    if verbose:
        print('---------------------')
        print_label_info(label_train, 'Training labels')
        print('---------------------')
        print_label_info(label_test, 'Testing labels')

    return loc_label_ids_train, loc_label_ids_test

0 comments on commit 6c14ca1

Please sign in to comment.