diff --git a/skl_emeralds/print.py b/skl_emeralds/print.py
new file mode 100644
index 0000000..047b550
--- /dev/null
+++ b/skl_emeralds/print.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+
+def print_label_info(series, message):
+    class_counts = series.value_counts()
+    class_fractions = class_counts / series.size
+    print(message, '\nClasses in the dataset, counts, fraction:\n',
+          pd.concat((class_counts, class_fractions), axis=1))
diff --git a/skl_emeralds/test_train_splitters/exhaust.py b/skl_emeralds/test_train_splitters/exhaust.py
new file mode 100644
index 0000000..6e91898
--- /dev/null
+++ b/skl_emeralds/test_train_splitters/exhaust.py
@@ -0,0 +1,55 @@
+import math
+
+import numpy as np
+
+
+def exhaust_class_(pointcloud, new_label, test_size, classes):
+    class_counts = new_label.label.value_counts()
+
+    # Target an even number of training points per class.
+    n_train = (1 - test_size) * new_label.label.shape[0]
+    n_even = n_train / classes.size
+
+    # How many training points each class is short of the even target.
+    label_train_deficits = np.round(n_even - class_counts)
+
+    train_indices = []
+    for c in classes:
+        # Pandas Series holding a single class, e.g. brittle / non-brittle.
+        label_c = new_label.label.loc[new_label.label == c]
+
+        if n_even <= class_counts[c]:
+            # Majority class: subsample down to the even target.
+            new_indices_c = np.random.choice(label_c.index.to_numpy(), int(round(n_even)), replace=False)
+        else:
+            # Minority class: repeat the whole class, then top up with a random selection.
+            n_concats = max(math.floor(label_train_deficits.loc[c] / class_counts.loc[c]), 0)
+            n_random_sel = int(label_train_deficits.loc[c] % class_counts.loc[c])
+            new_indices_c = np.concatenate((
+                np.tile(label_c.index.to_numpy(), n_concats + 1),
+                np.random.choice(label_c.index.to_numpy(), n_random_sel, replace=False)))
+        train_indices.append(new_indices_c)
+
+    train_indices = np.concatenate(train_indices)
+
+    # Boolean masks over the full pointcloud. A mask cannot encode the duplicates
+    # introduced by oversampling; it only marks which rows belong to each split.
+    labeled = pointcloud.index.isin(new_label.index)
+    loc_label_ids_train = pointcloud.index.isin(train_indices)
+    loc_label_ids_test = labeled & ~loc_label_ids_train
+
+    print('...Classes exhausted')
+    return loc_label_ids_train, loc_label_ids_test
+
+
+def test_train_split_balance_oversample_minority_exhaust(arr, filt, test_size=0.2, random_state=None, verbose=False):
+    training_wlabel = arr.loc[filt]
+
+    if random_state is not None:
+        np.random.seed(int(random_state))
+    classes = np.unique(training_wlabel.label)
+    if verbose:
+        print('Classes in dataset are:', classes)
+
+    train, test = exhaust_class_(arr, training_wlabel, test_size, classes)
+    return train, test
diff --git a/skl_emeralds/test_train_splitters/oversample.py b/skl_emeralds/test_train_splitters/oversample.py
new file mode 100644
index 0000000..1a39c5b
--- /dev/null
+++ b/skl_emeralds/test_train_splitters/oversample.py
@@ -0,0 +1,107 @@
+import math
+
+import numpy as np
+import pandas as pd
+
+from skl_emeralds.print import print_label_info
+
+
+def test_train_split_balance_oversample_minority_byhole(arr, filt, test_size=0.2, random_state=None, verbose=False):
+    training_wlabel = arr.loc[filt]
+
+    if random_state is not None:
+        np.random.seed(int(random_state))
+    classes = np.unique(training_wlabel.label)
+
+    if verbose:
+        print('Training classes:', classes)
+        print('Splitting by hole, then balancing')
+
+    # All points from a borehole should be either in test OR in train.
+    data_train, data_test, label_train, label_test = train_test_split_byhole(
+        arr,
+        training_wlabel.label,
+        test_size=test_size,
+        random_state=random_state,
+        test_size_byData=False,
+        hole_id_name='title')
+
+    # Sanity check: no borehole may appear on both sides of the split.
+    for ID in data_train.title.unique():
+        if ID in data_test.title.unique():
+            print('Duplicate borehole in train / test split:', ID)
+
+    # Oversample each minority class present in the training holes up to the
+    # majority count.
+    label_train_counts = label_train.value_counts()
+    label_train_deficits = label_train_counts.max() - label_train_counts
+    for c in label_train_counts.index:
+        if label_train_deficits.loc[c] == 0:
+            continue
+
+        index_train_c = label_train.loc[label_train == c].index
+        # Repeat the whole class as often as it fits, then top up with a random selection.
+        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
+        n_random_sel = label_train_deficits.loc[c] % label_train_counts.loc[c]
+        new_indices_c = np.concatenate((
+            np.tile(index_train_c.to_numpy(), n_concats),
+            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False)))
+
+        data_train = pd.concat((data_train, data_train.loc[new_indices_c, :]))
+        label_train = pd.concat((label_train, label_train.loc[new_indices_c]))
+
+    # Boolean masks over arr. A mask cannot encode the duplicates introduced
+    # by oversampling; it only marks which rows ended up in each split.
+    loc_label_ids_train = arr.index.isin(data_train.index)
+    loc_label_ids_test = arr.index.isin(data_test.index)
+
+    if verbose:
+        print('----------------------------')
+        print_label_info(label_train, 'Training labels')
+        print('----------------------------')
+        print_label_info(label_test, 'Testing labels')
+
+    return loc_label_ids_train, loc_label_ids_test
+
+
+def train_test_split_byhole(arr, label, test_size=0.2, hole_id_name='title', random_state=None, test_size_byData=False):
+    IDs = arr.loc[:, hole_id_name].unique()
+    if random_state is not None:
+        np.random.seed(random_state)
+    np.random.shuffle(IDs)
+    n_IDs = IDs.size
+
+    if not test_size_byData:
+        # Default: test_size is interpreted as a fraction of holes.
+        n_IDs_train = int(round((1 - test_size) * n_IDs))
+
+        IDs_train = IDs[:n_IDs_train]
+        IDs_test = IDs[n_IDs_train:]
+
+        data_train = arr[arr.loc[:, hole_id_name].isin(IDs_train)]
+        data_test = arr[arr.loc[:, hole_id_name].isin(IDs_test)]
+
+        label_train = label[arr.loc[:, hole_id_name].isin(IDs_train)]
+        label_test = label[arr.loc[:, hole_id_name].isin(IDs_test)]
+
+    else:
+        # test_size is interpreted as a fraction of data points.
+        data_wLabel = arr.copy()
+        data_wLabel['Label'] = label
+
+        # Sort rows by each hole's position in the shuffled ID order, so the
+        # head/tail split keeps whole holes together.
+        lookup_ds = pd.Series(data=np.arange(n_IDs), index=IDs)
+        data_wLabel['ID_order'] = data_wLabel.loc[:, hole_id_name].map(lookup_ds)
+        data_wLabel = data_wLabel.sort_values(axis=0, by='ID_order')
+
+        n_rows = data_wLabel.shape[0]
+        n_train = round((1 - test_size) * n_rows)
+
+        data_train = data_wLabel.iloc[:n_train, :]
+        data_test = data_wLabel.iloc[n_train:, :]
+
+        label_train = data_wLabel.iloc[:n_train, :].loc[:, 'Label']
+        label_test = data_wLabel.iloc[n_train:, :].loc[:, 'Label']
+
+    return data_train, data_test, label_train, label_test
diff --git a/skl_emeralds/test_train_splitters/stratify.py b/skl_emeralds/test_train_splitters/stratify.py
new file mode 100644
index 0000000..a0e8aee
--- /dev/null
+++ b/skl_emeralds/test_train_splitters/stratify.py
@@ -0,0 +1,53 @@
+import math
+
+import numpy as np
+import pandas as pd
+import sklearn.model_selection
+
+from skl_emeralds.print import print_label_info
+
+
+def test_train_split_balance_oversample_minority_stratify(arr, filt, test_size, random_state=None, verbose=False):
+    if verbose:
+        print('Stratifying by label, then balancing')
+        print_label_info(filt, 'Input labels')
+
+    if random_state is not None:
+        np.random.seed(int(random_state))
+    classes = np.unique(filt)
+
+    data_train, data_test, label_train, label_test = sklearn.model_selection.train_test_split(
+        arr, filt, test_size=test_size,
+        random_state=random_state, stratify=filt)
+
+    # Oversample each minority class in the training set up to the majority count.
+    label_train_counts = label_train.value_counts()
+    label_train_deficits = label_train_counts.max() - label_train_counts
+
+    for c in classes:
+        if label_train_deficits.loc[c] == 0:
+            continue
+
+        index_train_c = label_train.loc[label_train == c].index
+        # Repeat the whole class as often as it fits, then top up with a random selection.
+        n_concats = math.floor(label_train_deficits.loc[c] / label_train_counts.loc[c])
+        n_random_sel = label_train_deficits.loc[c] % label_train_counts.loc[c]
+        new_indices_c = np.concatenate((
+            np.tile(index_train_c.to_numpy(), n_concats),
+            np.random.choice(index_train_c.to_numpy(), n_random_sel, replace=False)))
+
+        data_train = pd.concat((data_train, data_train.loc[new_indices_c, :]))
+        label_train = pd.concat((label_train, label_train.loc[new_indices_c]))
+
+    # Boolean masks over arr. A mask cannot encode the duplicates introduced
+    # by oversampling; it only marks which rows ended up in each split.
+    loc_label_ids_train = arr.index.isin(data_train.index)
+    loc_label_ids_test = arr.index.isin(data_test.index)
+
+    if verbose:
+        print('---------------------')
+        print_label_info(label_train, 'Training labels')
+        print('---------------------')
+        print_label_info(label_test, 'Testing labels')
+
+    return loc_label_ids_train, loc_label_ids_test
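
Usage note (not part of the patch): a minimal sketch of how the by-hole splitter is meant to be called, assuming a point-cloud DataFrame with a 'label' column and a 'title' borehole-ID column, which is what the code above expects. The hole names and label values below are invented for illustration.

# Hypothetical usage sketch -- the DataFrame contents are made up; only the
# 'label' and 'title' column names are required by the splitters above.
import numpy as np
import pandas as pd

from skl_emeralds.test_train_splitters.oversample import test_train_split_balance_oversample_minority_byhole

pointcloud = pd.DataFrame({
    'title': np.repeat(['BH1', 'BH2', 'BH3', 'BH4'], 5),   # borehole IDs
    'depth': np.tile(np.arange(5), 4),
    'label': ['brittle', 'non-brittle', 'non-brittle', 'non-brittle', 'non-brittle'] * 4,
})

# Boolean filter selecting the labelled rows (all of them in this toy example).
filt = pointcloud.label.notna()

train_mask, test_mask = test_train_split_balance_oversample_minority_byhole(
    pointcloud, filt, test_size=0.25, random_state=42, verbose=True)

# Whole boreholes land on one side of the split or the other.
print('train holes:', sorted(pointcloud.loc[train_mask, 'title'].unique()))
print('test holes:', sorted(pointcloud.loc[test_mask, 'title'].unique()))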