diff --git a/experiments/02_openml.py b/experiments/02_openml.py
index ea7c99c..c6d7567 100644
--- a/experiments/02_openml.py
+++ b/experiments/02_openml.py
@@ -31,7 +31,7 @@ def main():
     train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path(
         Path(config["train_data_path"]),
         RandomFeaturesPandasDataset,
-        {},
+        {"total_random_feature_sampling": True},
         FewShotDataLoader,
         {"support_size": config["support_size"], "query_size": config["query_size"]},
         ComposedDataLoader,
diff --git a/liltab/data/dataloaders.py b/liltab/data/dataloaders.py
index 6cbeb77..a2d6c08 100644
--- a/liltab/data/dataloaders.py
+++ b/liltab/data/dataloaders.py
@@ -2,8 +2,9 @@
 
 from copy import deepcopy
 from torch import Tensor
-from torch.utils.data import Dataset
-from typing import Iterable, OrderedDict
+from typing import Iterable, OrderedDict, Dict, Union
+
+from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset
 
 
 class FewShotDataLoader:
@@ -15,28 +16,83 @@ class FewShotDataLoader:
     def __init__(
         self,
-        dataset: Dataset,
+        dataset: Union[PandasDataset, RandomFeaturesPandasDataset],
         support_size: int,
         query_size: int,
         n_episodes: int = None,
+        sample_classes_equally: bool = False,
+        sample_classes_stratified: bool = False,
     ):
         """
         Args:
-            dataset (Dataset): dataset to load data from.
+            dataset (Union[PandasDataset, RandomFeaturesPandasDataset]): dataset to load data from.
             support_size (int): size of support set in each episode.
             query_size (int): size of query set in each episode.
             n_episodes (int, optional): number of episodes. If none, then iterator is
                 without end. Defaults to None.
+            sample_classes_equally (bool, optional): If True, then each episode contains
+                an equal number of observations per class.
+                Applies only to classification. Defaults to False.
+            sample_classes_stratified (bool, optional): If True, then each episode contains
+                observations per class proportionally to the class frequencies.
+                Applies only to classification. Defaults to False.
""" self.dataset = dataset self.support_size = support_size self.query_size = query_size self.n_episodes = n_episodes + self.sample_classes_equally = sample_classes_equally + self.sample_classes_stratified = sample_classes_stratified + if self.sample_classes_equally and self.sample_classes_stratified: + raise ValueError("Only one of equal or stratified sampling can be used.") self.curr_episode = 0 self.n_rows = len(self.dataset) + if self.sample_classes_equally or self.sample_classes_stratified: + self.y = dataset.raw_y + self.class_values = np.unique(self.y) + if len(self.class_values) > self.support_size: + raise ValueError( + "When sampling equally the support size should " + "be higher than number of distinct values" + ) + if len(self.class_values) > self.query_size: + raise ValueError( + "When sampling equally the query size should " + "be higher than number of distinct values" + ) + self.class_values_idx = dict() + for val in self.class_values: + self.class_values_idx[val] = np.where(self.y == val)[0] + + if sample_classes_equally: + self._init_samples_per_class_equal() + + if sample_classes_stratified: + self._init_samples_per_class_stratified() + + def _init_samples_per_class_equal(self): + self.samples_per_class_support = { + class_value: self.support_size // len(self.class_values) + for class_value in self.class_values + } + self.samples_per_class_query = { + class_value: self.query_size // len(self.class_values) + for class_value in self.class_values + } + + def _init_samples_per_class_stratified(self): + self.samples_per_class_support = { + class_value: int(self.support_size * (self.y == class_value).sum() / len(self.y)) + for class_value in self.class_values + } + self.samples_per_class_query = { + class_value: int(self.query_size * (self.y == class_value).sum() / len(self.y)) + for class_value in self.class_values + } + def __iter__(self): return deepcopy(self) @@ -54,6 +110,44 @@ def __next__(self) -> tuple[Tensor, Tensor, Tensor, Tensor]: raise StopIteration() self.curr_episode += 1 + if self.sample_classes_equally or self.sample_classes_stratified: + return self._sample_with_custom_proportion_classes() + else: + return self._sample_without_stratified_classes() + + def _sample_with_custom_proportion_classes(self): + support_indices = self._generate_stratified_sampling_idx( + self.samples_per_class_support, self.support_size + ) + query_indices = self._generate_stratified_sampling_idx( + self.samples_per_class_query, self.query_size + ) + support_indices = np.random.permutation(support_indices) + query_indices = np.random.permutation(query_indices) + return *self.dataset[support_indices], *self.dataset[query_indices] + + def _generate_stratified_sampling_idx( + self, samples_per_class_dict: Dict[int, np.ndarray], set_size: int + ) -> list[int]: + sampled_indices = [] + for val, idx in self.class_values_idx.items(): + replace = samples_per_class_dict[val] > len(idx) + sampled_indices.extend( + np.random.choice(idx, samples_per_class_dict[val], replace=replace) + ) + remaining_to_sample = set_size - len(sampled_indices) + if remaining_to_sample > 0: + available_idx_for_sampling = list(set(range(self.n_rows)) - set(sampled_indices)) + replace = len(available_idx_for_sampling) > remaining_to_sample + sampled_indices.extend( + np.random.choice(available_idx_for_sampling, remaining_to_sample, replace=replace) + ) + + return sampled_indices + + def _sample_without_stratified_classes( + self, + ) -> tuple[Tensor, Tensor, Tensor, Tensor]: replace = True if self.support_size + 
         all_drawn_indices = np.random.choice(
             self.n_rows, self.support_size + self.query_size, replace=replace
diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py
index 49961c2..2ded337 100644
--- a/liltab/data/datasets.py
+++ b/liltab/data/datasets.py
@@ -2,13 +2,96 @@
 import pandas as pd
 import torch
 
+from abc import ABC, abstractmethod
 from pathlib import Path
+from sklearn.preprocessing import OneHotEncoder
 from torch import Tensor
-from torch.utils.data import Dataset
 
 from .preprocessing import get_preprocessing_pipeline
 
 
+class Dataset(ABC):
+    """
+    Abstract base class for datasets. It reads and stores data as a pandas
+    DataFrame. The __getitem__ method is to be implemented with a custom
+    indexing strategy.
+    """
+
+    def __init__(
+        self,
+        data_path: str,
+        attribute_columns: list[str],
+        response_columns: list[str],
+        preprocess_data: bool,
+        encode_categorical_target: bool,
+    ):
+        if (
+            response_columns is not None
+            and len(response_columns) > 1
+            and encode_categorical_target
+        ):
+            raise ValueError("One-hot encoding is supported only for single target")
+
+        self.data_path = data_path
+        self.df = pd.read_csv(data_path)
+
+        self.attribute_columns = np.array(
+            attribute_columns
+            if attribute_columns is not None
+            else self.df.columns.tolist()[:-1]
+        )
+        self.response_columns = np.array(
+            response_columns
+            if response_columns is not None
+            else [self.df.columns.tolist()[-1]]
+        )
+        self.n_attributes = len(self.attribute_columns)
+        self.n_responses = len(self.response_columns)
+
+        self.encode_categorical_target = encode_categorical_target
+        self.preprocess_data = preprocess_data
+
+        if self.preprocess_data:
+            self._preprocess_data()
+        if self.encode_categorical_target:
+            self._encode_categorical_target()
+        else:
+            self.y = self.df[self.response_columns].values
+
+    def _preprocess_data(self):
+        """
+        Standardizes data using the z-score method. If encode_categorical_target = True,
+        then the response variable isn't scaled.
+        """
+        self.preprocessing_pipeline = get_preprocessing_pipeline()
+        if self.encode_categorical_target:
+            self.df.loc[
+                :, self.attribute_columns
+            ] = self.preprocessing_pipeline.fit_transform(
+                self.df[self.attribute_columns]
+            )
+        else:
+            self.df = pd.DataFrame(
+                self.preprocessing_pipeline.fit_transform(self.df),
+                columns=self.df.columns,
+            )
+
+    def _encode_categorical_target(self):
+        """
+        Encodes the categorical response using one-hot encoding.
+        """
+        self.one_hot_encoder = OneHotEncoder(sparse=False)
+        self.raw_y = self.df[self.response_columns]
+        self.y = self.one_hot_encoder.fit_transform(self.df[self.response_columns])
+
+    @abstractmethod
+    def __getitem__(self, idx):
+        pass
+
+    def __len__(self) -> int:
+        return self.df.shape[0]
+
+
 class PandasDataset(Dataset):
     """
     Torch wrapper to pandas DataFrame which makes it usable
@@ -20,7 +103,7 @@ def __init__(
         self,
         data_path: Path,
         attribute_columns: list[str] = None,
-        target_columns: list[str] = None,
+        response_columns: list[str] = None,
         preprocess_data: bool = True,
         encode_categorical_target: bool = False,
     ):
@@ -30,7 +113,7 @@ def __init__(
             attribute_columns (list[str], optional): Columns from frame
                 which will be used as attributes.
                 Defaults to all columns without last.
-            target_columns (list[str], optional): Columns from frame
+            response_columns (list[str], optional): Columns from frame
                 to be used as responses. Defaults to last column from frame.
             preprocess_data (bool, optional): If true, then imputes data
                 using mean strategy and standardizes using StandardScaler.
@@ -39,35 +122,18 @@ def __init__(
                 will be encoded using one-hot.
                 Works only with single target variable.
                 Default to False.
         """
-        self.data_path = data_path
-        self.encode_categorical_target = encode_categorical_target
-        self.df = pd.read_csv(data_path)
-        self.attribute_columns = (
-            attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1]
+        super().__init__(
+            data_path=data_path,
+            attribute_columns=attribute_columns,
+            response_columns=response_columns,
+            encode_categorical_target=encode_categorical_target,
+            preprocess_data=preprocess_data,
         )
-        self.target_columns = (
-            target_columns if target_columns is not None else [self.df.columns.tolist()[-1]]
-        )
-
-        if len(self.target_columns) > 1 and self.encode_categorical_target:
-            raise ValueError("One-hot encoding is supported only for single target")
-        if preprocess_data:
-            self.preprocessing_pipeline = get_preprocessing_pipeline()
-            if self.encode_categorical_target:
-                self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform(
-                    self.df[self.attribute_columns]
-                )
-            else:
-                self.df = pd.DataFrame(
-                    self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
-                )
-        self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32)
-
-        self.y = self.df[self.target_columns]
-        if self.encode_categorical_target:
-            self.y = pd.get_dummies(self.y.astype("category"))
-        self.y = torch.from_numpy(self.y.to_numpy()).type(torch.float32)
+        self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(
+            torch.float32
+        )
+        self.y = torch.from_numpy(self.y).type(torch.float32)
 
     def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
         X = self.X[idx]
@@ -75,9 +141,6 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
 
         return X, y
 
-    def __len__(self) -> int:
-        return self.df.shape[0]
-
 
 class RandomFeaturesPandasDataset(Dataset):
     """
@@ -90,51 +153,108 @@ def __init__(
         self,
         data_path: Path,
-        persist_features_iter: int = 2,
+        attribute_columns: list[str] = None,
+        response_columns: list[str] = None,
+        total_random_feature_sampling: bool = False,
         preprocess_data: bool = True,
+        encode_categorical_target: bool = False,
+        persist_features_iter: int = 2,
     ):
         """
         Args:
             data_path (Path): Path to data to be loaded
-            persist_features_iter (int, optional): For how many
-                iterations persist current selection of features.
-                Defaults to 2.
+            attribute_columns (list[str], optional): Columns from frame
+                from which attributes will be sampled.
+                Ignored when total_random_feature_sampling = True.
+                Defaults to all columns except the last.
+            response_columns (list[str], optional): Columns from frame
+                from which responses will be sampled.
+                Ignored when total_random_feature_sampling = True.
+                Defaults to last column from frame.
+            total_random_feature_sampling (bool, optional): If True, then attributes
+                and responses are sampled from all data columns, ignoring
+                attribute_columns and response_columns. Defaults to False.
             preprocess_data(bool, optional): If true, then imputes data
                 using mean strategy and standardizes using StandardScaler.
                 Defaults to True.
+            encode_categorical_target(bool, optional): if True, then target column
+                will be encoded using one-hot.
+                When total_random_feature_sampling=True it must be False.
+                Works only with single target variable.
+                Defaults to False.
+            persist_features_iter (int, optional): For how many
+                iterations persist current selection of features.
+                Defaults to 2.
""" - self.data_path = data_path - self.persist_features_iter = persist_features_iter - - self.df = pd.read_csv(data_path) - self.columns = self.df.columns.values - self.n_columns = len(self.columns) - - if preprocess_data: - self.preprocessing_pipeline = get_preprocessing_pipeline() - self.df = pd.DataFrame( - self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns + super().__init__( + data_path=data_path, + attribute_columns=attribute_columns, + response_columns=response_columns, + encode_categorical_target=encode_categorical_target, + preprocess_data=preprocess_data, + ) + if total_random_feature_sampling and ( + attribute_columns is not None + or response_columns + or encode_categorical_target + ): + raise ValueError( + "total_random_feature_sampling doesn't support feature or encoding specification" ) + self.total_random_feature_sampling = total_random_feature_sampling + self.persist_features_iter = persist_features_iter self.persist_features_counter = 0 + self.n_columns = self.df.shape[1] + self.columns = self.df.columns.values self.attributes = None - self.target = None + self.responses = None def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: if self.persist_features_counter == 0: self.persist_features_counter = self.persist_features_iter - col_idx = np.arange(self.n_columns) - features_size = np.random.randint(low=1, high=self.n_columns) - attributes_idx = np.random.choice(col_idx, features_size) - remaining_idx = list(set(col_idx) - set(attributes_idx)) - response_idx = np.random.choice(remaining_idx, 1) - self.attributes, self.target = self.columns[attributes_idx], self.columns[response_idx] + + if self.total_random_feature_sampling: + attributes_idx, responses_idx = self._get_features_from_all_columns() + self.attributes, self.responses = ( + self.columns[attributes_idx], + self.columns[responses_idx], + ) + else: + ( + attributes_idx, + responses_idx, + ) = self._get_features_from_selected_columns() + self.attributes, self.responses = ( + self.attribute_columns[attributes_idx], + self.response_columns[responses_idx], + ) self.persist_features_counter -= 1 X = torch.from_numpy(self.df[self.attributes].to_numpy()).type(torch.float32) - y = torch.from_numpy(self.df[self.target].to_numpy()).type(torch.float32) + if self.encode_categorical_target: + y = torch.from_numpy(self.y).type(torch.float32) + else: + y = torch.from_numpy(self.df[self.responses].to_numpy()).type(torch.float32) return X[idx], y[idx] - def __len__(self) -> int: - return self.df.shape[0] + def _get_features_from_selected_columns(self) -> tuple[int, int]: + attributes_size = np.random.randint(low=1, high=self.n_attributes + 1) + responses_size = np.random.randint(low=1, high=self.n_responses + 1) + attributes_idx = np.random.choice( + len(self.attribute_columns), attributes_size + ).tolist() + responses_idx = np.random.choice( + len(self.response_columns), responses_size + ).tolist() + + return attributes_idx, responses_idx + + def _get_features_from_all_columns(self) -> tuple[int, int]: + col_idx = np.arange(self.n_columns) + features_size = np.random.randint(low=1, high=self.n_columns) + attributes_idx = np.random.choice(col_idx, features_size) + remaining_idx = list(set(col_idx) - set(attributes_idx)) + responses_idx = np.random.choice(remaining_idx, 1) + return attributes_idx, responses_idx diff --git a/test/liltab/data/test_dataloaders.py b/test/liltab/data/test_dataloaders.py index 7c269f9..a7f1fbb 100644 --- a/test/liltab/data/test_dataloaders.py +++ 
@@ -1,6 +1,6 @@
 import numpy as np
 
-from liltab.data.datasets import PandasDataset
+from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset
 from liltab.data.dataloaders import (
     FewShotDataLoader,
     ComposedDataLoader,
@@ -44,6 +44,65 @@ def test_few_shot_data_loader_returns_disjoint_tensors(resources_path, utils):
     assert not utils.tensors_have_common_rows(y_support, y_query)
 
 
+def test_few_shot_data_loader_samples_equally_when_set_size_divisible_by_nunique_classes(
+    resources_path,
+):
+    frame_path = resources_path / "random_df_3.csv"
+    dataset = PandasDataset(frame_path, encode_categorical_target=True)
+    dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True)
+
+    for episode in dataloader:
+        _, y_support, _, y_query = episode
+        for i in range(3):
+            assert (y_support[:, i]).sum() == 3
+            assert (y_query[:, i]).sum() == 2
+
+
+def test_few_shot_data_loader_samples_equally_works_with_random_features(
+    resources_path,
+):
+    frame_path = resources_path / "random_df_3.csv"
+    dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_target=True)
+    dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True)
+
+    for episode in dataloader:
+        _, y_support, _, y_query = episode
+        for i in range(3):
+            assert (y_support[:, i]).sum() == 3
+            assert (y_query[:, i]).sum() == 2
+
+
+def test_few_shot_data_loader_samples_equally_when_set_size_non_divisible_by_nunique_classes(
+    resources_path,
+):
+    frame_path = resources_path / "random_df_3.csv"
+    dataset = PandasDataset(frame_path, encode_categorical_target=True)
+    dataloader = FewShotDataLoader(dataset, 11, 7, n_episodes=10, sample_classes_equally=True)
+
+    for episode in dataloader:
+        _, y_support, _, y_query = episode
+        for i in range(3):
+            assert (y_support[:, i]).sum() >= 3
+            assert (y_query[:, i]).sum() >= 2
+
+
+def test_few_shot_data_loader_samples_stratified(
+    resources_path,
+):
+    frame_path = resources_path / "random_df_4.csv"
+    dataset = PandasDataset(frame_path, encode_categorical_target=True)
+    dataloader = FewShotDataLoader(dataset, 6, 12, n_episodes=10, sample_classes_stratified=True)
+
+    for episode in dataloader:
+        _, y_support, _, y_query = episode
+        assert y_support[:, 0].sum() == 1
+        assert y_support[:, 1].sum() == 2
+        assert y_support[:, 2].sum() == 3
+        assert y_query[:, 0].sum() == 2
+        assert y_query[:, 1].sum() == 4
+        assert y_query[:, 2].sum() == 6
+
+
 def test_few_shot_data_loader_has_next(resources_path):
     frame_path = resources_path / "random_df_1.csv"
     dataset = PandasDataset(frame_path)
diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py
index 2687eab..cba6cdf 100644
--- a/test/liltab/data/test_datasets.py
+++ b/test/liltab/data/test_datasets.py
@@ -14,8 +14,8 @@ def test_dataset_initializes_default_columns(resources_path):
 
     dataset = PandasDataset(frame_path)
 
-    assert dataset.attribute_columns == frame_columns[:-1]
-    assert dataset.target_columns == [frame_columns[-1]]
+    assert (dataset.attribute_columns == frame_columns[:-1]).all()
+    assert (dataset.response_columns == [frame_columns[-1]]).all()
 
 
 def test_dataset_assigns_non_default_columns(resources_path):
@@ -26,11 +26,11 @@ def test_dataset_assigns_non_default_columns(resources_path):
     dataset = PandasDataset(
         frame_path,
         attribute_columns=frame_columns[1:3],
-        target_columns=frame_columns[4:],
+        response_columns=frame_columns[4:],
     )
 
-    assert dataset.attribute_columns == frame_columns[1:3]
-    assert dataset.target_columns == frame_columns[4:]
+    assert (dataset.attribute_columns == frame_columns[1:3]).all()
+    assert (dataset.response_columns == frame_columns[4:]).all()
 
 
 def test_indexing_dataset_returns_proper_data(resources_path):
@@ -44,7 +44,7 @@ def test_indexing_dataset_returns_proper_data(resources_path):
     actual_X, actual_y = dataset[index]
 
     assert_almost_equal(actual_X.numpy(), expected_records[dataset.attribute_columns].values)
-    assert_almost_equal(actual_y.numpy(), expected_records[dataset.target_columns].values)
+    assert_almost_equal(actual_y.numpy(), expected_records[dataset.response_columns].values)
 
 
 def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path):
@@ -62,7 +62,7 @@ def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path):
         actual_X.numpy(), expected_records[dataset.attribute_columns].values, decimal=2
     )
     assert_almost_equal(
-        actual_y.numpy(), expected_records[dataset.target_columns].values, decimal=2
+        actual_y.numpy(), expected_records[dataset.response_columns].values, decimal=2
     )
 
 
@@ -118,7 +118,10 @@ def test_random_features_pandas_dataset_change_features(resources_path):
     frame_path = resources_path / "random_df_1.csv"
     persist_features_iter = 3
     dataset = RandomFeaturesPandasDataset(
-        frame_path, preprocess_data=True, persist_features_iter=persist_features_iter
+        frame_path,
+        preprocess_data=True,
+        persist_features_iter=persist_features_iter,
+        total_random_feature_sampling=True,
     )
 
     previous_features = np.ndarray([])
@@ -130,10 +133,26 @@ def test_random_features_pandas_dataset_change_features(resources_path):
 
     for _ in range(int(1e2)):
         dataset[0]
         features_change_cnt += int(not np.array_equal(previous_features, dataset.attributes))
-        target_change_cnt += int(not np.array_equal(previous_target, dataset.target))
+        target_change_cnt += int(not np.array_equal(previous_target, dataset.responses))
         previous_features = dataset.attributes
-        previous_target = dataset.target
+        previous_target = dataset.responses
 
     assert np.abs(features_change_cnt / int(1e2) - 1 / persist_features_iter) < 2e-1
     assert np.abs(target_change_cnt / int(1e2) - 1 / persist_features_iter) < 2e-1
+
+
+def test_random_features_pandas_dataset_returns_proper_subset(resources_path):
+    frame_path = resources_path / "random_df_1.csv"
+    persist_features_iter = 1
+    dataset = RandomFeaturesPandasDataset(
+        frame_path,
+        preprocess_data=True,
+        persist_features_iter=persist_features_iter,
+        attribute_columns=["col_1", "col_2"],
+        response_columns=["col_3", "col_4", "col_5"],
+    )
+    for _ in range(int(1e2)):
+        X, y = dataset[0:10]
+        assert X.shape[1] <= 2
+        assert y.shape[1] <= 3
diff --git a/test/resources/random_df_4.csv b/test/resources/random_df_4.csv
new file mode 100644
index 0000000..8d297fc
--- /dev/null
+++ b/test/resources/random_df_4.csv
@@ -0,0 +1,19 @@
+col_1,col_2,col_3,col_4,col_5,class
+0.246993239710889,0.7461747744040207,0.3659976022146112,0.9523973056737076,0.6246641515015624,1
+0.6055862509658791,0.4629898405202652,0.285651505325428,0.8435168720137161,0.6703215557534022,1
+0.5614344688775709,0.9022501060661662,0.0089227845679765,0.5244792615769576,0.1450317176588593,1
+0.4415013075415568,0.6809463891065323,0.6106202182350691,0.116692268732601,0.0961897437145805,2
+0.582336435478352,0.3013949400985183,0.8260706488309976,0.3182700830401348,0.786516252260166,2
+0.5623613233474208,0.1933917188800752,0.5594758401896662,0.6677430220774577,0.5874554095356477,2
+0.5852721196624394,0.2872297353031366,0.9553285699118996,0.8817738174312242,0.4841250548234295,2
+0.5852721196624394,0.2872297353031366,0.9553285699118996,0.8817738174312242,0.4841250548234295,2
+0.5852721196624394,0.2872297353031366,0.9553285699118996,0.8817738174312242,0.4841250548234295,2
+0.8672571845860056,0.6129189261998793,0.6718465909087079,0.281141639559974,0.8443462585672832,3
+0.6757720738965469,0.1953555092473384,0.7940405957404102,0.0868952147151195,0.8605789280782232,3
+0.0047704445702196,0.455658746039237,0.4318137209882176,0.0056745284113097,0.4084412474708324,3
+0.3588924521711235,0.9016106038692904,0.1300210220962955,0.2731867597861721,0.1659767550211827,3
+0.4153795608755684,0.2645027199660554,0.9727640542847222,0.2607534035881627,0.4776996776018167,3
+0.4189926497144194,0.4434434301568061,0.8364996583922943,0.5031170129282356,0.8487586418005441,3
+0.3588924521711235,0.9016106038692904,0.1300210220962955,0.2731867597861721,0.1659767550211827,3
+0.4153795608755684,0.2645027199660554,0.9727640542847222,0.2607534035881627,0.4776996776018167,3
+0.4189926497144194,0.4434434301568061,0.8364996583922943,0.5031170129282356,0.8487586418005441,3
\ No newline at end of file
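
A minimal usage sketch of the new sampling flags (not part of the patch; it mirrors test_few_shot_data_loader_samples_stratified above and assumes it is run from the repository root so the test CSV resolves):

from liltab.data.datasets import PandasDataset
from liltab.data.dataloaders import FewShotDataLoader

# One-hot encoding of the target is required here: the loader reads dataset.raw_y
# to compute per-class sample counts when equal/stratified sampling is enabled.
dataset = PandasDataset("test/resources/random_df_4.csv", encode_categorical_target=True)

# random_df_4.csv has class counts 3/6/9, so stratified sampling with support_size=6
# and query_size=12 yields 1/2/3 and 2/4/6 observations per class, respectively.
# Replace sample_classes_stratified with sample_classes_equally=True for equal counts.
loader = FewShotDataLoader(
    dataset, support_size=6, query_size=12, n_episodes=10, sample_classes_stratified=True
)

for X_support, y_support, X_query, y_query in loader:
    assert y_support.shape[0] == 6 and y_query.shape[0] == 12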
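
Likewise, a sketch of the two feature-sampling modes of RandomFeaturesPandasDataset (column names and the CSV follow test_random_features_pandas_dataset_returns_proper_subset; exact shapes vary because the feature subsets are drawn at random):

from liltab.data.datasets import RandomFeaturesPandasDataset

# Restrict sampling to explicit column lists: attributes are drawn from col_1/col_2,
# responses from col_3/col_4/col_5, re-drawn every persist_features_iter calls.
dataset = RandomFeaturesPandasDataset(
    "test/resources/random_df_1.csv",
    attribute_columns=["col_1", "col_2"],
    response_columns=["col_3", "col_4", "col_5"],
    persist_features_iter=1,
)
X, y = dataset[0:10]  # X has at most 2 columns, y at most 3

# Or sample from all columns, as experiments/02_openml.py now configures via
# {"total_random_feature_sampling": True}; this mode rejects explicit column lists
# and encode_categorical_target.
dataset_all = RandomFeaturesPandasDataset(
    "test/resources/random_df_1.csv",
    total_random_feature_sampling=True,
)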