Add changes to RandomFeaturesPandasDataset
azoz01 committed Dec 5, 2023
1 parent 090d11f commit b995727
Showing 4 changed files with 196 additions and 94 deletions.
33 changes: 10 additions & 23 deletions liltab/data/dataloaders.py
@@ -50,7 +50,7 @@ def __init__(

self.n_rows = len(self.dataset)

if sample_classes_equally:
if self.sample_classes_equally or self.sample_classes_stratified:
self.y = dataset.raw_y
self.class_values = np.unique(self.y)
if len(self.class_values) > self.support_size:
@@ -66,6 +66,8 @@ def __init__(
self.class_values_idx = dict()
for val in self.class_values:
self.class_values_idx[val] = np.where(self.y == val)[0]

if sample_classes_equally:
self.samples_per_class_support = {
class_value: self.support_size // len(self.class_values)
for class_value in self.class_values
@@ -75,21 +77,12 @@
for class_value in self.class_values
}
if self.sample_classes_stratified:
self.y = dataset.raw_y
self.class_values = np.unique(self.y)
self.class_values_idx = dict()
for val in self.class_values:
self.class_values_idx[val] = np.where(self.y == val)[0]
self.samples_per_class_support = {
class_value: int(
self.support_size * (self.y == class_value).sum() / len(self.y)
)
class_value: int(self.support_size * (self.y == class_value).sum() / len(self.y))
for class_value in self.class_values
}
self.samples_per_class_query = {
class_value: int(
self.query_size * (self.y == class_value).sum() / len(self.y)
)
class_value: int(self.query_size * (self.y == class_value).sum() / len(self.y))
for class_value in self.class_values
}
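
For context, a quick standalone sketch (not part of the commit; the toy label vector is an assumption) of how the two code paths above split a support set across classes: equal sampling gives every class the same integer share, while stratified sampling scales each class's share by its frequency.

import numpy as np

# Toy labels: 12 rows with class frequencies 1/2, 1/4, 1/4.
y = np.array(["a"] * 6 + ["b"] * 3 + ["c"] * 3)
class_values = np.unique(y)
support_size = 8

# Equal sampling: identical integer share per class; leftover slots are
# topped up later by _generate_stratified_sampling_idx.
equal = {c: support_size // len(class_values) for c in class_values}
# -> {'a': 2, 'b': 2, 'c': 2} (6 of 8 slots assigned)

# Stratified sampling: share proportional to class frequency.
stratified = {c: int(support_size * (y == c).sum() / len(y)) for c in class_values}
# -> {'a': 4, 'b': 2, 'c': 2} (all 8 slots assigned)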

@@ -111,11 +104,11 @@ def __next__(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
self.curr_episode += 1

if self.sample_classes_equally or self.sample_classes_stratified:
return self._sample_with_stratified_classes()
return self._sample_with_custom_proportion_classes()
else:
return self._sample_without_stratified_classes()

def _sample_with_stratified_classes(self):
def _sample_with_custom_proportion_classes(self):
support_indices = self._generate_stratified_sampling_idx(
self.samples_per_class_support, self.support_size
)
@@ -137,14 +130,10 @@ def _generate_stratified_sampling_idx(
)
remaining_to_sample = set_size - len(sampled_indices)
if remaining_to_sample > 0:
available_idx_for_sampling = list(
set(range(self.n_rows)) - set(sampled_indices)
)
available_idx_for_sampling = list(set(range(self.n_rows)) - set(sampled_indices))
replace = len(available_idx_for_sampling) > remaining_to_sample
sampled_indices.extend(
np.random.choice(
available_idx_for_sampling, remaining_to_sample, replace=replace
)
np.random.choice(available_idx_for_sampling, remaining_to_sample, replace=replace)
)

return sampled_indices
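
A standalone sketch of the top-up step above (sizes and indices are illustrative assumptions): when the per-class integer shares round down, the shortfall is drawn from rows not yet selected. Enough rows remain in this example, so sampling without replacement suffices.

import numpy as np

n_rows, set_size = 12, 8
sampled_indices = [0, 1, 6, 7, 9, 10]  # per-class draws, 2 short of set_size
remaining_to_sample = set_size - len(sampled_indices)
if remaining_to_sample > 0:
    # Rows not already selected are eligible for the top-up.
    available = list(set(range(n_rows)) - set(sampled_indices))
    sampled_indices.extend(
        np.random.choice(available, remaining_to_sample, replace=False)
    )
print(sorted(sampled_indices))  # 8 distinct row indices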
@@ -156,9 +145,7 @@ def _sample_without_stratified_classes(
all_drawn_indices = np.random.choice(
self.n_rows, self.support_size + self.query_size, replace=replace
)
support_indices = np.random.choice(
all_drawn_indices, self.support_size, replace=False
)
support_indices = np.random.choice(all_drawn_indices, self.support_size, replace=False)
query_indices = np.array(list(set(all_drawn_indices) - set(support_indices)))
return *self.dataset[support_indices], *self.dataset[query_indices]

202 changes: 142 additions & 60 deletions liltab/data/datasets.py
@@ -1,15 +1,73 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import torch

from abc import ABC, abstractmethod
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from torch import Tensor
from torch.utils.data import Dataset

from .preprocessing import get_preprocessing_pipeline


class Dataset(ABC):
def __init__(
self,
data_path: str,
attribute_columns: list[str],
response_columns: list[str],
preprocess_data: bool,
encode_categorical_target: bool,
):
if response_columns and len(response_columns) > 1 and encode_categorical_target:
raise ValueError("One-hot encoding is supported only for single target")

self.data_path = data_path
self.df = pd.read_csv(data_path)

self.attribute_columns = np.array(
attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1]
)
self.response_columns = np.array(
response_columns if response_columns is not None else [self.df.columns.tolist()[-1]]
)
self.n_attributes = len(self.attribute_columns)
self.n_responses = len(self.response_columns)

self.encode_categorical_target = encode_categorical_target
self.preprocess_data = preprocess_data

if self.preprocess_data:
self._preprocess_data()
if self.encode_categorical_target:
self._encode_categorical_target()
else:
self.y = self.df[self.response_columns].values

def _preprocess_data(self):
self.preprocessing_pipeline = get_preprocessing_pipeline()
if self.encode_categorical_target:
self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform(
self.df[self.attribute_columns]
)
else:
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
)

def _encode_categorical_target(self):
self.one_hot_encoder = OneHotEncoder(sparse=False)
self.raw_y = self.df[self.response_columns]
self.y = self.one_hot_encoder.fit_transform((self.df[self.response_columns]))

@abstractmethod
def __getitem__(self):
pass

def __len__(self) -> int:
return self.df.shape[0]
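
A toy illustration (not from the commit) of what _encode_categorical_target stores: raw_y keeps the original labels while y becomes a dense one-hot matrix. The diff passes sparse=False; scikit-learn >= 1.2 spells the same option sparse_output=False.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({"target": ["cat", "dog", "cat"]})
encoder = OneHotEncoder(sparse_output=False)  # the commit uses the older sparse=False
y = encoder.fit_transform(df[["target"]])
print(y)  # [[1. 0.], [0. 1.], [1. 0.]] with columns ordered cat, dog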


class PandasDataset(Dataset):
"""
Torch wrapper to pandas DataFrame which makes it usable
@@ -21,7 +79,7 @@ def __init__(
self,
data_path: Path,
attribute_columns: list[str] = None,
target_columns: list[str] = None,
response_columns: list[str] = None,
preprocess_data: bool = True,
encode_categorical_target: bool = False,
):
@@ -31,7 +89,7 @@ def __init__(
attribute_columns (list[str], optional): Columns from frame
which will be used as attributes.
Defaults to all columns without last.
target_columns (list[str], optional): Columns from frame
response_columns (list[str], optional): Columns from frame
to be used as responses. Defaults to last column from frame.
preprocess_data (bool, optional): If true, then imputes data
using mean strategy and standardizes using StandardScaler.
@@ -40,47 +98,23 @@
will be encoded using one-hot. Works only with single target variable.
            Defaults to False.
"""
self.data_path = data_path
self.encode_categorical_target = encode_categorical_target
self.df = pd.read_csv(data_path)
self.attribute_columns = (
attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1]
)
self.target_columns = (
target_columns if target_columns is not None else [self.df.columns.tolist()[-1]]
super().__init__(
data_path=data_path,
attribute_columns=attribute_columns,
response_columns=response_columns,
encode_categorical_target=encode_categorical_target,
preprocess_data=preprocess_data,
)

if len(self.target_columns) > 1 and self.encode_categorical_target:
raise ValueError("One-hot encoding is supported only for single target")

if preprocess_data:
self.preprocessing_pipeline = get_preprocessing_pipeline()
if self.encode_categorical_target:
self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform(
self.df[self.attribute_columns]
)
else:
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
)
self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32)

self.y = self.df[self.target_columns]
if self.encode_categorical_target:
self.one_hot_encoder = OneHotEncoder(sparse=False).set_output(transform="pandas")
self.raw_y = self.y
self.y = self.one_hot_encoder.fit_transform((self.y.astype("category")))
self.y = torch.from_numpy(self.y.values).type(torch.float32)
self.y = torch.from_numpy(self.y).type(torch.float32)

def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
X = self.X[idx]
y = self.y[idx]

return X, y

def __len__(self) -> int:
return self.df.shape[0]
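
A hedged usage sketch of the refactored class (the CSV path is hypothetical, not from the commit): a single categorical target is one-hot encoded and rows are fetched by a list of indices.

from liltab.data.datasets import PandasDataset

dataset = PandasDataset(
    "data/iris.csv",                 # hypothetical path
    encode_categorical_target=True,  # one-hot encode the single target column
)
X, y = dataset[[0, 1, 2]]  # float32 Tensors: (3, n_attributes) and (3, n_classes)
print(len(dataset), X.shape, y.shape)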


class RandomFeaturesPandasDataset(Dataset):
"""
@@ -93,51 +127,99 @@ class RandomFeaturesPandasDataset(Dataset):
def __init__(
self,
data_path: Path,
persist_features_iter: int = 2,
attribute_columns: list[str] = None,
response_columns: list[str] = None,
total_random_feature_sampling: bool = False,
preprocess_data: bool = True,
encode_categorical_target: bool = False,
persist_features_iter: int = 2,
):
"""
Args:
data_path (Path): Path to data to be loaded
persist_features_iter (int, optional): For how many
iterations persist current selection of features.
Defaults to 2.
attribute_columns (list[str], optional): Columns from frame
which will be attributes sampled from.
Ignored when total_random_feature_sampling = True.
Defaults to all columns without last.
response_columns (list[str], optional): Columns from frame
to be responses sampled from.
Ignored when total_random_feature_sampling = True.
Defaults to last column from frame.
            total_random_feature_sampling (bool, optional): If True, then attributes
                and responses are sampled from all data columns, ignoring
attribute_columns and response_columns. Defaults to False.
            preprocess_data (bool, optional): If true, then imputes data
using mean strategy and standardizes using StandardScaler.
Defaults to True.
            encode_categorical_target (bool, optional): If True, then target column
                will be encoded using one-hot.
                When total_random_feature_sampling=True, it must be False.
                Works only with a single target variable.
                Defaults to False.
persist_features_iter (int, optional): For how many
iterations persist current selection of features.
Defaults to 2.
"""
self.data_path = data_path
self.persist_features_iter = persist_features_iter

self.df = pd.read_csv(data_path)
self.columns = self.df.columns.values
self.n_columns = len(self.columns)

if preprocess_data:
self.preprocessing_pipeline = get_preprocessing_pipeline()
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
super().__init__(
data_path=data_path,
attribute_columns=attribute_columns,
response_columns=response_columns,
encode_categorical_target=encode_categorical_target,
preprocess_data=preprocess_data,
)
if total_random_feature_sampling and (
attribute_columns or response_columns or encode_categorical_target
):
raise ValueError(
"total_random_feature_sampling doesn't support feature or encoding specification"
)

self.total_random_feature_sampling = total_random_feature_sampling
self.persist_features_iter = persist_features_iter
self.persist_features_counter = 0
self.n_columns = self.df.shape[1]
self.columns = self.df.columns.values
self.attributes = None
self.target = None
self.responses = None

def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
if self.persist_features_counter == 0:
self.persist_features_counter = self.persist_features_iter
col_idx = np.arange(self.n_columns)
features_size = np.random.randint(low=1, high=self.n_columns)
attributes_idx = np.random.choice(col_idx, features_size)
remaining_idx = list(set(col_idx) - set(attributes_idx))
response_idx = np.random.choice(remaining_idx, 1)
self.attributes, self.target = self.columns[attributes_idx], self.columns[response_idx]

if self.total_random_feature_sampling:
attributes_idx, responses_idx = self._get_features_from_all_columns()
self.attributes, self.responses = (
self.columns[attributes_idx],
self.columns[responses_idx],
)
else:
attributes_idx, responses_idx = self._get_features_from_selected_columns()
self.attributes, self.responses = (
self.attribute_columns[attributes_idx],
self.response_columns[responses_idx],
)
self.persist_features_counter -= 1

X = torch.from_numpy(self.df[self.attributes].to_numpy()).type(torch.float32)
y = torch.from_numpy(self.df[self.target].to_numpy()).type(torch.float32)
if self.encode_categorical_target:
y = self.y
else:
y = torch.from_numpy(self.df[self.responses].to_numpy()).type(torch.float32)

return X[idx], y[idx]

def __len__(self) -> int:
return self.df.shape[0]
def _get_features_from_selected_columns(self) -> tuple[list[int], list[int]]:
attributes_size = np.random.randint(low=1, high=self.n_attributes + 1)
responses_size = np.random.randint(low=1, high=self.n_responses + 1)
attributes_idx = np.random.choice(len(self.attribute_columns), attributes_size).tolist()
responses_idx = np.random.choice(len(self.response_columns), responses_size).tolist()

return attributes_idx, responses_idx

def _get_features_from_all_columns(self) -> tuple[np.ndarray, np.ndarray]:
col_idx = np.arange(self.n_columns)
features_size = np.random.randint(low=1, high=self.n_columns)
attributes_idx = np.random.choice(col_idx, features_size)
remaining_idx = list(set(col_idx) - set(attributes_idx))
responses_idx = np.random.choice(remaining_idx, 1)
return attributes_idx, responses_idx
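
A minimal usage sketch of the two sampling modes added here (the CSV path and column names are hypothetical assumptions):

from liltab.data.datasets import RandomFeaturesPandasDataset

# Mode 1: draw random attribute/response subsets from explicit column lists.
dataset = RandomFeaturesPandasDataset(
    "data/random_df.csv",       # hypothetical path
    attribute_columns=["f1", "f2", "f3"],
    response_columns=["target"],
    persist_features_iter=2,    # reuse one feature draw for two __getitem__ calls
)

# Mode 2: ignore the column lists and draw both sides from all columns.
total = RandomFeaturesPandasDataset(
    "data/random_df.csv",
    total_random_feature_sampling=True,
)

X, y = dataset[[0, 1]]  # shapes depend on the current random feature draw
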
16 changes: 15 additions & 1 deletion test/liltab/data/test_dataloaders.py
@@ -1,6 +1,6 @@
import numpy as np

from liltab.data.datasets import PandasDataset
from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset
from liltab.data.dataloaders import (
FewShotDataLoader,
ComposedDataLoader,
@@ -58,6 +58,20 @@ def test_few_shot_data_loader_samples_equally_when_set_size_divisible_by_nunique
assert (y_query[:, i]).sum() == 2


def test_few_shot_data_loader_samples_equally_works_with_random_features(
resources_path,
):
frame_path = resources_path / "random_df_3.csv"
dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_target=True)
dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True)

for episode in dataloader:
_, y_support, _, y_query = episode
for i in range(3):
assert (y_support[:, i]).sum() == 3
assert (y_query[:, i]).sum() == 2
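
Note the arithmetic being pinned down: with the 3 classes in random_df_3.csv sampled equally, support_size=9 gives 3 support rows per class and query_size=6 gives 2 query rows per class, exactly matching the assertions.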


def test_few_shot_data_loader_samples_equally_when_set_size_non_divisible_by_nunique_classes(
resources_path,
):