Add changes to RandomFeaturesPandasDataset
azoz01 committed Dec 5, 2023
1 parent 090d11f commit b995727
Showing 4 changed files with 196 additions and 94 deletions.
33 changes: 10 additions & 23 deletions liltab/data/dataloaders.py
@@ -50,7 +50,7 @@ def __init__(

self.n_rows = len(self.dataset)

if sample_classes_equally:
if self.sample_classes_equally or self.sample_classes_stratified:
self.y = dataset.raw_y
self.class_values = np.unique(self.y)
if len(self.class_values) > self.support_size:
@@ -66,6 +66,8 @@ def __init__(
self.class_values_idx = dict()
for val in self.class_values:
self.class_values_idx[val] = np.where(self.y == val)[0]

if sample_classes_equally:
self.samples_per_class_support = {
class_value: self.support_size // len(self.class_values)
for class_value in self.class_values
@@ -75,21 +77,12 @@
for class_value in self.class_values
}
if self.sample_classes_stratified:
self.y = dataset.raw_y
self.class_values = np.unique(self.y)
self.class_values_idx = dict()
for val in self.class_values:
self.class_values_idx[val] = np.where(self.y == val)[0]
self.samples_per_class_support = {
class_value: int(
self.support_size * (self.y == class_value).sum() / len(self.y)
)
class_value: int(self.support_size * (self.y == class_value).sum() / len(self.y))
for class_value in self.class_values
}
self.samples_per_class_query = {
class_value: int(
self.query_size * (self.y == class_value).sum() / len(self.y)
)
class_value: int(self.query_size * (self.y == class_value).sum() / len(self.y))
for class_value in self.class_values
}
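
For context, a quick standalone sketch (not part of the commit; the toy label vector is an assumption) of how the two code paths above split a support set across classes: equal sampling gives every class the same integer share, while stratified sampling scales each class's share by its frequency.

import numpy as np

# Toy labels: 12 rows with class frequencies 1/2, 1/4, 1/4.
y = np.array(["a"] * 6 + ["b"] * 3 + ["c"] * 3)
class_values = np.unique(y)
support_size = 8

# Equal sampling: identical integer share per class; leftover slots are
# topped up later by _generate_stratified_sampling_idx.
equal = {c: support_size // len(class_values) for c in class_values}
# -> {'a': 2, 'b': 2, 'c': 2} (6 of 8 slots assigned)

# Stratified sampling: share proportional to class frequency.
stratified = {c: int(support_size * (y == c).sum() / len(y)) for c in class_values}
# -> {'a': 4, 'b': 2, 'c': 2} (all 8 slots assigned)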

@@ -111,11 +104,11 @@ def __next__(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
self.curr_episode += 1

if self.sample_classes_equally or self.sample_classes_stratified:
return self._sample_with_stratified_classes()
return self._sample_with_custom_proportion_classes()
else:
return self._sample_without_stratified_classes()

def _sample_with_stratified_classes(self):
def _sample_with_custom_proportion_classes(self):
support_indices = self._generate_stratified_sampling_idx(
self.samples_per_class_support, self.support_size
)
@@ -137,14 +130,10 @@ def _generate_stratified_sampling_idx(
)
remaining_to_sample = set_size - len(sampled_indices)
if remaining_to_sample > 0:
available_idx_for_sampling = list(
set(range(self.n_rows)) - set(sampled_indices)
)
available_idx_for_sampling = list(set(range(self.n_rows)) - set(sampled_indices))
replace = len(available_idx_for_sampling) > remaining_to_sample
sampled_indices.extend(
np.random.choice(
available_idx_for_sampling, remaining_to_sample, replace=replace
)
np.random.choice(available_idx_for_sampling, remaining_to_sample, replace=replace)
)

return sampled_indices
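
A standalone sketch of the top-up step above (sizes and indices are illustrative assumptions): when the per-class integer shares round down, the shortfall is drawn from rows not yet selected. Enough rows remain in this example, so sampling without replacement suffices.

import numpy as np

n_rows, set_size = 12, 8
sampled_indices = [0, 1, 6, 7, 9, 10]  # per-class draws, 2 short of set_size
remaining_to_sample = set_size - len(sampled_indices)
if remaining_to_sample > 0:
    # Rows not already selected are eligible for the top-up.
    available = list(set(range(n_rows)) - set(sampled_indices))
    sampled_indices.extend(
        np.random.choice(available, remaining_to_sample, replace=False)
    )
print(sorted(sampled_indices))  # 8 distinct row indices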
@@ -156,9 +145,7 @@ def _sample_without_stratified_classes(
all_drawn_indices = np.random.choice(
self.n_rows, self.support_size + self.query_size, replace=replace
)
support_indices = np.random.choice(
all_drawn_indices, self.support_size, replace=False
)
support_indices = np.random.choice(all_drawn_indices, self.support_size, replace=False)
query_indices = np.array(list(set(all_drawn_indices) - set(support_indices)))
return *self.dataset[support_indices], *self.dataset[query_indices]

202 changes: 142 additions & 60 deletions liltab/data/datasets.py
@@ -1,15 +1,73 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import torch

from abc import ABC, abstractmethod
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from torch import Tensor
from torch.utils.data import Dataset

from .preprocessing import get_preprocessing_pipeline


class Dataset(ABC):
def __init__(
self,
data_path: str,
attribute_columns: list[str],
response_columns: list[str],
preprocess_data: bool,
encode_categorical_target: bool,
):
if response_columns and len(response_columns) > 1 and encode_categorical_target:
raise ValueError("One-hot encoding is supported only for single target")

self.data_path = data_path
self.df = pd.read_csv(data_path)

self.attribute_columns = np.array(
attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1]
)
self.response_columns = np.array(
response_columns if response_columns is not None else [self.df.columns.tolist()[-1]]
)
self.n_attributes = len(self.attribute_columns)
self.n_responses = len(self.response_columns)

self.encode_categorical_target = encode_categorical_target
self.preprocess_data = preprocess_data

if self.preprocess_data:
self._preprocess_data()
if self.encode_categorical_target:
self._encode_categorical_target()
else:
self.y = self.df[self.response_columns].values

def _preprocess_data(self):
self.preprocessing_pipeline = get_preprocessing_pipeline()
if self.encode_categorical_target:
self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform(
self.df[self.attribute_columns]
)
else:
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
)

def _encode_categorical_target(self):
self.one_hot_encoder = OneHotEncoder(sparse=False)
self.raw_y = self.df[self.response_columns]
self.y = self.one_hot_encoder.fit_transform((self.df[self.response_columns]))

@abstractmethod
def __getitem__(self):
pass

def __len__(self) -> int:
return self.df.shape[0]
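
A toy illustration (not from the commit) of what _encode_categorical_target stores: raw_y keeps the original labels while y becomes a dense one-hot matrix. The diff passes sparse=False; scikit-learn >= 1.2 spells the same option sparse_output=False.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({"target": ["cat", "dog", "cat"]})
encoder = OneHotEncoder(sparse_output=False)  # the commit uses the older sparse=False
y = encoder.fit_transform(df[["target"]])
print(y)  # [[1. 0.], [0. 1.], [1. 0.]] with columns ordered cat, dog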


class PandasDataset(Dataset):
"""
Torch wrapper to pandas DataFrame which makes it usable
@@ -21,7 +79,7 @@ def __init__(
self,
data_path: Path,
attribute_columns: list[str] = None,
target_columns: list[str] = None,
response_columns: list[str] = None,
preprocess_data: bool = True,
encode_categorical_target: bool = False,
):
@@ -31,7 +89,7 @@ def __init__(
attribute_columns (list[str], optional): Columns from frame
which will be used as attributes.
Defaults to all columns without last.
target_columns (list[str], optional): Columns from frame
response_columns (list[str], optional): Columns from frame
to be used as responses. Defaults to last column from frame.
preprocess_data (bool, optional): If true, then imputes data
using mean strategy and standardizes using StandardScaler.
@@ -40,47 +98,23 @@
will be encoded using one-hot. Works only with single target variable.
            Defaults to False.
"""
self.data_path = data_path
self.encode_categorical_target = encode_categorical_target
self.df = pd.read_csv(data_path)
self.attribute_columns = (
attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1]
)
self.target_columns = (
target_columns if target_columns is not None else [self.df.columns.tolist()[-1]]
super().__init__(
data_path=data_path,
attribute_columns=attribute_columns,
response_columns=response_columns,
encode_categorical_target=encode_categorical_target,
preprocess_data=preprocess_data,
)

if len(self.target_columns) > 1 and self.encode_categorical_target:
raise ValueError("One-hot encoding is supported only for single target")

if preprocess_data:
self.preprocessing_pipeline = get_preprocessing_pipeline()
if self.encode_categorical_target:
self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform(
self.df[self.attribute_columns]
)
else:
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
)
self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32)

self.y = self.df[self.target_columns]
if self.encode_categorical_target:
self.one_hot_encoder = OneHotEncoder(sparse=False).set_output(transform="pandas")
self.raw_y = self.y
self.y = self.one_hot_encoder.fit_transform((self.y.astype("category")))
self.y = torch.from_numpy(self.y.values).type(torch.float32)
self.y = torch.from_numpy(self.y).type(torch.float32)

def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
X = self.X[idx]
y = self.y[idx]

return X, y

def __len__(self) -> int:
return self.df.shape[0]
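
A hedged usage sketch of the refactored class (the CSV path is hypothetical, not from the commit): a single categorical target is one-hot encoded and rows are fetched by a list of indices.

from liltab.data.datasets import PandasDataset

dataset = PandasDataset(
    "data/iris.csv",                 # hypothetical path
    encode_categorical_target=True,  # one-hot encode the single target column
)
X, y = dataset[[0, 1, 2]]  # float32 Tensors: (3, n_attributes) and (3, n_classes)
print(len(dataset), X.shape, y.shape)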


class RandomFeaturesPandasDataset(Dataset):
"""
@@ -93,51 +127,99 @@ class RandomFeaturesPandasDataset(Dataset):
def __init__(
self,
data_path: Path,
persist_features_iter: int = 2,
attribute_columns: list[str] = None,
response_columns: list[str] = None,
total_random_feature_sampling: bool = False,
preprocess_data: bool = True,
encode_categorical_target: bool = False,
persist_features_iter: int = 2,
):
"""
Args:
data_path (Path): Path to data to be loaded
persist_features_iter (int, optional): For how many
iterations persist current selection of features.
Defaults to 2.
attribute_columns (list[str], optional): Columns from frame
which will be attributes sampled from.
Ignored when total_random_feature_sampling = True.
Defaults to all columns without last.
response_columns (list[str], optional): Columns from frame
to be responses sampled from.
Ignored when total_random_feature_sampling = True.
Defaults to last column from frame.
            total_random_feature_sampling (bool, optional): If True, then attributes
                and responses are sampled from all data columns, ignoring
attribute_columns and response_columns. Defaults to False.
            preprocess_data (bool, optional): If true, then imputes data
using mean strategy and standardizes using StandardScaler.
Defaults to True.
            encode_categorical_target (bool, optional): If True, then target column
                will be encoded using one-hot.
                When total_random_feature_sampling=True, it must be False.
                Works only with a single target variable.
                Defaults to False.
persist_features_iter (int, optional): For how many
iterations persist current selection of features.
Defaults to 2.
"""
self.data_path = data_path
self.persist_features_iter = persist_features_iter

self.df = pd.read_csv(data_path)
self.columns = self.df.columns.values
self.n_columns = len(self.columns)

if preprocess_data:
self.preprocessing_pipeline = get_preprocessing_pipeline()
self.df = pd.DataFrame(
self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns
super().__init__(
data_path=data_path,
attribute_columns=attribute_columns,
response_columns=response_columns,
encode_categorical_target=encode_categorical_target,
preprocess_data=preprocess_data,
)
if total_random_feature_sampling and (
attribute_columns or response_columns or encode_categorical_target
):
raise ValueError(
"total_random_feature_sampling doesn't support feature or encoding specification"
)

self.total_random_feature_sampling = total_random_feature_sampling
self.persist_features_iter = persist_features_iter
self.persist_features_counter = 0
self.n_columns = self.df.shape[1]
self.columns = self.df.columns.values
self.attributes = None
self.target = None
self.responses = None

def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]:
if self.persist_features_counter == 0:
self.persist_features_counter = self.persist_features_iter
col_idx = np.arange(self.n_columns)
features_size = np.random.randint(low=1, high=self.n_columns)
attributes_idx = np.random.choice(col_idx, features_size)
remaining_idx = list(set(col_idx) - set(attributes_idx))
response_idx = np.random.choice(remaining_idx, 1)
self.attributes, self.target = self.columns[attributes_idx], self.columns[response_idx]

if self.total_random_feature_sampling:
attributes_idx, responses_idx = self._get_features_from_all_columns()
self.attributes, self.responses = (
self.columns[attributes_idx],
self.columns[responses_idx],
)
else:
attributes_idx, responses_idx = self._get_features_from_selected_columns()
self.attributes, self.responses = (
self.attribute_columns[attributes_idx],
self.response_columns[responses_idx],
)
self.persist_features_counter -= 1

X = torch.from_numpy(self.df[self.attributes].to_numpy()).type(torch.float32)
y = torch.from_numpy(self.df[self.target].to_numpy()).type(torch.float32)
if self.encode_categorical_target:
y = self.y
else:
y = torch.from_numpy(self.df[self.responses].to_numpy()).type(torch.float32)

return X[idx], y[idx]

def __len__(self) -> int:
return self.df.shape[0]
def _get_features_from_selected_columns(self) -> tuple[list[int], list[int]]:
attributes_size = np.random.randint(low=1, high=self.n_attributes + 1)
responses_size = np.random.randint(low=1, high=self.n_responses + 1)
attributes_idx = np.random.choice(len(self.attribute_columns), attributes_size).tolist()
responses_idx = np.random.choice(len(self.response_columns), responses_size).tolist()

return attributes_idx, responses_idx

def _get_features_from_all_columns(self) -> tuple[np.ndarray, np.ndarray]:
col_idx = np.arange(self.n_columns)
features_size = np.random.randint(low=1, high=self.n_columns)
attributes_idx = np.random.choice(col_idx, features_size)
remaining_idx = list(set(col_idx) - set(attributes_idx))
responses_idx = np.random.choice(remaining_idx, 1)
return attributes_idx, responses_idx
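
A minimal usage sketch of the two sampling modes added here (the CSV path and column names are hypothetical assumptions):

from liltab.data.datasets import RandomFeaturesPandasDataset

# Mode 1: draw random attribute/response subsets from explicit column lists.
dataset = RandomFeaturesPandasDataset(
    "data/random_df.csv",       # hypothetical path
    attribute_columns=["f1", "f2", "f3"],
    response_columns=["target"],
    persist_features_iter=2,    # reuse one feature draw for two __getitem__ calls
)

# Mode 2: ignore the column lists and draw both sides from all columns.
total = RandomFeaturesPandasDataset(
    "data/random_df.csv",
    total_random_feature_sampling=True,
)

X, y = dataset[[0, 1]]  # shapes depend on the current random feature draw
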
16 changes: 15 additions & 1 deletion test/liltab/data/test_dataloaders.py
@@ -1,6 +1,6 @@
import numpy as np

from liltab.data.datasets import PandasDataset
from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset
from liltab.data.dataloaders import (
FewShotDataLoader,
ComposedDataLoader,
@@ -58,6 +58,20 @@ def test_few_shot_data_loader_samples_equally_when_set_size_divisible_by_nunique
assert (y_query[:, i]).sum() == 2


def test_few_shot_data_loader_samples_equally_works_with_random_features(
resources_path,
):
frame_path = resources_path / "random_df_3.csv"
dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_target=True)
dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True)

for episode in dataloader:
_, y_support, _, y_query = episode
for i in range(3):
assert (y_support[:, i]).sum() == 3
assert (y_query[:, i]).sum() == 2
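
Note the arithmetic being pinned down: with the 3 classes in random_df_3.csv sampled equally, support_size=9 gives 3 support rows per class and query_size=6 gives 2 query rows per class, exactly matching the assertions.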


def test_few_shot_data_loader_samples_equally_when_set_size_non_divisible_by_nunique_classes(
resources_path,
):