From 3422d75da739c519fc07a023cf89ad5a377a309f Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 21:12:18 +0100 Subject: [PATCH 1/4] Resolve issues --- README.md | 2 +- bin/train.py | 2 +- .../01_synthetic_data_experiment_config.yaml | 2 +- config/02_openml_data_experiment_config.yaml | 2 +- .../03_openml_clf_data_experiment_config.yaml | 2 +- config/04_same_domain_experiment_config.yaml | 2 +- config/05_new_classes_experiment_config.yaml | 2 +- config/06_big_data_experiment_config.yaml | 2 +- experiments/01_synthetic.py | 2 +- experiments/02_openml.py | 2 +- experiments/03_openml_clf.py | 2 +- experiments/04_same_domain.py | 2 +- experiments/05_new_classes.py | 2 +- experiments/06_big_data.py | 2 +- liltab/data/datasets.py | 56 +++++++++------ liltab/data/preprocessing.py | 41 +++++++++-- liltab/train/trainer.py | 15 ++-- test/conftest.py | 2 +- test/liltab/data/test_datasets.py | 72 ++++++++++++++++++- 19 files changed, 162 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 0b0a2b7..afc2c55 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ HeterogenousAttributesNetworkTrainer( gradient_clipping=False, learning_rate=1e-3, weight_decay=1e-4, - early_stopping=True, + early_stopping_intervals=100, file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/bin/train.py b/bin/train.py index e7dd007..cfd5323 100644 --- a/bin/train.py +++ b/bin/train.py @@ -110,7 +110,7 @@ def main( gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=file_logger, tb_logger=tb_logger, ) diff --git a/config/01_synthetic_data_experiment_config.yaml b/config/01_synthetic_data_experiment_config.yaml index aab4b13..8323ef1 100644 --- a/config/01_synthetic_data_experiment_config.yaml +++ b/config/01_synthetic_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 256 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 5 query_size: 27 diff --git a/config/02_openml_data_experiment_config.yaml b/config/02_openml_data_experiment_config.yaml index e73aa3b..58a586f 100644 --- a/config/02_openml_data_experiment_config.yaml +++ b/config/02_openml_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0.0 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/03_openml_clf_data_experiment_config.yaml b/config/03_openml_clf_data_experiment_config.yaml index 5af8f52..5020416 100644 --- a/config/03_openml_clf_data_experiment_config.yaml +++ b/config/03_openml_clf_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/04_same_domain_experiment_config.yaml b/config/04_same_domain_experiment_config.yaml index 9040ce4..66488aa 100644 --- a/config/04_same_domain_experiment_config.yaml +++ b/config/04_same_domain_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0.0001 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/05_new_classes_experiment_config.yaml b/config/05_new_classes_experiment_config.yaml index d965f81..2c488c1 
100644 --- a/config/05_new_classes_experiment_config.yaml +++ b/config/05_new_classes_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.0001 weight_decay: 0 batch_size: 16 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/06_big_data_experiment_config.yaml b/config/06_big_data_experiment_config.yaml index c95d5a4..a4daae2 100644 --- a/config/06_big_data_experiment_config.yaml +++ b/config/06_big_data_experiment_config.yaml @@ -5,7 +5,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 16 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/experiments/01_synthetic.py b/experiments/01_synthetic.py index b196d2f..b0f32b2 100644 --- a/experiments/01_synthetic.py +++ b/experiments/01_synthetic.py @@ -71,7 +71,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/02_openml.py b/experiments/02_openml.py index c6d7567..794eddf 100644 --- a/experiments/02_openml.py +++ b/experiments/02_openml.py @@ -74,7 +74,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/03_openml_clf.py b/experiments/03_openml_clf.py index 8ff9fa0..a8b696e 100644 --- a/experiments/03_openml_clf.py +++ b/experiments/03_openml_clf.py @@ -75,7 +75,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/experiments/04_same_domain.py b/experiments/04_same_domain.py index 07e43f6..ac16816 100644 --- a/experiments/04_same_domain.py +++ b/experiments/04_same_domain.py @@ -71,7 +71,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/05_new_classes.py b/experiments/05_new_classes.py index 99aed68..5b820ef 100644 --- a/experiments/05_new_classes.py +++ b/experiments/05_new_classes.py @@ -72,7 +72,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/experiments/06_big_data.py b/experiments/06_big_data.py index 1364bb0..cfcc83a 100644 --- a/experiments/06_big_data.py +++ b/experiments/06_big_data.py @@ -72,7 +72,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + 
early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index 2ded337..d654d7c 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -1,11 +1,14 @@ import numpy as np import pandas as pd +from sklearn.discriminant_analysis import StandardScaler import torch from abc import ABC, abstractmethod -from pathlib import Path +from itertools import product +from pathlib import PosixPath from sklearn.preprocessing import OneHotEncoder from torch import Tensor +from typing import Union from .preprocessing import get_preprocessing_pipeline @@ -19,7 +22,7 @@ class Dataset(ABC): def __init__( self, - data_path: str, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str], response_columns: list[str], preprocess_data: bool, @@ -32,8 +35,16 @@ def __init__( ): raise ValueError("One-hot encoding is supported only for single target") - self.data_path = data_path - self.df = pd.read_csv(data_path) + self.data = data + if type(data) in [str, PosixPath]: + self.df = pd.read_csv(data) + elif type(data) == pd.DataFrame: + self.df = data + else: + raise ValueError( + f"Data should be PosixPath, " + f"str or pandas.DataFrame but is {type(data)}" + ) self.attribute_columns = np.array( attribute_columns @@ -64,23 +75,24 @@ def _preprocess_data(self): then response variable isn't scaled. """ self.preprocessing_pipeline = get_preprocessing_pipeline() - if self.encode_categorical_target: - self.df.loc[ - :, self.attribute_columns - ] = self.preprocessing_pipeline.fit_transform( - self.df[self.attribute_columns] - ) - else: - self.df = pd.DataFrame( - self.preprocessing_pipeline.fit_transform(self.df), - columns=self.df.columns, - ) + df_preproc = self.preprocessing_pipeline.fit_transform(self.df[self.attribute_columns]) + self.df = self.df.drop(columns=self.attribute_columns) + self.df = pd.concat([df_preproc, self.df], axis=1) + + attribute_columns_new = [] + for attr_col, frame_col in product(self.attribute_columns, self.df.columns): + if attr_col in frame_col: + attribute_columns_new.append(frame_col) + self.attribute_columns = np.array(attribute_columns_new) + + if not self.encode_categorical_target: + self.df[self.response_columns] = StandardScaler().fit_transform(self.df[self.response_columns]) def _encode_categorical_target(self): """ Encodes categorical response using one-hot encoding. """ - self.one_hot_encoder = OneHotEncoder(sparse=False) + self.one_hot_encoder = OneHotEncoder(sparse_output=False) self.raw_y = self.df[self.response_columns] self.y = self.one_hot_encoder.fit_transform((self.df[self.response_columns])) @@ -101,7 +113,7 @@ class PandasDataset(Dataset): def __init__( self, - data_path: Path, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str] = None, response_columns: list[str] = None, preprocess_data: bool = True, @@ -109,7 +121,7 @@ def __init__( ): """ Args: - data_path (Path): Path to data to be loaded + data (Union[PosixPath, str, pd.DataFrame]): Frame with data or path to .csv file. attribute_columns (list[str], optional): Columns from frame which will be used as attributes. Defaults to all columns without last. @@ -123,7 +135,7 @@ def __init__( Default to False. 
""" super().__init__( - data_path=data_path, + data=data, attribute_columns=attribute_columns, response_columns=response_columns, encode_categorical_target=encode_categorical_target, @@ -152,7 +164,7 @@ class RandomFeaturesPandasDataset(Dataset): def __init__( self, - data_path: Path, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str] = None, response_columns: list[str] = None, total_random_feature_sampling: bool = False, @@ -162,7 +174,7 @@ def __init__( ): """ Args: - data_path (Path): Path to data to be loaded + data (Union[PosixPath, str, pd.DataFrame]): Frame with data or path to .csv file. attribute_columns (list[str], optional): Columns from frame which will be attributes sampled from. Ignored when total_random_feature_sampling = True. @@ -187,7 +199,7 @@ def __init__( Defaults to 2. """ super().__init__( - data_path=data_path, + data=data, attribute_columns=attribute_columns, response_columns=response_columns, encode_categorical_target=encode_categorical_target, diff --git a/liltab/data/preprocessing.py b/liltab/data/preprocessing.py index dacf89e..b877fab 100644 --- a/liltab/data/preprocessing.py +++ b/liltab/data/preprocessing.py @@ -1,15 +1,46 @@ -from sklearn.pipeline import make_pipeline +import numpy as np + +from sklearn.compose import make_column_selector, make_column_transformer from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OneHotEncoder def get_preprocessing_pipeline() -> Pipeline: """ Returns preprocessing pipeline composed with: - 1. Simple imputer with mean strategy - 2. StandardScaler + 1. Simple imputer with mean/most_frequent strategy depending on data type. + 2. StandardScaler/One-hot encoding depending on data type Returns: Pipeline: Pipeline with preprocessing steps. """ - return make_pipeline(SimpleImputer(strategy="mean"), StandardScaler()) + cat_pipeline = Pipeline( + [ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("one-hot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")), + ] + ).set_output(transform="pandas") + + num_pipeline = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean")), + ("scaler", StandardScaler()), + ] + ).set_output(transform="pandas") + + pipeline = Pipeline( + [ + ( + "transformers", + make_column_transformer( + ( + cat_pipeline, + make_column_selector(dtype_include=("object", "category")), + ), + (num_pipeline, make_column_selector(dtype_include=np.number)), + ), + ) + ] + ).set_output(transform="pandas") + + return pipeline diff --git a/liltab/train/trainer.py b/liltab/train/trainer.py index 6ec8529..3ae5bdb 100644 --- a/liltab/train/trainer.py +++ b/liltab/train/trainer.py @@ -26,7 +26,8 @@ def __init__( gradient_clipping: bool, learning_rate: float, weight_decay: float, - early_stopping: bool = False, + early_stopping_intervals: bool = 100, + check_val_every_n_epoch: int = 100, loss: Callable = nn.MSELoss(), file_logger: bool = True, tb_logger: bool = True, @@ -39,8 +40,10 @@ def __init__( gradient_clipping (bool): If true, then gradient clipping is applied learning_rate (float): learning rate during training. weight_decay (float): weight decay during training. - early_stopping (Optional, bool): if True, then early stopping with - patience n_epochs // 10 is applied. Defaults to False. + early_stopping_intervals (Optional, bool): if >0, then early stopping with + patience early_stopping_intervals*check_val_every_n_epoch epochs is applied. 
+ check_val_every_n_epoch (Optional, bool): Specifies how often validation loss + is checked. Defaults to 100, loss (Callable): Loss used during training. Defaults to MSELoss(). file_logger (bool): if True, then file logger will write to {results_path} directory @@ -83,11 +86,11 @@ def __init__( ) callbacks.append(loggers_callback) - if early_stopping: + if early_stopping_intervals > 0: early_stopping = EarlyStopping( monitor="val_loss", mode="min", - patience=100, + patience=early_stopping_intervals, min_delta=1e-3, ) callbacks.append(early_stopping) @@ -103,8 +106,6 @@ def __init__( ) callbacks.append(model_checkpoints_callback) - check_val_every_n_epoch = n_epochs // 1000 if n_epochs > 1000 else 1 - self.trainer = pl.Trainer( max_epochs=n_epochs, gradient_clip_val=1 if gradient_clipping else 0, diff --git a/test/conftest.py b/test/conftest.py index 5795f0a..675475a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -51,7 +51,7 @@ def forward(self, X_query: Tensor): def pytest_sessionfinish(session, exitstatus): - rmtree(Path("test") / "results") + rmtree(Path("test") / "results", ignore_errors=True) def pytest_sessionstart(session): diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index cba6cdf..bd39b68 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -7,6 +7,28 @@ from torch import Tensor, float32 +def test_dataset_works_when_path_given(resources_path): + frame_path = resources_path / "random_df_1.csv" + + dataset = PandasDataset(frame_path) + + assert dataset.df is not None + + +def test_dataset_works_dataframe_given(resources_path): + frame_path = resources_path / "random_df_1.csv" + df = pd.read_csv(frame_path) + + dataset = PandasDataset(df) + + assert dataset.df is not None + + +def test_dataset_raises_error_with_incorrect_data(): + with pytest.raises(ValueError): + PandasDataset(1) + + def test_dataset_initializes_default_columns(resources_path): frame_path = resources_path / "random_df_1.csv" df = pd.read_csv(frame_path) @@ -14,7 +36,12 @@ def test_dataset_initializes_default_columns(resources_path): dataset = PandasDataset(frame_path) - assert (dataset.attribute_columns == frame_columns[:-1]).all() + assert (dataset.attribute_columns == [ + 'pipeline-2__col_1', + 'pipeline-2__col_2', + 'pipeline-2__col_3', + 'pipeline-2__col_4' + ]).all() assert (dataset.response_columns == [frame_columns[-1]]).all() @@ -29,7 +56,10 @@ def test_dataset_assigns_non_default_columns(resources_path): response_columns=frame_columns[4:], ) - assert (dataset.attribute_columns == frame_columns[1:3]).all() + assert (dataset.attribute_columns == [ + 'pipeline-2__col_2', + 'pipeline-2__col_3' + ]).all() assert (dataset.response_columns == frame_columns[4:]).all() @@ -59,13 +89,49 @@ def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path) actual_X, actual_y = dataset[index] assert_almost_equal( - actual_X.numpy(), expected_records[dataset.attribute_columns].values, decimal=2 + actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2 ) assert_almost_equal( actual_y.numpy(), expected_records[dataset.response_columns].values, decimal=2 ) +def test_dataset_encodes_categorical_columns(): + df = df = pd.DataFrame(data=[ + [1, "A", "E", .1], + [3, "B", "E", .5], + [3, "A", "F", .4], + [1, "C", "E", .3], + ], columns=["int1", "cat1", "cat2", "target"]) + df["cat2"].astype("category") + dataset = PandasDataset(df) + + expected_attribute_columns = [ + 'pipeline-2__int1', + 'pipeline-1__cat1_A', + 
'pipeline-1__cat1_B', + 'pipeline-1__cat1_C', + 'pipeline-1__cat2_E', + 'pipeline-1__cat2_F' + ] + + assert (dataset.attribute_columns == expected_attribute_columns).all() + assert ( + dataset.df[[ + 'pipeline-1__cat1_A', + 'pipeline-1__cat1_B', + 'pipeline-1__cat1_C', + 'pipeline-1__cat2_E', + 'pipeline-1__cat2_F' + ]].values == np.array([ + [1, 0, 0, 1, 0], + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0], + ]) + ).all() + + def test_class_forbids_one_hot_with_multiple_targets(resources_path): frame_path = resources_path / "random_df_1.csv" df = pd.read_csv(frame_path) From 4e550b270f9b036461ae9d0583ecdda819f0a86e Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 21:15:47 +0100 Subject: [PATCH 2/4] Formatting --- liltab/data/datasets.py | 37 +++++---------- test/liltab/data/test_datasets.py | 75 ++++++++++++++++--------------- 2 files changed, 49 insertions(+), 63 deletions(-) diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index d654d7c..a65a8f4 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -28,11 +28,7 @@ def __init__( preprocess_data: bool, encode_categorical_target: bool, ): - if ( - response_columns is not None - and len(response_columns) > 1 - and encode_categorical_target - ): + if response_columns is not None and len(response_columns) > 1 and encode_categorical_target: raise ValueError("One-hot encoding is supported only for single target") self.data = data @@ -42,19 +38,14 @@ def __init__( self.df = data else: raise ValueError( - f"Data should be PosixPath, " - f"str or pandas.DataFrame but is {type(data)}" + f"Data should be PosixPath, " f"str or pandas.DataFrame but is {type(data)}" ) self.attribute_columns = np.array( - attribute_columns - if attribute_columns is not None - else self.df.columns.tolist()[:-1] + attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1] ) self.response_columns = np.array( - response_columns - if response_columns is not None - else [self.df.columns.tolist()[-1]] + response_columns if response_columns is not None else [self.df.columns.tolist()[-1]] ) self.n_attributes = len(self.attribute_columns) self.n_responses = len(self.response_columns) @@ -86,7 +77,9 @@ def _preprocess_data(self): self.attribute_columns = np.array(attribute_columns_new) if not self.encode_categorical_target: - self.df[self.response_columns] = StandardScaler().fit_transform(self.df[self.response_columns]) + self.df[self.response_columns] = StandardScaler().fit_transform( + self.df[self.response_columns] + ) def _encode_categorical_target(self): """ @@ -142,9 +135,7 @@ def __init__( preprocess_data=preprocess_data, ) - self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type( - torch.float32 - ) + self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32) self.y = torch.from_numpy(self.y).type(torch.float32) def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: @@ -206,9 +197,7 @@ def __init__( preprocess_data=preprocess_data, ) if total_random_feature_sampling and ( - attribute_columns is not None - or response_columns - or encode_categorical_target + attribute_columns is not None or response_columns or encode_categorical_target ): raise ValueError( "total_random_feature_sampling doesn't support feature or encoding specification" @@ -254,12 +243,8 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: def _get_features_from_selected_columns(self) -> tuple[int, int]: attributes_size = np.random.randint(low=1, 
high=self.n_attributes + 1) responses_size = np.random.randint(low=1, high=self.n_responses + 1) - attributes_idx = np.random.choice( - len(self.attribute_columns), attributes_size - ).tolist() - responses_idx = np.random.choice( - len(self.response_columns), responses_size - ).tolist() + attributes_idx = np.random.choice(len(self.attribute_columns), attributes_size).tolist() + responses_idx = np.random.choice(len(self.response_columns), responses_size).tolist() return attributes_idx, responses_idx diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index bd39b68..809bfb4 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -36,12 +36,10 @@ def test_dataset_initializes_default_columns(resources_path): dataset = PandasDataset(frame_path) - assert (dataset.attribute_columns == [ - 'pipeline-2__col_1', - 'pipeline-2__col_2', - 'pipeline-2__col_3', - 'pipeline-2__col_4' - ]).all() + assert ( + dataset.attribute_columns + == ["pipeline-2__col_1", "pipeline-2__col_2", "pipeline-2__col_3", "pipeline-2__col_4"] + ).all() assert (dataset.response_columns == [frame_columns[-1]]).all() @@ -56,10 +54,7 @@ def test_dataset_assigns_non_default_columns(resources_path): response_columns=frame_columns[4:], ) - assert (dataset.attribute_columns == [ - 'pipeline-2__col_2', - 'pipeline-2__col_3' - ]).all() + assert (dataset.attribute_columns == ["pipeline-2__col_2", "pipeline-2__col_3"]).all() assert (dataset.response_columns == frame_columns[4:]).all() @@ -88,47 +83,53 @@ def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path) expected_records = df.loc[index] actual_X, actual_y = dataset[index] - assert_almost_equal( - actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2 - ) + assert_almost_equal(actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2) assert_almost_equal( actual_y.numpy(), expected_records[dataset.response_columns].values, decimal=2 ) def test_dataset_encodes_categorical_columns(): - df = df = pd.DataFrame(data=[ - [1, "A", "E", .1], - [3, "B", "E", .5], - [3, "A", "F", .4], - [1, "C", "E", .3], - ], columns=["int1", "cat1", "cat2", "target"]) + df = df = pd.DataFrame( + data=[ + [1, "A", "E", 0.1], + [3, "B", "E", 0.5], + [3, "A", "F", 0.4], + [1, "C", "E", 0.3], + ], + columns=["int1", "cat1", "cat2", "target"], + ) df["cat2"].astype("category") dataset = PandasDataset(df) expected_attribute_columns = [ - 'pipeline-2__int1', - 'pipeline-1__cat1_A', - 'pipeline-1__cat1_B', - 'pipeline-1__cat1_C', - 'pipeline-1__cat2_E', - 'pipeline-1__cat2_F' + "pipeline-2__int1", + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", ] assert (dataset.attribute_columns == expected_attribute_columns).all() assert ( - dataset.df[[ - 'pipeline-1__cat1_A', - 'pipeline-1__cat1_B', - 'pipeline-1__cat1_C', - 'pipeline-1__cat2_E', - 'pipeline-1__cat2_F' - ]].values == np.array([ - [1, 0, 0, 1, 0], - [0, 1, 0, 1, 0], - [1, 0, 0, 0, 1], - [0, 0, 1, 1, 0], - ]) + dataset.df[ + [ + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", + ] + ].values + == np.array( + [ + [1, 0, 0, 1, 0], + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0], + ] + ) ).all() From 95ca757bc1db4e97884eaee2a1c013e55cf1f181 Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 22:26:49 +0100 Subject: [PATCH 3/4] Resolve comments --- bin/train.py | 128 --------------------------- 
experiments/03_openml_clf.py | 6 +- experiments/04_same_domain.py | 6 +- experiments/05_new_classes.py | 6 +- experiments/06_big_data.py | 6 +- liltab/data/datasets.py | 49 +++++----- liltab/train/trainer.py | 6 +- test/liltab/data/test_dataloaders.py | 8 +- test/liltab/data/test_datasets.py | 22 ++--- 9 files changed, 56 insertions(+), 181 deletions(-) delete mode 100644 bin/train.py diff --git a/bin/train.py b/bin/train.py deleted file mode 100644 index cfd5323..0000000 --- a/bin/train.py +++ /dev/null @@ -1,128 +0,0 @@ -import typer -import yaml -import pytorch_lightning as pl -import warnings - -from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset -from liltab.data.dataloaders import ( - FewShotDataLoader, - ComposedDataLoader, - RepeatableOutputComposedDataLoader, -) -from liltab.data.factory import ComposedDataLoaderFactory -from liltab.model.heterogenous_attributes_network import HeterogenousAttributesNetwork -from liltab.train.trainer import HeterogenousAttributesNetworkTrainer -from liltab.train.logger import TensorBoardLogger, FileLogger -from loguru import logger -from typing_extensions import Annotated -from pathlib import Path - -warnings.filterwarnings("ignore") -app = typer.Typer() - - -@app.command(help="Trains network on heterogenous attribute spaces.") -def main( - config_path: Annotated[Path, typer.Option(..., help="Path to experiment configuration.")], - logger_type: Annotated[ - str, - typer.Option( - ..., - help="""typer of logger. tb=[tensorboard], - flat=[flat file], both=[tensoboard and flat file]""", - ), - ] = "both", - use_profiler: Annotated[ - str, - typer.Option( - ..., - help="""""use profiler (take long time, 8-10 epoches suggested), - yes or no; requires tensorboard (logger-type=[tb|both])""", - ), - ] = "no", - seed: Annotated[int, typer.Option(..., help="Seed")] = 123, -): - pl.seed_everything(seed) - - logger.info("Loading config") - with open(config_path) as f: - config = yaml.load(f, Loader=yaml.CLoader) - - logger.info("Loading data") - train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["train_data_path"]), - RandomFeaturesPandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - ComposedDataLoader, - batch_size=config["batch_size"], - ) - val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["val_data_path"]), - PandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - RepeatableOutputComposedDataLoader, - batch_size=config["batch_size"], - ) - test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["test_data_path"]), - PandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - RepeatableOutputComposedDataLoader, - batch_size=config["batch_size"], - ) - - logger.info("Creating model") - model = HeterogenousAttributesNetwork( - hidden_representation_size=config["hidden_representation_size"], - n_hidden_layers=config["n_hidden_layers"], - hidden_size=config["hidden_size"], - dropout_rate=config["dropout_rate"], - ) - - if logger_type == "tb": - tb_logger = TensorBoardLogger( - "results/tensorboard", - name=config["name"], - use_profiler=True if use_profiler == "yes" else False, - ) - file_logger = None - elif logger_type == "flat": - tb_logger = None - file_logger = FileLogger("results/flat") - elif logger_type == "both": 
- tb_logger = TensorBoardLogger( - "results/tensorboard", - name=config["name"], - use_profiler=True if use_profiler == "yes" else False, - ) - file_logger = FileLogger("results/flat") - else: - raise ValueError("logger_type must from [tb, flat, both]") - - trainer = HeterogenousAttributesNetworkTrainer( - n_epochs=config["num_epochs"], - gradient_clipping=config["gradient_clipping"], - learning_rate=config["learning_rate"], - weight_decay=config["weight_decay"], - early_stopping_intervals=config["early_stopping_intervals"], - file_logger=file_logger, - tb_logger=tb_logger, - ) - - logger.info("Training model") - trainer.train_and_test( - model=model, - train_loader=train_loader, - val_loader=val_loader, - test_loader=test_loader, - ) - - -if __name__ == "__main__": - app() diff --git a/experiments/03_openml_clf.py b/experiments/03_openml_clf.py index a8b696e..349786a 100644 --- a/experiments/03_openml_clf.py +++ b/experiments/03_openml_clf.py @@ -34,7 +34,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -43,7 +43,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -52,7 +52,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/04_same_domain.py b/experiments/04_same_domain.py index ac16816..b48d976 100644 --- a/experiments/04_same_domain.py +++ b/experiments/04_same_domain.py @@ -30,7 +30,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -39,7 +39,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -48,7 +48,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/05_new_classes.py b/experiments/05_new_classes.py index 5b820ef..182a15c 100644 --- a/experiments/05_new_classes.py +++ b/experiments/05_new_classes.py @@ -31,7 +31,7 @@ def main(): train_loader = 
ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -40,7 +40,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -49,7 +49,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/06_big_data.py b/experiments/06_big_data.py index cfcc83a..5d49e42 100644 --- a/experiments/06_big_data.py +++ b/experiments/06_big_data.py @@ -31,7 +31,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -40,7 +40,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -49,7 +49,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index a65a8f4..4338ede 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -4,7 +4,6 @@ import torch from abc import ABC, abstractmethod -from itertools import product from pathlib import PosixPath from sklearn.preprocessing import OneHotEncoder from torch import Tensor @@ -26,9 +25,13 @@ def __init__( attribute_columns: list[str], response_columns: list[str], preprocess_data: bool, - encode_categorical_target: bool, + encode_categorical_response: bool, ): - if response_columns is not None and len(response_columns) > 1 and encode_categorical_target: + if ( + response_columns is not None + and len(response_columns) > 1 + and encode_categorical_response + ): raise ValueError("One-hot encoding is supported only for single target") self.data = data @@ -50,38 +53,36 @@ def __init__( self.n_attributes = len(self.attribute_columns) self.n_responses = len(self.response_columns) - self.encode_categorical_target = encode_categorical_target + self.encode_categorical_response = encode_categorical_response self.preprocess_data = preprocess_data if self.preprocess_data: self._preprocess_data() - if self.encode_categorical_target: - self._encode_categorical_target() + if 
self.encode_categorical_response: + self._encode_categorical_response() else: self.y = self.df[self.response_columns].values def _preprocess_data(self): """ - Standardizes data using z-score method. If encode_categorical_target = True - then response variable isn't scaled. + Performs following preprocessing: + * data imputation + * z-score scaling of numerical columns + * one-hot encoding categorical columns + If encode_categorical_response = True, then omits response column. """ self.preprocessing_pipeline = get_preprocessing_pipeline() df_preproc = self.preprocessing_pipeline.fit_transform(self.df[self.attribute_columns]) self.df = self.df.drop(columns=self.attribute_columns) self.df = pd.concat([df_preproc, self.df], axis=1) + self.attribute_columns = df_preproc.columns.values - attribute_columns_new = [] - for attr_col, frame_col in product(self.attribute_columns, self.df.columns): - if attr_col in frame_col: - attribute_columns_new.append(frame_col) - self.attribute_columns = np.array(attribute_columns_new) - - if not self.encode_categorical_target: + if not self.encode_categorical_response: self.df[self.response_columns] = StandardScaler().fit_transform( self.df[self.response_columns] ) - def _encode_categorical_target(self): + def _encode_categorical_response(self): """ Encodes categorical response using one-hot encoding. """ @@ -110,7 +111,7 @@ def __init__( attribute_columns: list[str] = None, response_columns: list[str] = None, preprocess_data: bool = True, - encode_categorical_target: bool = False, + encode_categorical_response: bool = False, ): """ Args: @@ -123,7 +124,7 @@ def __init__( preprocess_data (bool, optional): If true, then imputes data using mean strategy and standardizes using StandardScaler. Defaults to True. - encode_categorical_target(bool, optional): if True, then target column + encode_categorical_response(bool, optional): if True, then target column will be encoded using one-hot. Works only with single target variable. Default to False. """ @@ -131,7 +132,7 @@ def __init__( data=data, attribute_columns=attribute_columns, response_columns=response_columns, - encode_categorical_target=encode_categorical_target, + encode_categorical_response=encode_categorical_response, preprocess_data=preprocess_data, ) @@ -160,7 +161,7 @@ def __init__( response_columns: list[str] = None, total_random_feature_sampling: bool = False, preprocess_data: bool = True, - encode_categorical_target: bool = False, + encode_categorical_response: bool = False, persist_features_iter: int = 2, ): """ @@ -180,7 +181,7 @@ def __init__( preprocess_data(bool, optional): If true, then imputes data using mean strategy and standardizes using StandardScaler. Defaults to True. - encode_categorical_target(bool, optional): if True, then target column + encode_categorical_response(bool, optional): if True, then target column will be encoded using one-hot. When total_random_feature_sampling=True it should be False. Works only with single target variable. 
@@ -193,11 +194,11 @@ def __init__( data=data, attribute_columns=attribute_columns, response_columns=response_columns, - encode_categorical_target=encode_categorical_target, + encode_categorical_response=encode_categorical_response, preprocess_data=preprocess_data, ) if total_random_feature_sampling and ( - attribute_columns is not None or response_columns or encode_categorical_target + attribute_columns is not None or response_columns or encode_categorical_response ): raise ValueError( "total_random_feature_sampling doesn't support feature or encoding specification" @@ -233,7 +234,7 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: self.persist_features_counter -= 1 X = torch.from_numpy(self.df[self.attributes].to_numpy()).type(torch.float32) - if self.encode_categorical_target: + if self.encode_categorical_response: y = torch.from_numpy(self.y).type(torch.float32) else: y = torch.from_numpy(self.df[self.responses].to_numpy()).type(torch.float32) diff --git a/liltab/train/trainer.py b/liltab/train/trainer.py index 3ae5bdb..39716c2 100644 --- a/liltab/train/trainer.py +++ b/liltab/train/trainer.py @@ -26,7 +26,7 @@ def __init__( gradient_clipping: bool, learning_rate: float, weight_decay: float, - early_stopping_intervals: bool = 100, + early_stopping_intervals: int = 100, check_val_every_n_epoch: int = 100, loss: Callable = nn.MSELoss(), file_logger: bool = True, @@ -40,9 +40,9 @@ def __init__( gradient_clipping (bool): If true, then gradient clipping is applied learning_rate (float): learning rate during training. weight_decay (float): weight decay during training. - early_stopping_intervals (Optional, bool): if >0, then early stopping with + early_stopping_intervals (Optional, int): if >0, then early stopping with patience early_stopping_intervals*check_val_every_n_epoch epochs is applied. - check_val_every_n_epoch (Optional, bool): Specifies how often validation loss + check_val_every_n_epoch (Optional, int): Specifies how often validation loss is checked. Defaults to 100, loss (Callable): Loss used during training. Defaults to MSELoss(). 
file_logger (bool): if True, then file logger will write to diff --git a/test/liltab/data/test_dataloaders.py b/test/liltab/data/test_dataloaders.py index a7f1fbb..fff1ece 100644 --- a/test/liltab/data/test_dataloaders.py +++ b/test/liltab/data/test_dataloaders.py @@ -48,7 +48,7 @@ def test_few_shot_data_loader_samples_equally_when_set_size_divisible_by_nunique resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -62,7 +62,7 @@ def test_few_shot_data_loader_samples_equally_works_with_random_features( resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_target=True) + dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -76,7 +76,7 @@ def test_few_shot_data_loader_samples_equally_when_set_size_non_divisible_by_nun resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 11, 7, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -90,7 +90,7 @@ def test_few_shot_data_loader_samples_stratified( resources_path, ): frame_path = resources_path / "random_df_4.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 6, 12, n_episodes=10, sample_classes_stratified=True) for episode in dataloader: diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index 809bfb4..0cdda7f 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -102,14 +102,16 @@ def test_dataset_encodes_categorical_columns(): df["cat2"].astype("category") dataset = PandasDataset(df) - expected_attribute_columns = [ - "pipeline-2__int1", - "pipeline-1__cat1_A", - "pipeline-1__cat1_B", - "pipeline-1__cat1_C", - "pipeline-1__cat2_E", - "pipeline-1__cat2_F", - ] + expected_attribute_columns = np.array( + [ + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", + "pipeline-2__int1", + ] + ) assert (dataset.attribute_columns == expected_attribute_columns).all() assert ( @@ -139,7 +141,7 @@ def test_class_forbids_one_hot_with_multiple_targets(resources_path): feture_columns = df.columns[:-2] target_columns = df.columns[-2:] with pytest.raises(ValueError): - PandasDataset(frame_path, feture_columns, target_columns, encode_categorical_target=True) + PandasDataset(frame_path, feture_columns, target_columns, encode_categorical_response=True) def test_preprocessing_when_target_categorical(resources_path): @@ -148,7 +150,7 @@ def test_preprocessing_when_target_categorical(resources_path): expected_X = df.drop(columns=["class"]) expected_X = (expected_X - expected_X.mean(axis=0)) / expected_X.std(axis=0) - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) assert dataset.y.shape == 
(df.shape[0], df["class"].max()) assert_almost_equal(dataset.y.sum(axis=1).numpy(), np.ones(df.shape[0])) From 85bf931e33dd13a0deadc23242b757033e363f7d Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 22:30:54 +0100 Subject: [PATCH 4/4] Fix check --- .github/workflows/code_check.yml | 6 ------ makefile | 2 -- 2 files changed, 8 deletions(-) diff --git a/.github/workflows/code_check.yml b/.github/workflows/code_check.yml index d979e1c..8e3e275 100644 --- a/.github/workflows/code_check.yml +++ b/.github/workflows/code_check.yml @@ -27,18 +27,12 @@ jobs: - name: Check black test code run: black test --line-length=100 - - name: Check black bin code - run: black bin --line-length=100 - - name: Check flake8 source code run: flake8 liltab --max-line-length=100 - name: Check flake8 test code run: flake8 test --max-line-length=100 - - name: Check flake8 bin code - run: flake8 bin --max-line-length=100 - - name: Run test run: | export PYTHONPATH=`pwd` diff --git a/makefile b/makefile index 916991f..19506fc 100644 --- a/makefile +++ b/makefile @@ -6,8 +6,6 @@ prepare_code: flake8 liltab --max-line-length=100 black --line-length=100 test flake8 test --max-line-length=100 - black --line-length=100 bin - flake8 bin --max-line-length=100 run_tests: export PYTHONPATH=`pwd` && pytest
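
A short usage sketch of the API as it looks after these four patches, for orientation only — it is not part of the patch series. It assumes the patched liltab package is installed together with pandas, scikit-learn >= 1.2 (needed for `sparse_output` and `set_output`) and torch; the column names (`f1`, `f2`, `label`), the toy values and the `n_epochs` value are invented for illustration, while the trainer keyword arguments mirror the snippet shown in the README and the experiment scripts above.

```python
# Illustrative sketch only: assumes the liltab package with these patches applied.
import pandas as pd

from liltab.data.datasets import PandasDataset
from liltab.train.trainer import HeterogenousAttributesNetworkTrainer

# A pandas.DataFrame can now be passed directly instead of a path to a .csv file.
df = pd.DataFrame(
    {
        "f1": [0.1, 0.7, 0.3, 0.9],     # numeric attribute -> imputed + z-score scaled
        "f2": ["A", "B", "A", "B"],     # categorical attribute -> imputed + one-hot encoded
        "label": ["x", "y", "x", "y"],  # response, one-hot encoded via encode_categorical_response
    }
)
dataset = PandasDataset(df, encode_categorical_response=True)

# After preprocessing, attribute columns carry the column-transformer prefixes seen in the
# tests, e.g. "pipeline-1__f2_A" (categorical branch) and "pipeline-2__f1" (numeric branch).
print(dataset.attribute_columns)
X, y = dataset[[0, 1, 2, 3]]  # float32 tensors for attributes and the encoded response

# The trainer now takes early_stopping_intervals (patience counted in validation checks,
# i.e. early_stopping_intervals * check_val_every_n_epoch epochs) instead of a boolean flag.
trainer = HeterogenousAttributesNetworkTrainer(
    n_epochs=1000,                   # illustrative value; experiments read this from config
    gradient_clipping=False,
    learning_rate=1e-3,
    weight_decay=1e-4,
    early_stopping_intervals=100,
    check_val_every_n_epoch=100,
    file_logger=True,
    tb_logger=True,
    model_checkpoints=True,
)
```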