From 3422d75da739c519fc07a023cf89ad5a377a309f Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 21:12:18 +0100 Subject: [PATCH 1/4] Resolve issues --- README.md | 2 +- bin/train.py | 2 +- .../01_synthetic_data_experiment_config.yaml | 2 +- config/02_openml_data_experiment_config.yaml | 2 +- .../03_openml_clf_data_experiment_config.yaml | 2 +- config/04_same_domain_experiment_config.yaml | 2 +- config/05_new_classes_experiment_config.yaml | 2 +- config/06_big_data_experiment_config.yaml | 2 +- experiments/01_synthetic.py | 2 +- experiments/02_openml.py | 2 +- experiments/03_openml_clf.py | 2 +- experiments/04_same_domain.py | 2 +- experiments/05_new_classes.py | 2 +- experiments/06_big_data.py | 2 +- liltab/data/datasets.py | 56 +++++++++------ liltab/data/preprocessing.py | 41 +++++++++-- liltab/train/trainer.py | 15 ++-- test/conftest.py | 2 +- test/liltab/data/test_datasets.py | 72 ++++++++++++++++++- 19 files changed, 162 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 0b0a2b7..afc2c55 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ HeterogenousAttributesNetworkTrainer( gradient_clipping=False, learning_rate=1e-3, weight_decay=1e-4, - early_stopping=True, + early_stopping_intervals=100, file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/bin/train.py b/bin/train.py index e7dd007..cfd5323 100644 --- a/bin/train.py +++ b/bin/train.py @@ -110,7 +110,7 @@ def main( gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=file_logger, tb_logger=tb_logger, ) diff --git a/config/01_synthetic_data_experiment_config.yaml b/config/01_synthetic_data_experiment_config.yaml index aab4b13..8323ef1 100644 --- a/config/01_synthetic_data_experiment_config.yaml +++ b/config/01_synthetic_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 256 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 5 query_size: 27 diff --git a/config/02_openml_data_experiment_config.yaml b/config/02_openml_data_experiment_config.yaml index e73aa3b..58a586f 100644 --- a/config/02_openml_data_experiment_config.yaml +++ b/config/02_openml_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0.0 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/03_openml_clf_data_experiment_config.yaml b/config/03_openml_clf_data_experiment_config.yaml index 5af8f52..5020416 100644 --- a/config/03_openml_clf_data_experiment_config.yaml +++ b/config/03_openml_clf_data_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/04_same_domain_experiment_config.yaml b/config/04_same_domain_experiment_config.yaml index 9040ce4..66488aa 100644 --- a/config/04_same_domain_experiment_config.yaml +++ b/config/04_same_domain_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.001 weight_decay: 0.0001 batch_size: 37 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/05_new_classes_experiment_config.yaml b/config/05_new_classes_experiment_config.yaml index d965f81..2c488c1 
100644 --- a/config/05_new_classes_experiment_config.yaml +++ b/config/05_new_classes_experiment_config.yaml @@ -4,7 +4,7 @@ learning_rate: 0.0001 weight_decay: 0 batch_size: 16 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/config/06_big_data_experiment_config.yaml b/config/06_big_data_experiment_config.yaml index c95d5a4..a4daae2 100644 --- a/config/06_big_data_experiment_config.yaml +++ b/config/06_big_data_experiment_config.yaml @@ -5,7 +5,7 @@ learning_rate: 0.001 weight_decay: 0 batch_size: 16 gradient_clipping: False -early_stopping: True +early_stopping_intervals: 100 support_size: 3 query_size: 29 diff --git a/experiments/01_synthetic.py b/experiments/01_synthetic.py index b196d2f..b0f32b2 100644 --- a/experiments/01_synthetic.py +++ b/experiments/01_synthetic.py @@ -71,7 +71,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/02_openml.py b/experiments/02_openml.py index c6d7567..794eddf 100644 --- a/experiments/02_openml.py +++ b/experiments/02_openml.py @@ -74,7 +74,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/03_openml_clf.py b/experiments/03_openml_clf.py index 8ff9fa0..a8b696e 100644 --- a/experiments/03_openml_clf.py +++ b/experiments/03_openml_clf.py @@ -75,7 +75,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/experiments/04_same_domain.py b/experiments/04_same_domain.py index 07e43f6..ac16816 100644 --- a/experiments/04_same_domain.py +++ b/experiments/04_same_domain.py @@ -71,7 +71,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], file_logger=True, tb_logger=True, model_checkpoints=True, diff --git a/experiments/05_new_classes.py b/experiments/05_new_classes.py index 99aed68..5b820ef 100644 --- a/experiments/05_new_classes.py +++ b/experiments/05_new_classes.py @@ -72,7 +72,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/experiments/06_big_data.py b/experiments/06_big_data.py index 1364bb0..cfcc83a 100644 --- a/experiments/06_big_data.py +++ b/experiments/06_big_data.py @@ -72,7 +72,7 @@ def main(): gradient_clipping=config["gradient_clipping"], learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], - early_stopping=config["early_stopping"], + 
early_stopping_intervals=config["early_stopping_intervals"], loss=nn.CrossEntropyLoss(), file_logger=True, tb_logger=True, diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index 2ded337..d654d7c 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -1,11 +1,14 @@ import numpy as np import pandas as pd +from sklearn.discriminant_analysis import StandardScaler import torch from abc import ABC, abstractmethod -from pathlib import Path +from itertools import product +from pathlib import PosixPath from sklearn.preprocessing import OneHotEncoder from torch import Tensor +from typing import Union from .preprocessing import get_preprocessing_pipeline @@ -19,7 +22,7 @@ class Dataset(ABC): def __init__( self, - data_path: str, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str], response_columns: list[str], preprocess_data: bool, @@ -32,8 +35,16 @@ def __init__( ): raise ValueError("One-hot encoding is supported only for single target") - self.data_path = data_path - self.df = pd.read_csv(data_path) + self.data = data + if type(data) in [str, PosixPath]: + self.df = pd.read_csv(data) + elif type(data) == pd.DataFrame: + self.df = data + else: + raise ValueError( + f"Data should be PosixPath, " + f"str or pandas.DataFrame but is {type(data)}" + ) self.attribute_columns = np.array( attribute_columns @@ -64,23 +75,24 @@ def _preprocess_data(self): then response variable isn't scaled. """ self.preprocessing_pipeline = get_preprocessing_pipeline() - if self.encode_categorical_target: - self.df.loc[ - :, self.attribute_columns - ] = self.preprocessing_pipeline.fit_transform( - self.df[self.attribute_columns] - ) - else: - self.df = pd.DataFrame( - self.preprocessing_pipeline.fit_transform(self.df), - columns=self.df.columns, - ) + df_preproc = self.preprocessing_pipeline.fit_transform(self.df[self.attribute_columns]) + self.df = self.df.drop(columns=self.attribute_columns) + self.df = pd.concat([df_preproc, self.df], axis=1) + + attribute_columns_new = [] + for attr_col, frame_col in product(self.attribute_columns, self.df.columns): + if attr_col in frame_col: + attribute_columns_new.append(frame_col) + self.attribute_columns = np.array(attribute_columns_new) + + if not self.encode_categorical_target: + self.df[self.response_columns] = StandardScaler().fit_transform(self.df[self.response_columns]) def _encode_categorical_target(self): """ Encodes categorical response using one-hot encoding. """ - self.one_hot_encoder = OneHotEncoder(sparse=False) + self.one_hot_encoder = OneHotEncoder(sparse_output=False) self.raw_y = self.df[self.response_columns] self.y = self.one_hot_encoder.fit_transform((self.df[self.response_columns])) @@ -101,7 +113,7 @@ class PandasDataset(Dataset): def __init__( self, - data_path: Path, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str] = None, response_columns: list[str] = None, preprocess_data: bool = True, @@ -109,7 +121,7 @@ def __init__( ): """ Args: - data_path (Path): Path to data to be loaded + data (Union[PosixPath, str, pd.DataFrame]): Frame with data or path to .csv file. attribute_columns (list[str], optional): Columns from frame which will be used as attributes. Defaults to all columns without last. @@ -123,7 +135,7 @@ def __init__( Default to False. 
""" super().__init__( - data_path=data_path, + data=data, attribute_columns=attribute_columns, response_columns=response_columns, encode_categorical_target=encode_categorical_target, @@ -152,7 +164,7 @@ class RandomFeaturesPandasDataset(Dataset): def __init__( self, - data_path: Path, + data: Union[PosixPath, str, pd.DataFrame], attribute_columns: list[str] = None, response_columns: list[str] = None, total_random_feature_sampling: bool = False, @@ -162,7 +174,7 @@ def __init__( ): """ Args: - data_path (Path): Path to data to be loaded + data (Union[PosixPath, str, pd.DataFrame]): Frame with data or path to .csv file. attribute_columns (list[str], optional): Columns from frame which will be attributes sampled from. Ignored when total_random_feature_sampling = True. @@ -187,7 +199,7 @@ def __init__( Defaults to 2. """ super().__init__( - data_path=data_path, + data=data, attribute_columns=attribute_columns, response_columns=response_columns, encode_categorical_target=encode_categorical_target, diff --git a/liltab/data/preprocessing.py b/liltab/data/preprocessing.py index dacf89e..b877fab 100644 --- a/liltab/data/preprocessing.py +++ b/liltab/data/preprocessing.py @@ -1,15 +1,46 @@ -from sklearn.pipeline import make_pipeline +import numpy as np + +from sklearn.compose import make_column_selector, make_column_transformer from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OneHotEncoder def get_preprocessing_pipeline() -> Pipeline: """ Returns preprocessing pipeline composed with: - 1. Simple imputer with mean strategy - 2. StandardScaler + 1. Simple imputer with mean/most_frequent strategy depending on data type. + 2. StandardScaler/One-hot encoding depending on data type Returns: Pipeline: Pipeline with preprocessing steps. """ - return make_pipeline(SimpleImputer(strategy="mean"), StandardScaler()) + cat_pipeline = Pipeline( + [ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("one-hot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")), + ] + ).set_output(transform="pandas") + + num_pipeline = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean")), + ("scaler", StandardScaler()), + ] + ).set_output(transform="pandas") + + pipeline = Pipeline( + [ + ( + "transformers", + make_column_transformer( + ( + cat_pipeline, + make_column_selector(dtype_include=("object", "category")), + ), + (num_pipeline, make_column_selector(dtype_include=np.number)), + ), + ) + ] + ).set_output(transform="pandas") + + return pipeline diff --git a/liltab/train/trainer.py b/liltab/train/trainer.py index 6ec8529..3ae5bdb 100644 --- a/liltab/train/trainer.py +++ b/liltab/train/trainer.py @@ -26,7 +26,8 @@ def __init__( gradient_clipping: bool, learning_rate: float, weight_decay: float, - early_stopping: bool = False, + early_stopping_intervals: bool = 100, + check_val_every_n_epoch: int = 100, loss: Callable = nn.MSELoss(), file_logger: bool = True, tb_logger: bool = True, @@ -39,8 +40,10 @@ def __init__( gradient_clipping (bool): If true, then gradient clipping is applied learning_rate (float): learning rate during training. weight_decay (float): weight decay during training. - early_stopping (Optional, bool): if True, then early stopping with - patience n_epochs // 10 is applied. Defaults to False. + early_stopping_intervals (Optional, bool): if >0, then early stopping with + patience early_stopping_intervals*check_val_every_n_epoch epochs is applied. 
+ check_val_every_n_epoch (Optional, bool): Specifies how often validation loss + is checked. Defaults to 100, loss (Callable): Loss used during training. Defaults to MSELoss(). file_logger (bool): if True, then file logger will write to {results_path} directory @@ -83,11 +86,11 @@ def __init__( ) callbacks.append(loggers_callback) - if early_stopping: + if early_stopping_intervals > 0: early_stopping = EarlyStopping( monitor="val_loss", mode="min", - patience=100, + patience=early_stopping_intervals, min_delta=1e-3, ) callbacks.append(early_stopping) @@ -103,8 +106,6 @@ def __init__( ) callbacks.append(model_checkpoints_callback) - check_val_every_n_epoch = n_epochs // 1000 if n_epochs > 1000 else 1 - self.trainer = pl.Trainer( max_epochs=n_epochs, gradient_clip_val=1 if gradient_clipping else 0, diff --git a/test/conftest.py b/test/conftest.py index 5795f0a..675475a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -51,7 +51,7 @@ def forward(self, X_query: Tensor): def pytest_sessionfinish(session, exitstatus): - rmtree(Path("test") / "results") + rmtree(Path("test") / "results", ignore_errors=True) def pytest_sessionstart(session): diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index cba6cdf..bd39b68 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -7,6 +7,28 @@ from torch import Tensor, float32 +def test_dataset_works_when_path_given(resources_path): + frame_path = resources_path / "random_df_1.csv" + + dataset = PandasDataset(frame_path) + + assert dataset.df is not None + + +def test_dataset_works_dataframe_given(resources_path): + frame_path = resources_path / "random_df_1.csv" + df = pd.read_csv(frame_path) + + dataset = PandasDataset(df) + + assert dataset.df is not None + + +def test_dataset_raises_error_with_incorrect_data(): + with pytest.raises(ValueError): + PandasDataset(1) + + def test_dataset_initializes_default_columns(resources_path): frame_path = resources_path / "random_df_1.csv" df = pd.read_csv(frame_path) @@ -14,7 +36,12 @@ def test_dataset_initializes_default_columns(resources_path): dataset = PandasDataset(frame_path) - assert (dataset.attribute_columns == frame_columns[:-1]).all() + assert (dataset.attribute_columns == [ + 'pipeline-2__col_1', + 'pipeline-2__col_2', + 'pipeline-2__col_3', + 'pipeline-2__col_4' + ]).all() assert (dataset.response_columns == [frame_columns[-1]]).all() @@ -29,7 +56,10 @@ def test_dataset_assigns_non_default_columns(resources_path): response_columns=frame_columns[4:], ) - assert (dataset.attribute_columns == frame_columns[1:3]).all() + assert (dataset.attribute_columns == [ + 'pipeline-2__col_2', + 'pipeline-2__col_3' + ]).all() assert (dataset.response_columns == frame_columns[4:]).all() @@ -59,13 +89,49 @@ def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path) actual_X, actual_y = dataset[index] assert_almost_equal( - actual_X.numpy(), expected_records[dataset.attribute_columns].values, decimal=2 + actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2 ) assert_almost_equal( actual_y.numpy(), expected_records[dataset.response_columns].values, decimal=2 ) +def test_dataset_encodes_categorical_columns(): + df = df = pd.DataFrame(data=[ + [1, "A", "E", .1], + [3, "B", "E", .5], + [3, "A", "F", .4], + [1, "C", "E", .3], + ], columns=["int1", "cat1", "cat2", "target"]) + df["cat2"].astype("category") + dataset = PandasDataset(df) + + expected_attribute_columns = [ + 'pipeline-2__int1', + 'pipeline-1__cat1_A', + 
'pipeline-1__cat1_B', + 'pipeline-1__cat1_C', + 'pipeline-1__cat2_E', + 'pipeline-1__cat2_F' + ] + + assert (dataset.attribute_columns == expected_attribute_columns).all() + assert ( + dataset.df[[ + 'pipeline-1__cat1_A', + 'pipeline-1__cat1_B', + 'pipeline-1__cat1_C', + 'pipeline-1__cat2_E', + 'pipeline-1__cat2_F' + ]].values == np.array([ + [1, 0, 0, 1, 0], + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0], + ]) + ).all() + + def test_class_forbids_one_hot_with_multiple_targets(resources_path): frame_path = resources_path / "random_df_1.csv" df = pd.read_csv(frame_path) From 4e550b270f9b036461ae9d0583ecdda819f0a86e Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 21:15:47 +0100 Subject: [PATCH 2/4] Formatting --- liltab/data/datasets.py | 37 +++++---------- test/liltab/data/test_datasets.py | 75 ++++++++++++++++--------------- 2 files changed, 49 insertions(+), 63 deletions(-) diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index d654d7c..a65a8f4 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -28,11 +28,7 @@ def __init__( preprocess_data: bool, encode_categorical_target: bool, ): - if ( - response_columns is not None - and len(response_columns) > 1 - and encode_categorical_target - ): + if response_columns is not None and len(response_columns) > 1 and encode_categorical_target: raise ValueError("One-hot encoding is supported only for single target") self.data = data @@ -42,19 +38,14 @@ def __init__( self.df = data else: raise ValueError( - f"Data should be PosixPath, " - f"str or pandas.DataFrame but is {type(data)}" + f"Data should be PosixPath, " f"str or pandas.DataFrame but is {type(data)}" ) self.attribute_columns = np.array( - attribute_columns - if attribute_columns is not None - else self.df.columns.tolist()[:-1] + attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1] ) self.response_columns = np.array( - response_columns - if response_columns is not None - else [self.df.columns.tolist()[-1]] + response_columns if response_columns is not None else [self.df.columns.tolist()[-1]] ) self.n_attributes = len(self.attribute_columns) self.n_responses = len(self.response_columns) @@ -86,7 +77,9 @@ def _preprocess_data(self): self.attribute_columns = np.array(attribute_columns_new) if not self.encode_categorical_target: - self.df[self.response_columns] = StandardScaler().fit_transform(self.df[self.response_columns]) + self.df[self.response_columns] = StandardScaler().fit_transform( + self.df[self.response_columns] + ) def _encode_categorical_target(self): """ @@ -142,9 +135,7 @@ def __init__( preprocess_data=preprocess_data, ) - self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type( - torch.float32 - ) + self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32) self.y = torch.from_numpy(self.y).type(torch.float32) def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: @@ -206,9 +197,7 @@ def __init__( preprocess_data=preprocess_data, ) if total_random_feature_sampling and ( - attribute_columns is not None - or response_columns - or encode_categorical_target + attribute_columns is not None or response_columns or encode_categorical_target ): raise ValueError( "total_random_feature_sampling doesn't support feature or encoding specification" @@ -254,12 +243,8 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: def _get_features_from_selected_columns(self) -> tuple[int, int]: attributes_size = np.random.randint(low=1, 
high=self.n_attributes + 1) responses_size = np.random.randint(low=1, high=self.n_responses + 1) - attributes_idx = np.random.choice( - len(self.attribute_columns), attributes_size - ).tolist() - responses_idx = np.random.choice( - len(self.response_columns), responses_size - ).tolist() + attributes_idx = np.random.choice(len(self.attribute_columns), attributes_size).tolist() + responses_idx = np.random.choice(len(self.response_columns), responses_size).tolist() return attributes_idx, responses_idx diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index bd39b68..809bfb4 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -36,12 +36,10 @@ def test_dataset_initializes_default_columns(resources_path): dataset = PandasDataset(frame_path) - assert (dataset.attribute_columns == [ - 'pipeline-2__col_1', - 'pipeline-2__col_2', - 'pipeline-2__col_3', - 'pipeline-2__col_4' - ]).all() + assert ( + dataset.attribute_columns + == ["pipeline-2__col_1", "pipeline-2__col_2", "pipeline-2__col_3", "pipeline-2__col_4"] + ).all() assert (dataset.response_columns == [frame_columns[-1]]).all() @@ -56,10 +54,7 @@ def test_dataset_assigns_non_default_columns(resources_path): response_columns=frame_columns[4:], ) - assert (dataset.attribute_columns == [ - 'pipeline-2__col_2', - 'pipeline-2__col_3' - ]).all() + assert (dataset.attribute_columns == ["pipeline-2__col_2", "pipeline-2__col_3"]).all() assert (dataset.response_columns == frame_columns[4:]).all() @@ -88,47 +83,53 @@ def test_indexing_dataset_returns_proper_data_with_preprocessing(resources_path) expected_records = df.loc[index] actual_X, actual_y = dataset[index] - assert_almost_equal( - actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2 - ) + assert_almost_equal(actual_X.numpy(), expected_records.iloc[:, :4].values, decimal=2) assert_almost_equal( actual_y.numpy(), expected_records[dataset.response_columns].values, decimal=2 ) def test_dataset_encodes_categorical_columns(): - df = df = pd.DataFrame(data=[ - [1, "A", "E", .1], - [3, "B", "E", .5], - [3, "A", "F", .4], - [1, "C", "E", .3], - ], columns=["int1", "cat1", "cat2", "target"]) + df = df = pd.DataFrame( + data=[ + [1, "A", "E", 0.1], + [3, "B", "E", 0.5], + [3, "A", "F", 0.4], + [1, "C", "E", 0.3], + ], + columns=["int1", "cat1", "cat2", "target"], + ) df["cat2"].astype("category") dataset = PandasDataset(df) expected_attribute_columns = [ - 'pipeline-2__int1', - 'pipeline-1__cat1_A', - 'pipeline-1__cat1_B', - 'pipeline-1__cat1_C', - 'pipeline-1__cat2_E', - 'pipeline-1__cat2_F' + "pipeline-2__int1", + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", ] assert (dataset.attribute_columns == expected_attribute_columns).all() assert ( - dataset.df[[ - 'pipeline-1__cat1_A', - 'pipeline-1__cat1_B', - 'pipeline-1__cat1_C', - 'pipeline-1__cat2_E', - 'pipeline-1__cat2_F' - ]].values == np.array([ - [1, 0, 0, 1, 0], - [0, 1, 0, 1, 0], - [1, 0, 0, 0, 1], - [0, 0, 1, 1, 0], - ]) + dataset.df[ + [ + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", + ] + ].values + == np.array( + [ + [1, 0, 0, 1, 0], + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0], + ] + ) ).all() From 95ca757bc1db4e97884eaee2a1c013e55cf1f181 Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 22:26:49 +0100 Subject: [PATCH 3/4] Resolve comments --- bin/train.py | 128 --------------------------- 
experiments/03_openml_clf.py | 6 +- experiments/04_same_domain.py | 6 +- experiments/05_new_classes.py | 6 +- experiments/06_big_data.py | 6 +- liltab/data/datasets.py | 49 +++++----- liltab/train/trainer.py | 6 +- test/liltab/data/test_dataloaders.py | 8 +- test/liltab/data/test_datasets.py | 22 ++--- 9 files changed, 56 insertions(+), 181 deletions(-) delete mode 100644 bin/train.py diff --git a/bin/train.py b/bin/train.py deleted file mode 100644 index cfd5323..0000000 --- a/bin/train.py +++ /dev/null @@ -1,128 +0,0 @@ -import typer -import yaml -import pytorch_lightning as pl -import warnings - -from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset -from liltab.data.dataloaders import ( - FewShotDataLoader, - ComposedDataLoader, - RepeatableOutputComposedDataLoader, -) -from liltab.data.factory import ComposedDataLoaderFactory -from liltab.model.heterogenous_attributes_network import HeterogenousAttributesNetwork -from liltab.train.trainer import HeterogenousAttributesNetworkTrainer -from liltab.train.logger import TensorBoardLogger, FileLogger -from loguru import logger -from typing_extensions import Annotated -from pathlib import Path - -warnings.filterwarnings("ignore") -app = typer.Typer() - - -@app.command(help="Trains network on heterogenous attribute spaces.") -def main( - config_path: Annotated[Path, typer.Option(..., help="Path to experiment configuration.")], - logger_type: Annotated[ - str, - typer.Option( - ..., - help="""typer of logger. tb=[tensorboard], - flat=[flat file], both=[tensoboard and flat file]""", - ), - ] = "both", - use_profiler: Annotated[ - str, - typer.Option( - ..., - help="""""use profiler (take long time, 8-10 epoches suggested), - yes or no; requires tensorboard (logger-type=[tb|both])""", - ), - ] = "no", - seed: Annotated[int, typer.Option(..., help="Seed")] = 123, -): - pl.seed_everything(seed) - - logger.info("Loading config") - with open(config_path) as f: - config = yaml.load(f, Loader=yaml.CLoader) - - logger.info("Loading data") - train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["train_data_path"]), - RandomFeaturesPandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - ComposedDataLoader, - batch_size=config["batch_size"], - ) - val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["val_data_path"]), - PandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - RepeatableOutputComposedDataLoader, - batch_size=config["batch_size"], - ) - test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( - Path(config["test_data_path"]), - PandasDataset, - {}, - FewShotDataLoader, - {"support_size": config["support_size"], "query_size": config["query_size"]}, - RepeatableOutputComposedDataLoader, - batch_size=config["batch_size"], - ) - - logger.info("Creating model") - model = HeterogenousAttributesNetwork( - hidden_representation_size=config["hidden_representation_size"], - n_hidden_layers=config["n_hidden_layers"], - hidden_size=config["hidden_size"], - dropout_rate=config["dropout_rate"], - ) - - if logger_type == "tb": - tb_logger = TensorBoardLogger( - "results/tensorboard", - name=config["name"], - use_profiler=True if use_profiler == "yes" else False, - ) - file_logger = None - elif logger_type == "flat": - tb_logger = None - file_logger = FileLogger("results/flat") - elif logger_type == "both": 
- tb_logger = TensorBoardLogger( - "results/tensorboard", - name=config["name"], - use_profiler=True if use_profiler == "yes" else False, - ) - file_logger = FileLogger("results/flat") - else: - raise ValueError("logger_type must from [tb, flat, both]") - - trainer = HeterogenousAttributesNetworkTrainer( - n_epochs=config["num_epochs"], - gradient_clipping=config["gradient_clipping"], - learning_rate=config["learning_rate"], - weight_decay=config["weight_decay"], - early_stopping_intervals=config["early_stopping_intervals"], - file_logger=file_logger, - tb_logger=tb_logger, - ) - - logger.info("Training model") - trainer.train_and_test( - model=model, - train_loader=train_loader, - val_loader=val_loader, - test_loader=test_loader, - ) - - -if __name__ == "__main__": - app() diff --git a/experiments/03_openml_clf.py b/experiments/03_openml_clf.py index a8b696e..349786a 100644 --- a/experiments/03_openml_clf.py +++ b/experiments/03_openml_clf.py @@ -34,7 +34,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -43,7 +43,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -52,7 +52,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/04_same_domain.py b/experiments/04_same_domain.py index ac16816..b48d976 100644 --- a/experiments/04_same_domain.py +++ b/experiments/04_same_domain.py @@ -30,7 +30,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -39,7 +39,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -48,7 +48,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/05_new_classes.py b/experiments/05_new_classes.py index 5b820ef..182a15c 100644 --- a/experiments/05_new_classes.py +++ b/experiments/05_new_classes.py @@ -31,7 +31,7 @@ def main(): train_loader = 
ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -40,7 +40,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -49,7 +49,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/experiments/06_big_data.py b/experiments/06_big_data.py index cfcc83a..5d49e42 100644 --- a/experiments/06_big_data.py +++ b/experiments/06_big_data.py @@ -31,7 +31,7 @@ def main(): train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["train_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, ComposedDataLoader, @@ -40,7 +40,7 @@ def main(): val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["val_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, @@ -49,7 +49,7 @@ def main(): test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path( Path(config["test_data_path"]), PandasDataset, - {"encode_categorical_target": True}, + {"encode_categorical_response": True}, FewShotDataLoader, {"support_size": config["support_size"], "query_size": config["query_size"]}, RepeatableOutputComposedDataLoader, diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index a65a8f4..4338ede 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -4,7 +4,6 @@ import torch from abc import ABC, abstractmethod -from itertools import product from pathlib import PosixPath from sklearn.preprocessing import OneHotEncoder from torch import Tensor @@ -26,9 +25,13 @@ def __init__( attribute_columns: list[str], response_columns: list[str], preprocess_data: bool, - encode_categorical_target: bool, + encode_categorical_response: bool, ): - if response_columns is not None and len(response_columns) > 1 and encode_categorical_target: + if ( + response_columns is not None + and len(response_columns) > 1 + and encode_categorical_response + ): raise ValueError("One-hot encoding is supported only for single target") self.data = data @@ -50,38 +53,36 @@ def __init__( self.n_attributes = len(self.attribute_columns) self.n_responses = len(self.response_columns) - self.encode_categorical_target = encode_categorical_target + self.encode_categorical_response = encode_categorical_response self.preprocess_data = preprocess_data if self.preprocess_data: self._preprocess_data() - if self.encode_categorical_target: - self._encode_categorical_target() + if 
self.encode_categorical_response: + self._encode_categorical_response() else: self.y = self.df[self.response_columns].values def _preprocess_data(self): """ - Standardizes data using z-score method. If encode_categorical_target = True - then response variable isn't scaled. + Performs following preprocessing: + * data imputation + * z-score scaling of numerical columns + * one-hot encoding categorical columns + If encode_categorical_response = True, then omits response column. """ self.preprocessing_pipeline = get_preprocessing_pipeline() df_preproc = self.preprocessing_pipeline.fit_transform(self.df[self.attribute_columns]) self.df = self.df.drop(columns=self.attribute_columns) self.df = pd.concat([df_preproc, self.df], axis=1) + self.attribute_columns = df_preproc.columns.values - attribute_columns_new = [] - for attr_col, frame_col in product(self.attribute_columns, self.df.columns): - if attr_col in frame_col: - attribute_columns_new.append(frame_col) - self.attribute_columns = np.array(attribute_columns_new) - - if not self.encode_categorical_target: + if not self.encode_categorical_response: self.df[self.response_columns] = StandardScaler().fit_transform( self.df[self.response_columns] ) - def _encode_categorical_target(self): + def _encode_categorical_response(self): """ Encodes categorical response using one-hot encoding. """ @@ -110,7 +111,7 @@ def __init__( attribute_columns: list[str] = None, response_columns: list[str] = None, preprocess_data: bool = True, - encode_categorical_target: bool = False, + encode_categorical_response: bool = False, ): """ Args: @@ -123,7 +124,7 @@ def __init__( preprocess_data (bool, optional): If true, then imputes data using mean strategy and standardizes using StandardScaler. Defaults to True. - encode_categorical_target(bool, optional): if True, then target column + encode_categorical_response(bool, optional): if True, then target column will be encoded using one-hot. Works only with single target variable. Default to False. """ @@ -131,7 +132,7 @@ def __init__( data=data, attribute_columns=attribute_columns, response_columns=response_columns, - encode_categorical_target=encode_categorical_target, + encode_categorical_response=encode_categorical_response, preprocess_data=preprocess_data, ) @@ -160,7 +161,7 @@ def __init__( response_columns: list[str] = None, total_random_feature_sampling: bool = False, preprocess_data: bool = True, - encode_categorical_target: bool = False, + encode_categorical_response: bool = False, persist_features_iter: int = 2, ): """ @@ -180,7 +181,7 @@ def __init__( preprocess_data(bool, optional): If true, then imputes data using mean strategy and standardizes using StandardScaler. Defaults to True. - encode_categorical_target(bool, optional): if True, then target column + encode_categorical_response(bool, optional): if True, then target column will be encoded using one-hot. When total_random_feature_sampling=True it should be False. Works only with single target variable. 
@@ -193,11 +194,11 @@ def __init__( data=data, attribute_columns=attribute_columns, response_columns=response_columns, - encode_categorical_target=encode_categorical_target, + encode_categorical_response=encode_categorical_response, preprocess_data=preprocess_data, ) if total_random_feature_sampling and ( - attribute_columns is not None or response_columns or encode_categorical_target + attribute_columns is not None or response_columns or encode_categorical_response ): raise ValueError( "total_random_feature_sampling doesn't support feature or encoding specification" @@ -233,7 +234,7 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: self.persist_features_counter -= 1 X = torch.from_numpy(self.df[self.attributes].to_numpy()).type(torch.float32) - if self.encode_categorical_target: + if self.encode_categorical_response: y = torch.from_numpy(self.y).type(torch.float32) else: y = torch.from_numpy(self.df[self.responses].to_numpy()).type(torch.float32) diff --git a/liltab/train/trainer.py b/liltab/train/trainer.py index 3ae5bdb..39716c2 100644 --- a/liltab/train/trainer.py +++ b/liltab/train/trainer.py @@ -26,7 +26,7 @@ def __init__( gradient_clipping: bool, learning_rate: float, weight_decay: float, - early_stopping_intervals: bool = 100, + early_stopping_intervals: int = 100, check_val_every_n_epoch: int = 100, loss: Callable = nn.MSELoss(), file_logger: bool = True, @@ -40,9 +40,9 @@ def __init__( gradient_clipping (bool): If true, then gradient clipping is applied learning_rate (float): learning rate during training. weight_decay (float): weight decay during training. - early_stopping_intervals (Optional, bool): if >0, then early stopping with + early_stopping_intervals (Optional, int): if >0, then early stopping with patience early_stopping_intervals*check_val_every_n_epoch epochs is applied. - check_val_every_n_epoch (Optional, bool): Specifies how often validation loss + check_val_every_n_epoch (Optional, int): Specifies how often validation loss is checked. Defaults to 100, loss (Callable): Loss used during training. Defaults to MSELoss(). 
file_logger (bool): if True, then file logger will write to diff --git a/test/liltab/data/test_dataloaders.py b/test/liltab/data/test_dataloaders.py index a7f1fbb..fff1ece 100644 --- a/test/liltab/data/test_dataloaders.py +++ b/test/liltab/data/test_dataloaders.py @@ -48,7 +48,7 @@ def test_few_shot_data_loader_samples_equally_when_set_size_divisible_by_nunique resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -62,7 +62,7 @@ def test_few_shot_data_loader_samples_equally_works_with_random_features( resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_target=True) + dataset = RandomFeaturesPandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 9, 6, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -76,7 +76,7 @@ def test_few_shot_data_loader_samples_equally_when_set_size_non_divisible_by_nun resources_path, ): frame_path = resources_path / "random_df_3.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 11, 7, n_episodes=10, sample_classes_equally=True) for episode in dataloader: @@ -90,7 +90,7 @@ def test_few_shot_data_loader_samples_stratified( resources_path, ): frame_path = resources_path / "random_df_4.csv" - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) dataloader = FewShotDataLoader(dataset, 6, 12, n_episodes=10, sample_classes_stratified=True) for episode in dataloader: diff --git a/test/liltab/data/test_datasets.py b/test/liltab/data/test_datasets.py index 809bfb4..0cdda7f 100644 --- a/test/liltab/data/test_datasets.py +++ b/test/liltab/data/test_datasets.py @@ -102,14 +102,16 @@ def test_dataset_encodes_categorical_columns(): df["cat2"].astype("category") dataset = PandasDataset(df) - expected_attribute_columns = [ - "pipeline-2__int1", - "pipeline-1__cat1_A", - "pipeline-1__cat1_B", - "pipeline-1__cat1_C", - "pipeline-1__cat2_E", - "pipeline-1__cat2_F", - ] + expected_attribute_columns = np.array( + [ + "pipeline-1__cat1_A", + "pipeline-1__cat1_B", + "pipeline-1__cat1_C", + "pipeline-1__cat2_E", + "pipeline-1__cat2_F", + "pipeline-2__int1", + ] + ) assert (dataset.attribute_columns == expected_attribute_columns).all() assert ( @@ -139,7 +141,7 @@ def test_class_forbids_one_hot_with_multiple_targets(resources_path): feture_columns = df.columns[:-2] target_columns = df.columns[-2:] with pytest.raises(ValueError): - PandasDataset(frame_path, feture_columns, target_columns, encode_categorical_target=True) + PandasDataset(frame_path, feture_columns, target_columns, encode_categorical_response=True) def test_preprocessing_when_target_categorical(resources_path): @@ -148,7 +150,7 @@ def test_preprocessing_when_target_categorical(resources_path): expected_X = df.drop(columns=["class"]) expected_X = (expected_X - expected_X.mean(axis=0)) / expected_X.std(axis=0) - dataset = PandasDataset(frame_path, encode_categorical_target=True) + dataset = PandasDataset(frame_path, encode_categorical_response=True) assert dataset.y.shape == 
(df.shape[0], df["class"].max()) assert_almost_equal(dataset.y.sum(axis=1).numpy(), np.ones(df.shape[0])) From 85bf931e33dd13a0deadc23242b757033e363f7d Mon Sep 17 00:00:00 2001 From: Antoni Zajko Date: Mon, 18 Dec 2023 22:30:54 +0100 Subject: [PATCH 4/4] Fix check --- .github/workflows/code_check.yml | 6 ------ makefile | 2 -- 2 files changed, 8 deletions(-) diff --git a/.github/workflows/code_check.yml b/.github/workflows/code_check.yml index d979e1c..8e3e275 100644 --- a/.github/workflows/code_check.yml +++ b/.github/workflows/code_check.yml @@ -27,18 +27,12 @@ jobs: - name: Check black test code run: black test --line-length=100 - - name: Check black bin code - run: black bin --line-length=100 - - name: Check flake8 source code run: flake8 liltab --max-line-length=100 - name: Check flake8 test code run: flake8 test --max-line-length=100 - - name: Check flake8 bin code - run: flake8 bin --max-line-length=100 - - name: Run test run: | export PYTHONPATH=`pwd` diff --git a/makefile b/makefile index 916991f..19506fc 100644 --- a/makefile +++ b/makefile @@ -6,8 +6,6 @@ prepare_code: flake8 liltab --max-line-length=100 black --line-length=100 test flake8 test --max-line-length=100 - black --line-length=100 bin - flake8 bin --max-line-length=100 run_tests: export PYTHONPATH=`pwd` && pytest
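
A short usage sketch of the API as it looks after these four patches, for orientation only — it is not part of the patch series. It assumes the patched liltab package is installed together with pandas, scikit-learn >= 1.2 (needed for `sparse_output` and `set_output`) and torch; the column names (`f1`, `f2`, `label`), the toy values and the `n_epochs` value are invented for illustration, while the trainer keyword arguments mirror the snippet shown in the README and the experiment scripts above.

```python
# Illustrative sketch only: assumes the liltab package with these patches applied.
import pandas as pd

from liltab.data.datasets import PandasDataset
from liltab.train.trainer import HeterogenousAttributesNetworkTrainer

# A pandas.DataFrame can now be passed directly instead of a path to a .csv file.
df = pd.DataFrame(
    {
        "f1": [0.1, 0.7, 0.3, 0.9],     # numeric attribute -> imputed + z-score scaled
        "f2": ["A", "B", "A", "B"],     # categorical attribute -> imputed + one-hot encoded
        "label": ["x", "y", "x", "y"],  # response, one-hot encoded via encode_categorical_response
    }
)
dataset = PandasDataset(df, encode_categorical_response=True)

# After preprocessing, attribute columns carry the column-transformer prefixes seen in the
# tests, e.g. "pipeline-1__f2_A" (categorical branch) and "pipeline-2__f1" (numeric branch).
print(dataset.attribute_columns)
X, y = dataset[[0, 1, 2, 3]]  # float32 tensors for attributes and the encoded response

# The trainer now takes early_stopping_intervals (patience counted in validation checks,
# i.e. early_stopping_intervals * check_val_every_n_epoch epochs) instead of a boolean flag.
trainer = HeterogenousAttributesNetworkTrainer(
    n_epochs=1000,                   # illustrative value; experiments read this from config
    gradient_clipping=False,
    learning_rate=1e-3,
    weight_decay=1e-4,
    early_stopping_intervals=100,
    check_val_every_n_epoch=100,
    file_logger=True,
    tb_logger=True,
    model_checkpoints=True,
)
```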