diff --git a/liltab/data/datasets.py b/liltab/data/datasets.py index 697730d..2ded337 100644 --- a/liltab/data/datasets.py +++ b/liltab/data/datasets.py @@ -25,17 +25,25 @@ def __init__( preprocess_data: bool, encode_categorical_target: bool, ): - if response_columns and len(response_columns) > 1 and encode_categorical_target: + if ( + response_columns is not None + and len(response_columns) > 1 + and encode_categorical_target + ): raise ValueError("One-hot encoding is supported only for single target") self.data_path = data_path self.df = pd.read_csv(data_path) self.attribute_columns = np.array( - attribute_columns if attribute_columns is not None else self.df.columns.tolist()[:-1] + attribute_columns + if attribute_columns is not None + else self.df.columns.tolist()[:-1] ) self.response_columns = np.array( - response_columns if response_columns is not None else [self.df.columns.tolist()[-1]] + response_columns + if response_columns is not None + else [self.df.columns.tolist()[-1]] ) self.n_attributes = len(self.attribute_columns) self.n_responses = len(self.response_columns) @@ -57,12 +65,15 @@ def _preprocess_data(self): """ self.preprocessing_pipeline = get_preprocessing_pipeline() if self.encode_categorical_target: - self.df.loc[:, self.attribute_columns] = self.preprocessing_pipeline.fit_transform( + self.df.loc[ + :, self.attribute_columns + ] = self.preprocessing_pipeline.fit_transform( self.df[self.attribute_columns] ) else: self.df = pd.DataFrame( - self.preprocessing_pipeline.fit_transform(self.df), columns=self.df.columns + self.preprocessing_pipeline.fit_transform(self.df), + columns=self.df.columns, ) def _encode_categorical_target(self): @@ -119,7 +130,9 @@ def __init__( preprocess_data=preprocess_data, ) - self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type(torch.float32) + self.X = torch.from_numpy(self.df[self.attribute_columns].to_numpy()).type( + torch.float32 + ) self.y = torch.from_numpy(self.y).type(torch.float32) def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: @@ -181,7 +194,9 @@ def __init__( preprocess_data=preprocess_data, ) if total_random_feature_sampling and ( - attribute_columns or response_columns or encode_categorical_target + attribute_columns is not None + or response_columns + or encode_categorical_target ): raise ValueError( "total_random_feature_sampling doesn't support feature or encoding specification" @@ -206,7 +221,10 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: self.columns[responses_idx], ) else: - attributes_idx, responses_idx = self._get_features_from_selected_columns() + ( + attributes_idx, + responses_idx, + ) = self._get_features_from_selected_columns() self.attributes, self.responses = ( self.attribute_columns[attributes_idx], self.response_columns[responses_idx], @@ -224,8 +242,12 @@ def __getitem__(self, idx: list[int]) -> tuple[Tensor, Tensor]: def _get_features_from_selected_columns(self) -> tuple[int, int]: attributes_size = np.random.randint(low=1, high=self.n_attributes + 1) responses_size = np.random.randint(low=1, high=self.n_responses + 1) - attributes_idx = np.random.choice(len(self.attribute_columns), attributes_size).tolist() - responses_idx = np.random.choice(len(self.response_columns), responses_size).tolist() + attributes_idx = np.random.choice( + len(self.attribute_columns), attributes_size + ).tolist() + responses_idx = np.random.choice( + len(self.response_columns), responses_size + ).tolist() return attributes_idx, responses_idx