From 8b7a6cbf79d95a6242147f067a79af32168df02b Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Mon, 25 Nov 2024 12:36:06 -0800 Subject: [PATCH 1/6] Added categorical encoding function --- src/koheesio/categorical_encoding.py | 30 ++++++++++++++++++++ tests/test_categorical_encoding.py | 42 ++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 src/koheesio/categorical_encoding.py create mode 100644 tests/test_categorical_encoding.py diff --git a/src/koheesio/categorical_encoding.py b/src/koheesio/categorical_encoding.py new file mode 100644 index 00000000..d68055e7 --- /dev/null +++ b/src/koheesio/categorical_encoding.py @@ -0,0 +1,30 @@ +# Import necessary libraries +from sklearn.preprocessing import OneHotEncoder +import pandas as pd +from pydantic import BaseModel + +# Define the EncodingConfig class +class EncodingConfig(BaseModel): + drop_first: bool = True # Whether to drop the first dummy column + +# Define the categorical_encoding function +def categorical_encoding(data, columns, config: EncodingConfig): + """ + Encodes categorical columns using one-hot encoding. + + Args: + data (pd.DataFrame): Input dataset. + columns (list): List of categorical columns to encode. + config (EncodingConfig): Configuration for encoding. + + Returns: + pd.DataFrame: Dataset with one-hot encoded columns. + """ + encoder = OneHotEncoder(sparse_output=False, drop='first' if config.drop_first else None) + encoded_array = encoder.fit_transform(data[columns]) + encoded_df = pd.DataFrame( + encoded_array, + columns=encoder.get_feature_names_out(columns), + index=data.index, + ) + return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1) diff --git a/tests/test_categorical_encoding.py b/tests/test_categorical_encoding.py new file mode 100644 index 00000000..0ba0346b --- /dev/null +++ b/tests/test_categorical_encoding.py @@ -0,0 +1,42 @@ +import unittest +import pandas as pd +from src.koheesio.categorical_encoding import categorical_encoding, EncodingConfig + +class TestCategoricalEncoding(unittest.TestCase): + def setUp(self): + self.data = pd.DataFrame({ + 'color': ['red', 'blue', 'green', 'red'], + 'size': ['S', 'M', 'L', 'XL'], + 'price': [10.0, 20.0, 30.0, 40.0] + }) + + def test_one_hot_encoding(self): + # Configure encoding to drop the first column (avoid dummy variable trap) + config = EncodingConfig(drop_first=True) + result = categorical_encoding(self.data, ['color', 'size'], config) + # Check if the encoded columns are present + self.assertIn('color_green', result.columns) + self.assertIn('color_red', result.columns) + self.assertIn('size_M', result.columns) + self.assertIn('size_XL', result.columns) + # Check if the original categorical columns were removed + self.assertNotIn('color', result.columns) + self.assertNotIn('size', result.columns) + + def test_no_drop_first_encoding(self): + # Configure encoding to retain all dummy columns + config = EncodingConfig(drop_first=False) + result = categorical_encoding(self.data, ['color', 'size'], config) + # Check for all encoded columns + self.assertIn('color_blue', result.columns) + self.assertIn('color_green', result.columns) + self.assertIn('color_red', result.columns) + self.assertIn('size_S', result.columns) + self.assertIn('size_M', result.columns) + self.assertIn('size_XL', result.columns) + # Check if the original categorical columns were removed + self.assertNotIn('color', result.columns) + self.assertNotIn('size', result.columns) + +if __name__ == '__main__': + unittest.main() From d80646a190d1b4e9d199e2a2a33eee62878053c9 Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:21:34 -0800 Subject: [PATCH 2/6] Added dependencies in pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index be0a5496..0a4ccb61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,8 @@ dependencies = [ "pytz>=2023.3", "pyyaml>=6.0", "tomli>=2.0.1", + "pandas>=1.5.0", + "scikit-learn>=1.2.0" ] [project.urls] From 60c81dc1e8442b73ef9abb447e74c142d8410be9 Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:23:12 -0800 Subject: [PATCH 3/6] Updated Docstrings to be in Numpy notation --- src/koheesio/categorical_encoding.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/koheesio/categorical_encoding.py b/src/koheesio/categorical_encoding.py index d68055e7..ba87dcb6 100644 --- a/src/koheesio/categorical_encoding.py +++ b/src/koheesio/categorical_encoding.py @@ -12,14 +12,21 @@ def categorical_encoding(data, columns, config: EncodingConfig): """ Encodes categorical columns using one-hot encoding. - Args: - data (pd.DataFrame): Input dataset. - columns (list): List of categorical columns to encode. - config (EncodingConfig): Configuration for encoding. + Parameters + ---------- + data : pd.DataFrame + Input dataset. + columns : list + List of categorical columns to encode. + drop_first : bool, optional + Whether to drop the first dummy column to avoid multicollinearity, by default True. - Returns: - pd.DataFrame: Dataset with one-hot encoded columns. + Returns + ------- + pd.DataFrame + Dataset with one-hot encoded columns. """ + encoder = OneHotEncoder(sparse_output=False, drop='first' if config.drop_first else None) encoded_array = encoder.fit_transform(data[columns]) encoded_df = pd.DataFrame( From 27d2b8fff1c3bf70f569560cdfb60f0b67482685 Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Thu, 28 Nov 2024 18:29:59 -0800 Subject: [PATCH 4/6] Revert "Updated Docstrings to be in Numpy notation" This reverts commit 60c81dc1e8442b73ef9abb447e74c142d8410be9. --- src/koheesio/categorical_encoding.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/koheesio/categorical_encoding.py b/src/koheesio/categorical_encoding.py index ba87dcb6..d68055e7 100644 --- a/src/koheesio/categorical_encoding.py +++ b/src/koheesio/categorical_encoding.py @@ -12,21 +12,14 @@ def categorical_encoding(data, columns, config: EncodingConfig): """ Encodes categorical columns using one-hot encoding. - Parameters - ---------- - data : pd.DataFrame - Input dataset. - columns : list - List of categorical columns to encode. - drop_first : bool, optional - Whether to drop the first dummy column to avoid multicollinearity, by default True. + Args: + data (pd.DataFrame): Input dataset. + columns (list): List of categorical columns to encode. + config (EncodingConfig): Configuration for encoding. - Returns - ------- - pd.DataFrame - Dataset with one-hot encoded columns. + Returns: + pd.DataFrame: Dataset with one-hot encoded columns. """ - encoder = OneHotEncoder(sparse_output=False, drop='first' if config.drop_first else None) encoded_array = encoder.fit_transform(data[columns]) encoded_df = pd.DataFrame( From f69f66347dcae02936a2630ce53ef306d086d8b8 Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Thu, 28 Nov 2024 19:46:37 -0800 Subject: [PATCH 5/6] Updated categorical_encoding.py by using class and docstrings --- src/koheesio/categorical_encoding.py | 71 ++++++++++++------ tests/test_categorical_encoding.py | 107 ++++++++++++++++++--------- 2 files changed, 122 insertions(+), 56 deletions(-) diff --git a/src/koheesio/categorical_encoding.py b/src/koheesio/categorical_encoding.py index d68055e7..dfa35931 100644 --- a/src/koheesio/categorical_encoding.py +++ b/src/koheesio/categorical_encoding.py @@ -1,30 +1,57 @@ -# Import necessary libraries from sklearn.preprocessing import OneHotEncoder import pandas as pd from pydantic import BaseModel +from typing import List, Optional +from koheesio.steps import Step -# Define the EncodingConfig class -class EncodingConfig(BaseModel): - drop_first: bool = True # Whether to drop the first dummy column +from typing import List, Dict +import pandas as pd +from pydantic import BaseModel -# Define the categorical_encoding function -def categorical_encoding(data, columns, config: EncodingConfig): +class PandasCategoricalEncoding(BaseModel): """ - Encodes categorical columns using one-hot encoding. + Encodes categorical columns using one-hot encoding or ordinal encoding. - Args: - data (pd.DataFrame): Input dataset. - columns (list): List of categorical columns to encode. - config (EncodingConfig): Configuration for encoding. - - Returns: - pd.DataFrame: Dataset with one-hot encoded columns. + Attributes + ---------- + columns : List[str] + List of categorical columns to encode. + encoding_type : str, optional + Type of encoding, either "one-hot" or "ordinal" (default is "one-hot"). + drop_first : bool, optional + Whether to drop the first dummy column for one-hot encoding (default is True). + ordinal_mapping : dict, optional + A dictionary mapping categorical values to integers for ordinal encoding. """ - encoder = OneHotEncoder(sparse_output=False, drop='first' if config.drop_first else None) - encoded_array = encoder.fit_transform(data[columns]) - encoded_df = pd.DataFrame( - encoded_array, - columns=encoder.get_feature_names_out(columns), - index=data.index, - ) - return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1) + + columns: List[str] + encoding_type: str = "one-hot" + drop_first: bool = True + ordinal_mapping: Dict[str, Dict] = None + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.encoding_type not in ['one-hot', 'ordinal']: + raise ValueError(f"Invalid encoding type: {self.encoding_type}. Expected 'one-hot' or 'ordinal'.") + + def execute(self, data: pd.DataFrame) -> pd.DataFrame: + """ + Executes the categorical encoding transformation on the provided dataset. + + Parameters + ---------- + data : pd.DataFrame + The input dataset to encode. + + Returns + ------- + pd.DataFrame + The dataset with the specified categorical columns encoded. + """ + if self.encoding_type == 'one-hot': + data = pd.get_dummies(data, columns=self.columns, drop_first=self.drop_first) + elif self.encoding_type == 'ordinal': + for column in self.columns: + if column in data.columns and self.ordinal_mapping and column in self.ordinal_mapping: + data[column] = data[column].map(self.ordinal_mapping[column]).fillna(-1).astype(int) + return data diff --git a/tests/test_categorical_encoding.py b/tests/test_categorical_encoding.py index 0ba0346b..0de92d9f 100644 --- a/tests/test_categorical_encoding.py +++ b/tests/test_categorical_encoding.py @@ -1,42 +1,81 @@ import unittest import pandas as pd -from src.koheesio.categorical_encoding import categorical_encoding, EncodingConfig +from src.koheesio.categorical_encoding import PandasCategoricalEncoding +import unittest +import pandas as pd + +class TestPandasCategoricalEncoding(unittest.TestCase): + + """ + Unit tests for the PandasCategoricalEncoding class. + + Methods + ------- + setUp(): + Sets up the test environment with sample data. + test_one_hot_encoding(): + Tests the one-hot encoding functionality. + test_ordinal_encoding(): + Tests the ordinal encoding functionality. + """ -class TestCategoricalEncoding(unittest.TestCase): def setUp(self): + """ + Sets up the test environment with sample data. + + Creates a pandas DataFrame with categorical columns for testing. + """ self.data = pd.DataFrame({ - 'color': ['red', 'blue', 'green', 'red'], - 'size': ['S', 'M', 'L', 'XL'], - 'price': [10.0, 20.0, 30.0, 40.0] + 'color': ['red', 'blue', 'green', 'blue', 'red'], + 'size': ['S', 'M', 'L', 'S', 'XL'] }) def test_one_hot_encoding(self): - # Configure encoding to drop the first column (avoid dummy variable trap) - config = EncodingConfig(drop_first=True) - result = categorical_encoding(self.data, ['color', 'size'], config) - # Check if the encoded columns are present - self.assertIn('color_green', result.columns) - self.assertIn('color_red', result.columns) - self.assertIn('size_M', result.columns) - self.assertIn('size_XL', result.columns) - # Check if the original categorical columns were removed - self.assertNotIn('color', result.columns) - self.assertNotIn('size', result.columns) - - def test_no_drop_first_encoding(self): - # Configure encoding to retain all dummy columns - config = EncodingConfig(drop_first=False) - result = categorical_encoding(self.data, ['color', 'size'], config) - # Check for all encoded columns - self.assertIn('color_blue', result.columns) - self.assertIn('color_green', result.columns) - self.assertIn('color_red', result.columns) - self.assertIn('size_S', result.columns) - self.assertIn('size_M', result.columns) - self.assertIn('size_XL', result.columns) - # Check if the original categorical columns were removed - self.assertNotIn('color', result.columns) - self.assertNotIn('size', result.columns) - -if __name__ == '__main__': - unittest.main() + """ + Tests one-hot encoding functionality. + + Ensures that the specified categorical columns are correctly + encoded into dummy variables, with an option to drop the first + dummy column. + """ + encoding_step = PandasCategoricalEncoding( + columns=["color"], + encoding_type="one-hot", + drop_first=False # Adjusted to match expected columns + ) + + # Apply encoding + encoded_data = encoding_step.execute(self.data) + + # Expected columns after one-hot encoding + expected_columns = ['color_blue', 'color_green', 'color_red'] + + # Check if the encoded data contains expected columns + self.assertTrue(all(col in encoded_data.columns for col in expected_columns)) + print("One-hot encoding passed.") + + def test_ordinal_encoding(self): + """ + Tests ordinal encoding functionality. + + Ensures that the specified categorical columns are correctly + encoded into ordinal values based on a provided mapping. + """ + + ordinal_mapping = { + 'size': {'S': 1, 'M': 2, 'L': 3, 'XL': 4} + } + + encoding_step = PandasCategoricalEncoding( + columns=["size"], + encoding_type="ordinal", + ordinal_mapping=ordinal_mapping + ) + + # Apply encoding + encoded_data = encoding_step.execute(self.data) + + # Check if the "size" column is correctly ordinal encoded + expected_values = [1, 2, 3, 1, 4] + self.assertTrue(all(encoded_data['size'] == expected_values)) + print("Ordinal encoding passed.") From 5453aac786920040f2dd29b6e11432f695670682 Mon Sep 17 00:00:00 2001 From: Nandhakumar <40836847+Nandha951@users.noreply.github.com> Date: Thu, 28 Nov 2024 19:57:07 -0800 Subject: [PATCH 6/6] Updated to class based code, placed the file under pandas and added numpy based docstrings --- src/koheesio/{ => pandas}/categorical_encoding.py | 0 tests/test_categorical_encoding.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename src/koheesio/{ => pandas}/categorical_encoding.py (100%) diff --git a/src/koheesio/categorical_encoding.py b/src/koheesio/pandas/categorical_encoding.py similarity index 100% rename from src/koheesio/categorical_encoding.py rename to src/koheesio/pandas/categorical_encoding.py diff --git a/tests/test_categorical_encoding.py b/tests/test_categorical_encoding.py index 0de92d9f..6ca63eb9 100644 --- a/tests/test_categorical_encoding.py +++ b/tests/test_categorical_encoding.py @@ -1,6 +1,6 @@ import unittest import pandas as pd -from src.koheesio.categorical_encoding import PandasCategoricalEncoding +from src.koheesio.pandas.categorical_encoding import PandasCategoricalEncoding import unittest import pandas as pd