dyvenia · trymzet · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `_empty_column_to_string` and `_convert_all_to_string_type` methods to the `Sharepoint` class to convert DataFrame columns, including all-null ones, to string type.
 - Added `na_values` parameter to `Sharepoint` class to parse `N/A` values coming from the excel file columns.
 - Added `get_last_segment_from_url` function to sharepoint file.
 - Added `validate` function to `viadot/utils.py`

diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py
@@ -145,6 +145,42 @@ def _download_excel(self, url: str, **kwargs) -> pd.ExcelFile:
             bytes_stream = io.BytesIO(response.content)
             return pd.ExcelFile(bytes_stream)
 
+    def _convert_all_to_string_type(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Cast every column of the DataFrame to string, keeping nulls null.
+
+        Each value is converted with `astype(str)`; positions that were null
+        in the input are then masked out again with `where(..., None)`.
+
+        Args:
+            df (pd.DataFrame): DataFrame to convert.
+
+        Returns:
+            pd.DataFrame: The converted DataFrame, after being passed through
+                        `_empty_column_to_string`.
+        """
+        not_null_mask = pd.notnull(df)
+        stringified = df.astype(str).where(not_null_mask, None)
+        return self._empty_column_to_string(df=stringified)
+
+    def _empty_column_to_string(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Convert the type of columns containing only null values to string.
+
+        Columns holding at least one non-null value keep their dtype. The
+        input DataFrame is not modified; a converted copy is returned.
+
+        Args:
+            df (pd.DataFrame): DataFrame to convert.
+
+        Returns:
+            pd.DataFrame: New DataFrame in which every all-null column has
+                        the pandas "string" dtype. All other columns keep
+                        the dtype they had in `df`.
+        """
+        # `astype` with a mapping returns a new frame, so the caller's `df`
+        # (for example a fixture shared between tests) is never mutated.
+        empty_cols = [col for col in df.columns if df[col].isnull().all()]
+        return df.astype({col: "string" for col in empty_cols})
+
     @add_viadot_metadata_columns
     def to_df(
         self,
@@ -218,4 +254,4 @@ def to_df(
         if tests:
             validate(df=df_clean, tests=tests)
 
-        return df_clean.astype(str).where(pd.notnull(df_clean), None)
+        return self._convert_all_to_string_type(df=df_clean)
diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py
@@ -3,29 +3,53 @@
 import pandas as pd
 from viadot.sources import Sharepoint
 
+DUMMY_CREDS = {"site": "test", "username": "test2", "password": "test"}
+SAMPLE_DF = pd.DataFrame(  # fixture shared by the conversion tests below
+    {
+        "int_col": [1, 2, 3, 4, 5, None],  # integers plus one missing value
+        "float_col": [1.1, 2.2, 3.3, 3.0, 5.5, 6.6],  # floats, nothing missing
+        "str_col": ["a", "b", "c", "d", "e", "f"],  # strings, nothing missing
+        "nan_col": [None, None, None, None, None, None],  # every value missing
+        "mixed_col": [1, "text", None, None, 4.2, "text2"],  # mixed types with nulls
+    }
+)
+
 
 class SharepointMock(Sharepoint):
     def _download_excel(self, url=None):
         return pd.ExcelFile(Path("tests/unit/test_file.xlsx"))
 
 
 def test_sharepoint_default_na():
-    dummy_creds = {"site": "test", "username": "test2", "password": "test"}
-
-    s = SharepointMock(credentials=dummy_creds)
+    s = SharepointMock(credentials=DUMMY_CREDS)
     df = s.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES)
 
     assert not df.empty
     assert "NA" not in list(df["col_a"])
 
 
 def test_sharepoint_custom_na():
-    dummy_creds = {"site": "test", "username": "test", "password": "test"}
-
-    s = SharepointMock(credentials=dummy_creds)
+    s = SharepointMock(credentials=DUMMY_CREDS)
     df = s.to_df(
         url="test", na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"]
     )
 
     assert not df.empty
     assert "NA" in list(df["col_a"])
+
+
+def test_sharepoint_convert_all_to_string_type():
+    """An all-null column must still be all-null after the conversion."""
+    sharepoint = SharepointMock(credentials=DUMMY_CREDS)
+    result = sharepoint._convert_all_to_string_type(df=SAMPLE_DF)
+    assert not result.empty
+    assert pd.isnull(result["nan_col"]).all()
+
+
+def test_sharepoint_convert_empty_columns_to_string():
+    s = SharepointMock(credentials=DUMMY_CREDS)
+    # Work on a copy so the shared module-level fixture is never modified.
+    converted_df = s._empty_column_to_string(df=SAMPLE_DF.copy())
+    assert not converted_df.empty
+    assert converted_df["float_col"].dtype == float
+    assert converted_df["nan_col"].dtype == "string"