Sharepoint handle null values (#931)

* ✨ Added `na_values` list
* ✨ Modified way of casting to string to keep the NaN values
* 🐛 Replaced `df` with `df_clean`
* ✨ Added `DEFAULT_NA_VALUES` attribute and `_download_excel` function
* 🐛 Changed pytestArgs from `src` to `tests`
* ✅ Added tests for sharepoint na_values parameter
* ✅ Changed parameter names in sharepoint tests
* ⚡️ Assigned DEFAULT_NA_VALUES
* 📝 Updated docstring
* 👷 Update coverage settings

---------

Co-authored-by: trymzet <[email protected]>
dyvenia · Jun 20, 2024 · fb52bbe · fb52bbe
1 parent 01f5bf3
commit fb52bbe
Show file tree

Hide file tree

Showing 7 changed files with 101 additions and 53 deletions.
diff --git a/.github/workflows/ci-2.0.yml b/.github/workflows/ci-2.0.yml
@@ -52,4 +52,4 @@ jobs:
 
       - name: Validate test coverage
         if: always()
-        run: rye run coverage report --fail-under=41
+        run: rye run coverage report --fail-under=32
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -22,7 +22,7 @@
     "editor.formatOnSave": true
   },
   "python.analysis.typeCheckingMode": "basic",
-  "python.testing.pytestArgs": ["src"],
+  "python.testing.pytestArgs": ["tests"],
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
   "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `na_values` parameter to `Sharepoint.to_df()` to control which strings in Excel file columns are parsed as NA/NaN values.
 - Added `get_last_segment_from_url` function to sharepoint file.
 - Added `validate` function to `viadot/utils.py`
 - Fixed `Databricks.create_table_from_pandas()` failing to overwrite a table in some cases even with `replace="True"`

diff --git a/pyproject.toml b/pyproject.toml
@@ -69,3 +69,6 @@ log_format = "%(asctime)s %(levelname)s %(message)s"
 log_date_format = "%Y-%m-%d %H:%M:%S"
 log_cli = true
 log_level = "WARNING"
+
+[tool.coverage.run]
+omit = ['tests/*']
diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py
@@ -1,10 +1,11 @@
 import io
+import os
 from typing import Literal, Optional, Union
 from urllib.parse import urlparse
-import os
 
 import pandas as pd
 import sharepy
+from pandas._libs.parsers import STR_NA_VALUES
 from pydantic import BaseModel, root_validator
 from sharepy.errors import AuthError
 
@@ -80,6 +81,8 @@ class Sharepoint(Source):
         config_key (str, optional): The key in the viadot config holding relevant credentials.
     """
 
+    DEFAULT_NA_VALUES = list(STR_NA_VALUES)
+
     def __init__(
         self,
         credentials: SharepointCredentials = None,
@@ -88,9 +91,7 @@ def __init__(
         **kwargs,
     ):
         raw_creds = credentials or get_source_credentials(config_key) or {}
-        validated_creds = dict(
-            SharepointCredentials(**raw_creds)
-        )  # validate the credentials
+        validated_creds = dict(SharepointCredentials(**raw_creds))
         super().__init__(*args, credentials=validated_creds, **kwargs)
 
     def get_connection(self) -> sharepy.session.SharePointSession:
@@ -128,13 +129,30 @@ def download_file(self, url: str, to_path: list | str) -> None:
         )
         conn.close()
 
+    def _download_excel(self, url: str, **kwargs) -> pd.ExcelFile:
+        endpoint_value, endpoint_type = get_last_segment_from_url(url)
+        if "nrows" in kwargs:
+            raise ValueError("Parameter 'nrows' is not supported.")
+        conn = self.get_connection()
+
+        if endpoint_type == "file":
+            if endpoint_value != ".xlsx":
+                raise ValueError(
+                    "Only Excel files with 'XLSX' extension can be loaded into a DataFrame."
+                )
+            self.logger.info(f"Downloading data from {url}...")
+            response = conn.get(url)
+            bytes_stream = io.BytesIO(response.content)
+            return pd.ExcelFile(bytes_stream)
+
     @add_viadot_metadata_columns
     def to_df(
         self,
         url: str,
         sheet_name: Optional[Union[str, list, int]] = None,
         if_empty: str = "warn",
         tests: dict = {},
+        na_values: list[str] | None = None,
         **kwargs,
     ) -> pd.DataFrame:
         """
@@ -150,59 +168,54 @@ def to_df(
             tests (Dict[str], optional): A dictionary with optional list of tests
                 to verify the output dataframe. If defined, triggers the `validate`
                 function from utils. Defaults to None.
+            na_values (list[str] | None): Strings to recognize as NA/NaN in every
+                column of each parsed sheet. The sheets are parsed with
+                `keep_default_na=False`, so this list replaces pandas' default NA
+                markers instead of extending them. If None (or empty), the class
+                attribute `DEFAULT_NA_VALUES` is used, i.e. pandas' own defaults:
+                "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
+                "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a",
+                "nan", "null". Defaults to None.
             kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). Note that
             `nrows` is not supported.
 
         Returns:
             pd.DataFrame: The resulting data as a pandas DataFrame.
         """
-        endpoint_value, endpoint_type = get_last_segment_from_url(url)
-
-        if "nrows" in kwargs:
-            raise ValueError("Parameter 'nrows' is not supported.")
-
-        conn = self.get_connection()
-        ## add option to get only excel files - if needed - here folder with only excels is required
-        if endpoint_type == "file":
-            if endpoint_value != ".xlsx":
-                raise ValueError(
-                    "Only Excel files with 'XLSX' extension can be loaded into a DataFrame."
+        excel_file = self._download_excel(url=url, **kwargs)
+
+        if sheet_name:
+            df = excel_file.parse(
+                sheet_name=sheet_name,
+                keep_default_na=False,
+                na_values=na_values or self.DEFAULT_NA_VALUES,
+                **kwargs,
+            )
+            df["sheet_name"] = sheet_name
+        else:
+            sheets: list[pd.DataFrame] = []
+            for sheet_name in excel_file.sheet_names:
+                sheet = excel_file.parse(
+                    sheet_name=sheet_name,
+                    keep_default_na=False,
+                    na_values=na_values or self.DEFAULT_NA_VALUES,
+                    **kwargs,
                 )
-            self.logger.info(f"Downloading data from {url}...")
+                sheet["sheet_name"] = sheet_name
+                sheets.append(sheet)
+            df = pd.concat(sheets)
+
+        if df.empty:
+            try:
+                self._handle_if_empty(if_empty)
+            except SKIP:
+                return pd.DataFrame()
+        else:
+            self.logger.info(f"Successfully downloaded {len(df)} of data.")
 
-            response = conn.get(url)
-            bytes_stream = io.BytesIO(response.content)
-            excel_file = pd.ExcelFile(bytes_stream)
+        df_clean = cleanup_df(df)
 
-            if sheet_name:
-                df = excel_file.parse(
-                    sheet_name=sheet_name, keep_default_na=False, na_values="", **kwargs
-                )
-                df["sheet_name"] = sheet_name
-            else:
-                sheets: list[pd.DataFrame] = []
-                for sheet_name in excel_file.sheet_names:
-                    sheet = excel_file.parse(
-                        sheet_name=sheet_name,
-                        keep_default_na=False,
-                        na_values="",
-                        **kwargs,
-                    )
-                    sheet["sheet_name"] = sheet_name
-                    sheets.append(sheet)
-                df = pd.concat(sheets)
-
-            if df.empty:
-                try:
-                    self._handle_if_empty(if_empty)
-                except SKIP:
-                    return pd.DataFrame()
-            else:
-                self.logger.info(f"Successfully downloaded {len(df)} of data.")
-
-            df_clean = cleanup_df(df)
-
-            if tests:
-                validate(df=df_clean, tests=tests)
-
-            return df_clean.astype(str)
+        if tests:
+            validate(df=df_clean, tests=tests)
+
+        return df_clean.astype(str).where(pd.notnull(df_clean), None)
diff --git a/tests/unit/test_file.xlsx b/tests/unit/test_file.xlsx
diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+
+import pandas as pd
+from viadot.sources import Sharepoint
+
+
+class SharepointMock(Sharepoint):
+    def _download_excel(self, url=None):
+        return pd.ExcelFile(Path("tests/unit/test_file.xlsx"))
+
+
+def test_sharepoint_default_na():
+    dummy_creds = {"site": "test", "username": "test2", "password": "test"}
+
+    s = SharepointMock(credentials=dummy_creds)
+    df = s.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES)
+
+    assert not df.empty
+    assert "NA" not in list(df["col_a"])
+
+
+def test_sharepoint_custom_na():
+    dummy_creds = {"site": "test", "username": "test", "password": "test"}
+
+    s = SharepointMock(credentials=dummy_creds)
+    df = s.to_df(
+        url="test", na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"]
+    )
+
+    assert not df.empty
+    assert "NA" in list(df["col_a"])