Skip to content

Commit

Permalink
Sharepoint handle null values (#931)
Browse files Browse the repository at this point in the history
* ✨ Added `na_values` list

* ✨ Modified way of casting to string to keep the NaN values

* 🐛 Replaced `df` with `df_clean`

* ✨ Added `DEFAULT_NA_VALUES` attribute and `_download_excel` function

* 🐛 Changed pytestArgs from `src` to `tests`

* ✅ Added tests for sharepoint na_values parameter

* ✅ Changed parameter names in sharepoint tests

* ⚡️ Assigned DEFAULT_NA_VALUES

* 📝 Updated docstring

* 👷 Update coverage settings

---------

Co-authored-by: trymzet <[email protected]>
  • Loading branch information
Rafalz13 and trymzet authored Jun 20, 2024
1 parent 01f5bf3 commit fb52bbe
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-2.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ jobs:

- name: Validate test coverage
if: always()
run: rye run coverage report --fail-under=41
run: rye run coverage report --fail-under=32
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"editor.formatOnSave": true
},
"python.analysis.typeCheckingMode": "basic",
"python.testing.pytestArgs": ["src"],
"python.testing.pytestArgs": ["tests"],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added `na_values` parameter to `Sharepoint.to_df()` to control which strings in Excel file columns are parsed as `N/A` values.
- Added `get_last_segment_from_url` function to sharepoint file.
- Added `validate` function to `viadot/utils.py`
- Fixed `Databricks.create_table_from_pandas()` failing to overwrite a table in some cases even with `replace="True"`
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,6 @@ log_format = "%(asctime)s %(levelname)s %(message)s"
log_date_format = "%Y-%m-%d %H:%M:%S"
log_cli = true
log_level = "WARNING"

[tool.coverage.run]
omit = ['tests/*']
115 changes: 64 additions & 51 deletions src/viadot/sources/sharepoint.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import io
import os
from typing import Literal, Optional, Union
from urllib.parse import urlparse
import os

import pandas as pd
import sharepy
from pandas._libs.parsers import STR_NA_VALUES
from pydantic import BaseModel, root_validator
from sharepy.errors import AuthError

Expand Down Expand Up @@ -80,6 +81,8 @@ class Sharepoint(Source):
config_key (str, optional): The key in the viadot config holding relevant credentials.
"""

DEFAULT_NA_VALUES = list(STR_NA_VALUES)

def __init__(
self,
credentials: SharepointCredentials = None,
Expand All @@ -88,9 +91,7 @@ def __init__(
**kwargs,
):
raw_creds = credentials or get_source_credentials(config_key) or {}
validated_creds = dict(
SharepointCredentials(**raw_creds)
) # validate the credentials
validated_creds = dict(SharepointCredentials(**raw_creds))
super().__init__(*args, credentials=validated_creds, **kwargs)

def get_connection(self) -> sharepy.session.SharePointSession:
Expand Down Expand Up @@ -128,13 +129,30 @@ def download_file(self, url: str, to_path: list | str) -> None:
)
conn.close()

def _download_excel(self, url: str, **kwargs) -> pd.ExcelFile:
    """Download an Excel workbook from SharePoint and open it with pandas.

    Args:
        url (str): URL of the `.xlsx` file to download.
        **kwargs: Keyword arguments the caller intends to use when parsing
            the workbook. They are only inspected here, to reject `nrows`.

    Returns:
        pd.ExcelFile: The workbook, backed by an in-memory copy of the
            downloaded bytes.

    Raises:
        ValueError: If `nrows` is passed, if the URL does not point to a
            file, or if the file does not have the `.xlsx` extension.
    """
    # Validate everything that can be checked offline first, so an invalid
    # call never opens a SharePoint session.
    if "nrows" in kwargs:
        raise ValueError("Parameter 'nrows' is not supported.")

    endpoint_value, endpoint_type = get_last_segment_from_url(url)
    if endpoint_type != "file":
        # Fail explicitly instead of implicitly returning None, which would
        # only surface later as an AttributeError in the caller.
        raise ValueError(
            f"Only file URLs can be loaded into a DataFrame, got endpoint type '{endpoint_type}'."
        )
    if endpoint_value != ".xlsx":
        raise ValueError(
            "Only Excel files with 'XLSX' extension can be loaded into a DataFrame."
        )

    conn = self.get_connection()
    try:
        self.logger.info(f"Downloading data from {url}...")
        # Read the full body before the session is closed.
        content = conn.get(url).content
    finally:
        # Release the session even if the request fails.
        conn.close()

    return pd.ExcelFile(io.BytesIO(content))

@add_viadot_metadata_columns
def to_df(
self,
url: str,
sheet_name: Optional[Union[str, list, int]] = None,
if_empty: str = "warn",
tests: dict = {},
na_values: list[str] | None = None,
**kwargs,
) -> pd.DataFrame:
"""
Expand All @@ -150,59 +168,54 @@ def to_df(
tests (Dict[str], optional): A dictionary with optional list of tests
to verify the output dataframe. If defined, triggers the `validate`
function from utils. Defaults to None.
na_values (list[str] | None): Strings to recognize as NA/NaN. The sheets are
parsed with `keep_default_na=False`, so a passed list replaces the default
NA strings rather than extending them.
If None, `DEFAULT_NA_VALUES` is used, i.e. pandas' default NA strings:
"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
"1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a",
"nan", "null".
Defaults to None.
kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). Note that
`nrows` is not supported.
Returns:
pd.DataFrame: The resulting data as a pandas DataFrame.
"""
endpoint_value, endpoint_type = get_last_segment_from_url(url)

if "nrows" in kwargs:
raise ValueError("Parameter 'nrows' is not supported.")

conn = self.get_connection()
## add option to get only excel files - if needed - here folder with only excels is required
if endpoint_type == "file":
if endpoint_value != ".xlsx":
raise ValueError(
"Only Excel files with 'XLSX' extension can be loaded into a DataFrame."
excel_file = self._download_excel(url=url, **kwargs)

if sheet_name:
df = excel_file.parse(
sheet_name=sheet_name,
keep_default_na=False,
na_values=na_values or self.DEFAULT_NA_VALUES,
**kwargs,
)
df["sheet_name"] = sheet_name
else:
sheets: list[pd.DataFrame] = []
for sheet_name in excel_file.sheet_names:
sheet = excel_file.parse(
sheet_name=sheet_name,
keep_default_na=False,
na_values=na_values or self.DEFAULT_NA_VALUES,
**kwargs,
)
self.logger.info(f"Downloading data from {url}...")
sheet["sheet_name"] = sheet_name
sheets.append(sheet)
df = pd.concat(sheets)

if df.empty:
try:
self._handle_if_empty(if_empty)
except SKIP:
return pd.DataFrame()
else:
self.logger.info(f"Successfully downloaded {len(df)} of data.")

response = conn.get(url)
bytes_stream = io.BytesIO(response.content)
excel_file = pd.ExcelFile(bytes_stream)
df_clean = cleanup_df(df)

if sheet_name:
df = excel_file.parse(
sheet_name=sheet_name, keep_default_na=False, na_values="", **kwargs
)
df["sheet_name"] = sheet_name
else:
sheets: list[pd.DataFrame] = []
for sheet_name in excel_file.sheet_names:
sheet = excel_file.parse(
sheet_name=sheet_name,
keep_default_na=False,
na_values="",
**kwargs,
)
sheet["sheet_name"] = sheet_name
sheets.append(sheet)
df = pd.concat(sheets)

if df.empty:
try:
self._handle_if_empty(if_empty)
except SKIP:
return pd.DataFrame()
else:
self.logger.info(f"Successfully downloaded {len(df)} of data.")

df_clean = cleanup_df(df)

if tests:
validate(df=df_clean, tests=tests)

return df_clean.astype(str)
if tests:
validate(df=df_clean, tests=tests)

return df_clean.astype(str).where(pd.notnull(df_clean), None)
Binary file added tests/unit/test_file.xlsx
Binary file not shown.
31 changes: 31 additions & 0 deletions tests/unit/test_sharepoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path

import pandas as pd
from viadot.sources import Sharepoint


class SharepointMock(Sharepoint):
    """`Sharepoint` test double that reads the workbook from a local fixture."""

    def _download_excel(self, url=None, **kwargs):
        """Return the bundled `test_file.xlsx` instead of calling SharePoint.

        Args:
            url: Ignored; accepted so the override matches the call made by
                `to_df()`.
            **kwargs: Ignored; accepted because `to_df()` forwards its extra
                keyword arguments to this method.

        Returns:
            pd.ExcelFile: The fixture workbook stored next to this test module.
        """
        # Resolve the fixture relative to this file so the tests do not depend
        # on the working directory pytest is started from.
        fixture_path = Path(__file__).with_name("test_file.xlsx")
        return pd.ExcelFile(fixture_path)


def test_sharepoint_default_na():
    """With the default NA strings, the literal "NA" must not appear in `col_a`."""
    source = SharepointMock(
        credentials={"site": "test", "username": "test2", "password": "test"}
    )

    frame = source.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES)
    col_a_values = frame["col_a"].tolist()

    assert not frame.empty
    assert "NA" not in col_a_values


def test_sharepoint_custom_na():
    """When "NA" is removed from the NA strings, it must survive as a value in `col_a`."""
    # Every default NA string except the literal "NA".
    na_strings_without_na = [
        marker for marker in Sharepoint.DEFAULT_NA_VALUES if marker != "NA"
    ]
    source = SharepointMock(
        credentials={"site": "test", "username": "test", "password": "test"}
    )

    frame = source.to_df(url="test", na_values=na_strings_without_na)
    col_a_values = frame["col_a"].tolist()

    assert not frame.empty
    assert "NA" in col_a_values

0 comments on commit fb52bbe

Please sign in to comment.