Commit af4683c

Wrap pandas functions to support not including None with the NA values argument

BryanFauble committed Nov 22, 2024
1 parent 14aa510 commit af4683c
Showing 13 changed files with 126 additions and 37 deletions.
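The change targets a pandas default worth spelling out: pd.read_csv treats the literal string "None" as a missing value, so a manifest cell containing "None" silently becomes NaN on read. A minimal standalone sketch of the before/after behavior (the CSV content is illustrative, not taken from this repository):

    from io import StringIO

    import pandas as pd
    from pandas._libs.parsers import STR_NA_VALUES

    CSV = "Component,Status\nPatient,None\n"

    # Default NA filtering: the "None" cell is parsed as NaN.
    default_df = pd.read_csv(StringIO(CSV))
    print(default_df["Status"].isna().tolist())  # [True]

    # Filtered NA list: "None" survives as a string; the other sentinels
    # ("", "NA", "NaN", ...) are still parsed as missing.
    na_values = set(STR_NA_VALUES) - {"None"}
    filtered_df = pd.read_csv(StringIO(CSV), na_values=na_values, keep_default_na=False)
    print(filtered_df["Status"].tolist())  # ['None']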
3 changes: 2 additions & 1 deletion schematic/models/validate_attribute.py

@@ -17,6 +17,7 @@
 from schematic.schemas.data_model_graph import DataModelGraphExplorer
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.validate_rules_utils import validation_rule_info
 from schematic.utils.validate_utils import (
     comma_separated_list_regex,

@@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
             entity: File = self.synStore.getDatasetManifest(
                 datasetId=dataset_id, downloadFile=True
             )
-            manifests.append(pd.read_csv(entity.path))
+            manifests.append(read_csv(entity.path))
         return dict(zip(manifest_ids, manifests))

     def get_target_manifests(
3 changes: 2 additions & 1 deletion schematic/store/database/synapse_database_wrapper.py

@@ -8,6 +8,7 @@
 from opentelemetry import trace

 from schematic.store.synapse_tracker import SynapseEntityTracker
+from schematic.utils.df_utils import read_csv


 class SynapseTableNameError(Exception):

@@ -108,7 +109,7 @@ def execute_sql_query(
             pandas.DataFrame: The queried table
         """
         result = self.execute_sql_statement(query, include_row_data)
-        table = pandas.read_csv(result.filepath)
+        table = read_csv(result.filepath)
         return table

     def execute_sql_statement(
21 changes: 17 additions & 4 deletions schematic/store/synapse.py

@@ -58,7 +58,12 @@
 from schematic.store.base import BaseStorage
 from schematic.store.database.synapse_database import SynapseDatabase
 from schematic.store.synapse_tracker import SynapseEntityTracker
-from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
+from schematic.utils.df_utils import (
+    STR_NA_VALUES_FILTERED,
+    col_in_dataframe,
+    load_df,
+    update_df,
+)

 # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
 # Please do not remove these import statements

@@ -401,7 +406,7 @@ def query_fileview(
         try:
             self.storageFileviewTable = self.syn.tableQuery(
                 query=self.fileview_query,
-            ).asDataFrame()
+            ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         except SynapseHTTPError as exc:
             exception_text = str(exc)
             if "Unknown column path" in exception_text:

@@ -1433,7 +1438,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
         """

         results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
-        df = results.asDataFrame(rowIdAndVersionInIndex=False)
+        df = results.asDataFrame(
+            rowIdAndVersionInIndex=False,
+            na_values=STR_NA_VALUES_FILTERED,
+            keep_default_na=False,
+        )

         return df, results

@@ -3485,7 +3494,11 @@ def query(self, tidy=True, force=False):
         if self.table is None or force:
             fileview_id = self.view_schema["id"]
             self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
-            self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
+            self.table = self.results.asDataFrame(
+                rowIdAndVersionInIndex=False,
+                na_values=STR_NA_VALUES_FILTERED,
+                keep_default_na=False,
+            )
         if tidy:
             self.tidy_table()
         return self.table
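Taken together, a hedged sketch of how a Synapse table read now behaves end to end; the table ID "syn123" is a placeholder, the login call assumes locally cached credentials, and the asDataFrame keyword arguments are the same ones this commit passes through to pandas:

    import synapseclient

    from schematic.utils.df_utils import STR_NA_VALUES_FILTERED

    syn = synapseclient.Synapse()
    syn.login()  # assumes cached credentials

    results = syn.tableQuery("SELECT * FROM syn123")  # placeholder table ID
    df = results.asDataFrame(
        rowIdAndVersionInIndex=False,
        na_values=STR_NA_VALUES_FILTERED,  # pandas defaults minus "None"
        keep_default_na=False,
    )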
34 changes: 28 additions & 6 deletions schematic/utils/df_utils.py

@@ -4,17 +4,41 @@

 import logging
 from copy import deepcopy
-from time import perf_counter
-from typing import Union, Any, Optional
+from datetime import datetime
+from time import perf_counter
+from typing import Any, Optional, Union

 import dateparser as dp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from pandarallel import pandarallel  # type: ignore
+from pandas._libs.parsers import STR_NA_VALUES
+
+STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)
+
+try:
+    STR_NA_VALUES_FILTERED.remove("None")
+except KeyError:
+    pass

 logger = logging.getLogger(__name__)


+def read_csv(
+    path_or_buffer: str, keep_default_na=False, encoding="utf8", **load_args: Any
+) -> pd.DataFrame:
+    na_values = load_args.pop(
+        "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None
+    )
+    return pd.read_csv(  # type: ignore
+        path_or_buffer,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        encoding=encoding,
+        **load_args,
+    )
+
+
 def load_df(
     file_path: str,
     preserve_raw_input: bool = True,

@@ -45,9 +69,7 @@ def load_df(
     t_load_df = perf_counter()

     # Read CSV to df as type specified in kwargs
-    org_df = pd.read_csv(  # type: ignore
-        file_path, keep_default_na=True, encoding="utf8", **load_args
-    )
+    org_df = read_csv(file_path, encoding="utf8", **load_args)  # type: ignore
     if not isinstance(org_df, pd.DataFrame):
         raise ValueError(
             (
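The new wrapper is a drop-in replacement for pd.read_csv; a short usage sketch against one of the repository's example manifests:

    from schematic.utils.df_utils import read_csv

    # "None" cells come back as literal strings rather than NaN.
    manifest = read_csv("tests/data/example.Patient.manifest.csv")

    # Callers can opt back into the full pandas defaults per call:
    raw = read_csv("tests/data/example.Patient.manifest.csv", keep_default_na=True)

Note the na_values precedence in the wrapper: an explicit na_values argument wins; otherwise the filtered set is used, unless keep_default_na=True, in which case the wrapper defers entirely to pandas' defaults.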
3 changes: 2 additions & 1 deletion schematic_api/api/routes.py

@@ -20,6 +20,7 @@
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.synapse import ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder, entity_type_mapping
 from schematic.utils.schema_utils import (
     DisplayLabelType,

@@ -178,7 +179,7 @@ def parse_bool(str_bool):


 def return_as_json(manifest_local_file_path):
-    manifest_csv = pd.read_csv(manifest_local_file_path)
+    manifest_csv = read_csv(manifest_local_file_path)
     manifest_json = manifest_csv.to_dict(orient="records")
     return manifest_json
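For illustration, the JSON shape return_as_json produces (the path and values here are hypothetical):

    manifest_csv = read_csv("manifest.csv")  # hypothetical local file
    manifest_json = manifest_csv.to_dict(orient="records")
    # -> [{"Component": "Patient", "Status": "None"}, ...], one dict per row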
12 changes: 6 additions & 6 deletions tests/integration/test_commands.py

@@ -4,16 +4,16 @@
 import uuid
 from io import BytesIO

+import numpy as np
 import pytest
 import requests
-from openpyxl import load_workbook
 from click.testing import CliRunner
-import pandas as pd
-import numpy as np
+from openpyxl import load_workbook

-from schematic.configuration.configuration import Configuration, CONFIG
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.manifest.commands import manifest
 from schematic.models.commands import model
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting

 LIGHT_BLUE = "FFEAF7F9"  # Required cell

@@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
             # command has no (python) errors, has exit code 0
             assert result.exit_code == 0

-            biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
-            patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
+            biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
+            patient_df = read_csv("tests/data/example.Patient.manifest.csv")

         # Remove created files:
         finally:
3 changes: 2 additions & 1 deletion tests/integration/test_manifest_submission.py

@@ -12,6 +12,7 @@

 from schematic.configuration.configuration import CONFIG
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting, Helpers
 from tests.utils import CleanupItem

@@ -73,7 +74,7 @@ def validate_submitted_manifest_file(
         manifest_file_path = os.path.join(
             download_location, manifest_data["properties"]["name"]
         )
-        manifest_submitted_df = pd.read_csv(manifest_file_path)
+        manifest_submitted_df = read_csv(manifest_file_path)
         assert "entityId" in manifest_submitted_df.columns
         assert "Id" in manifest_submitted_df.columns
3 changes: 2 additions & 1 deletion tests/integration/test_metadata_model.py

@@ -23,6 +23,7 @@
 from synapseclient.models import File, Folder

 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import create_temp_folder
 from tests.conftest import Helpers, metadata_model
 from tests.utils import CleanupItem

@@ -531,7 +532,7 @@ def _submit_and_verify_manifest(
         )
         manifest_table = synapse_store.syn.tableQuery(
             f"select * from {expected_table_id}", downloadLocation=download_dir
-        ).asDataFrame()
+        ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)

         # AND the columns in the manifest table should reflect the ones in the file
         table_columns = manifest_table.columns
8 changes: 4 additions & 4 deletions tests/test_api.py

@@ -14,11 +14,11 @@
 from flask.testing import FlaskClient
 from opentelemetry import trace

-from schematic.configuration.configuration import Configuration
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder
-from schematic.configuration.configuration import CONFIG

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

@@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )

@@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )
9 changes: 5 additions & 4 deletions tests/test_store.py

@@ -28,6 +28,7 @@
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.base import BaseStorage
 from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import check_synapse_cache_size, create_temp_folder
 from tests.conftest import Helpers
 from tests.utils import CleanupItem

@@ -1244,7 +1245,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1281,7 +1282,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1343,7 +1344,7 @@ async def test_upsert_table(
         # Query table for DaystoFollowUp column
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1384,7 +1385,7 @@ async def test_upsert_table(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
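A small note on the .squeeze() calls in these tests: a single-column query result collapses to a pandas Series, so assertions can compare values directly. A generic sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"DaystoFollowUp": [30, 60]})
    series = df.squeeze()  # single column -> Series([30, 60])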
6 changes: 3 additions & 3 deletions tests/test_utils.py

@@ -27,7 +27,7 @@
 )
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils
-from schematic.utils.df_utils import load_df
+from schematic.utils.df_utils import load_df, read_csv
 from schematic.utils.general import (
     calculate_datetime,
     check_synapse_cache_size,

@@ -473,7 +473,7 @@ def test_load_df(self, helpers, preserve_raw_input):
         test_col = "Check NA"
         file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv")

-        unprocessed_df = pd.read_csv(file_path, encoding="utf8")
+        unprocessed_df = read_csv(file_path, encoding="utf8")
         df = df_utils.load_df(
             file_path, preserve_raw_input=preserve_raw_input, data_model=False
         )

@@ -1100,7 +1100,7 @@ def test_convert_nan_entries_to_empty_strings(
         manifest_path = helpers.get_data_path(manifest)
         model_path = helpers.get_data_path(model)

-        ## Gather parmeters needed to run validate_manifest_rules
+        # Gather parmeters needed to run validate_manifest_rules
         errors = []
         load_args = {
             "dtype": "string",
9 changes: 4 additions & 5 deletions tests/test_viz.py

@@ -1,11 +1,10 @@
 import json
 import logging
-import os
 from io import StringIO

-import pandas as pd
 import pytest

+from schematic.utils.df_utils import read_csv
 from schematic.visualization.attributes_explorer import AttributesExplorer
 from schematic.visualization.tangled_tree import TangledTree

@@ -44,7 +43,7 @@ class TestVisualization:
     def test_ae(self, helpers, attributes_explorer):
         attributes_str = attributes_explorer.parse_attributes(save_file=False)

-        df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])

         # For the attributes df define expected columns
         expect_col_names = [

@@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer):
             component=component, save_file=False, include_index=False
         )
         # convert to dataframe
-        component_attributes = pd.read_csv(StringIO(component_attributes_str))
+        component_attributes = read_csv(StringIO(component_attributes_str))

         # For the attributes df define expected columns
         expect_col_names = [

@@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree):
         # Get text for tangled tree.
         text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False)

-        df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])

         # Define expected text associated with 'Patient' and 'Imaging' tree
         expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"]