diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py
index 9b13bebaf..52c30414f 100644
--- a/schematic/models/validate_attribute.py
+++ b/schematic/models/validate_attribute.py
@@ -17,6 +17,7 @@

 from schematic.schemas.data_model_graph import DataModelGraphExplorer
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.validate_rules_utils import validation_rule_info
 from schematic.utils.validate_utils import (
     comma_separated_list_regex,
@@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
             entity: File = self.synStore.getDatasetManifest(
                 datasetId=dataset_id, downloadFile=True
             )
-            manifests.append(pd.read_csv(entity.path))
+            manifests.append(read_csv(entity.path))
         return dict(zip(manifest_ids, manifests))

     def get_target_manifests(
diff --git a/schematic/store/database/synapse_database_wrapper.py b/schematic/store/database/synapse_database_wrapper.py
index b827b140f..ba0ed2dc9 100644
--- a/schematic/store/database/synapse_database_wrapper.py
+++ b/schematic/store/database/synapse_database_wrapper.py
@@ -8,6 +8,7 @@
 from opentelemetry import trace

 from schematic.store.synapse_tracker import SynapseEntityTracker
+from schematic.utils.df_utils import read_csv


 class SynapseTableNameError(Exception):
@@ -108,7 +109,7 @@ def execute_sql_query(
             pandas.DataFrame: The queried table
         """
         result = self.execute_sql_statement(query, include_row_data)
-        table = pandas.read_csv(result.filepath)
+        table = read_csv(result.filepath)
         return table

     def execute_sql_statement(
diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py
index 7ccd98362..bf5bb6c49 100644
--- a/schematic/store/synapse.py
+++ b/schematic/store/synapse.py
@@ -58,7 +58,12 @@
 from schematic.store.base import BaseStorage
 from schematic.store.database.synapse_database import SynapseDatabase
 from schematic.store.synapse_tracker import SynapseEntityTracker
-from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
+from schematic.utils.df_utils import (
+    STR_NA_VALUES_FILTERED,
+    col_in_dataframe,
+    load_df,
+    update_df,
+)

 # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
 # Please do not remove these import statements
@@ -401,7 +406,7 @@ def query_fileview(
         try:
             self.storageFileviewTable = self.syn.tableQuery(
                 query=self.fileview_query,
-            ).asDataFrame()
+            ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         except SynapseHTTPError as exc:
             exception_text = str(exc)
             if "Unknown column path" in exception_text:
@@ -1433,7 +1438,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
         """
         results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
-        df = results.asDataFrame(rowIdAndVersionInIndex=False)
+        df = results.asDataFrame(
+            rowIdAndVersionInIndex=False,
+            na_values=STR_NA_VALUES_FILTERED,
+            keep_default_na=False,
+        )

         return df, results

@@ -3485,7 +3494,11 @@ def query(self, tidy=True, force=False):
         if self.table is None or force:
             fileview_id = self.view_schema["id"]
             self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
-            self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
+            self.table = self.results.asDataFrame(
+                rowIdAndVersionInIndex=False,
+                na_values=STR_NA_VALUES_FILTERED,
+                keep_default_na=False,
+            )
         if tidy:
             self.tidy_table()
         return self.table
diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py
index b25e7db82..cc9f5ed9b 100644
--- a/schematic/utils/df_utils.py
+++ b/schematic/utils/df_utils.py
@@ -4,17 +4,41 @@

 import logging
 from copy import deepcopy
-from time import perf_counter
-from typing import Union, Any, Optional
 from datetime import datetime
+from time import perf_counter
+from typing import Any, Optional, Union
+
 import dateparser as dp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from pandarallel import pandarallel  # type: ignore
+from pandas._libs.parsers import STR_NA_VALUES
+
+STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)
+
+try:
+    STR_NA_VALUES_FILTERED.remove("None")
+except KeyError:
+    pass

 logger = logging.getLogger(__name__)


+def read_csv(
+    path_or_buffer: str, keep_default_na=False, encoding="utf8", **load_args: Any
+) -> pd.DataFrame:
+    na_values = load_args.pop(
+        "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None
+    )
+    return pd.read_csv(  # type: ignore
+        path_or_buffer,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        encoding=encoding,
+        **load_args,
+    )
+
+
 def load_df(
     file_path: str,
     preserve_raw_input: bool = True,
@@ -45,9 +69,7 @@ def load_df(
     t_load_df = perf_counter()

     # Read CSV to df as type specified in kwargs
-    org_df = pd.read_csv(  # type: ignore
-        file_path, keep_default_na=True, encoding="utf8", **load_args
-    )
+    org_df = read_csv(file_path, encoding="utf8", **load_args)  # type: ignore
     if not isinstance(org_df, pd.DataFrame):
         raise ValueError(
             (
diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py
index 97484c15c..ad1e25279 100644
--- a/schematic_api/api/routes.py
+++ b/schematic_api/api/routes.py
@@ -20,6 +20,7 @@
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.synapse import ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder, entity_type_mapping
 from schematic.utils.schema_utils import (
     DisplayLabelType,
@@ -178,7 +179,7 @@ def parse_bool(str_bool):


 def return_as_json(manifest_local_file_path):
-    manifest_csv = pd.read_csv(manifest_local_file_path)
+    manifest_csv = read_csv(manifest_local_file_path)
     manifest_json = manifest_csv.to_dict(orient="records")
     return manifest_json

diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py
index 8658f6e6b..e0edfd853 100644
--- a/tests/integration/test_commands.py
+++ b/tests/integration/test_commands.py
@@ -4,16 +4,16 @@
 import uuid
 from io import BytesIO

+import numpy as np
 import pytest
 import requests
-from openpyxl import load_workbook
 from click.testing import CliRunner
-import pandas as pd
-import numpy as np
+from openpyxl import load_workbook

-from schematic.configuration.configuration import Configuration, CONFIG
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.manifest.commands import manifest
 from schematic.models.commands import model
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting

 LIGHT_BLUE = "FFEAF7F9"  # Required cell
@@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
             # command has no (python) errors, has exit code 0
             assert result.exit_code == 0

-            biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
-            patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
+            biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
+            patient_df = read_csv("tests/data/example.Patient.manifest.csv")

         # Remove created files:
         finally:
diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py
index 92e6911c1..92a52ebe9 100644
--- a/tests/integration/test_manifest_submission.py
+++ b/tests/integration/test_manifest_submission.py
@@ -12,6 +12,7 @@

 from schematic.configuration.configuration import CONFIG
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting, Helpers
 from tests.utils import CleanupItem
@@ -73,7 +74,7 @@ def validate_submitted_manifest_file(
        manifest_file_path = os.path.join(
             download_location, manifest_data["properties"]["name"]
         )
-        manifest_submitted_df = pd.read_csv(manifest_file_path)
+        manifest_submitted_df = read_csv(manifest_file_path)

         assert "entityId" in manifest_submitted_df.columns
         assert "Id" in manifest_submitted_df.columns
diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py
index 2178a83b8..3d42b13b5 100644
--- a/tests/integration/test_metadata_model.py
+++ b/tests/integration/test_metadata_model.py
@@ -23,6 +23,7 @@
 from synapseclient.models import File, Folder

 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import create_temp_folder
 from tests.conftest import Helpers, metadata_model
 from tests.utils import CleanupItem
@@ -531,7 +532,7 @@ def _submit_and_verify_manifest(
         )
         manifest_table = synapse_store.syn.tableQuery(
             f"select * from {expected_table_id}", downloadLocation=download_dir
-        ).asDataFrame()
+        ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)

         # AND the columns in the manifest table should reflect the ones in the file
         table_columns = manifest_table.columns
diff --git a/tests/test_api.py b/tests/test_api.py
index 1f2d79add..842210d5b 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -14,11 +14,11 @@
 from flask.testing import FlaskClient
 from opentelemetry import trace

-from schematic.configuration.configuration import Configuration
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder
-from schematic.configuration.configuration import CONFIG

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )

@@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )

diff --git a/tests/test_store.py b/tests/test_store.py
index c9f32bec2..e90908876 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -28,6 +28,7 @@
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.base import BaseStorage
 from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import check_synapse_cache_size, create_temp_folder
 from tests.conftest import Helpers
 from tests.utils import CleanupItem
@@ -1244,7 +1245,7 @@ async def copy_folder_and_update_manifest(
     table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
     days_to_follow_up = (
         synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-        .asDataFrame()
+        .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         .squeeze()
     )

@@ -1281,7 +1282,7 @@ async def copy_folder_and_update_manifest(
     table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
     days_to_follow_up = (
         synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-        .asDataFrame()
+        .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         .squeeze()
     )

@@ -1343,7 +1344,7 @@ async def test_upsert_table(
     # Query table for DaystoFollowUp column
     table_query = (
         synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-        .asDataFrame()
+        .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         .squeeze()
     )

@@ -1384,7 +1385,7 @@ async def test_upsert_table(
     table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
     table_query = (
         synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-        .asDataFrame()
+        .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         .squeeze()
     )

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2a0744439..f708e777b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -27,7 +27,7 @@
 )
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils
-from schematic.utils.df_utils import load_df
+from schematic.utils.df_utils import load_df, read_csv
 from schematic.utils.general import (
     calculate_datetime,
     check_synapse_cache_size,
@@ -473,7 +473,7 @@ def test_load_df(self, helpers, preserve_raw_input):
         test_col = "Check NA"
         file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv")

-        unprocessed_df = pd.read_csv(file_path, encoding="utf8")
+        unprocessed_df = read_csv(file_path, encoding="utf8")
         df = df_utils.load_df(
             file_path, preserve_raw_input=preserve_raw_input, data_model=False
         )
@@ -1100,7 +1100,7 @@ def test_convert_nan_entries_to_empty_strings(
         manifest_path = helpers.get_data_path(manifest)
         model_path = helpers.get_data_path(model)

-        ## Gather parmeters needed to run validate_manifest_rules
+        # Gather parameters needed to run validate_manifest_rules
         errors = []
         load_args = {
             "dtype": "string",
diff --git a/tests/test_viz.py b/tests/test_viz.py
index b94d79688..2ab78b9ce 100644
--- a/tests/test_viz.py
+++ b/tests/test_viz.py
@@ -1,11 +1,10 @@
 import json
 import logging
-import os
 from io import StringIO

-import pandas as pd
 import pytest

+from schematic.utils.df_utils import read_csv
 from schematic.visualization.attributes_explorer import AttributesExplorer
 from schematic.visualization.tangled_tree import TangledTree

@@ -44,7 +43,7 @@ class TestVisualization:
     def test_ae(self, helpers, attributes_explorer):
         attributes_str = attributes_explorer.parse_attributes(save_file=False)

-        df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])

         # For the attributes df define expected columns
         expect_col_names = [
@@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer):
             component=component, save_file=False, include_index=False
         )
         # convert to dataframe
-        component_attributes = pd.read_csv(StringIO(component_attributes_str))
+        component_attributes = read_csv(StringIO(component_attributes_str))

         # For the attributes df define expected columns
         expect_col_names = [
@@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree):
         # Get text for tangled tree.
         text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False)

-        df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])

         # Define expected text associated with 'Patient' and 'Imaging' tree
         expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"]
diff --git a/tests/unit/test_df_utils.py b/tests/unit/test_df_utils.py
new file mode 100644
index 000000000..28db591a2
--- /dev/null
+++ b/tests/unit/test_df_utils.py
@@ -0,0 +1,49 @@
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+from pandas._libs.parsers import STR_NA_VALUES
+
+from schematic.utils.df_utils import read_csv
+
+
+class TestReadCsv:
+    def test_none_in_na_values(self) -> None:
+        # GIVEN a pandas DF that includes a column with a None value
+        df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]})
+
+        # AND None is included in the STR_NA_VALUES
+        if "None" not in STR_NA_VALUES:
+            STR_NA_VALUES.add("None")
+
+        # AND its CSV representation
+        csv_buffer = BytesIO()
+        df.to_csv(csv_buffer, index=False)
+        csv_buffer.seek(0)
+
+        # WHEN the CSV is read
+        result = read_csv(csv_buffer, na_values=STR_NA_VALUES)
+
+        # THEN the None string value is not preserved
+        assert not result.equals(df)
+        assert result["col1"][0] == "AAA"
+        assert result["col1"][1] == "BBB"
+        assert result["col1"][2] is np.nan
+
+    def test_none_not_in_na_values(self) -> None:
+        # GIVEN a pandas DF that includes a column with a None value
+        df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]})
+
+        # AND its CSV representation
+        csv_buffer = BytesIO()
+        df.to_csv(csv_buffer, index=False)
+        csv_buffer.seek(0)
+
+        # WHEN the CSV is read
+        result = read_csv(csv_buffer)
+
+        # THEN the None string value is preserved
+        assert result.equals(df)
+        assert result["col1"][0] == "AAA"
+        assert result["col1"][1] == "BBB"
+        assert result["col1"][2] == "None"
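Taken together, the patch routes every CSV read through the new `read_csv` wrapper (or passes `na_values=STR_NA_VALUES_FILTERED, keep_default_na=False` to `asDataFrame`), so the literal string "None" survives parsing while pandas' other default NA sentinels ("NA", "N/A", the empty string, etc.) are still treated as missing. A minimal sketch of the resulting behavior, illustrative only and not part of the patch:

    from io import StringIO

    import pandas as pd

    from schematic.utils.df_utils import read_csv

    csv_text = "col1\nAAA\nNone\nNA\n"

    # Stock pandas parsing: "None" and "NA" both collapse to NaN.
    stock_df = pd.read_csv(StringIO(csv_text))
    assert stock_df["col1"].isna().sum() == 2

    # The wrapper defaults to keep_default_na=False with
    # na_values=STR_NA_VALUES_FILTERED (pandas' default NA set minus "None"),
    # so "None" is kept as a string while "NA" still parses as missing.
    patched_df = read_csv(StringIO(csv_text))
    assert patched_df["col1"].tolist()[:2] == ["AAA", "None"]
    assert pd.isna(patched_df["col1"][2])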