Commit af4683c

Wrap pandas functions to support not including None with the NA values argument

BryanFauble committed Nov 22, 2024
1 parent 14aa510 commit af4683c
Showing 13 changed files with 126 additions and 37 deletions.
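The change targets a pandas default worth spelling out: pd.read_csv treats the literal string "None" as a missing value, so a manifest cell containing "None" silently becomes NaN on read. A minimal standalone sketch of the before/after behavior (the CSV content is illustrative, not taken from this repository):

    from io import StringIO

    import pandas as pd
    from pandas._libs.parsers import STR_NA_VALUES

    CSV = "Component,Status\nPatient,None\n"

    # Default NA filtering: the "None" cell is parsed as NaN.
    default_df = pd.read_csv(StringIO(CSV))
    print(default_df["Status"].isna().tolist())  # [True]

    # Filtered NA list: "None" survives as a string; the other sentinels
    # ("", "NA", "NaN", ...) are still parsed as missing.
    na_values = set(STR_NA_VALUES) - {"None"}
    filtered_df = pd.read_csv(StringIO(CSV), na_values=na_values, keep_default_na=False)
    print(filtered_df["Status"].tolist())  # ['None']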
3 changes: 2 additions & 1 deletion schematic/models/validate_attribute.py

@@ -17,6 +17,7 @@
 from schematic.schemas.data_model_graph import DataModelGraphExplorer
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.validate_rules_utils import validation_rule_info
 from schematic.utils.validate_utils import (
     comma_separated_list_regex,

@@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
             entity: File = self.synStore.getDatasetManifest(
                 datasetId=dataset_id, downloadFile=True
             )
-            manifests.append(pd.read_csv(entity.path))
+            manifests.append(read_csv(entity.path))
         return dict(zip(manifest_ids, manifests))

     def get_target_manifests(
3 changes: 2 additions & 1 deletion schematic/store/database/synapse_database_wrapper.py

@@ -8,6 +8,7 @@
 from opentelemetry import trace

 from schematic.store.synapse_tracker import SynapseEntityTracker
+from schematic.utils.df_utils import read_csv


 class SynapseTableNameError(Exception):

@@ -108,7 +109,7 @@ def execute_sql_query(
             pandas.DataFrame: The queried table
         """
         result = self.execute_sql_statement(query, include_row_data)
-        table = pandas.read_csv(result.filepath)
+        table = read_csv(result.filepath)
         return table

     def execute_sql_statement(
21 changes: 17 additions & 4 deletions schematic/store/synapse.py

@@ -58,7 +58,12 @@
 from schematic.store.base import BaseStorage
 from schematic.store.database.synapse_database import SynapseDatabase
 from schematic.store.synapse_tracker import SynapseEntityTracker
-from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
+from schematic.utils.df_utils import (
+    STR_NA_VALUES_FILTERED,
+    col_in_dataframe,
+    load_df,
+    update_df,
+)

 # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
 # Please do not remove these import statements

@@ -401,7 +406,7 @@ def query_fileview(
         try:
             self.storageFileviewTable = self.syn.tableQuery(
                 query=self.fileview_query,
-            ).asDataFrame()
+            ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         except SynapseHTTPError as exc:
             exception_text = str(exc)
             if "Unknown column path" in exception_text:

@@ -1433,7 +1438,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
         """

         results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
-        df = results.asDataFrame(rowIdAndVersionInIndex=False)
+        df = results.asDataFrame(
+            rowIdAndVersionInIndex=False,
+            na_values=STR_NA_VALUES_FILTERED,
+            keep_default_na=False,
+        )

         return df, results

@@ -3485,7 +3494,11 @@ def query(self, tidy=True, force=False):
         if self.table is None or force:
             fileview_id = self.view_schema["id"]
             self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
-            self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
+            self.table = self.results.asDataFrame(
+                rowIdAndVersionInIndex=False,
+                na_values=STR_NA_VALUES_FILTERED,
+                keep_default_na=False,
+            )
         if tidy:
             self.tidy_table()
         return self.table
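Taken together, a hedged sketch of how a Synapse table read now behaves end to end; the table ID "syn123" is a placeholder, the login call assumes locally cached credentials, and the asDataFrame keyword arguments are the same ones this commit passes through to pandas:

    import synapseclient

    from schematic.utils.df_utils import STR_NA_VALUES_FILTERED

    syn = synapseclient.Synapse()
    syn.login()  # assumes cached credentials

    results = syn.tableQuery("SELECT * FROM syn123")  # placeholder table ID
    df = results.asDataFrame(
        rowIdAndVersionInIndex=False,
        na_values=STR_NA_VALUES_FILTERED,  # pandas defaults minus "None"
        keep_default_na=False,
    )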
34 changes: 28 additions & 6 deletions schematic/utils/df_utils.py

@@ -4,17 +4,41 @@

 import logging
 from copy import deepcopy
-from time import perf_counter
-from typing import Union, Any, Optional
+from datetime import datetime
+from time import perf_counter
+from typing import Any, Optional, Union

 import dateparser as dp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from pandarallel import pandarallel  # type: ignore
+from pandas._libs.parsers import STR_NA_VALUES
+
+STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)
+
+try:
+    STR_NA_VALUES_FILTERED.remove("None")
+except KeyError:
+    pass

 logger = logging.getLogger(__name__)


+def read_csv(
+    path_or_buffer: str, keep_default_na=False, encoding="utf8", **load_args: Any
+) -> pd.DataFrame:
+    na_values = load_args.pop(
+        "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None
+    )
+    return pd.read_csv(  # type: ignore
+        path_or_buffer,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        encoding=encoding,
+        **load_args,
+    )
+
+
 def load_df(
     file_path: str,
     preserve_raw_input: bool = True,

@@ -45,9 +69,7 @@ def load_df(
     t_load_df = perf_counter()

     # Read CSV to df as type specified in kwargs
-    org_df = pd.read_csv(  # type: ignore
-        file_path, keep_default_na=True, encoding="utf8", **load_args
-    )
+    org_df = read_csv(file_path, encoding="utf8", **load_args)  # type: ignore
     if not isinstance(org_df, pd.DataFrame):
         raise ValueError(
             (
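The new wrapper is a drop-in replacement for pd.read_csv; a short usage sketch against one of the repository's example manifests:

    from schematic.utils.df_utils import read_csv

    # "None" cells come back as literal strings rather than NaN.
    manifest = read_csv("tests/data/example.Patient.manifest.csv")

    # Callers can opt back into the full pandas defaults per call:
    raw = read_csv("tests/data/example.Patient.manifest.csv", keep_default_na=True)

Note the na_values precedence in the wrapper: an explicit na_values argument wins; otherwise the filtered set is used, unless keep_default_na=True, in which case the wrapper defers entirely to pandas' defaults.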
3 changes: 2 additions & 1 deletion schematic_api/api/routes.py

@@ -20,6 +20,7 @@
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.synapse import ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder, entity_type_mapping
 from schematic.utils.schema_utils import (
     DisplayLabelType,

@@ -178,7 +179,7 @@ def parse_bool(str_bool):


 def return_as_json(manifest_local_file_path):
-    manifest_csv = pd.read_csv(manifest_local_file_path)
+    manifest_csv = read_csv(manifest_local_file_path)
     manifest_json = manifest_csv.to_dict(orient="records")
     return manifest_json
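For illustration, the JSON shape return_as_json produces (the path and values here are hypothetical):

    manifest_csv = read_csv("manifest.csv")  # hypothetical local file
    manifest_json = manifest_csv.to_dict(orient="records")
    # -> [{"Component": "Patient", "Status": "None"}, ...], one dict per row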
12 changes: 6 additions & 6 deletions tests/integration/test_commands.py

@@ -4,16 +4,16 @@
 import uuid
 from io import BytesIO

+import numpy as np
 import pytest
 import requests
-from openpyxl import load_workbook
 from click.testing import CliRunner
-import pandas as pd
-import numpy as np
+from openpyxl import load_workbook

-from schematic.configuration.configuration import Configuration, CONFIG
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.manifest.commands import manifest
 from schematic.models.commands import model
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting

 LIGHT_BLUE = "FFEAF7F9"  # Required cell

@@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
             # command has no (python) errors, has exit code 0
             assert result.exit_code == 0

-            biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
-            patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
+            biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
+            patient_df = read_csv("tests/data/example.Patient.manifest.csv")

         # Remove created files:
         finally:
3 changes: 2 additions & 1 deletion tests/integration/test_manifest_submission.py

@@ -12,6 +12,7 @@

 from schematic.configuration.configuration import CONFIG
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting, Helpers
 from tests.utils import CleanupItem

@@ -73,7 +74,7 @@ def validate_submitted_manifest_file(
         manifest_file_path = os.path.join(
             download_location, manifest_data["properties"]["name"]
         )
-        manifest_submitted_df = pd.read_csv(manifest_file_path)
+        manifest_submitted_df = read_csv(manifest_file_path)
         assert "entityId" in manifest_submitted_df.columns
         assert "Id" in manifest_submitted_df.columns
3 changes: 2 additions & 1 deletion tests/integration/test_metadata_model.py

@@ -23,6 +23,7 @@
 from synapseclient.models import File, Folder

 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import create_temp_folder
 from tests.conftest import Helpers, metadata_model
 from tests.utils import CleanupItem

@@ -531,7 +532,7 @@ def _submit_and_verify_manifest(
         )
         manifest_table = synapse_store.syn.tableQuery(
             f"select * from {expected_table_id}", downloadLocation=download_dir
-        ).asDataFrame()
+        ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)

         # AND the columns in the manifest table should reflect the ones in the file
         table_columns = manifest_table.columns
8 changes: 4 additions & 4 deletions tests/test_api.py

@@ -14,11 +14,11 @@
 from flask.testing import FlaskClient
 from opentelemetry import trace

-from schematic.configuration.configuration import Configuration
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder
-from schematic.configuration.configuration import CONFIG

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

@@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )

@@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )
9 changes: 5 additions & 4 deletions tests/test_store.py

@@ -28,6 +28,7 @@
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.base import BaseStorage
 from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import check_synapse_cache_size, create_temp_folder
 from tests.conftest import Helpers
 from tests.utils import CleanupItem

@@ -1244,7 +1245,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1281,7 +1282,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1343,7 +1344,7 @@ async def test_upsert_table(
         # Query table for DaystoFollowUp column
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )

@@ -1384,7 +1385,7 @@ async def test_upsert_table(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
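A small note on the .squeeze() calls in these tests: a single-column query result collapses to a pandas Series, so assertions can compare values directly. A generic sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"DaystoFollowUp": [30, 60]})
    series = df.squeeze()  # single column -> Series([30, 60])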
6 changes: 3 additions & 3 deletions tests/test_utils.py

@@ -27,7 +27,7 @@
 )
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils
-from schematic.utils.df_utils import load_df
+from schematic.utils.df_utils import load_df, read_csv
 from schematic.utils.general import (
     calculate_datetime,
     check_synapse_cache_size,

@@ -473,7 +473,7 @@ def test_load_df(self, helpers, preserve_raw_input):
         test_col = "Check NA"
         file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv")

-        unprocessed_df = pd.read_csv(file_path, encoding="utf8")
+        unprocessed_df = read_csv(file_path, encoding="utf8")
         df = df_utils.load_df(
             file_path, preserve_raw_input=preserve_raw_input, data_model=False
         )

@@ -1100,7 +1100,7 @@ def test_convert_nan_entries_to_empty_strings(
         manifest_path = helpers.get_data_path(manifest)
         model_path = helpers.get_data_path(model)

-        ## Gather parmeters needed to run validate_manifest_rules
+        # Gather parmeters needed to run validate_manifest_rules
         errors = []
         load_args = {
             "dtype": "string",
9 changes: 4 additions & 5 deletions tests/test_viz.py

@@ -1,11 +1,10 @@
 import json
 import logging
-import os
 from io import StringIO

-import pandas as pd
 import pytest

+from schematic.utils.df_utils import read_csv
 from schematic.visualization.attributes_explorer import AttributesExplorer
 from schematic.visualization.tangled_tree import TangledTree

@@ -44,7 +43,7 @@ class TestVisualization:
     def test_ae(self, helpers, attributes_explorer):
         attributes_str = attributes_explorer.parse_attributes(save_file=False)

-        df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])

         # For the attributes df define expected columns
         expect_col_names = [

@@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer):
             component=component, save_file=False, include_index=False
         )
         # convert to dataframe
-        component_attributes = pd.read_csv(StringIO(component_attributes_str))
+        component_attributes = read_csv(StringIO(component_attributes_str))

         # For the attributes df define expected columns
         expect_col_names = [

@@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree):
         # Get text for tangled tree.
         text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False)

-        df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])

         # Define expected text associated with 'Patient' and 'Imaging' tree
         expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"]