def is_identified_sample(is_identified: Union[str, bool]) -> bool:
    """
    Convert an 'is_identified' value into a boolean.

    The value is either a boolean (True / False) or one of the strings
    "identified" / "deidentified".

    :param is_identified: A boolean flag (builtin or numpy boolean), or the string
                          "identified" or "deidentified" (exact, case-sensitive match)
    :return: True for True / "identified", False for False / "deidentified"
    :raises ValueError: If a string other than "identified" or "deidentified" is given
    :raises TypeError: If the value is neither a boolean nor a string
    """
    # numpy.bool_ is not a subclass of the builtin bool, so isinstance(x, bool)
    # alone would send numpy booleans to the TypeError branch below;
    # pd.api.types.is_bool recognises both kinds
    if pd.api.types.is_bool(is_identified):
        return bool(is_identified)

    if isinstance(is_identified, str):
        if is_identified not in ("identified", "deidentified"):
            error_message = f"Got '{is_identified}', expected one of 'identified' or 'deidentified'"
            logger.error(error_message)
            # Carry the message on the exception too, so it is visible in the traceback
            raise ValueError(error_message)
        return is_identified == "identified"

    error_message = f"Did not expect type {type(is_identified)} for is_identified input"
    logger.error(error_message)
    raise TypeError(error_message)
x.is_identified + input_df["is_identified"] = input_df.apply(lambda x: is_identified_sample(x.is_identified) if hasattr(x, "is_identified") else False, axis="columns")