diff --git a/Changelog.md b/Changelog.md index ef830b9..6f4a5f1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,19 @@ Changes in this log refer only to changes that make it to the 'main' branch. and For changes in deployment, please see the [deployment changelog](deploy/cttso-ica-to-pieriandx-cdk/Changelog.md) +## 2023-10-18 + +> Author: Alexis Lucattini +> Email: [Alexis.Lucattini@ummcr.org](mailto:alexis.lucattini@umccr.org) + +### Enhancement + +* Add portal run id to sequencer run attribute of PierianDx Case Accession (https://github.com/umccr/cttso-ica-to-pieriandx/pull/142) + * Resolves https://github.com/umccr/cttso-ica-to-pieriandx/issues/130 + +* Allow both sub_panel and subpanel as valid panel types (https://github.com/umccr/cttso-ica-to-pieriandx/pull/143) + * Resolves https://github.com/umccr/cttso-ica-to-pieriandx/issues/139 + ## 2023-07-10 > Author: Alexis Lucattini diff --git a/deploy/cttso-ica-to-pieriandx-cdk/Changelog.md b/deploy/cttso-ica-to-pieriandx-cdk/Changelog.md index be2bc18..64697ab 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/Changelog.md +++ b/deploy/cttso-ica-to-pieriandx-cdk/Changelog.md @@ -3,6 +3,27 @@ Changes in this log refer only to changes that make it to the 'main' branch and are nested under deploy/cttso-ica-to-pieriandx-cdk. +## 2023-10-18 + +> Author: Alexis Lucattini +> Email: [Alexis.Lucattini@umccr.org](mailto:alexis.lucattini@umccr.org) + +### Enhancements + +* Move to project owner / project name mapping logic (https://github.com/umccr/cttso-ica-to-pieriandx/pull/141) + * And restructure LIMS sheet + * Diagram also updated + * Resolves: + * https://github.com/umccr/cttso-ica-to-pieriandx/issues/131 + * https://github.com/umccr/cttso-ica-to-pieriandx/issues/132 + * https://github.com/umccr/cttso-ica-to-pieriandx/issues/134 + * https://github.com/umccr/cttso-ica-to-pieriandx/issues/135 + +* Add deleted sheet (https://github.com/umccr/cttso-ica-to-pieriandx/pull/140) + * All cases assigned to user ToBe Deleted, are moved to a separate sheet + + + ## 2023-08-13 > Author: Alexis Lucattini diff --git a/deploy/cttso-ica-to-pieriandx-cdk/README.md b/deploy/cttso-ica-to-pieriandx-cdk/README.md index d58bba5..a6880a2 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/README.md +++ b/deploy/cttso-ica-to-pieriandx-cdk/README.md @@ -44,8 +44,13 @@ new_headers = [ "in_portal", "in_redcap", "in_pieriandx", - "glims_is_validation", - "glims_is_research", + "glims_project_owner", + "glims_project_name", + "glims_panel", + "glims_sample_type", + "glims_is_identified", + "glims_default_snomed_term", + "glims_needs_redcap", "redcap_sample_type", "redcap_is_complete", "portal_wfr_id", @@ -58,11 +63,12 @@ new_headers = [ "pieriandx_case_accession_number", "pieriandx_case_creation_date", "pieriandx_case_identified", + "pieriandx_assignee", "pieriandx_panel_type", "pieriandx_sample_type", "pieriandx_workflow_id", "pieriandx_workflow_status", - "pieriandx_report_status", + "pieriandx_report_status" ] headers_df = pd.DataFrame(columns=new_headers) @@ -87,35 +93,14 @@ print(new_spread.url) ## ctTSO LIMS Decision Tree -* Sample Types are determined by Google LIMS and RedCap - * If the ProjectName column in Google LIMS is set to _Validation_ or _Control_. - * Validation Sample goes through Validation Lambda - * If the Workflow column in Google LIMS is set to _Research_ AND Sample does not exist in RedCap - * Validation Sample goes through Validation Lambda - * If the Sample Type is Validation in RedCap - * Validation Sample goes through RedCap Lambda (but with SampleType set to Validation) - - * If Sample Type is PatientCare in RedCap - * Patient Care Sample goes through RedCap Lambda (with SampleType set to PatientCare) - - * If none of the above is true - * We assume this is a patient sample that is not in RedCap yet and hold off on running sample. - -* Panel Types are coupled to the Sample Type - * If the Sample is a Validation Sample - * Panel Type will always be _MAIN_ - * If the Sample is a Clinical Sample - * Panel Type will always be _SUBPANEL_ - -The following diagram(s) may be of assistance +Please see [#validation-or-clinical-script](#validation-or-clinical-script) for more information. ### Overview -![images/overview.drawio.png](images/overview.drawio.png) +> The following diagram may be of assistance -### Choose Launch Pathway +![images/overview.drawio.png](images/overview.drawio.png) -![images/choose-launch-pathway.drawio.png](images/choose-launch-pathway.drawio.png) ## Helpful scripts @@ -253,7 +238,7 @@ Now change to the deployment directory (the directory this readme is in) cd deploy/cttso-ica-to-pieriandx-cdk ``` -### Wake up lamdas! +### Wake up lambdas! Before we launch any payloads, let's ensure that the lambda (and any downstream lambdas) are active. @@ -282,6 +267,106 @@ Find the workflow with the subject id and library id of interest in the workflow Use the Google LIMS page to check if you're sample is a validation sample (ProjectName field is either _control_ or _validation_). Validation samples do not go through the subpanel pipeline, clinical samples go through the subpanel pipeline. +We use the following JSON logic to determine the pathway for each pieriandx sample based on it's project owner + +This file can be found in `project-name-to-pieriandx-mapping.json`. + +The mapping can be updated with the script `update_project_name_mapping.sh`. + +This ssm parameter is NOT part of the cdk stack and MUST be updated using the script above. + +```json +[ + { + "project_owner": "VCCC", + "project_name": "PO", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term":null + }, + { + "project_owner": "Grimmond", + "project_name": "COUMN", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "Tothill", + "project_name": "CUP", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": "Disseminated malignancy of unknown primary" + }, + { + "project_owner": "Tothill", + "project_name": "PPGL", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "TJohn", + "project_name": "MESO", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "TJohn", + "project_name": "OCEANiC", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": null + }, + { + "project_owner": "*", + "project_name": "SOLACE2", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "SLuen", + "project_name": "IMPARP", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "UMCCR", + "project_name": "Control", + "panel": "main", + "sample_type": "validation", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "UMCCR", + "project_name": "QAP", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "*", + "project_name": "*", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + } +] +``` ### Creating the input payloads file diff --git a/deploy/cttso-ica-to-pieriandx-cdk/constants.ts b/deploy/cttso-ica-to-pieriandx-cdk/constants.ts index c3fba5c..7d19e1c 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/constants.ts +++ b/deploy/cttso-ica-to-pieriandx-cdk/constants.ts @@ -44,3 +44,6 @@ export const SSM_TOKEN_REFRESH_LAMBDA_FUNCTION_ARN_VALUE: string = "token-refres // Output things export const SSM_LAMBDA_FUNCTION_ARN_VALUE: string = "cttso-ica-to-pieriandx-lambda-function" + +// Project Owner mapping path +export const SSM_PROJECT_NAME_TO_PIERIANDX_CONFIG_SSM_PATH: string = "cttso-lims-project-name-to-pieriandx-mapping" diff --git a/deploy/cttso-ica-to-pieriandx-cdk/images/choose-launch-pathway.drawio.png b/deploy/cttso-ica-to-pieriandx-cdk/images/choose-launch-pathway.drawio.png deleted file mode 100644 index ae452b3..0000000 Binary files a/deploy/cttso-ica-to-pieriandx-cdk/images/choose-launch-pathway.drawio.png and /dev/null differ diff --git a/deploy/cttso-ica-to-pieriandx-cdk/images/overview.drawio.png b/deploy/cttso-ica-to-pieriandx-cdk/images/overview.drawio.png index 355a8d8..ff2bfcb 100644 Binary files a/deploy/cttso-ica-to-pieriandx-cdk/images/overview.drawio.png and b/deploy/cttso-ica-to-pieriandx-cdk/images/overview.drawio.png differ diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_defaults_and_launch_validation_workflow/lambda_code.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_defaults_and_launch_validation_workflow/lambda_code.py index 17dc4db..baca52b 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_defaults_and_launch_validation_workflow/lambda_code.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_defaults_and_launch_validation_workflow/lambda_code.py @@ -44,7 +44,9 @@ def lambda_handler(event, context): "library_id": "L1234567", "case_accession_number": "SBJID_LIBID_123", "ica_workflow_run_id": "wfr.123abc", - "panel_type": "main" + "panel_type": "main", + "sample_type": "validation", + "is_identified": False } """ @@ -76,17 +78,30 @@ def lambda_handler(event, context): f"for subject id '{subject_id}' / library id '{library_id}'") raise ValueError + # Get Panel Type (or get default if none + panel_type: str + if (panel_type := event.get("panel_type", None)) is None: + panel_type = VALIDATION_DEFAULTS["panel_type"] + + # Get is identified + sample_type: str + if (sample_type := event.get("sample_type", None)) is None: + sample_type = VALIDATION_DEFAULTS["sample_type"] + + # Get is identified + is_identified: str + if (is_identified := event.get("is_identified", None)) is None: + is_identified = VALIDATION_DEFAULTS["is_identified"] + # Update sample_df with validation defaults - sample_df["sample_type"] = VALIDATION_DEFAULTS["sample_type"] + sample_df["sample_type"] = sample_type + sample_df["panel_type"] = panel_type + sample_df["is_identified"] = is_identified sample_df["indication"] = VALIDATION_DEFAULTS["indication"] sample_df["disease_id"] = VALIDATION_DEFAULTS["disease_id"] sample_df["disease_name"] = VALIDATION_DEFAULTS["disease_name"] - sample_df["is_identified"] = VALIDATION_DEFAULTS["is_identified"] sample_df["requesting_physicians_first_name"] = VALIDATION_DEFAULTS["requesting_physicians_first_name"] sample_df["requesting_physicians_last_name"] = VALIDATION_DEFAULTS["requesting_physicians_last_name"] - sample_df["first_name"] = VALIDATION_DEFAULTS["first_name"] - sample_df["last_name"] = VALIDATION_DEFAULTS["last_name"] - sample_df["date_of_birth"] = VALIDATION_DEFAULTS["date_of_birth"] sample_df["specimen_type"] = VALIDATION_DEFAULTS["specimen_type"] sample_df["date_accessioned"] = VALIDATION_DEFAULTS["date_accessioned"] sample_df["date_collected"] = VALIDATION_DEFAULTS["date_collected"] @@ -124,18 +139,11 @@ def lambda_handler(event, context): sample_df["accession_number"] = case_accession_number sample_df["date_accessioned"] = datetime_obj_to_utc_isoformat(CURRENT_TIME) - # Convert times to utc time and strings - for date_column in ["date_received", "date_collected", "date_of_birth"]: - sample_df[date_column] = sample_df[date_column].apply( - lambda x: datetime_obj_to_utc_isoformat(handle_date(x)) - ) - # Rename columns logger.info("Rename external subject and external sample columns") sample_df = sample_df.rename( columns={ "external_sample_id": "external_specimen_id", - "external_subject_id": "mrn" } ) @@ -148,6 +156,31 @@ def lambda_handler(event, context): axis="columns" ) + # For identified - we rename external subject id as the medical record number + if all(sample_df["is_identified"]): + sample_df["first_name"] = VALIDATION_DEFAULTS["first_name"] + sample_df["last_name"] = VALIDATION_DEFAULTS["last_name"] + sample_df["date_of_birth"] = VALIDATION_DEFAULTS["date_of_birth"] + sample_df = sample_df.rename( + columns={ + "external_subject_id": "mrn" + } + ) + # For deidentified - we rename the external subject id as the study subject identifier + else: + sample_df["study_identifier"] = sample_df["project_name"] + sample_df = sample_df.rename( + columns={ + "external_subject_id": "study_subject_identifier" + } + ) + + # Convert times to utc time and strings + for date_column in ["date_received", "date_collected", "date_of_birth"]: + sample_df[date_column] = sample_df[date_column].apply( + lambda x: datetime_obj_to_utc_isoformat(handle_date(x)) + ) + # Assert expected values exist logger.info("Check we have all of the expected information") for expected_column in EXPECTED_ATTRIBUTES: @@ -158,10 +191,6 @@ def lambda_handler(event, context): ) raise ValueError - if (panel_type := event.get("panel_type", None)) is None: - panel_type = VALIDATION_DEFAULTS["panel_type"].name.lower() - sample_df["panel_type"] = panel_type - # Launch batch lambda function accession_json: Dict = sample_df.to_dict(orient="records")[0] diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_redcap_and_launch_clinical_workflow/lambda_code.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_redcap_and_launch_clinical_workflow/lambda_code.py index 46631fc..4e0daff 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_redcap_and_launch_clinical_workflow/lambda_code.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/get_metadata_from_portal_and_redcap_and_launch_clinical_workflow/lambda_code.py @@ -58,6 +58,7 @@ def merge_clinical_redcap_and_portal_data(redcap_df: pd.DataFrame, portal_df: pd Whilst the portal dataframe contains the following columns: * subject_id * library_id + * project_name * external_sample_id * external_subject_id :param redcap_df: @@ -109,7 +110,9 @@ def lambda_handler(event, context): "case_accession_number": "SBJID_LIBID_123", "ica_workflow_run_id": "wfr.123abc", "allow_missing_redcap_entry": false, - "panel_type": "main" + "panel_type": "main", + "sample_type": "patient_care_sample", + "is_identified": true, } """ @@ -204,6 +207,17 @@ def lambda_handler(event, context): if (panel_type := event.get("panel_type", None)) is None: panel_type = CLINICAL_DEFAULTS["panel_type"].name.lower() + if (sample_type := event.get("sample_type", None)) is None: + sample_type = CLINICAL_DEFAULTS["sample_type"].name.lower() + + if (is_identified := event.get("is_identified", None)) is None: + is_identified = CLINICAL_DEFAULTS["is_identified"].name.lower() + + # Set panel type (if not null) + merged_df["panel_type"] = panel_type + merged_df["sample_type"] = sample_type + merged_df["is_identified"] = is_identified + # Check length if merged_df.shape[0] == 0: logger.error("PierianDx metadata was not 'Complete', exiting") @@ -236,7 +250,6 @@ def lambda_handler(event, context): # Set defaults merged_df["specimen_type"] = CLINICAL_DEFAULTS["specimen_type"] - merged_df["is_identified"] = CLINICAL_DEFAULTS["is_identified"] merged_df["indication"] = CLINICAL_DEFAULTS["indication"] merged_df["hospital_number"] = CLINICAL_DEFAULTS["hospital_number"] merged_df["accession_number"] = case_accession_number @@ -252,8 +265,7 @@ def lambda_handler(event, context): logger.info("Rename external subject and external sample columns") merged_df = merged_df.rename( columns={ - "external_sample_id": "external_specimen_id", - "external_subject_id": "mrn" + "external_sample_id": "external_specimen_id" } ) @@ -268,18 +280,32 @@ def lambda_handler(event, context): raise ValueError # Step 7a - make up the 'identified' values (date_of_birth / first_name / last_name) - merged_df["date_of_birth"] = datetime_obj_to_utc_isoformat(CLINICAL_DEFAULTS["date_of_birth"]) - merged_df["first_name"] = merged_df.apply( - lambda x: CLINICAL_DEFAULTS["patient_name"][x.gender.lower()].split(" ")[0], - axis="columns" - ) - merged_df["last_name"] = merged_df.apply( - lambda x: CLINICAL_DEFAULTS["patient_name"][x.gender.lower()].split(" ")[-1], - axis="columns" - ) + # We set all but we only have one row (as asserted in the merge df) + if all(merged_df["is_identified"]): + merged_df["date_of_birth"] = datetime_obj_to_utc_isoformat(CLINICAL_DEFAULTS["date_of_birth"]) + merged_df["first_name"] = merged_df.apply( + lambda x: CLINICAL_DEFAULTS["patient_name"][x.gender.lower()].split(" ")[0], + axis="columns" + ) + merged_df["last_name"] = merged_df.apply( + lambda x: CLINICAL_DEFAULTS["patient_name"][x.gender.lower()].split(" ")[-1], + axis="columns" + ) + merged_df = merged_df.rename( + columns={ + "external_subject_id": "mrn" + } + ) + # Step 7b - for deidentified samples, use study_identified and study_subject_identifier + else: + merged_df["study_identifier"] = merged_df["project_name"] + merged_df = merged_df.rename( + columns={ + "external_subject_id": "study_subject_identifier" + } + ) - # Set panel type - merged_df["panel_type"] = panel_type + # Set is_identified # Step 7 - Launch batch lambda function accession_json: Dict = merged_df.to_dict(orient="records")[0] diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/launch_available_payloads_and_update_cttso_lims_sheet/lambda_code.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/launch_available_payloads_and_update_cttso_lims_sheet/lambda_code.py index 17afd73..a41cad6 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/launch_available_payloads_and_update_cttso_lims_sheet/lambda_code.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/launch_available_payloads_and_update_cttso_lims_sheet/lambda_code.py @@ -50,11 +50,11 @@ from lambda_utils.arns import get_validation_lambda_arn, get_clinical_lambda_arn from lambda_utils.aws_helpers import get_boto3_lambda_client, get_boto3_ssm_client, get_boto3_events_client from lambda_utils.gspread_helpers import \ - get_cttso_samples_from_glims, get_cttso_lims, update_cttso_lims_row, \ - append_df_to_cttso_lims + get_cttso_lims, update_cttso_lims_row, \ + append_df_to_cttso_lims, add_deleted_cases_to_deleted_sheet, get_deleted_lims_df, set_google_secrets from lambda_utils.logger import get_logger from lambda_utils.pieriandx_helpers import get_pieriandx_df, get_pieriandx_status_for_missing_sample -from lambda_utils.portal_helpers import get_portal_workflow_run_data_df +from lambda_utils.portal_helpers import get_portal_workflow_run_data_df, get_cttso_samples_from_limsrow_df from lambda_utils.redcap_helpers import get_full_redcap_data_df from lambda_utils.globals import \ PIERIANDX_LAMBDA_LAUNCH_FUNCTION_ARN_SSM_PATH, \ @@ -88,9 +88,14 @@ def merge_redcap_portal_and_glims_data(redcap_df, portal_df, glims_df) -> pd.Dat * subject_id * library_id * in_glims - * sequence_run_name - * glims_is_validation - * glims_is_research + * glims_illumina_id + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap :return: A pandas dataframe with the following columns: * subject_id * library_id @@ -104,8 +109,13 @@ def merge_redcap_portal_and_glims_data(redcap_df, portal_df, glims_df) -> pd.Dat * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap """ portal_redcap_df = pd.merge(portal_df, redcap_df, on=["subject_id", "library_id"], @@ -115,7 +125,7 @@ def merge_redcap_portal_and_glims_data(redcap_df, portal_df, glims_df) -> pd.Dat # aren't in the portal df glims_df = glims_df.rename( columns={ - "sequence_run_name": "portal_sequence_run_name" + "glims_illumina_id": "portal_sequence_run_name" } ) # Use portal_sequence_run_name to drop values from glims df that @@ -161,12 +171,18 @@ def get_libraries_for_processing(merged_df) -> pd.DataFrame: * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_submission_time * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date + * pieriandx_assignee * pieriandx_case_identified * pieriandx_panel_type * pieriandx_sample_type @@ -174,8 +190,10 @@ def get_libraries_for_processing(merged_df) -> pd.DataFrame: * subject_id * library_id * portal_wfr_id - * is_validation_sample - * is_research_sample + * panel + * sample_type + * is_identified + * needs_redcap * redcap_is_complete """ @@ -184,10 +202,10 @@ def get_libraries_for_processing(merged_df) -> pd.DataFrame: "subject_id", "library_id", "portal_wfr_id", - "glims_is_validation", - "glims_is_research", - "is_validation_sample", - "is_research_sample", + "panel", + "sample_type", + "is_identified", + "needs_redcap", "redcap_is_complete" ] @@ -218,11 +236,8 @@ def get_libraries_for_processing(merged_df) -> pd.DataFrame: " not redcap_is_complete.isnull() and redcap_is_complete.str.lower() == 'complete' " " ) or " " ( " - " not glims_is_validation.isnull() and glims_is_validation == True " - " ) or " - " ( " - " not glims_is_research.isnull() and glims_is_research == True " - " )" + " glims_needs_redcap == False " + " ) " " ) " ") ", engine="python" # Required for the isnull bit - https://stackoverflow.com/a/54099389/6946787 @@ -234,36 +249,24 @@ def get_libraries_for_processing(merged_df) -> pd.DataFrame: columns=processing_columns ) - # For validation sample to be set to true - # Must not be in redcap - # AND be in glims - # Redcap samples can be validation samples - # But samples not in redcap are processed through a different - # lambda endpoint - to_process_df["is_validation_sample"] = to_process_df.apply( - lambda x: True - if x.glims_is_validation is True - or ( - not pd.isnull(x.redcap_is_complete) - and x.redcap_is_complete.lower() == "complete" - and x.redcap_sample_type.lower() == "validation" - ) - else False, - axis="columns" - ) + # Update columns to strip glims_ attributes + new_column_names = [ + "panel", + "sample_type", + "is_identified", + "needs_redcap" + ] - to_process_df["is_research_sample"] = to_process_df.apply( - lambda x: True - if x.glims_is_research is True else False, - axis="columns" - ) + for column_name in new_column_names: + to_process_df[column_name] = to_process_df[f"glims_{column_name}"] + # Return subsetted dataframe return to_process_df[ processing_columns ] -def submit_library_to_pieriandx(subject_id: str, library_id: str, workflow_run_id: str, lambda_arn: str, panel_type: str): +def submit_library_to_pieriandx(subject_id: str, library_id: str, workflow_run_id: str, lambda_arn: str, panel_type: str, sample_type: str, is_identified: bool): """ Submit library to pieriandx :param subject_id: @@ -279,7 +282,9 @@ def submit_library_to_pieriandx(subject_id: str, library_id: str, workflow_run_i "subject_id": subject_id, "library_id": library_id, "ica_workflow_run_id": workflow_run_id, - "panel_type": panel_type + "panel_type": panel_type, + "sample_type": sample_type, + "is_identified": is_identified } logger.info(f"Launching lambda function {lambda_arn} with the following payload {json.dumps(lambda_payload)}") @@ -333,47 +338,47 @@ def submit_libraries_to_pieriandx(processing_df: pd.DataFrame) -> pd.DataFrame: * subject_id * library_id * portal_wfr_id - * is_validation_sample - * glims_is_validation - * glims_is_research - * is_research_sample + * panel + * sample_type + * is_identified + * needs_redcap * redcap_is_complete :return: + A pandas dataframe with the following columns + * subject_id + * library_id + * portal_wfr_id + * panel + * sample_type + * is_identified + * needs_redcap + * redcap_is_complete + * submission_succeeded + * submission_time """ # Get number of rows to submit num_submissions = processing_df.shape[0] if num_submissions > MAX_SUBMISSIONS_PER_LIMS_UPDATE_CYCLE: logger.info(f"Dropping submission number from {num_submissions} to {MAX_SUBMISSIONS_PER_LIMS_UPDATE_CYCLE}") - processing_df = processing_df.head(MAX_SUBMISSIONS_PER_LIMS_UPDATE_CYCLE) + processing_df = processing_df.head(n=MAX_SUBMISSIONS_PER_LIMS_UPDATE_CYCLE) # Validation df # Validation if is validation sample or IS research sample with no redcap information processing_df["submission_arn"] = processing_df.apply( lambda x: get_validation_lambda_arn() - if x.glims_is_validation is True - or ( - # Sample not in RedCap - ( - pd.isnull(x.redcap_is_complete) or - not x.redcap_is_complete.lower() == "complete" - ) and - # GLIMS Workflow is set to 'Research' - ( - x.glims_is_research - ) + if not x.needs_redcap and + ( + # Sample not in RedCap + ( + pd.isnull(x.redcap_is_complete) or + not x.redcap_is_complete.lower() == "complete" + ) ) else get_clinical_lambda_arn(), axis="columns" ) - processing_df["panel_type"] = processing_df.apply( - lambda x: "main" - if (x.is_validation_sample or x.is_research_sample) - else "subpanel", - axis="columns" - ) - processing_df["submission_succeeded"] = False for index, row in processing_df.iterrows(): @@ -386,7 +391,9 @@ def submit_libraries_to_pieriandx(processing_df: pd.DataFrame) -> pd.DataFrame: library_id=row.library_id, workflow_run_id=row.portal_wfr_id, lambda_arn=row.submission_arn, - panel_type=row.panel_type + panel_type=row.panel, + sample_type=row.sample_type, + is_identified=row.is_identified ) except ValueError: pass @@ -414,17 +421,31 @@ def append_to_cttso_lims(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, e * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee :param cttso_lims_df: A pandas dataframe with the following columns * subject_id * library_id - * in_redcap - * in_portal * in_glims - * glims_is_validation - * glims_is_research + * in_portal + * in_redcap + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -432,16 +453,17 @@ def append_to_cttso_lims(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, e * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run + * pieriandx_submission_time * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id * pieriandx_workflow_status * pieriandx_report_status - * pieriandx_report_signed_out - currently ignored :param excel_row_number_mapping_df: A pandas DataFrame with the following columns: * cttso_lims_index * excel_row_number @@ -561,6 +583,7 @@ def append_to_cttso_lims(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, e excel_row_number ) + if len(rows_to_append) == 0: return @@ -585,13 +608,19 @@ def get_pieriandx_incomplete_job_df_from_cttso_lims_df(cttso_lims_df: pd.DataFra :param cttso_lims_df: A pandas dataframe with the following columns: * subject_id * library_id - * glims_is_validation - * glims_is_research * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * portal_wfr_id * portal_wfr_end * portal_wfr_status @@ -611,13 +640,19 @@ def get_pieriandx_incomplete_job_df_from_cttso_lims_df(cttso_lims_df: pd.DataFra :return: A pandas DataFrame with the following columns: * subject_id * library_id - * glims_is_validation - * glims_is_research * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * portal_wfr_id * portal_wfr_end * portal_wfr_status @@ -674,6 +709,7 @@ def update_merged_df_with_processing_df(merged_df, processing_df) -> pd.DataFram * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -681,20 +717,35 @@ def update_merged_df_with_processing_df(merged_df, processing_df) -> pd.DataFram * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee :param processing_df: A pandas dataframe with the following columns * subject_id * library_id * portal_wfr_id - * is_validation_sample + * panel + * sample_type + * is_identified + * needs_redcap + * redcap_is_complete * submission_succeeded + * submission_time :return: A pandas dataframe with the following columns * subject_id * library_id * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -702,9 +753,18 @@ def update_merged_df_with_processing_df(merged_df, processing_df) -> pd.DataFram * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee + * pieriandx_submission_time """ # Set the pieriandx case id to these samples as 'pending' for index, row in processing_df.iterrows(): @@ -739,6 +799,14 @@ def update_pieriandx_job_status_missing_df(pieriandx_job_status_missing_df, merg * in_redcap * in_portal * in_glims + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -746,8 +814,6 @@ def update_pieriandx_job_status_missing_df(pieriandx_job_status_missing_df, merg * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date @@ -757,6 +823,14 @@ def update_pieriandx_job_status_missing_df(pieriandx_job_status_missing_df, merg * in_redcap * in_portal * in_glims + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -764,13 +838,9 @@ def update_pieriandx_job_status_missing_df(pieriandx_job_status_missing_df, merg * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research * pieriandx_case_id - * pieriandx_case_accession_number - * pieriandx_workflow_id - * pieriandx_workflow_status - * pieriandx_report_status + * pieriandx_case_creation_date + * pieriandx_assignee """ # We only want some columns from merged df @@ -788,11 +858,17 @@ def update_pieriandx_job_status_missing_df(pieriandx_job_status_missing_df, merg "portal_wfr_status", "portal_sequence_run_name", "portal_is_failed_run", - "glims_is_validation", - "glims_is_research", + "glims_project_owner", + "glims_project_name", + "glims_panel", + "glims_sample_type", + "glims_is_identified", + "glims_default_snomed_term", + "glims_needs_redcap", "pieriandx_submission_time", "pieriandx_case_id", - "pieriandx_case_creation_date" + "pieriandx_case_creation_date", + "pieriandx_assignee" ]] # We merge right since we only want jobs we've picked up in the incomplete jobs df @@ -819,20 +895,27 @@ def add_pieriandx_df_to_merged_df(merged_df: pd.DataFrame, pieriandx_df: pd.Data * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap :param pieriandx_df: A pandas dataframe with the following columns: * subject_id * library_id * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date + * pieriandx_assignee :return: A pandas dataframe with the following columns: * subject_id * library_id * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -840,11 +923,17 @@ def add_pieriandx_df_to_merged_df(merged_df: pd.DataFrame, pieriandx_df: pd.Data * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date + * pieriandx_assignee """ merged_df_with_pieriandx_df = pd.merge( merged_df, @@ -909,10 +998,8 @@ def add_pieriandx_df_to_merged_df(merged_df: pd.DataFrame, pieriandx_df: pd.Data # Drop cases in pieriandx where not found in redcap or glims and not found in portal merged_df_with_pieriandx_df = merged_df_with_pieriandx_df.query( "not ( " - " ( " - " in_glims == False and " - " in_redcap == False " - " ) and " + " in_glims == False and " + " in_redcap == False and " " in_portal == False and " " in_pieriandx == True" ")" @@ -967,8 +1054,17 @@ def update_cttso_lims(update_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, exce :param update_df: A pandas DataFrame with the following columns: * subject_id * library_id - * glims_is_validation - * glims_is_research + * in_redcap + * in_portal + * in_glims + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -976,22 +1072,23 @@ def update_cttso_lims(update_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, exce * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * pieriandx_submission_time * pieriandx_case_id - * pieriandx_case_accession_number * pieriandx_case_creation_date - * pieriandx_case_identified - * pieriandx_panel_type - * pieriandx_sample_type - * pieriandx_workflow_id - * pieriandx_workflow_status - * pieriandx_report_status - * pieriandx_report_signed_out - currently ignored + * pieriandx_assignee :param cttso_lims_df: A pandas DataFrame with the following columns: * subject_id * library_id - * glims_is_validation - * glims_is_research + * in_glims + * in_portal + * in_redcap + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -1004,12 +1101,12 @@ def update_cttso_lims(update_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, exce * pieriandx_case_accession_number * pieriandx_case_creation_date * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id * pieriandx_workflow_status * pieriandx_report_status - * pieriandx_report_signed_out - currently ignored :param excel_row_mapping_df: A pandas dataframe with the following columns: * cttso_lims_index * excel_row_number @@ -1053,6 +1150,7 @@ def update_cttso_lims(update_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, exce "pieriandx_case_id", "pieriandx_case_accession_number", "pieriandx_case_creation_date", + "pieriandx_assignee", "pieriandx_case_identified", "pieriandx_panel_type", "pieriandx_sample_type", @@ -1086,12 +1184,10 @@ def get_duplicate_case_ids(lims_df: pd.DataFrame) -> List: :param lims_df: A pandas dataframe with the following columns * subject_id * library_id - * in_glims - * in_portal * in_redcap + * in_portal + * in_glims * in_pieriandx - * glims_is_validation - * glims_is_research * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -1100,10 +1196,17 @@ def get_duplicate_case_ids(lims_df: pd.DataFrame) -> List: * portal_sequence_run_name * portal_is_failed_run * pieriandx_submission_time + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date - * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id @@ -1114,16 +1217,22 @@ def get_duplicate_case_ids(lims_df: pd.DataFrame) -> List: * 'in_portal_lims', * 'in_redcap_lims', * 'in_pieriandx_lims', - * 'glims_is_validation_lims', - * 'glims_is_research_lims', * 'redcap_sample_type_lims', * 'redcap_is_complete_lims', + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * 'portal_wfr_end_lims', * 'portal_wfr_status_lims', * 'portal_sequence_run_name_lims', * 'portal_is_failed_run_lims', * 'pieriandx_case_accession_number_lims', * 'pieriandx_case_creation_date_lims' + * 'pieriandx_assignee_lims' :return: """ @@ -1146,6 +1255,16 @@ def get_duplicate_case_ids(lims_df: pd.DataFrame) -> List: # Single unique row - nothing to see here continue + # Any cases with an assignee should be tracked + # So drop any accession that have an assignment + mini_df = mini_df.query( + "pieriandx_assignee.isnull()", + engine="python" + ) + # No cases without assignment + if mini_df.shape[0] == 0: + continue + # Check we don't have duplicate pieriandx case ids if not len(mini_df["pieriandx_case_id"].unique()) == mini_df.shape[0]: logger.info(f"Got duplicates pieriandx case ids " @@ -1216,6 +1335,8 @@ def get_duplicate_case_ids(lims_df: pd.DataFrame) -> List: for case_id in case_ids_to_remove if not pd.isnull(case_id)] + print(case_ids_to_remove) + return case_ids_to_remove @@ -1236,6 +1357,7 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * in_redcap * in_portal * in_glims + * in_pieriandx * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -1243,11 +1365,17 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * portal_wfr_status * portal_sequence_run_name * portal_is_failed_run - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date + * pieriandx_assignee :param cttso_lims_df: A pandas DataFrame with the following columns: * subject_id * library_id @@ -1255,8 +1383,13 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * in_portal * in_redcap * in_pieriandx - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -1269,6 +1402,7 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * pieriandx_case_accession_number * pieriandx_case_creation_date * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id @@ -1279,23 +1413,30 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * excel_row_number :return: ( A pandas DataFrame with the following columns: - * subject_id - * library_id - * in_redcap - * in_portal - * in_glims - * redcap_sample_type - * redcap_is_complete - * portal_wfr_id - * portal_wfr_end - * portal_wfr_status - * portal_sequence_run_name - * portal_is_failed_run - * glims_is_validation - * glims_is_research - * pieriandx_case_id - * pieriandx_case_accession_number - * pieriandx_case_creation_date + * subject_id + * library_id + * in_redcap + * in_portal + * in_glims + * in_pieriandx + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee , A pandas DataFrame with the following columns: * subject_id @@ -1304,8 +1445,13 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * in_portal * in_redcap * in_pieriandx - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -1318,6 +1464,7 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, * pieriandx_case_accession_number * pieriandx_case_creation_date * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id @@ -1337,7 +1484,9 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, how="outer", suffixes=("", "_lims") ) - merged_lims_df = bind_pieriandx_case_submission_time_to_merged_df(merged_lims_df, cttso_lims_df) + # We might have needed to reset the lims db + if not cttso_lims_df.shape[0] == 0: + merged_lims_df = bind_pieriandx_case_submission_time_to_merged_df(merged_lims_df, cttso_lims_df) # Get list of case ids to drop case_ids_to_remove: List = get_duplicate_case_ids(merged_lims_df) @@ -1384,7 +1533,10 @@ def cleanup_duplicate_rows(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, excel_row_number_mapping_df: pd.DataFrame cttso_lims_df, excel_row_number_mapping_df = get_cttso_lims() - merged_df_dedup = bind_pieriandx_case_submission_time_to_merged_df(merged_df_dedup, cttso_lims_df) + if not cttso_lims_df.shape[0] == 0: + merged_df_dedup = bind_pieriandx_case_submission_time_to_merged_df(merged_df_dedup, cttso_lims_df) + else: + merged_df_dedup["pieriandx_submission_time"] = pd.NA return merged_df_dedup, cttso_lims_df, excel_row_number_mapping_df @@ -1429,13 +1581,125 @@ def get_pieriandx_case_id_from_merged_df_for_pending_case(cttso_lims_series, mer return pieriandx_case_id -def bind_pieriandx_case_submission_time_to_merged_df(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame): - # Split lims submission into two - # Rows with a valid pieriandx case accession id - # Rows that do not have a valid pieriandx case accession id - # We bind first set on pieriandx_case_id - simples - # For remaining merged df (we try bind on 'pending') - # If we still have remaining, we set pieriandx submission time as day of creation +def bind_pieriandx_case_submission_time_to_merged_df(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame) -> pd.DataFrame: + """ + plit lims submission into two + Rows with a valid pieriandx case accession id + Rows that do not have a valid pieriandx case accession id + We bind first set on pieriandx_case_id - simples + For remaining merged df (we try bind on 'pending') + If we still have remaining, we set pieriandx submission time as day of creation + :param merged_df: A pandas dataframe with the following columns: + * subject_id + * library_id + * in_redcap + * in_portal + * in_glims + * in_pieriandx + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee + :param cttso_lims_df: A pandas dataframe with the following columns: + * subject_id + * library_id + * in_glims + * in_portal + * in_redcap + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * pieriandx_submission_time + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_case_identified + * pieriandx_assignee + * pieriandx_panel_type + * pieriandx_sample_type + * pieriandx_workflow_id + * pieriandx_workflow_status + * pieriandx_report_status + :return: A pandas DataFrame with the following columns: + * subject_id + * library_id + * in_redcap + * in_portal + * in_glims + * in_pieriandx + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * pieriandx_submission_time + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee + * pieriandx_panel_type + * pieriandx_sample_type + * pieriandx_workflow_id + * pieriandx_workflow_status + * pieriandx_report_status + # Duplicate columns + * 'in_glims_lims', + * 'in_portal_lims', + * 'in_redcap_lims', + * 'in_pieriandx_lims', + * 'redcap_sample_type_lims', + * 'redcap_is_complete_lims', + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * 'portal_wfr_end_lims', + * 'portal_wfr_status_lims', + * 'portal_sequence_run_name_lims', + * 'portal_is_failed_run_lims', + * 'pieriandx_case_accession_number_lims', + * 'pieriandx_case_creation_date_lims' + * 'pieriandx_assignee_lims' + """ + cttso_lims_df_valid_merge = cttso_lims_df.query( "not pieriandx_case_id.isnull() and " "not pieriandx_submission_time.isnull() " @@ -1519,7 +1783,7 @@ def bind_pieriandx_case_submission_time_to_merged_df(merged_df: pd.DataFrame, ct # All actions are the same - just logging is different if pd.isnull(pieriandx_case_submission_time): logger.info("Pieriandx Case Submission time is null, we just merge as null") - elif pieriandx_case_submission_time < one_week_ago: + elif pd.to_datetime(pieriandx_case_submission_time).tz_localize("UTC") < one_week_ago: logger.info("Case pending for over one week, this case will be resubmitted") else: logger.info("Case pending for less than one week, please resubmit manually") @@ -1597,6 +1861,126 @@ def bind_pieriandx_case_submission_time_to_merged_df(merged_df: pd.DataFrame, ct return merged_lims_df +def drop_to_be_deleted_cases(merged_df: pd.DataFrame, cttso_lims_df: pd.DataFrame, excel_row_mapping_number_df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): + """ + Cases that have been assigned to ToBeDeleted need to be dropped from row list + and instead attached to a new sheet + :param merged_df: + * subject_id + * library_id + * in_redcap + * in_portal + * in_glims + * in_pieriandx + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_assignee + :param cttso_lims_df: + A pandas DataFrame with the following columns: + * subject_id + * library_id + * in_glims + * in_portal + * in_redcap + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * pieriandx_submission_time + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_case_identified + * pieriandx_assignee + * pieriandx_panel_type + * pieriandx_sample_type + * pieriandx_workflow_id + * pieriandx_workflow_status + * pieriandx_report_status + * pieriandx_report_signed_out - currently ignored + :param excel_row_mapping_number_df: + A pandas DataFrame with the following columns: + * cttso_lims_index + * excel_row_number + :return: + """ + # Split cttso lims df by query + to_be_deleted_cases_lims = cttso_lims_df.query("pieriandx_assignee == 'ToBeDeleted'") + to_be_deleted_cases_merged_df = merged_df.query("pieriandx_assignee == 'ToBeDeleted'") + + if to_be_deleted_cases_lims.shape[0] == 0 and to_be_deleted_cases_merged_df.shape[0] == 0: + logger.info("Nothing to transfer to delete pile") + return merged_df, cttso_lims_df, excel_row_mapping_number_df + + cttso_lims_df_cleaned = cttso_lims_df.query("pieriandx_assignee != 'ToBeDeleted'") + clean_cttso_case_ids_list = cttso_lims_df_cleaned["pieriandx_case_id"].tolist() + deleted_case_ids_list = to_be_deleted_cases_lims["pieriandx_case_id"].tolist() + + # Clean out merged df with existing deleted cases + # And any cases we're about to put into the deleted lims as well + deleted_lims_df, deleted_lims_excel_row_mapping_number = get_deleted_lims_df() + case_ids_to_remove_from_merged_df = list( + set( + deleted_lims_df["pieriandx_case_id"].tolist() + + to_be_deleted_cases_lims["pieriandx_case_id"].tolist() + ) + ) + + # If the case id is in both, we need to keep it, and have it reassigned + merged_df = merged_df.query( + "pieriandx_case_id.isnull() or " + "pieriandx_case_id not in @case_ids_to_remove_from_merged_df or " + "( " + " pieriandx_case_id in @clean_cttso_case_ids_list and " + " pieriandx_case_id in @deleted_case_ids_list " + ")", + engine="python" + ) + + # Update cttso lims sheet with replacement + append_df_to_cttso_lims(cttso_lims_df_cleaned, replace=True) + # Wait for doc population + sleep(10) + + # Collect new values + cttso_lims_df: pd.DataFrame + excel_row_number_mapping_df: pd.DataFrame + cttso_lims_df, excel_row_number_mapping_df = get_cttso_lims() + + # Update deleted sheet - note we only add in the cases that are in the LIMS - + # cases in merged_df will need to be updated into LIMS first THEN pulled out of LIMS in the next iteration of this + # lambda script + add_deleted_cases_to_deleted_sheet(to_be_deleted_cases_lims) + + return merged_df, cttso_lims_df, excel_row_number_mapping_df + + def lambdas_awake() -> bool: """ Go through the lambdas that are required for this service and make sure that they're all awake @@ -1699,12 +2083,15 @@ def lambda_handler(event, context): :param context: :return: """ + # Set GLIMS Secrets env vars + set_google_secrets() + # Get raw data values redcap_df: pd.DataFrame = get_full_redcap_data_df() redcap_df["in_redcap"] = True portal_df: pd.DataFrame = get_portal_workflow_run_data_df() portal_df["in_portal"] = True - glims_df: pd.DataFrame = get_cttso_samples_from_glims() + glims_df: pd.DataFrame = get_cttso_samples_from_limsrow_df() glims_df["in_glims"] = True pieriandx_df: pd.DataFrame = get_pieriandx_df() pieriandx_df["in_pieriandx"] = True @@ -1720,6 +2107,10 @@ def lambda_handler(event, context): excel_row_number_mapping_df: pd.DataFrame cttso_lims_df, excel_row_number_mapping_df = get_cttso_lims() + # Clean out to-be-deleted cases + merged_df, cttso_lims_df, excel_row_number_mapping_df = \ + drop_to_be_deleted_cases(merged_df, cttso_lims_df, excel_row_number_mapping_df) + merged_df, cttso_lims_df, excel_row_number_mapping_df = \ cleanup_duplicate_rows(merged_df, cttso_lims_df, excel_row_number_mapping_df) @@ -1781,6 +2172,7 @@ def lambda_handler(event, context): "Please fix this manually, stopping all further submissions then reenable the event bridge" ) disable_event_rule() + sys.exit(1) # Launch payloads for pieriandx_df samples that have no case id - if existent @@ -1803,4 +2195,4 @@ def lambda_handler(event, context): ## LOCAL DEBUG ONLY ## # if __name__ == "__main__": -# lambda_handler(None, None) +# lambda_handler(None, None) diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/async_lambda_functions.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/async_lambda_functions.py index 04e3c04..697beaa 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/async_lambda_functions.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/async_lambda_functions.py @@ -79,6 +79,7 @@ async def async_get_metadata_information_from_portal(subject_id: str, library_id :return: A pandas DataFrame with the following columns: * subject_id * library_id + * project_name * external_sample_id * external_subject_id """ diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/globals.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/globals.py index 5a17c6d..ad86b7c 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/globals.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/globals.py @@ -29,9 +29,13 @@ class SampleType(Enum): PORTAL_API_BASE_URL_SSM_PATH = "/data_portal/backend/api_domain_name" PORTAL_METADATA_ENDPOINT = "https://{PORTAL_API_BASE_URL}/iam/metadata/" PORTAL_WORKFLOWS_ENDPOINT = "https://{PORTAL_API_BASE_URL}/iam/workflows" +PORTAL_LIMSROW_ENDPOINT = "https://{PORTAL_API_BASE_URL}/iam/lims" PORTAL_SEQUENCE_RUNS_ENDPOINT = "https://{PORTAL_API_BASE_URL}/iam/sequencerun" PORTAL_MAX_ROWS_PER_PAGE = 100 -PORTAL_CTTSO_TYPE_NAME = "tso_ctdna_tumor_only" +PORTAL_CTTSO_WORKFLOW_TYPE_NAME = "tso_ctdna_tumor_only" +PORTAL_CTTSO_SAMPLE_TYPE = "ctDNA" +PORTAL_CTTSO_SAMPLE_ASSAY = "ctTSO" +PORTAL_CTTSO_SAMPLE_PHENOTYPE = "tumor" PORTAL_WORKFLOW_ORDERING = "-start" # We generally want the latest GOOGLE_LIMS_AUTH_JSON_SSM_PARAMETER_PATH = "/umccr/google/drive/lims_service_account_json" @@ -55,6 +59,8 @@ class SampleType(Enum): PIERIANDX_LAMBDA_LAUNCH_FUNCTION_ARN_SSM_PATH = "cttso-ica-to-pieriandx-lambda-function" +LIMS_PROJECT_NAME_MAPPING_SSM_PATH = "cttso-lims-project-name-to-pieriandx-mapping" + MAX_ATTEMPTS_GET_CASES = 5 LIST_CASES_RETRY_TIME = 5 MAX_SUBMISSIONS_PER_LIMS_UPDATE_CYCLE = 20 @@ -162,6 +168,7 @@ class SampleType(Enum): PORTAL_FIELDS: List = [ "subject_id", "library_id", + "project_name", "external_sample_id", "external_subject_id" ] @@ -185,6 +192,6 @@ class SampleType(Enum): WFR_NAME_REGEX = re.compile( # "umccr__automated__tso_ctdna_tumor_only__SBJ00998__L2101500__202112115d8bdae7" - rf"umccr__automated__{PORTAL_CTTSO_TYPE_NAME}__(SBJ\d{{5}})__(L\d{{7}})__\S+" + rf"umccr__automated__{PORTAL_CTTSO_WORKFLOW_TYPE_NAME}__(SBJ\d{{5}})__(L\d{{7}})__\S+" ) diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/gspread_helpers.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/gspread_helpers.py index c629fa5..6cd1fa2 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/gspread_helpers.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/gspread_helpers.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 - +from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory import os +from typing import List + import pandas as pd import numpy as np from gspread_pandas import Spread @@ -67,6 +69,9 @@ def create_gspread_pandas_dir() -> Path: def set_google_secrets(): + if os.environ.get("GSPREAD_PANDAS_CONFIG_DIR", None) is not None: + return + # Add in the secret and set the env var gspread_pandas_dir = create_gspread_pandas_dir() @@ -75,45 +80,28 @@ def set_google_secrets(): os.environ["GSPREAD_PANDAS_CONFIG_DIR"] = str(gspread_pandas_dir) -def get_cttso_samples_from_glims() -> pd.DataFrame: +def get_column_range(series_length: int) -> List: """ - Get cttso samples from GLIMS - :return: A pandas DataFrame with the following columns - * subject_id - * library_id - * sequence_run_name - * glims_is_validation -> Is this a validation sample? Determined by ProjectName is equal to "Validation" or "Control" - * glims_is_research -> Is this a research sample? Determined by ProjectName is equal to "Research" + A to Z plus AA, AB, AC etc + :param series_length: + :return: """ + column_range = get_alphabet() + counter = 0 + + while True: + if len(column_range) >= series_length: + break + column_range = column_range + list( + map( + lambda letter: get_alphabet()[counter] + letter, + get_alphabet() + ) + ) - if os.environ.get("GSPREAD_PANDAS_CONFIG_DIR") is None: - set_google_secrets() - - # Pull in from sheet data - glims_df: pd.DataFrame = Spread(spread=get_glims_sheet_id(), sheet="Sheet1").sheet_to_df(index=0) - - # We also set Phenotype to 'tumor' to prevent NTC being uploaded to PierianDx - glims_df = glims_df.query("Type=='ctDNA' & Assay=='ctTSO' & Phenotype=='tumor'") - - glims_df["glims_is_validation"] = glims_df.apply( - lambda x: True if x.ProjectName.lower() in ["validation", "control"] else False, - axis="columns" - ) - glims_df["glims_is_research"] = glims_df.apply( - lambda x: True if x.Workflow.lower() in ["research"] else False, - axis="columns" - ) - - glims_df = glims_df.rename( - columns={ - "SubjectID": "subject_id", - "IlluminaID": "sequence_run_name", - "LibraryID": "library_id" - } - ) + counter += 1 - # Drop duplicate rows and return - return glims_df[["subject_id", "library_id", "sequence_run_name", "glims_is_validation", "glims_is_research"]].drop_duplicates() + return column_range[:series_length] def update_cttso_lims_row(new_row: pd.Series, row_number: int): @@ -127,7 +115,7 @@ def update_cttso_lims_row(new_row: pd.Series, row_number: int): new_row = new_row.replace({pd.NaT: None}).replace({'NaT': None}).replace({np.NaN: ""}) series_length = new_row.shape[0] - column_range = get_alphabet()[:series_length] + column_range = get_column_range(series_length) sheet_obj = Spread(spread=get_cttso_lims_sheet_id(), sheet="Sheet1") sheet_obj.update_cells( start=f"{column_range[0]}{row_number}", @@ -214,8 +202,13 @@ def get_cttso_lims() -> (pd.DataFrame, pd.DataFrame): * in_portal * in_redcap * in_pieriandx - * glims_is_validation - * glims_is_research + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap * redcap_sample_type * redcap_is_complete * portal_wfr_id @@ -228,6 +221,7 @@ def get_cttso_lims() -> (pd.DataFrame, pd.DataFrame): * pieriandx_case_accession_number * pieriandx_case_creation_date * pieriandx_case_identified + * pieriandx_assignee * pieriandx_panel_type * pieriandx_sample_type * pieriandx_workflow_id @@ -270,3 +264,122 @@ def get_cttso_lims() -> (pd.DataFrame, pd.DataFrame): ) return cttso_lims_df, excel_row_number_df + + +def get_deleted_lims_df() -> (pd.DataFrame, pd.DataFrame): + """ + Collect the values from the existing GSuite spreadsheet + Maps the values from the existing GSuite spreadsheet to their excel row number + Also returns the row value for each of the items + :return: ( + A pandas DataFrame with the following columns: + * subject_id + * library_id + * in_glims + * in_portal + * in_redcap + * in_pieriandx + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + * redcap_sample_type + * redcap_is_complete + * portal_wfr_id + * portal_wfr_end + * portal_wfr_status + * portal_sequence_run_name + * portal_is_failed_run + * pieriandx_submission_time + * pieriandx_case_id + * pieriandx_case_accession_number + * pieriandx_case_creation_date + * pieriandx_case_identified + * pieriandx_assignee + * pieriandx_panel_type + * pieriandx_sample_type + * pieriandx_workflow_id + * pieriandx_workflow_status + * pieriandx_report_status + * pieriandx_report_signed_out - currently ignored + * date_added_to_deletion_table + , + A pandas DataFrame with the following columns: + * cttso_lims_index + * excel_row_number + ) + :return: + """ + deleted_lims_df: pd.DataFrame = Spread(spread=get_cttso_lims_sheet_id(), sheet="Deleted Cases").sheet_to_df(index=0) + + deleted_lims_df = deleted_lims_df.replace("", pd.NA) + + # Replace booleans + deleted_lims_df = deleted_lims_df.replace({ + "TRUE": True, + "FALSE": False + }) + + excel_row_number_df: pd.DataFrame = pd.DataFrame({"cttso_lims_index": deleted_lims_df.index}) + + # Conversion to 1-based index plus single header row + excel_row_number_df["excel_row_number"] = excel_row_number_df.index + 2 + + return deleted_lims_df, excel_row_number_df + + +def append_rows_to_deleted_lims(to_be_deleted: pd.DataFrame): + """ + List of rows to be added to the deleted lims database + # FIXME add df + :param to_be_deleted: + :return: + """ + # Open up the sheet object + sheet_obj = Spread(spread=get_cttso_lims_sheet_id(), sheet="Deleted Cases") + + # Perform a proper NA replacement + # https://github.com/pandas-dev/pandas/issues/29024#issuecomment-1098052276 + new_df = to_be_deleted.replace({pd.NaT: None}).replace({'NaT': None}).replace({np.NaN: ""}) + + # Get existing sheet + existing_sheet = sheet_obj.sheet_to_df(index=0) + # Update the sheet object with the list + sheet_obj.df_to_sheet( + pd.concat( + [ + existing_sheet, + new_df, + pd.DataFrame(columns=existing_sheet.columns, index=range(1000)) + ] + ), + index=False, replace=True, fill_value="" + ) + + +def add_deleted_cases_to_deleted_sheet(new_cases_to_delete_df: pd.DataFrame): + """ + # FIXME add df here + :param new_cases_to_delete_df: + :return: + """ + deleted_lims_df, excel_row_mapping_number = get_deleted_lims_df() + + # Create list for query + existing_deleted_case_ids = deleted_lims_df["pieriandx_case_id"].tolist() + + # Get list of deleted cases + new_cases_to_delete_df = new_cases_to_delete_df.query( + "pieriandx_case_id not in @existing_deleted_case_ids", + engine="python" + ) + + if new_cases_to_delete_df.shape[0] == 0: + return + + new_cases_to_delete_df["date_added_to_deletion_table"] = datetime.utcnow().isoformat(sep="T", timespec="seconds") + "Z" + + append_rows_to_deleted_lims(new_cases_to_delete_df) diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/pieriandx_helpers.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/pieriandx_helpers.py index bd21c75..fb8fa90 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/pieriandx_helpers.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/pieriandx_helpers.py @@ -149,6 +149,7 @@ def get_pieriandx_df() -> pd.DataFrame: * pieriandx_case_id * pieriandx_case_accession_number * pieriandx_case_creation_date (as dt object) + * pieriandx_assignee """ email, auth_token, institution, base_url = get_pieriandx_env_vars() @@ -185,12 +186,31 @@ def get_pieriandx_df() -> pd.DataFrame: cases_df.columns = sanitised_columns + # Update column names + columns_to_update = { + "id": "pieriandx_case_id", + "accession_number": "pieriandx_case_accession_number", + "date_created": "pieriandx_case_creation_date" + } + + if "assignee" in cases_df.columns.tolist(): + columns_to_update.update( + { + "assignee": "pieriandx_assignee" + } + ) + else: + # Assign nulls to column + cases_df["pieriandx_assignee"] = pd.NA + cases_df = cases_df.rename( - columns={ - "id": "pieriandx_case_id", - "accession_number": "pieriandx_case_accession_number", - "date_created": "pieriandx_case_creation_date", - } + columns=columns_to_update + ) + + # Convert pieriandx assignee from list to last assignee + # pieriandx assignee might not exist + cases_df["pieriandx_assignee"] = cases_df["pieriandx_assignee"].apply( + lambda x: x[-1] if isinstance(x, List) else pd.NA ) # Convert case creation date to datetime object @@ -220,7 +240,8 @@ def get_pieriandx_df() -> pd.DataFrame: "library_id", "pieriandx_case_id", "pieriandx_case_accession_number", - "pieriandx_case_creation_date" + "pieriandx_case_creation_date", + "pieriandx_assignee" ] return cases_df[columns_to_return] diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/portal_helpers.py b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/portal_helpers.py index 6dc4620..9dd46d9 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/portal_helpers.py +++ b/deploy/cttso-ica-to-pieriandx-cdk/lambdas/layers/lambda_utils/portal_helpers.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import json from mypy_boto3_ssm.client import SSMClient from aws_requests_auth.boto_utils import BotoAWSRequestsAuth @@ -14,9 +15,10 @@ PORTAL_WORKFLOWS_ENDPOINT, \ PORTAL_WORKFLOW_ORDERING, \ PORTAL_MAX_ROWS_PER_PAGE, \ - PORTAL_CTTSO_TYPE_NAME, \ + PORTAL_CTTSO_WORKFLOW_TYPE_NAME, \ PORTAL_FIELDS, \ - WFR_NAME_REGEX, PORTAL_SEQUENCE_RUNS_ENDPOINT + WFR_NAME_REGEX, PORTAL_SEQUENCE_RUNS_ENDPOINT, PORTAL_LIMSROW_ENDPOINT, PORTAL_CTTSO_SAMPLE_TYPE, \ + PORTAL_CTTSO_SAMPLE_ASSAY, PORTAL_CTTSO_SAMPLE_PHENOTYPE, LIMS_PROJECT_NAME_MAPPING_SSM_PATH from .aws_helpers import get_aws_region, get_boto3_ssm_client from .logger import get_logger @@ -136,7 +138,7 @@ def get_portal_workflow_run_data_df() -> pd.DataFrame: auth=portal_auth, params={ "rowsPerPage": PORTAL_MAX_ROWS_PER_PAGE, - "type_name": PORTAL_CTTSO_TYPE_NAME, + "type_name": PORTAL_CTTSO_WORKFLOW_TYPE_NAME, "ordering": PORTAL_WORKFLOW_ORDERING, "page": page_number } @@ -276,6 +278,7 @@ def get_clinical_metadata_information_from_portal_for_subject(subject_id: str, l :return: A pandas DataFrame with the following columns: * subject_id * library_id + * project_name * external_sample_id * external_subject_id """ @@ -336,30 +339,39 @@ def get_ica_workflow_run_id_from_portal(subject_id: str, library_id: str) -> str ) portal_auth = get_portal_creds(portal_url_endpoint) - # FIXME - iterate over pages + all_results = [] + page_number = 1 - req: Response = requests.get( - url=portal_url_endpoint, - auth=portal_auth, - params={ - "type_name": PORTAL_CTTSO_TYPE_NAME, - "end_status": "Succeeded", - "ordering": PORTAL_WORKFLOW_ORDERING, - "rowsPerPage": PORTAL_MAX_ROWS_PER_PAGE - } - ) + while True: + req: Response = requests.get( + url=portal_url_endpoint, + auth=portal_auth, + params={ + "type_name": PORTAL_CTTSO_WORKFLOW_TYPE_NAME, + "end_status": "Succeeded", + "ordering": PORTAL_WORKFLOW_ORDERING, + "rowsPerPage": PORTAL_MAX_ROWS_PER_PAGE + } + ) - # Collect the json - json_dict: Dict = req.json() + req_dict: Dict = req.json() - # Ensure requests - results: List - if (results := json_dict.get("results", None)) is None: - logger.error("Could not get requests from portal workflow endpoint") - raise ValueError + results: List + if (results := req_dict.get("results", None)) is None: + logger.error("Could not get requests from portal workflow endpoint") + raise ValueError + + # Extend all results + all_results.extend(results) + + # Get next page + if req_dict.get("links", {}).get("next", None) is not None: + page_number += 1 + else: + break # Collect data frames - cttso_workflows_df: pd.DataFrame = pd.DataFrame(results) + cttso_workflows_df: pd.DataFrame = pd.DataFrame(all_results) cttso_workflows_df["subject_id"] = cttso_workflows_df.apply( lambda x: WFR_NAME_REGEX.fullmatch(x.wfr_name).groups(1), @@ -383,3 +395,212 @@ def get_ica_workflow_run_id_from_portal(subject_id: str, library_id: str) -> str # Collect the workflow run id from the most recent run return cttso_workflows_df["wfr_id"].tolist()[0] + +def get_ssm_project_mapping_json() -> List: + """ + Returns the json object that maps project owner to sample type - + Looks like this + [ + { + "project_owner": "VCCC", + "project_name": "PO", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term":null + }, + ... + { + "project_owner": "UMCCR", + "project_name": "Control", + "panel": "main", + "sample_type": "validation", + "is_identified": "deidentified", + "default_snomed_term": "Disseminated malignancy of unknown primary" + }, + { + "project_owner": "*", + "project_name": "*", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Disseminated malignancy of unknown primary" + } + ] + :return: + """ + ssm_client = get_boto3_ssm_client() + + return json.loads( + ssm_client.get_parameter( + Name=LIMS_PROJECT_NAME_MAPPING_SSM_PATH + ).get("Parameter").get("Value") + ) + + +def apply_mapping_json_to_row(row: pd.Series, mapping_json: List): + """ + Apply mapping json to row + { + "" + } + :param row: + :param mapping_json: + :return: + """ + + # Get mapping dict by project_name / project_owner + try: + mapping_dict = next( + filter( + lambda mapping_json_iter: + ( + mapping_json_iter.get("project_owner") == row['glims_project_owner'] and + mapping_json_iter.get("project_name") == row['glims_project_name'] + ) or + ( + mapping_json_iter.get("project_owner") == "*" and + mapping_json_iter.get("project_name") == row['glims_project_name'] + ) or + ( + mapping_json_iter.get("project_owner") == row['glims_project_owner'] and + mapping_json_iter.get("project_name") == "*" + ), + mapping_json + ) + ) + except StopIteration: + mapping_dict = next( + filter( + lambda mapping_json_iter: + mapping_json_iter.get("project_owner") == "*" and + mapping_json_iter.get("project_name") == "*", + mapping_json + ) + ) + + # Determing column 'needs_redcap' by determining if default_snomed_term is set? + if mapping_dict.get("default_snomed_term", None) is None: + mapping_dict["needs_redcap"] = True + else: + mapping_dict["needs_redcap"] = False + + return mapping_dict + + +def get_cttso_samples_from_limsrow_df() -> pd.DataFrame: + """ + Get cttso samples from GLIMS + + :return: A pandas DataFrame with the following columns + * subject_id + * library_id + * in_glims + * glims_illumina_id + * glims_project_owner + * glims_project_name + * glims_panel + * glims_sample_type + * glims_is_identified + * glims_default_snomed_term + * glims_needs_redcap + """ + + portal_base_url = get_portal_base_url() + portal_url_endpoint = PORTAL_LIMSROW_ENDPOINT.format( + PORTAL_API_BASE_URL=portal_base_url + ) + portal_auth = get_portal_creds(portal_url_endpoint) + + # Initialise page and appended list + all_results = [] + page_number = 1 + + while True: + req: Response = requests.get( + url=portal_url_endpoint, + auth=portal_auth, + params={ + "type": PORTAL_CTTSO_SAMPLE_TYPE, + "assay": PORTAL_CTTSO_SAMPLE_ASSAY, + "phenotype": PORTAL_CTTSO_SAMPLE_PHENOTYPE, + "rowsPerPage": PORTAL_MAX_ROWS_PER_PAGE, + "ordering": PORTAL_WORKFLOW_ORDERING, + "page": page_number + } + ) + + req_dict: Dict = req.json() + + results: List + if (results := req_dict.get("results", None)) is None: + raise ValueError + + # Extend all results + all_results.extend(results) + + # Get next page + if req_dict.get("links", {}).get("next", None) is not None: + page_number += 1 + else: + break + + # Convret to dataframe + portal_cttso_limsrow_df = pd.DataFrame(all_results) + + # Set column in_glims to true for all rows in this df + portal_cttso_limsrow_df["in_glims"] = True + + # Rename project owner and name columns + portal_cttso_limsrow_df = portal_cttso_limsrow_df.rename( + columns={ + "project_owner": "glims_project_owner", + "project_name": "glims_project_name", + } + ) + + mapping_json: List = get_ssm_project_mapping_json() + + portal_cttso_limsrow_df["mapping_json"] = portal_cttso_limsrow_df.apply( + lambda row: apply_mapping_json_to_row(row, mapping_json), + axis="columns" + ) + + # Get glims rows based on project owner and project name + columns_to_update = [ + "panel", + "sample_type", + "is_identified", + "default_snomed_term", + "needs_redcap" + ] + + for columns_to_update in columns_to_update: + portal_cttso_limsrow_df[f"glims_{columns_to_update}"] = portal_cttso_limsrow_df["mapping_json"].apply( + lambda json_map: json_map.get(columns_to_update) + ) + + # Rename illumina_id to glims_illumina_id + portal_cttso_limsrow_df.rename( + columns={ + "illumina_id": "glims_illumina_id" + }, + inplace=True + ) + + columns_to_return = [ + "subject_id", + "library_id", + "in_glims", + "glims_illumina_id", + "glims_project_owner", + "glims_project_name", + "glims_panel", + "glims_sample_type", + "glims_is_identified", + "glims_default_snomed_term", + "glims_needs_redcap" + ] + + return portal_cttso_limsrow_df[columns_to_return] + diff --git a/deploy/cttso-ica-to-pieriandx-cdk/lib/cttso-ica-to-pieriandx-lims-maker-lambda-stack.ts b/deploy/cttso-ica-to-pieriandx-cdk/lib/cttso-ica-to-pieriandx-lims-maker-lambda-stack.ts index dde4c4d..b2678ec 100644 --- a/deploy/cttso-ica-to-pieriandx-cdk/lib/cttso-ica-to-pieriandx-lims-maker-lambda-stack.ts +++ b/deploy/cttso-ica-to-pieriandx-cdk/lib/cttso-ica-to-pieriandx-lims-maker-lambda-stack.ts @@ -17,7 +17,8 @@ import { SSM_CLINICAL_LAMBDA_FUNCTION_ARN_VALUE, GLIMS_SSM_PARAMETER_PATH, REDCAP_LAMBDA_FUNCTION_SSM_KEY, - SSM_LIMS_LAMBDA_FUNCTION_EVENT_RULE_NAME_VALUE + SSM_LIMS_LAMBDA_FUNCTION_EVENT_RULE_NAME_VALUE, + SSM_PROJECT_NAME_TO_PIERIANDX_CONFIG_SSM_PATH } from "../constants"; import {Rule, Schedule} from "aws-cdk-lib/aws-events"; import { LambdaFunction as LambdaFunctionTarget } from "aws-cdk-lib/aws-events-targets" @@ -136,6 +137,9 @@ export class CttsoIcaToPieriandxLimsMakerLambdaStack extends Stack { }) )) + // Get permission to project owner configuration SSM Parameter + + // Add pieriandx secrets access to lambda policy const pieriandx_secrets_path = Secret.fromSecretNameV2( this, @@ -225,6 +229,12 @@ export class CttsoIcaToPieriandxLimsMakerLambdaStack extends Stack { `${props.stack_prefix}-ica-to-pieriandx-function-arn`, SSM_LAMBDA_FUNCTION_ARN_VALUE ) + const cttso_project_mapping_to_pieriandx_ssm = StringParameter.fromStringParameterName( + this, + `${props.stack_prefix}-ica-to-pieriandx-project-mapping-arn`, + SSM_PROJECT_NAME_TO_PIERIANDX_CONFIG_SSM_PATH + ) + // Step 2: Add ssm to policy lambda_function.addToRolePolicy( new PolicyStatement({ @@ -234,7 +244,8 @@ export class CttsoIcaToPieriandxLimsMakerLambdaStack extends Stack { resources: [ clinical_lambda_function_ssm.parameterArn, validation_lambda_function_ssm.parameterArn, - cttso_ica_to_pieriandx_ssm.parameterArn + cttso_ica_to_pieriandx_ssm.parameterArn, + cttso_project_mapping_to_pieriandx_ssm.parameterArn ] } ) diff --git a/deploy/cttso-ica-to-pieriandx-cdk/project-name-to-pieriandx-mapping.json b/deploy/cttso-ica-to-pieriandx-cdk/project-name-to-pieriandx-mapping.json new file mode 100644 index 0000000..5f7200b --- /dev/null +++ b/deploy/cttso-ica-to-pieriandx-cdk/project-name-to-pieriandx-mapping.json @@ -0,0 +1,90 @@ +[ + { + "project_owner": "VCCC", + "project_name": "PO", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term":null + }, + { + "project_owner": "Grimmond", + "project_name": "COUMN", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "Tothill", + "project_name": "CUP", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": "Disseminated malignancy of unknown primary" + }, + { + "project_owner": "Tothill", + "project_name": "PPGL", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "TJohn", + "project_name": "MESO", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "TJohn", + "project_name": "OCEANiC", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": null + }, + { + "project_owner": "*", + "project_name": "SOLACE2", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "SLuen", + "project_name": "IMPARP", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "UMCCR", + "project_name": "Control", + "panel": "main", + "sample_type": "validation", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + }, + { + "project_owner": "UMCCR", + "project_name": "QAP", + "panel": "subpanel", + "sample_type": "patient_care_sample", + "is_identified": "identified", + "default_snomed_term": null + }, + { + "project_owner": "*", + "project_name": "*", + "panel": "main", + "sample_type": "patient_care_sample", + "is_identified": "deidentified", + "default_snomed_term": "Neoplastic disease" + } +] \ No newline at end of file diff --git a/deploy/cttso-ica-to-pieriandx-cdk/scripts/update_project_name_mapping.sh b/deploy/cttso-ica-to-pieriandx-cdk/scripts/update_project_name_mapping.sh new file mode 100755 index 0000000..4384489 --- /dev/null +++ b/deploy/cttso-ica-to-pieriandx-cdk/scripts/update_project_name_mapping.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +# Set production account as env var +PROD_ACCOUNT="472057503814" +SSM_PATH="cttso-lims-project-name-to-pieriandx-mapping" +MAPPING_PATH_NAME="project-name-to-pieriandx-mapping.json" + +# Get json file based on account +ACCOUNT_ID="$( \ + aws sts get-caller-identity \ + --output json | \ + jq --raw-output \ + '.Account' \ +)" + +if [[ "${ACCOUNT_ID}" == "${PROD_ACCOUNT}" ]]; then + echo "Updating mapping in prod" 1>&2 +else + echo "Error! Please ensure you're logged in the UMCCR AWS prod account" 1>&2 + print_help + exit 1 +fi + +# Get this directory path +get_this_path() { + : ' + Mac users use greadlink over readlink + Return the directory of where this install.sh file is located + ' + local this_dir + + # darwin is for mac, else linux + if [[ "${OSTYPE}" == "darwin"* ]]; then + readlink_program="greadlink" + else + readlink_program="readlink" + fi + + # Get directory name of the install.sh file + this_dir="$(dirname "$("${readlink_program}" -f "${0}")")" + + # Return directory name + echo "${this_dir}" +} + +project_name_mapping_json_file="$(get_this_path)/../${MAPPING_PATH_NAME}" + +# Validate json +if ! jq -r < "${project_name_mapping_json_file}" 1>/dev/null; then + echo "mapping json is not valid json" +fi + +compact_json_wrapped_str="$( \ + jq --raw-output --compact-output \ + < "${project_name_mapping_json_file}" \ +)" + +if current_value="$( \ + aws ssm get-parameter \ + --output json --name "${SSM_PATH}" | \ + jq --raw-output \ + '.Parameter?.Value' \ + )" 2>/dev/null; then + # Compare on new value + if [[ "${current_value}" == "${compact_json_wrapped_str}" ]]; then + echo "Current value for '${SSM_PATH}' already matches '${MAPPING_PATH_NAME}', skipping update" 1>&2 + exit + fi +fi + +# Put the ssm parameter +echo "Updating ssm parameter with contents of '${MAPPING_PATH_NAME}'" +aws ssm put-parameter \ + --name "${SSM_PATH}" \ + --output json \ + --overwrite \ + --type "String" \ + --value "${compact_json_wrapped_str}" + + + + diff --git a/scripts/cttso-ica-to-pieriandx.py b/scripts/cttso-ica-to-pieriandx.py index ba57fba..2e6d9d7 100644 --- a/scripts/cttso-ica-to-pieriandx.py +++ b/scripts/cttso-ica-to-pieriandx.py @@ -112,7 +112,7 @@ def main(): except ValueError: logger.warning(f"Could not get run flowcell id from '{portal_run_id}', skipping renaming run") else: - run.rename_run(new_run_name=f"{case.case_accession_number}_{run_flowcell_id}_{run.get_timestamp()}") + run.rename_run(new_run_name=f"{case.case_accession_number}_{run_flowcell_id}_{portal_run_id}_{run.get_timestamp()}") else: logger.info(f"Couldn't rename the run object since workflow " f"run name '{ica_workflow_run_obj.name}' was not in recognised regex form") diff --git a/utils/classes.py b/utils/classes.py index f0e3618..a9a599a 100644 --- a/utils/classes.py +++ b/utils/classes.py @@ -733,7 +733,7 @@ def from_dict(cls, case_dict: Dict): return cls(case_accession_number=case_dict.get("accession_number"), disease=case_dict.get("disease_obj"), sample_type=SampleType(case_dict.get("sample_type")), - panel_type=PanelType[case_dict.get("panel_type").upper()], + panel_type=PanelType[case_dict.get("panel_type").upper().replace("_", "")], # Still need to load this specimen=DeIdentifiedSpecimen.from_dict(case_dict), indication=case_dict.get("indication")) @@ -786,7 +786,7 @@ def from_dict(cls, case_dict: Dict): case_accession_number=case_dict.get("accession_number"), disease=case_dict.get("disease_obj"), sample_type=SampleType(case_dict.get("sample_type")), - panel_type=PanelType[case_dict.get("panel_type").upper()], + panel_type=PanelType[case_dict.get("panel_type").upper().replace("_", "")], requesting_physicians=[ Physician.from_dict(physician) for physician in case_dict.get("requesting_physicians")