Merge the 2 lists of cohorts: the one from the spreadsheet and the one…

… from the GWAS study (fetched via the GWAS REST API)
HDRUK · Oct 25, 2024 · a243507 · a243507
1 parent b474a31
commit a243507
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 10 deletions.
diff --git a/curation/template_parser.py b/curation/template_parser.py
@@ -201,7 +201,10 @@ def extract_samples(self):
             sample_keys = sample_data.data.keys()
             if 'sample_number' not in sample_keys:
                 if 'source_GWAS_catalog' in sample_keys:
-                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_name)
+                    spreadsheet_cohorts = []
+                    if 'cohorts' in sample_keys:
+                        spreadsheet_cohorts = sample_data.data['cohorts']
+                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
                     if gwas_study:
                         for gwas_ancestry in gwas_study:
                             c_sample = SampleData(spreadsheet_name)
@@ -291,7 +294,7 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
         return sample_data
 
 
-    def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
+    def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:str) -> dict:
         """
         Get the GWAS Study information related to the PGS sample.
         Check that all the required data is available
@@ -308,18 +311,25 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
             return study_data
         response_data = response.json()
         if response_data:
+            # List the cohorts present in the spreadsheet for this sample
+            spreadsheet_cohorts_names = []
+            if spreadsheet_cohorts:
+                spreadsheet_cohorts_names = [x.name.upper() for x in spreadsheet_cohorts]
+
             try:
                 source_PMID = response_data['publicationInfo']['pubmedId']
-
                 # Create list of cohorts if it exists in the GWAS study
                 # These are merged with the cohorts found in the cohort column of the spreadsheet
-                cohorts_list = []
+                cohorts_list = spreadsheet_cohorts
                 if 'cohort' in response_data.keys():
                     cohorts = response_data['cohort'].split('|')
                     for cohort in cohorts:
                         cohort_id = cohort.upper()
+                        # Check if the cohort is in the list of cohort references
+                        # and if the cohort is already in the list provided by the author
                         if cohort_id in self.parsed_cohorts:
-                            cohorts_list.append(self.parsed_cohorts[cohort_id])
+                            if cohort_id not in spreadsheet_cohorts_names:
+                                cohorts_list.append(self.parsed_cohorts[cohort_id])
                         else:
                             self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
 

diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
@@ -130,12 +130,18 @@ def update_studies(self):
                         new_sample.cohorts.set(cohorts_list)
                         # Print a message if the 2 lists of cohorts (old & new) are different
                         if sample.cohorts:
-                            old_set = ', '.join(sorted([x.name_short for x in sample.cohorts.all()]))
-                            new_set = ', '.join(sorted([x.name_short for x in cohorts_list]))
-                            if old_set != new_set:
+                            new_set = sorted([x.name_short.upper() for x in cohorts_list])
+
+                            old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
+                            new_set_string = ', '.join(new_set)
+                            if old_set_string != new_set_string:
+                                # Add cohorts which are already associated to the sample in the database, but not in the GWAS study
+                                for sample_cohort in sample.cohorts.all():
+                                    if sample_cohort.name_short.upper() not in new_set:
+                                        new_sample.cohorts.add(sample_cohort)
                                 print(f"\t/!\ Replacing cohorts list:")
-                                print(f"\t  - Old set: {old_set}")
-                                print(f"\t  + New set: {new_set}")
+                                print(f"\t  - Old set: {old_set_string}")
+                                print(f"\t  + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
                     # Copy the list of cohorts from the existing sample.
                     # Needs to be added once the new Sample object has been saved,
                     # i.e. when the Sample `id` has been created