From 0a8aecaa13cec080fa499715470a2fbc774d2f05 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Thu, 28 Nov 2024 14:40:40 +0000
Subject: [PATCH] Look up the cohort by its 'long name' (column 'Cohort Name') in the
 Cohort Refr. spreadsheet when the cohort can't be found in the short names
 list (column 'Cohort ID')

---
 curation/template_parser.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index b641551..4e47b3d 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -18,6 +18,7 @@ def __init__(self):
         self.parsed_publication = None
         self.parsed_scores = {}
         self.parsed_cohorts = {}
+        self.parsed_cohorts_long_names = {}
         self.parsed_samples_scores = []
         self.parsed_samples_testing = []
         self.parsed_performances = []
@@ -89,6 +90,9 @@ def extract_cohorts(self):
             if cohort_id in self.parsed_cohorts:
                 self.report_warning(spreadsheet_name, f'Ambiguity found in the Cohort spreadsheet: the cohort ID "{cohort_name}" has been found more than once!')
             self.parsed_cohorts[cohort_id] = parsed_cohort
+            if parsed_cohort.name_long:
+                cohort_long_name = parsed_cohort.name_long.upper()
+                self.parsed_cohorts_long_names[cohort_long_name] = cohort_id
             self.update_report(parsed_cohort)
 
 
@@ -275,6 +279,11 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
                                 cohort_id = cohort.upper()
                                 if cohort_id in self.parsed_cohorts:
                                     cohorts_list.append(self.parsed_cohorts[cohort_id])
+                                # Check if the cohort name corresponds to a cohort long name on the Cohort Refr. spreadsheet
+                                elif cohort_id in self.parsed_cohorts_long_names.keys():
+                                    new_cohort_id = self.parsed_cohorts_long_names[cohort_id]
+                                    cohorts_list.append(self.parsed_cohorts[new_cohort_id])
+                                    self.report_warning(spreadsheet_name, f'Warning: the sample cohort "{cohort}" has been found in the Cohort Refr. spreadsheet as "{new_cohort_id}"')
                                 else:
                                     self.report_error(spreadsheet_name, f'Error: the sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
                             val = cohorts_list
@@ -331,6 +340,12 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:st
                         if cohort_id in self.parsed_cohorts:
                             if cohort_id not in spreadsheet_cohorts_names:
                                 cohorts_list.append(self.parsed_cohorts[cohort_id])
+                        # Check if the cohort name corresponds to a cohort long name on the Cohort Refr. spreadsheet
+                        elif cohort_id in self.parsed_cohorts_long_names.keys():
+                            new_cohort_id = self.parsed_cohorts_long_names[cohort_id]
+                            if new_cohort_id not in spreadsheet_cohorts_names:
+                                cohorts_list.append(self.parsed_cohorts[new_cohort_id])
+                                self.report_warning(spreadsheet_name, f'Warning: the GWAS Catalog sample cohort "{cohort}" has been found in the Cohort Refr. spreadsheet as "{new_cohort_id}"')
                         else:
                             self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
                     # Print a message if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged.