From b1c9eaa6b8923113819cd6238950e2d37e7ec800 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Thu, 17 Oct 2024 12:00:08 +0100
Subject: [PATCH 1/9] Fetch the list of cohorts from GWAS Catalog REST API if
 available

---
 curation/template_parser.py | 142 ++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 61 deletions(-)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index 76b78c09..fdbcca9d 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -201,12 +201,14 @@ def extract_samples(self):
             sample_keys = sample_data.data.keys()
             if 'sample_number' not in sample_keys:
                 if 'source_GWAS_catalog' in sample_keys:
-                    gwas_study = get_gwas_study(sample_data.data['source_GWAS_catalog'])
+                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_name)
                     if gwas_study:
                         for gwas_ancestry in gwas_study:
                             c_sample = SampleData(spreadsheet_name)
+                            # Spreadsheet sample/cohort data
                             for col, entry in sample_data.data.items():
                                 c_sample.add_data(col, entry)
+                            # GWAS Catalog sample/cohort data
                             for field, val in gwas_ancestry.items():
                                 c_sample.add_data(field, val)
                             self.update_report(c_sample)
@@ -289,6 +291,84 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
         return sample_data
 
 
+    def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
+        """
+        Get the GWAS Study information related to the PGS sample.
+        Check that all the required data is available
+        > Parameter:
+            - gcst_id: GWAS Study ID (e.g. GCST010127)
+            - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions)
+        > Return: list of dictionnaries (1 per ancestry)
+        """
+        study_data = []
+        gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
+        response = requests.get(f'{gwas_rest_url}{gcst_id}')
+
+        if not response:
+            return study_data
+        response_data = response.json()
+        if response_data:
+            try:
+                source_PMID = response_data['publicationInfo']['pubmedId']
+
+                # Create list of cohorts if it exists in the GWAS study
+                # This override the Cohorts found in the cohort column in the spreadsheet
+                cohorts_list = []
+                if 'cohort' in response_data.keys():
+                    cohorts = response_data['cohort'].split('|')
+                    for cohort in cohorts:
+                        cohort_id = cohort.upper()
+                        if cohort_id in self.parsed_cohorts:
+                            cohorts_list.append(self.parsed_cohorts[cohort_id])
+                        else:
+                            self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
+
+                for ancestry in response_data['ancestries']:
+
+                    if ancestry['type'] != 'initial':
+                        continue
+
+                    ancestry_data = { 'source_PMID': source_PMID }
+                    # Add cohorts list
+                    if cohorts_list:
+                        ancestry_data['cohorts'] = cohorts_list
+                    ancestry_data['sample_number'] = ancestry['numberOfIndividuals']
+
+                    # ancestry_broad
+                    for ancestralGroup in ancestry['ancestralGroups']:
+                        if not 'ancestry_broad' in ancestry_data:
+                            ancestry_data['ancestry_broad'] = ''
+                        else:
+                            ancestry_data['ancestry_broad'] += ','
+                        ancestry_data['ancestry_broad'] += ancestralGroup['ancestralGroup']
+
+                    # ancestry_free
+                    for countryOfOrigin in ancestry['countryOfOrigin']:
+                        if countryOfOrigin['countryName'] != 'NR':
+                            if not 'ancestry_free' in ancestry_data:
+                                ancestry_data['ancestry_free'] = ''
+                            else:
+                                ancestry_data['ancestry_free'] += ','
+                            ancestry_data['ancestry_free'] += countryOfOrigin['countryName']
+
+                    # ancestry_country
+                    for countryOfRecruitment in ancestry['countryOfRecruitment']:
+                        if countryOfRecruitment['countryName'] != 'NR':
+                            if not 'ancestry_country' in ancestry_data:
+                                ancestry_data['ancestry_country'] = ''
+                            else:
+                                ancestry_data['ancestry_country'] += ','
+                            ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
+                    # ancestry_additional
+                    # Not found in the REST API
+
+                    study_data.append(ancestry_data)
+            except:
+                print(f'Error: can\'t fetch GWAS results for {gcst_id}')
+        return study_data
+
+
+
     def get_model_field_from_schema(self, col, current_schema):
         '''
         Retrieve the model and field from the Template, that corresponds to the current spreadsheet column.
@@ -370,66 +450,6 @@ def has_report_info(self):
 #  Independent methods  #
 #=======================#
 
-def get_gwas_study(gcst_id):
-    """
-    Get the GWAS Study information related to the PGS sample.
-    Check that all the required data is available
-    > Parameter:
-        - gcst_id: GWAS Study ID (e.g. GCST010127)
-    > Return: list of dictionnaries (1 per ancestry)
-    """
-    study_data = []
-    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
-    response = requests.get(f'{gwas_rest_url}{gcst_id}')
-
-    if not response:
-        return study_data
-    response_data = response.json()
-    if response_data:
-        try:
-            source_PMID = response_data['publicationInfo']['pubmedId']
-            for ancestry in response_data['ancestries']:
-
-                if ancestry['type'] != 'initial':
-                    continue
-
-                ancestry_data = { 'source_PMID': source_PMID }
-                ancestry_data['sample_number'] = ancestry['numberOfIndividuals']
-
-                # ancestry_broad
-                for ancestralGroup in ancestry['ancestralGroups']:
-                    if not 'ancestry_broad' in ancestry_data:
-                        ancestry_data['ancestry_broad'] = ''
-                    else:
-                        ancestry_data['ancestry_broad'] += ','
-                    ancestry_data['ancestry_broad'] += ancestralGroup['ancestralGroup']
-
-                # ancestry_free
-                for countryOfOrigin in ancestry['countryOfOrigin']:
-                    if countryOfOrigin['countryName'] != 'NR':
-                        if not 'ancestry_free' in ancestry_data:
-                            ancestry_data['ancestry_free'] = ''
-                        else:
-                            ancestry_data['ancestry_free'] += ','
-                        ancestry_data['ancestry_free'] += countryOfOrigin['countryName']
-
-                # ancestry_country
-                for countryOfRecruitment in ancestry['countryOfRecruitment']:
-                    if countryOfRecruitment['countryName'] != 'NR':
-                        if not 'ancestry_country' in ancestry_data:
-                            ancestry_data['ancestry_country'] = ''
-                        else:
-                            ancestry_data['ancestry_country'] += ','
-                        ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
-                # ancestry_additional
-                # Not found in the REST API
-
-                study_data.append(ancestry_data)
-        except:
-            print(f'Error: can\'t fetch GWAS results for {gcst_id}')
-    return study_data
-
-
 def next_PSS_num():
     r = SampleSet.objects.last()
     if r == None:

From 0bb94ce12742740d87f55c66cac077d317ef2897 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Thu, 17 Oct 2024 12:08:29 +0100
Subject: [PATCH 2/9] Use the list of cohorts from the GWAS Catalog REST API
 while updating the record, if available

---
 release/scripts/UpdateGwasStudies.py | 64 ++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
index 99308a77..823620a7 100644
--- a/release/scripts/UpdateGwasStudies.py
+++ b/release/scripts/UpdateGwasStudies.py
@@ -1,7 +1,5 @@
 import requests
-from catalog.models import Sample, Score
-from pgs_web import constants
-
+from catalog.models import Sample, Score, Cohort
 
 
 class UpdateGwasStudies:
@@ -16,15 +14,15 @@ def __init__(self,verbose=None):
         self.verbose = verbose
 
 
-    def get_gwas_info(self,sample):
+    def get_gwas_info(self,sample:Sample) -> dict:
         """
         Get the GWAS Study information related to the PGS sample.
         Check that all the required data is available
         > Parameter:
-            - gcst_id: GWAS Study ID (e.g. GCST010127)
-        > Return: list of dictionnaries (1 per ancestry)
+            - sample: instance of a Sample model
+        > Return: dictionary (cohorts and ancestries)
         """
-        study_data = []
+        study_data = { "ancestries": [] }
         gcst_id = sample.source_GWAS_catalog
         response = requests.get(f'{self.gwas_rest_url}{gcst_id}')
 
@@ -37,6 +35,26 @@ def get_gwas_info(self,sample):
         if response_data:
             try:
                 source_PMID = response_data['publicationInfo']['pubmedId']
+
+                # Create list of cohorts if it exists in the GWAS study
+                # This override the Cohorts found previously in the cohort column in the spreadsheet
+                cohorts_list = []
+                if 'cohort' in response_data.keys():
+                    cohorts = response_data['cohort'].split('|')
+                    for cohort in cohorts:
+                        cohort_id = cohort.upper()
+                        try:
+                            cohort_model = Cohort.objects.get(name_short__iexact=cohort_id)
+                            cohorts_list.append(cohort_model)
+                        except Cohort.DoesNotExist:
+                            print(f"New cohort found: {cohort_id}")
+                            cohort_model = Cohort(name_short=cohort_id,name_full=cohort_id)
+                            cohort_model.save()
+                            cohorts_list.append(cohort_model)
+                    if cohorts_list:
+                        study_data['cohorts'] = cohorts_list
+
+                # Ancestries
                 for ancestry in response_data['ancestries']:
 
                     if ancestry['type'] != 'initial':
@@ -70,12 +88,12 @@ def get_gwas_info(self,sample):
                             else:
                                 ancestry_data['ancestry_country'] += self.country_sep
                             ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
-                    study_data.append(ancestry_data)
+                    study_data["ancestries"].append(ancestry_data)
 
-                if study_data:
-                    print(f'\t{len(study_data)} distinct ancestries')
+                if study_data["ancestries"]:
+                    print(f'\t{len(study_data["ancestries"])} distinct ancestries')
                     if self.verbose:
-                        for anc in study_data:
+                        for anc in study_data["ancestries"]:
                             print(f'\t{anc}')
                 else:
                     print("\tNo ancestry")
@@ -90,7 +108,12 @@ def update_studies(self):
         for sample in self.samples:
             gwas_study = self.get_gwas_info(sample)
             new_samples = []
-            for gwas_ancestry in gwas_study:
+            cohorts_list = []
+            # List of cohorts
+            if 'cohorts' in gwas_study.keys():
+                cohorts_list = gwas_study['cohorts']
+            # List of ancestry data
+            for gwas_ancestry in gwas_study['ancestries']:
                 new_sample = Sample()
                 new_sample.source_GWAS_catalog = sample.source_GWAS_catalog
                 for field, val in gwas_ancestry.items():
@@ -99,11 +122,18 @@ def update_studies(self):
                     setattr(new_sample, field, val)
                 new_sample.save()
 
-                # Cohorts - need to be added once the Sample object as been saved,
-                # i.e. when the Sample `id` has been created
-                if sample.cohorts:
-                    for cohort in sample.cohorts.all():
-                        new_sample.cohorts.add(cohort)
+                # Cohorts data
+                if cohorts_list or sample.cohorts:
+                    # Use the list of cohorts from the GWAS study (if available)
+                    # Override the list of cohorts found in the existing sample
+                    if cohorts_list:
+                        new_sample.cohorts.set(cohorts_list)
+                    # Copy the list of cohorts from the existing sample.
+                    # Need to be added once the new Sample object as been saved,
+                    # i.e. when the Sample `id` has been created
+                    elif sample.cohorts:
+                        for cohort in sample.cohorts.all():
+                            new_sample.cohorts.add(cohort)
                     new_sample.save()
 
                 new_samples.append(new_sample)

From 69d8bf6b334d3bb8796f676bad9e26d9add2bfc9 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 18 Oct 2024 14:38:05 +0100
Subject: [PATCH 3/9] Print a message if the new list of cohorts (from GWAS)
 differs from the one already stored in the database

---
 release/scripts/UpdateGwasStudies.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
index 823620a7..f18f52df 100644
--- a/release/scripts/UpdateGwasStudies.py
+++ b/release/scripts/UpdateGwasStudies.py
@@ -128,6 +128,14 @@ def update_studies(self):
                     # Override the list of cohorts found in the existing sample
                     if cohorts_list:
                         new_sample.cohorts.set(cohorts_list)
+                        # Print a message if the 2 list of cohorts (old & new) are different
+                        if sample.cohorts:
+                            old_set = ', '.join(sorted([x.name_short for x in sample.cohorts.all()]))
+                            new_set = ', '.join(sorted([x.name_short for x in cohorts_list]))
+                            if old_set != new_set:
+                                print(f"\t# {new_sample.source_GWAS_catalog}: replacing cohorts list")
+                                print(f"\t  - Old set: {old_set}")
+                                print(f"\t  + New set: {new_set}")
                     # Copy the list of cohorts from the existing sample.
                     # Need to be added once the new Sample object as been saved,
                     # i.e. when the Sample `id` has been created

From 1335aa77c23cd46d8eaebc48a254890b8e9b0c07 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 18 Oct 2024 14:41:16 +0100
Subject: [PATCH 4/9] Update text message

---
 release/scripts/UpdateGwasStudies.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
index f18f52df..f2e06aa6 100644
--- a/release/scripts/UpdateGwasStudies.py
+++ b/release/scripts/UpdateGwasStudies.py
@@ -133,7 +133,7 @@ def update_studies(self):
                             old_set = ', '.join(sorted([x.name_short for x in sample.cohorts.all()]))
                             new_set = ', '.join(sorted([x.name_short for x in cohorts_list]))
                             if old_set != new_set:
-                                print(f"\t# {new_sample.source_GWAS_catalog}: replacing cohorts list")
+                                print(f"\t/!\ Replacing cohorts list:")
                                 print(f"\t  - Old set: {old_set}")
                                 print(f"\t  + New set: {new_set}")
                     # Copy the list of cohorts from the existing sample.

From b474a314872b587cd61053202689ed0907519e4e Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 25 Oct 2024 15:15:26 +0100
Subject: [PATCH 5/9] Fix regex warning during data import

---
 curation/parsers/performance.py | 2 +-
 curation/parsers/sample.py      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/curation/parsers/performance.py b/curation/parsers/performance.py
index 91a703e1..f0baa9a4 100644
--- a/curation/parsers/performance.py
+++ b/curation/parsers/performance.py
@@ -52,7 +52,7 @@ def str2metric(self, field, val):
         val = self.replace_non_ascii_chars(field,val)
 
         # Estimate with percentage as unit
-        if re.match('^\d+\.?\d*\s*\%$',val):
+        if re.match(r'^\d+\.?\d*\s*\%$',val):
             val = val.replace('%','').strip()
             current_metric.add_data('estimate', val)
             current_metric.add_data('unit', '%')
diff --git a/curation/parsers/sample.py b/curation/parsers/sample.py
index 5a8148dc..df648b77 100644
--- a/curation/parsers/sample.py
+++ b/curation/parsers/sample.py
@@ -18,7 +18,7 @@ def str2demographic(self, field, val):
         - val: data value
         Return type: DemographicData object
         '''
-        unit_regex = "([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
+        unit_regex = r"([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
         current_demographic = DemographicData(field,val,self.spreadsheet_name)
         if type(val) == float:
             current_demographic.add_data('estimate', val)
@@ -129,7 +129,7 @@ def create_sample_model(self):
                     elif field == 'sample_percent_male':
                         # Remove % character
                         val_str = str(val)
-                        if re.search('\%',val_str):
+                        if re.search(r'\%',val_str):
                             val_str = re.sub(r'\%', r'', val_str)
                             val_str = re.sub(r' ', r'', val_str)
                             val = float(val_str)

From a24350771758a61a050929064f9acbb97ebc7573 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 25 Oct 2024 15:22:47 +0100
Subject: [PATCH 6/9] Merge the 2 lists of cohorts: the one from the spreadsheet
 and the one from the GWAS study (fetched via the GWAS REST API)

---
 curation/template_parser.py          | 20 +++++++++++++++-----
 release/scripts/UpdateGwasStudies.py | 16 +++++++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index fdbcca9d..03a4554e 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -201,7 +201,10 @@ def extract_samples(self):
             sample_keys = sample_data.data.keys()
             if 'sample_number' not in sample_keys:
                 if 'source_GWAS_catalog' in sample_keys:
-                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_name)
+                    spreadsheet_cohorts = []
+                    if 'cohorts' in sample_keys:
+                        spreadsheet_cohorts = sample_data.data['cohorts']
+                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
                     if gwas_study:
                         for gwas_ancestry in gwas_study:
                             c_sample = SampleData(spreadsheet_name)
@@ -291,7 +294,7 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
         return sample_data
 
 
-    def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
+    def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:str) -> dict:
         """
         Get the GWAS Study information related to the PGS sample.
         Check that all the required data is available
@@ -308,18 +311,25 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
             return study_data
         response_data = response.json()
         if response_data:
+            # List the cohorts present in the spreadsheet for this sample
+            spreadsheet_cohorts_names = []
+            if spreadsheet_cohorts:
+                spreadsheet_cohorts_names = [x.name.upper() for x in spreadsheet_cohorts]
+
             try:
                 source_PMID = response_data['publicationInfo']['pubmedId']
-
                 # Create list of cohorts if it exists in the GWAS study
                 # This override the Cohorts found in the cohort column in the spreadsheet
-                cohorts_list = []
+                cohorts_list = spreadsheet_cohorts
                 if 'cohort' in response_data.keys():
                     cohorts = response_data['cohort'].split('|')
                     for cohort in cohorts:
                         cohort_id = cohort.upper()
+                        # Check if cohort in list of cohort references
+                        # # and if the cohort is already in the list provided by th author
                         if cohort_id in self.parsed_cohorts:
-                            cohorts_list.append(self.parsed_cohorts[cohort_id])
+                            if cohort_id not in spreadsheet_cohorts_names:
+                                cohorts_list.append(self.parsed_cohorts[cohort_id])
                         else:
                             self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
 
diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
index f2e06aa6..7418ae5e 100644
--- a/release/scripts/UpdateGwasStudies.py
+++ b/release/scripts/UpdateGwasStudies.py
@@ -130,12 +130,18 @@ def update_studies(self):
                         new_sample.cohorts.set(cohorts_list)
                         # Print a message if the 2 list of cohorts (old & new) are different
                         if sample.cohorts:
-                            old_set = ', '.join(sorted([x.name_short for x in sample.cohorts.all()]))
-                            new_set = ', '.join(sorted([x.name_short for x in cohorts_list]))
-                            if old_set != new_set:
+                            new_set = sorted([x.name_short.upper() for x in cohorts_list])
+
+                            old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
+                            new_set_string = ', '.join(new_set)
+                            if old_set_string != new_set_string:
+                                # Add cohorts which are already associated to the sample in the database, but not in the GWAS study
+                                for sample_cohort in sample.cohorts.all():
+                                    if sample_cohort.name_short.upper() not in new_set:
+                                        new_sample.cohorts.add(sample_cohort)
                                 print(f"\t/!\ Replacing cohorts list:")
-                                print(f"\t  - Old set: {old_set}")
-                                print(f"\t  + New set: {new_set}")
+                                print(f"\t  - Old set: {old_set_string}")
+                                print(f"\t  + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
                     # Copy the list of cohorts from the existing sample.
                     # Need to be added once the new Sample object as been saved,
                     # i.e. when the Sample `id` has been created

From f4474c8c1b05e3df800ca3dba7eed541e85f8688 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 25 Oct 2024 15:28:19 +0100
Subject: [PATCH 7/9] Add missing comment

---
 curation/template_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index 03a4554e..a7787265 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -300,6 +300,7 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:st
         Check that all the required data is available
         > Parameter:
             - gcst_id: GWAS Study ID (e.g. GCST010127)
+            - spreadsheet_cohorts: list of CohortData objects for the current sample, collected from the spreadsheet
             - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions)
         > Return: list of dictionnaries (1 per ancestry)
         """

From c123da533d47d59d02290a670d7d8452305d88d4 Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Fri, 1 Nov 2024 16:44:51 +0000
Subject: [PATCH 8/9] Update comments about the GWAS and sample cohorts

---
 curation/template_parser.py          | 4 ++--
 release/scripts/UpdateGwasStudies.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index a7787265..6e93d314 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -319,8 +319,8 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:st
 
             try:
                 source_PMID = response_data['publicationInfo']['pubmedId']
-                # Create list of cohorts if it exists in the GWAS study
-                # This override the Cohorts found in the cohort column in the spreadsheet
+                # Update the Cohorts list found in the cohort column of the spreadsheet by
+                # adding the list of cohorts from the GWAS study (if the list is present)
                 cohorts_list = spreadsheet_cohorts
                 if 'cohort' in response_data.keys():
                     cohorts = response_data['cohort'].split('|')
diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
index 7418ae5e..184ef0fb 100644
--- a/release/scripts/UpdateGwasStudies.py
+++ b/release/scripts/UpdateGwasStudies.py
@@ -125,7 +125,7 @@ def update_studies(self):
                 # Cohorts data
                 if cohorts_list or sample.cohorts:
                     # Use the list of cohorts from the GWAS study (if available)
-                    # Override the list of cohorts found in the existing sample
+                    # Update the list of cohorts from the existing sample if new cohorts are found in the GWAS study
                     if cohorts_list:
                         new_sample.cohorts.set(cohorts_list)
                         # Print a message if the 2 list of cohorts (old & new) are different

From 57c6caca83e1ff62bd857933cb42d78812cae62f Mon Sep 17 00:00:00 2001
From: Laurent Gil <lg10@sanger.ac.uk>
Date: Thu, 7 Nov 2024 10:56:05 +0000
Subject: [PATCH 9/9] Print message when the list of cohorts from the
 spreadsheet (for a GWAS study) is merged with the list of cohorts from the
 GWAS Catalog REST API

---
 curation/template_parser.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/curation/template_parser.py b/curation/template_parser.py
index 6e93d314..b641551c 100644
--- a/curation/template_parser.py
+++ b/curation/template_parser.py
@@ -321,19 +321,26 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:st
                 source_PMID = response_data['publicationInfo']['pubmedId']
                 # Update the Cohorts list found in the cohort column of the spreadsheet by
                 # adding the list of cohorts from the GWAS study (if the list is present)
-                cohorts_list = spreadsheet_cohorts
+                cohorts_list = spreadsheet_cohorts.copy()
                 if 'cohort' in response_data.keys():
                     cohorts = response_data['cohort'].split('|')
                     for cohort in cohorts:
                         cohort_id = cohort.upper()
                         # Check if cohort in list of cohort references
-                        # # and if the cohort is already in the list provided by th author
+                        # and if the cohort is already in the list provided by the author
                         if cohort_id in self.parsed_cohorts:
                             if cohort_id not in spreadsheet_cohorts_names:
                                 cohorts_list.append(self.parsed_cohorts[cohort_id])
                         else:
                             self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
-
+                    # Print a message if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged.
+                    if spreadsheet_cohorts and len(spreadsheet_cohorts) != len(cohorts_list):
+                        msg = f'''GWAS study {gcst_id} -> the list of cohorts from the spreadsheet has been merged with the one from GWAS.
+                        \t- Spreadsheet list: {', '.join(sorted(spreadsheet_cohorts_names))}
+                        \t+ Merged GWAS list: {', '.join(sorted([x.name.upper() for x in cohorts_list]))}'''
+                        self.report_warning(spreadsheet_name, msg)
+
+                # Ancestry information
                 for ancestry in response_data['ancestries']:
 
                     if ancestry['type'] != 'initial':