diff --git a/curation/parsers/performance.py b/curation/parsers/performance.py index 91a703e1..f0baa9a4 100644 --- a/curation/parsers/performance.py +++ b/curation/parsers/performance.py @@ -52,7 +52,7 @@ def str2metric(self, field, val): val = self.replace_non_ascii_chars(field,val) # Estimate with percentage as unit - if re.match('^\d+\.?\d*\s*\%$',val): + if re.match(r'^\d+\.?\d*\s*\%$',val): val = val.replace('%','').strip() current_metric.add_data('estimate', val) current_metric.add_data('unit', '%') diff --git a/curation/parsers/sample.py b/curation/parsers/sample.py index 5a8148dc..df648b77 100644 --- a/curation/parsers/sample.py +++ b/curation/parsers/sample.py @@ -18,7 +18,7 @@ def str2demographic(self, field, val): - val: data value Return type: DemographicData object ''' - unit_regex = "([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)" + unit_regex = r"([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)" current_demographic = DemographicData(field,val,self.spreadsheet_name) if type(val) == float: current_demographic.add_data('estimate', val) @@ -129,7 +129,7 @@ def create_sample_model(self): elif field == 'sample_percent_male': # Remove % character val_str = str(val) - if re.search('\%',val_str): + if re.search(r'\%',val_str): val_str = re.sub(r'\%', r'', val_str) val_str = re.sub(r' ', r'', val_str) val = float(val_str) diff --git a/curation/template_parser.py b/curation/template_parser.py index 76b78c09..b641551c 100644 --- a/curation/template_parser.py +++ b/curation/template_parser.py @@ -201,12 +201,17 @@ def extract_samples(self): sample_keys = sample_data.data.keys() if 'sample_number' not in sample_keys: if 'source_GWAS_catalog' in sample_keys: - gwas_study = get_gwas_study(sample_data.data['source_GWAS_catalog']) + spreadsheet_cohorts = [] + if 'cohorts' in sample_keys: + spreadsheet_cohorts = sample_data.data['cohorts'] + gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name) if gwas_study: for gwas_ancestry in gwas_study: c_sample = SampleData(spreadsheet_name) + # Spreadsheet sample/cohort data for col, entry in sample_data.data.items(): c_sample.add_data(col, entry) + # GWAS Catalog sample/cohort data for field, val in gwas_ancestry.items(): c_sample.add_data(field, val) self.update_report(c_sample) @@ -289,6 +294,99 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples return sample_data + def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:str) -> dict: + """ + Get the GWAS Study information related to the PGS sample. + Check that all the required data is available + > Parameter: + - gcst_id: GWAS Study ID (e.g. GCST010127) + - spreadsheet_cohorts: list of CohortData objects for the current sample, collected from the spreadsheet + - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions) + > Return: list of dictionnaries (1 per ancestry) + """ + study_data = [] + gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/' + response = requests.get(f'{gwas_rest_url}{gcst_id}') + + if not response: + return study_data + response_data = response.json() + if response_data: + # List the cohorts present in the spreadsheet for this sample + spreadsheet_cohorts_names = [] + if spreadsheet_cohorts: + spreadsheet_cohorts_names = [x.name.upper() for x in spreadsheet_cohorts] + + try: + source_PMID = response_data['publicationInfo']['pubmedId'] + # Update the Cohorts list found in the cohort column of the spreadsheet by + # adding the list of cohorts from the GWAS study (if the list is present) + cohorts_list = spreadsheet_cohorts.copy() + if 'cohort' in response_data.keys(): + cohorts = response_data['cohort'].split('|') + for cohort in cohorts: + cohort_id = cohort.upper() + # Check if cohort in list of cohort references + # and if the cohort is already in the list provided by the author + if cohort_id in self.parsed_cohorts: + if cohort_id not in spreadsheet_cohorts_names: + cohorts_list.append(self.parsed_cohorts[cohort_id]) + else: + self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet') + # Print a message if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged. + if spreadsheet_cohorts and len(spreadsheet_cohorts) != len(cohorts_list): + msg = f'''GWAS study {gcst_id} -> the list of cohorts from the spreadsheet has been merged with the one from GWAS. + \t- Spreadsheet list: {', '.join(sorted(spreadsheet_cohorts_names))} + \t+ Merged GWAS list: {', '.join(sorted([x.name.upper() for x in cohorts_list]))}''' + self.report_warning(spreadsheet_name, msg) + + # Ancestry information + for ancestry in response_data['ancestries']: + + if ancestry['type'] != 'initial': + continue + + ancestry_data = { 'source_PMID': source_PMID } + # Add cohorts list + if cohorts_list: + ancestry_data['cohorts'] = cohorts_list + ancestry_data['sample_number'] = ancestry['numberOfIndividuals'] + + # ancestry_broad + for ancestralGroup in ancestry['ancestralGroups']: + if not 'ancestry_broad' in ancestry_data: + ancestry_data['ancestry_broad'] = '' + else: + ancestry_data['ancestry_broad'] += ',' + ancestry_data['ancestry_broad'] += ancestralGroup['ancestralGroup'] + + # ancestry_free + for countryOfOrigin in ancestry['countryOfOrigin']: + if countryOfOrigin['countryName'] != 'NR': + if not 'ancestry_free' in ancestry_data: + ancestry_data['ancestry_free'] = '' + else: + ancestry_data['ancestry_free'] += ',' + ancestry_data['ancestry_free'] += countryOfOrigin['countryName'] + + # ancestry_country + for countryOfRecruitment in ancestry['countryOfRecruitment']: + if countryOfRecruitment['countryName'] != 'NR': + if not 'ancestry_country' in ancestry_data: + ancestry_data['ancestry_country'] = '' + else: + ancestry_data['ancestry_country'] += ',' + ancestry_data['ancestry_country'] += countryOfRecruitment['countryName'] + # ancestry_additional + # Not found in the REST API + + study_data.append(ancestry_data) + except: + print(f'Error: can\'t fetch GWAS results for {gcst_id}') + return study_data + + + def get_model_field_from_schema(self, col, current_schema): ''' Retrieve the model and field from the Template, that corresponds to the current spreadsheet column. @@ -370,66 +468,6 @@ def has_report_info(self): # Independent methods # #=======================# -def get_gwas_study(gcst_id): - """ - Get the GWAS Study information related to the PGS sample. - Check that all the required data is available - > Parameter: - - gcst_id: GWAS Study ID (e.g. GCST010127) - > Return: list of dictionnaries (1 per ancestry) - """ - study_data = [] - gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/' - response = requests.get(f'{gwas_rest_url}{gcst_id}') - - if not response: - return study_data - response_data = response.json() - if response_data: - try: - source_PMID = response_data['publicationInfo']['pubmedId'] - for ancestry in response_data['ancestries']: - - if ancestry['type'] != 'initial': - continue - - ancestry_data = { 'source_PMID': source_PMID } - ancestry_data['sample_number'] = ancestry['numberOfIndividuals'] - - # ancestry_broad - for ancestralGroup in ancestry['ancestralGroups']: - if not 'ancestry_broad' in ancestry_data: - ancestry_data['ancestry_broad'] = '' - else: - ancestry_data['ancestry_broad'] += ',' - ancestry_data['ancestry_broad'] += ancestralGroup['ancestralGroup'] - - # ancestry_free - for countryOfOrigin in ancestry['countryOfOrigin']: - if countryOfOrigin['countryName'] != 'NR': - if not 'ancestry_free' in ancestry_data: - ancestry_data['ancestry_free'] = '' - else: - ancestry_data['ancestry_free'] += ',' - ancestry_data['ancestry_free'] += countryOfOrigin['countryName'] - - # ancestry_country - for countryOfRecruitment in ancestry['countryOfRecruitment']: - if countryOfRecruitment['countryName'] != 'NR': - if not 'ancestry_country' in ancestry_data: - ancestry_data['ancestry_country'] = '' - else: - ancestry_data['ancestry_country'] += ',' - ancestry_data['ancestry_country'] += countryOfRecruitment['countryName'] - # ancestry_additional - # Not found in the REST API - - study_data.append(ancestry_data) - except: - print(f'Error: can\'t fetch GWAS results for {gcst_id}') - return study_data - - def next_PSS_num(): r = SampleSet.objects.last() if r == None: diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py index 99308a77..184ef0fb 100644 --- a/release/scripts/UpdateGwasStudies.py +++ b/release/scripts/UpdateGwasStudies.py @@ -1,7 +1,5 @@ import requests -from catalog.models import Sample, Score -from pgs_web import constants - +from catalog.models import Sample, Score, Cohort class UpdateGwasStudies: @@ -16,15 +14,15 @@ def __init__(self,verbose=None): self.verbose = verbose - def get_gwas_info(self,sample): + def get_gwas_info(self,sample:Sample) -> dict: """ Get the GWAS Study information related to the PGS sample. Check that all the required data is available > Parameter: - - gcst_id: GWAS Study ID (e.g. GCST010127) - > Return: list of dictionnaries (1 per ancestry) + - sample: instance of a Sample model + > Return: dictionary (cohorts and ancestries) """ - study_data = [] + study_data = { "ancestries": [] } gcst_id = sample.source_GWAS_catalog response = requests.get(f'{self.gwas_rest_url}{gcst_id}') @@ -37,6 +35,26 @@ def get_gwas_info(self,sample): if response_data: try: source_PMID = response_data['publicationInfo']['pubmedId'] + + # Create list of cohorts if it exists in the GWAS study + # This override the Cohorts found previously in the cohort column in the spreadsheet + cohorts_list = [] + if 'cohort' in response_data.keys(): + cohorts = response_data['cohort'].split('|') + for cohort in cohorts: + cohort_id = cohort.upper() + try: + cohort_model = Cohort.objects.get(name_short__iexact=cohort_id) + cohorts_list.append(cohort_model) + except Cohort.DoesNotExist: + print(f"New cohort found: {cohort_id}") + cohort_model = Cohort(name_short=cohort_id,name_full=cohort_id) + cohort_model.save() + cohorts_list.append(cohort_model) + if cohorts_list: + study_data['cohorts'] = cohorts_list + + # Ancestries for ancestry in response_data['ancestries']: if ancestry['type'] != 'initial': @@ -70,12 +88,12 @@ def get_gwas_info(self,sample): else: ancestry_data['ancestry_country'] += self.country_sep ancestry_data['ancestry_country'] += countryOfRecruitment['countryName'] - study_data.append(ancestry_data) + study_data["ancestries"].append(ancestry_data) - if study_data: - print(f'\t{len(study_data)} distinct ancestries') + if study_data["ancestries"]: + print(f'\t{len(study_data["ancestries"])} distinct ancestries') if self.verbose: - for anc in study_data: + for anc in study_data["ancestries"]: print(f'\t{anc}') else: print("\tNo ancestry") @@ -90,7 +108,12 @@ def update_studies(self): for sample in self.samples: gwas_study = self.get_gwas_info(sample) new_samples = [] - for gwas_ancestry in gwas_study: + cohorts_list = [] + # List of cohorts + if 'cohorts' in gwas_study.keys(): + cohorts_list = gwas_study['cohorts'] + # List of ancestry data + for gwas_ancestry in gwas_study['ancestries']: new_sample = Sample() new_sample.source_GWAS_catalog = sample.source_GWAS_catalog for field, val in gwas_ancestry.items(): @@ -99,11 +122,32 @@ def update_studies(self): setattr(new_sample, field, val) new_sample.save() - # Cohorts - need to be added once the Sample object as been saved, - # i.e. when the Sample `id` has been created - if sample.cohorts: - for cohort in sample.cohorts.all(): - new_sample.cohorts.add(cohort) + # Cohorts data + if cohorts_list or sample.cohorts: + # Use the list of cohorts from the GWAS study (if available) + # Update the list of cohorts from the existing sample if new cohorts are found in the GWAS study + if cohorts_list: + new_sample.cohorts.set(cohorts_list) + # Print a message if the 2 list of cohorts (old & new) are different + if sample.cohorts: + new_set = sorted([x.name_short.upper() for x in cohorts_list]) + + old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()])) + new_set_string = ', '.join(new_set) + if old_set_string != new_set_string: + # Add cohorts which are already associated to the sample in the database, but not in the GWAS study + for sample_cohort in sample.cohorts.all(): + if sample_cohort.name_short.upper() not in new_set: + new_sample.cohorts.add(sample_cohort) + print(f"\t/!\ Replacing cohorts list:") + print(f"\t - Old set: {old_set_string}") + print(f"\t + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}") + # Copy the list of cohorts from the existing sample. + # Need to be added once the new Sample object as been saved, + # i.e. when the Sample `id` has been created + elif sample.cohorts: + for cohort in sample.cohorts.all(): + new_sample.cohorts.add(cohort) new_sample.save() new_samples.append(new_sample)