Skip to content

Commit

Permalink
Merge pull request PGScatalog#394 from ens-lgil/feature/gwas_cohorts
Browse files Browse the repository at this point in the history
Synchronize cohorts with the NHGRI GWAS Catalog, when available
  • Loading branch information
fyvon authored Nov 7, 2024
2 parents a1c4eb1 + 57c6cac commit 6e102aa
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 81 deletions.
2 changes: 1 addition & 1 deletion curation/parsers/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def str2metric(self, field, val):
val = self.replace_non_ascii_chars(field,val)

# Estimate with percentage as unit
if re.match('^\d+\.?\d*\s*\%$',val):
if re.match(r'^\d+\.?\d*\s*\%$',val):
val = val.replace('%','').strip()
current_metric.add_data('estimate', val)
current_metric.add_data('unit', '%')
Expand Down
4 changes: 2 additions & 2 deletions curation/parsers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def str2demographic(self, field, val):
- val: data value
Return type: DemographicData object
'''
unit_regex = "([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
unit_regex = r"([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
current_demographic = DemographicData(field,val,self.spreadsheet_name)
if type(val) == float:
current_demographic.add_data('estimate', val)
Expand Down Expand Up @@ -129,7 +129,7 @@ def create_sample_model(self):
elif field == 'sample_percent_male':
# Remove % character
val_str = str(val)
if re.search('\%',val_str):
if re.search(r'\%',val_str):
val_str = re.sub(r'\%', r'', val_str)
val_str = re.sub(r' ', r'', val_str)
val = float(val_str)
Expand Down
160 changes: 99 additions & 61 deletions curation/template_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,17 @@ def extract_samples(self):
sample_keys = sample_data.data.keys()
if 'sample_number' not in sample_keys:
if 'source_GWAS_catalog' in sample_keys:
gwas_study = get_gwas_study(sample_data.data['source_GWAS_catalog'])
spreadsheet_cohorts = []
if 'cohorts' in sample_keys:
spreadsheet_cohorts = sample_data.data['cohorts']
gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
if gwas_study:
for gwas_ancestry in gwas_study:
c_sample = SampleData(spreadsheet_name)
# Spreadsheet sample/cohort data
for col, entry in sample_data.data.items():
c_sample.add_data(col, entry)
# GWAS Catalog sample/cohort data
for field, val in gwas_ancestry.items():
c_sample.add_data(field, val)
self.update_report(c_sample)
Expand Down Expand Up @@ -289,6 +294,99 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
return sample_data


def get_gwas_study(self, gcst_id: str, spreadsheet_cohorts: list, spreadsheet_name: str) -> list:
    """
    Get the GWAS Study information related to the PGS sample and merge the
    cohorts listed in the GWAS Catalog with the ones found in the spreadsheet.
    > Parameter:
        - gcst_id: GWAS Study ID (e.g. GCST010127)
        - spreadsheet_cohorts: list of CohortData objects for the current sample, collected from the spreadsheet
        - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions)
    > Return: list of dictionaries (1 per "initial" ancestry entry).
              The list is empty when the study can't be fetched, and may be
              partial when the response can't be fully parsed.
    """
    study_data = []
    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
    response = requests.get(f'{gwas_rest_url}{gcst_id}')

    # A requests Response is falsy for HTTP error statuses (4xx/5xx)
    if not response:
        return study_data
    response_data = response.json()
    if not response_data:
        return study_data

    # Work on a copy so the caller's list is never modified; also tolerate None
    sheet_cohorts = list(spreadsheet_cohorts or [])
    # List the cohorts present in the spreadsheet for this sample
    spreadsheet_cohorts_names = [x.name.upper() for x in sheet_cohorts]

    try:
        source_PMID = response_data['publicationInfo']['pubmedId']

        # Update the Cohorts list found in the cohort column of the spreadsheet by
        # adding the list of cohorts from the GWAS study (if the list is present)
        cohorts_list = list(sheet_cohorts)
        known_names = set(spreadsheet_cohorts_names)
        # A missing, null or empty 'cohort' field simply means "no GWAS cohorts":
        # it must not abort the parsing of the ancestries below.
        gwas_cohorts = response_data.get('cohort')
        if gwas_cohorts:
            for cohort in gwas_cohorts.split('|'):
                cohort_id = cohort.upper()
                # Check if cohort in list of cohort references
                if cohort_id not in self.parsed_cohorts:
                    self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
                    continue
                # Skip the cohorts already provided by the author, and the ones
                # listed more than once in the GWAS study
                if cohort_id not in known_names:
                    known_names.add(cohort_id)
                    cohorts_list.append(self.parsed_cohorts[cohort_id])

        # Print a message if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged.
        if sheet_cohorts and len(sheet_cohorts) != len(cohorts_list):
            spreadsheet_names = ', '.join(sorted(spreadsheet_cohorts_names))
            merged_names = ', '.join(sorted([x.name.upper() for x in cohorts_list]))
            msg = (
                f'GWAS study {gcst_id} -> the list of cohorts from the spreadsheet has been merged with the one from GWAS.\n'
                f'\t- Spreadsheet list: {spreadsheet_names}\n'
                f'\t+ Merged GWAS list: {merged_names}'
            )
            self.report_warning(spreadsheet_name, msg)

        # Ancestry information
        for ancestry in response_data['ancestries']:
            # Only the "initial" (discovery) entries are used
            if ancestry['type'] != 'initial':
                continue

            ancestry_data = {'source_PMID': source_PMID}
            # Add cohorts list (the same list object is shared by all the ancestries)
            if cohorts_list:
                ancestry_data['cohorts'] = cohorts_list
            ancestry_data['sample_number'] = ancestry['numberOfIndividuals']

            # ancestry_broad
            broad = [group['ancestralGroup'] for group in ancestry['ancestralGroups']]
            if broad:
                ancestry_data['ancestry_broad'] = ','.join(broad)

            # ancestry_free ('NR' entries are ignored)
            origins = [
                country['countryName']
                for country in ancestry['countryOfOrigin']
                if country['countryName'] != 'NR'
            ]
            if origins:
                ancestry_data['ancestry_free'] = ','.join(origins)

            # ancestry_country ('NR' entries are ignored)
            recruitments = [
                country['countryName']
                for country in ancestry['countryOfRecruitment']
                if country['countryName'] != 'NR'
            ]
            if recruitments:
                ancestry_data['ancestry_country'] = ','.join(recruitments)

            # ancestry_additional
            # Not found in the REST API

            study_data.append(ancestry_data)
    except Exception:
        # Best effort: an unexpected response structure must not stop the
        # curation. KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Error: can't fetch GWAS results for {gcst_id}")
    return study_data



def get_model_field_from_schema(self, col, current_schema):
'''
Retrieve the model and field from the Template, that corresponds to the current spreadsheet column.
Expand Down Expand Up @@ -370,66 +468,6 @@ def has_report_info(self):
# Independent methods #
#=======================#

def get_gwas_study(gcst_id):
    """
    Get the GWAS Study information related to the PGS sample.
    > Parameter:
        - gcst_id: GWAS Study ID (e.g. GCST010127)
    > Return: list of dictionaries (1 per "initial" ancestry entry).
              The list is empty when the study can't be fetched, and may be
              partial when the response can't be fully parsed.
    """
    study_data = []
    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
    response = requests.get(f'{gwas_rest_url}{gcst_id}')

    # A requests Response is falsy for HTTP error statuses (4xx/5xx)
    if not response:
        return study_data
    response_data = response.json()
    if not response_data:
        return study_data

    try:
        source_PMID = response_data['publicationInfo']['pubmedId']
        for ancestry in response_data['ancestries']:
            # Only the "initial" (discovery) entries are used
            if ancestry['type'] != 'initial':
                continue

            ancestry_data = {
                'source_PMID': source_PMID,
                'sample_number': ancestry['numberOfIndividuals'],
            }

            # ancestry_broad
            broad = [group['ancestralGroup'] for group in ancestry['ancestralGroups']]
            if broad:
                ancestry_data['ancestry_broad'] = ','.join(broad)

            # ancestry_free ('NR' entries are ignored)
            origins = [
                country['countryName']
                for country in ancestry['countryOfOrigin']
                if country['countryName'] != 'NR'
            ]
            if origins:
                ancestry_data['ancestry_free'] = ','.join(origins)

            # ancestry_country ('NR' entries are ignored)
            recruitments = [
                country['countryName']
                for country in ancestry['countryOfRecruitment']
                if country['countryName'] != 'NR'
            ]
            if recruitments:
                ancestry_data['ancestry_country'] = ','.join(recruitments)

            # ancestry_additional
            # Not found in the REST API

            study_data.append(ancestry_data)
    except Exception:
        # Best effort: an unexpected response structure must not stop the
        # caller. KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Error: can't fetch GWAS results for {gcst_id}")
    return study_data


def next_PSS_num():
r = SampleSet.objects.last()
if r == None:
Expand Down
78 changes: 61 additions & 17 deletions release/scripts/UpdateGwasStudies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import requests
from catalog.models import Sample, Score
from pgs_web import constants

from catalog.models import Sample, Score, Cohort


class UpdateGwasStudies:
Expand All @@ -16,15 +14,15 @@ def __init__(self,verbose=None):
self.verbose = verbose


def get_gwas_info(self,sample):
def get_gwas_info(self,sample:Sample) -> dict:
"""
Get the GWAS Study information related to the PGS sample.
Check that all the required data is available
> Parameter:
- gcst_id: GWAS Study ID (e.g. GCST010127)
> Return: list of dictionnaries (1 per ancestry)
- sample: instance of a Sample model
> Return: dictionary (cohorts and ancestries)
"""
study_data = []
study_data = { "ancestries": [] }
gcst_id = sample.source_GWAS_catalog
response = requests.get(f'{self.gwas_rest_url}{gcst_id}')

Expand All @@ -37,6 +35,26 @@ def get_gwas_info(self,sample):
if response_data:
try:
source_PMID = response_data['publicationInfo']['pubmedId']

# Create list of cohorts if it exists in the GWAS study
# This override the Cohorts found previously in the cohort column in the spreadsheet
cohorts_list = []
if 'cohort' in response_data.keys():
cohorts = response_data['cohort'].split('|')
for cohort in cohorts:
cohort_id = cohort.upper()
try:
cohort_model = Cohort.objects.get(name_short__iexact=cohort_id)
cohorts_list.append(cohort_model)
except Cohort.DoesNotExist:
print(f"New cohort found: {cohort_id}")
cohort_model = Cohort(name_short=cohort_id,name_full=cohort_id)
cohort_model.save()
cohorts_list.append(cohort_model)
if cohorts_list:
study_data['cohorts'] = cohorts_list

# Ancestries
for ancestry in response_data['ancestries']:

if ancestry['type'] != 'initial':
Expand Down Expand Up @@ -70,12 +88,12 @@ def get_gwas_info(self,sample):
else:
ancestry_data['ancestry_country'] += self.country_sep
ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
study_data.append(ancestry_data)
study_data["ancestries"].append(ancestry_data)

if study_data:
print(f'\t{len(study_data)} distinct ancestries')
if study_data["ancestries"]:
print(f'\t{len(study_data["ancestries"])} distinct ancestries')
if self.verbose:
for anc in study_data:
for anc in study_data["ancestries"]:
print(f'\t{anc}')
else:
print("\tNo ancestry")
Expand All @@ -90,7 +108,12 @@ def update_studies(self):
for sample in self.samples:
gwas_study = self.get_gwas_info(sample)
new_samples = []
for gwas_ancestry in gwas_study:
cohorts_list = []
# List of cohorts
if 'cohorts' in gwas_study.keys():
cohorts_list = gwas_study['cohorts']
# List of ancestry data
for gwas_ancestry in gwas_study['ancestries']:
new_sample = Sample()
new_sample.source_GWAS_catalog = sample.source_GWAS_catalog
for field, val in gwas_ancestry.items():
Expand All @@ -99,11 +122,32 @@ def update_studies(self):
setattr(new_sample, field, val)
new_sample.save()

# Cohorts - need to be added once the Sample object as been saved,
# i.e. when the Sample `id` has been created
if sample.cohorts:
for cohort in sample.cohorts.all():
new_sample.cohorts.add(cohort)
# Cohorts data
if cohorts_list or sample.cohorts:
# Use the list of cohorts from the GWAS study (if available)
# Update the list of cohorts from the existing sample if new cohorts are found in the GWAS study
if cohorts_list:
new_sample.cohorts.set(cohorts_list)
# Print a message if the 2 list of cohorts (old & new) are different
if sample.cohorts:
new_set = sorted([x.name_short.upper() for x in cohorts_list])

old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
new_set_string = ', '.join(new_set)
if old_set_string != new_set_string:
# Add cohorts which are already associated to the sample in the database, but not in the GWAS study
for sample_cohort in sample.cohorts.all():
if sample_cohort.name_short.upper() not in new_set:
new_sample.cohorts.add(sample_cohort)
print(f"\t/!\ Replacing cohorts list:")
print(f"\t - Old set: {old_set_string}")
print(f"\t + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
# Copy the list of cohorts from the existing sample.
# Need to be added once the new Sample object as been saved,
# i.e. when the Sample `id` has been created
elif sample.cohorts:
for cohort in sample.cohorts.all():
new_sample.cohorts.add(cohort)
new_sample.save()

new_samples.append(new_sample)
Expand Down

0 comments on commit 6e102aa

Please sign in to comment.