Skip to content

Commit

Permalink
Merge the 2 list of cohorts: the one from the spreadsheet and the one…
Browse files Browse the repository at this point in the history
… from the GWAS study (fetched via the GWAS REST API)
  • Loading branch information
ens-lgil committed Oct 25, 2024
1 parent b474a31 commit a243507
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
20 changes: 15 additions & 5 deletions curation/template_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,10 @@ def extract_samples(self):
sample_keys = sample_data.data.keys()
if 'sample_number' not in sample_keys:
if 'source_GWAS_catalog' in sample_keys:
gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_name)
spreadsheet_cohorts = []
if 'cohorts' in sample_keys:
spreadsheet_cohorts = sample_data.data['cohorts']
gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
if gwas_study:
for gwas_ancestry in gwas_study:
c_sample = SampleData(spreadsheet_name)
Expand Down Expand Up @@ -291,7 +294,7 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
return sample_data


def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:str) -> dict:
"""
Get the GWAS Study information related to the PGS sample.
Check that all the required data is available
Expand All @@ -308,18 +311,25 @@ def get_gwas_study(self,gcst_id:str,spreadsheet_name:str) -> dict:
return study_data
response_data = response.json()
if response_data:
# List the cohorts present in the spreadsheet for this sample
spreadsheet_cohorts_names = []
if spreadsheet_cohorts:
spreadsheet_cohorts_names = [x.name.upper() for x in spreadsheet_cohorts]

try:
source_PMID = response_data['publicationInfo']['pubmedId']

# Create list of cohorts if it exists in the GWAS study
# This overrides the Cohorts found in the cohort column in the spreadsheet
cohorts_list = []
cohorts_list = spreadsheet_cohorts
if 'cohort' in response_data.keys():
cohorts = response_data['cohort'].split('|')
for cohort in cohorts:
cohort_id = cohort.upper()
# Check if cohort in list of cohort references
# and if the cohort is already in the list provided by the author
if cohort_id in self.parsed_cohorts:
cohorts_list.append(self.parsed_cohorts[cohort_id])
if cohort_id not in spreadsheet_cohorts_names:
cohorts_list.append(self.parsed_cohorts[cohort_id])
else:
self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')

Expand Down
16 changes: 11 additions & 5 deletions release/scripts/UpdateGwasStudies.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,18 @@ def update_studies(self):
new_sample.cohorts.set(cohorts_list)
# Print a message if the 2 lists of cohorts (old & new) are different
if sample.cohorts:
old_set = ', '.join(sorted([x.name_short for x in sample.cohorts.all()]))
new_set = ', '.join(sorted([x.name_short for x in cohorts_list]))
if old_set != new_set:
new_set = sorted([x.name_short.upper() for x in cohorts_list])

old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
new_set_string = ', '.join(new_set)
if old_set_string != new_set_string:
# Add cohorts which are already associated to the sample in the database, but not in the GWAS study
for sample_cohort in sample.cohorts.all():
if sample_cohort.name_short.upper() not in new_set:
new_sample.cohorts.add(sample_cohort)
print(f"\t/!\ Replacing cohorts list:")
print(f"\t - Old set: {old_set}")
print(f"\t + New set: {new_set}")
print(f"\t - Old set: {old_set_string}")
print(f"\t + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
# Copy the list of cohorts from the existing sample.
# Needs to be added once the new Sample object has been saved,
# i.e. when the Sample `id` has been created
Expand Down

0 comments on commit a243507

Please sign in to comment.