Skip to content

Commit

Permalink
Merge pull request PGScatalog#331 from fyvon/improve/qc_variant_positions
Browse files Browse the repository at this point in the history

Study import QC (ref genome validation + reported trait cleaning)
  • Loading branch information
fyvon authored Mar 6, 2024
2 parents 856aa97 + 131cb64 commit 5a04037
Show file tree
Hide file tree
Showing 14 changed files with 516 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
/static/
app.yaml
pgs-catalog-cred.json
.idea
.vscode
19 changes: 15 additions & 4 deletions curation/config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
curation_directories = {
'template_schema': './curation/templates/TemplateColumns2Models.xlsx',
'scoring_schema': './curation/templates/ScoringFileSchema.xlsx',
'studies_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/',
'scoring_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/ScoringFiles/'
'studies_dir': '<studies_dir_path>',
'scoring_dir': '<studies_dir_path>/ScoringFiles/'
}

study_names_list = [
{ 'name': 'Chen2018' },
{ 'name': 'Kolin2021' }
{'name': '<study_name>'},
]

default_curation_status = 'IP'
Expand All @@ -17,3 +16,15 @@
skip_scoringfiles = False

skip_curationtracker = False

# Configuration of the QC step that validates variant positions against the
# Ensembl reference genome (consumed by VariantPositionsQC in curation/imports/scoring_file.py).
variant_positions_qc_config = {
    'skip': False,  # Set to True to ignore the variant positions QC step
    'n_requests': 4,  # Maximum number of requests allowed per score to the Ensembl REST API
    'ensembl_max_variation_req_size': 10,  # Maximum number of variants per request to the Ensembl variation REST API
    'ensembl_max_sequence_req_size': 50,  # Maximum number of variants per request to the Ensembl sequence REST API
    'minimum_match_rate': 0.9  # The minimum required match rate for the QC to be satisfactory
}

# TSV file containing the reported traits to be replaced for homogeneity.
# Required columns: "trait_reported", "corrected".
reported_traits_replacement_file = '<local_dir>/reported_traits_dict.tsv'
6 changes: 5 additions & 1 deletion curation/import_studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@
from curation.config import *

# Main script
curation_import = CurationImport(curation_directories, study_names_list, default_curation_status, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker)
curation_import = CurationImport(
data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status,
scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scoringfiles,
skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config,
reported_traits_dict_file=reported_traits_replacement_file)
curation_import.run_curation_import()
40 changes: 37 additions & 3 deletions curation/imports/curation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import csv
from curation.imports.study import StudyImport
from curation.imports.scoring_file import ScoringFileUpdate
from curation.imports.scoring_file import ScoringFileUpdate, VariantPositionsQC
from curation_tracker.models import CurationPublicationAnnotation


Expand All @@ -17,7 +18,7 @@ class CurationImport():

failed_studies = {}

def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker):
def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker, variant_positions_qc_config, reported_traits_dict_file:str):
self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0)
self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0)

Expand All @@ -30,13 +31,25 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf
self.skip_curationtracker = skip_curationtracker

self.curation_status_by_default = curation_status_by_default
self.variant_positions_qc_config = variant_positions_qc_config

# Reading the reported-traits dictionary file
try:
with open(reported_traits_dict_file, mode='r') as infile:
reader = csv.DictReader(infile, delimiter='\t')
self.reported_traits_cleaner = {row['trait_reported']: row['corrected'] for row in reader}
except FileNotFoundError:
print('ERROR: Could not find \'reported_traits_dict_file\'')
self.reported_traits_cleaner = {}

self.step = 1
self.steps_total = 2
if self.skip_scoringfiles == False:
self.steps_total = self.steps_total + 1
if self.skip_curationtracker == False:
self.steps_total = self.steps_total + 1
if self.variant_positions_qc_config['skip'] == False:
self.steps_total = self.steps_total + 1


def global_report(self):
Expand Down Expand Up @@ -67,7 +80,7 @@ def run_curation_import(self):

## Parsing ##
self.step = 1
study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default)
study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default, self.reported_traits_cleaner)
study_import.print_title()
print(f'==> Step {self.step}/{self.steps_total}: Parsing study data')
study_import.parse_curation_data()
Expand Down Expand Up @@ -98,6 +111,27 @@ def run_curation_import(self):
if study_import.study_name in self.failed_studies.keys():
continue

## Variant positions QC ##
if not self.variant_positions_qc_config['skip']:
self.step += 1
print('\n----------------------------------\n')
print(f'==> Step {self.step}/{self.steps_total}: QC of the variant positions of the Scoring file(s)')
if study_import.study_scores:
for score_id, score in study_import.study_scores.items():
print(f'SCORE: {score.id}')
variant_pos_qc = VariantPositionsQC(score, self.new_scoring_path, self.variant_positions_qc_config)
is_failed = variant_pos_qc.qc_variant_positions(
report_func=lambda msg: print(f' > {msg}'),
error_func=lambda msg: print(f'ERROR: {msg}!')
)
if is_failed:
self.failed_studies[study_import.study_name] = 'scoring file error'
continue
else:
print(" > No scores for this study, therefore no scoring files")
if study_import.study_name in self.failed_studies.keys():
continue

## Update Curation Tracker ##
if self.skip_curationtracker == False:
self.step += 1
Expand Down
101 changes: 96 additions & 5 deletions curation/imports/scoring_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
from catalog.models import Score
from curation.scripts.qc_ref_genome import sample_df, map_rsids, map_variants_to_reference_genome, usecols


class ScoringFileUpdate():
Expand All @@ -22,7 +23,6 @@ def __init__(self, score, study_path, new_scoring_dir, score_file_schema, score_
if not os.path.isdir(self.score_file_path):
self.score_file_path = f'{study_path}/raw scores'


def create_scoringfileheader(self):
'''Function to extract score & publication information for the PGS Catalog Scoring File commented header'''
score = self.score
Expand Down Expand Up @@ -57,7 +57,6 @@ def create_scoringfileheader(self):
print(f'Header creation issue: {e}')
return lines


def update_scoring_file(self):
''' Method to fetch the file, read it, add the header and compress it. '''
failed_update = False
Expand All @@ -69,9 +68,9 @@ def update_scoring_file(self):
raw_scorefile_tsv = f'{raw_scorefile_path}{self.tsv_ext}'
raw_scorefile_xls = f'{raw_scorefile_path}{self.xls_ext}'
if os.path.exists(raw_scorefile_txt):
df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine = 'python')
df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine='python')
elif os.path.exists(raw_scorefile_tsv):
df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine = 'python')
df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine='python')
elif os.path.exists(raw_scorefile_xls):
df_scoring = pd.read_excel(raw_scorefile_xls, dtype='str')
else:
Expand Down Expand Up @@ -140,7 +139,8 @@ def update_scoring_file(self):
new_df_csv.append(row)
df_csv = '\n'.join(new_df_csv)

with gzip.open(f'{self.new_score_file_path}/{score_id}.txt.gz', 'w') as outf:
new_score_file = f'{self.new_score_file_path}/{score_id}.txt.gz'
with gzip.open(new_score_file, 'w') as outf:
outf.write('\n'.join(header).encode('utf-8'))
outf.write('\n'.encode('utf-8'))
outf.write(df_csv.encode('utf-8'))
Expand All @@ -155,3 +155,94 @@ def update_scoring_file(self):
failed_update = True
print(f'ERROR reading scorefile: {raw_scorefile_path}\n-> {e}')
return failed_update


class VariantPositionsQC:
    """QC of a scoring file's variant positions against the assigned genome build.

    Samples variants from the formatted scoring file and checks them through the
    Ensembl REST API (rsID lookup when available, otherwise reference-sequence
    comparison). Flags the score when the match rate falls below the configured
    minimum, which usually indicates an incorrectly reported genome build.
    """

    def __init__(self, score, new_scoring_dir, variant_positions_qc_config):
        '''
        - score: Score model instance whose formatted scoring file is checked
        - new_scoring_dir: directory containing the formatted scoring files ({score.id}.txt.gz)
        - variant_positions_qc_config: QC settings dict (see curation/config.py)
        '''
        self.score = score
        self.new_score_file_path = new_scoring_dir
        self.variant_positions_qc_config = variant_positions_qc_config

    def qc_variant_positions(self, report_func=print, error_func=print) -> bool:
        """ QC the variant positions regarding the assigned genome build of the scoring file.
        - report_func: callable receiving progress/report messages
        - error_func: callable receiving error messages
        Return type: bool - True if the QC failed, False otherwise.
        """
        failed_qc = False
        score = self.score
        genome_build = score.variants_genomebuild
        if not genome_build:
            failed_qc = True
            # Fixed F541: plain string, the former f-string had no placeholder.
            error_func('Missing genome build')
        build_version = None
        if genome_build in ('GRCh37', 'hg19'):
            build_version = '37'
        elif genome_build in ('GRCh38', 'hg38'):
            build_version = '38'
        elif genome_build == 'NR':
            report_func('Genome build = NR, variant positions validation ignored.')
            return failed_qc
        else:
            report_func(f'Genome build = "{genome_build}", not detected as 37 or 38, variant positions validation ignored.')
            return failed_qc
        report_func(f'Build version = {build_version}')

        new_score_file = f'{self.new_score_file_path}/{score.id}.txt.gz'

        # Reading the scoring file, restricted to the QC-relevant columns (usecols
        # comes from curation.scripts.qc_ref_genome).
        df = pd.read_csv(new_score_file,
                         sep="\t",
                         comment='#',
                         usecols=lambda c: c in usecols,
                         dtype={"rsID": 'string', "chr_name": 'string', "chr_position": 'Int64',
                                "effect_allele": 'string', "other_allele": 'string'})

        if 'chr_position' not in df.columns:
            report_func('No chr_position column. Variant position QC will be skipped')
            return failed_qc

        # Headers for alleles. other_allele is optional, but should be tested if it exists
        # as we don't know which allele is the reference one.
        alleles_headers = ['effect_allele']
        if 'other_allele' in df.columns:
            alleles_headers.append('other_allele')

        n_requests = self.variant_positions_qc_config['n_requests']

        # rsID-based validation is preferred; otherwise fall back to comparing
        # alleles against the reference genome sequence (requires both alleles).
        if 'rsID' in df.columns:
            max_request_size = self.variant_positions_qc_config['ensembl_max_variation_req_size']
            map_variants_func = map_rsids
        else:
            if len(alleles_headers) == 1:
                # 1 allele column is not enough for sequence-based validation, as we don't know if the allele is ref or alt.
                report_func('Only 1 allele column with no rsID. Variant position QC will be skipped')
                return failed_qc
            max_request_size = self.variant_positions_qc_config['ensembl_max_sequence_req_size']
            map_variants_func = map_variants_to_reference_genome

        errors = []
        variant_slices = sample_df(df, n_requests, max_request_size, alleles_headers, False, errors)

        match = 0
        mismatch = 0
        for variants in variant_slices:
            results = map_variants_func(variants, build_version)
            match += results['match']
            mismatch += results['mismatch']
            # extend() instead of "errors = errors + ..." to avoid re-copying the list each pass.
            errors.extend(results['errors'])

        # Surface the collected per-request errors: they were previously gathered
        # but silently discarded, hiding API/sampling problems from the curator.
        for qc_error in errors:
            report_func(f'WARNING: {qc_error}')

        final_report = '''Results:
- Matches: {0}
- Mismatches: {1}'''
        report_func(final_report.format(str(match), str(mismatch)))

        minimum_match_rate = self.variant_positions_qc_config['minimum_match_rate']

        # Fail if low match rate
        if match + mismatch == 0:
            report_func('WARNING: No match data!')
        else:
            match_rate: float = float(match) / (match + mismatch)
            report_func(f'Match rate: {match_rate}')
            if match_rate < minimum_match_rate:
                error_func('Low match rate! The assigned genome build might be incorrect')
                failed_qc = True

        return failed_qc
6 changes: 4 additions & 2 deletions curation/imports/study.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class StudyImport():
('sample', 'id', Sample)
]

def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default):
def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default, reported_traits_cleaner:dict):
self.study_name = study_data['name']

if not studies_dir.endswith('/'):
Expand Down Expand Up @@ -49,6 +49,8 @@ def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_
self.failed_data_import = []
self.has_failed = False

self.reported_traits_cleaner = reported_traits_cleaner


def print_title(self):
''' Print the study name '''
Expand Down Expand Up @@ -144,7 +146,7 @@ def import_score_models(self):
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
score = score_data.create_score_model(self.study_publication)
score = score_data.create_score_model(self.study_publication, self.reported_traits_cleaner)
self.import_warnings.append(f'New Score: {score.id} ({score_id})')
self.study_scores[score_id] = score

Expand Down
13 changes: 5 additions & 8 deletions curation/parsers/publication.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,13 @@ def rest_api_call_to_epmc(self,query):
- query: the search query
Return type: JSON
'''
payload = {'format': 'json'}
payload['query'] = query
payload = {'format': 'json', 'query': query}
result = requests.get(constants.USEFUL_URLS['EPMC_REST_SEARCH'], params=payload)
result = result.json()
try:
result = result['resultList']['result'][0]
except:
result = {}
print("ERROR: Can't find the paper in EuropePMC!")
return result
if 'result' in result['resultList']:
return result['resultList']['result'][0]
else:
raise Exception(f'Can\'t find the paper in EuropePMC! (query:{query})')


def create_publication_model(self):
Expand Down
14 changes: 9 additions & 5 deletions curation/parsers/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ class ScoreData(GenericData):
'PRScs': 'PRS-CS'
}

def __init__(self,score_name,spreadsheet_name):
GenericData.__init__(self,spreadsheet_name)
def __init__(self, score_name, spreadsheet_name):
GenericData.__init__(self, spreadsheet_name)
self.name = score_name
self.data = {'name': score_name}


@transaction.atomic
def create_score_model(self,publication):
def create_score_model(self, publication, reported_traits_cleaner: dict):
'''
Create an instance of the Score model.
It also create instance(s) of the EFOTrait model if needed.
Expand All @@ -35,14 +34,19 @@ def create_score_model(self,publication):
if field == 'trait_efo':
efo_traits = []
for trait_id in val:
trait_id = trait_id.replace(':','_').strip()
trait_id = trait_id.replace(':', '_').strip()
trait = TraitData(trait_id, self.spreadsheet_name)
efo = trait.efotrait_model()
efo_traits.append(efo)
else:
if field == 'method_name':
if val in self.method_name_replacement.keys():
val = self.method_name_replacement[val]
elif field == 'trait_reported':
if val in reported_traits_cleaner:
new_val = reported_traits_cleaner[val]
print("Replaced reported trait \"{}\" with \"{}\"".format(val, new_val))
val = new_val
setattr(self.model, field, val)
# Associate a Publication
self.model.publication = publication
Expand Down
Loading

0 comments on commit 5a04037

Please sign in to comment.