[pull] master from PGScatalog:master #124

Merged · 12 commits · Mar 7, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
/static/
app.yaml
pgs-catalog-cred.json
.idea
.vscode
19 changes: 15 additions & 4 deletions curation/config.py
@@ -1,13 +1,12 @@
curation_directories = {
'template_schema': './curation/templates/TemplateColumns2Models.xlsx',
'scoring_schema': './curation/templates/ScoringFileSchema.xlsx',
- 'studies_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/',
- 'scoring_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/ScoringFiles/'
+ 'studies_dir': '<studies_dir_path>',
+ 'scoring_dir': '<studies_dir_path>/ScoringFiles/'
}

study_names_list = [
- { 'name': 'Chen2018' },
- { 'name': 'Kolin2021' }
+ {'name': '<study_name>'},
]

default_curation_status = 'IP'
@@ -17,3 +16,15 @@
skip_scoringfiles = False

skip_curationtracker = False

variant_positions_qc_config = {
'skip': False, # Set to True to ignore the variant positions QC step
'n_requests': 4, # Maximum number of requests allowed per score to the Ensembl REST API
'ensembl_max_variation_req_size': 10, # Maximum number of variants per request to the Ensembl variation REST API
'ensembl_max_sequence_req_size': 50, # Maximum number of variants per request to the Ensembl sequence REST API
'minimum_match_rate': 0.9 # The minimum required match rate for the QC to be satisfactory
}

# TSV file containing the reported traits to be replaced for homogeneity.
# Required columns: "trait_reported", "corrected".
reported_traits_replacement_file = '<local_dir>/reported_traits_dict.tsv'
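
A minimal sketch of what this replacement file could look like, given the two required columns (tab-separated; the trait values are illustrative, not taken from this PR):

trait_reported	corrected
Type II diabetes	Type 2 diabetes
CAD	Coronary artery disease

CurationImport (below) loads this file with csv.DictReader into a plain dict, e.g. {'Type II diabetes': 'Type 2 diabetes', 'CAD': 'Coronary artery disease'}, and passes it down to each StudyImport as reported_traits_cleaner.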
6 changes: 5 additions & 1 deletion curation/import_studies.py
@@ -2,5 +2,9 @@
from curation.config import *

# Main script
- curation_import = CurationImport(curation_directories, study_names_list, default_curation_status, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker)
+ curation_import = CurationImport(
+     data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status,
+     scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scoringfiles,
+     skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config,
+     reported_traits_dict_file=reported_traits_replacement_file)
curation_import.run_curation_import()
40 changes: 37 additions & 3 deletions curation/imports/curation.py
@@ -1,6 +1,7 @@
import pandas as pd
import csv
from curation.imports.study import StudyImport
- from curation.imports.scoring_file import ScoringFileUpdate
+ from curation.imports.scoring_file import ScoringFileUpdate, VariantPositionsQC
from curation_tracker.models import CurationPublicationAnnotation


@@ -17,7 +18,7 @@ class CurationImport():

failed_studies = {}

- def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker):
+ def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker, variant_positions_qc_config, reported_traits_dict_file: str):
self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0)
self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0)

@@ -30,13 +31,25 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf
self.skip_curationtracker = skip_curationtracker

self.curation_status_by_default = curation_status_by_default
self.variant_positions_qc_config = variant_positions_qc_config

# Reading the reported-traits dictionary file
try:
with open(reported_traits_dict_file, mode='r') as infile:
reader = csv.DictReader(infile, delimiter='\t')
self.reported_traits_cleaner = {row['trait_reported']: row['corrected'] for row in reader}
except FileNotFoundError:
print('ERROR: Could not find \'reported_traits_dict_file\'')
self.reported_traits_cleaner = {}

self.step = 1
self.steps_total = 2
if self.skip_scoringfiles == False:
self.steps_total = self.steps_total + 1
if self.skip_curationtracker == False:
self.steps_total = self.steps_total + 1
if self.variant_positions_qc_config['skip'] == False:
self.steps_total = self.steps_total + 1


def global_report(self):
@@ -67,7 +80,7 @@ def run_curation_import(self):

## Parsing ##
self.step = 1
- study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default)
+ study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default, self.reported_traits_cleaner)
study_import.print_title()
print(f'==> Step {self.step}/{self.steps_total}: Parsing study data')
study_import.parse_curation_data()
@@ -98,6 +111,27 @@
if study_import.study_name in self.failed_studies.keys():
continue

## Variant positions QC ##
if not self.variant_positions_qc_config['skip']:
self.step += 1
print('\n----------------------------------\n')
print(f'==> Step {self.step}/{self.steps_total}: QC of the variant positions of the Scoring file(s)')
if study_import.study_scores:
for score_id, score in study_import.study_scores.items():
print(f'SCORE: {score.id}')
variant_pos_qc = VariantPositionsQC(score, self.new_scoring_path, self.variant_positions_qc_config)
is_failed = variant_pos_qc.qc_variant_positions(
report_func=lambda msg: print(f' > {msg}'),
error_func=lambda msg: print(f'ERROR: {msg}!')
)
if is_failed:
self.failed_studies[study_import.study_name] = 'scoring file error'
continue
else:
print(" > No scores for this study, therefore no scoring files")
if study_import.study_name in self.failed_studies.keys():
continue

## Update Curation Tracker ##
if self.skip_curationtracker == False:
self.step += 1
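For orientation: with nothing skipped in config.py, steps_total comes to 5 per study, i.e. the two base steps (parsing, then importing the parsed data into the database; the import step sits in a collapsed part of this diff), plus the scoring-file update, the new variant-positions QC, and the curation-tracker update.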
101 changes: 96 additions & 5 deletions curation/imports/scoring_file.py
@@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
from catalog.models import Score
from curation.scripts.qc_ref_genome import sample_df, map_rsids, map_variants_to_reference_genome, usecols


class ScoringFileUpdate():
@@ -22,7 +23,6 @@ def __init__(self, score, study_path, new_scoring_dir, score_file_schema, score_
if not os.path.isdir(self.score_file_path):
self.score_file_path = f'{study_path}/raw scores'


def create_scoringfileheader(self):
'''Function to extract score & publication information for the PGS Catalog Scoring File commented header'''
score = self.score
@@ -57,7 +57,6 @@ def create_scoringfileheader(self):
print(f'Header creation issue: {e}')
return lines


def update_scoring_file(self):
''' Method to fetch the file, read it, add the header and compress it. '''
failed_update = False
@@ -69,9 +68,9 @@
raw_scorefile_tsv = f'{raw_scorefile_path}{self.tsv_ext}'
raw_scorefile_xls = f'{raw_scorefile_path}{self.xls_ext}'
if os.path.exists(raw_scorefile_txt):
- df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine = 'python')
+ df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine='python')
elif os.path.exists(raw_scorefile_tsv):
- df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine = 'python')
+ df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine='python')
elif os.path.exists(raw_scorefile_xls):
df_scoring = pd.read_excel(raw_scorefile_xls, dtype='str')
else:
@@ -140,7 +139,8 @@ def update_scoring_file(self):
new_df_csv.append(row)
df_csv = '\n'.join(new_df_csv)

- with gzip.open(f'{self.new_score_file_path}/{score_id}.txt.gz', 'w') as outf:
+ new_score_file = f'{self.new_score_file_path}/{score_id}.txt.gz'
+ with gzip.open(new_score_file, 'w') as outf:
outf.write('\n'.join(header).encode('utf-8'))
outf.write('\n'.encode('utf-8'))
outf.write(df_csv.encode('utf-8'))
@@ -155,3 +155,94 @@
failed_update = True
print(f'ERROR reading scorefile: {raw_scorefile_path}\n-> {e}')
return failed_update


class VariantPositionsQC:

def __init__(self, score, new_scoring_dir, variant_positions_qc_config):
self.score = score
self.new_score_file_path = new_scoring_dir
self.variant_positions_qc_config = variant_positions_qc_config

def qc_variant_positions(self, report_func=print, error_func=print) -> bool:
""" QC the variant positions regarding the assigned genome build of the scoring file."""
failed_qc = False
score = self.score
genome_build = score.variants_genomebuild
if not genome_build:
failed_qc = True
error_func('Missing genome build')
build_version = None
if genome_build in ('GRCh37', 'hg19'):
build_version = '37'
elif genome_build in ('GRCh38', 'hg38'):
build_version = '38'
elif genome_build == 'NR':
report_func('Genome build = NR, variant positions validation ignored.')
return failed_qc
else:
report_func(f'Genome build = "{genome_build}", not detected as 37 or 38, variant positions validation ignored.')
return failed_qc
report_func(f'Build version = {build_version}')

new_score_file = f'{self.new_score_file_path}/{score.id}.txt.gz'

# Reading the scoring file
df = pd.read_csv(new_score_file,
sep="\t",
comment='#',
usecols=lambda c: c in usecols,
dtype={"rsID": 'string', "chr_name": 'string', "chr_position": 'Int64',
"effect_allele": 'string', "other_allele": 'string'})

if 'chr_position' not in df.columns:
report_func('No chr_position column. Variant position QC will be skipped')
return failed_qc

# Headers for alleles. other_allele is optional, but should be tested if it exists, since we don't know which allele is the reference one.
alleles_headers = ['effect_allele']
if 'other_allele' in df.columns:
alleles_headers.append('other_allele')

n_requests = self.variant_positions_qc_config['n_requests']

if 'rsID' in df.columns:
max_request_size = self.variant_positions_qc_config['ensembl_max_variation_req_size']
map_variants_func = map_rsids
else:
if len(alleles_headers) == 1:
# 1 allele column is not enough for sequence-based validation, as we don't know if the allele is ref or alt.
report_func('Only 1 allele column with no rsID. Variant position QC will be skipped')
return failed_qc
max_request_size = self.variant_positions_qc_config['ensembl_max_sequence_req_size']
map_variants_func = map_variants_to_reference_genome

errors = []
variant_slices = sample_df(df, n_requests, max_request_size, alleles_headers, False, errors)

match = 0
mismatch = 0
for variants in variant_slices:
results = map_variants_func(variants, build_version)
match += results['match']
mismatch += results['mismatch']
errors = errors + results['errors']

final_report = '''Results:
- Matches: {0}
- Mismatches: {1}'''
report_func(final_report.format(str(match), str(mismatch)))

minimum_match_rate = self.variant_positions_qc_config['minimum_match_rate']

# Fail if low match rate
if match+mismatch == 0:
report_func('WARNING: No match data!')
else:
match_rate: float = float(match) / (match + mismatch)
report_func(f'Match rate: {match_rate}')
if match_rate < minimum_match_rate:
error_func('Low match rate! The assigned genome build might be incorrect')
failed_qc = True

return failed_qc
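
The helpers imported at the top of this file (sample_df, map_rsids, map_variants_to_reference_genome, usecols) come from curation/scripts/qc_ref_genome.py, which is not part of this diff. From the call sites above, their interface is roughly the sketch below; the signatures, the name of the boolean flag, and the exact usecols contents are inferred rather than confirmed:

# Inferred interface sketch for curation/scripts/qc_ref_genome.py (not shown in this PR).

# Scoring-file columns kept by qc_variant_positions() when reading the file
# (inferred from the dtype mapping above).
usecols = ['rsID', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']

def sample_df(df, n_requests, max_request_size, alleles_headers, use_all_variants, errors):
    """Sample df into at most n_requests slices of at most max_request_size
    variants each, appending any problems to errors; the boolean flag's real
    name is unknown. Returns the list of DataFrame slices."""
    raise NotImplementedError

def map_rsids(variants, build_version):
    """Check rsID positions against the Ensembl variation REST API for the given
    build ('37' or '38'). Returns {'match': int, 'mismatch': int, 'errors': list},
    the shape consumed by qc_variant_positions() above."""
    raise NotImplementedError

def map_variants_to_reference_genome(variants, build_version):
    """Check chromosome/position/allele combinations against the Ensembl
    sequence REST API. Same return shape as map_rsids()."""
    raise NotImplementedError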
6 changes: 4 additions & 2 deletions curation/imports/study.py
@@ -17,7 +17,7 @@ class StudyImport():
('sample', 'id', Sample)
]

- def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default):
+ def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default, reported_traits_cleaner: dict):
self.study_name = study_data['name']

if not studies_dir.endswith('/'):
@@ -49,6 +49,8 @@ def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_
self.failed_data_import = []
self.has_failed = False

self.reported_traits_cleaner = reported_traits_cleaner


def print_title(self):
''' Print the study name '''
@@ -144,7 +146,7 @@ def import_score_models(self):
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
- score = score_data.create_score_model(self.study_publication)
+ score = score_data.create_score_model(self.study_publication, self.reported_traits_cleaner)
self.import_warnings.append(f'New Score: {score.id} ({score_id})')
self.study_scores[score_id] = score

13 changes: 5 additions & 8 deletions curation/parsers/publication.py
@@ -73,16 +73,13 @@ def rest_api_call_to_epmc(self,query):
- query: the search query
Return type: JSON
'''
- payload = {'format': 'json'}
- payload['query'] = query
+ payload = {'format': 'json', 'query': query}
result = requests.get(constants.USEFUL_URLS['EPMC_REST_SEARCH'], params=payload)
result = result.json()
- try:
-     result = result['resultList']['result'][0]
- except:
-     result = {}
-     print("ERROR: Can't find the paper in EuropePMC!")
- return result
+ if 'result' in result['resultList']:
+     return result['resultList']['result'][0]
+ else:
+     raise Exception(f'Can\'t find the paper in EuropePMC! (query:{query})')


def create_publication_model(self):
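For context, a runnable sketch of the EuropePMC lookup this method wraps; the endpoint URL and the example query syntax are assumptions (constants.USEFUL_URLS['EPMC_REST_SEARCH'] is not shown in this diff):

import requests

# Assumed value of constants.USEFUL_URLS['EPMC_REST_SEARCH'].
EPMC_REST_SEARCH = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'

payload = {'format': 'json', 'query': 'ext_id:29212778 AND src:med'}  # hypothetical query
result = requests.get(EPMC_REST_SEARCH, params=payload).json()

# Mirrors the new behaviour: no silent empty dict, a missing paper now raises.
if 'result' in result['resultList']:
    print(result['resultList']['result'][0]['title'])
else:
    raise Exception(f"Can't find the paper in EuropePMC! (query:{payload['query']})")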
14 changes: 9 additions & 5 deletions curation/parsers/score.py
@@ -13,14 +13,13 @@ class ScoreData(GenericData):
'PRScs': 'PRS-CS'
}

- def __init__(self,score_name,spreadsheet_name):
-     GenericData.__init__(self,spreadsheet_name)
+ def __init__(self, score_name, spreadsheet_name):
+     GenericData.__init__(self, spreadsheet_name)
self.name = score_name
self.data = {'name': score_name}


@transaction.atomic
- def create_score_model(self,publication):
+ def create_score_model(self, publication, reported_traits_cleaner: dict):
'''
Create an instance of the Score model.
It also create instance(s) of the EFOTrait model if needed.
@@ -35,14 +34,19 @@ def create_score_model(self,publication):
if field == 'trait_efo':
efo_traits = []
for trait_id in val:
- trait_id = trait_id.replace(':','_').strip()
+ trait_id = trait_id.replace(':', '_').strip()
trait = TraitData(trait_id, self.spreadsheet_name)
efo = trait.efotrait_model()
efo_traits.append(efo)
else:
if field == 'method_name':
if val in self.method_name_replacement.keys():
val = self.method_name_replacement[val]
elif field == 'trait_reported':
if val in reported_traits_cleaner:
new_val = reported_traits_cleaner[val]
print("Replaced reported trait \"{}\" with \"{}\"".format(val, new_val))
val = new_val
setattr(self.model, field, val)
# Associate a Publication
self.model.publication = publication
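End to end, the new trait cleaning is a dictionary lookup at model-creation time; a minimal illustration using the hypothetical mapping from the config.py sketch above:

# Minimal illustration of the trait_reported replacement (values hypothetical).
reported_traits_cleaner = {'Type II diabetes': 'Type 2 diabetes'}

val = 'Type II diabetes'  # reported trait as read from the curation spreadsheet
if val in reported_traits_cleaner:
    new_val = reported_traits_cleaner[val]
    print("Replaced reported trait \"{}\" with \"{}\"".format(val, new_val))
    val = new_val  # the Score model now stores the corrected trait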