diff --git a/.gitignore b/.gitignore index f625697b..daba40ae 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ /static/ app.yaml pgs-catalog-cred.json +.idea +.vscode \ No newline at end of file diff --git a/curation/config.py b/curation/config.py index a5bcac79..fbe548fe 100644 --- a/curation/config.py +++ b/curation/config.py @@ -1,13 +1,12 @@ curation_directories = { 'template_schema': './curation/templates/TemplateColumns2Models.xlsx', 'scoring_schema': './curation/templates/ScoringFileSchema.xlsx', - 'studies_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/', - 'scoring_dir': '/Users/lg10/Workspace/datafiles/SourceFiles/ScoringFiles/' + 'studies_dir': '', + 'scoring_dir': '/ScoringFiles/' } study_names_list = [ - { 'name': 'Chen2018' }, - { 'name': 'Kolin2021' } + {'name': ''}, ] default_curation_status = 'IP' @@ -17,3 +16,15 @@ skip_scoringfiles = False skip_curationtracker = False + +variant_positions_qc_config = { + 'skip': False, # Set to True to ignore the variant positions QC step + 'n_requests': 4, # Maximum number of requests allowed per score to the Ensembl REST API + 'ensembl_max_variation_req_size': 10, # Maximum number of variants per request to the Ensembl variation REST API + 'ensembl_max_sequence_req_size': 50, # Maximum number of variants per request to the Ensembl sequence REST API + 'minimum_match_rate': 0.9 # The minimum required match rate for the QC to be satisfactory +} + +# TSV file containing the reported traits to be replaced for homogeneity. +# Required columns: "trait_reported", "corrected". +reported_traits_replacement_file = '/reported_traits_dict.tsv' diff --git a/curation/import_studies.py b/curation/import_studies.py index 8fcef765..004a0540 100644 --- a/curation/import_studies.py +++ b/curation/import_studies.py @@ -2,5 +2,9 @@ from curation.config import * # Main script -curation_import = CurationImport(curation_directories, study_names_list, default_curation_status, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker) +curation_import = CurationImport( + data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status, + scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scoringfiles, + skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config, + reported_traits_dict_file=reported_traits_replacement_file) curation_import.run_curation_import() diff --git a/curation/imports/curation.py b/curation/imports/curation.py index eed290ed..7b6d37b3 100644 --- a/curation/imports/curation.py +++ b/curation/imports/curation.py @@ -1,6 +1,7 @@ import pandas as pd +import csv from curation.imports.study import StudyImport -from curation.imports.scoring_file import ScoringFileUpdate +from curation.imports.scoring_file import ScoringFileUpdate, VariantPositionsQC from curation_tracker.models import CurationPublicationAnnotation @@ -17,7 +18,7 @@ class CurationImport(): failed_studies = {} - def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker): + def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker, variant_positions_qc_config, reported_traits_dict_file:str): self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0) self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0) @@ -30,6 +31,16 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf self.skip_curationtracker = skip_curationtracker self.curation_status_by_default = curation_status_by_default + self.variant_positions_qc_config = variant_positions_qc_config + + # Reading the reported-traits dictionary file + try: + with open(reported_traits_dict_file, mode='r') as infile: + reader = csv.DictReader(infile, delimiter='\t') + self.reported_traits_cleaner = {row['trait_reported']: row['corrected'] for row in reader} + except FileNotFoundError: + print('ERROR: Could not find \'reported_traits_dict_file\'') + self.reported_traits_cleaner = {} self.step = 1 self.steps_total = 2 @@ -37,6 +48,8 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf self.steps_total = self.steps_total + 1 if self.skip_curationtracker == False: self.steps_total = self.steps_total + 1 + if self.variant_positions_qc_config['skip'] == False: + self.steps_total = self.steps_total + 1 def global_report(self): @@ -67,7 +80,7 @@ def run_curation_import(self): ## Parsing ## self.step = 1 - study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default) + study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default, self.reported_traits_cleaner) study_import.print_title() print(f'==> Step {self.step}/{self.steps_total}: Parsing study data') study_import.parse_curation_data() @@ -98,6 +111,27 @@ def run_curation_import(self): if study_import.study_name in self.failed_studies.keys(): continue + ## Variant positions QC ## + if not self.variant_positions_qc_config['skip']: + self.step += 1 + print('\n----------------------------------\n') + print(f'==> Step {self.step}/{self.steps_total}: QC of the variant positions of the Scoring file(s)') + if study_import.study_scores: + for score_id, score in study_import.study_scores.items(): + print(f'SCORE: {score.id}') + variant_pos_qc = VariantPositionsQC(score, self.new_scoring_path, self.variant_positions_qc_config) + is_failed = variant_pos_qc.qc_variant_positions( + report_func=lambda msg: print(f' > {msg}'), + error_func=lambda msg: print(f'ERROR: {msg}!') + ) + if is_failed: + self.failed_studies[study_import.study_name] = 'scoring file error' + continue + else: + print(" > No scores for this study, therefore no scoring files") + if study_import.study_name in self.failed_studies.keys(): + continue + ## Update Curation Tracker ## if self.skip_curationtracker == False: self.step += 1 diff --git a/curation/imports/scoring_file.py b/curation/imports/scoring_file.py index ee362f53..725d5aab 100644 --- a/curation/imports/scoring_file.py +++ b/curation/imports/scoring_file.py @@ -2,6 +2,7 @@ import pandas as pd import numpy as np from catalog.models import Score +from curation.scripts.qc_ref_genome import sample_df, map_rsids, map_variants_to_reference_genome, usecols class ScoringFileUpdate(): @@ -22,7 +23,6 @@ def __init__(self, score, study_path, new_scoring_dir, score_file_schema, score_ if not os.path.isdir(self.score_file_path): self.score_file_path = f'{study_path}/raw scores' - def create_scoringfileheader(self): '''Function to extract score & publication information for the PGS Catalog Scoring File commented header''' score = self.score @@ -57,7 +57,6 @@ def create_scoringfileheader(self): print(f'Header creation issue: {e}') return lines - def update_scoring_file(self): ''' Method to fetch the file, read it, add the header and compress it. ''' failed_update = False @@ -69,9 +68,9 @@ def update_scoring_file(self): raw_scorefile_tsv = f'{raw_scorefile_path}{self.tsv_ext}' raw_scorefile_xls = f'{raw_scorefile_path}{self.xls_ext}' if os.path.exists(raw_scorefile_txt): - df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine = 'python') + df_scoring = pd.read_table(raw_scorefile_txt, dtype='str', engine='python') elif os.path.exists(raw_scorefile_tsv): - df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine = 'python') + df_scoring = pd.read_table(raw_scorefile_tsv, dtype='str', engine='python') elif os.path.exists(raw_scorefile_xls): df_scoring = pd.read_excel(raw_scorefile_xls, dtype='str') else: @@ -140,7 +139,8 @@ def update_scoring_file(self): new_df_csv.append(row) df_csv = '\n'.join(new_df_csv) - with gzip.open(f'{self.new_score_file_path}/{score_id}.txt.gz', 'w') as outf: + new_score_file = f'{self.new_score_file_path}/{score_id}.txt.gz' + with gzip.open(new_score_file, 'w') as outf: outf.write('\n'.join(header).encode('utf-8')) outf.write('\n'.encode('utf-8')) outf.write(df_csv.encode('utf-8')) @@ -155,3 +155,94 @@ def update_scoring_file(self): failed_update = True print(f'ERROR reading scorefile: {raw_scorefile_path}\n-> {e}') return failed_update + + +class VariantPositionsQC: + + def __init__(self, score, new_scoring_dir, variant_positions_qc_config): + self.score = score + self.new_score_file_path = new_scoring_dir + self.variant_positions_qc_config = variant_positions_qc_config + + def qc_variant_positions(self, report_func=print, error_func=print) -> bool: + """ QC the variant positions regarding the assigned genome build of the scoring file.""" + failed_qc = False + score = self.score + genome_build = score.variants_genomebuild + if not genome_build: + failed_qc = True + error_func(f'Missing genome build') + build_version = None + if genome_build in ('GRCh37', 'hg19'): + build_version = '37' + elif genome_build in ('GRCh38', 'hg38'): + build_version = '38' + elif genome_build == 'NR': + report_func('Genome build = NR, variant positions validation ignored.') + return failed_qc + else: + report_func(f'Genome build = "{genome_build}", not detected as 37 or 38, variant positions validation ignored.') + return failed_qc + report_func(f'Build version = {build_version}') + + new_score_file = f'{self.new_score_file_path}/{score.id}.txt.gz' + + # Reading the scoring file + df = pd.read_csv(new_score_file, + sep="\t", + comment='#', + usecols=lambda c: c in usecols, + dtype={"rsID": 'string', "chr_name": 'string', "chr_position": 'Int64', + "effect_allele": 'string', "other_allele": 'string'}) + + if 'chr_position' not in df.columns: + report_func('No chr_position column. Variant position QC will be skipped') + return failed_qc + + # Headers for alleles. other_allele is optional, but should be tested if exists as we don't know which allele is the reference one. + alleles_headers = ['effect_allele'] + if 'other_allele' in df.columns: + alleles_headers.append('other_allele') + + n_requests = self.variant_positions_qc_config['n_requests'] + + if 'rsID' in df.columns: + max_request_size = self.variant_positions_qc_config['ensembl_max_variation_req_size'] + map_variants_func = map_rsids + else: + if len(alleles_headers) == 1: + # 1 allele column is not enough for sequence-based validation, as we don't know if the allele is ref or alt. + report_func('Only 1 allele column with no rsID. Variant position QC will be skipped') + return failed_qc + max_request_size = self.variant_positions_qc_config['ensembl_max_sequence_req_size'] + map_variants_func = map_variants_to_reference_genome + + errors = [] + variant_slices = sample_df(df, n_requests, max_request_size, alleles_headers, False, errors) + + match = 0 + mismatch = 0 + for variants in variant_slices: + results = map_variants_func(variants, build_version) + match += results['match'] + mismatch += results['mismatch'] + errors = errors + results['errors'] + + final_report = '''Results: + - Matches: {0} + - Mismatches: {1}''' + report_func(final_report.format(str(match), str(mismatch))) + + minimum_match_rate = self.variant_positions_qc_config['minimum_match_rate'] + + # Fail if low match rate + if match+mismatch == 0: + report_func('WARNING: No match data!') + else: + match_rate: float = float(match) / (match + mismatch) + report_func(f'Match rate: {match_rate}') + if match_rate < minimum_match_rate: + error_func('Low match rate! The assigned genome build might be incorrect') + failed_qc = True + + return failed_qc diff --git a/curation/imports/study.py b/curation/imports/study.py index 0b5dd452..999b117d 100644 --- a/curation/imports/study.py +++ b/curation/imports/study.py @@ -17,7 +17,7 @@ class StudyImport(): ('sample', 'id', Sample) ] - def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default): + def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default, reported_traits_cleaner:dict): self.study_name = study_data['name'] if not studies_dir.endswith('/'): @@ -49,6 +49,8 @@ def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_ self.failed_data_import = [] self.has_failed = False + self.reported_traits_cleaner = reported_traits_cleaner + def print_title(self): ''' Print the study name ''' @@ -144,7 +146,7 @@ def import_score_models(self): self.existing_scores.append(score.id) # Create Score model except Score.DoesNotExist: - score = score_data.create_score_model(self.study_publication) + score = score_data.create_score_model(self.study_publication, self.reported_traits_cleaner) self.import_warnings.append(f'New Score: {score.id} ({score_id})') self.study_scores[score_id] = score diff --git a/curation/parsers/publication.py b/curation/parsers/publication.py index 49955945..adc67c23 100644 --- a/curation/parsers/publication.py +++ b/curation/parsers/publication.py @@ -73,16 +73,13 @@ def rest_api_call_to_epmc(self,query): - query: the search query Return type: JSON ''' - payload = {'format': 'json'} - payload['query'] = query + payload = {'format': 'json', 'query': query} result = requests.get(constants.USEFUL_URLS['EPMC_REST_SEARCH'], params=payload) result = result.json() - try: - result = result['resultList']['result'][0] - except: - result = {} - print("ERROR: Can't find the paper in EuropePMC!") - return result + if 'result' in result['resultList']: + return result['resultList']['result'][0] + else: + raise Exception(f'Can\'t find the paper in EuropePMC! (query:{query})') def create_publication_model(self): diff --git a/curation/parsers/score.py b/curation/parsers/score.py index 16a64a11..3f8879f6 100644 --- a/curation/parsers/score.py +++ b/curation/parsers/score.py @@ -13,14 +13,13 @@ class ScoreData(GenericData): 'PRScs': 'PRS-CS' } - def __init__(self,score_name,spreadsheet_name): - GenericData.__init__(self,spreadsheet_name) + def __init__(self, score_name, spreadsheet_name): + GenericData.__init__(self, spreadsheet_name) self.name = score_name self.data = {'name': score_name} - @transaction.atomic - def create_score_model(self,publication): + def create_score_model(self, publication, reported_traits_cleaner: dict): ''' Create an instance of the Score model. It also create instance(s) of the EFOTrait model if needed. @@ -35,7 +34,7 @@ def create_score_model(self,publication): if field == 'trait_efo': efo_traits = [] for trait_id in val: - trait_id = trait_id.replace(':','_').strip() + trait_id = trait_id.replace(':', '_').strip() trait = TraitData(trait_id, self.spreadsheet_name) efo = trait.efotrait_model() efo_traits.append(efo) @@ -43,6 +42,11 @@ def create_score_model(self,publication): if field == 'method_name': if val in self.method_name_replacement.keys(): val = self.method_name_replacement[val] + elif field == 'trait_reported': + if val in reported_traits_cleaner: + new_val = reported_traits_cleaner[val] + print("Replaced reported trait \"{}\" with \"{}\"".format(val, new_val)) + val = new_val setattr(self.model, field, val) # Associate a Publication self.model.publication = publication diff --git a/curation/readme.md b/curation/readme.md index f2133d32..edffc208 100644 --- a/curation/readme.md +++ b/curation/readme.md @@ -19,6 +19,8 @@ It is located under curation (`curation/config.py`). There you can configure dif * the study input directory: `curation_directories['studies_dir']` * the scoring files output directory: `curation_directories['scoring_dir']` * the list of study names to import: `study_names_list` +* the settings of the variant positions quality control: `variant_positions_qc_config` +* the location of the reported traits cleaning dictionary: `reported_traits_replacement_file` You might need to change the value of `skip_scorefiles` to **True** if you only want to import the metadata (to test the study data for instance) @@ -51,6 +53,18 @@ default_curation_status = 'IP' scoringfiles_format_version = '2.0' skip_scorefiles = False + +skip_curationtracker = False + +variant_positions_qc_config = { + 'skip': False, + 'n_requests': 4, + 'ensembl_max_variation_req_size': 10, + 'ensembl_max_sequence_req_size': 50, + 'minimum_match_rate': 0.9 +} + +reported_traits_replacement_file = '/Users/myhome/PGS/reported_traits_dict.tsv' ``` #### Additional attributes for the study_names_list @@ -81,6 +95,21 @@ study_names_list = [ The value **E** corresponds to the curation status "Embargoed". +#### Variant positions quality control settings +This quality control step can be skipped by setting the attribute **skip** to **True** + +###### **'n_requests'**: number of requests sent +The QC uses the Ensembl REST API to validate the variant positions. The requests have a limited size (see other parameters below), therefore multiple requests might be required to get an accurate result. Each request takes a random sample of variants with the specified size from the scoring file (without replacement). A higher number will increase the accuracy of the results but slow down the import. + +###### **'ensembl_max_variation_req_size'**: the maximum number of variants sent the Ensembl Variation API +If the scoring file contains rsIDs, the Ensembl Variation API is used to validate the variant coordinates. In theory, variation requests maximum size is 200, however the API is likely to malfunction with that many variants. A smaller number like **10** is recommended. + +###### **'ensembl_max_sequence_req_size'**: the maximum number of variants sent the Ensembl Sequence API +If the scoring file doesn't contain the rsIDs but only the chromosome and position of the variants, the Ensembl Sequence API is used to validate the variant positions. The test compares each variant effect allele and other allele with the nucleotide found in the reference genome at the variant position. Both alleles are used as there is no certainty about which of the alleles is the reference allele. This means that a scoring file with a wrongly assigned reference genome can get 50% success match rate. It is assumed that the variant alleles use the forward strand as reference. + +###### **'minimum_match_rate'**: the threshold used to determine of the assigned reference genome of the scoring file is correct +A scoring file with a correctly assigned reference genome should get a match rate close to 100%. A scoring file with a wrongly assigned genome build can get 50% match rate. A threshold of 0.9 is recommended. + ### 3 - Update the Django settings In order to run the script, you might need to update the global settings of the Django project by adding the **Curation** app (`curation.apps.CurationConfig`) to the list of **INSTALLED_APPS**, in `pgs_web/settings.py`, e.g.: ``` diff --git a/curation/scripts/qc_ref_genome.py b/curation/scripts/qc_ref_genome.py new file mode 100644 index 00000000..628e4b19 --- /dev/null +++ b/curation/scripts/qc_ref_genome.py @@ -0,0 +1,262 @@ +import sys +import pandas as pd +import subprocess +import requests +from requests.adapters import HTTPAdapter, Retry +import json +import argparse +import re +from typing import TypedDict + +# Ensembl REST server config +servers = { + '37': 'https://grch37.rest.ensembl.org', + '38': 'https://rest.ensembl.org/' +} +ext_seq = "/sequence/region/human" +ext_var = "/variation/human" +headers = {"Content-Type": "application/json", "Accept": "application/json"} +MAX_SEQUENCE_REQUEST_SIZE = 50 +MAX_VARIATION_REQUEST_SIZE = 10 # maximum is 200, although it causes some requests to return no result + +usecols = ("rsID", "chr_name", "chr_position", "effect_allele", "other_allele") + + +class Variant(TypedDict): + rsID: str + chr: str + pos: int + alleles: list[str] + + +class MappingResults(TypedDict): + match: int + mismatch: int + errors: list[str] + + +def report(msg: str, submsgs: list[str] = []): + print('# ' + msg) + for submsg in submsgs: + print('# - ' + submsg) + + +def warn(msg: str): + sys.stderr.write(msg + "\n") + + +compl_alleles = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'} + + +def flip_alleles(alleles: list[str]): + """Flip the given alleles for checking for the reverse strand""" + return [compl_alleles[a.upper()] for a in alleles] + + +def post_request_with_retry(url, data, retry: int = 0): + """Send a POST request to the given URL. The request will be retried up to specified number of times + if the returned response is empty yet successful (happens with Ensembl Variation API)""" + s = requests.Session() + retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504], + allowed_methods=frozenset(['GET', 'POST'])) + s.mount('http://', HTTPAdapter(max_retries=retries)) + s.mount('https://', HTTPAdapter(max_retries=retries)) + r = s.post(url, headers=headers, data=json.dumps(data)) + + if not r.ok: + r.raise_for_status() + quit() + + result = r.json() + if retry and not result: # Sometimes Ensembl returns an empty response (although status 200) + warn('Returned empty results. Retrying request (remaining attempts: %i)' % retry) + result = post_request_with_retry(url, data, retry - 1) + + return result + + +def get_variation_from_ensembl(rsids: list[str], ref_genome): + url = servers[ref_genome] + ext_var + data = {'ids': rsids} + return post_request_with_retry(url, data, 4) + + +def get_ref_alleles_from_ensembl(variants: list[Variant], ref_genome): + url = servers[ref_genome] + ext_seq + data = {"regions": ["{}:{}-{}".format(v['chr'], v['pos'], v['pos']) for v in variants]} + return post_request_with_retry(url, data) + + +def map_variants_to_reference_genome(variants: list[Variant], ref_genome) -> MappingResults: + """Maps the given variants to their reference genome using their coordinates and the Ensembl Sequence API.""" + if len(variants) == 0: + return {'match': 0, 'mismatch': 0, 'errors': []} + response = get_ref_alleles_from_ensembl(variants, ref_genome) + match = 0 + mismatch = 0 + errors = [] + variants_dict = {"{}:{}-{}".format(v['chr'], v['pos'], v['pos']): v['alleles'] for v in variants} + for resp in response: + try: + seq = resp['seq'] + query = resp['query'] + if seq.upper() in [allele.upper() for allele in variants_dict[query]]: + match += 1 + else: + mismatch += 1 + except Exception as e: + errors.append(str(e)) + + return {'match': match, 'mismatch': mismatch, 'errors': errors} + + +def map_rsids(variants: list[Variant], ref_genome) -> MappingResults: + """Maps the given variants to their reference genome using their rsIDs and the Ensembl Variation API.""" + if len(variants) == 0: + return {'match': 0, 'mismatch': 0, 'errors': []} + response = get_variation_from_ensembl([v['rsID'] for v in variants], ref_genome) + match = 0 + mismatch = 0 + errors = [] + for variant in variants: + try: + mapping = response[variant['rsID']]['mappings'][0] + position = mapping['start'] + if position == variant['pos']: + match += 1 + else: + mismatch += 1 + except Exception as e: + errors.append(str(e)) + return {'match': match, 'mismatch': mismatch, 'errors': errors} + + +def get_alleles(row, alleles_headers: list[str]): + """Gets the alleles of the given row and checks that they are valid.""" + alleles = [] + for colname in alleles_headers: + allele = row[colname] + if allele.upper() not in ('A', 'C', 'G', 'T'): + raise Exception("Unexpected allele '{}'.".format(allele)) + alleles.append(allele) + return alleles + + +def sample_df(scoring_file_df, n_requests, max_request_size, alleles_headers, flip, errors) -> list[list[Variant]]: + """Gets n random variant samples of the given size from the given scoring file pandas dataframe.""" + variants_slices = [] + df = scoring_file_df + for i in range(n_requests): + variants = [] + # Sampling up to variants from the scoring file + sample_size = min(max_request_size, len(df.index)) + if sample_size == 0: + break + sample = df.sample(n=sample_size) + df = df.drop(sample.index) # Removing the sampled variants from the whole batch + for index, row in sample.iterrows(): + try: + chr_name = row['chr_name'] + chr_position = row['chr_position'] + if pd.isna(chr_position): + errors.append(f'Position[{index}] is NA') + continue + + alleles = get_alleles(row, alleles_headers) + if flip: + alleles = flip_alleles(alleles) + variant = {"chr": chr_name, "pos": chr_position, "alleles": alleles} + if 'rsID' in df.columns: + variant['rsID'] = row['rsID'] + variants.append(variant) + except Exception as e: + errors.append(str(e)) + + variants_slices.append(variants) + return variants_slices + + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("--scoring_file", help="An unzipped scoring file") + parser.add_argument("--ref", choices=["auto", "37", "38"], default="auto", + help="Reference genome. Allowed values: auto,37,38. If auto, the scoring file header genome_build attribute will be used.") + parser.add_argument("--n_requests", type=int, default=1, + help="Number of requests of size 50 (maximum allowed per Ensembl POST request). Default: 1") + parser.add_argument("--flip", default=False, action=argparse.BooleanOptionalAction) + + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + + args = parser.parse_args() + + scoring_file = args.scoring_file + n_requests = args.n_requests + + # Getting the reference genome from the scoring file if not defined + ref_genome = None + if args.ref == 'auto': + grep_output = subprocess.run(['grep', '-m 1', 'genome_build', scoring_file], stdout=subprocess.PIPE).stdout.decode('utf-8').split("\n")[0] + ref_genome = re.sub("[^0-9]", "", grep_output.split('=')[1]) + if ref_genome == "19": + ref_genome = "37" + else: + ref_genome = args.ref + + flip = args.flip + + # Reading the scoring file + df = pd.read_csv(scoring_file, + sep="\t", + comment='#', + usecols=lambda c: c in usecols, + dtype={"rsID": 'string', "chr_name": 'string', "chr_position": 'Int64', "effect_allele": 'string', + "other_allele": 'string'}) + + # Ending if the scoring file does not contain positions (rsIDs) + if 'chr_position' not in df.columns: + report('No position') + quit() + + # Headers for alleles. other_allele is optional, but should be tested if exists as we don't know which allele is the reference one. + alleles_headers = ['effect_allele'] + if 'other_allele' in df.columns: + alleles_headers.append('other_allele') + + if len(alleles_headers) == 1: + report('Warning: Only 1 allele column, expect low match rate.') + + errors = [] + match = 0 + mismatch = 0 + + # If rsID, just fetch the variant position and compare it + max_request_size = None + map_variants_func = None + if 'rsID' in df.columns: + report('Using rsIDs to validate variant positions') + max_request_size = MAX_VARIATION_REQUEST_SIZE + map_variants_func = map_rsids + else: + max_request_size = MAX_SEQUENCE_REQUEST_SIZE + map_variants_func = map_variants_to_reference_genome + + variant_slices = sample_df(df, n_requests, max_request_size, alleles_headers, flip, errors) + + for variants in variant_slices: + results = map_variants_func(variants, ref_genome) + match += results['match'] + mismatch += results['mismatch'] + errors = errors + results['errors'] + + report(scoring_file) + report("Ref genome: " + ref_genome) + report("Matches: " + str(match)) + report("Mismatches: " + str(mismatch)) + if errors: + report("Errors:", errors) + + +if __name__ == "__main__": + main() diff --git a/curation/scripts/qc_ref_genome_readme.md b/curation/scripts/qc_ref_genome_readme.md new file mode 100644 index 00000000..bffd06b5 --- /dev/null +++ b/curation/scripts/qc_ref_genome_readme.md @@ -0,0 +1,34 @@ +## About the reference genome QC script + +This scripts verifies if the variant positions of a scoring file match the given reference genome. + +### How it works + +A sample of variants is taken from the total in the scoring file and tested for match against the reference genome. The reference genome is found using the Ensembl REST API. Two different services can be used depending on the available information of the variants in the scoring file: variants with rsID or variants with coordinates only. + +#### Using rsIDs (matching position) + +If a _rsID_ column is present, the Ensembl Variation API is used. A variant is considered matching if the both the positions in the scoring file and in the Ensembl API response are equal. + +#### Using coordinates (matching nucleotide) + +If only the coordinates (_chr_name_ and _chr_position_) are present, the Ensembl Sequence API is used. For each variant, the scripts fetches the nucleotide present at the variant position in the reference genome. Since there is no certainty about which allele is the reference one, both effect and other (if present) alleles are tested, and the variant is considered matching if one of these alleles is equal to the one returned by the API. This means that if the genome build is **not** correct, a match rate of 50% is expected if both effect and other alleles are specified, or 25% if only the effect allele is specified. + +#### Sample size + +The sample size varies depending on the API service used. The maximum request size of the Sequence API is 50, whereas the maximum size is 200 for the Variation API. However, it is recommended to set a smaller request size for the Variation API as it sometimes returns an empty response if the request size is too close to the maximum allowed. Each of those sizes can be modified in the script by changing the value of **MAX_SEQUENCE_REQUEST_SIZE** and **MAX_VARIATION_REQUEST_SIZE**. + +#### Number of samples + +Multiple requests to the Ensembl API can be made to increase the size of the total sample of tested variants (see Usage section). A higher number will increase the precision of the validation but slow down the execution. The samples are made from all the scoring file variants without replacement. No more request is made if all the variants have been sampled. + +### Usage + +``` +qc_ref_genome.py [-h] [--scoring_file SCORING_FILE] [--ref {auto,37,38}] [--n_requests N_REQUESTS] [--flip | --no-flip] +``` + +* *scoring_file* : the scoring file to validate (unzipped) +* *ref*: allowed values: **auto**, **37**, **38**. If set to auto, the reference genome found in the header of the scoring file will be used. +* *n_requests*: maximum number of requests (or number of samples) sent to the Ensembl API. Default: **1** +* *flip*: default is off. If flip is on, all variants will be tested against the reverse strand if using coordinates without rsID. \ No newline at end of file diff --git a/curation/template_parser.py b/curation/template_parser.py index e3b2ae65..76b78c09 100644 --- a/curation/template_parser.py +++ b/curation/template_parser.py @@ -96,7 +96,7 @@ def extract_publication(self,curation_status=None): ''' Extract publication information, fetch extra information via EuropePMC REST API and store it into a PublicationData object. ''' spreadsheet_name = self.spreadsheet_names['Publication'] pinfo = self.table_publication.iloc[0] - c_doi = pinfo['doi'] + c_doi = pinfo['doi'] if not pd.isnull(pinfo['doi']) else None # Getting rid of potential NaN if type(c_doi) == str: c_doi = c_doi.strip() if c_doi.startswith('https'): @@ -153,7 +153,6 @@ def extract_publication(self,curation_status=None): previous_field = field if 'date_publication' not in parsed_publication.data: parsed_publication.add_data('date_publication',date.today()) - parsed_publication if new_publication == True: parsed_publication.add_curation_notes() diff --git a/curation/tests/test_files/reported_traits_dict.tsv b/curation/tests/test_files/reported_traits_dict.tsv new file mode 100644 index 00000000..e0fbabcd --- /dev/null +++ b/curation/tests/test_files/reported_traits_dict.tsv @@ -0,0 +1,2 @@ +trait_reported corrected +"Coffee consumption" "Caffeine consumption" diff --git a/curation/tests/test_import.py b/curation/tests/test_import.py index 6fb8b1fa..8ffbed78 100755 --- a/curation/tests/test_import.py +++ b/curation/tests/test_import.py @@ -26,6 +26,12 @@ skip_curationtracker = True +variant_positions_qc_config = { + 'skip': True +} + +reported_traits_replacement_file = './curation/tests/test_files/reported_traits_dict.tsv' + # Test values data_counts = { 'publication': len(study_names_list), @@ -40,7 +46,11 @@ class ImportTest(TestCase): def run_import(self): # Main script - curation_import = CurationImport(curation_directories, study_names_list, default_curation_status, scoringfiles_format_version, skip_scorefiles, skip_curationtracker) + curation_import = CurationImport( + data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status, + scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scorefiles, + skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config, + reported_traits_dict_file=reported_traits_replacement_file) curation_import.run_curation_import() @@ -111,3 +121,7 @@ def test_import(self): for performance in performances: metrics = Metric.objects.filter(performance=performance) self.assertGreater(metrics.count(), 0) + + ## Reported trait cleaning test ## + coffee_score = Score.objects.get(name='PGS_test1') + self.assertEqual(coffee_score.trait_reported, 'Caffeine consumption')