Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to output ClinVar XML #365

Merged
merged 6 commits into from
Feb 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ jobs:
echo "/tmp/nextflow" >> $GITHUB_PATH
cd -

- name: Unit tests for clinvar_xml_io
run: python -m pytest --cov=eva_cttv_pipeline eva_cttv_pipeline/clinvar_xml_io/tests

- name: Unit tests for all pipelines
run: python -m pytest --cov=eva_cttv_pipeline --cov=consequence_prediction tests -k "not integration"
run: python -m pytest --cov=eva_cttv_pipeline --cov=consequence_prediction --cov-append tests -k "not integration"

- name: Integration tests for all pipelines
run: python -m pytest --cov=eva_cttv_pipeline --cov=consequence_prediction --cov-append tests -k integration
Expand Down
5 changes: 4 additions & 1 deletion eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
from .clinvar_xml_io import *
from .clinvar_dataset import ClinVarDataset
from .clinvar_measure import ClinVarRecordMeasure
from .clinvar_record import ClinVarRecord
from .clinvar_trait import ClinVarTrait
37 changes: 37 additions & 0 deletions eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import gzip
import logging

from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.clinvar_record import ClinVarRecord
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ClinVarDataset:
"""Iterate through records in ClinVar XML dump and convert them into internal ClinVarRecord representation."""
def __init__(self, clinvar_xml):
self.clinvar_xml = clinvar_xml

def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarRecord(rcv)

def write(self, output_xml):
"""Writes the entire ClinVarDataset to a gzipped file at output_xml."""
logger.info(f'Writing ClinVarDataset to: {output_xml}')
# TODO add date, maybe metadata about being processed by this pipeline?
header = b'''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<ReleaseSet Dated="." xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.60.xsd">
'''
count = 0
with gzip.open(output_xml, 'wb') as output_file:
output_file.write(header)
for record in self:
output_file.write(b'<ClinVarSet>\n ')
record.write(output_file)
output_file.write(b'</ClinVarSet>\n')
count += 1

output_file.write(b'\n</ReleaseSet>')
logger.info(f'Records written: {count}')
222 changes: 222 additions & 0 deletions eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import logging
from functools import cached_property

from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ClinVarRecordMeasure:
"""This class represents individual ClinVar record "measures". Measures are essentially isolated variants, which can
be combined into either MeasureSets (include one or more Measures) or GenotypeSets. For a detailed description of
ClinVar data model, see data-exploration/clinvar-variant-types/."""

# For ClinVar Microsatellite events with complete coordinates, require the event to be at least this number of bases
# long in order for it to be considered a repeat expansion event. Smaller events will be processed as regular
# insertions. The current value was chosen as a reasonable threshold to separate thousands of very small insertions
# which are technically microsatellite expansion events but are not long enough to be considered clinically
# significant repeat expansion variants.
REPEAT_EXPANSION_THRESHOLD = 12

# Microsatellite variant types.
MS_DELETION = 'deletion'
MS_SHORT_EXPANSION = 'short_expansion'
MS_REPEAT_EXPANSION = 'repeat_expansion'
MS_NO_COMPLETE_COORDS = 'no_complete_coords'

def __init__(self, measure_xml, clinvar_record):
self.measure_xml = measure_xml
self.clinvar_record = clinvar_record

@property
def all_names(self):
"""Returns a lexicographically sorted list of all measure names, including the preferred one (if any)."""
return sorted(name.text for name in find_elements(self.measure_xml, './Name/ElementValue'))

@property
def preferred_name(self):
"""Returns a single preferred measure name, as indicated in the ClinVar record."""
name = find_optional_unique_element(self.measure_xml, './Name/ElementValue[@Type="Preferred"]')
return None if name is None else name.text

@property
def preferred_or_other_name(self):
"""Returns a consistent name for a measure, if one is present."""
if self.preferred_name:
return self.preferred_name
elif self.all_names:
return self.all_names[0]
else:
return None

@property
def preferred_gene_symbols(self):
return [elem.text for elem in find_elements(
self.measure_xml, './MeasureRelationship/Symbol/ElementValue[@Type="Preferred"]')]

@property
def hgnc_ids(self):
return [elem.attrib['ID'] for elem in find_elements(self.measure_xml, './MeasureRelationship/XRef[@DB="HGNC"]')]

@property
def rs_id(self):
"""Returns dbSNP rsID found for the record measure."""
rs_ids = ['rs' + elem.attrib['ID'] for elem in find_elements(self.measure_xml, './XRef[@DB="dbSNP"]')]
if len(rs_ids) == 0:
return None
elif len(rs_ids) == 1:
return rs_ids[0]
else:
logger.warning(f'Found multiple RS IDs for {self.clinvar_record}, this is not yet supported')
return None

@property
def nsv_id(self):
nsv_ids = [elem.attrib['ID']
for elem in find_elements(self.measure_xml, './XRef[@DB="dbVar"]')
if elem.attrib['ID'].startswith('nsv')]
if len(nsv_ids) == 0:
return None
elif len(nsv_ids) == 1:
return nsv_ids[0]
else:
logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported')
return None

@cached_property
def _hgvs_to_types(self):
"""
Returns a dict of HgvsVariant to type attributes from the XML, for all HGVS identifiers in the measure.
For example, if there's an element
<Attribute Type="HGVS, genomic, RefSeqGene">NG_008029.2:g.9185del</Attribute>
we will include
'NG_008029.2:g.9185del': ['hgvs', 'genomic', 'refseqgene']
in the returned dict.
"""
return {
HgvsVariant(elem.text): {t.lower().strip() for t in elem.attrib['Type'].split(',')}
for elem in find_elements(self.measure_xml, './AttributeSet/Attribute')
if elem.attrib['Type'].startswith('HGVS') and elem.text is not None
}

@cached_property
def all_hgvs(self):
return [hgvs for hgvs in self._hgvs_to_types]

@cached_property
def current_hgvs(self):
return {hgvs for hgvs, types in self._hgvs_to_types.items() if 'previous' not in types}

@cached_property
def genomic_hgvs(self):
return {hgvs for hgvs, types in self._hgvs_to_types.items() if 'genomic' in types}

@cached_property
def toplevel_refseq_hgvs(self):
for hgvs, types in self._hgvs_to_types.items():
if types == {'hgvs', 'genomic', 'top level'}:
return hgvs
return None

@cached_property
def preferred_current_hgvs(self):
"""
Returns a consistent current HGVS identifier for a measure, if one is present.
Currently the order of preferences is as follows:
- top level RefSeq HGVS
- lexicographically first genomic HGVS
- lexicographically first other HGVS
"""
if self.toplevel_refseq_hgvs:
return self.toplevel_refseq_hgvs
elif self.current_hgvs:
current_genomic = sorted(self.current_hgvs & self.genomic_hgvs)
if current_genomic:
for hgvs in current_genomic:
if hgvs.reference_sequence == self.sequence_location_helper('Accession'):
return hgvs
return current_genomic[0]
return sorted(self.current_hgvs)[0]
return None

@property
def variant_type(self):
return self.measure_xml.attrib['Type']

@property
def explicit_insertion_length(self):
if self.vcf_alt and self.vcf_ref:
return len(self.vcf_alt) - len(self.vcf_ref)
return None

@property
def microsatellite_category(self):
if self.variant_type == 'Microsatellite':
if self.has_complete_coordinates:
if self.explicit_insertion_length < 0:
return self.MS_DELETION
elif self.explicit_insertion_length < self.REPEAT_EXPANSION_THRESHOLD:
return self.MS_SHORT_EXPANSION
else:
return self.MS_REPEAT_EXPANSION
else:
return self.MS_NO_COMPLETE_COORDS
else:
return None

@property
def is_repeat_expansion_variant(self):
return self.microsatellite_category in (self.MS_REPEAT_EXPANSION, self.MS_NO_COMPLETE_COORDS)

@property
def pubmed_refs(self):
"""Variant-specific PubMed references, contained inside a Measure entity. These are usually large reviews which
focus on genetics of specific types of variants or genomic regions."""
return [int(elem.text) for elem in find_elements(self.measure_xml, './Citation/ID[@Source="PubMed"]')]

@property
def chr(self):
return self.sequence_location_helper('Chr')

@property
def vcf_pos(self):
return self.sequence_location_helper('positionVCF')

@property
def vcf_ref(self):
return self.sequence_location_helper('referenceAlleleVCF')

@property
def vcf_alt(self):
return self.sequence_location_helper('alternateAlleleVCF')

@property
def has_complete_coordinates(self):
return bool(self.chr and self.vcf_pos and self.vcf_ref and self.vcf_alt)

@property
def vcf_full_coords(self):
"""Returns complete variant coordinates in CHROM_POS_REF_ALT format, if present, otherwise None."""
if self.has_complete_coordinates:
return '_'.join([self.chr, self.vcf_pos, self.vcf_ref, self.vcf_alt])

def sequence_location_helper(self, attr):
if self.variant_type == 'Translocation':
# TODO: Translocations have multiple locations and are not supported.
# TODO: https://github.com/EBIvariation/eva-opentargets/issues/171
return None
sequence_locations = find_elements(self.measure_xml, './SequenceLocation[@Assembly="GRCh38"]')
if len(sequence_locations) != 1:
# TODO: Support variants with multiple locations (for example, chrX/chrY).
# TODO: https://github.com/EBIvariation/eva-opentargets/issues/172
return None
return sequence_locations[0].attrib.get(attr)

def get_variant_name_or_hgvs(self):
if self.preferred_or_other_name:
return self.preferred_or_other_name
if self.toplevel_refseq_hgvs:
return self.toplevel_refseq_hgvs.text
return None
Loading