From 3de425d2321ec188cdfda89b7eb1d8b01cd6f633 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Wed, 3 Jan 2024 16:21:03 -0500 Subject: [PATCH] share method --- src/gene/etl/base.py | 32 +++++++++++++++++++++++++++++++- src/gene/etl/ensembl.py | 31 ------------------------------- src/gene/etl/ncbi.py | 30 ------------------------------ 3 files changed, 31 insertions(+), 62 deletions(-) diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 0d8a364e..1dde1585 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -7,12 +7,13 @@ from typing import Dict, List, Optional, Union import click +import pandas as pd import pydantic from biocommons.seqrepo import SeqRepo from wags_tails import EnsemblData, HgncData, NcbiGeneData from gene.database import AbstractDatabase -from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName +from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName, StoredSequenceLocation _logger = logging.getLogger(__name__) @@ -206,6 +207,35 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: # return chr_location # return None + def _build_sequence_location( + self, seq_id: str, row: pd.Series, concept_id: str + ) -> Optional[StoredSequenceLocation]: + """Construct a sequence location for storing in a DB. + + :param seq_id: The sequence ID. + :param row: A gene from the source file. + :param concept_id: record ID from source + :return: A storable SequenceLocation containing relevant params for returning a + VRS SequenceLocation, or None if unable to retrieve valid parameters + """ + aliases = self._get_seq_id_aliases(seq_id) + if not aliases or row.start is None or row.end is None: + return None + + sequence = aliases[0] + + if row.start != "." and row.end != "." and sequence: + if 0 <= row.start <= row.end: + return StoredSequenceLocation( + start=row.start - 1, + end=row.end, + sequence_id=sequence, + ) + else: + _logger.warning( + f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}" + ) + def _get_seq_id_aliases(self, seq_id: str) -> List[str]: """Get GA4GH aliases for a sequence id diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index 6448d008..201a2145 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -14,7 +14,6 @@ DataLicenseAttributes, NamespacePrefix, SourceMeta, - StoredSequenceLocation, Strand, ) @@ -89,36 +88,6 @@ def _add_gene(self, row: pd.Series, accession_numbers: Dict) -> Dict: return gene_params - # TODO reincorporate with NCBI - def _build_sequence_location( - self, seq_id: str, row: pd.Series, concept_id: str - ) -> Optional[StoredSequenceLocation]: - """Construct a sequence location for storing in a DB. - - :param seq_id: The sequence ID. - :param row: A gene from the source file. - :param concept_id: record ID from source - :return: A storable SequenceLocation containing relevant params for returning a - VRS SequenceLocation, or None if unable to retrieve valid parameters - """ - aliases = self._get_seq_id_aliases(seq_id) - if not aliases or row.start is None or row.end is None: - return None - - sequence = aliases[0] - - if row.start != "." and row.end != "." and sequence: - if 0 <= row.start <= row.end: - return StoredSequenceLocation( - start=row.start - 1, - end=row.end, - sequence_id=sequence, - ) - else: - _logger.warning( - f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}" - ) - def _add_attributes(self, row: pd.Series, gene: Dict) -> None: """Add concept_id, symbol, and xrefs to a gene record. diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index d98681c8..8df2134c 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -23,7 +23,6 @@ NamespacePrefix, SourceMeta, SourceName, - StoredSequenceLocation, SymbolStatus, ) @@ -188,35 +187,6 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, Dict]: info_genes[params["symbol"]] = params return info_genes - def _build_sequence_location( - self, seq_id: str, row: pd.Series, concept_id: str - ) -> Optional[StoredSequenceLocation]: - """Construct a sequence location for storing in a DB. - - :param seq_id: The sequence ID. - :param row: A gene from the source file. - :param concept_id: record ID from source - :return: A storable SequenceLocation containing relevant params for returning a - VRS SequenceLocation, or None if unable to retrieve valid parameters - """ - aliases = self._get_seq_id_aliases(seq_id) - if not aliases or row.start is None or row.end is None: - return None - - sequence = aliases[0] - - if row.start != "." and row.end != "." and sequence: - if 0 <= row.start <= row.end: - return StoredSequenceLocation( - start=row.start - 1, - end=row.end, - sequence_id=sequence, - ) - else: - _logger.warning( - f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}" - ) - def _get_gene_gff(self, df: pd.DataFrame, info_genes: Dict) -> None: """Store genes from NCBI gff file.