Skip to content

Commit

Permalink
share method
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jan 3, 2024
1 parent cf27bb9 commit 3de425d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 62 deletions.
32 changes: 31 additions & 1 deletion src/gene/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
from typing import Dict, List, Optional, Union

import click
import pandas as pd
import pydantic
from biocommons.seqrepo import SeqRepo
from wags_tails import EnsemblData, HgncData, NcbiGeneData

from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName
from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName, StoredSequenceLocation

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -206,6 +207,35 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None:
# return chr_location
# return None

def _build_sequence_location(
    self, seq_id: str, row: pd.Series, concept_id: str
) -> Optional[StoredSequenceLocation]:
    """Construct a sequence location for storing in a DB.

    The stored ``start`` is ``row.start - 1`` and ``end`` is stored unchanged
    (presumably converting 1-based source coordinates to interbase
    coordinates -- confirm against the source file format).

    :param seq_id: The sequence ID.
    :param row: A gene from the source file; its ``start`` and ``end``
        attributes are read.
    :param concept_id: record ID from source; only used in the warning message
    :return: A storable SequenceLocation containing relevant params for returning a
        VRS SequenceLocation, or None if unable to retrieve valid parameters
    """
    aliases = self._get_seq_id_aliases(seq_id)
    if not aliases or row.start is None or row.end is None:
        return None

    sequence = aliases[0]

    # A "." coordinate or an empty alias means there is nothing to store;
    # this case is skipped silently (no warning is logged).
    if row.start == "." or row.end == "." or not sequence:
        return None

    # NOTE(review): a start of 0 passes this check and is stored as -1 --
    # confirm whether the lower bound should be 1 instead.
    if 0 <= row.start <= row.end:
        return StoredSequenceLocation(
            start=row.start - 1,
            end=row.end,
            sequence_id=sequence,
        )

    _logger.warning(
        f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}"
    )
    return None

def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
"""Get GA4GH aliases for a sequence id
Expand Down
31 changes: 0 additions & 31 deletions src/gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
DataLicenseAttributes,
NamespacePrefix,
SourceMeta,
StoredSequenceLocation,
Strand,
)

Expand Down Expand Up @@ -89,36 +88,6 @@ def _add_gene(self, row: pd.Series, accession_numbers: Dict) -> Dict:

return gene_params

# TODO reincorporate with NCBI
def _build_sequence_location(
    self, seq_id: str, row: pd.Series, concept_id: str
) -> Optional[StoredSequenceLocation]:
    """Construct a sequence location for storing in a DB.

    The stored ``start`` is ``row.start - 1`` and ``end`` is stored unchanged
    (presumably converting 1-based source coordinates to interbase
    coordinates -- confirm against the source file format).

    :param seq_id: The sequence ID.
    :param row: A gene from the source file; its ``start`` and ``end``
        attributes are read.
    :param concept_id: record ID from source; only used in the warning message
    :return: A storable SequenceLocation containing relevant params for returning a
        VRS SequenceLocation, or None if unable to retrieve valid parameters
    """
    aliases = self._get_seq_id_aliases(seq_id)
    if not aliases or row.start is None or row.end is None:
        return None

    sequence = aliases[0]

    # A "." coordinate or an empty alias means there is nothing to store;
    # this case is skipped silently (no warning is logged).
    if row.start == "." or row.end == "." or not sequence:
        return None

    # NOTE(review): a start of 0 passes this check and is stored as -1 --
    # confirm whether the lower bound should be 1 instead.
    if 0 <= row.start <= row.end:
        return StoredSequenceLocation(
            start=row.start - 1,
            end=row.end,
            sequence_id=sequence,
        )

    _logger.warning(
        f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}"
    )
    return None

def _add_attributes(self, row: pd.Series, gene: Dict) -> None:
"""Add concept_id, symbol, and xrefs to a gene record.
Expand Down
30 changes: 0 additions & 30 deletions src/gene/etl/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
NamespacePrefix,
SourceMeta,
SourceName,
StoredSequenceLocation,
SymbolStatus,
)

Expand Down Expand Up @@ -188,35 +187,6 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, Dict]:
info_genes[params["symbol"]] = params
return info_genes

def _build_sequence_location(
    self, seq_id: str, row: pd.Series, concept_id: str
) -> Optional[StoredSequenceLocation]:
    """Construct a sequence location for storing in a DB.

    The stored ``start`` is ``row.start - 1`` and ``end`` is stored unchanged
    (presumably converting 1-based source coordinates to interbase
    coordinates -- confirm against the source file format).

    :param seq_id: The sequence ID.
    :param row: A gene from the source file; its ``start`` and ``end``
        attributes are read.
    :param concept_id: record ID from source; only used in the warning message
    :return: A storable SequenceLocation containing relevant params for returning a
        VRS SequenceLocation, or None if unable to retrieve valid parameters
    """
    aliases = self._get_seq_id_aliases(seq_id)
    if not aliases or row.start is None or row.end is None:
        return None

    sequence = aliases[0]

    # A "." coordinate or an empty alias means there is nothing to store;
    # this case is skipped silently (no warning is logged).
    if row.start == "." or row.end == "." or not sequence:
        return None

    # NOTE(review): a start of 0 passes this check and is stored as -1 --
    # confirm whether the lower bound should be 1 instead.
    if 0 <= row.start <= row.end:
        return StoredSequenceLocation(
            start=row.start - 1,
            end=row.end,
            sequence_id=sequence,
        )

    _logger.warning(
        f"{concept_id} has invalid interval: start={row.start - 1} end={row.end}"
    )
    return None

def _get_gene_gff(self, df: pd.DataFrame, info_genes: Dict) -> None:
"""Store genes from NCBI gff file.
Expand Down

0 comments on commit 3de425d

Please sign in to comment.