Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(IPVC-2276): skip accessions from gff that are not in the tx_info file, log missing acs to file #18

Merged
merged 10 commits into from
Apr 5, 2024
8 changes: 6 additions & 2 deletions etc/scripts/run-uta-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as
sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \
tee "$logs_dir/ncbi-parse-genomic-gff.log"

sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \
--missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
tee "$logs_dir/filter_exonset_transcripts.log"

# generate seqinfo files from exonsets
sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
tee "$logs_dir/exonset-to-seqinfo.log"

### update the uta database
Expand All @@ -77,7 +81,7 @@ uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo "$loadi
tee "$logs_dir/load-txinfo.log"

# gff exon sets
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
tee "$logs_dir/load-exonsets.log"

# align exons
Expand Down
51 changes: 51 additions & 0 deletions sbin/filter_exonset_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import csv
import logging.config
import sys

import importlib_resources

from uta.formats.exonset import ExonSetReader, ExonSetWriter
from uta.formats.txinfo import TxInfoReader
from uta.tools.file_utils import open_file

logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
logging.config.fileConfig(logging_conf_fn)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)


def filter_exonset(exonset_file, transcript_ids, missing_ids_file):
with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

exonsets = ExonSetReader(es_f)
esw = ExonSetWriter(sys.stdout)
writer_missing = csv.writer(missing_f)
missing_acs = set()

for exonset in exonsets:
if exonset.tx_ac in transcript_ids:
esw.write(exonset)
else:
logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
writer_missing.writerow([exonset.tx_ac])
missing_acs.add(exonset.tx_ac)
logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}")


def main():
parser = argparse.ArgumentParser(description='Filter exonset data.')
parser.add_argument('--tx-info', help='Path to the transcript info file')
parser.add_argument('--exonsets', help='Path to the exonset file')
parser.add_argument('--missing-ids', help='Path to the missing transcript ids file')
args = parser.parse_args()

with open_file(args.tx_info) as f:
tx_reader = TxInfoReader(f)
transcript_ids = {row.ac for row in tx_reader}
filter_exonset(args.exonsets, transcript_ids, args.missing_ids)


if __name__ == '__main__':
main()
17 changes: 3 additions & 14 deletions sbin/ncbi_parse_genomic_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,16 @@
"""

import argparse
import gzip
import logging.config
import sys
from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass
from typing import List, Optional

import pkg_resources

from uta.formats.exonset import ExonSet, ExonSetWriter


@contextmanager
def open_file(filename):
if filename.endswith(".gz"):
with gzip.open(filename, "rt") as f:
yield f
else:
with open(filename) as f:
yield f
from uta.tools.file_utils import open_file


@dataclass
Expand Down Expand Up @@ -115,7 +104,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int:
return int(alignment_id.split("-")[-1])


def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the name to reflect the function takes a list of files

tx_data = defaultdict(list)
for file_path in file_paths:
with open_file(file_path) as f:
Expand Down Expand Up @@ -152,7 +141,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str:
gff_files = args.gff_files
esw = ExonSetWriter(sys.stdout)

transcript_alignments = parse_gff_file(gff_files)
transcript_alignments = parse_gff_files(gff_files)
logger.info(
f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}"
)
Expand Down
12 changes: 12 additions & 0 deletions src/uta/tools/file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import gzip
from contextlib import contextmanager


@contextmanager
def open_file(filename):
if filename.endswith(".gz"):
with gzip.open(filename, "rt") as f:
yield f
else:
with open(filename) as f:
yield f
46 changes: 46 additions & 0 deletions tests/test_filter_exonset_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import contextlib
import io
import unittest
from tempfile import NamedTemporaryFile
from unittest.mock import patch

from sbin.filter_exonset_transcripts import filter_exonset


class TestFilterExonsetTranscripts(unittest.TestCase):

@patch('sbin.filter_exonset_transcripts.logger')
def test_filter_exonset(self, mock_logger):
# Test NR_046571.1 is filtered out
lines = [
"tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n",
"NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n",
"NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n",
"NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n"
]
with NamedTemporaryFile(delete=False) as temp_exonsets:
with open(temp_exonsets.name, "wt") as f:
for line in lines:
f.write(line)
temp_exonsets.seek(0)
missing_ids_file = NamedTemporaryFile()

transcript_ids = {"NR_122113.1", "NR_133911.1"}
stdout = io.StringIO()
with contextlib.redirect_stdout(stdout):
filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name)

# Assert the record for NR_046571.1 is filtered out
self.assertEqual(stdout.getvalue(), ''.join(lines[0:3]))

# Confirm filtered transcript is present in missing_ids_file
with open(missing_ids_file.name, 'r') as f:
contents = f.read()
self.assertEqual(contents, 'NR_046571.1\n')

mock_logger.warning.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. Filtering out.')
mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s): NR_046571.1')


if __name__ == "__main__":
unittest.main()
6 changes: 3 additions & 3 deletions tests/test_ncbi_parse_genomic_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sbin.ncbi_parse_genomic_gff import (
get_zero_based_exon_ranges,
GFFRecord,
parse_gff_file,
parse_gff_files,
parse_gff_record,
)

Expand Down Expand Up @@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self):
def test_parse_gff_file(self):
# Test parsing the entire uncompressed GFF file
expected_result = {"rna-NR_046018.2": self.gff_records}
parsed_result = parse_gff_file([self.temp_gff.name])
parsed_result = parse_gff_files([self.temp_gff.name])
self.assertEqual(parsed_result, expected_result)

def test_parse_gff_file_accepts_gzipped_files(self):
Expand All @@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self):

# Test parsing the gzipped GFF file
expected_result = {"rna-NR_046018.2": self.gff_records}
parsed_result = parse_gff_file([self.temp_gff.name + ".gz"])
parsed_result = parse_gff_files([self.temp_gff.name + ".gz"])
self.assertEqual(parsed_result, expected_result)

def test_get_zero_based_exon_ranges(self):
Expand Down
Loading