diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 166c6cb..1bd0bac 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -60,8 +60,12 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-genomic-gff.log" +sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \ + --missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ + tee "$logs_dir/filter_exonset_transcripts.log" + # generate seqinfo files from exonsets -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ +sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ tee "$logs_dir/exonset-to-seqinfo.log" ### update the uta database @@ -77,7 +81,7 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loadi tee "$logs_dir/load-txinfo.log" # gff exon sets -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ tee "$logs_dir/load-exonsets.log" # align exons diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py new file mode 100755 index 0000000..b3d1fc2 --- /dev/null +++ b/sbin/filter_exonset_transcripts.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import csv +import logging.config +import sys + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.formats.txinfo import TxInfoReader +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def filter_exonset(exonset_file, transcript_ids, missing_ids_file): + with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f: + exonsets = ExonSetReader(es_f) + esw = ExonSetWriter(sys.stdout) + writer_missing = csv.writer(missing_f) + missing_acs = set() + + for exonset in exonsets: + if exonset.tx_ac in transcript_ids: + esw.write(exonset) + else: + logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.") + writer_missing.writerow([exonset.tx_ac]) + missing_acs.add(exonset.tx_ac) + logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}") + + +def main(): + parser = argparse.ArgumentParser(description='Filter exonset data.') + parser.add_argument('--tx-info', help='Path to the transcript info file') + parser.add_argument('--exonsets', help='Path to the exonset file') + parser.add_argument('--missing-ids', help='Path to the missing transcript ids file') + args = parser.parse_args() + + with open_file(args.tx_info) as f: + tx_reader = TxInfoReader(f) + transcript_ids = {row.ac for row in tx_reader} + filter_exonset(args.exonsets, transcript_ids, args.missing_ids) + + +if __name__ == '__main__': + main() diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py index 8fb1169..728e8a5 100755 --- a/sbin/ncbi_parse_genomic_gff.py +++ b/sbin/ncbi_parse_genomic_gff.py @@ -24,27 +24,16 @@ """ import argparse -import gzip import logging.config import sys from collections import defaultdict -from contextlib import contextmanager from dataclasses import dataclass from typing import List, Optional import pkg_resources from uta.formats.exonset import ExonSet, ExonSetWriter - - -@contextmanager -def open_file(filename): - if filename.endswith(".gz"): - with gzip.open(filename, "rt") as f: - yield f - else: - with open(filename) as f: - yield f +from uta.tools.file_utils import open_file @dataclass @@ -115,7 +104,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int: return int(alignment_id.split("-")[-1]) -def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]: +def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]: tx_data = defaultdict(list) for file_path in file_paths: with open_file(file_path) as f: @@ -152,7 +141,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str: gff_files = args.gff_files esw = ExonSetWriter(sys.stdout) - transcript_alignments = parse_gff_file(gff_files) + transcript_alignments = parse_gff_files(gff_files) logger.info( f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}" ) diff --git a/src/uta/tools/file_utils.py b/src/uta/tools/file_utils.py new file mode 100644 index 0000000..78d8d3d --- /dev/null +++ b/src/uta/tools/file_utils.py @@ -0,0 +1,12 @@ +import gzip +from contextlib import contextmanager + + +@contextmanager +def open_file(filename): + if filename.endswith(".gz"): + with gzip.open(filename, "rt") as f: + yield f + else: + with open(filename) as f: + yield f diff --git a/tests/test_filter_exonset_transcripts.py b/tests/test_filter_exonset_transcripts.py new file mode 100644 index 0000000..7ae9cf6 --- /dev/null +++ b/tests/test_filter_exonset_transcripts.py @@ -0,0 +1,46 @@ +import contextlib +import io +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.filter_exonset_transcripts import filter_exonset + + +class TestFilterExonsetTranscripts(unittest.TestCase): + + @patch('sbin.filter_exonset_transcripts.logger') + def test_filter_exonset(self, mock_logger): + # Test NR_046571.1 is filtered out + lines = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n", + "NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n", + "NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n" + ] + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + missing_ids_file = NamedTemporaryFile() + + transcript_ids = {"NR_122113.1", "NR_133911.1"} + stdout = io.StringIO() + with contextlib.redirect_stdout(stdout): + filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name) + + # Assert the record for NR_046571.1 is filtered out + self.assertEqual(stdout.getvalue(), ''.join(lines[0:3])) + + # Confirm filtered transcript is present in missing_ids_file + with open(missing_ids_file.name, 'r') as f: + contents = f.read() + self.assertEqual(contents, 'NR_046571.1\n') + + mock_logger.warning.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. Filtering out.') + mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s): NR_046571.1') + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ncbi_parse_genomic_gff.py b/tests/test_ncbi_parse_genomic_gff.py index 71d92f9..b40edeb 100644 --- a/tests/test_ncbi_parse_genomic_gff.py +++ b/tests/test_ncbi_parse_genomic_gff.py @@ -7,7 +7,7 @@ from sbin.ncbi_parse_genomic_gff import ( get_zero_based_exon_ranges, GFFRecord, - parse_gff_file, + parse_gff_files, parse_gff_record, ) @@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self): def test_parse_gff_file(self): # Test parsing the entire uncompressed GFF file expected_result = {"rna-NR_046018.2": self.gff_records} - parsed_result = parse_gff_file([self.temp_gff.name]) + parsed_result = parse_gff_files([self.temp_gff.name]) self.assertEqual(parsed_result, expected_result) def test_parse_gff_file_accepts_gzipped_files(self): @@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self): # Test parsing the gzipped GFF file expected_result = {"rna-NR_046018.2": self.gff_records} - parsed_result = parse_gff_file([self.temp_gff.name + ".gz"]) + parsed_result = parse_gff_files([self.temp_gff.name + ".gz"]) self.assertEqual(parsed_result, expected_result) def test_get_zero_based_exon_ranges(self):