From 4ecb9b4f34e190796a4622d82dbbddb5029affe3 Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Tue, 2 Apr 2024 08:21:03 -0700 Subject: [PATCH 1/9] feat(IPVC-2276): skip acs missing from seqrepo and save them to a file --- sbin/exonset-to-seqinfo | 60 ++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo index 33b3cba..2302019 100755 --- a/sbin/exonset-to-seqinfo +++ b/sbin/exonset-to-seqinfo @@ -1,20 +1,20 @@ #!/usr/bin/env python -"""write SeqInfo files from fasta""" +"""Writes SeqInfo files from fasta to stdout. Writes transcript accessions that are missing from +SeqRepo to a file called exonset_missing_accessions.txt in the same directory as the input files.""" import argparse -import configparser as ConfigParser import gzip import itertools -import logging import logging.config -import pkg_resources +import os import re import sys -from bioutils.digests import seq_md5 +import configparser as ConfigParser +import pkg_resources from biocommons.seqrepo import SeqRepo -# from multifastadb import MultiFastaDB +from bioutils.digests import seq_md5 from uta.formats.exonset import ExonSetReader from uta.formats.seqinfo import SeqInfo, SeqInfoWriter @@ -49,6 +49,8 @@ if __name__ == "__main__": ac_re = re.compile("[NX][CGMPR]_") opts = parse_args(sys.argv[1:]) + input_dir = os.path.dirname(opts.FILES[0]) + missing_accessions_file_path = os.path.join(input_dir, "exonset_missing_accessions.txt") cf = ConfigParser.ConfigParser() for conf_fn in opts.conf: @@ -72,25 +74,27 @@ if __name__ == "__main__": # this is just a fancy way to make a set of all tx_ac and alt_ac accessions acs = sorted( set(itertools.chain.from_iterable((es.tx_ac, es.alt_ac) for es in esr))) - - acs_not_found = set() - for ac in acs: - try: - seq = str(sr[ac]) - except KeyError: - logging.warning("Sequence not found: " + ac) - acs_not_found.update([ac]) - continue - - si = SeqInfo( - ac=ac, - descr=None, - len=len(seq), - md5=seq_md5(seq), - origin=opts.origin, - seq=None, - ) - siw.write(si) - - if acs_not_found: - raise RuntimeError("Sequences for {} accessions not found".format(len(acs_not_found))) + missing_acs_count = 0 + with open(missing_accessions_file_path, "w") as f: + f.write("tx_ac\n") + for ac in acs: + try: + seq = str(sr[ac]) + except KeyError: + logging.warning("Sequence not found: " + ac) + missing_acs_count += 1 + f.write(ac + "\n") + continue + + si = SeqInfo( + ac=ac, + descr=None, + len=len(seq), + md5=seq_md5(seq), + origin=opts.origin, + seq=None, + ) + siw.write(si) + + # Log the number of missing accessions written to the file + logging.info(f"{missing_acs_count} accessions were missing from Seqrepo. See {missing_accessions_file_path}.") \ No newline at end of file From dc48fa07e21c211a1d2f4c91b23c5d2befb6fcb5 Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Tue, 2 Apr 2024 08:21:39 -0700 Subject: [PATCH 2/9] feat(IPVC-2276): update fx name --- sbin/ncbi_parse_genomic_gff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py index 8fb1169..bff42fe 100755 --- a/sbin/ncbi_parse_genomic_gff.py +++ b/sbin/ncbi_parse_genomic_gff.py @@ -115,7 +115,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int: return int(alignment_id.split("-")[-1]) -def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]: +def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]: tx_data = defaultdict(list) for file_path in file_paths: with open_file(file_path) as f: @@ -152,7 +152,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str: gff_files = args.gff_files esw = ExonSetWriter(sys.stdout) - transcript_alignments = parse_gff_file(gff_files) + transcript_alignments = parse_gff_files(gff_files) logger.info( f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}" ) From 5b77ba6fd2196084bf231e2c065a460f7153787e Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Tue, 2 Apr 2024 08:21:58 -0700 Subject: [PATCH 3/9] feat(IPVC-2276): update fx name --- tests/test_ncbi_parse_genomic_gff.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ncbi_parse_genomic_gff.py b/tests/test_ncbi_parse_genomic_gff.py index 71d92f9..b40edeb 100644 --- a/tests/test_ncbi_parse_genomic_gff.py +++ b/tests/test_ncbi_parse_genomic_gff.py @@ -7,7 +7,7 @@ from sbin.ncbi_parse_genomic_gff import ( get_zero_based_exon_ranges, GFFRecord, - parse_gff_file, + parse_gff_files, parse_gff_record, ) @@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self): def test_parse_gff_file(self): # Test parsing the entire uncompressed GFF file expected_result = {"rna-NR_046018.2": self.gff_records} - parsed_result = parse_gff_file([self.temp_gff.name]) + parsed_result = parse_gff_files([self.temp_gff.name]) self.assertEqual(parsed_result, expected_result) def test_parse_gff_file_accepts_gzipped_files(self): @@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self): # Test parsing the gzipped GFF file expected_result = {"rna-NR_046018.2": self.gff_records} - parsed_result = parse_gff_file([self.temp_gff.name + ".gz"]) + parsed_result = parse_gff_files([self.temp_gff.name + ".gz"]) self.assertEqual(parsed_result, expected_result) def test_get_zero_based_exon_ranges(self): From 523f06c270e4938c04ef5aff1f92809d518089ed Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Tue, 2 Apr 2024 08:40:11 -0700 Subject: [PATCH 4/9] style(IPVC-2276): add empty new line --- sbin/exonset-to-seqinfo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo index 2302019..5c0e4b5 100755 --- a/sbin/exonset-to-seqinfo +++ b/sbin/exonset-to-seqinfo @@ -97,4 +97,4 @@ if __name__ == "__main__": siw.write(si) # Log the number of missing accessions written to the file - logging.info(f"{missing_acs_count} accessions were missing from Seqrepo. See {missing_accessions_file_path}.") \ No newline at end of file + logging.info(f"{missing_acs_count} accessions were missing from Seqrepo. See {missing_accessions_file_path}.") From 50ff903c5b18bc405f65c1d37e8ee4b55c1601b9 Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Thu, 4 Apr 2024 07:35:45 -0700 Subject: [PATCH 5/9] Revert "feat(IPVC-2276): skip acs missing from seqrepo and save them to a file" This reverts commit 4ecb9b4f34e190796a4622d82dbbddb5029affe3. --- sbin/exonset-to-seqinfo | 60 +++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo index 5c0e4b5..33b3cba 100755 --- a/sbin/exonset-to-seqinfo +++ b/sbin/exonset-to-seqinfo @@ -1,20 +1,20 @@ #!/usr/bin/env python -"""Writes SeqInfo files from fasta to stdout. Writes transcript accessions that are missing from -SeqRepo to a file called exonset_missing_accessions.txt in the same directory as the input files.""" +"""write SeqInfo files from fasta""" import argparse +import configparser as ConfigParser import gzip import itertools +import logging import logging.config -import os +import pkg_resources import re import sys -import configparser as ConfigParser -import pkg_resources -from biocommons.seqrepo import SeqRepo from bioutils.digests import seq_md5 +from biocommons.seqrepo import SeqRepo +# from multifastadb import MultiFastaDB from uta.formats.exonset import ExonSetReader from uta.formats.seqinfo import SeqInfo, SeqInfoWriter @@ -49,8 +49,6 @@ if __name__ == "__main__": ac_re = re.compile("[NX][CGMPR]_") opts = parse_args(sys.argv[1:]) - input_dir = os.path.dirname(opts.FILES[0]) - missing_accessions_file_path = os.path.join(input_dir, "exonset_missing_accessions.txt") cf = ConfigParser.ConfigParser() for conf_fn in opts.conf: @@ -74,27 +72,25 @@ if __name__ == "__main__": # this is just a fancy way to make a set of all tx_ac and alt_ac accessions acs = sorted( set(itertools.chain.from_iterable((es.tx_ac, es.alt_ac) for es in esr))) - missing_acs_count = 0 - with open(missing_accessions_file_path, "w") as f: - f.write("tx_ac\n") - for ac in acs: - try: - seq = str(sr[ac]) - except KeyError: - logging.warning("Sequence not found: " + ac) - missing_acs_count += 1 - f.write(ac + "\n") - continue - - si = SeqInfo( - ac=ac, - descr=None, - len=len(seq), - md5=seq_md5(seq), - origin=opts.origin, - seq=None, - ) - siw.write(si) - - # Log the number of missing accessions written to the file - logging.info(f"{missing_acs_count} accessions were missing from Seqrepo. See {missing_accessions_file_path}.") + + acs_not_found = set() + for ac in acs: + try: + seq = str(sr[ac]) + except KeyError: + logging.warning("Sequence not found: " + ac) + acs_not_found.update([ac]) + continue + + si = SeqInfo( + ac=ac, + descr=None, + len=len(seq), + md5=seq_md5(seq), + origin=opts.origin, + seq=None, + ) + siw.write(si) + + if acs_not_found: + raise RuntimeError("Sequences for {} accessions not found".format(len(acs_not_found))) From 0e15485570c06e5eb9d01dd40e7bda3fb88bd7df Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Thu, 4 Apr 2024 15:01:45 -0700 Subject: [PATCH 6/9] refactor(IPVC-2276): pull out open_file into module --- sbin/ncbi_parse_genomic_gff.py | 13 +------------ src/uta/tools/file_utils.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 src/uta/tools/file_utils.py diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py index bff42fe..728e8a5 100755 --- a/sbin/ncbi_parse_genomic_gff.py +++ b/sbin/ncbi_parse_genomic_gff.py @@ -24,27 +24,16 @@ """ import argparse -import gzip import logging.config import sys from collections import defaultdict -from contextlib import contextmanager from dataclasses import dataclass from typing import List, Optional import pkg_resources from uta.formats.exonset import ExonSet, ExonSetWriter - - -@contextmanager -def open_file(filename): - if filename.endswith(".gz"): - with gzip.open(filename, "rt") as f: - yield f - else: - with open(filename) as f: - yield f +from uta.tools.file_utils import open_file @dataclass diff --git a/src/uta/tools/file_utils.py b/src/uta/tools/file_utils.py new file mode 100644 index 0000000..78d8d3d --- /dev/null +++ b/src/uta/tools/file_utils.py @@ -0,0 +1,12 @@ +import gzip +from contextlib import contextmanager + + +@contextmanager +def open_file(filename): + if filename.endswith(".gz"): + with gzip.open(filename, "rt") as f: + yield f + else: + with open(filename) as f: + yield f From 18b0443e3a32402f8b9484cb103d0292a44aecde Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Thu, 4 Apr 2024 15:02:34 -0700 Subject: [PATCH 7/9] feat(IPVC-2276): filter exonsets by transcript info file --- sbin/filter_exonset_transcripts.py | 51 ++++++++++++++++++++++++ tests/test_filter_exonset_transcripts.py | 46 +++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100755 sbin/filter_exonset_transcripts.py create mode 100644 tests/test_filter_exonset_transcripts.py diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py new file mode 100755 index 0000000..ce58a3b --- /dev/null +++ b/sbin/filter_exonset_transcripts.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import csv +import logging.config +import sys + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.formats.txinfo import TxInfoReader +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def filter_exonset(exonset_file, transcript_ids, missing_ids_file): + with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f: + exonsets = ExonSetReader(es_f) + esw = ExonSetWriter(sys.stdout) + writer_missing = csv.writer(missing_f) + missing_acs = set() + + for exonset in exonsets: + if exonset.tx_ac in transcript_ids: + esw.write(exonset) + else: + logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.") + writer_missing.writerow([exonset.tx_ac]) + missing_acs.add(exonset.tx_ac) + logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}") + + +def main(): + parser = argparse.ArgumentParser(description='Filter exonset data.') + parser.add_argument('--tx-info', help='Path to the transcript info file') + parser.add_argument('--exonsets', help='Path to the exonset file') + parser.add_argument('--missing-ids', help='Path to the missing transcript ids file') + args = parser.parse_args() + + with open_file(args.tx_info) as f: + tx_reader = TxInfoReader(f) + transcript_ids = {row.ac for row in tx_reader} + filter_exonset(args.exonsets, transcript_ids, args.missing_ids) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/test_filter_exonset_transcripts.py b/tests/test_filter_exonset_transcripts.py new file mode 100644 index 0000000..7ae9cf6 --- /dev/null +++ b/tests/test_filter_exonset_transcripts.py @@ -0,0 +1,46 @@ +import contextlib +import io +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.filter_exonset_transcripts import filter_exonset + + +class TestFilterExonsetTranscripts(unittest.TestCase): + + @patch('sbin.filter_exonset_transcripts.logger') + def test_filter_exonset(self, mock_logger): + # Test NR_046571.1 is filtered out + lines = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n", + "NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n", + "NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n" + ] + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + missing_ids_file = NamedTemporaryFile() + + transcript_ids = {"NR_122113.1", "NR_133911.1"} + stdout = io.StringIO() + with contextlib.redirect_stdout(stdout): + filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name) + + # Assert the record for NR_046571.1 is filtered out + self.assertEqual(stdout.getvalue(), ''.join(lines[0:3])) + + # Confirm filtered transcript is present in missing_ids_file + with open(missing_ids_file.name, 'r') as f: + contents = f.read() + self.assertEqual(contents, 'NR_046571.1\n') + + mock_logger.warning.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. Filtering out.') + mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s): NR_046571.1') + + +if __name__ == "__main__": + unittest.main() From 9327609244e339279ce5a4d311e1073c827ac274 Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Thu, 4 Apr 2024 15:03:43 -0700 Subject: [PATCH 8/9] feat(IPVC-2276): add exonset filtering to build pipeline --- etc/scripts/run-uta-build.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 166c6cb..1bd0bac 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -60,8 +60,12 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \ tee "$logs_dir/ncbi-parse-genomic-gff.log" +sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \ + --missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ + tee "$logs_dir/filter_exonset_transcripts.log" + # generate seqinfo files from exonsets -sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ +sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \ tee "$logs_dir/exonset-to-seqinfo.log" ### update the uta database @@ -77,7 +81,7 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loadi tee "$logs_dir/load-txinfo.log" # gff exon sets -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \ +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \ tee "$logs_dir/load-exonsets.log" # align exons From 348d942957208ac0db0e3d7365f32310b20a0a5c Mon Sep 17 00:00:00 2001 From: Paige Taylor Date: Thu, 4 Apr 2024 15:08:36 -0700 Subject: [PATCH 9/9] style(IPVC-2276): add newline --- sbin/filter_exonset_transcripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py index ce58a3b..b3d1fc2 100755 --- a/sbin/filter_exonset_transcripts.py +++ b/sbin/filter_exonset_transcripts.py @@ -48,4 +48,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main()