diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 index b5214e7..c49316c --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,9 @@ repos: - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.29.0 + rev: v0.30.0 hooks: - id: yapf - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.3.0 + - repo: https://github.com/pycqa/flake8 + rev: 3.9.0 hooks: - id: flake8 diff --git a/Makefile b/Makefile old mode 100644 new mode 100755 index f5e801c..dc7cb6b --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ check: yapf -r --diff kb_python && echo OK build: - python setup.py sdist bdist_wheel + python setup.py sdist docs: sphinx-build -a docs docs/_build diff --git a/kb_python/config.py b/kb_python/config.py index 535f3b5..d858657 100755 --- a/kb_python/config.py +++ b/kb_python/config.py @@ -3,6 +3,8 @@ import shutil from collections import namedtuple +import ngs_tools as ngs + PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__)) PLATFORM = platform.system().lower() BINS_DIR = 'bins' @@ -10,7 +12,6 @@ TEMP_DIR = 'tmp' DRY = False VALIDATE = True -CHUNK_SIZE = 1024 * 1024 * 4 # Download files in chunks of 4 Mb def get_provided_kallisto_path(): @@ -50,128 +51,36 @@ def get_provided_bustools_path(): BUSTOOLS_PATH = get_provided_bustools_path() # Technology to file position mapping -Technology = namedtuple( - 'Technology', [ - 'name', 'description', 'nfiles', 'reads_file', 'umi_positions', - 'barcode_positions', 'whitelist_archive', 'map_archive' - ] -) -WHITELIST_DIR = 'whitelists' -MAP_DIR = 'maps' +Technology = namedtuple('Technology', ['name', 'description', 'chemistry']) TECHNOLOGIES = [ + Technology('10XV1', '10x version 1', ngs.chemistry.get_chemistry('10xv1')), + Technology('10XV2', '10x version 2', ngs.chemistry.get_chemistry('10xv2')), + Technology('10XV3', '10x version 3', ngs.chemistry.get_chemistry('10xv3')), + Technology('CELSEQ', 'CEL-Seq', ngs.chemistry.get_chemistry('celseq')), Technology( - '10XV1', - '10x version 1', - 3, - 2, - [(1, 0, 10)], - [(0, 0, 14)], - '10xv1_whitelist.txt.gz', - None, - ), - Technology( - '10XV2', - '10x version 2', - 2, - 1, - [(0, 16, 26)], - [(0, 0, 16)], - '10xv2_whitelist.txt.gz', - None, - ), - Technology( - '10XV3', - '10x version 3', - 2, - 1, - [(0, 16, 28)], - [(0, 0, 16)], - '10xv3_whitelist.txt.gz', - '10xv3_feature_barcode_map.txt.gz', - ), - Technology( - 'CELSEQ', - 'CEL-Seq', - 2, - 1, - [(0, 8, 12)], - [(0, 0, 8)], - None, - None, + 'CELSEQ2', 'CEL-SEQ version 2', ngs.chemistry.get_chemistry('celseq2') ), + Technology('DROPSEQ', 'DropSeq', ngs.chemistry.get_chemistry('dropseq')), Technology( - 'CELSEQ2', - 'CEL-SEQ version 2', - 2, - 1, - [(0, 0, 6)], - [(0, 6, 12)], - None, - None, + 'INDROPSV1', 'inDrops version 1', + ngs.chemistry.get_chemistry('indropsv1') ), Technology( - 'DROPSEQ', - 'DropSeq', - 2, - 1, - [(0, 12, 20)], - [(0, 0, 12)], - None, - None, + 'INDROPSV2', 'inDrops version 2', + ngs.chemistry.get_chemistry('indropsv2') ), Technology( - 'INDROPSV1', - 'inDrops version 1', - 2, - 1, - [(0, 42, 48)], - [(0, 0, 11), (0, 30, 38)], - None, - None, + 'INDROPSV3', 'inDrops version 3', + ngs.chemistry.get_chemistry('indropsv3') ), + Technology('SCRUBSEQ', 'SCRB-Seq', ngs.chemistry.get_chemistry('scrbseq')), Technology( - 'INDROPSV2', - 'inDrops version 2', - 2, - 0, - [(1, 42, 48)], - [(1, 0, 11), (1, 30, 38)], - None, - None, + 'SURECELL', 'SureCell for ddSEQ', + ngs.chemistry.get_chemistry('surecell') ), Technology( - 'INDROPSV3', - 'inDrops version 3', - 3, - 2, - [(1, 8, 14)], - [(0, 0, 8), (1, 0, 8)], - 'inDropsv3_whitelist.txt.gz', - None, + 'SMARTSEQ', 'Smart-seq2', ngs.chemistry.get_chemistry('smartseq2') ), - Technology( - 'SCRUBSEQ', - 'SCRB-Seq', - 2, - 1, - [(0, 6, 16)], - [(0, 0, 6)], - None, - None, - ), - Technology( - 'SURECELL', - 'SureCell for ddSEQ', - 2, - 1, - [(0, 51, 59)], - [(0, 0, 6), (0, 21, 27), (0, 42, 48)], - None, - None, - ), - Technology( - 'SMARTSEQ', 'Smart-seq2', 2, '0, 1 (paired)', [], [], None, None - ) ] TECHNOLOGIES_MAPPING = {t.name: t for t in TECHNOLOGIES} diff --git a/kb_python/count.py b/kb_python/count.py index 7c517d5..a3839fb 100755 --- a/kb_python/count.py +++ b/kb_python/count.py @@ -1,5 +1,4 @@ import json -import logging import os import re from urllib.parse import urlparse @@ -45,6 +44,7 @@ UNFILTERED_COUNTS_DIR, WHITELIST_FILENAME, ) +from .logging import logger from .report import render_report from .utils import ( copy_map, @@ -66,8 +66,6 @@ from .stats import STATS from .validate import validate_files -logger = logging.getLogger(__name__) - INSPECT_PARSER = re.compile(r'^.*?(?P[0-9]+)') @@ -1014,6 +1012,7 @@ def convert_transcripts_to_genes(txnames_path, t2g_path, genes_path): return genes_path +@logger.namespaced('count') def count( index_paths, t2g_path, @@ -1302,6 +1301,7 @@ def count( return results +@logger.namespaced('count_smartseq') def count_smartseq( index_paths, t2g_path, @@ -1397,6 +1397,7 @@ def count_smartseq( return results +@logger.namespaced('count_lamanno') def count_velocity( index_paths, t2g_path, diff --git a/kb_python/dry/utils.py b/kb_python/dry/utils.py old mode 100644 new mode 100755 index 6b6f620..44d5087 --- a/kb_python/dry/utils.py +++ b/kb_python/dry/utils.py @@ -2,11 +2,8 @@ import tempfile from ..config import ( - MAP_DIR, - PACKAGE_PATH, PLATFORM, TECHNOLOGIES_MAPPING, - WHITELIST_DIR, UnsupportedOSException, ) @@ -66,12 +63,10 @@ def copy_whitelist(technology, out_dir): """Dry version of `utils.copy_whitelist`. """ technology = TECHNOLOGIES_MAPPING[technology.upper()] - archive_path = os.path.join( - PACKAGE_PATH, WHITELIST_DIR, technology.whitelist_archive - ) + archive_path = technology.chemistry.whitelist_path whitelist_path = os.path.join( out_dir, - os.path.splitext(technology.whitelist_archive)[0] + os.path.splitext(os.path.basename(archive_path))[0] ) print('gzip -dc {} > {}'.format(archive_path, whitelist_path)) return whitelist_path @@ -81,10 +76,10 @@ def copy_map(technology, out_dir): """Dry version of `utils.copy_map`. """ technology = TECHNOLOGIES_MAPPING[technology.upper()] - archive_path = os.path.join(PACKAGE_PATH, MAP_DIR, technology.map_archive) + archive_path = technology.chemistry.feature_map_path map_path = os.path.join( out_dir, - os.path.splitext(technology.map_archive)[0] + os.path.splitext(os.path.basename(archive_path))[0] ) print('gzip -dc {} > {}'.format(archive_path, map_path)) return map_path diff --git a/kb_python/fasta.py b/kb_python/fasta.py deleted file mode 100755 index 39caf5d..0000000 --- a/kb_python/fasta.py +++ /dev/null @@ -1,669 +0,0 @@ -import itertools -import logging -import re - -import pandas as pd - -from .gtf import GTF -from .utils import open_as_text - -logger = logging.getLogger(__name__) - - -class FASTA: - """Utility class to easily read and manipulate FASTA files. - - :param fasta_path: path to FASTA file - :type fasta_path: str - """ - PARSER = re.compile(r'^>(?P\S+)(?P.*)') - GROUP_PARSER = re.compile(r'(?P\S+?):(?P\S+)') - COMPLEMENT = { - 'A': 'T', - 'C': 'G', - 'G': 'C', - 'T': 'A', - 'Y': 'R', - 'R': 'Y', - 'W': 'W', - 'S': 'S', - 'K': 'M', - 'M': 'K', - 'D': 'H', - 'V': 'B', - 'H': 'D', - 'B': 'V', - 'N': 'N', - } - SEQUENCE_PARSER = re.compile(r'[^atcgATCG]') - - def __init__(self, fasta_path): - self.fasta_path = fasta_path - - @staticmethod - def make_header(seq_id, attributes): - """Create a correctly-formatted FASTA header with the given sequence ID - and attributes. - - :param seq_id: sequence ID - :type seq_id: str - :param attributes: list of key-value pairs corresponding to attributes - of this sequence - :type attributes: list - - :return: FASTA header - :rtype: str - """ - return '>{} {}'.format( - seq_id, ' '.join('{}:{}'.format(k, v) for k, v in attributes) - ) - - @staticmethod - def parse_header(line): - """Parse information from a FASTA header. - - :param line: FASTA header line - :type line: str - - :return: parsed information - :rtype: dict - """ - match = FASTA.PARSER.match(line) - if match: - groupdict = match.groupdict() - groupdict['group'] = dict( - FASTA.GROUP_PARSER.findall(groupdict.get('group', '')) - ) - return groupdict - return None - - @staticmethod - def reverse_complement(sequence): - """Get the reverse complement of the given DNA sequence. - - :param sequence: DNA sequence - :type sequence: str - - :return: reverse complement - :rtype: str - """ - return ''.join(FASTA.COMPLEMENT[b] for b in reversed(sequence.upper())) - - def entries(self, parse=True): - """Generator that yields one FASTA entry (sequence ID + sequence) at a time. - - :param parse: whether or not to parse the header into a dictionary, - defaults to `True` - :type parse: bool, optional - - :return: a generator that yields a tuple of the FASTA entry - :rtype: generator - """ - with open_as_text(self.fasta_path, 'r') as f: - info = None - sequence = '' - for line in f: - if line.startswith('>'): - if info: - yield info, sequence - sequence = '' - - info = FASTA.parse_header(line) if parse else line.strip() - else: - sequence += line.strip() - - if info: - yield info, sequence - - def sort(self, out_path): - """Sort the FASTA file by sequence ID. - - :param out_path: path to generate the sorted FASTA - :type out_path: str - - :return: path to sorted FASTA file, set of chromosomes in FASTA file - :rtype: tuple - """ - chromosomes = set() - to_sort = [] - with open_as_text(self.fasta_path, 'r') as f: - position = 0 - line = f.readline() - - while line: - if line.startswith('>'): - header = FASTA.parse_header(line) - logger.debug( - 'Found FASTA entry {}'.format(header['sequence_id']) - ) - to_sort.append([header['sequence_id'], position, None]) - chromosomes.add(header['sequence_id']) - - position = f.tell() - line = f.readline() - - logger.debug('Sorting {} FASTA entries'.format(len(to_sort))) - for i in range(len(to_sort) - 1): - to_sort[i][2] = to_sort[i + 1][1] - to_sort.sort() - - logger.debug('Writing sorted FASTA {}'.format(out_path)) - with open_as_text(self.fasta_path, - 'r') as fasta, open_as_text(out_path, 'w') as f: - for tup in to_sort: - logger.debug(f'Writing FASTA entry {tup[0]}') - start_position = tup[1] - end_position = tup[2] - fasta.seek(start_position) - if end_position is not None: - while fasta.tell() < end_position: - f.write(fasta.readline()) - else: - for line in fasta: - f.write(line) - - return out_path, chromosomes - - -def generate_kite_fasta(feature_path, out_path, no_mismatches=False): - """Generate a FASTA file for feature barcoding with the KITE workflow. - - This FASTA contains all sequences that are 1 hamming distance from the - provided barcodes. The file of barcodes must be a 2-column TSV containing - the barcode sequences in the first column and their corresponding feature - name in the second column. If hamming distance 1 variants collide for any - pair of barcodes, the hamming distance 1 variants for those barcodes are - not generated. - - :param feature_path: path to TSV containing barcodes and feature names - :type feature_path: str - :param out_path: path to FASTA to generate - :type out_path: str - :param no_mismatches: whether to generate hamming distance 1 variants, - defaults to `False` - :type no_mismatches: bool, optional - - :raises Exception: if there are barcodes of different lengths - :raises Exception: if there are duplicate barcodes - - :return: (path to generated FASTA, set of barcode lengths) - :rtype: tuple - """ - - def generate_mismatches(name, sequence): - """Helper function to generate 1 hamming distance mismatches. - - :param name: name of the sequence - :type name: str - :param sequence: the sequence - :type sequence: str - - :return: a generator that yields a tuple of (unique name, mismatched sequence) - :rtype: generator - """ - sequence = sequence.upper() - for i in range(len(sequence)): - base = sequence[i] - before = sequence[:i] - after = sequence[i + 1:] - - for j, different in enumerate([b for b in ['A', 'C', 'G', 'T'] - if b != base]): - yield f'{name}-{i}.{j+1}', f'{before}{different}{after}' - - df_features = pd.read_csv( - feature_path, sep='\t', header=None, names=['sequence', 'name'] - ) - - lengths = set() - features = {} - variants = {} - # Generate all feature barcode variations before saving to check for collisions. - for i, row in df_features.iterrows(): - # Check that the first column contains the sequence - # and the second column the feature name. - if FASTA.SEQUENCE_PARSER.search(row.sequence.upper()): - raise Exception(( - 'Encountered non-ATCG basepairs in barcode sequence {row.sequence}. ' - 'Does the first column contain the sequences and the second column the feature names?' - )) - - lengths.add(len(row.sequence)) - features[row['name']] = row.sequence - variants[row['name']] = { - name: seq - for name, seq in generate_mismatches(row['name'], row.sequence) - if not no_mismatches - } - - # Check duplicate barcodes. - duplicates = set([ - bc for bc in features.values() if list(features.values()).count(bc) > 1 - ]) - if len(duplicates) > 0: - raise Exception( - 'Duplicate feature barcodes: {}'.format(' '.join(duplicates)) - ) - if len(lengths) > 1: - logger.warning( - 'Detected barcodes of different lengths: {}'.format( - ','.join(str(l) for l in lengths) # noqa - ) - ) - for f1, f2 in itertools.combinations(variants.keys(), 2): - v1 = variants[f1] - v2 = variants[f2] - if len(set.intersection(set(v1.values()), set(v2.values()))) > 0: - logger.warning(( - f'Detected features with barcodes that are not >2 hamming distance apart: {f1}, {f2}. ' - f'Hamming distance 1 variants for these barcodes will not be generated.' - )) - variants[f1] = {} - variants[f2] = {} - - # Write FASTA - with open_as_text(out_path, 'w') as f: - for feature, barcode in features.items(): - attributes = [('feature_id', feature)] - f.write(f'{FASTA.make_header(feature, attributes)}\n') - f.write(f'{barcode}\n') - - for name, variant in variants[feature].items(): - f.write(f'{FASTA.make_header(name, attributes)}\n') - f.write(f'{variant}\n') - - return out_path, min(lengths) - - -def generate_cdna_fasta(fasta_path, gtf_path, out_path, chromosomes=None): - """Generate a cDNA FASTA using the genome and GTF. - - This function assumes the order in which the chromosomes appear in the - genome FASTA is identical to the order in which they appear in the GTF. - Additionally, the GTF must be sorted by start position. - - :param fasta_path: path to genomic FASTA file - :type fasta_path: str - :param gtf_path: path to GTF file - :type gtf_path: str - :param out_path: path to cDNA FASTA to generate - :type out_path: str - :param chromosomes: set of chromosomes to generate sequences for. If not - provided, sequences for all chromosomes are generated - by default, defaults to `None` - :type chromosomes: set, optional - - :return: path to generated cDNA FASTA - :rtype: str - """ - fasta = FASTA(fasta_path) - gtf = GTF(gtf_path) - gtf_entries = gtf.entries() - - count = 0 - with open_as_text(out_path, 'w') as f: - previous_gtf_entry = None - for info, sequence in fasta.entries(): - sequence_id = info['sequence_id'] - if chromosomes and sequence_id not in chromosomes: - continue - - logger.debug( - 'Generating cDNA from chromosome {}'.format(sequence_id) - ) - - transcript_sequences = {} - transcript_infos = {} - while True: - try: - gtf_entry = previous_gtf_entry or next(gtf_entries) - except StopIteration: - break - previous_gtf_entry = None - chromosome = gtf_entry['seqname'] - if chromosomes and chromosome not in chromosomes: - continue - - if sequence_id != chromosome: - previous_gtf_entry = gtf_entry - break - - start = gtf_entry['start'] - end = gtf_entry['end'] - strand = gtf_entry['strand'] - if gtf_entry['feature'] == 'exon': - transcript_id = gtf_entry['group']['transcript_id'] - transcript_version = gtf_entry['group'].get( - 'transcript_version', None - ) - - transcript = '{}.{}'.format( - transcript_id, transcript_version - ) if transcript_version else transcript_id - if transcript not in transcript_sequences: - transcript_sequences[transcript] = '' - transcript_sequences[transcript] += sequence[start - 1:end] - elif gtf_entry['feature'] == 'transcript': - transcript_id = gtf_entry['group']['transcript_id'] - transcript_version = gtf_entry['group'].get( - 'transcript_version', None - ) - transcript = '{}.{}'.format( - transcript_id, transcript_version - ) if transcript_version else transcript_id - - gene_id = gtf_entry['group']['gene_id'] - gene_version = gtf_entry['group'].get('gene_version', None) - gene = '{}.{}'.format( - gene_id, gene_version - ) if gene_version else gene_id - gene_name = gtf_entry['group'].get('gene_name', '') - transcript_name = gtf_entry['group'].get( - 'transcript_name', '' - ) - - if transcript not in transcript_infos: - attributes = [ - ('gene_id', gene), - ('gene_name', gene_name), - ('transcript_name', transcript_name), - ('chr', chromosome), - ('start', start), - ('end', end), - ('strand', strand), - ] - transcript_infos[transcript] = attributes - - logger.debug( - f'Writing {len(transcript_sequences)} cDNA transcripts for chromosome {chromosome}' - ) - # Validation. - info_unique = set(transcript_infos.keys() - ) - set(transcript_sequences.keys()) - sequences_unique = set(transcript_sequences.keys() - ) - set(transcript_infos.keys()) - if info_unique: - raise Exception( - 'The following transcripts have "transcript" feature(s) but no ' - 'corresponding "exon" feature: {}'.format( - ', '.join(info_unique) - ) - ) - if sequences_unique: - raise Exception( - 'The following transcripts have a "exon" feature but no ' - 'corresponding "transcript" feature(s): {}'.format( - ', '.join(sequences_unique) - ) - ) - - for transcript in sorted(transcript_sequences.keys()): - exon = transcript_sequences[transcript] - attributes = transcript_infos[transcript] - f.write( - '{}\n'.format(FASTA.make_header(transcript, attributes)) - ) - f.write( - '{}\n'.format( - exon if dict(attributes)['strand'] == - '+' else FASTA.reverse_complement(exon) - ) - ) - count += 1 - logger.info(f'Wrote {count} cDNA transcripts') - - return out_path - - -def generate_intron_fasta( - fasta_path, gtf_path, out_path, chromosomes=None, flank=30 -): - """Generate an intron FASTA using the genome and GTF. - - This function assumes the order in which the chromosomes appear in the - genome FASTA is identical to the order in which they appear in the GTF. - Additionally, the GTF must be sorted by start position. - The intron for a specific transcript is the collection of the following: - 1. transcript - exons - 2. 5' UTR - 3. 3' UTR - Additionally, append 30-bp (k - 1 where k = 31) flanks to each intron, - combining sections that overlap into a single FASTA entry. - - :param fasta_path: path to genomic FASTA file - :type fasta_path: str - :param gtf_path: path to GTF file - :type gtf_path: str - :param out_path: path to intron FASTA to generate - :type out_path: str - :param chromosomes: set of chromosomes to generate sequences for. If not - provided, sequences for all chromosomes are generated - by default, defaults to `None` - :type chromosomes: set, optional - :param flank: the size of intron flanks, in bases, defaults to `30` - :type flank: int, optional - - :return: path to generated intron FASTA - :rtype: str - """ - fasta = FASTA(fasta_path) - gtf = GTF(gtf_path) - gtf_entries = gtf.entries() - - count = 0 - with open_as_text(out_path, 'w') as f: - previous_gtf_entry = None - for info, sequence in fasta.entries(): - sequence_id = info['sequence_id'] - if chromosomes and sequence_id not in chromosomes: - continue - logger.debug( - 'Generating introns from chromosome {}'.format(sequence_id) - ) - - transcript_exons = {} - transcript_infos = {} - while True: - try: - gtf_entry = previous_gtf_entry or next(gtf_entries) - except StopIteration: - break - previous_gtf_entry = None - chromosome = gtf_entry['seqname'] - if chromosomes and chromosome not in chromosomes: - continue - - if sequence_id != chromosome: - previous_gtf_entry = gtf_entry - break - - start = gtf_entry['start'] - end = gtf_entry['end'] - strand = gtf_entry['strand'] - if gtf_entry['feature'] == 'exon': - transcript_id = gtf_entry['group']['transcript_id'] - transcript_version = gtf_entry['group'].get( - 'transcript_version', None - ) - transcript = '{}.{}'.format( - transcript_id, transcript_version - ) if transcript_version else transcript_id - - transcript_exons.setdefault(transcript, - []).append((start, end)) - elif gtf_entry['feature'] == 'transcript': - transcript_id = gtf_entry['group']['transcript_id'] - transcript_version = gtf_entry['group'].get( - 'transcript_version', None - ) - transcript = '{}.{}'.format( - transcript_id, transcript_version - ) if transcript_version else transcript_id - - gene_id = gtf_entry['group']['gene_id'] - gene_version = gtf_entry['group'].get('gene_version', None) - gene = '{}.{}'.format( - gene_id, gene_version - ) if gene_version else gene_id - gene_name = gtf_entry['group'].get('gene_name', '') - transcript_name = gtf_entry['group'].get( - 'transcript_name', '' - ) - - if transcript not in transcript_infos: - attributes = [ - ('gene_id', gene), - ('gene_name', gene_name), - ('transcript_name', transcript_name), - ('chr', chromosome), - ('start', start), - ('end', end), - ('strand', strand), - ] - transcript_infos[transcript] = attributes - - logger.debug(( - f'Found {len(transcript_exons)} transcripts with exons for chromosome {chromosome}.' - 'Generating introns from these transcripts' - )) - # Validation. - info_unique = set(transcript_infos.keys() - ) - set(transcript_exons.keys()) - sequences_unique = set(transcript_exons.keys() - ) - set(transcript_infos.keys()) - if info_unique: - raise Exception( - 'The following transcripts have "exon" feature(s) but no ' - 'corresponding "transcript" feature: {}'.format( - ', '.join(info_unique) - ) - ) - if sequences_unique: - raise Exception( - 'The following transcripts have a "transcript" feature but no ' - 'corresponding "exon" feature(s): {}'.format( - ', '.join(sequences_unique) - ) - ) - for transcript in sorted(transcript_exons.keys()): - attributes = transcript_infos[transcript] - - # Find transcript interval - all exon intervals - attributes_dict = dict(attributes) - transcript_interval = ( - attributes_dict['start'], attributes_dict['end'] - ) - introns = [] - exons = list(sorted(transcript_exons[transcript])) - if exons: - if exons[0][0] > transcript_interval[0]: - introns.append( - (transcript_interval[0], exons[0][0] - 1) - ) - - for i in range(len(exons) - 1): - start = exons[i][1] - end = exons[i + 1][0] - introns.append((start + 1, end - 1)) - - if exons[-1][1] < transcript_interval[1]: - introns.append( - (exons[-1][1] + 1, transcript_interval[1]) - ) - else: - introns.append(transcript_interval) - - index = 1 - flank_start = None - flank_end = None - for start, end in introns: - if flank_start is None: - flank_start = max(start - flank, transcript_interval[0]) - if flank_end is None or start - flank <= flank_end: - flank_end = min(end + flank, transcript_interval[1]) - else: - intron = sequence[flank_start - 1:flank_end] - f.write( - '{}\n'.format( - FASTA.make_header( - '{}-I.{}'.format(transcript, index), - attributes - ) - ) - ) - f.write( - '{}\n'.format( - intron if dict(attributes)['strand'] == - '+' else FASTA.reverse_complement(intron) - ) - ) - count += 1 - index += 1 - flank_start = max(start - flank, transcript_interval[0]) - flank_end = min(end + flank, transcript_interval[1]) - if flank_start is not None and flank_end is not None: - intron = sequence[flank_start - 1:flank_end] - f.write( - '{}\n'.format( - FASTA.make_header( - '{}-I.{}'.format(transcript, index), attributes - ) - ) - ) - f.write( - '{}\n'.format( - intron if dict(attributes)['strand'] == - '+' else FASTA.reverse_complement(intron) - ) - ) - count += 1 - index += 1 - logger.info(f'Wrote {count} intron sequences') - - return out_path - - -def generate_spliced_fasta(fasta_path, gtf_path, out_path): - """Generate a spliced FASTA using the genome and GTF. - - This function assumes the order in which the chromosomes appear in the - genome FASTA is identical to the order in which they appear in the GTF. - Additionally, the GTF must be sorted by start position. - The spliced FASTA contains entries of length 2 * (k - 1) for k = 31, - centered around exon-exon splice junctions (any overlapping regions are - collapsed). - - :param fasta_path: path to genomic FASTA file - :type fasta_path: str - :param gtf_path: path to GTF file - :type gtf_path: str - :param out_path: path to spliced FASTA to generate - :type out_path: str - - :return: path to generated spliced FASTA - :rtype: str - """ - pass - - -def generate_unspliced_fasta(fasta_path, gtf_path, out_path): - """Generate a unspliced FASTA using the genome and GTF. - - This function assumes the order in which the chromosomes appear in the - genome FASTA is identical to the order in which they appear in the GTF. - Additionally, the GTF must be sorted by start position. - The spliced FASTA contains entries of length 2 * (k - 1) for k = 31, - centered around exon-intron splice junctions + full introns - (any overlapping regions are collapsed). - - :param fasta_path: path to genomic FASTA file - :type fasta_path: str - :param gtf_path: path to GTF file - :type gtf_path: str - :param out_path: path to unspliced FASTA to generate - :type out_path: str - - :return: path to generated unspliced FASTA - :rtype: str - """ - pass diff --git a/kb_python/gtf.py b/kb_python/gtf.py deleted file mode 100755 index afc57d6..0000000 --- a/kb_python/gtf.py +++ /dev/null @@ -1,110 +0,0 @@ -import logging -import re - -from .utils import open_as_text - -logger = logging.getLogger(__name__) - - -class GTF: - """Utility class to easily read and parse GTF files. - - :param gtf_path: path to GTF file - :type gtf_path: str - """ - PARSER = re.compile( - r''' - ^(?P.+?)\s+ # chromosome - .*?\t # source - (?P.+?)\s+ # feature: transcript, exon, etc. - (?P[0-9]+?)\s+ # start position (1-indexed) - (?P[0-9]+?)\s+ # end position (1-indexed, inclusive) - .*?\s+ # score - (?P\+|-|\.)\s+ # +, -, . indicating strand - .*?\s+ # frame - (?P.*) # groups - ''', re.VERBOSE - ) - GROUP_PARSER = re.compile(r'(?P\S+?)\s*"(?P.+?)";?') - - def __init__(self, gtf_path): - self.gtf_path = gtf_path - - @staticmethod - def parse_entry(line): - """Parse a single GTF entry. - - :param line: a line in the GTF file - :type line: str - - :return: parsed GTF information - :rtype: dict - """ - match = GTF.PARSER.match(line) - if match: - groupdict = match.groupdict() - groupdict['start'] = int(groupdict['start']) - groupdict['end'] = int(groupdict['end']) - groupdict['group'] = dict( - GTF.GROUP_PARSER.findall( - groupdict.get('group', '').replace(' ', '') - ) - ) - if not groupdict['group']: - logger.warning( - f'Failed to parse GTF attributes of entry: {line}' - ) - - return groupdict - logger.warning(f'Failed to parse GTF entry: {line}') - return None - - def entries(self): - """Generator that yields one GTF entry at a time. - - :return: a generator that yields a dict of the GTF entry - :rtype: generator - """ - with open_as_text(self.gtf_path, 'r') as f: - for line in f: - if line.startswith('#') or line.isspace(): - continue - - yield GTF.parse_entry(line) - - def sort(self, out_path): - """Sort the GTF file by chromosome, start position, line number. - - :param out_path: path to generate the sorted GTF - :type out_path: str - - :return: path to sorted GTF file, set of chromosomes in GTF file - :rtype: tuple - """ - chromosomes = set() - to_sort = [] - with open_as_text(self.gtf_path, 'r') as f: - position = 0 - line = f.readline() - while line: - if not line.startswith('#') and not line.isspace(): - entry = GTF.parse_entry(line) - if entry['feature'] in ('transcript', 'exon'): - to_sort.append( - (entry['seqname'], entry['start'], position) - ) - chromosomes.add(entry['seqname']) - position = f.tell() - line = f.readline() - logger.debug('Sorting {} GTF entries'.format(len(to_sort))) - to_sort.sort() - - logger.debug('Writing sorted GTF {}'.format(out_path)) - with open_as_text(self.gtf_path, 'r') as gtf, open_as_text(out_path, - 'w') as f: - for tup in to_sort: - position = tup[2] - gtf.seek(position) - f.write(gtf.readline()) - - return out_path, chromosomes diff --git a/kb_python/info.txt b/kb_python/info.txt index 2a71f76..9c8ed60 100755 --- a/kb_python/info.txt +++ b/kb_python/info.txt @@ -1,6 +1,4 @@ -kb is a python package for rapidly pre-processing single-cell RNA-seq data. It is a wrapper for the methods described in - -Melsted, Booeshaghi et al., Modular and efficient pre-processing of single-cell RNA-seq, bioRxiv, 2019 +kb is a python package for rapidly pre-processing single-cell RNA-seq data. It is a wrapper for the methods described in [2]. The goal of the wrapper is to simplify downloading and running of the kallisto [1] and bustools [2] programs. It was inspired by Sten Linnarsson’s loompy fromfq command (http://linnarssonlab.org/loompy/kallisto/index.html) diff --git a/kb_python/logging.py b/kb_python/logging.py new file mode 100755 index 0000000..7acd61d --- /dev/null +++ b/kb_python/logging.py @@ -0,0 +1,4 @@ +import ngs_tools as ngs + +logger = ngs.logging.Logger(__name__) +ngs.logging.set_logger(logger) diff --git a/kb_python/main.py b/kb_python/main.py index 7340457..9a140f4 100755 --- a/kb_python/main.py +++ b/kb_python/main.py @@ -22,6 +22,7 @@ ) from .constants import INFO_FILENAME from .count import count, count_smartseq, count_velocity +from .logging import logger from .ref import download_reference, ref, ref_kite, ref_lamanno from .utils import ( get_bustools_version, @@ -31,8 +32,6 @@ remove_directory, ) -logger = logging.getLogger(__name__) - def display_info(): """Displays kb, kallisto and bustools version + citation information, along @@ -61,20 +60,26 @@ def display_technologies(): """Displays a list of supported technologies along with whether kb provides a whitelist for that technology and the FASTQ argument order for kb count. """ - headers = [ - 'name', 'whitelist provided', 'barcode (file #, start, stop)', - 'umi (file #, start, stop)', 'read file #' - ] + headers = ['name', 'whitelist', 'barcode', 'umi', 'cDNA'] rows = [headers] print('List of supported single-cell technologies\n') + print('Positions syntax: `input file index, start position, end position`') + print('When start & end positions are None, refers to the entire file') + print( + 'Custom technologies may be defined by providing a kallisto-supported ' + 'technology string\n(see https://pachterlab.github.io/kallisto/manual)\n' + ) for t in TECHNOLOGIES: + chem = t.chemistry row = [ t.name, - 'yes' if t.whitelist_archive else '', - ' '.join(str(tup) for tup in t.barcode_positions), - ' '.join(str(tup) for tup in t.umi_positions), - str(t.reads_file), + 'yes' if chem.has_whitelist else '', + ' '.join(str(_def) for _def in chem.cell_barcode_parser) + if chem.has_cell_barcode else '', + ' '.join(str(_def) + for _def in chem.umi_parser) if chem.has_umi else '', + ' '.join(str(_def) for _def in chem.cdna_parser), ] rows.append(row) @@ -179,7 +184,6 @@ def parse_count(parser, args, temp_dir='tmp'): :param args: Command-line arguments dictionary, as parsed by argparse :type args: dict """ - logger = logging.getLogger(__name__) if args.report: logger.warning(( 'Using `--report` may cause `kb` to exceed maximum memory specified ' @@ -725,6 +729,7 @@ def setup_count_args(parser, parent): return parser_count +@logger.namespaced('main') def main(): """Command-line entrypoint. """ @@ -803,10 +808,7 @@ def main(): args = parser.parse_args() - logging.basicConfig( - format='[%(asctime)s] %(levelname)7s %(message)s', - level=logging.DEBUG if args.verbose else logging.INFO, - ) + logger.setLevel(logging.DEBUG if args.verbose else logging.INFO) # Validation if 'no_validate' in args and args.no_validate: @@ -827,17 +829,6 @@ def main(): logging.disable(level=logging.CRITICAL) set_dry() - # Turn off logging from other packages. - try: - anndata_logger = logging.getLogger('anndata') - h5py_logger = logging.getLogger('h5py') - anndata_logger.setLevel(logging.CRITICAL + 10) - anndata_logger.propagate = False - h5py_logger.setLevel(logging.CRITICAL + 10) - h5py_logger.propagate = False - except Exception: - pass - if any(arg in sys.argv for arg in {'--lamanno', '--nucleus'}): logger.warning(( 'The `--lamanno` and `-`-n`ucleus` flags are deprecated. ' @@ -854,10 +845,20 @@ def main(): logger.debug(f'kallisto binary located at {get_kallisto_binary_path()}') logger.debug(f'bustools binary located at {get_bustools_binary_path()}') - temp_dir = args.tmp or os.path.join( - args.o, TEMP_DIR - ) if 'o' in args else TEMP_DIR - logger.debug(f'Creating {temp_dir} directory') + temp_dir = args.tmp or ( + os.path.join(args.o, TEMP_DIR) if 'o' in args else TEMP_DIR + ) + # Check if temp_dir exists and exit if it does. + # This is so that kb doesn't accidently use an existing directory and + # delete it afterwards. + if os.path.exists(temp_dir): + parser.error( + f'Temporary directory `{temp_dir}` exists! Is another instance running? ' + 'Either remove the existing directory or use `--tmp` to specify a ' + 'different temporary directory.' + ) + + logger.debug(f'Creating `{temp_dir}` directory') make_directory(temp_dir) try: logger.debug(args) @@ -871,5 +872,5 @@ def main(): finally: # Always clean temp dir if not args.keep_tmp: - logger.debug(f'Removing {temp_dir} directory') + logger.debug(f'Removing `{temp_dir}` directory') remove_directory(temp_dir) diff --git a/kb_python/maps/10xv3_feature_barcode_map.txt.gz b/kb_python/maps/10xv3_feature_barcode_map.txt.gz deleted file mode 100644 index 593bc55..0000000 Binary files a/kb_python/maps/10xv3_feature_barcode_map.txt.gz and /dev/null differ diff --git a/kb_python/ref.py b/kb_python/ref.py old mode 100644 new mode 100755 index 989822a..5f3a4da --- a/kb_python/ref.py +++ b/kb_python/ref.py @@ -1,16 +1,13 @@ import glob -import logging +import itertools import os import tarfile +import ngs_tools as ngs +import pandas as pd + from .config import get_kallisto_binary_path -from .fasta import ( - FASTA, - generate_cdna_fasta, - generate_intron_fasta, - generate_kite_fasta, -) -from .gtf import GTF +from .logging import logger from .utils import ( concatenate_files, decompress_gzip, @@ -20,66 +17,117 @@ run_executable, ) -logger = logging.getLogger(__name__) - -def sort_gtf(gtf_path, out_path): - """Sorts a GTF file based on its chromosome, start position, line number. +def generate_kite_fasta(feature_path, out_path, no_mismatches=False): + """Generate a FASTA file for feature barcoding with the KITE workflow. - :param gtf_path: path to GTF file - :type gtf_path: str + This FASTA contains all sequences that are 1 hamming distance from the + provided barcodes. The file of barcodes must be a 2-column TSV containing + the barcode sequences in the first column and their corresponding feature + name in the second column. If hamming distance 1 variants collide for any + pair of barcodes, the hamming distance 1 variants for those barcodes are + not generated. - :return: path to sorted GTF file, set of chromosomes in GTF file - :rtype: tuple - """ - logger.info(f'Sorting {gtf_path} to {out_path}') - gtf = GTF(gtf_path) - return gtf.sort(out_path) - - -def sort_fasta(fasta_path, out_path): - """Sorts a FASTA file based on its header. + :param feature_path: path to TSV containing barcodes and feature names + :type feature_path: str + :param out_path: path to FASTA to generate + :type out_path: str + :param no_mismatches: whether to generate hamming distance 1 variants, + defaults to `False` + :type no_mismatches: bool, optional - :param fasta_path: path to FASTA file - :type fasta_path: str + :raises Exception: if there are barcodes of different lengths + :raises Exception: if there are duplicate barcodes - :return: path to sorted FASTA file, set of chromosomes in FASTA file + :return: (path to generated FASTA, set of barcode lengths) :rtype: tuple """ - logger.info(f'Sorting {fasta_path} to {out_path}') - fasta = FASTA(fasta_path) - return fasta.sort(out_path) - -def check_chromosomes(fasta_chromosomes, gtf_chromosomes): - """Compares the two chromosome sets and outputs warnings if there are - unique chromosomes in either set. - - :param fasta_chromosomes: set of chromosomes found in FASTA - :type fasta_chromosomes: set - :param gtf_chromosomes: set of chromosomes found in GTF - :type gtf_chromosomes: set + def generate_mismatches(name, sequence): + """Helper function to generate 1 hamming distance mismatches. + + :param name: name of the sequence + :type name: str + :param sequence: the sequence + :type sequence: str + + :return: a generator that yields a tuple of (unique name, mismatched sequence) + :rtype: generator + """ + sequence = sequence.upper() + for i in range(len(sequence)): + base = sequence[i] + before = sequence[:i] + after = sequence[i + 1:] + + for j, different in enumerate([b for b in ['A', 'C', 'G', 'T'] + if b != base]): + yield f'{name}-{i}.{j+1}', f'{before}{different}{after}' + + df_features = pd.read_csv( + feature_path, sep='\t', header=None, names=['sequence', 'name'] + ) - :return: intersection of the two sets - :rtype: set - """ - fasta_unique = fasta_chromosomes - gtf_chromosomes - gtf_unique = gtf_chromosomes - fasta_chromosomes - if fasta_unique: - logger.warning(( - 'The following chromosomes were found in the FASTA but does not have ' - 'any "transcript" features in the GTF: {}. ' - 'No sequences will be generated for these chromosomes.' - ).format(', '.join(fasta_unique))) - if gtf_unique: - logger.warning(( - 'The following chromosomes were found to have "transcript" features ' - 'in the GTF but does not exist in the FASTA. ' - 'No sequences will be generated for these chromosomes.' - )) - chromosomes = set.intersection(fasta_chromosomes, gtf_chromosomes) - - return chromosomes + lengths = set() + features = {} + variants = {} + # Generate all feature barcode variations before saving to check for collisions. + for i, row in df_features.iterrows(): + # Check that the first column contains the sequence + # and the second column the feature name. + if ngs.sequence.SEQUENCE_PARSER.search(row.sequence.upper()): + raise Exception(( + 'Encountered non-ATCG basepairs in barcode sequence {row.sequence}. ' + 'Does the first column contain the sequences and the second column the feature names?' + )) + + lengths.add(len(row.sequence)) + features[row['name']] = row.sequence + variants[row['name']] = { + name: seq + for name, seq in generate_mismatches(row['name'], row.sequence) + if not no_mismatches + } + + # Check duplicate barcodes. + duplicates = set([ + bc for bc in features.values() if list(features.values()).count(bc) > 1 + ]) + if len(duplicates) > 0: + raise Exception( + 'Duplicate feature barcodes: {}'.format(' '.join(duplicates)) + ) + if len(lengths) > 1: + logger.warning( + 'Detected barcodes of different lengths: {}'.format( + ','.join(str(l) for l in lengths) # noqa + ) + ) + for f1, f2 in itertools.combinations(variants.keys(), 2): + v1 = variants[f1] + v2 = variants[f2] + if len(set.intersection(set(v1.values()), set(v2.values()))) > 0: + logger.warning(( + f'Detected features with barcodes that are not >2 hamming distance apart: {f1}, {f2}. ' + f'Hamming distance 1 variants for these barcodes will not be generated.' + )) + variants[f1] = {} + variants[f2] = {} + + # Write FASTA + with ngs.fasta.Fasta(out_path, 'w') as f: + for feature, barcode in features.items(): + attributes = {'feature_id': feature} + header = ngs.fasta.FastaEntry.make_header(feature, attributes) + entry = ngs.fasta.FastaEntry(header, barcode) + f.write(entry) + + for name, variant in variants[feature].items(): + header = ngs.fasta.FastaEntry.make_header(name, attributes) + entry = ngs.fasta.FastaEntry(header, variant) + f.write(entry) + + return out_path, min(lengths) def create_t2g_from_fasta(fasta_path, t2g_path): @@ -93,81 +141,34 @@ def create_t2g_from_fasta(fasta_path, t2g_path): :return: dictionary containing path to generated t2g mapping :rtype: dict """ - logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path)) - with open_as_text(t2g_path, 'w') as f: - fasta = FASTA(fasta_path) - for info, _ in fasta.entries(): - if 'feature_id' in info['group']: - row = [ - info['sequence_id'], - info['group']['feature_id'], - info['group']['feature_id'], - ] + logger.info(f'Creating transcript-to-gene mapping at {t2g_path}') + with ngs.fasta.Fasta(fasta_path, 'r') as f_in, open_as_text(t2g_path, + 'w') as f_out: + for entry in f_in: + attributes = entry.attributes + + if 'feature_id' in attributes: + feature_id = attributes['feature_id'] + row = [entry.name, feature_id, feature_id] else: + gene_id = attributes['gene_id'] + gene_name = attributes.get('gene_name', '') + transcript_name = attributes.get('transcript_name', '') + chromosome = attributes['chr'] + start = attributes['start'] + end = attributes['end'] + strand = attributes['strand'] row = [ - info['sequence_id'], info['group']['gene_id'], - info['group'].get('gene_name', ''), - info['group'].get('transcript_name', - ''), info['group'].get('chr', ''), - info['group'].get('start', - ''), info['group'].get('end', ''), - info['group'].get('strand', '') - ] - f.write('\t'.join(str(item) for item in row) + '\n') - return {'t2g': t2g_path} - - -def create_t2g_from_gtf(gtf_path, t2g_path, intron=False): - """Creates a transcript-to-gene mapping from a GTF file. - - GTF entries that have `transcript` as its feature are parsed for - the `transcript_id`, `gene_id` and `gene_name`. - - :param gtf_path: path to GTF file - :type gtf_path: str - :param t2g_path: path to output transcript-to-gene mapping - :type t2g_path: str - :param intron: whether or not to include intron transcript ids (with the - `-I` prefix), defaults to `False` - :type intron: bool, optional - - :return: dictionary containing path to generated t2g mapping - :rtype: dict - """ - logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path)) - gtf = GTF(gtf_path) - with open_as_text(t2g_path, 'w') as f: - for entry in gtf.entries(): - if entry['feature'] == 'transcript': - transcript_id = entry['group']['transcript_id'] - transcript_version = entry['group'].get( - 'transcript_version', None - ) - transcript = '{}.{}'.format( - transcript_id, transcript_version - ) if transcript_version else transcript_id - gene_id = entry['group']['gene_id'] - gene_version = entry['group'].get('gene_version', None) - gene = '{}.{}'.format( - gene_id, gene_version - ) if gene_version else gene_id - - row = [ - transcript, - gene, - entry['group'].get('gene_name', ''), - entry['group'].get('transcript_name', ''), - entry.get('seqname', ''), - entry.get('start', ''), - entry.get('end', ''), - entry.get('strand', ''), + entry.name, + gene_id, + gene_name, + transcript_name, + chromosome, + start, + end, + strand, ] - f.write('\t'.join(str(item) for item in row) + '\n') - - if intron: - intron_row = row.copy() - intron_row[0] = intron_row[0] + '-I' - f.write('\t'.join(str(item) for item in intron_row) + '\n') + f_out.write('\t'.join(str(item) for item in row) + '\n') return {'t2g': t2g_path} @@ -183,11 +184,10 @@ def create_t2c(fasta_path, t2c_path): :return: dictionary containing path to generated t2c list :rtype: dict """ - fasta = FASTA(fasta_path) - with open_as_text(t2c_path, 'w') as f: - for info, _ in fasta.entries(): - sequence_id = info['sequence_id'] - f.write('{}\n'.format(sequence_id)) + with ngs.fasta.Fasta(fasta_path, 'r') as f_in, open_as_text(t2c_path, + 'w') as f_out: + for entry in f_in: + f_out.write(f'{entry.name}\n') return {'t2c': t2c_path} @@ -236,27 +236,27 @@ def split_and_index(fasta_path, index_prefix, n=2, k=31, temp_dir='tmp'): logger.info(f'Splitting {fasta_path} into {n} parts') size = int(os.path.getsize(fasta_path) / n) + 4 - fasta = FASTA(fasta_path) - fasta_entries = fasta.entries(parse=False) - finished = False - for i in range(n): - fasta_part_path = get_temporary_filename(temp_dir) - index_part_path = f'{index_prefix}.{i}' - fastas.append(fasta_part_path) - indices.append(index_part_path) - - with open(fasta_part_path, 'w') as f: - logger.debug(f'Writing {fasta_part_path}') - while f.tell() < size: - try: - info, seq = next(fasta_entries) - except StopIteration: - finished = True - break - f.write(f'{info}\n{seq}\n') - - if finished: - break + with ngs.fasta.Fasta(fasta_path, 'r') as f_in: + fasta_iter = iter(f_in) + finished = False + for i in range(n): + fasta_part_path = get_temporary_filename(temp_dir) + index_part_path = f'{index_prefix}.{i}' + fastas.append(fasta_part_path) + indices.append(index_part_path) + + with ngs.fasta.Fasta(fasta_part_path, 'w') as f_out: + logger.debug(f'Writing {fasta_part_path}') + while f_out.tell() < size: + try: + entry = next(fasta_iter) + except StopIteration: + finished = True + break + f_out.write(entry) + + if finished: + break built = [] for fasta_part_path, index_part_path in zip(fastas, indices): @@ -266,6 +266,7 @@ def split_and_index(fasta_path, index_prefix, n=2, k=31, temp_dir='tmp'): return {'indices': built} +@logger.namespaced('download') def download_reference(reference, files, temp_dir='tmp', overwrite=False): """Downloads a provided reference file from a static url. @@ -288,7 +289,7 @@ def download_reference(reference, files, temp_dir='tmp', overwrite=False): :rtype: dict """ results = {} - if not any(os.path.exists(file) for file in files) or overwrite: + if not ngs.utils.all_exists(*list(files.values())) or overwrite: # Make sure all the required file paths are there. diff = set(reference.files.keys()) - set(files.keys()) if diff: @@ -299,14 +300,14 @@ def download_reference(reference, files, temp_dir='tmp', overwrite=False): url = reference.url path = os.path.join(temp_dir, os.path.basename(url)) - logging.info( + logger.info( 'Downloading files for {} from {} to {}'.format( reference.name, url, path ) ) local_path = download_file(url, path) - logging.info('Extracting files from {}'.format(local_path)) + logger.info('Extracting files from {}'.format(local_path)) with tarfile.open(local_path, 'r:gz') as f: f.extractall(temp_dir) @@ -344,6 +345,7 @@ def decompress_file(path, temp_dir='tmp'): return path +@logger.namespaced('ref') def ref( fasta_paths, gtf_paths, @@ -383,60 +385,47 @@ def ref( gtf_paths = [gtf_paths] results = {} - t2gs = [] cdnas = [] - for fasta_path, gtf_path in zip(fasta_paths, gtf_paths): - logger.info(f'Preparing {fasta_path}, {gtf_path}') - gtf_path = decompress_file(gtf_path, temp_dir=temp_dir) - t2g_result = create_t2g_from_gtf( - gtf_path, get_temporary_filename(temp_dir) - ) - t2gs.append(t2g_result['t2g']) - - if not glob.glob(f'{index_path}*') or overwrite: - fasta_path = decompress_file(fasta_path, temp_dir=temp_dir) - sorted_fasta_path, fasta_chromosomes = sort_fasta( - fasta_path, get_temporary_filename(temp_dir) - ) - sorted_gtf_path, gtf_chromosomes = sort_gtf( - gtf_path, get_temporary_filename(temp_dir) + if (not ngs.utils.all_exists(cdna_path, t2g_path)) or overwrite: + for fasta_path, gtf_path in zip(fasta_paths, gtf_paths): + logger.info(f'Preparing {fasta_path}, {gtf_path}') + # Parse GTF for gene and transcripts + gene_infos, transcript_infos = ngs.gtf.genes_and_transcripts_from_gtf( + gtf_path, use_version=True ) + # Split cdna_temp_path = get_temporary_filename(temp_dir) logger.info( f'Splitting genome {fasta_path} into cDNA at {cdna_temp_path}' ) - chromosomes = check_chromosomes(fasta_chromosomes, gtf_chromosomes) - cdna_fasta_path = generate_cdna_fasta( - sorted_fasta_path, - sorted_gtf_path, - cdna_temp_path, - chromosomes=chromosomes + cdna_temp_path = ngs.fasta.split_genomic_fasta_to_cdna( + fasta_path, cdna_temp_path, gene_infos, transcript_infos ) - cdnas.append(cdna_fasta_path) - - # Concatenate t2gs - logger.info( - f'Concatenating {len(t2gs)} transcript-to-gene mappings to {t2g_path}' - ) - t2g_path = concatenate_files(*t2gs, out_path=t2g_path, temp_dir=temp_dir) - results.update({'t2g': t2g_path}) + cdnas.append(cdna_temp_path) - # Concatenate cdnas and index - if not glob.glob(f'{index_path}*') or overwrite: logger.info(f'Concatenating {len(cdnas)} cDNAs to {cdna_path}') - cdna_fasta_path = concatenate_files( + cdna_path = concatenate_files( *cdnas, out_path=cdna_path, temp_dir=temp_dir ) + results.update({'cdna_fasta': cdna_path}) + else: + logger.info( + f'Skipping cDNA FASTA generation because {cdna_path} already exists. Use --overwrite flag to overwrite' + ) + + if not glob.glob(f'{index_path}*') or overwrite: + t2g_result = create_t2g_from_fasta(cdna_path, t2g_path) + results.update(t2g_result) if k and k != 31: logger.warning( f'Using provided k-mer length {k} instead of optimal length 31' ) index_result = split_and_index( - cdna_fasta_path, index_path, n=n, k=k or 31, temp_dir=temp_dir + cdna_path, index_path, n=n, k=k or 31, temp_dir=temp_dir ) if n > 1 else kallisto_index( - cdna_fasta_path, index_path, k=k or 31 + cdna_path, index_path, k=k or 31 ) results.update(index_result) else: @@ -448,6 +437,7 @@ def ref( return results +@logger.namespaced('ref_kite') def ref_kite( feature_path, fasta_path, @@ -514,6 +504,7 @@ def ref_kite( return results +@logger.namespaced('ref_lamanno') def ref_lamanno( fasta_paths, gtf_paths, @@ -571,53 +562,53 @@ def ref_lamanno( introns = [] cdna_t2cs = [] intron_t2cs = [] - if not glob.glob(f'{index_path}*') or overwrite: + if (not ngs.utils.all_exists(cdna_path, intron_path, t2g_path, + cdna_t2c_path, intron_t2c_path)) or overwrite: for fasta_path, gtf_path in zip(fasta_paths, gtf_paths): logger.info(f'Preparing {fasta_path}, {gtf_path}') - - fasta_path = decompress_file(fasta_path, temp_dir=temp_dir) - sorted_fasta_path, fasta_chromosomes = sort_fasta( - fasta_path, get_temporary_filename(temp_dir) - ) - gtf_path = decompress_file(gtf_path, temp_dir=temp_dir) - sorted_gtf_path, gtf_chromosomes = sort_gtf( - gtf_path, get_temporary_filename(temp_dir) + # Parse GTF for gene and transcripts + gene_infos, transcript_infos = ngs.gtf.genes_and_transcripts_from_gtf( + gtf_path, use_version=True ) + + # Split cDNA cdna_temp_path = get_temporary_filename(temp_dir) logger.info( f'Splitting genome {fasta_path} into cDNA at {cdna_temp_path}' ) - chromosomes = check_chromosomes(fasta_chromosomes, gtf_chromosomes) - cdna_fasta_path = generate_cdna_fasta( - sorted_fasta_path, - sorted_gtf_path, - cdna_temp_path, - chromosomes=chromosomes + cdna_temp_path = ngs.fasta.split_genomic_fasta_to_cdna( + fasta_path, cdna_temp_path, gene_infos, transcript_infos ) - cdnas.append(cdna_fasta_path) + cdnas.append(cdna_temp_path) + + # cDNA t2c cdna_t2c_temp_path = get_temporary_filename(temp_dir) logger.info( f'Creating cDNA transcripts-to-capture at {cdna_t2c_temp_path}' ) - cdna_t2c_result = create_t2c(cdna_fasta_path, cdna_t2c_temp_path) + cdna_t2c_result = create_t2c(cdna_temp_path, cdna_t2c_temp_path) cdna_t2cs.append(cdna_t2c_result['t2c']) + + # Split intron intron_temp_path = get_temporary_filename(temp_dir) logger.info(f'Splitting genome into introns at {intron_temp_path}') - intron_fasta_path = generate_intron_fasta( - sorted_fasta_path, - sorted_gtf_path, + intron_temp_path = ngs.fasta.split_genomic_fasta_to_intron( + fasta_path, intron_temp_path, - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=flank if flank is not None else k - 1 if k is not None else 30 ) - introns.append(intron_fasta_path) + introns.append(intron_temp_path) + + # intron t2c intron_t2c_temp_path = get_temporary_filename(temp_dir) logger.info( f'Creating intron transcripts-to-capture at {intron_t2c_temp_path}' ) intron_t2c_result = create_t2c( - intron_fasta_path, intron_t2c_temp_path + intron_temp_path, intron_t2c_temp_path ) intron_t2cs.append(intron_t2c_result['t2c']) @@ -651,6 +642,12 @@ def ref_lamanno( 'intron_t2c': intron_t2c_path }) + else: + logger.info( + 'Skipping cDNA and intron FASTA generation because files already exist. Use --overwrite flag to overwrite' + ) + + if not glob.glob(f'{index_path}*') or overwrite: # Concatenate cDNA and intron fastas to generate T2G and build index combined_path = get_temporary_filename(temp_dir) logger.info(f'Concatenating cDNA and intron FASTAs to {combined_path}') @@ -659,6 +656,7 @@ def ref_lamanno( ) t2g_result = create_t2g_from_fasta(combined_path, t2g_path) results.update(t2g_result) + if k and k != 31: logger.warning( f'Using provided k-mer length {k} instead of optimal length 31' diff --git a/kb_python/report.py b/kb_python/report.py old mode 100644 new mode 100755 index 26edfc0..3b89550 --- a/kb_python/report.py +++ b/kb_python/report.py @@ -1,4 +1,3 @@ -import logging import os import nbformat @@ -13,8 +12,6 @@ from .dry import dryable from .utils import get_temporary_filename -logger = logging.getLogger(__name__) - REPORT_DIR = os.path.join(PACKAGE_PATH, 'report') BASIC_TEMPLATE_PATH = os.path.join(REPORT_DIR, 'report_basic.ipynb') MATRIX_TEMPLATE_PATH = os.path.join(REPORT_DIR, 'report_matrix.ipynb') diff --git a/kb_python/stats.py b/kb_python/stats.py index 8c4de57..231ae37 100755 --- a/kb_python/stats.py +++ b/kb_python/stats.py @@ -40,16 +40,17 @@ def start(self): self.commands = [] self.workdir = os.getcwd() - # Import here to prevent circular imports - from .utils import get_bustools_version, get_kallisto_version - self.kallisto = { - 'path': get_kallisto_binary_path(), - 'version': '.'.join(str(i) for i in get_kallisto_version()) - } - self.bustools = { - 'path': get_bustools_binary_path(), - 'version': '.'.join(str(i) for i in get_bustools_version()) - } + if not is_dry(): + # Import here to prevent circular imports + from .utils import get_bustools_version, get_kallisto_version + self.kallisto = { + 'path': get_kallisto_binary_path(), + 'version': '.'.join(str(i) for i in get_kallisto_version()) + } + self.bustools = { + 'path': get_bustools_binary_path(), + 'version': '.'.join(str(i) for i in get_bustools_version()) + } def command(self, command, runtime=None): """Report a shell command was run. diff --git a/kb_python/utils.py b/kb_python/utils.py index 5915c8a..7ceb862 100755 --- a/kb_python/utils.py +++ b/kb_python/utils.py @@ -1,42 +1,47 @@ import concurrent.futures -import gzip import logging import os import re -import requests import shutil import subprocess as sp -import tempfile import threading import time from urllib.request import urlretrieve import anndata +import ngs_tools as ngs import pandas as pd import scipy.io from scipy import sparse from tqdm import tqdm from .config import ( - CHUNK_SIZE, get_bustools_binary_path, get_kallisto_binary_path, - MAP_DIR, - PACKAGE_PATH, PLATFORM, TECHNOLOGIES_MAPPING, - WHITELIST_DIR, UnsupportedOSException, ) from .dry import dryable from .dry import utils as dry_utils +from .logging import logger from .stats import STATS -logger = logging.getLogger(__name__) - TECHNOLOGY_PARSER = re.compile(r'^(?P\S+)') VERSION_PARSER = re.compile(r'^\S*? ([0-9]+).([0-9]+).([0-9]+)') +# These functions have been moved as of 0.26.1 to the ngs_tools library but are +# imported from this file in other places. For now, let's keep these here. +# TODO: remove these +open_as_text = ngs.utils.open_as_text +decompress_gzip = ngs.utils.decompress_gzip +compress_gzip = ngs.utils.compress_gzip +concatenate_files = ngs.utils.concatenate_files_as_text +download_file = ngs.utils.download_file +get_temporary_filename = dryable(dry_utils.get_temporary_filename)( + ngs.utils.mkstemp +) + class NotImplementedException(Exception): pass @@ -82,53 +87,6 @@ def update_filename(filename, code): return f'{name}.{code}{extension}' -def open_as_text(path, mode): - """Open a textfile or gzip file in text mode. - - :param path: path to textfile or gzip - :type path: str - :param mode: mode to open the file, either `w` for write or `r` for read - :type mode: str - - :return: file object - :rtype: file object - """ - return gzip.open(path, mode + - 't') if path.endswith('.gz') else open(path, mode) - - -def decompress_gzip(gzip_path, out_path): - """Decompress a gzip file to provided file path. - - :param gzip_path: path to gzip file - :type gzip_path: str - :param out_path: path to decompressed file - :type out_path: str - - :return: path to decompressed file - :rtype: str - """ - with gzip.open(gzip_path, 'rb') as f, open(out_path, 'wb') as out: - shutil.copyfileobj(f, out) - return out_path - - -def compress_gzip(file_path, out_path): - """Compress a file into gzip. - - :param file_path: path to file - :type file_path: str - :param out_dir: path to compressed file - :type out_dir: str - - :return: path to compressed file - :rtype: str - """ - with open(file_path, 'rb') as f, gzip.open(out_path, 'wb') as out: - shutil.copyfileobj(f, out) - return out_path - - @dryable(dry_utils.make_directory) def make_directory(path): """Quietly make the specified directory (and any subdirectories). @@ -322,7 +280,7 @@ def whitelist_provided(technology): """ upper = technology.upper() return upper in TECHNOLOGIES_MAPPING and TECHNOLOGIES_MAPPING[ - upper].whitelist_archive + upper].chemistry.has_whitelist @dryable(dry_utils.move_file) @@ -355,12 +313,10 @@ def copy_whitelist(technology, out_dir): :rtype: str """ technology = TECHNOLOGIES_MAPPING[technology.upper()] - archive_path = os.path.join( - PACKAGE_PATH, WHITELIST_DIR, technology.whitelist_archive - ) + archive_path = technology.chemistry.whitelist_path whitelist_path = os.path.join( out_dir, - os.path.splitext(technology.whitelist_archive)[0] + os.path.splitext(os.path.basename(archive_path))[0] ) with open_as_text(archive_path, 'r') as f, open(whitelist_path, 'w') as out: out.write(f.read()) @@ -380,70 +336,16 @@ def copy_map(technology, out_dir): :rtype: str """ technology = TECHNOLOGIES_MAPPING[technology.upper()] - archive_path = os.path.join(PACKAGE_PATH, MAP_DIR, technology.map_archive) + archive_path = technology.chemistry.whitelist_path map_path = os.path.join( out_dir, - os.path.splitext(technology.map_archive)[0] + os.path.splitext(os.path.basename(archive_path))[0] ) with open_as_text(archive_path, 'r') as f, open(map_path, 'w') as out: out.write(f.read()) return map_path -def concatenate_files(*paths, out_path, temp_dir='tmp'): - """Concatenates an arbitrary number of files into one TEXT file. - - Only supports text and gzip files. - - :param paths: an arbitrary number of paths to files - :type paths: str - :param out_path: path to place concatenated file - :type out_path: str - :param temp_dir: temporary directory, defaults to `tmp` - :type temp_dir: str, optional - - :return: path to concatenated file - :rtype: str - """ - with open(out_path, 'w') as out: - for path in paths: - with open_as_text(path, 'r') as f: - for line in f: - if not line.isspace(): - out.write(line.strip() + '\n') - - return out_path - - -def download_file(url, path): - """Download a remote file to the provided path while displaying a progress bar. - - :param url: remote url - :type url: str - :param path: local path to download the file to - :type path: str - - :return: path to downloaded file - :rtype: str - """ - logger.addHandler(TqdmLoggingHandler()) - r = requests.get(url, stream=True) - with open(path, 'wb') as f: - t = tqdm( - unit='B', - total=int(r.headers['Content-Length']), - unit_divisor=1024, - unit_scale=True, - ascii=True, - ) - for chunk in r.iter_content(chunk_size=CHUNK_SIZE): - if chunk: - t.update(len(chunk)) - f.write(chunk) - t.close() - return path - - @dryable(dry_utils.stream_file) def stream_file(url, path): """Creates a FIFO file to use for piping remote files into processes. @@ -475,24 +377,6 @@ def stream_file(url, path): return path -@dryable(dry_utils.get_temporary_filename) -def get_temporary_filename(temp_dir=None): - """Create a temporary file in the provided temprorary directory. - - The caller is responsible for deleting the file. - - :param temp_dir: path to temporary directory, defaults to `None` - :type temp_dir: str, optional - - :return: temporary filename - :rtype: str - """ - fp, path = tempfile.mkstemp(dir=temp_dir) - os.close(fp) - - return path - - def read_t2g(t2g_path): """Given a transcript-to-gene mapping path, read it into a dictionary. The first column is always assumed to tbe the transcript IDs. diff --git a/kb_python/validate.py b/kb_python/validate.py old mode 100644 new mode 100755 index 9416469..12efaa5 --- a/kb_python/validate.py +++ b/kb_python/validate.py @@ -1,15 +1,13 @@ import functools -import logging import os import re import scipy.io from .config import get_bustools_binary_path, is_dry, is_validate +from .logging import logger from .utils import run_executable -logger = logging.getLogger(__name__) - BUSTOOLS_INSPECT_PARSER = re.compile(r'^.*?(?P[0-9]+)') diff --git a/kb_python/whitelists/10xv1_whitelist.txt.gz b/kb_python/whitelists/10xv1_whitelist.txt.gz deleted file mode 100644 index 7e3a7c5..0000000 Binary files a/kb_python/whitelists/10xv1_whitelist.txt.gz and /dev/null differ diff --git a/kb_python/whitelists/10xv2_whitelist.txt.gz b/kb_python/whitelists/10xv2_whitelist.txt.gz deleted file mode 100644 index fbc9759..0000000 Binary files a/kb_python/whitelists/10xv2_whitelist.txt.gz and /dev/null differ diff --git a/kb_python/whitelists/10xv3_whitelist.txt.gz b/kb_python/whitelists/10xv3_whitelist.txt.gz deleted file mode 100644 index 840c3e6..0000000 Binary files a/kb_python/whitelists/10xv3_whitelist.txt.gz and /dev/null differ diff --git a/kb_python/whitelists/inDropsv3_whitelist.txt.gz b/kb_python/whitelists/inDropsv3_whitelist.txt.gz deleted file mode 100644 index 1f66162..0000000 Binary files a/kb_python/whitelists/inDropsv3_whitelist.txt.gz and /dev/null differ diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index f85a650..212f3fc --- a/requirements.txt +++ b/requirements.txt @@ -4,9 +4,10 @@ Jinja2>2.10.1 loompy>=3.0.6 nbconvert>=5.6.0 nbformat>=4.4.0 +ngs-tools>=1.3.0 numpy>=1.17.2 +pandas>=1.0.0 plotly>=4.5.0 -requests>=2.19.0 scanpy>=1.4.4.post1 scikit-learn>=0.21.3 tqdm>=4.39.0 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 index e69de29..5c41a56 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +from kb_python.logging import logger +logger.setLevel(9999) diff --git a/tests/dry/test_utils.py b/tests/dry/test_utils.py old mode 100644 new mode 100755 index a675173..3f2c199 --- a/tests/dry/test_utils.py +++ b/tests/dry/test_utils.py @@ -69,7 +69,7 @@ def test_move_file_windows(self): def test_copy_whitelist(self): with mock.patch('kb_python.dry.utils.print') as p: self.assertEquals( - 'path/10xv2_whitelist.txt', + 'path/10x_version2_whitelist.txt', utils.copy_whitelist('10xv2', 'path') ) p.assert_called() @@ -77,7 +77,7 @@ def test_copy_whitelist(self): def test_copy_map(self): with mock.patch('kb_python.dry.utils.print') as p: self.assertEqual( - 'path/10xv3_feature_barcode_map.txt', + 'path/10x_version3_feature_map.txt', utils.copy_map('10xv3', 'path') ) p.assert_called() diff --git a/tests/test_fasta.py b/tests/test_fasta.py deleted file mode 100755 index 7fe7a12..0000000 --- a/tests/test_fasta.py +++ /dev/null @@ -1,218 +0,0 @@ -import os -import uuid -from unittest import mock, TestCase - -import kb_python.fasta as fasta -from tests.mixins import TestMixin - - -class TestFASTA(TestMixin, TestCase): - - def test_make_header(self): - seq_id = 'TRANSCRIPT_ID' - attributes = [ - ('gene_id', 'GENE_ID'), - ('gene_name', 'GENE_NAME'), - ] - self.assertEqual( - '>TRANSCRIPT_ID gene_id:GENE_ID gene_name:GENE_NAME', - fasta.FASTA.make_header(seq_id, attributes) - ) - - def test_parse_header(self): - header = '>transcript_id TEST:testing' - self.assertEqual({ - 'sequence_id': 'transcript_id', - 'group': { - 'TEST': 'testing' - } - }, fasta.FASTA.parse_header(header)) - - def test_reverse_complement(self): - sequence = 'ATCG' - self.assertEqual('CGAT', fasta.FASTA.reverse_complement(sequence)) - - def test_entries(self): - fa = fasta.FASTA(self.unsorted_fasta_path) - self.assertEqual(2, len(list(fa.entries()))) - - def test_sort(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fa = fasta.FASTA(self.unsorted_fasta_path) - self.assertEqual((out_path, {'1', '2'}), fa.sort(out_path)) - - with open(out_path, 'r') as f, open(self.sorted_fasta_path, - 'r') as sorted: - self.assertEqual(f.read(), sorted.read()) - - def test_generate_kite_fasta(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - (out_path, 15), - fasta.generate_kite_fasta(self.kite_feature_path, out_path) - ) - with open(out_path, 'r') as f, open(self.kite_fasta_path, 'r') as fa: - self.assertEqual(fa.read(), f.read()) - - def test_generate_kite_fasta_no_mismatches(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual((out_path, 15), - fasta.generate_kite_fasta( - self.kite_feature_path, - out_path, - no_mismatches=True - )) - with open(out_path, 'r') as f, open(self.kite_no_mismatches_fasta_path, - 'r') as fa: - self.assertEqual(fa.read(), f.read()) - - def test_generate_kite_fasta_different_length(self): - with mock.patch('kb_python.fasta.logger.warning') as warning: - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual((out_path, 14), - fasta.generate_kite_fasta( - self.kite_different_feature_path, out_path - )) - warning.assert_called_once() - with open(out_path, 'r') as f, open(self.kite_different_fasta_path, - 'r') as fa: - self.assertEqual(fa.read(), f.read()) - - def test_generate_kite_fasta_duplicate(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_kite_fasta( - self.kite_duplicate_feature_path, out_path - ) - - def test_generate_kite_fasta_wrong_order(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_kite_fasta(self.kite_order_feature_path, out_path) - - def test_generate_kite_fasta_collision(self): - with mock.patch('kb_python.fasta.logger.warning') as warning: - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual((out_path, 15), - fasta.generate_kite_fasta( - self.kite_collision_feature_path, out_path - )) - warning.assert_called_once() - with open(out_path, 'r') as f, open(self.kite_collision_fasta_path, - 'r') as fa: - self.assertEqual(fa.read(), f.read()) - - def test_generate_cdna_fasta(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - out_path, - fasta.generate_cdna_fasta( - self.sorted_fasta_path, self.sorted_gtf_path, out_path - ) - ) - with open(out_path, 'r') as f, open(self.split_cdna_fasta_path, - 'r') as split: - self.assertEqual(f.read(), split.read()) - - def test_generate_cdna_fasta_with_space(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - out_path, - fasta.generate_cdna_fasta( - self.sorted_fasta_path, self.sorted_gtf_with_space_path, - out_path - ) - ) - with open(out_path, - 'r') as f, open(self.split_cdna_fasta_with_space_path, - 'r') as split: - self.assertEqual(f.read(), split.read()) - - def test_generate_cdna_fasta_no_exon(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_cdna_fasta( - self.sorted_fasta_path, self.gtf_no_exon, out_path - ) - - def test_generate_cdna_fasta_no_transcript(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_cdna_fasta( - self.sorted_fasta_path, self.gtf_no_transcript, out_path - ) - - def test_generate_cdna_fasta_with_chromosomes(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - out_path, - fasta.generate_cdna_fasta( - self.sorted_fasta_path, - self.sorted_gtf_path, - out_path, - chromosomes={'2'} - ) - ) - with open(out_path, 'r') as f, open(self.partial_cdna_fasta_path, - 'r') as split: - self.assertEqual(f.read(), split.read()) - - def test_generate_intron_fasta(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - out_path, - fasta.generate_intron_fasta( - self.sorted_fasta_path, self.sorted_gtf_path, out_path, flank=1 - ) - ) - with open(out_path, 'r') as f, open(self.split_intron_fasta_path, - 'r') as split: - self.assertEqual(f.read(), split.read()) - - def test_generate_intron_fasta_with_chromosomes(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - self.assertEqual( - out_path, - fasta.generate_intron_fasta( - self.sorted_fasta_path, - self.sorted_gtf_path, - out_path, - chromosomes={'1'}, - flank=1 - ) - ) - with open(out_path, 'r') as f, open(self.partial_intron_fasta_path, - 'r') as split: - self.assertEqual(f.read(), split.read()) - - def test_generate_intron_fasta_no_exon(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_intron_fasta( - self.sorted_fasta_path, self.gtf_no_exon, out_path - ) - - def test_generate_intron_fasta_no_transcript(self): - with self.assertRaises(Exception): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_intron_fasta( - self.sorted_fasta_path, self.gtf_no_transcript, out_path - ) - - def test_generate_spliced_fasta(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_spliced_fasta( - self.sorted_fasta_path, self.sorted_gtf_path, out_path - ) - - def test_generate_unspliced_fasta(self): - out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) - fasta.generate_unspliced_fasta( - self.sorted_fasta_path, self.sorted_gtf_path, out_path - ) - - def test_complement(self): - # check for each code and its complement, - # if the complement's complement is the code. - for key, value in fasta.FASTA.COMPLEMENT.items(): - assert key == fasta.FASTA.COMPLEMENT[value] diff --git a/tests/test_gtf.py b/tests/test_gtf.py deleted file mode 100755 index ebf6246..0000000 --- a/tests/test_gtf.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import uuid -from unittest import TestCase - -from kb_python.gtf import GTF -from tests.mixins import TestMixin - - -class TestGTF(TestMixin, TestCase): - - def test_parser(self): - line = '2\thavana\tgene\t2\t3\t.\t+\t.\tgene_id "ENSMUSG00000102693"; gene_version "1"; gene_name "4933401J01Rik"; gene_source "havana"; gene_biotype "TEC";' # noqa - match = GTF.PARSER.match(line) - self.assertIsNotNone(match) - self.assertEqual( - { - 'seqname': - '2', - 'feature': - 'gene', - 'start': - '2', - 'end': - '3', - 'strand': - '+', - 'group': - 'gene_id "ENSMUSG00000102693"; gene_version "1"; gene_name "4933401J01Rik"; gene_source "havana"; gene_biotype "TEC";' # noqa - }, - match.groupdict() - ) - - def test_group_parser(self): - group = 'gene_id "ENSMUSG00000102693";' - match = GTF.GROUP_PARSER.match(group) - self.assertIsNotNone(match) - self.assertEqual({ - 'key': 'gene_id', - 'value': 'ENSMUSG00000102693' - }, match.groupdict()) - - def test_parse_entry(self): - line = '2\thavana\tgene\t2\t3\t.\t+\t.\tgene_id "ENSMUSG00000102693"; gene_version "1"; gene_name "4933401J01Rik"; gene_source "havana"; gene_biotype "TEC";' # noqa - self.assertEqual({ - 'seqname': '2', - 'feature': 'gene', - 'start': 2, - 'end': 3, - 'strand': '+', - 'group': { - 'gene_id': 'ENSMUSG00000102693', - 'gene_version': '1', - 'gene_name': '4933401J01Rik', - 'gene_source': 'havana', - 'gene_biotype': 'TEC' - } - }, GTF.parse_entry(line)) - - def test_parse_entry_with_space(self): - line = '2\thavana\tgene\t2\t3\t.\t+\t.\tgene_id "ENSMUSG00000102693 [A]"; gene_version "1"; gene_name "4933401J01Rik"; gene_source "havana"; gene_biotype "TEC";' # noqa - self.assertEqual({ - 'seqname': '2', - 'feature': 'gene', - 'start': 2, - 'end': 3, - 'strand': '+', - 'group': { - 'gene_id': 'ENSMUSG00000102693[A]', - 'gene_version': '1', - 'gene_name': '4933401J01Rik', - 'gene_source': 'havana', - 'gene_biotype': 'TEC' - } - }, GTF.parse_entry(line)) - - def test_entries(self): - gtf = GTF(self.unsorted_gtf_path) - self.assertEqual(8, len(list(gtf.entries()))) - - def test_sort(self): - out_path = os.path.join(self.temp_dir, '{}.gtf'.format(uuid.uuid4())) - gtf = GTF(self.unsorted_gtf_path) - self.assertEqual((out_path, {'1', '2'}), gtf.sort(out_path)) - - with open(out_path, 'r') as f, open(self.sorted_gtf_path, - 'r') as sorted: - self.assertEqual(f.read(), sorted.read()) diff --git a/tests/test_ref.py b/tests/test_ref.py index eb70350..cf23932 100755 --- a/tests/test_ref.py +++ b/tests/test_ref.py @@ -11,63 +11,60 @@ class TestRef(TestMixin, TestCase): - def test_sort_gtf(self): - with mock.patch('kb_python.ref.GTF') as GTF: - out_path = 'test' - self.assertEqual( - GTF().sort.return_value, - ref.sort_gtf(self.unsorted_gtf_path, out_path) - ) - GTF().sort.assert_called_once_with(out_path) - - def test_sort_fasta(self): - with mock.patch('kb_python.ref.FASTA') as FASTA: - out_path = 'test' - self.assertEqual( - FASTA().sort.return_value, - ref.sort_fasta(self.unsorted_fasta_path, out_path) - ) - FASTA().sort.assert_called_once_with(out_path) + def test_generate_kite_fasta(self): + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + self.assertEqual( + (out_path, 15), + ref.generate_kite_fasta(self.kite_feature_path, out_path) + ) + with open(out_path, 'r') as f, open(self.kite_fasta_path, 'r') as fa: + self.assertEqual(fa.read(), f.read()) - def test_check_chromosomes(self): - with mock.patch('kb_python.ref.logger.warning') as warning: - fasta_chromosomes = gtf_chromosomes = chromosomes = {'1', '2'} - self.assertEqual( - chromosomes, - ref.check_chromosomes(fasta_chromosomes, gtf_chromosomes) - ) - warning.assert_not_called() + def test_generate_kite_fasta_no_mismatches(self): + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + self.assertEqual((out_path, 15), + ref.generate_kite_fasta( + self.kite_feature_path, + out_path, + no_mismatches=True + )) + with open(out_path, 'r') as f, open(self.kite_no_mismatches_fasta_path, + 'r') as fa: + self.assertEqual(fa.read(), f.read()) - def test_check_chromosomes_fasta_unique(self): + def test_generate_kite_fasta_different_length(self): with mock.patch('kb_python.ref.logger.warning') as warning: - fasta_chromosomes = {'1', '2'} - gtf_chromosomes = chromosomes = {'2'} - self.assertEqual( - chromosomes, - ref.check_chromosomes(fasta_chromosomes, gtf_chromosomes) - ) + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + self.assertEqual((out_path, 14), + ref.generate_kite_fasta( + self.kite_different_feature_path, out_path + )) warning.assert_called_once() + with open(out_path, 'r') as f, open(self.kite_different_fasta_path, + 'r') as fa: + self.assertEqual(fa.read(), f.read()) - def test_check_chromosomes_gtf_unique(self): - with mock.patch('kb_python.ref.logger.warning') as warning: - fasta_chromosomes = chromosomes = {'2'} - gtf_chromosomes = {'1', '2'} - self.assertEqual( - chromosomes, - ref.check_chromosomes(fasta_chromosomes, gtf_chromosomes) - ) - warning.assert_called_once() + def test_generate_kite_fasta_duplicate(self): + with self.assertRaises(Exception): + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + ref.generate_kite_fasta(self.kite_duplicate_feature_path, out_path) - def test_check_chromosomes_both_unique(self): + def test_generate_kite_fasta_wrong_order(self): + with self.assertRaises(Exception): + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + ref.generate_kite_fasta(self.kite_order_feature_path, out_path) + + def test_generate_kite_fasta_collision(self): with mock.patch('kb_python.ref.logger.warning') as warning: - fasta_chromosomes = {'1'} - gtf_chromosomes = {'2'} - chromosomes = set() - self.assertEqual( - chromosomes, - ref.check_chromosomes(fasta_chromosomes, gtf_chromosomes) - ) - self.assertEqual(2, warning.call_count) + out_path = os.path.join(self.temp_dir, '{}.fa'.format(uuid.uuid4())) + self.assertEqual((out_path, 15), + ref.generate_kite_fasta( + self.kite_collision_feature_path, out_path + )) + warning.assert_called_once() + with open(out_path, 'r') as f, open(self.kite_collision_fasta_path, + 'r') as fa: + self.assertEqual(fa.read(), f.read()) def test_kallisto_index(self): index_path = os.path.join(self.temp_dir, '{}.idx'.format(uuid.uuid4())) @@ -124,30 +121,6 @@ def test_create_t2g_from_fasta_kite(self): 'r') as t2g: self.assertEqual(f.read(), t2g.read()) - def test_create_t2g_from_gtf(self): - t2g_path = os.path.join(self.temp_dir, '{}.txt'.format(uuid.uuid4())) - result = ref.create_t2g_from_gtf(self.unsorted_gtf_path, t2g_path) - with open(result['t2g'], 'r') as f, open(self.gtf_t2g_path, 'r') as t2g: - self.assertEqual(f.read(), t2g.read()) - - def test_create_t2g_from_gtf_with_space(self): - t2g_path = os.path.join(self.temp_dir, '{}.txt'.format(uuid.uuid4())) - result = ref.create_t2g_from_gtf( - self.unsorted_gtf_with_space_path, t2g_path - ) - with open(result['t2g'], 'r') as f, open(self.gtf_t2g_with_space_path, - 'r') as t2g: - self.assertEqual(f.read(), t2g.read()) - - def test_create_t2g_from_gtf_with_intron(self): - t2g_path = os.path.join(self.temp_dir, '{}.txt'.format(uuid.uuid4())) - result = ref.create_t2g_from_gtf( - self.unsorted_gtf_path, t2g_path, intron=True - ) - with open(result['t2g'], 'r') as f, open(self.gtf_t2g_intron_path, - 'r') as t2g: - self.assertEqual(f.read(), t2g.read()) - def test_create_t2c(self): t2c_path = os.path.join(self.temp_dir, '{}.txt'.format(uuid.uuid4())) result = ref.create_t2c(self.unsorted_fasta_path, t2c_path) @@ -277,36 +250,32 @@ def test_decompress_file_gzip(self): def test_ref(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ - mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() - chromosomes = {'1', '2'} - get_temporary_filename.side_effect = ['t2g', 'fasta', 'gtf', 'cdna'] - decompress_file.side_effect = [self.gtf_path, self.fasta_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - concatenate_files.side_effect = [t2g_path, cdna_fasta_path] + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path kallisto_index.return_value = {'index': index_path} - create_t2g_from_gtf.return_value = {'t2g': 't2g'} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} self.assertEqual({ 't2g': t2g_path, 'index': index_path, + 'cdna_fasta': cdna_fasta_path, }, ref.ref( self.fasta_path, @@ -316,26 +285,21 @@ def test_ref(self): t2g_path, temp_dir=temp_dir )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.gtf_path, temp_dir=temp_dir), - call(self.fasta_path, temp_dir=temp_dir) - ]) - create_t2g_from_gtf.assert_called_once_with(self.gtf_path, 't2g') - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) + create_t2g_from_fasta.assert_called_once_with( + cdna_fasta_path, t2g_path + ) + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', - chromosomes=chromosomes + gene_infos, + transcript_infos, + ) + concatenate_files.assert_called_once_with( + cdna_fasta_path, out_path=cdna_fasta_path, temp_dir=temp_dir ) - self.assertEqual(2, concatenate_files.call_count) - concatenate_files.assert_has_calls([ - call('t2g', out_path=t2g_path, temp_dir=temp_dir), - call('cdna', out_path=cdna_fasta_path, temp_dir=temp_dir) - ]) kallisto_index.assert_called_once_with( cdna_fasta_path, index_path, k=31 ) @@ -343,36 +307,32 @@ def test_ref(self): def test_ref_split(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ - mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() - chromosomes = {'1', '2'} - get_temporary_filename.side_effect = ['t2g', 'fasta', 'gtf', 'cdna'] - decompress_file.side_effect = [self.gtf_path, self.fasta_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - concatenate_files.side_effect = [t2g_path, cdna_fasta_path] + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path split_and_index.return_value = {'indices': ['index.0', 'index.1']} - create_t2g_from_gtf.return_value = {'t2g': 't2g'} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} self.assertEqual({ 't2g': t2g_path, 'indices': ['index.0', 'index.1'], + 'cdna_fasta': cdna_fasta_path, }, ref.ref( self.fasta_path, @@ -383,63 +343,55 @@ def test_ref_split(self): n=2, temp_dir=temp_dir )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.gtf_path, temp_dir=temp_dir), - call(self.fasta_path, temp_dir=temp_dir) - ]) - create_t2g_from_gtf.assert_called_once_with(self.gtf_path, 't2g') - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) + create_t2g_from_fasta.assert_called_once_with( + cdna_fasta_path, t2g_path + ) + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', - chromosomes=chromosomes + gene_infos, + transcript_infos, ) - self.assertEqual(2, concatenate_files.call_count) - concatenate_files.assert_has_calls([ - call('t2g', out_path=t2g_path, temp_dir=temp_dir), - call('cdna', out_path=cdna_fasta_path, temp_dir=temp_dir) - ]) + concatenate_files.assert_called_once_with( + cdna_fasta_path, out_path=cdna_fasta_path, temp_dir=temp_dir + ) + kallisto_index.assert_not_called() split_and_index.assert_called_once_with( cdna_fasta_path, index_path, k=31, n=2, temp_dir=temp_dir ) - kallisto_index.assert_not_called() def test_ref_override_k(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ - mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] - k = 999 + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir + k = 999 index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() - chromosomes = {'1', '2'} - get_temporary_filename.side_effect = ['t2g', 'fasta', 'gtf', 'cdna'] - decompress_file.side_effect = [self.gtf_path, self.fasta_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - concatenate_files.side_effect = [t2g_path, cdna_fasta_path] + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path kallisto_index.return_value = {'index': index_path} - create_t2g_from_gtf.return_value = {'t2g': 't2g'} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} self.assertEqual({ 't2g': t2g_path, 'index': index_path, + 'cdna_fasta': cdna_fasta_path, }, ref.ref( self.fasta_path, @@ -450,52 +402,54 @@ def test_ref_override_k(self): k=k, temp_dir=temp_dir )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.gtf_path, temp_dir=temp_dir), - call(self.fasta_path, temp_dir=temp_dir) - ]) - create_t2g_from_gtf.assert_called_once_with(self.gtf_path, 't2g') - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) + create_t2g_from_fasta.assert_called_once_with( + cdna_fasta_path, t2g_path + ) + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', - chromosomes=chromosomes + gene_infos, + transcript_infos, + ) + concatenate_files.assert_called_once_with( + cdna_fasta_path, out_path=cdna_fasta_path, temp_dir=temp_dir ) - self.assertEqual(2, concatenate_files.call_count) - concatenate_files.assert_has_calls([ - call('t2g', out_path=t2g_path, temp_dir=temp_dir), - call('cdna', out_path=cdna_fasta_path, temp_dir=temp_dir) - ]) kallisto_index.assert_called_once_with( cdna_fasta_path, index_path, k=k ) + split_and_index.assert_not_called() def test_ref_exists(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ - mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = ['index'] - cdna_fasta_path = mock.MagicMock() + mock.patch('kb_python.ref.glob.glob', return_value=[]): + temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - temp_dir = mock.MagicMock() - get_temporary_filename.side_effect = ['t2g', 'fasta', 'gtf', 'cdna'] - decompress_file.return_value = self.gtf_path + cdna_fasta_path = mock.MagicMock() + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path kallisto_index.return_value = {'index': index_path} - create_t2g_from_gtf.return_value = {'t2g': 't2g'} - concatenate_files.return_value = t2g_path - self.assertEqual({'t2g': t2g_path}, + create_t2g_from_fasta.return_value = {'t2g': t2g_path} + self.assertEqual({ + 'index': index_path, + 't2g': t2g_path + }, ref.ref( self.fasta_path, self.gtf_path, @@ -504,50 +458,85 @@ def test_ref_exists(self): t2g_path, temp_dir=temp_dir )) - decompress_file.assert_called_once_with( - self.gtf_path, temp_dir=temp_dir + genes_and_transcripts_from_gtf.assert_not_called() + split_genomic_fasta_to_cdna.assert_not_called() + concatenate_files.assert_not_called() + create_t2g_from_fasta.assert_called_once_with( + cdna_fasta_path, t2g_path ) - create_t2g_from_gtf.assert_called_once_with(self.gtf_path, 't2g') - concatenate_files.assert_called_once_with( - 't2g', out_path=t2g_path, temp_dir=temp_dir + kallisto_index.assert_called_once_with( + cdna_fasta_path, index_path, k=31 ) - sort_fasta.assert_not_called() - sort_gtf.assert_not_called() - check_chromosomes.assert_not_called() - generate_cdna_fasta.assert_not_called() + split_and_index.assert_not_called() + + def test_ref_exists2(self): + with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ + mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ + mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ + mock.patch('kb_python.ref.glob.glob', return_value=['a']): + temp_dir = self.temp_dir + index_path = mock.MagicMock() + t2g_path = mock.MagicMock() + cdna_fasta_path = mock.MagicMock() + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path + kallisto_index.return_value = {'index': index_path} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} + self.assertEqual({}, + ref.ref( + self.fasta_path, + self.gtf_path, + cdna_fasta_path, + index_path, + t2g_path, + temp_dir=temp_dir + )) + genes_and_transcripts_from_gtf.assert_not_called() + create_t2g_from_fasta.assert_not_called() + split_genomic_fasta_to_cdna.assert_not_called() + concatenate_files.assert_not_called() kallisto_index.assert_not_called() + split_and_index.assert_not_called() def test_ref_overwrite(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ - mock.patch('kb_python.ref.create_t2g_from_gtf') as create_t2g_from_gtf,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = ['index'] + mock.patch('kb_python.ref.glob.glob', return_value=['a']): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() - chromosomes = {'1', '2'} - get_temporary_filename.side_effect = ['t2g', 'fasta', 'gtf', 'cdna'] - decompress_file.side_effect = [self.gtf_path, self.fasta_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - concatenate_files.side_effect = [t2g_path, cdna_fasta_path] + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'cdna' + split_genomic_fasta_to_cdna.return_value = cdna_fasta_path + concatenate_files.return_value = cdna_fasta_path kallisto_index.return_value = {'index': index_path} - create_t2g_from_gtf.return_value = {'t2g': 't2g'} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} self.assertEqual({ 't2g': t2g_path, 'index': index_path, + 'cdna_fasta': cdna_fasta_path, }, ref.ref( self.fasta_path, @@ -558,29 +547,25 @@ def test_ref_overwrite(self): temp_dir=temp_dir, overwrite=True )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.gtf_path, temp_dir=temp_dir), - call(self.fasta_path, temp_dir=temp_dir) - ]) - create_t2g_from_gtf.assert_called_once_with(self.gtf_path, 't2g') - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) + create_t2g_from_fasta.assert_called_once_with( + cdna_fasta_path, t2g_path + ) + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', - chromosomes=chromosomes + gene_infos, + transcript_infos, + ) + concatenate_files.assert_called_once_with( + cdna_fasta_path, out_path=cdna_fasta_path, temp_dir=temp_dir ) - self.assertEqual(2, concatenate_files.call_count) - concatenate_files.assert_has_calls([ - call('t2g', out_path=t2g_path, temp_dir=temp_dir), - call('cdna', out_path=cdna_fasta_path, temp_dir=temp_dir) - ]) kallisto_index.assert_called_once_with( cdna_fasta_path, index_path, k=31 ) + split_and_index.assert_not_called() def test_ref_kite_odd(self): with mock.patch('kb_python.ref.decompress_file') as decompress_file,\ @@ -820,40 +805,34 @@ def test_ref_kite_overwrite(self): def test_ref_lamanno(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() combined_path = mock.MagicMock() - chromosomes = {'1', '2'} + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) get_temporary_filename.side_effect = [ - 'fasta', 'gtf', 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', - 'combined' + 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', 'combined' ] - decompress_file.side_effect = [self.fasta_path, self.gtf_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - generate_intron_fasta.return_value = 'intron' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.return_value = {'index': index_path} create_t2g_from_fasta.return_value = {'t2g': t2g_path} create_t2c.side_effect = [{ @@ -884,28 +863,20 @@ def test_ref_lamanno(self): intron_t2c_path, temp_dir=temp_dir )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.fasta_path, temp_dir=temp_dir), - call(self.gtf_path, temp_dir=temp_dir), - ]) + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) create_t2g_from_fasta.assert_called_once_with( combined_path, t2g_path ) - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, - 'cdna', - chromosomes=chromosomes + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', gene_infos, transcript_infos ) - generate_intron_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + split_genomic_fasta_to_intron.assert_called_once_with( + self.fasta_path, 'intron', - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=30 ) self.assertEqual(2, create_t2c.call_count) @@ -933,39 +904,34 @@ def test_ref_lamanno(self): def test_ref_lamanno_split_2(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir index_path = 'index' t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() combined_path = mock.MagicMock() - chromosomes = {'1', '2'} + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) get_temporary_filename.side_effect = [ - 'fasta', 'gtf', 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', - 'combined' + 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', 'combined' ] - decompress_file.side_effect = [self.fasta_path, self.gtf_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - generate_intron_fasta.return_value = 'intron' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.side_effect = [{ 'index': 'index_cdna' }, { @@ -998,31 +964,23 @@ def test_ref_lamanno_split_2(self): t2g_path, cdna_t2c_path, intron_t2c_path, - n=2, - temp_dir=temp_dir + temp_dir=temp_dir, + n=2 )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.fasta_path, temp_dir=temp_dir), - call(self.gtf_path, temp_dir=temp_dir), - ]) + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) create_t2g_from_fasta.assert_called_once_with( combined_path, t2g_path ) - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, - 'cdna', - chromosomes=chromosomes + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', gene_infos, transcript_infos ) - generate_intron_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + split_genomic_fasta_to_intron.assert_called_once_with( + self.fasta_path, 'intron', - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=30 ) self.assertEqual(2, create_t2c.call_count) @@ -1046,45 +1004,40 @@ def test_ref_lamanno_split_2(self): self.assertEqual(2, kallisto_index.call_count) kallisto_index.assert_has_calls([ call(cdna_fasta_path, 'index_cdna', k=31), - call(intron_fasta_path, 'index_intron', k=31), + call(intron_fasta_path, 'index_intron', k=31) ]) + split_and_index.assert_not_called() def test_ref_lamanno_split_3(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ - mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ mock.patch('kb_python.ref.split_and_index') as split_and_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] + mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir index_path = 'index' t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() combined_path = mock.MagicMock() - chromosomes = {'1', '2'} + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) get_temporary_filename.side_effect = [ - 'fasta', 'gtf', 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', - 'combined' + 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', 'combined' ] - decompress_file.side_effect = [self.fasta_path, self.gtf_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - generate_intron_fasta.return_value = 'intron' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.return_value = {'index': 'index_cdna'} split_and_index.return_value = { 'indices': ['index_intron.0', 'index_intron.1'] @@ -1116,31 +1069,23 @@ def test_ref_lamanno_split_3(self): t2g_path, cdna_t2c_path, intron_t2c_path, - n=3, - temp_dir=temp_dir + temp_dir=temp_dir, + n=3 )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.fasta_path, temp_dir=temp_dir), - call(self.gtf_path, temp_dir=temp_dir), - ]) + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) create_t2g_from_fasta.assert_called_once_with( combined_path, t2g_path ) - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, - 'cdna', - chromosomes=chromosomes + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', gene_infos, transcript_infos ) - generate_intron_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + split_genomic_fasta_to_intron.assert_called_once_with( + self.fasta_path, 'intron', - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=30 ) self.assertEqual(2, create_t2c.call_count) @@ -1170,40 +1115,35 @@ def test_ref_lamanno_split_3(self): def test_ref_lamanno_override_k(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=False),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = [] - k = 999 + mock.patch('kb_python.ref.glob.glob', return_value=[]): temp_dir = self.temp_dir + k = 999 index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() combined_path = mock.MagicMock() - chromosomes = {'1', '2'} + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) get_temporary_filename.side_effect = [ - 'fasta', 'gtf', 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', - 'combined' + 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', 'combined' ] - decompress_file.side_effect = [self.fasta_path, self.gtf_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - generate_intron_fasta.return_value = 'intron' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.return_value = {'index': index_path} create_t2g_from_fasta.return_value = {'t2g': t2g_path} create_t2c.side_effect = [{ @@ -1235,28 +1175,20 @@ def test_ref_lamanno_override_k(self): k=k, temp_dir=temp_dir )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.fasta_path, temp_dir=temp_dir), - call(self.gtf_path, temp_dir=temp_dir), - ]) + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) create_t2g_from_fasta.assert_called_once_with( combined_path, t2g_path ) - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, - 'cdna', - chromosomes=chromosomes + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', gene_infos, transcript_infos ) - generate_intron_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + split_genomic_fasta_to_intron.assert_called_once_with( + self.fasta_path, 'intron', - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=k - 1 ) self.assertEqual(2, create_t2c.call_count) @@ -1280,21 +1212,89 @@ def test_ref_lamanno_override_k(self): kallisto_index.assert_called_once_with( combined_path, index_path, k=k ) + split_and_index.assert_not_called() def test_ref_lamanno_exists(self): - with mock.patch('kb_python.ref.get_temporary_filename'),\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ + with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = ['index'] + mock.patch('kb_python.ref.glob.glob', return_value=[]): + temp_dir = self.temp_dir + index_path = mock.MagicMock() + t2g_path = mock.MagicMock() + cdna_fasta_path = mock.MagicMock() + intron_fasta_path = mock.MagicMock() + cdna_t2c_path = mock.MagicMock() + intron_t2c_path = mock.MagicMock() + combined_path = mock.MagicMock() + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'combined' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' + kallisto_index.return_value = {'index': index_path} + create_t2g_from_fasta.return_value = {'t2g': t2g_path} + create_t2c.side_effect = [{ + 't2c': 'cdna_t2c' + }, { + 't2c': 'intron_t2c' + }] + concatenate_files.return_value = combined_path + self.assertEqual({ + 't2g': t2g_path, + 'index': index_path, + }, + ref.ref_lamanno( + self.fasta_path, + self.gtf_path, + cdna_fasta_path, + intron_fasta_path, + index_path, + t2g_path, + cdna_t2c_path, + intron_t2c_path, + temp_dir=temp_dir + )) + genes_and_transcripts_from_gtf.assert_not_called() + create_t2g_from_fasta.assert_called_once_with( + combined_path, t2g_path + ) + split_genomic_fasta_to_cdna.assert_not_called() + split_genomic_fasta_to_intron.assert_not_called() + create_t2c.assert_not_called() + concatenate_files.assert_called_once_with( + cdna_fasta_path, + intron_fasta_path, + out_path='combined', + temp_dir=temp_dir + ) + kallisto_index.assert_called_once_with( + combined_path, index_path, k=31 + ) + split_and_index.assert_not_called() + + def test_ref_lamanno_exists2(self): + with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ + mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ + mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ + mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ + mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ + mock.patch('kb_python.ref.glob.glob', return_value=['a']): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() @@ -1302,8 +1302,23 @@ def test_ref_lamanno_exists(self): intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() + combined_path = mock.MagicMock() + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) + get_temporary_filename.return_value = 'combined' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.return_value = {'index': index_path} create_t2g_from_fasta.return_value = {'t2g': t2g_path} + create_t2c.side_effect = [{ + 't2c': 'cdna_t2c' + }, { + 't2c': 'intron_t2c' + }] + concatenate_files.return_value = combined_path self.assertEqual({}, ref.ref_lamanno( self.fasta_path, @@ -1316,52 +1331,45 @@ def test_ref_lamanno_exists(self): intron_t2c_path, temp_dir=temp_dir )) - decompress_file.assert_not_called() + genes_and_transcripts_from_gtf.assert_not_called() create_t2g_from_fasta.assert_not_called() - sort_fasta.assert_not_called() - sort_gtf.assert_not_called() - check_chromosomes.assert_not_called() - generate_cdna_fasta.assert_not_called() - generate_intron_fasta.assert_not_called() + split_genomic_fasta_to_cdna.assert_not_called() + split_genomic_fasta_to_intron.assert_not_called() create_t2c.assert_not_called() concatenate_files.assert_not_called() kallisto_index.assert_not_called() + split_and_index.assert_not_called() def test_ref_lamanno_overwrite(self): with mock.patch('kb_python.ref.get_temporary_filename') as get_temporary_filename,\ - mock.patch('kb_python.ref.decompress_file') as decompress_file,\ mock.patch('kb_python.ref.create_t2g_from_fasta') as create_t2g_from_fasta,\ - mock.patch('kb_python.ref.sort_fasta') as sort_fasta,\ - mock.patch('kb_python.ref.sort_gtf') as sort_gtf,\ - mock.patch('kb_python.ref.check_chromosomes') as check_chromosomes,\ - mock.patch('kb_python.ref.generate_cdna_fasta') as generate_cdna_fasta,\ - mock.patch('kb_python.ref.generate_intron_fasta') as generate_intron_fasta,\ mock.patch('kb_python.ref.create_t2c') as create_t2c,\ + mock.patch('kb_python.ref.ngs.gtf.genes_and_transcripts_from_gtf') as genes_and_transcripts_from_gtf,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_cdna') as split_genomic_fasta_to_cdna,\ + mock.patch('kb_python.ref.ngs.fasta.split_genomic_fasta_to_intron') as split_genomic_fasta_to_intron,\ + mock.patch('kb_python.ref.ngs.utils.all_exists', return_value=True),\ mock.patch('kb_python.ref.concatenate_files') as concatenate_files,\ + mock.patch('kb_python.ref.split_and_index') as split_and_index,\ mock.patch('kb_python.ref.kallisto_index') as kallisto_index,\ - mock.patch('kb_python.ref.glob.glob') as glob: - glob.return_value = ['index'] + mock.patch('kb_python.ref.glob.glob', return_value=['a']): temp_dir = self.temp_dir index_path = mock.MagicMock() t2g_path = mock.MagicMock() - sorted_fasta_path = mock.MagicMock() - sorted_gtf_path = mock.MagicMock() cdna_fasta_path = mock.MagicMock() intron_fasta_path = mock.MagicMock() cdna_t2c_path = mock.MagicMock() intron_t2c_path = mock.MagicMock() combined_path = mock.MagicMock() - chromosomes = {'1', '2'} + gene_infos = mock.MagicMock() + transcript_infos = mock.MagicMock() + genes_and_transcripts_from_gtf.return_value = ( + gene_infos, transcript_infos + ) get_temporary_filename.side_effect = [ - 'fasta', 'gtf', 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', - 'combined' + 'cdna', 'cdna_t2c', 'intron', 'intron_t2c', 'combined' ] - decompress_file.side_effect = [self.fasta_path, self.gtf_path] - sort_fasta.return_value = sorted_fasta_path, chromosomes - sort_gtf.return_value = sorted_gtf_path, chromosomes - check_chromosomes.return_value = chromosomes - generate_cdna_fasta.return_value = 'cdna' - generate_intron_fasta.return_value = 'intron' + split_genomic_fasta_to_cdna.return_value = 'cdna' + split_genomic_fasta_to_intron.return_value = 'intron' kallisto_index.return_value = {'index': index_path} create_t2g_from_fasta.return_value = {'t2g': t2g_path} create_t2c.side_effect = [{ @@ -1393,28 +1401,20 @@ def test_ref_lamanno_overwrite(self): temp_dir=temp_dir, overwrite=True )) - self.assertEqual(2, decompress_file.call_count) - decompress_file.assert_has_calls([ - call(self.fasta_path, temp_dir=temp_dir), - call(self.gtf_path, temp_dir=temp_dir), - ]) + genes_and_transcripts_from_gtf.assert_called_once_with( + self.gtf_path, use_version=True + ) create_t2g_from_fasta.assert_called_once_with( combined_path, t2g_path ) - sort_fasta.assert_called_once_with(self.fasta_path, 'fasta') - sort_gtf.assert_called_once_with(self.gtf_path, 'gtf') - check_chromosomes.assert_called_once_with(chromosomes, chromosomes) - generate_cdna_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, - 'cdna', - chromosomes=chromosomes + split_genomic_fasta_to_cdna.assert_called_once_with( + self.fasta_path, 'cdna', gene_infos, transcript_infos ) - generate_intron_fasta.assert_called_once_with( - sorted_fasta_path, - sorted_gtf_path, + split_genomic_fasta_to_intron.assert_called_once_with( + self.fasta_path, 'intron', - chromosomes=chromosomes, + gene_infos, + transcript_infos, flank=30 ) self.assertEqual(2, create_t2c.call_count) @@ -1438,3 +1438,4 @@ def test_ref_lamanno_overwrite(self): kallisto_index.assert_called_once_with( combined_path, index_path, k=31 ) + split_and_index.assert_not_called() diff --git a/tests/test_utils.py b/tests/test_utils.py index 9440668..a856799 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,5 @@ -import gzip import os import subprocess as sp -import uuid from unittest import mock, TestCase from unittest.mock import call from unittest.mock import ANY @@ -9,7 +7,7 @@ import anndata import kb_python.utils as utils -from kb_python.config import CHUNK_SIZE, UnsupportedOSException +from kb_python.config import UnsupportedOSException from tests.mixins import TestMixin @@ -20,44 +18,6 @@ def test_update_filename(self): 'output.s.c.bus', utils.update_filename('output.s.bus', 'c') ) - def test_open_as_text_textfile(self): - path = os.path.join(self.temp_dir, '{}.txt'.format(uuid.uuid4())) - with utils.open_as_text(path, 'w') as f: - f.write('TESTING') - self.assertTrue(os.path.exists(path)) - with utils.open_as_text(path, 'r') as f: - self.assertEqual(f.read(), 'TESTING') - - def test_open_as_text_gzip(self): - path = os.path.join(self.temp_dir, '{}.gz'.format(uuid.uuid4())) - with utils.open_as_text(path, 'w') as f: - f.write('TESTING') - self.assertTrue(os.path.exists(path)) - with utils.open_as_text(path, 'r') as f: - self.assertEqual(f.read(), 'TESTING') - - def test_decompress_gzip(self): - filename = str(uuid.uuid4()) - gzip_path = os.path.join(self.temp_dir, '{}.gz'.format(filename)) - out_path = os.path.join(self.temp_dir, filename) - with gzip.open(gzip_path, 'wt') as f: - f.write('TESTING\nTEST') - self.assertEqual(out_path, utils.decompress_gzip(gzip_path, out_path)) - self.assertTrue(os.path.exists(out_path)) - with open(out_path, 'r') as f: - self.assertEqual('TESTING\nTEST', f.read()) - - def test_compress_gzip(self): - filename = str(uuid.uuid4()) - file_path = os.path.join(self.temp_dir, filename) - out_path = os.path.join(self.temp_dir, '{}.gz'.format(filename)) - with open(file_path, 'w') as f: - f.write('TESTING\nTEST') - self.assertEqual(out_path, utils.compress_gzip(file_path, out_path)) - self.assertTrue(os.path.exists(out_path)) - with gzip.open(out_path, 'rt') as f: - self.assertEqual('TESTING\nTEST', f.read()) - def test_make_directory(self): with mock.patch('kb_python.utils.os.makedirs') as makedirs: utils.make_directory('path') @@ -122,33 +82,6 @@ def test_whitelist_provided(self): self.assertTrue(utils.whitelist_provided('10xv2')) self.assertFalse(utils.whitelist_provided('UNSUPPORTED')) - def test_download_file(self): - with mock.patch('kb_python.utils.requests.get') as get,\ - mock.patch('kb_python.utils.tqdm') as tqdm: - path = os.path.join(self.temp_dir, str(uuid.uuid4())) - get.return_value.headers = {'Content-Length': str(1024 * 2)} - get.return_value.iter_content.return_value = [ - b'1' * 1024, b'2' * 1024 - ] - self.assertEqual(path, utils.download_file('remote/path', path)) - get.assert_called_once_with('remote/path', stream=True) - tqdm.assert_called_once_with( - unit='B', - total=1024 * 2, - unit_divisor=1024, - unit_scale=True, - ascii=True - ) - get.return_value.iter_content.assert_called_once_with( - chunk_size=CHUNK_SIZE - ) - self.assertEqual(2, tqdm.return_value.update.call_count) - tqdm.return_value.update.assert_has_calls([call(1024), call(1024)]) - tqdm.return_value.close.assert_called_once_with() - - with open(path, 'r') as f: - self.assertEqual(('1' * 1024) + ('2' * 1024), f.read()) - def test_stream_file(self): with mock.patch('kb_python.utils.PLATFORM', 'linux'),\ mock.patch('kb_python.utils.os') as os,\ @@ -176,10 +109,6 @@ def test_stream_file_windows(self): threading.thread.assert_not_called() urlretrieve.assert_not_called() - def test_get_temporary_filename(self): - path = utils.get_temporary_filename() - self.assertTrue(os.path.exists(path)) - def test_read_t2g(self): t2g = utils.read_t2g(self.t2g_path) self.assertEqual(16457, len(t2g)) @@ -289,23 +218,3 @@ def test_move_file(self): def test_copy_whitelist(self): whitelist_path = utils.copy_whitelist('10xv1', self.temp_dir) self.assertTrue(os.path.exists(whitelist_path)) - - def test_concatenate_files(self): - temp_dir = self.temp_dir - file1_path = os.path.join(temp_dir, str(uuid.uuid4())) - file2_path = os.path.join(temp_dir, '{}.gz'.format(uuid.uuid4())) - - with open(file1_path, 'w') as f: - f.write('TEST1') - with gzip.open(file2_path, 'wt') as f: - f.write('TEST2') - - out_path = utils.concatenate_files( - file1_path, - file2_path, - out_path=os.path.join(temp_dir, str(uuid.uuid4())), - temp_dir=self.temp_dir - ) - - with open(out_path, 'r') as f: - self.assertEqual(f.read(), 'TEST1\nTEST2\n')