diff --git a/kb_python/constants.py b/kb_python/constants.py old mode 100644 new mode 100755 index 31f1120..9cf1e1c --- a/kb_python/constants.py +++ b/kb_python/constants.py @@ -50,6 +50,7 @@ ABUNDANCE_FILENAME = 'matrix.abundance.mtx' CELLS_FILENAME = 'matrix.cells' GENE_DIR = 'counts_gene' +GENES_FILENAME = 'genes.txt' # File codes. # These are appended to the filename whenever it undergoes some kind of diff --git a/kb_python/count.py b/kb_python/count.py old mode 100644 new mode 100755 index b9ff3ec..7c517d5 --- a/kb_python/count.py +++ b/kb_python/count.py @@ -31,6 +31,7 @@ FILTERED_CODE, FILTERED_COUNTS_DIR, GENE_NAME, + GENES_FILENAME, INSPECT_FILENAME, KALLISTO_INFO_FILENAME, KB_INFO_FILENAME, @@ -39,7 +40,6 @@ REPORT_NOTEBOOK_FILENAME, SORT_CODE, TCC_PREFIX, - TRANSCRIPT_NAME, TXNAMES_FILENAME, UNFILTERED_CODE, UNFILTERED_COUNTS_DIR, @@ -54,7 +54,9 @@ import_tcc_matrix_as_anndata, make_directory, move_file, + open_as_text, overlay_anndatas, + read_t2g, run_executable, stream_file, sum_anndatas, @@ -564,24 +566,24 @@ def bustools_whitelist(bus_path, out_path): return {'whitelist': out_path} -def write_smartseq_batch(pairs_1, pairs_2, out_path): +def write_smartseq_batch(fastq_pairs, cell_ids, out_path): """Write a 3-column TSV specifying batch information for Smart-seq reads. This file is required to use `kallisto pseudo` on multiple samples (= cells). - :param pairs_1: list of paths to FASTQs corresponding to pair 1 - :type pairs_1: list - :param pairs_2: list of paths to FASTQS corresponding to pair 2 - :type pairs_2: list + :param fastq_pairs: list of pairs of FASTQs + :type fastq_pairs: list + :param cell_ids: list of cell IDs + :type cell_ids: list :param out_path: path to batch file to output :type out_path: str :return: dictionary of written batch file :rtype: dict """ - logger.info(f'Writing batch definition textfile to {out_path}') + logger.info(f'Writing batch definition TSV to {out_path}') with open(out_path, 'w') as f: - for i, (read_1, read_2) in enumerate(zip(pairs_1, pairs_2)): - f.write(f'{i}\t{read_1}\t{read_2}\n') + for cell_id, (fastq_1, fastq_2) in zip(cell_ids, fastq_pairs): + f.write(f'{cell_id}\t{fastq_1}\t{fastq_2}\n') return {'batch': out_path} @@ -977,6 +979,41 @@ def copy_or_create_whitelist(technology, bus_path, out_dir): )['whitelist'] +def convert_transcripts_to_genes(txnames_path, t2g_path, genes_path): + """Convert a textfile containing transcript IDs to another textfile containing + gene IDs, given a transcript-to-gene mapping. + + :param txnames_path: path to transcripts.txt + :type txnames_path: str + :param t2g_path: path to transcript-to-genes mapping + :type t2g_path: str + :param genes_path: path to output genes.txt + :type genes_path: str + + :return: path to written genes.txt + :rtype: str + """ + t2g = read_t2g(t2g_path) + with open_as_text(txnames_path, 'r') as f, open_as_text(genes_path, + 'w') as out: + for line in f: + if line.isspace(): + continue + transcript = line.strip() + if transcript not in t2g: + logger.warning( + f'Transcript {transcript} was found in {txnames_path} but not in {t2g_path}. ' + 'This transcript will not be converted to a gene.' + ) + attributes = t2g.get(transcript) + + if attributes: + out.write(f'{attributes[0]}\n') + else: + out.write(f'{transcript}\n') + return genes_path + + def count( index_paths, t2g_path, @@ -1270,7 +1307,8 @@ def count_smartseq( t2g_path, technology, out_dir, - fastqs, + fastq_pairs, + cell_ids=None, temp_dir='tmp', threads=8, memory='4G', @@ -1292,7 +1330,8 @@ def count_smartseq( # Smart-seq does not support fastq streaming. if any(urlparse(fastq).scheme in ('http', 'https', 'ftp', 'ftps') - for fastq in fastqs): + for pair in fastq_pairs + for fastq in pair): raise Exception( f'Technology {technology} does not support FASTQ streaming.' ) @@ -1313,11 +1352,13 @@ def count_smartseq( for name, path in pseudo_result.items()) or overwrite: # Write batch information. batch_result = write_smartseq_batch( - fastqs[::2], fastqs[1::2], os.path.join(out_dir, BATCH_FILENAME) + fastq_pairs, + cell_ids if cell_ids else list(range(len(fastq_pairs))), + os.path.join(out_dir, BATCH_FILENAME), ) results.update(batch_result) - kallisto_pseudo( + pseudo_result = kallisto_pseudo( batch_result['batch'], index_paths[0], out_dir, threads=threads ) else: @@ -1326,6 +1367,13 @@ def count_smartseq( ) results.update(pseudo_result) + # Manually write genes.txt + # NOTE: there will be duplicated genes + genes_path = os.path.join(out_dir, GENES_FILENAME) + results['genes'] = convert_transcripts_to_genes( + pseudo_result['txnames'], t2g_path, genes_path + ) + # Convert outputs. if loom or h5ad: results.update( @@ -1333,9 +1381,8 @@ def count_smartseq( out_dir, pseudo_result['mtx'], pseudo_result['cells'], - pseudo_result['txnames'], + results['genes'], t2g_path=t2g_path, - name=TRANSCRIPT_NAME, loom=loom, h5ad=h5ad, threads=threads diff --git a/kb_python/fasta.py b/kb_python/fasta.py old mode 100644 new mode 100755 diff --git a/kb_python/main.py b/kb_python/main.py old mode 100644 new mode 100755 index a1d5502..37adfbb --- a/kb_python/main.py +++ b/kb_python/main.py @@ -4,6 +4,7 @@ import os import sys import textwrap +import warnings from . import __version__ from .config import ( @@ -24,6 +25,7 @@ get_bustools_version, get_kallisto_version, make_directory, + open_as_text, remove_directory, ) @@ -189,7 +191,7 @@ def parse_count(parser, args, temp_dir='tmp'): # Smartseq can not be used with lamanno or nucleus. if args.x.upper() == 'SMARTSEQ': parser.error( - f'Technology {args.x} can not be used with workflow {args.workflow}.' + f'Technology `{args.x}` can not be used with workflow {args.workflow}.' ) count_velocity( @@ -224,7 +226,7 @@ def parse_count(parser, args, temp_dir='tmp'): # Smart-seq if args.x.upper() == 'SMARTSEQ': if args.dry_run: - parser.error(f'Technology {args.x} does not support dry run.') + parser.error(f'Technology `{args.x}` does not support dry run.') # Check for ignored arguments. (i.e. arguments either not supported or # not yet implemented) @@ -234,23 +236,62 @@ def parse_count(parser, args, temp_dir='tmp'): for arg in ignored: if getattr(args, arg): logger.warning( - f'Argument `{arg}` is not supported for technology {args.x}. This argument will be ignored.' + f'Argument `{arg}` is not supported for technology `{args.x}`. This argument will be ignored.' ) - # Allow glob notation for fastqs. - fastqs = [] - for expr in args.fastqs: - fastqs.extend(glob.glob(expr)) - args.fastqs = sorted(list(set(fastqs))) + # Check if batch TSV was provided. + batch_path = None + if len(args.fastqs) == 1: + try: + with open_as_text(args.fastqs[0], 'r') as f: + if not f.readline().startswith('@'): + batch_path = args.fastqs[0] + except Exception: + pass + + cells = {} + if batch_path: + with open(batch_path, 'r') as f: + for line in f: + if line.isspace() or line.startswith('#'): + continue + cell_id, fastq_1, fastq_2 = line.strip().split('\t') + if cell_id in cells: + parser.error( + f'Found duplicate cell ID {cell_id} in {batch_path}.' + ) + cells[cell_id] = (fastq_1, fastq_2) + else: + # Allow glob notation for fastqs. + fastqs = [] + for expr in args.fastqs: + fastqs.extend(glob.glob(expr)) + if len(fastqs) % 2: + logger.warning(f'{len(fastqs)} FASTQs found. ') + fastqs = sorted(set(fastqs)) + for cell_id, i in enumerate(range(0, len(fastqs), 2)): + fastq_1, fastq_2 = fastqs[i], ( + fastqs[i + 1] if i + 1 < len(fastqs) else '' + ) + cells[cell_id] = (fastq_2, fastq_2) logger.info('Found the following FASTQs:') - for fastq in args.fastqs: - logger.info(' ' * 8 + fastq) + fastq_pairs = [] + cell_ids = [] + for cell_id, (fastq_1, fastq_2) in cells.items(): + logger.info(f' {cell_id} {fastq_1} {fastq_2}') + cell_ids.append(cell_id) + fastq_pairs.append((fastq_1, fastq_2)) + if not fastq_1 or not fastq_2: + parser.error( + f'Single-end reads are currently not supported with technology `{args.x}`.' + ) count_smartseq( args.i, args.g, args.x, args.o, - args.fastqs, + fastq_pairs, + cell_ids=cell_ids, threads=args.t, memory=args.m, overwrite=args.overwrite, @@ -662,7 +703,11 @@ def setup_count_args(parser, parent): parser_count.add_argument( 'fastqs', help=( - 'FASTQ files. Glob expressions can be used only with technology `SMARTSEQ`.' + 'FASTQ files. For technology `SMARTSEQ`, all input FASTQs are ' + 'alphabetically sorted by path and paired in order, and cell IDs ' + 'are assigned as incrementing integers starting from zero. A single ' + 'batch TSV with cell ID, read 1, and read 2 as columns can be ' + 'provided to override this behavior.' ), nargs='+' ) @@ -757,7 +802,12 @@ def main(): # Turn off logging from other packages. try: - logging.getLogger('anndata').disable(level=logging.CRITICAL) + anndata_logger = logging.getLogger('anndata') + h5py_logger = logging.getLogger('h5py') + anndata_logger.setLevel(logging.CRITICAL + 10) + anndata_logger.propagate = False + h5py_logger.setLevel(logging.CRITICAL + 10) + h5py_logger.propagate = False except Exception: pass @@ -782,7 +832,9 @@ def main(): make_directory(temp_dir) try: logger.debug(args) - COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir) + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir) except Exception: if is_dry(): raise diff --git a/kb_python/utils.py b/kb_python/utils.py old mode 100644 new mode 100755 index c8f0c58..f4d796f --- a/kb_python/utils.py +++ b/kb_python/utils.py @@ -14,6 +14,7 @@ import anndata import pandas as pd import scipy.io +from scipy import sparse from tqdm import tqdm from .config import ( @@ -481,6 +482,34 @@ def get_temporary_filename(temp_dir=None): return path +def read_t2g(t2g_path): + """Given a transcript-to-gene mapping path, read it into a dictionary. + The first column is always assumed to tbe the transcript IDs. + + :param t2g_path: path to t2g + :type t2g_path: str + + :return: dictionary containing transcript IDs as keys and all other columns + as a tuple as values + :rtype: dict + """ + t2g = {} + with open_as_text(t2g_path, 'r') as f: + for line in f: + if line.isspace(): + continue + split = line.strip().split('\t') + transcript = split[0] + other = tuple(split[1:]) + if transcript in t2g: + logger.warning( + f'Found duplicate entries for {transcript} in {t2g_path}. ' + 'Earlier entries will be ignored.' + ) + t2g[transcript] = other + return t2g + + def import_tcc_matrix_as_anndata( matrix_path, barcodes_path, ec_path, txnames_path, threads=8 ): @@ -567,23 +596,38 @@ def import_matrix_as_anndata( df_genes.index = df_genes.index.astype( str ) # To prevent logging from anndata + mtx = scipy.io.mmread(matrix_path) + + # If any of the genes are duplicated, collapse them by summing + if any(df_genes.index.duplicated()): + logger.debug( + f'Deduplicating genes found in {genes_path} by adding duplicates' + ) + mtx = mtx.tocsc() + gene_indices = {} + for i, gene in enumerate(df_genes.index): + gene_indices.setdefault(gene, []).append(i) + genes = [] + deduplicated_mtx = sparse.lil_matrix( + (len(df_barcodes), len(gene_indices)) + ) + for i, gene in enumerate(sorted(gene_indices.keys())): + genes.append(gene) + indices = gene_indices[gene] + deduplicated_mtx[:, i] = mtx[:, indices].sum(axis=1) + df_genes = pd.DataFrame(index=pd.Series(genes, name=f'{name}_id')) + mtx = deduplicated_mtx + + t2g = read_t2g(t2g_path) if t2g_path else {} id_to_name = {} - if t2g_path: - with open(t2g_path, 'r') as f: - for line in f: - if line.isspace(): - continue - - split = line.strip().split('\t') - if len(split) > 2: - id_to_name[split[1]] = split[2] - gene_names = [id_to_name.get(i, '') for i in df_genes.index] - if any(bool(g) for g in gene_names): - df_genes[f'{name}_name'] = gene_names + for transcript, attributes in t2g.items(): + if len(attributes) > 1: + id_to_name[attributes[0]] = attributes[1] + gene_names = [id_to_name.get(i, '') for i in df_genes.index] + if any(bool(g) for g in gene_names): + df_genes[f'{name}_name'] = gene_names - return anndata.AnnData( - X=scipy.io.mmread(matrix_path).tocsr(), obs=df_barcodes, var=df_genes - ) + return anndata.AnnData(X=mtx.tocsr(), obs=df_barcodes, var=df_genes) def overlay_anndatas(adata_spliced, adata_unspliced): @@ -606,6 +650,7 @@ def overlay_anndatas(adata_spliced, adata_unspliced): df_obs = unspliced_intersection.obs df_var = unspliced_intersection.var return anndata.AnnData( + X=spliced_intersection.X, layers={ 'spliced': spliced_intersection.X, 'unspliced': unspliced_intersection.X diff --git a/tests/fixtures/counts/duplicated.genes.txt b/tests/fixtures/counts/duplicated.genes.txt new file mode 100755 index 0000000..fa05c88 --- /dev/null +++ b/tests/fixtures/counts/duplicated.genes.txt @@ -0,0 +1,17 @@ +ENSMUSG00000026034.17 +ENSMUSG00000026034.17 +ENSMUSG00000091950.3 +ENSMUSG00000113163.1 +ENSMUSG00000111566.1 +ENSMUSG00000110879.2 +ENSMUSG00000082754.2 +ENSMUSG00000050577.8 +ENSMUSG00000111678.2 +ENSMUSG00000068824.4 +ENSMUSG00000089732.5 +ENSMUSG00000046590.3 +ENSMUSG00000075126.4 +ENSMUSG00000075067.4 +ENSMUSG00000082574.3 +ENSMUSG00000080809.3 +ENSMUSG00000109389.1 diff --git a/tests/fixtures/counts/genes.mtx b/tests/fixtures/counts/genes.mtx old mode 100644 new mode 100755 index b591cbd..50c9743 --- a/tests/fixtures/counts/genes.mtx +++ b/tests/fixtures/counts/genes.mtx @@ -1,6 +1,6 @@ %%MatrixMarket matrix coordinate real general % -% +% 29 17 2 10 1 1 16 1 1 diff --git a/tests/fixtures/counts/genes_duplicated.mtx b/tests/fixtures/counts/genes_duplicated.mtx new file mode 100755 index 0000000..7bc3a12 --- /dev/null +++ b/tests/fixtures/counts/genes_duplicated.mtx @@ -0,0 +1,7 @@ +%%MatrixMarket matrix coordinate real general +% +% +29 17 3 +10 1 1 +16 1 2 +16 2 3 diff --git a/tests/fixtures/smartseq/t2g.txt b/tests/fixtures/smartseq/t2g.txt new file mode 100755 index 0000000..4242f3a --- /dev/null +++ b/tests/fixtures/smartseq/t2g.txt @@ -0,0 +1,2 @@ +ENST00000003583.12 ENSG00000001460.18 STPG1 STPG1-201 1 24357005 24413725 - +ENST00000003912.7 ENSG00000001461.17 NIPAL3 NIPAL3-201 1 24415803 24472976 + diff --git a/tests/fixtures/smartseq/transcripts.txt b/tests/fixtures/smartseq/transcripts.txt new file mode 100755 index 0000000..2c1508c --- /dev/null +++ b/tests/fixtures/smartseq/transcripts.txt @@ -0,0 +1,2 @@ +ENST00000003583.12 +ENST00000003912.7 diff --git a/tests/mixins.py b/tests/mixins.py old mode 100644 new mode 100755 index 62bcfaa..2f2c501 --- a/tests/mixins.py +++ b/tests/mixins.py @@ -39,8 +39,14 @@ def setUpClass(cls): cls.counts_dir = os.path.join(cls.fixtures_dir, 'counts') cls.matrix_path = os.path.join(cls.counts_dir, 'genes.mtx') + cls.matrix_duplicated_path = os.path.join( + cls.counts_dir, 'genes_duplicated.mtx' + ) cls.barcodes_path = os.path.join(cls.counts_dir, 'genes.barcodes.txt') cls.genes_path = os.path.join(cls.counts_dir, 'genes.genes.txt') + cls.genes_duplicated_path = os.path.join( + cls.counts_dir, 'duplicated.genes.txt' + ) cls.loom_path = os.path.join(cls.counts_dir, 'genes.loom') cls.h5ad_path = os.path.join(cls.counts_dir, 'genes.h5ad') @@ -171,4 +177,10 @@ def setUpClass(cls): os.path.join(cls.smartseq_dir, 'R1.fastq.gz'), os.path.join(cls.smartseq_dir, 'R2.fastq.gz') ] + cls.smartseq_t2g_path = os.path.join( + cls.smartseq_dir, 'transcripts_to_genes.txt' + ) + cls.smartseq_txnames_path = os.path.join( + cls.smartseq_dir, 'transcripts.txt' + ) cls.smartseq_out_dir = os.path.join(cls.smartseq_dir, 'out') diff --git a/tests/test_count.py b/tests/test_count.py old mode 100644 new mode 100755 index 71372fd..aa0ada1 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -30,12 +30,12 @@ FEATURE_PREFIX, FILTER_WHITELIST_FILENAME, FILTERED_COUNTS_DIR, + GENES_FILENAME, INSPECT_FILENAME, KALLISTO_INFO_FILENAME, KB_INFO_FILENAME, REPORT_HTML_FILENAME, REPORT_NOTEBOOK_FILENAME, - TRANSCRIPT_NAME, TXNAMES_FILENAME, UNFILTERED_COUNTS_DIR, WHITELIST_FILENAME, @@ -303,12 +303,15 @@ def test_bustools_whitelist(self): def test_write_smartseq_batch(self): out_path = os.path.join(self.temp_dir, str(uuid.uuid4())) - pairs_1 = ['r1_1', 'r2_1'] - pairs_2 = ['r1_2', 'r2_2'] - self.assertEqual({'batch': out_path}, - count.write_smartseq_batch(pairs_1, pairs_2, out_path)) + fastq_pairs = [['r1_1', 'r2_1'], ['r1_2', 'r2_2']] + cell_ids = ['cell_1', 'cell_2'] + self.assertEqual({ + 'batch': out_path + }, count.write_smartseq_batch(fastq_pairs, cell_ids, out_path)) with open(out_path, 'r') as f: - self.assertEqual('0\tr1_1\tr1_2\n1\tr2_1\tr2_2\n', f.read()) + self.assertEqual( + 'cell_1\tr1_1\tr2_1\ncell_2\tr1_2\tr2_2\n', f.read() + ) def test_convert_matrix_loom(self): with mock.patch('kb_python.count.import_matrix_as_anndata') as import_matrix_as_anndata,\ @@ -1002,6 +1005,24 @@ def test_copy_or_create_whitelist_not_provided(self): self.bus_s_path, os.path.join(out_dir, WHITELIST_FILENAME) ) + def test_convert_transcripts_to_genes(self): + with mock.patch('kb_python.count.read_t2g') as read_t2g: + read_t2g.return_value = { + 'ENST00000003583.12': ('ENSG00000001460.18', 'STPG1') + } + genes_path = os.path.join(self.temp_dir, 'genes.txt') + self.assertEqual( + genes_path, + count.convert_transcripts_to_genes( + self.smartseq_txnames_path, self.smartseq_t2g_path, + genes_path + ) + ) + with open(genes_path, 'r') as f: + self.assertEqual(['ENSG00000001460.18', 'ENST00000003912.7'], [ + line.strip() for line in f if not line.isspace() + ]) + def test_matrix_to_cellranger(self): out_dir = self.temp_dir result = count.matrix_to_cellranger( @@ -2450,16 +2471,25 @@ def test_count_smartseq(self): with mock.patch('kb_python.count.STATS') as STATS,\ mock.patch('kb_python.count.write_smartseq_batch') as write_smartseq_batch,\ mock.patch('kb_python.count.kallisto_pseudo') as kallisto_pseudo,\ - mock.patch('kb_python.count.convert_matrix') as convert_matrix: + mock.patch('kb_python.count.convert_matrix') as convert_matrix,\ + mock.patch('kb_python.count.convert_transcripts_to_genes') as convert_transcripts_to_genes: out_dir = 'out' temp_dir = 'temp' threads = 99999 memory = 'mem' t2g_path = 't2g' technology = 'tech' - fastqs = ['r1', 'r2', 'r3', 'r4'] + fastq_pairs = [['r1', 'r2'], ['r3', 'r4']] index_paths = ['index'] STATS.save.return_value = 'stats' + convert_transcripts_to_genes.return_value = 'genes.txt' + kallisto_pseudo.return_value = { + 'mtx': os.path.join(out_dir, ABUNDANCE_FILENAME), + 'ecmap': os.path.join(out_dir, ECMAP_FILENAME), + 'cells': os.path.join(out_dir, CELLS_FILENAME), + 'txnames': os.path.join(out_dir, TXNAMES_FILENAME), + 'info': os.path.join(out_dir, KALLISTO_INFO_FILENAME), + } write_smartseq_batch.return_value = {'batch': 'batch.txt'} self.assertEqual({ 'mtx': os.path.join(out_dir, ABUNDANCE_FILENAME), @@ -2468,42 +2498,56 @@ def test_count_smartseq(self): 'txnames': os.path.join(out_dir, TXNAMES_FILENAME), 'info': os.path.join(out_dir, KALLISTO_INFO_FILENAME), 'stats': 'stats', - 'batch': 'batch.txt' + 'batch': 'batch.txt', + 'genes': 'genes.txt' }, count.count_smartseq( index_paths, t2g_path, technology, out_dir, - fastqs, + fastq_pairs, temp_dir=temp_dir, threads=threads, memory=memory )) - write_smartseq_batch.assert_called_once_with(['r1', 'r3'], [ - 'r2', 'r4' - ], os.path.join(out_dir, BATCH_FILENAME)) + write_smartseq_batch.assert_called_once_with( + fastq_pairs, [0, 1], os.path.join(out_dir, BATCH_FILENAME) + ) kallisto_pseudo.assert_called_once_with( 'batch.txt', 'index', out_dir, threads=threads ) + convert_transcripts_to_genes.assert_called_once_with( + os.path.join(out_dir, TXNAMES_FILENAME), t2g_path, + os.path.join(out_dir, GENES_FILENAME) + ) convert_matrix.assert_not_called() def test_count_smartseq_convert(self): with mock.patch('kb_python.count.STATS') as STATS,\ mock.patch('kb_python.count.write_smartseq_batch') as write_smartseq_batch,\ mock.patch('kb_python.count.kallisto_pseudo') as kallisto_pseudo,\ - mock.patch('kb_python.count.convert_matrix') as convert_matrix: + mock.patch('kb_python.count.convert_matrix') as convert_matrix,\ + mock.patch('kb_python.count.convert_transcripts_to_genes') as convert_transcripts_to_genes: out_dir = 'out' temp_dir = 'temp' threads = 99999 memory = 'mem' t2g_path = 't2g' technology = 'tech' - fastqs = ['r1', 'r2', 'r3', 'r4'] + fastq_pairs = [['r1', 'r2'], ['r3', 'r4']] index_paths = ['index'] STATS.save.return_value = 'stats' write_smartseq_batch.return_value = {'batch': 'batch.txt'} + convert_transcripts_to_genes.return_value = 'genes.txt' + kallisto_pseudo.return_value = { + 'mtx': os.path.join(out_dir, ABUNDANCE_FILENAME), + 'ecmap': os.path.join(out_dir, ECMAP_FILENAME), + 'cells': os.path.join(out_dir, CELLS_FILENAME), + 'txnames': os.path.join(out_dir, TXNAMES_FILENAME), + 'info': os.path.join(out_dir, KALLISTO_INFO_FILENAME), + } self.assertEqual({ 'mtx': os.path.join(out_dir, ABUNDANCE_FILENAME), 'ecmap': os.path.join(out_dir, ECMAP_FILENAME), @@ -2511,33 +2555,37 @@ def test_count_smartseq_convert(self): 'txnames': os.path.join(out_dir, TXNAMES_FILENAME), 'info': os.path.join(out_dir, KALLISTO_INFO_FILENAME), 'stats': 'stats', - 'batch': 'batch.txt' + 'batch': 'batch.txt', + 'genes': 'genes.txt' }, count.count_smartseq( index_paths, t2g_path, technology, out_dir, - fastqs, + fastq_pairs, temp_dir=temp_dir, threads=threads, memory=memory, h5ad=True )) - write_smartseq_batch.assert_called_once_with(['r1', 'r3'], [ - 'r2', 'r4' - ], os.path.join(out_dir, BATCH_FILENAME)) + write_smartseq_batch.assert_called_once_with( + fastq_pairs, [0, 1], os.path.join(out_dir, BATCH_FILENAME) + ) kallisto_pseudo.assert_called_once_with( 'batch.txt', 'index', out_dir, threads=threads ) + convert_transcripts_to_genes.assert_called_once_with( + os.path.join(out_dir, TXNAMES_FILENAME), t2g_path, + os.path.join(out_dir, GENES_FILENAME) + ) convert_matrix.assert_called_once_with( out_dir, os.path.join(out_dir, ABUNDANCE_FILENAME), os.path.join(out_dir, CELLS_FILENAME), - os.path.join(out_dir, TXNAMES_FILENAME), + 'genes.txt', t2g_path=t2g_path, - name=TRANSCRIPT_NAME, loom=False, h5ad=True, threads=threads @@ -2554,7 +2602,7 @@ def test_count_smartseq_no_multiple_indices(self): memory = 'mem' t2g_path = 't2g' technology = 'tech' - fastqs = ['r1', 'r2', 'r3', 'r4'] + fastq_pairs = [['r1', 'r2'], ['r3', 'r4']] index_paths = ['index', 'index2'] with self.assertRaises(Exception): count.count_smartseq( @@ -2562,7 +2610,7 @@ def test_count_smartseq_no_multiple_indices(self): t2g_path, technology, out_dir, - fastqs, + fastq_pairs, temp_dir=temp_dir, threads=threads, memory=memory @@ -2579,7 +2627,7 @@ def test_count_smartseq_no_streaming(self): memory = 'mem' t2g_path = 't2g' technology = 'tech' - fastqs = ['http://r1', 'r2', 'r3', 'r4'] + fastq_pairs = [['http://r1', 'r2'], ['r3', 'r4']] index_paths = ['index'] with self.assertRaises(Exception): count.count_smartseq( @@ -2587,7 +2635,7 @@ def test_count_smartseq_no_streaming(self): t2g_path, technology, out_dir, - fastqs, + fastq_pairs, temp_dir=temp_dir, threads=threads, memory=memory diff --git a/tests/test_utils.py b/tests/test_utils.py old mode 100644 new mode 100755 index 650f56f..9440668 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -180,6 +180,13 @@ def test_get_temporary_filename(self): path = utils.get_temporary_filename() self.assertTrue(os.path.exists(path)) + def test_read_t2g(self): + t2g = utils.read_t2g(self.t2g_path) + self.assertEqual(16457, len(t2g)) + self.assertIn('ENSMUST00000158488.1', t2g) + self.assertEqual(('ENSMUSG00000089113.1', 'Gm22902'), + t2g['ENSMUST00000158488.1']) + def test_import_tcc_matrix_as_anndata(self): adata = utils.import_tcc_matrix_as_anndata( self.tcc_matrix_path, self.tcc_barcodes_path, self.tcc_ec_path, @@ -196,10 +203,30 @@ def test_import_matrix_as_anndata(self): self.matrix_path, self.barcodes_path, self.genes_path ) self.assertIsInstance(adata, anndata.AnnData) + self.assertEqual((29, 17), adata.shape) self.assertEqual(set(), set(adata.var)) self.assertEqual(set(), set(adata.obs)) self.assertEqual('gene_id', adata.var.index.name) self.assertEqual('barcode', adata.obs.index.name) + self.assertEqual(1, adata.X[9, 0]) + self.assertEqual(1, adata.X[15, 0]) + + def test_import_matrix_as_anndata_duplicated(self): + adata = utils.import_matrix_as_anndata( + self.matrix_duplicated_path, + self.barcodes_path, + self.genes_duplicated_path, + ) + self.assertIsInstance(adata, anndata.AnnData) + self.assertEqual(set(), set(adata.obs)) + self.assertEqual('gene_id', adata.var.index.name) + self.assertEqual('barcode', adata.obs.index.name) + + self.assertEqual((29, 16), adata.shape) + self.assertNotIn('ENSMUSG00000092572.7', adata.var.index) + self.assertEqual( + 5, adata.X[15, adata.var.index.get_loc('ENSMUSG00000026034.17')] + ) def test_import_matrix_as_anndata_with_t2g(self): adata = utils.import_matrix_as_anndata(