Merge pull request #94 from pachterlab/devel

Merge devel into master
pachterlab · Jan 8, 2021 · 0b48389 · 0b48389
2 parents b997e0d + edb46ba
commit 0b48389
Show file tree

Hide file tree

Showing 13 changed files with 331 additions and 71 deletions.
diff --git a/kb_python/constants.py b/kb_python/constants.py
@@ -50,6 +50,7 @@
 ABUNDANCE_FILENAME = 'matrix.abundance.mtx'
 CELLS_FILENAME = 'matrix.cells'
 GENE_DIR = 'counts_gene'
+GENES_FILENAME = 'genes.txt'
 
 # File codes.
 # These are appended to the filename whenever it undergoes some kind of

diff --git a/kb_python/count.py b/kb_python/count.py
@@ -31,6 +31,7 @@
     FILTERED_CODE,
     FILTERED_COUNTS_DIR,
     GENE_NAME,
+    GENES_FILENAME,
     INSPECT_FILENAME,
     KALLISTO_INFO_FILENAME,
     KB_INFO_FILENAME,
@@ -39,7 +40,6 @@
     REPORT_NOTEBOOK_FILENAME,
     SORT_CODE,
     TCC_PREFIX,
-    TRANSCRIPT_NAME,
     TXNAMES_FILENAME,
     UNFILTERED_CODE,
     UNFILTERED_COUNTS_DIR,
@@ -54,7 +54,9 @@
     import_tcc_matrix_as_anndata,
     make_directory,
     move_file,
+    open_as_text,
     overlay_anndatas,
+    read_t2g,
     run_executable,
     stream_file,
     sum_anndatas,
@@ -564,24 +566,24 @@ def bustools_whitelist(bus_path, out_path):
     return {'whitelist': out_path}
 
 
-def write_smartseq_batch(pairs_1, pairs_2, out_path):
+def write_smartseq_batch(fastq_pairs, cell_ids, out_path):
     """Write a 3-column TSV specifying batch information for Smart-seq reads.
     This file is required to use `kallisto pseudo` on multiple samples (= cells).
 
-    :param pairs_1: list of paths to FASTQs corresponding to pair 1
-    :type pairs_1: list
-    :param pairs_2: list of paths to FASTQS corresponding to pair 2
-    :type pairs_2: list
+    :param fastq_pairs: list of pairs of FASTQs
+    :type fastq_pairs: list
+    :param cell_ids: list of cell IDs
+    :type cell_ids: list
     :param out_path: path to batch file to output
     :type out_path: str
 
     :return: dictionary of written batch file
     :rtype: dict
     """
-    logger.info(f'Writing batch definition textfile to {out_path}')
+    logger.info(f'Writing batch definition TSV to {out_path}')
     with open(out_path, 'w') as f:
-        for i, (read_1, read_2) in enumerate(zip(pairs_1, pairs_2)):
-            f.write(f'{i}\t{read_1}\t{read_2}\n')
+        for cell_id, (fastq_1, fastq_2) in zip(cell_ids, fastq_pairs):
+            f.write(f'{cell_id}\t{fastq_1}\t{fastq_2}\n')
     return {'batch': out_path}
 
 
@@ -977,6 +979,41 @@ def copy_or_create_whitelist(technology, bus_path, out_dir):
         )['whitelist']
 
 
+def convert_transcripts_to_genes(txnames_path, t2g_path, genes_path):
+    """Convert a textfile containing transcript IDs to another textfile containing
+    gene IDs, given a transcript-to-gene mapping.
+
+    :param txnames_path: path to transcripts.txt
+    :type txnames_path: str
+    :param t2g_path: path to transcript-to-genes mapping
+    :type t2g_path: str
+    :param genes_path: path to output genes.txt
+    :type genes_path: str
+
+    :return: path to written genes.txt
+    :rtype: str
+    """
+    t2g = read_t2g(t2g_path)
+    with open_as_text(txnames_path, 'r') as f, open_as_text(genes_path,
+                                                            'w') as out:
+        for line in f:
+            if line.isspace():
+                continue
+            transcript = line.strip()
+            if transcript not in t2g:
+                logger.warning(
+                    f'Transcript {transcript} was found in {txnames_path} but not in {t2g_path}. '
+                    'This transcript will not be converted to a gene.'
+                )
+            attributes = t2g.get(transcript)
+
+            if attributes:
+                out.write(f'{attributes[0]}\n')
+            else:
+                out.write(f'{transcript}\n')
+    return genes_path
+
+
 def count(
     index_paths,
     t2g_path,
@@ -1270,7 +1307,8 @@ def count_smartseq(
     t2g_path,
     technology,
     out_dir,
-    fastqs,
+    fastq_pairs,
+    cell_ids=None,
     temp_dir='tmp',
     threads=8,
     memory='4G',
@@ -1292,7 +1330,8 @@ def count_smartseq(
 
     # Smart-seq does not support fastq streaming.
     if any(urlparse(fastq).scheme in ('http', 'https', 'ftp', 'ftps')
-           for fastq in fastqs):
+           for pair in fastq_pairs
+           for fastq in pair):
         raise Exception(
             f'Technology {technology} does not support FASTQ streaming.'
         )
@@ -1313,11 +1352,13 @@ def count_smartseq(
            for name, path in pseudo_result.items()) or overwrite:
         # Write batch information.
         batch_result = write_smartseq_batch(
-            fastqs[::2], fastqs[1::2], os.path.join(out_dir, BATCH_FILENAME)
+            fastq_pairs,
+            cell_ids if cell_ids else list(range(len(fastq_pairs))),
+            os.path.join(out_dir, BATCH_FILENAME),
         )
         results.update(batch_result)
 
-        kallisto_pseudo(
+        pseudo_result = kallisto_pseudo(
             batch_result['batch'], index_paths[0], out_dir, threads=threads
         )
     else:
@@ -1326,16 +1367,22 @@ def count_smartseq(
         )
     results.update(pseudo_result)
 
+    # Manually write genes.txt
+    # NOTE: there will be duplicated genes
+    genes_path = os.path.join(out_dir, GENES_FILENAME)
+    results['genes'] = convert_transcripts_to_genes(
+        pseudo_result['txnames'], t2g_path, genes_path
+    )
+
     # Convert outputs.
     if loom or h5ad:
         results.update(
             convert_matrix(
                 out_dir,
                 pseudo_result['mtx'],
                 pseudo_result['cells'],
-                pseudo_result['txnames'],
+                results['genes'],
                 t2g_path=t2g_path,
-                name=TRANSCRIPT_NAME,
                 loom=loom,
                 h5ad=h5ad,
                 threads=threads

diff --git a/kb_python/fasta.py b/kb_python/fasta.py
diff --git a/kb_python/main.py b/kb_python/main.py
@@ -4,6 +4,7 @@
 import os
 import sys
 import textwrap
+import warnings
 
 from . import __version__
 from .config import (
@@ -24,6 +25,7 @@
     get_bustools_version,
     get_kallisto_version,
     make_directory,
+    open_as_text,
     remove_directory,
 )
 
@@ -189,7 +191,7 @@ def parse_count(parser, args, temp_dir='tmp'):
         # Smartseq can not be used with lamanno or nucleus.
         if args.x.upper() == 'SMARTSEQ':
             parser.error(
-                f'Technology {args.x} can not be used with workflow {args.workflow}.'
+                f'Technology `{args.x}` can not be used with workflow {args.workflow}.'
             )
 
         count_velocity(
@@ -224,7 +226,7 @@ def parse_count(parser, args, temp_dir='tmp'):
         # Smart-seq
         if args.x.upper() == 'SMARTSEQ':
             if args.dry_run:
-                parser.error(f'Technology {args.x} does not support dry run.')
+                parser.error(f'Technology `{args.x}` does not support dry run.')
 
             # Check for ignored arguments. (i.e. arguments either not supported or
             # not yet implemented)
@@ -234,23 +236,62 @@ def parse_count(parser, args, temp_dir='tmp'):
             for arg in ignored:
                 if getattr(args, arg):
                     logger.warning(
-                        f'Argument `{arg}` is not supported for technology {args.x}. This argument will be ignored.'
+                        f'Argument `{arg}` is not supported for technology `{args.x}`. This argument will be ignored.'
                     )
 
-            # Allow glob notation for fastqs.
-            fastqs = []
-            for expr in args.fastqs:
-                fastqs.extend(glob.glob(expr))
-            args.fastqs = sorted(list(set(fastqs)))
+            # Check if batch TSV was provided.
+            batch_path = None
+            if len(args.fastqs) == 1:
+                try:
+                    with open_as_text(args.fastqs[0], 'r') as f:
+                        if not f.readline().startswith('@'):
+                            batch_path = args.fastqs[0]
+                except Exception:
+                    pass
+
+            cells = {}
+            if batch_path:
+                with open(batch_path, 'r') as f:
+                    for line in f:
+                        if line.isspace() or line.startswith('#'):
+                            continue
+                        cell_id, fastq_1, fastq_2 = line.strip().split('\t')
+                        if cell_id in cells:
+                            parser.error(
+                                f'Found duplicate cell ID {cell_id} in {batch_path}.'
+                            )
+                        cells[cell_id] = (fastq_1, fastq_2)
+            else:
+                # Allow glob notation for fastqs.
+                fastqs = []
+                for expr in args.fastqs:
+                    fastqs.extend(glob.glob(expr))
+                if len(fastqs) % 2:
+                    logger.warning(f'{len(fastqs)} FASTQs found. ')
+                fastqs = sorted(set(fastqs))
+                for cell_id, i in enumerate(range(0, len(fastqs), 2)):
+                    fastq_1, fastq_2 = fastqs[i], (
+                        fastqs[i + 1] if i + 1 < len(fastqs) else ''
+                    )
+                    cells[cell_id] = (fastq_2, fastq_2)
             logger.info('Found the following FASTQs:')
-            for fastq in args.fastqs:
-                logger.info(' ' * 8 + fastq)
+            fastq_pairs = []
+            cell_ids = []
+            for cell_id, (fastq_1, fastq_2) in cells.items():
+                logger.info(f'        {cell_id}    {fastq_1}  {fastq_2}')
+                cell_ids.append(cell_id)
+                fastq_pairs.append((fastq_1, fastq_2))
+                if not fastq_1 or not fastq_2:
+                    parser.error(
+                        f'Single-end reads are currently not supported with technology `{args.x}`.'
+                    )
             count_smartseq(
                 args.i,
                 args.g,
                 args.x,
                 args.o,
-                args.fastqs,
+                fastq_pairs,
+                cell_ids=cell_ids,
                 threads=args.t,
                 memory=args.m,
                 overwrite=args.overwrite,
@@ -662,7 +703,11 @@ def setup_count_args(parser, parent):
     parser_count.add_argument(
         'fastqs',
         help=(
-            'FASTQ files. Glob expressions can be used only with technology `SMARTSEQ`.'
+            'FASTQ files. For technology `SMARTSEQ`, all input FASTQs are '
+            'alphabetically sorted by path and paired in order, and cell IDs '
+            'are assigned as incrementing integers starting from zero. A single '
+            'batch TSV with cell ID, read 1, and read 2 as columns can be '
+            'provided to override this behavior.'
         ),
         nargs='+'
     )
@@ -757,7 +802,12 @@ def main():
 
     # Turn off logging from other packages.
     try:
-        logging.getLogger('anndata').disable(level=logging.CRITICAL)
+        anndata_logger = logging.getLogger('anndata')
+        h5py_logger = logging.getLogger('h5py')
+        anndata_logger.setLevel(logging.CRITICAL + 10)
+        anndata_logger.propagate = False
+        h5py_logger.setLevel(logging.CRITICAL + 10)
+        h5py_logger.propagate = False
     except Exception:
         pass
 
@@ -782,7 +832,9 @@ def main():
     make_directory(temp_dir)
     try:
         logger.debug(args)
-        COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir)
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore')
+            COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir)
     except Exception:
         if is_dry():
             raise