gzip support for reading and writing (#13)

* gzip support for reading and writing * indropsv3 > indrops for compatibility with kallisto
pachterlab · Oct 29, 2019 · b489520 · b489520
1 parent b80b5fc
commit b489520
Show file tree

Hide file tree

Showing 15 changed files with 108 additions and 37 deletions.
diff --git a/kb_python/config.py b/kb_python/config.py
@@ -11,30 +11,50 @@
 # Technology to file position mapping
 Technology = namedtuple(
     'Technology', [
-        'name', 'description', 'nfiles', 'seq_pos', 'umi_pos',
-        'whitelist_archive', 'whitelist_filename'
+        'name', 'description', 'nfiles', 'reads_file', 'umi_positions',
+        'barcode_positions', 'whitelist_archive'
     ]
 )
 WHITELIST_DIR = 'whitelists'
 TECHNOLOGIES = [
     Technology(
-        '10XV1', '10x version 1', 3, 0, 1, '10xv1_whitelist.tar.gz',
-        '10xv1_whitelist.txt'
+        '10XV1',
+        '10x version 1',
+        3,
+        0,
+        [(1, 0, 0)],
+        [(2, 0, 0)],
+        '10xv1_whitelist.txt.gz',
     ),
     Technology(
-        '10XV2', '10x version 2', 2, 1, 0, '10xv2_whitelist.tar.gz',
-        '10xv2_whitelist.txt'
+        '10XV2', '10x version 2', 2, 1, [(0, 16, 26)], [(0, 0, 16)],
+        '10xv2_whitelist.txt.gz'
     ),
     Technology(
-        '10XV3', '10x version 3', 2, 1, 0, '10xv3_whitelist.tar.gz',
-        '10xv3_whitelist.txt'
+        '10XV3', '10x version 3', 2, 1, [(0, 16, 28)], [(0, 0, 16)],
+        '10xv3_whitelist.txt.gz'
+    ),
+    Technology('CELSEQ', 'CEL-Seq', 2, 1, [(0, 8, 12)], [(0, 0, 8)], None),
+    Technology(
+        'CELSEQ2', 'CEL-SEQ version 2', 2, 1, [(0, 0, 6)], [(0, 6, 12)], None
+    ),
+    Technology('DROPSEQ', 'DropSeq', 2, 1, [(0, 12, 20)], [(0, 0, 12)], None),
+    Technology(
+        'INDROPS',
+        'inDrops version 3',
+        2,
+        1,
+        [(0, 42, 48)],
+        [(0, 0, 11), (0, 30, 38)],
+        'inDropsv3_whitelist.txt.gz',
+    ),
+    Technology('SCRUBSEQ', 'SCRB-Seq', 2, 1, [(0, 6, 16)], [(0, 0, 6)], None),
+    Technology(
+        'SURECELL', 'SureCell for ddSEQ', 2, 1, [(0, 51, 59)], [(0, 0, 6),
+                                                                (0, 21, 27),
+                                                                (0, 42, 48)],
+        None
     ),
-    Technology('CELSEQ', 'CEL-Seq', 2, 1, 0, None, None),
-    Technology('CELSEQ2', 'CEL-SEQ version 2', 2, 1, 0, None, None),
-    Technology('DROPSEQ', 'DropSeq', 2, 1, 0, None, None),
-    Technology('INDROPS', 'inDrops', 2, 1, 0, None, None),
-    Technology('SCRUBSEQ', 'SCRB-Seq', 2, 1, 0, None, None),
-    Technology('SURECELL', 'SureCell for ddSEQ', 2, 1, 0, None, None),
 ]
 TECHNOLOGIES_MAPPING = {t.name: t for t in TECHNOLOGIES}
 

diff --git a/kb_python/count.py b/kb_python/count.py
@@ -50,7 +50,9 @@ def kallisto_bus(fastqs, index_path, technology, out_dir, threads=8):
     :return: dictionary containing path to generated index
     :rtype: dict
     """
-    logger.info('Generating BUS file from {}'.format(' '.join(fastqs)))
+    logger.info('Generating BUS file from')
+    for fastq in fastqs:
+        logger.info((' ' * 8) + fastq)
     command = [get_kallisto_binary_path(), 'bus']
     command += ['-i', index_path]
     command += ['-o', out_dir]

diff --git a/kb_python/fasta.py b/kb_python/fasta.py
@@ -1,5 +1,7 @@
 import logging
+
 from .gtf import GTF
+from .utils import open_as_text
 
 logger = logging.getLogger(__name__)
 
@@ -74,7 +76,7 @@ def entries(self):
         :return: a generator that yields a tuple of the FASTA entry
         :rtype: generator
         """
-        with open(self.fasta_path, 'r') as f:
+        with open_as_text(self.fasta_path, 'r') as f:
             sequence_id = None
             sequence = ''
             for line in f:
@@ -97,7 +99,7 @@ def sort(self, out_path):
         :type out_path: str
         """
         to_sort = []
-        with open(self.fasta_path, 'r') as f:
+        with open_as_text(self.fasta_path, 'r') as f:
             position = 0
             line = f.readline()
 
@@ -114,7 +116,8 @@ def sort(self, out_path):
         to_sort.sort()
 
         logger.debug('Writing sorted FASTA {}'.format(out_path))
-        with open(self.fasta_path, 'r') as fasta, open(out_path, 'w') as f:
+        with open_as_text(self.fasta_path,
+                          'r') as fasta, open_as_text(out_path, 'w') as f:
             for tup in to_sort:
                 start_position = tup[1]
                 end_position = tup[2]
@@ -148,7 +151,7 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
     gtf = GTF(gtf_path)
     gtf_entries = gtf.entries()
 
-    with open(out_path, 'w') as f:
+    with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
         for sequence_id, sequence in fasta.entries():
             logger.debug(
@@ -257,7 +260,7 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
     gtf = GTF(gtf_path)
     gtf_entries = gtf.entries()
 
-    with open(out_path, 'w') as f:
+    with open_as_text(out_path, 'w') as f:
         previous_gtf_entry = None
         for sequence_id, sequence in fasta.entries():
             logger.debug(

diff --git a/kb_python/gtf.py b/kb_python/gtf.py
@@ -1,6 +1,8 @@
 import logging
 import re
 
+from .utils import open_as_text
+
 logger = logging.getLogger(__name__)
 
 
@@ -57,7 +59,7 @@ def entries(self):
         :return: a generator that yields a dict of the GTF entry
         :rtype: generator
         """
-        with open(self.gtf_path, 'r') as f:
+        with open_as_text(self.gtf_path, 'r') as f:
             for line in f:
                 if line.startswith('#') or line.isspace():
                     continue
@@ -71,7 +73,7 @@ def sort(self, out_path):
         :type out_path: str
         """
         to_sort = []
-        with open(self.gtf_path, 'r') as f:
+        with open_as_text(self.gtf_path, 'r') as f:
             position = 0
             line = f.readline()
             while line:
@@ -89,7 +91,8 @@ def sort(self, out_path):
         to_sort.sort()
 
         logger.debug('Writing sorted GTF {}'.format(out_path))
-        with open(self.gtf_path, 'r') as gtf, open(out_path, 'w') as f:
+        with open_as_text(self.gtf_path, 'r') as gtf, open_as_text(out_path,
+                                                                   'w') as f:
             for tup in to_sort:
                 position = tup[2]
                 gtf.seek(position)

diff --git a/kb_python/main.py b/kb_python/main.py
@@ -41,15 +41,19 @@ def display_technologies():
     a whitelist for that technology and the FASTQ argument order for kb count.
     """
     headers = [
-        'name', 'description', 'whitelist provided', 'fastq order for "count"'
+        'name', 'whitelist provided', 'barcode (file #, start, stop)',
+        'umi (file #, start, stop)', 'read file #'
     ]
     rows = [headers]
 
     print('List of supported single-cell technologies\n')
     for t in TECHNOLOGIES:
         row = [
-            t.name, t.description, 'yes' if t.whitelist_archive else '',
-            'seq, umi' if t.seq_pos == 0 else 'umi, seq'
+            t.name,
+            'yes' if t.whitelist_archive else '',
+            ' '.join(str(tup) for tup in t.barcode_positions),
+            ' '.join(str(tup) for tup in t.umi_positions),
+            str(t.reads_file),
         ]
         rows.append(row)
 
@@ -284,7 +288,8 @@ def setup_count_args(parser, parent):
     # count
     parser_count = parser.add_parser(
         'count',
-        description='Generate count matrices from a set of single-cell FASTQ files',  # noqa
+        description=('Generate count matrices from a set of single-cell FASTQ files. '
+                     'Run `kb --list` to view single-cell technology information.'),  # noqa
         help='Generate count matrices from a set of single-cell FASTQ files',
         parents=[parent],
     )
@@ -308,7 +313,7 @@ def setup_count_args(parser, parent):
     required_count.add_argument(
         '-x',
         metavar='TECHNOLOGY',
-        help='Single-cell technology used ("kb --list" to view)',
+        help='Single-cell technology used (`kb --list` to view)',
         type=str,
         required=True
     )
@@ -326,7 +331,7 @@ def setup_count_args(parser, parent):
             'Path to file of whitelisted barcodes to correct to. '
             'If not provided and bustools supports the technology, '
             'a pre-packaged whitelist is used. If not, the bustools '
-            'whitelist command is used. ("kb --list" to view whitelists)'
+            'whitelist command is used. (`kb --list` to view whitelists)'
         ),
         type=str
     )
@@ -468,4 +473,5 @@ def main():
     finally:
         # Always clean temp dir
         if not args.keep_tmp:
+            logger.debug('Removing tmp directory')
             shutil.rmtree(TEMP_DIR, ignore_errors=True)
diff --git a/kb_python/ref.py b/kb_python/ref.py
@@ -22,6 +22,7 @@
 from .gtf import GTF
 from .utils import (
     concatenate_files,
+    open_as_text,
     run_executable,
 )
 
@@ -77,7 +78,7 @@ def create_t2g_from_gtf(gtf_path, t2g_path, intron=False):
     """
     logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path))
     gtf = GTF(gtf_path)
-    with open(t2g_path, 'w') as f:
+    with open_as_text(t2g_path, 'w') as f:
         for entry in gtf.entries():
             if entry['feature'] == 'transcript':
                 transcript_id = entry['group']['transcript_id']
@@ -117,7 +118,7 @@ def create_t2c(fasta_path, t2c_path):
     :rtype: dict
     """
     fasta = FASTA(fasta_path)
-    with open(t2c_path, 'w') as f:
+    with open_as_text(t2c_path, 'w') as f:
         for sequence_id, _ in fasta.entries():
             f.write('{}\n'.format(sequence_id))
     return {'t2c': t2c_path}

diff --git a/kb_python/utils.py b/kb_python/utils.py
@@ -3,7 +3,6 @@
 import os
 import re
 import subprocess as sp
-import tarfile
 import threading
 import time
 from urllib.request import urlretrieve
@@ -36,6 +35,21 @@ class UnmetDependencyException(Exception):
     pass
 
 
+def open_as_text(path, mode):
+    """Open a textfile or gzip file in text mode.
+
+    :param path: path to textfile or gzip
+    :type path: str
+    :param mode: mode to open the file, either `w` for write or `r` for read
+    :type mode: str
+
+    :return: file object
+    :rtype: file object
+    """
+    return gzip.open(path, mode +
+                     't') if path.endswith('.gz') else open(path, mode)
+
+
 def run_executable(
         command,
         stdin=None,
@@ -245,9 +259,13 @@ def copy_whitelist(technology, out_dir):
     archive_path = os.path.join(
         PACKAGE_PATH, WHITELIST_DIR, technology.whitelist_archive
     )
-    with tarfile.open(archive_path, 'r:gz') as f:
-        f.extract(technology.whitelist_filename, path=out_dir)
-    return os.path.join(out_dir, technology.whitelist_filename)
+    whitelist_path = os.path.join(
+        out_dir,
+        os.path.splitext(technology.whitelist_archive)[0]
+    )
+    with open_as_text(archive_path, 'r') as f, open(whitelist_path, 'w') as out:
+        out.write(f.read())
+    return whitelist_path
 
 
 def concatenate_files(*paths, out_path, temp_dir='tmp'):
@@ -267,8 +285,7 @@ def concatenate_files(*paths, out_path, temp_dir='tmp'):
     """
     with open(out_path, 'w') as out:
         for path in paths:
-            with gzip.open(path, 'rt') if path.endswith('.gz') else open(
-                    path, 'r') as f:
+            with open_as_text(path, 'r') as f:
                 for line in f:
                     if not line.isspace():
                         out.write(line.strip() + '\n')

diff --git a/kb_python/whitelists/10xv1_whitelist.tar.gz b/kb_python/whitelists/10xv1_whitelist.tar.gz
diff --git a/kb_python/whitelists/10xv1_whitelist.txt.gz b/kb_python/whitelists/10xv1_whitelist.txt.gz
diff --git a/kb_python/whitelists/10xv2_whitelist.tar.gz b/kb_python/whitelists/10xv2_whitelist.tar.gz
diff --git a/kb_python/whitelists/10xv2_whitelist.txt.gz b/kb_python/whitelists/10xv2_whitelist.txt.gz
diff --git a/kb_python/whitelists/10xv3_whitelist.tar.gz → kb_python/whitelists/10xv3_whitelist.txt.gz b/kb_python/whitelists/10xv3_whitelist.tar.gz → kb_python/whitelists/10xv3_whitelist.txt.gz
diff --git a/kb_python/whitelists/inDropsv3_whitelist.txt.gz b/kb_python/whitelists/inDropsv3_whitelist.txt.gz
diff --git a/tests/mixins.py b/tests/mixins.py
@@ -9,6 +9,7 @@ def setUpClass(cls):
         cls.technology = '10xv2'
         cls.base_dir = os.path.dirname(os.path.abspath(__file__))
         cls.fixtures_dir = os.path.join(cls.base_dir, 'fixtures')
+        cls.test_gz_path = os.path.join(cls.fixtures_dir, 'test.gz')
         cls.small_gtf_path = os.path.join(cls.fixtures_dir, 'small.gtf')
         cls.gtf_path = os.path.join(cls.fixtures_dir, 'mouse_truncated.gtf')
         cls.fasta_path = os.path.join(cls.fixtures_dir, 'mouse_truncated.fasta')

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -13,6 +13,24 @@
 
 class TestUtils(TestMixin, TestCase):
 
+    def test_open_as_text_textfile(self):
+        path = os.path.join(
+            tempfile.gettempdir(), '{}.txt'.format(uuid.uuid4())
+        )
+        with utils.open_as_text(path, 'w') as f:
+            f.write('TESTING')
+        self.assertTrue(os.path.exists(path))
+        with utils.open_as_text(path, 'r') as f:
+            self.assertEqual(f.read(), 'TESTING')
+
+    def test_open_as_text_gzip(self):
+        path = os.path.join(tempfile.gettempdir(), '{}.gz'.format(uuid.uuid4()))
+        with utils.open_as_text(path, 'w') as f:
+            f.write('TESTING')
+        self.assertTrue(os.path.exists(path))
+        with utils.open_as_text(path, 'r') as f:
+            self.assertEqual(f.read(), 'TESTING')
+
     def test_run_executable(self):
         p = utils.run_executable(['echo', 'TEST'], stream=False)
         self.assertEqual(p.stdout.read(), 'TEST\n')