Skip to content

Commit

Permalink
gzip support for reading and writing (#13)
Browse files Browse the repository at this point in the history
* gzip support for reading and writing

* indropsv3 > indrops for compatibility with kallisto
  • Loading branch information
Lioscro authored Oct 29, 2019
1 parent b80b5fc commit b489520
Show file tree
Hide file tree
Showing 15 changed files with 108 additions and 37 deletions.
48 changes: 34 additions & 14 deletions kb_python/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,50 @@
# Technology to file position mapping
Technology = namedtuple(
'Technology', [
'name', 'description', 'nfiles', 'seq_pos', 'umi_pos',
'whitelist_archive', 'whitelist_filename'
'name', 'description', 'nfiles', 'reads_file', 'umi_positions',
'barcode_positions', 'whitelist_archive'
]
)
WHITELIST_DIR = 'whitelists'
TECHNOLOGIES = [
Technology(
'10XV1', '10x version 1', 3, 0, 1, '10xv1_whitelist.tar.gz',
'10xv1_whitelist.txt'
'10XV1',
'10x version 1',
3,
0,
[(1, 0, 0)],
[(2, 0, 0)],
'10xv1_whitelist.txt.gz',
),
Technology(
'10XV2', '10x version 2', 2, 1, 0, '10xv2_whitelist.tar.gz',
'10xv2_whitelist.txt'
'10XV2', '10x version 2', 2, 1, [(0, 16, 26)], [(0, 0, 16)],
'10xv2_whitelist.txt.gz'
),
Technology(
'10XV3', '10x version 3', 2, 1, 0, '10xv3_whitelist.tar.gz',
'10xv3_whitelist.txt'
'10XV3', '10x version 3', 2, 1, [(0, 16, 28)], [(0, 0, 16)],
'10xv3_whitelist.txt.gz'
),
Technology('CELSEQ', 'CEL-Seq', 2, 1, [(0, 8, 12)], [(0, 0, 8)], None),
Technology(
'CELSEQ2', 'CEL-SEQ version 2', 2, 1, [(0, 0, 6)], [(0, 6, 12)], None
),
Technology('DROPSEQ', 'DropSeq', 2, 1, [(0, 12, 20)], [(0, 0, 12)], None),
Technology(
'INDROPS',
'inDrops version 3',
2,
1,
[(0, 42, 48)],
[(0, 0, 11), (0, 30, 38)],
'inDropsv3_whitelist.txt.gz',
),
Technology('SCRUBSEQ', 'SCRB-Seq', 2, 1, [(0, 6, 16)], [(0, 0, 6)], None),
Technology(
'SURECELL', 'SureCell for ddSEQ', 2, 1, [(0, 51, 59)], [(0, 0, 6),
(0, 21, 27),
(0, 42, 48)],
None
),
Technology('CELSEQ', 'CEL-Seq', 2, 1, 0, None, None),
Technology('CELSEQ2', 'CEL-SEQ version 2', 2, 1, 0, None, None),
Technology('DROPSEQ', 'DropSeq', 2, 1, 0, None, None),
Technology('INDROPS', 'inDrops', 2, 1, 0, None, None),
Technology('SCRUBSEQ', 'SCRB-Seq', 2, 1, 0, None, None),
Technology('SURECELL', 'SureCell for ddSEQ', 2, 1, 0, None, None),
]
TECHNOLOGIES_MAPPING = {t.name: t for t in TECHNOLOGIES}

Expand Down
4 changes: 3 additions & 1 deletion kb_python/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ def kallisto_bus(fastqs, index_path, technology, out_dir, threads=8):
:return: dictionary containing path to generated index
:rtype: dict
"""
logger.info('Generating BUS file from {}'.format(' '.join(fastqs)))
logger.info('Generating BUS file from')
for fastq in fastqs:
logger.info((' ' * 8) + fastq)
command = [get_kallisto_binary_path(), 'bus']
command += ['-i', index_path]
command += ['-o', out_dir]
Expand Down
13 changes: 8 additions & 5 deletions kb_python/fasta.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging

from .gtf import GTF
from .utils import open_as_text

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -74,7 +76,7 @@ def entries(self):
:return: a generator that yields a tuple of the FASTA entry
:rtype: generator
"""
with open(self.fasta_path, 'r') as f:
with open_as_text(self.fasta_path, 'r') as f:
sequence_id = None
sequence = ''
for line in f:
Expand All @@ -97,7 +99,7 @@ def sort(self, out_path):
:type out_path: str
"""
to_sort = []
with open(self.fasta_path, 'r') as f:
with open_as_text(self.fasta_path, 'r') as f:
position = 0
line = f.readline()

Expand All @@ -114,7 +116,8 @@ def sort(self, out_path):
to_sort.sort()

logger.debug('Writing sorted FASTA {}'.format(out_path))
with open(self.fasta_path, 'r') as fasta, open(out_path, 'w') as f:
with open_as_text(self.fasta_path,
'r') as fasta, open_as_text(out_path, 'w') as f:
for tup in to_sort:
start_position = tup[1]
end_position = tup[2]
Expand Down Expand Up @@ -148,7 +151,7 @@ def generate_cdna_fasta(fasta_path, gtf_path, out_path):
gtf = GTF(gtf_path)
gtf_entries = gtf.entries()

with open(out_path, 'w') as f:
with open_as_text(out_path, 'w') as f:
previous_gtf_entry = None
for sequence_id, sequence in fasta.entries():
logger.debug(
Expand Down Expand Up @@ -257,7 +260,7 @@ def generate_intron_fasta(fasta_path, gtf_path, out_path):
gtf = GTF(gtf_path)
gtf_entries = gtf.entries()

with open(out_path, 'w') as f:
with open_as_text(out_path, 'w') as f:
previous_gtf_entry = None
for sequence_id, sequence in fasta.entries():
logger.debug(
Expand Down
9 changes: 6 additions & 3 deletions kb_python/gtf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import re

from .utils import open_as_text

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -57,7 +59,7 @@ def entries(self):
:return: a generator that yields a dict of the GTF entry
:rtype: generator
"""
with open(self.gtf_path, 'r') as f:
with open_as_text(self.gtf_path, 'r') as f:
for line in f:
if line.startswith('#') or line.isspace():
continue
Expand All @@ -71,7 +73,7 @@ def sort(self, out_path):
:type out_path: str
"""
to_sort = []
with open(self.gtf_path, 'r') as f:
with open_as_text(self.gtf_path, 'r') as f:
position = 0
line = f.readline()
while line:
Expand All @@ -89,7 +91,8 @@ def sort(self, out_path):
to_sort.sort()

logger.debug('Writing sorted GTF {}'.format(out_path))
with open(self.gtf_path, 'r') as gtf, open(out_path, 'w') as f:
with open_as_text(self.gtf_path, 'r') as gtf, open_as_text(out_path,
'w') as f:
for tup in to_sort:
position = tup[2]
gtf.seek(position)
Expand Down
18 changes: 12 additions & 6 deletions kb_python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,19 @@ def display_technologies():
a whitelist for that technology and the FASTQ argument order for kb count.
"""
headers = [
'name', 'description', 'whitelist provided', 'fastq order for "count"'
'name', 'whitelist provided', 'barcode (file #, start, stop)',
'umi (file #, start, stop)', 'read file #'
]
rows = [headers]

print('List of supported single-cell technologies\n')
for t in TECHNOLOGIES:
row = [
t.name, t.description, 'yes' if t.whitelist_archive else '',
'seq, umi' if t.seq_pos == 0 else 'umi, seq'
t.name,
'yes' if t.whitelist_archive else '',
' '.join(str(tup) for tup in t.barcode_positions),
' '.join(str(tup) for tup in t.umi_positions),
str(t.reads_file),
]
rows.append(row)

Expand Down Expand Up @@ -284,7 +288,8 @@ def setup_count_args(parser, parent):
# count
parser_count = parser.add_parser(
'count',
description='Generate count matrices from a set of single-cell FASTQ files', # noqa
description=('Generate count matrices from a set of single-cell FASTQ files. '
'Run `kb --list` to view single-cell technology information.'), # noqa
help='Generate count matrices from a set of single-cell FASTQ files',
parents=[parent],
)
Expand All @@ -308,7 +313,7 @@ def setup_count_args(parser, parent):
required_count.add_argument(
'-x',
metavar='TECHNOLOGY',
help='Single-cell technology used ("kb --list" to view)',
help='Single-cell technology used (`kb --list` to view)',
type=str,
required=True
)
Expand All @@ -326,7 +331,7 @@ def setup_count_args(parser, parent):
'Path to file of whitelisted barcodes to correct to. '
'If not provided and bustools supports the technology, '
'a pre-packaged whitelist is used. If not, the bustools '
'whitelist command is used. ("kb --list" to view whitelists)'
'whitelist command is used. (`kb --list` to view whitelists)'
),
type=str
)
Expand Down Expand Up @@ -468,4 +473,5 @@ def main():
finally:
# Always clean temp dir
if not args.keep_tmp:
logger.debug('Removing tmp directory')
shutil.rmtree(TEMP_DIR, ignore_errors=True)
5 changes: 3 additions & 2 deletions kb_python/ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .gtf import GTF
from .utils import (
concatenate_files,
open_as_text,
run_executable,
)

Expand Down Expand Up @@ -77,7 +78,7 @@ def create_t2g_from_gtf(gtf_path, t2g_path, intron=False):
"""
logger.info('Creating transcript-to-gene mapping at {}'.format(t2g_path))
gtf = GTF(gtf_path)
with open(t2g_path, 'w') as f:
with open_as_text(t2g_path, 'w') as f:
for entry in gtf.entries():
if entry['feature'] == 'transcript':
transcript_id = entry['group']['transcript_id']
Expand Down Expand Up @@ -117,7 +118,7 @@ def create_t2c(fasta_path, t2c_path):
:rtype: dict
"""
fasta = FASTA(fasta_path)
with open(t2c_path, 'w') as f:
with open_as_text(t2c_path, 'w') as f:
for sequence_id, _ in fasta.entries():
f.write('{}\n'.format(sequence_id))
return {'t2c': t2c_path}
Expand Down
29 changes: 23 additions & 6 deletions kb_python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import re
import subprocess as sp
import tarfile
import threading
import time
from urllib.request import urlretrieve
Expand Down Expand Up @@ -36,6 +35,21 @@ class UnmetDependencyException(Exception):
pass


def open_as_text(path, mode):
"""Open a textfile or gzip file in text mode.
:param path: path to textfile or gzip
:type path: str
:param mode: mode to open the file, either `w` for write or `r` for read
:type mode: str
:return: file object
:rtype: file object
"""
return gzip.open(path, mode +
't') if path.endswith('.gz') else open(path, mode)


def run_executable(
command,
stdin=None,
Expand Down Expand Up @@ -245,9 +259,13 @@ def copy_whitelist(technology, out_dir):
archive_path = os.path.join(
PACKAGE_PATH, WHITELIST_DIR, technology.whitelist_archive
)
with tarfile.open(archive_path, 'r:gz') as f:
f.extract(technology.whitelist_filename, path=out_dir)
return os.path.join(out_dir, technology.whitelist_filename)
whitelist_path = os.path.join(
out_dir,
os.path.splitext(technology.whitelist_archive)[0]
)
with open_as_text(archive_path, 'r') as f, open(whitelist_path, 'w') as out:
out.write(f.read())
return whitelist_path


def concatenate_files(*paths, out_path, temp_dir='tmp'):
Expand All @@ -267,8 +285,7 @@ def concatenate_files(*paths, out_path, temp_dir='tmp'):
"""
with open(out_path, 'w') as out:
for path in paths:
with gzip.open(path, 'rt') if path.endswith('.gz') else open(
path, 'r') as f:
with open_as_text(path, 'r') as f:
for line in f:
if not line.isspace():
out.write(line.strip() + '\n')
Expand Down
Binary file removed kb_python/whitelists/10xv1_whitelist.tar.gz
Binary file not shown.
Binary file added kb_python/whitelists/10xv1_whitelist.txt.gz
Binary file not shown.
Binary file removed kb_python/whitelists/10xv2_whitelist.tar.gz
Binary file not shown.
Binary file added kb_python/whitelists/10xv2_whitelist.txt.gz
Binary file not shown.
Binary file not shown.
Binary file added kb_python/whitelists/inDropsv3_whitelist.txt.gz
Binary file not shown.
1 change: 1 addition & 0 deletions tests/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def setUpClass(cls):
cls.technology = '10xv2'
cls.base_dir = os.path.dirname(os.path.abspath(__file__))
cls.fixtures_dir = os.path.join(cls.base_dir, 'fixtures')
cls.test_gz_path = os.path.join(cls.fixtures_dir, 'test.gz')
cls.small_gtf_path = os.path.join(cls.fixtures_dir, 'small.gtf')
cls.gtf_path = os.path.join(cls.fixtures_dir, 'mouse_truncated.gtf')
cls.fasta_path = os.path.join(cls.fixtures_dir, 'mouse_truncated.fasta')
Expand Down
18 changes: 18 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,24 @@

class TestUtils(TestMixin, TestCase):

def test_open_as_text_textfile(self):
path = os.path.join(
tempfile.gettempdir(), '{}.txt'.format(uuid.uuid4())
)
with utils.open_as_text(path, 'w') as f:
f.write('TESTING')
self.assertTrue(os.path.exists(path))
with utils.open_as_text(path, 'r') as f:
self.assertEqual(f.read(), 'TESTING')

def test_open_as_text_gzip(self):
path = os.path.join(tempfile.gettempdir(), '{}.gz'.format(uuid.uuid4()))
with utils.open_as_text(path, 'w') as f:
f.write('TESTING')
self.assertTrue(os.path.exists(path))
with utils.open_as_text(path, 'r') as f:
self.assertEqual(f.read(), 'TESTING')

def test_run_executable(self):
p = utils.run_executable(['echo', 'TEST'], stream=False)
self.assertEqual(p.stdout.read(), 'TEST\n')
Expand Down

0 comments on commit b489520

Please sign in to comment.