Skip to content

Commit

Permalink
Merge pull request #94 from pachterlab/devel
Browse files Browse the repository at this point in the history
Merge devel into master
  • Loading branch information
Lioscro authored Jan 8, 2021
2 parents b997e0d + edb46ba commit 0b48389
Show file tree
Hide file tree
Showing 13 changed files with 331 additions and 71 deletions.
1 change: 1 addition & 0 deletions kb_python/constants.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
ABUNDANCE_FILENAME = 'matrix.abundance.mtx'
CELLS_FILENAME = 'matrix.cells'
GENE_DIR = 'counts_gene'
GENES_FILENAME = 'genes.txt'

# File codes.
# These are appended to the filename whenever it undergoes some kind of
Expand Down
77 changes: 62 additions & 15 deletions kb_python/count.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
FILTERED_CODE,
FILTERED_COUNTS_DIR,
GENE_NAME,
GENES_FILENAME,
INSPECT_FILENAME,
KALLISTO_INFO_FILENAME,
KB_INFO_FILENAME,
Expand All @@ -39,7 +40,6 @@
REPORT_NOTEBOOK_FILENAME,
SORT_CODE,
TCC_PREFIX,
TRANSCRIPT_NAME,
TXNAMES_FILENAME,
UNFILTERED_CODE,
UNFILTERED_COUNTS_DIR,
Expand All @@ -54,7 +54,9 @@
import_tcc_matrix_as_anndata,
make_directory,
move_file,
open_as_text,
overlay_anndatas,
read_t2g,
run_executable,
stream_file,
sum_anndatas,
Expand Down Expand Up @@ -564,24 +566,24 @@ def bustools_whitelist(bus_path, out_path):
return {'whitelist': out_path}


def write_smartseq_batch(pairs_1, pairs_2, out_path):
def write_smartseq_batch(fastq_pairs, cell_ids, out_path):
"""Write a 3-column TSV specifying batch information for Smart-seq reads.
This file is required to use `kallisto pseudo` on multiple samples (= cells).
:param pairs_1: list of paths to FASTQs corresponding to pair 1
:type pairs_1: list
:param pairs_2: list of paths to FASTQS corresponding to pair 2
:type pairs_2: list
:param fastq_pairs: list of pairs of FASTQs
:type fastq_pairs: list
:param cell_ids: list of cell IDs
:type cell_ids: list
:param out_path: path to batch file to output
:type out_path: str
:return: dictionary of written batch file
:rtype: dict
"""
logger.info(f'Writing batch definition textfile to {out_path}')
logger.info(f'Writing batch definition TSV to {out_path}')
with open(out_path, 'w') as f:
for i, (read_1, read_2) in enumerate(zip(pairs_1, pairs_2)):
f.write(f'{i}\t{read_1}\t{read_2}\n')
for cell_id, (fastq_1, fastq_2) in zip(cell_ids, fastq_pairs):
f.write(f'{cell_id}\t{fastq_1}\t{fastq_2}\n')
return {'batch': out_path}


Expand Down Expand Up @@ -977,6 +979,41 @@ def copy_or_create_whitelist(technology, bus_path, out_dir):
)['whitelist']


def convert_transcripts_to_genes(txnames_path, t2g_path, genes_path):
"""Convert a textfile containing transcript IDs to another textfile containing
gene IDs, given a transcript-to-gene mapping.
:param txnames_path: path to transcripts.txt
:type txnames_path: str
:param t2g_path: path to transcript-to-genes mapping
:type t2g_path: str
:param genes_path: path to output genes.txt
:type genes_path: str
:return: path to written genes.txt
:rtype: str
"""
t2g = read_t2g(t2g_path)
with open_as_text(txnames_path, 'r') as f, open_as_text(genes_path,
'w') as out:
for line in f:
if line.isspace():
continue
transcript = line.strip()
if transcript not in t2g:
logger.warning(
f'Transcript {transcript} was found in {txnames_path} but not in {t2g_path}. '
'This transcript will not be converted to a gene.'
)
attributes = t2g.get(transcript)

if attributes:
out.write(f'{attributes[0]}\n')
else:
out.write(f'{transcript}\n')
return genes_path


def count(
index_paths,
t2g_path,
Expand Down Expand Up @@ -1270,7 +1307,8 @@ def count_smartseq(
t2g_path,
technology,
out_dir,
fastqs,
fastq_pairs,
cell_ids=None,
temp_dir='tmp',
threads=8,
memory='4G',
Expand All @@ -1292,7 +1330,8 @@ def count_smartseq(

# Smart-seq does not support fastq streaming.
if any(urlparse(fastq).scheme in ('http', 'https', 'ftp', 'ftps')
for fastq in fastqs):
for pair in fastq_pairs
for fastq in pair):
raise Exception(
f'Technology {technology} does not support FASTQ streaming.'
)
Expand All @@ -1313,11 +1352,13 @@ def count_smartseq(
for name, path in pseudo_result.items()) or overwrite:
# Write batch information.
batch_result = write_smartseq_batch(
fastqs[::2], fastqs[1::2], os.path.join(out_dir, BATCH_FILENAME)
fastq_pairs,
cell_ids if cell_ids else list(range(len(fastq_pairs))),
os.path.join(out_dir, BATCH_FILENAME),
)
results.update(batch_result)

kallisto_pseudo(
pseudo_result = kallisto_pseudo(
batch_result['batch'], index_paths[0], out_dir, threads=threads
)
else:
Expand All @@ -1326,16 +1367,22 @@ def count_smartseq(
)
results.update(pseudo_result)

# Manually write genes.txt
# NOTE: there will be duplicated genes
genes_path = os.path.join(out_dir, GENES_FILENAME)
results['genes'] = convert_transcripts_to_genes(
pseudo_result['txnames'], t2g_path, genes_path
)

# Convert outputs.
if loom or h5ad:
results.update(
convert_matrix(
out_dir,
pseudo_result['mtx'],
pseudo_result['cells'],
pseudo_result['txnames'],
results['genes'],
t2g_path=t2g_path,
name=TRANSCRIPT_NAME,
loom=loom,
h5ad=h5ad,
threads=threads
Expand Down
Empty file modified kb_python/fasta.py
100644 → 100755
Empty file.
80 changes: 66 additions & 14 deletions kb_python/main.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import sys
import textwrap
import warnings

from . import __version__
from .config import (
Expand All @@ -24,6 +25,7 @@
get_bustools_version,
get_kallisto_version,
make_directory,
open_as_text,
remove_directory,
)

Expand Down Expand Up @@ -189,7 +191,7 @@ def parse_count(parser, args, temp_dir='tmp'):
# Smartseq can not be used with lamanno or nucleus.
if args.x.upper() == 'SMARTSEQ':
parser.error(
f'Technology {args.x} can not be used with workflow {args.workflow}.'
f'Technology `{args.x}` can not be used with workflow {args.workflow}.'
)

count_velocity(
Expand Down Expand Up @@ -224,7 +226,7 @@ def parse_count(parser, args, temp_dir='tmp'):
# Smart-seq
if args.x.upper() == 'SMARTSEQ':
if args.dry_run:
parser.error(f'Technology {args.x} does not support dry run.')
parser.error(f'Technology `{args.x}` does not support dry run.')

# Check for ignored arguments. (i.e. arguments either not supported or
# not yet implemented)
Expand All @@ -234,23 +236,62 @@ def parse_count(parser, args, temp_dir='tmp'):
for arg in ignored:
if getattr(args, arg):
logger.warning(
f'Argument `{arg}` is not supported for technology {args.x}. This argument will be ignored.'
f'Argument `{arg}` is not supported for technology `{args.x}`. This argument will be ignored.'
)

# Allow glob notation for fastqs.
fastqs = []
for expr in args.fastqs:
fastqs.extend(glob.glob(expr))
args.fastqs = sorted(list(set(fastqs)))
# Check if batch TSV was provided.
batch_path = None
if len(args.fastqs) == 1:
try:
with open_as_text(args.fastqs[0], 'r') as f:
if not f.readline().startswith('@'):
batch_path = args.fastqs[0]
except Exception:
pass

cells = {}
if batch_path:
with open(batch_path, 'r') as f:
for line in f:
if line.isspace() or line.startswith('#'):
continue
cell_id, fastq_1, fastq_2 = line.strip().split('\t')
if cell_id in cells:
parser.error(
f'Found duplicate cell ID {cell_id} in {batch_path}.'
)
cells[cell_id] = (fastq_1, fastq_2)
else:
# Allow glob notation for fastqs.
fastqs = []
for expr in args.fastqs:
fastqs.extend(glob.glob(expr))
if len(fastqs) % 2:
logger.warning(f'{len(fastqs)} FASTQs found. ')
fastqs = sorted(set(fastqs))
for cell_id, i in enumerate(range(0, len(fastqs), 2)):
fastq_1, fastq_2 = fastqs[i], (
fastqs[i + 1] if i + 1 < len(fastqs) else ''
)
cells[cell_id] = (fastq_2, fastq_2)
logger.info('Found the following FASTQs:')
for fastq in args.fastqs:
logger.info(' ' * 8 + fastq)
fastq_pairs = []
cell_ids = []
for cell_id, (fastq_1, fastq_2) in cells.items():
logger.info(f' {cell_id} {fastq_1} {fastq_2}')
cell_ids.append(cell_id)
fastq_pairs.append((fastq_1, fastq_2))
if not fastq_1 or not fastq_2:
parser.error(
f'Single-end reads are currently not supported with technology `{args.x}`.'
)
count_smartseq(
args.i,
args.g,
args.x,
args.o,
args.fastqs,
fastq_pairs,
cell_ids=cell_ids,
threads=args.t,
memory=args.m,
overwrite=args.overwrite,
Expand Down Expand Up @@ -662,7 +703,11 @@ def setup_count_args(parser, parent):
parser_count.add_argument(
'fastqs',
help=(
'FASTQ files. Glob expressions can be used only with technology `SMARTSEQ`.'
'FASTQ files. For technology `SMARTSEQ`, all input FASTQs are '
'alphabetically sorted by path and paired in order, and cell IDs '
'are assigned as incrementing integers starting from zero. A single '
'batch TSV with cell ID, read 1, and read 2 as columns can be '
'provided to override this behavior.'
),
nargs='+'
)
Expand Down Expand Up @@ -757,7 +802,12 @@ def main():

# Turn off logging from other packages.
try:
logging.getLogger('anndata').disable(level=logging.CRITICAL)
anndata_logger = logging.getLogger('anndata')
h5py_logger = logging.getLogger('h5py')
anndata_logger.setLevel(logging.CRITICAL + 10)
anndata_logger.propagate = False
h5py_logger.setLevel(logging.CRITICAL + 10)
h5py_logger.propagate = False
except Exception:
pass

Expand All @@ -782,7 +832,9 @@ def main():
make_directory(temp_dir)
try:
logger.debug(args)
COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir)
with warnings.catch_warnings():
warnings.simplefilter('ignore')
COMMAND_TO_FUNCTION[args.command](parser, args, temp_dir=temp_dir)
except Exception:
if is_dry():
raise
Expand Down
Loading

0 comments on commit 0b48389

Please sign in to comment.