forked from yjzhang/split-seq-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsplit-seq
executable file
·126 lines (103 loc) · 5.6 KB
/
split-seq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# TODO:
# 1. argparse to parse arguments
# Which arguments should there be???
# should we use snakemake?
# Arguments:
# - fastq1 (read1 - actual sequencing read)
# - fastq2 (read2 - header)
# - output_dir
# algorithm steps:
# 1. convert fastq to unaligned bam
# 2. copy header
# 3. trim read2 to UMI-BC3-BC2-BC1
# 4.
import argparse
from split_seq import tools, tools_debug, process, processv2, analysis
import datetime
parser = argparse.ArgumentParser()
parser.add_argument('mode', help="""Mode: one of "all", "preproc", "star", or "postproc".
"all" runs the entire pipeline.
"preproc" or "preprocess" runs all the steps prior to running STAR, producing a file called single_cells_barcoded_head.fastq in output_dir.
"star" assumes that output of preproc exists in output_dir, and tries to run the STAR alignment, producing a file called single_cells_barcoded_headAligned.out.sam.
"postproc" assumes that the output of star exists in output_dir.
""")
parser.add_argument('--fq1', help='fastq1 - mRNA reads')
parser.add_argument('--fq2', help='fastq2 - reads contain UMI and barcodes')
parser.add_argument('--output_dir', help='output dir')
parser.add_argument('--chemistry', default='v1', help='Using v1 or v2 chemistry')
parser.add_argument('--genome_dir', default='./', help='path containing reference genome')
parser.add_argument('--sample', action='append', nargs=2, metavar=('SAMPLE_NAME','WELLS'),
help='Add a sample_name followed by the range of wells. A1:B6 specifies A1:A6,B1:B6,C1:C6. A1-B6 specifies A1-A12,B1-6. Multiple selections can be made by adding a comma (no space), e.g. A1-A6,B1-B3')
parser.add_argument('--bc_edit_dist', default='2', help='Maximum edit distance that will be corrected for barcodes in each round')
# TODO: this is currently unused
#parser.add_argument('--star_path', default='./', help='path containing STAR')
parser.add_argument('--genome', nargs='*', help='name(s) of genome(s)/species')
parser.add_argument('--genes', nargs='*', help='GTF file(s) with gene annotations')
parser.add_argument('--fasta', nargs='*', help='fasta file for genome(s)')
parser.add_argument('--splicing',default='True', help='whether genome has splicing')
parser.add_argument('--genomeSAindexNbases',default='14', help='set this to min(14, floor(log2(GenomeLength)/2 - 1))')
parser.add_argument('--nthreads', default='4', help='number of threads to use')
parser.add_argument('--sublibraries',nargs='*',help='paths to output directories of each sublibrary')
args = parser.parse_args()
print(args)
print(args.mode)
print(args.fq1)
print(args.fq2)
print(args.output_dir)
print(args.chemistry)
print(args.genome_dir)
print(args.sublibraries)
mode = args.mode.lower()
if mode == 'mkref':
# Generate genome
tools.make_combined_genome(args.genome, args.fasta, args.output_dir)
# Make a combine annotation from GTF
tools.make_gtf_annotations(args.genome, args.genes, args.output_dir, args.splicing)
# Index genome with star
tools.generate_STAR_index(args.output_dir, args.nthreads, args.genomeSAindexNbases, args.splicing)
if mode == 'debug':
# Add barcode and UMI to header of read1 fastq
print(datetime.datetime.now(),'Correcting barcodes...')
tools_debug.preprocess_fastq(args.fq1, args.fq2, args.output_dir, args.chemistry)
if mode == 'all':
# Check that user specified valid samples
valid_samples = analysis.check_valid_samples(args.sample)
if valid_samples:
# Add barcode and UMI to header of read1 fastq
print(datetime.datetime.now(),'Correcting barcodes...')
tools.preprocess_fastq(args.fq1, args.fq2, args.output_dir, args.chemistry, args.bc_edit_dist)
# Align with STAR
tools.run_star(args.genome_dir, args.output_dir, args.nthreads)
print(datetime.datetime.now(),'Sorting aligned bamfile...')
# Sort the output samfile
tools.sort_sam(args.output_dir, args.nthreads)
print(datetime.datetime.now(),'Getting molecular info...')
# Get molecular information for each UMI
processv2.molecule_info(args.genome_dir, args.output_dir, args.nthreads)
print(datetime.datetime.now(),'Generating digital gene expression matrix...')
analysis.generate_all_dge_reports(args.output_dir,args.genome_dir,args.chemistry,args.sample)
print(datetime.datetime.now(),'Finished successfully')
else:
print('Samples were entered with incorrect format. Use split-seq -h for more information')
if mode == 'preproc' or mode == 'preprocess' or mode == 'preprocessing':
tools.preprocess_fastq(args.fq1, args.fq2, args.output_dir, args.chemistry)
if mode == 'star':
tools.run_star(args.genome_dir, args.output_dir, args.nthreads)
if mode == 'postproc' or mode == 'postprocess' or mode == 'postprocessing':
tools.sort_sam(args.output_dir, args.nthreads)
if mode == 'molinfo':
#process.molecule_info(args.gtf_file, args.output_dir, args.nthreads)
processv2.molecule_info(args.genome_dir, args.output_dir, args.nthreads)
if mode=='dge':
valid_samples = analysis.check_valid_samples(args.sample)
if valid_samples:
analysis.generate_all_dge_reports(args.output_dir,args.genome_dir,args.chemistry,args.sample)
else:
print('Samples were entered with incorrect format. Use split-seq -h for more information')
if mode == 'combine':
valid_samples = analysis.check_valid_samples(args.sample)
if valid_samples:
analysis.generate_all_dge_reports(args.output_dir, args.genome_dir, args.chemistry, args.sample, sublibraries=args.sublibraries)
else:
print('Samples were entered with incorrect format. Use split-seq -h for more information')