LABRAT_rn6annotation.py
#!/usr/bin/env python
import gffutils
import os
import sys
import itertools
import gzip
import numpy as np
from Bio import SeqIO
import argparse
import subprocess
import pandas as pd
from functools import reduce
from collections import OrderedDict
import math
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
from scipy.stats.distributions import chi2
import warnings
#For every gene, the goal is to define the overall usage of proximal/distal polyA sites. This is done by defining
#a "psi" value for every gene. For each transcript in a gene, the "terminal fragment" (TF) is the last two
#exons of the transcript concatenated together. For each transcript, a position factor (PF) is calculated as m / (n - 1),
#where m is the rank of the transcript's 3' end (0 = most proximal) and n is the number of distinct 3' ends for the gene,
#so PF ranges between 0 and 1.
#The tpm for each TF is multiplied by its PF. This is done for all TFs, and the scaled values are summed and divided
#by the unscaled sum of tpms for all TFs to give psi, which also ranges between 0 (exclusive usage of the proximal polyA site)
#and 1 (exclusive usage of the distal polyA site).
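#A hypothetical worked example of the psi calculation: suppose a gene has three distinct 3' ends (n = 3), so its
#transcripts get position factors of 0, 0.5, and 1, and suppose their TF tpms are 10, 20, and 70. Then
#psi = (10*0 + 20*0.5 + 70*1) / (10 + 20 + 70) = 80 / 100 = 0.8, indicating mostly distal polyA site usage.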
#Does a gene pass filters?
def genefilters(gene, db):
proteincoding = False
if 'protein_coding' in gene.attributes['biotype']:
proteincoding = True
if proteincoding == True:
return True
else:
return False
#Does a transcript pass filters?
def transcriptfilters(transcript, db):
exonnumberpass = False
TFlengthpass = False
proteincoding = False
mrnaendpass = False
#How many exons does it have
if len(list(db.children(transcript, featuretype = 'exon'))) >= 2:
exonnumberpass = True
else:
return False
#What is the length of the terminal fragment
exons = []
if transcript.strand == '+':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start'):
exons.append([exon.start, exon.end + 1])
elif transcript.strand == '-':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start', reverse = True):
exons.append([exon.start, exon.end + 1])
penultimateexonlen = len(range(exons[-2][0], exons[-2][1]))
lastexonlen = len(range(exons[-1][0], exons[-1][1]))
TFlength = penultimateexonlen + lastexonlen
if TFlength > 200:
TFlengthpass = True
#Is this transcript protein coding
if 'protein_coding' in transcript.attributes['biotype']:
proteincoding = True
#Are we confident in the 3' end of this transcript?
if 'tag' not in transcript.attributes or 'mRNA_end_NF' not in transcript.attributes['tag']:
mrnaendpass = True
if exonnumberpass and TFlengthpass and proteincoding and mrnaendpass:
return True
else:
return False
#Given an annotation in gff format, get the position factors for all transcripts.
#Any two transcript ends that are within <lengthfilter> nt of each other are merged into a single end.
#This is so that you don't end up with unique regions that are only a few nt long.
#They might cause issues when it comes to counting kmers or reads that map to a given region.
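#A hypothetical illustration of the merging: with lengthfilter = 25, transcript ends at coordinates 1000, 1010, and 2000
#(ordered upstream to downstream) get m values of 0, 0 (1010 is within 25 nt of 1000), and 1. The gene therefore has
#n = 2 distinct ends, and its transcripts get position factors of 0 and 1.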
def getpositionfactors(gff, lengthfilter):
lengthfilter = int(lengthfilter)
genecount = 0
txends = {} #{ENSMUSG : [strand, [list of distinct transcript end coords]]}
posfactors = {} #{ENSMUSG : {ENSMUST : positionfactor}}
#Make gff database
print('Indexing gff...')
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print('Done indexing!')
#Get number of distinct transcript ends for each gene
genes = db.features_of_type('gene')
for gene in genes:
#Only protein coding genes
passgenefilters = genefilters(gene, db)
if passgenefilters == False:
continue
genename = str(gene.id).replace('gene:', '')
ends = []
if gene.strand == '+':
for transcript in db.children(gene, featuretype = 'mRNA', level = 1, order_by = 'end'):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
if transcript.end not in ends:
ends.append(transcript.end)
elif gene.strand == '-':
for transcript in db.children(gene, featuretype = 'mRNA', level = 1, order_by = 'start', reverse = True):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
if transcript.start not in ends:
ends.append(transcript.start)
if ends: #Sometimes there are no 'transcripts' for a gene, like with pseudogenes, etc.
txends[genename] = [gene.strand, ends]
#Sort transcript end coords
s_txends = {} #{ENSMUSG : [sorted (most upstream to most downstream) tx end coords]}
for gene in txends:
strand = txends[gene][0]
coords = txends[gene][1]
if strand == '+':
sortedcoords = sorted(coords)
elif strand == '-':
sortedcoords = sorted(coords, reverse = True)
s_txends[gene] = sortedcoords
#Get m values (the numerator of the position factor fraction), combining an end that is within <lengthfilter> nt of
#the previous end into the same m value as the previous end
mvalues = {} #{ENSMUSG : {txendcoord : mvalue}}
for gene in s_txends:
mvalues[gene] = {}
currentendcoord = s_txends[gene][0]
currentmvalue = 0
mvalues[gene][currentendcoord] = 0 #the first one has to have m = 0
for endcoord in s_txends[gene][1:]:
#If this endcoord is too close to the last one
if abs(endcoord - currentendcoord) <= lengthfilter:
#this end gets the current m value
mvalues[gene][endcoord] = currentmvalue
#we stay on this m value for the next round
currentmvalue = currentmvalue
#update currentendcoord
currentendcoord = endcoord
#If this endcoord is sufficiently far away from the last one
elif abs(endcoord - currentendcoord) > lengthfilter:
#this end coord gets the next m value
mvalues[gene][endcoord] = currentmvalue + 1
#we move on to the next m value for the next round
currentmvalue = currentmvalue + 1
#update currentendcoord
currentendcoord = endcoord
#Figure out position scaling factor for each transcript (position / (number of total positions - 1)) (m / (n - 1))
genes = db.features_of_type('gene')
for gene in genes:
genecount +=1
genename = str(gene.id).replace('gene:', '')
#Only protein coding genes
passgenefilters = genefilters(gene, db)
if passgenefilters == False:
continue
#If this gene isn't in mvalues or there is only one m value for the entire gene, skip it
if genename not in mvalues:
continue
if len(set(mvalues[genename].values())) == 1:
continue
#Get number of different m values for this gene
n = len(set(mvalues[genename].values()))
posfactors[genename] = {}
for transcript in db.children(gene, featuretype = 'mRNA', level = 1, order_by = 'end'):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
txname = str(transcript.id).replace('transcript:', '')
if gene.strand == '+':
m = mvalues[genename][transcript.end]
elif gene.strand == '-':
m = mvalues[genename][transcript.start]
posfactor = m / float(n - 1)
posfactors[genename][txname] = posfactor
#Output file of the number of posfactors for each gene
with open('numberofposfactors.txt', 'w') as outfh:
outfh.write(('\t').join(['Gene', 'numberofposfactors']) + '\n')
for gene in posfactors:
pfs = []
for tx in posfactors[gene]:
pfs.append(posfactors[gene][tx])
pfs = list(set(pfs))
outfh.write(('\t').join([gene, str(len(pfs))]) + '\n')
return posfactors
#Make a fasta file containing the "terminal fragments" of all transcripts
def makeTFfasta(gff, genomefasta, lasttwoexons):
TFs = {} #{transcriptid: [chrm, strand, [next to last exon1start, next to last exon1stop], [last exon2start, last exon2stop]]}
#Make gff database
print('Indexing gff...')
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print('Done indexing!')
print('Indexing genome sequence...')
if os.path.basename(genomefasta).endswith('.gz'):
seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta, 'rt'), 'fasta'))
else:
seq_dict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
print('Done indexing!')
genes = db.features_of_type('gene')
#Get last two exons
genecounter = 0
for gene in genes:
genecounter +=1
if genecounter % 5000 == 0:
print('Gene {0}...'.format(genecounter))
#Only protein coding genes
passgenefilters = genefilters(gene, db)
if passgenefilters == False:
continue
for transcript in db.children(gene, featuretype = 'mRNA'):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
exons = [] #[[exon1start, exon1stop], ..., [exonnstart, exonnstop]]
txname = str(transcript.id).replace('transcript:', '').split('.')[0]
if gene.strand == '+':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start'):
exons.append([exon.start, exon.end])
elif gene.strand == '-':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start', reverse = True):
exons.append([exon.start, exon.end])
#For last two exons
if lasttwoexons:
TFs[txname] = [transcript.chrom, transcript.strand, exons[-2], exons[-1]]
#For all exons
elif not lasttwoexons:
TFs[txname] = [transcript.chrom, transcript.strand]
for exon in exons:
TFs[txname].append(exon)
#Get sequences of TFs
if lasttwoexons:
with open('TFseqs.fasta', 'w') as outfh:
for TF in TFs:
chrm, strand, exon1, exon2 = TFs[TF]
if strand == '+':
exon1seq = seq_dict[chrm].seq[exon1[0] - 1:exon1[1]].upper()
exon2seq = seq_dict[chrm].seq[exon2[0] - 1:exon2[1]].upper()
elif strand == '-':
exon1seq = seq_dict[chrm].seq[exon1[0] - 1:exon1[1]].reverse_complement().upper()
exon2seq = seq_dict[chrm].seq[exon2[0] - 1:exon2[1]].reverse_complement().upper()
TFseq = exon1seq + exon2seq
outfh.write('>' + TF + '\n' + str(TFseq) + '\n')
#Get sequences of all exons
elif not lasttwoexons:
with open('wholetranscriptseqs.fasta', 'w') as outfh:
for TF in TFs:
chrm, strand = TFs[TF][0], TFs[TF][1]
exons = TFs[TF][2:]
seq = ''
for exon in exons:
if strand == '+':
exonseq = seq_dict[chrm].seq[exon[0] - 1:exon[1]].upper()
elif strand == '-':
exonseq = seq_dict[chrm].seq[exon[0] - 1:exon[1]].reverse_complement().upper()
seq += exonseq
outfh.write('>' + TF + '\n' + str(seq) + '\n')
def runSalmon(threads, reads1, reads2, samplename):
#paired end
if reads2:
print('Running salmon for {0}...'.format(samplename))
command = ['salmon', 'quant', '--libType', 'A', '-p', threads, '--seqBias', '--gcBias', '-1', reads1, '-2', reads2, '-o', samplename, '--index', 'txfasta.idx', '--validateMappings']
#Single end
elif not reads2:
fldMean = '250' #fragment length distribution mean
fldSD = '20' #fragment length distribution standard deviation
command = ['salmon', 'quant', '--libType', 'A', '-p', threads, '--fldMean', fldMean, '--fldSD', fldSD, '--seqBias', '-r', reads1, '-o', samplename, '--index', 'txfasta.idx', '--validateMappings']
subprocess.call(command)
def calculatepsi(positionfactors, salmondir):
txtpms = {} #{transcriptid : tpm}
genetpms = {} #{geneid : [transcript tpms]}
posfactorgenetpms = {} #{geneid : [transcripttpms scaled by posfactor]}
psis = {} #{geneid : psi}
#Read in salmonout file
#salmondir is the salmon output directory
print('Calculating psi values for {0}...'.format(salmondir))
salmonoutfile = os.path.join(os.path.abspath(salmondir), 'quant.sf')
with open(salmonoutfile, 'r') as infh:
for line in infh:
line = line.strip().split('\t')
#Skip header
if line[0] == 'Name':
continue
transcriptid = str(line[0]).split('.')[0]
tpm = float(line[3])
txtpms[transcriptid] = tpm
#Put the transcript tpms for every gene together
for gene in positionfactors:
genetpms[gene] = []
posfactorgenetpms[gene] = []
for transcript in positionfactors[gene]:
txtpm = txtpms[transcript.split('.')[0]]
genetpms[gene].append(txtpm)
posfactor = positionfactors[gene][transcript]
posfactorgenetpms[gene].append(txtpm * posfactor)
#Calculate psi for every gene (can apply filters for min gene tpm here)
for gene in posfactorgenetpms:
scaledtpm = sum(posfactorgenetpms[gene])
totaltpm = sum(genetpms[gene])
if totaltpm >= 5:
psi = scaledtpm / float(totaltpm)
psis[gene] = float(format(psi, '.3f'))
else:
#psis[gene] = 'NA'
psis[gene] = np.nan
return psis
def getdpsis(psifile, samp_conds_file):
#Given a table of psis, calculate dpsis and LME-based p values
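#The sampconds file is expected to look like the (hypothetical) example below: two tab-delimited columns, no header,
#with entries matching the sample column names in the psi table.
#cond1_rep1	cond2_rep1
#cond1_rep2	cond2_rep2
#cond1_rep3	cond2_rep3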
deltapsidict = OrderedDict() #{genename : pvalue} ordered so it's easy to match it up with df
pvaluedict = OrderedDict() #{genename : pvalue} ordered so it's easy to match it up with q values
psidf = pd.read_csv(psifile, sep = '\t', header = 0, index_col = False)
sampconddf = pd.read_csv(samp_conds_file, sep = '\t', index_col = False, header = None)
cond1samps = sampconddf[0].dropna().tolist()
cond2samps = sampconddf[1].dropna().tolist()
print('Condition 1 samples: ' + (', ').join(cond1samps))
print('Condition 2 samples: ' + (', ').join(cond2samps))
#Store relationships of conditions and the samples in that condition
#It's important that this dictionary be ordered because we are going to be iterating through it
samp_conds = OrderedDict({'cond1' : cond1samps, 'cond2' : cond2samps})
#Get a list of all samples
samps = []
for cond in samp_conds:
samps += samp_conds[cond]
#Iterate through rows, making a dictionary from every row, turning it into a dataframe, then calculating p value
genecounter = 0
for index, row in psidf.iterrows():
genecounter +=1
if genecounter % 1000 == 0:
print('Calculating pvalue for gene {0}...'.format(genecounter))
d = {}
d['Gene'] = [row['Gene']] * len(samps)
d['variable'] = samps
values = [] #psi values
for cond in samp_conds:
for sample in samp_conds[cond]:
value = row[sample]
values.append(value)
d['value'] = values
conds = []
for cond in samp_conds:
conds += [cond] * len(samp_conds[cond])
cond1s = []
cond2s = []
for cond in conds:
if cond == 'cond1':
cond1s.append(1)
cond2s.append(0)
elif cond == 'cond2':
cond1s.append(0)
cond2s.append(1)
d['cond1'] = cond1s #e.g. [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
d['cond2'] = cond2s #e.g. [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
d['samples'] = [x + 1 for x in range(len(samps))]
#Turn this dictionary into a DataFrame
rowdf = pd.DataFrame.from_dict(d)
#delta psi is difference between mean psi of two conditions (cond2 - cond1)
cond1meanpsi = float(format(np.mean(rowdf.query('cond1 == 1').value.dropna()), '.3f'))
cond2meanpsi = float(format(np.mean(rowdf.query('cond2 == 1').value.dropna()), '.3f'))
deltapsi = cond2meanpsi - cond1meanpsi
deltapsi = float(format(deltapsi, '.3f'))
deltapsidict[row['Gene']] = deltapsi
#Get LME pvalue
#Lots of warnings about convergence, etc. Suppress them.
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
try:
#actual model
md = smf.mixedlm('value ~ cond1', data = rowdf, groups = 'samples', missing = 'drop') #group by sample so this model is nested with the null model below, as required for the likelihood ratio test
mdf = md.fit(reml = False) #REML needs to be false in order to use log-likelihood for pvalue calculation
#null model
nullmd = smf.mixedlm('value ~ 1', data = rowdf, groups = 'samples', missing = 'drop')
nullmdf = nullmd.fit(reml = False)
#Likelihood ratio
LR = 2 * (mdf.llf - nullmdf.llf)
p = chi2.sf(LR, df = 1)
if math.isnan(p):
p = 1
#These exceptions are needed to catch cases where either all psi values are nan (valueerror) or all psi values for one condition are nan (linalgerror)
except (ValueError, np.linalg.LinAlgError):
p = 1
pvaluedict[row['Gene']] = float('{:.2e}'.format(p))
#Correct pvalues using BH method
pvalues = list(pvaluedict.values())
fdrs = list(multipletests(pvalues, method = 'fdr_bh')[1])
fdrs = [float('{:.2e}'.format(fdr)) for fdr in fdrs]
#Add deltapsis, pvalues, and FDRs to df
deltapsis = list(deltapsidict.values())
psidf = psidf.assign(deltapsi = deltapsis)
psidf = psidf.assign(pval = pvalues)
psidf = psidf.assign(FDR = fdrs)
#Write df
fn = os.path.abspath(psifile) + '.pval'
psidf.to_csv(fn, sep = '\t', index = False)
#We want to know if the alternative 3' ends for this gene are all ALE (alternative last exon) forms or all tandem UTR forms.
#If it's a mixture, we can't really deal with that cleanly, so we are going to ignore it.
def getexoniccoords(posfactors, gff):
#Take posfactors dictionary
#Take the transcripts for each gene, get their exonic coords.
#if txend of a transcript is exonic for every transcript that has a higher PF, then this gene is all TUTR
#if all txends are not exonic in any other transcript, then this gene is all ALE
#if neither of these are true, then the gene is somehow mixed
#posfactors = {ENSMUSG : {ENSMUST : positionfactor}}
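#A hypothetical illustration of the distinction: if the 3' end of the short (proximal) isoform lies within an exon of the
#long (distal) isoform, e.g. within an extended 3' UTR, the gene looks like a tandem UTR (TUTR) gene. If instead the short
#isoform ends in a terminal exon that the long isoform does not use at all, the gene looks like an ALE gene.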
#Make gff database
print('Indexing gff...')
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print('Done indexing!')
exoniccoords = {} #{geneid : {txid : [positionfactor, [set of exonic coords]]}}
for gene in posfactors:
txs = posfactors[gene].keys()
geneexons = {} #{txid : [positionfactor, [set of exonic coords]]}
for tx in txs:
txexons = []
pf = posfactors[gene][tx]
tx = db['transcript:' + tx]
if tx.strand == '+':
for exon in db.children(tx, featuretype = 'exon', order_by = 'start'):
txexons += list(range(exon.start, exon.end + 1))
elif tx.strand == '-':
for exon in db.children(tx, featuretype = 'exon', order_by = 'start', reverse = True):
txexons += list(range(exon.start, exon.end + 1))
geneexons[tx.id] = [pf, set(txexons)]
exoniccoords[gene] = geneexons
return exoniccoords
#Could this gene contain only ALEs for 3' ends?
def isitALE(geneexoniccoords, db):
couldbeALE = True
for txid in geneexoniccoords:
tx = db[txid]
if tx.strand == '+':
txend = int(tx.end)
elif tx.strand == '-':
txend = int(tx.start)
#See if this txend is exonic for any other transcript with a different positionfactor
pf = geneexoniccoords[txid][0]
for othertx in geneexoniccoords:
othertxpf = geneexoniccoords[othertx][0]
#If it has the same PF, move on
if pf == othertxpf:
continue
othertxexoncoords = geneexoniccoords[othertx][1]
if txend in othertxexoncoords:
couldbeALE = False
return couldbeALE
#Could this gene contain only TUTRs for 3' ends?
def isitTUTR(geneexoniccoords, db):
couldbeTUTR = True
for txid in geneexoniccoords:
tx = db[txid]
if tx.strand == '+':
txend = int(tx.end)
elif tx.strand == '-':
txend = int(tx.start)
#See if this txend is exonic for every transcript that has a higher PF
pf = geneexoniccoords[txid][0]
for othertx in geneexoniccoords:
othertxpf = geneexoniccoords[othertx][0]
#If the transcript of interest has a greater or equal PF to the transcript we are comparing to, move on
if pf >= othertxpf:
continue
othertxexoncoords = geneexoniccoords[othertx][1]
if txend not in othertxexoncoords:
couldbeTUTR = False
return couldbeTUTR
def classifygenes(exoniccoords, gff):
#exoniccoords = {} #{geneid : {txid : [positionfactor, [set of exonic coords]]}}
genetypes = {} #{geneid : type of 3' UTR end}
print('Indexing gff...')
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print('Done indexing!')
for gene in exoniccoords:
geneexoniccoords = exoniccoords[gene]
coulditbeALE = isitALE(geneexoniccoords, db)
coulditbeTUTR = isitTUTR(geneexoniccoords, db)
if coulditbeALE == True and coulditbeTUTR == False:
genetype = 'ALE'
elif coulditbeALE == False and coulditbeTUTR == True:
genetype = 'TUTR'
elif coulditbeALE == False and coulditbeTUTR == False:
genetype = 'mixed'
elif coulditbeALE == True and coulditbeTUTR == True:
genetype = 'ERROR'
genetypes[gene] = genetype
return genetypes
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--mode', type = str, choices = ['makeTFfasta', 'runSalmon', 'calculatepsi', 'LME'])
parser.add_argument('--gff', type = str, help = 'GFF of transcript annotation. Needed for makeTFfasta and calculatepsi.')
parser.add_argument('--genomefasta', type = str, help = 'Genome sequence in fasta format. Needed for makeTFfasta.')
parser.add_argument('--lasttwoexons', action = 'store_true', help = 'Used for makeTFfasta. Do you want to only use the last two exons?')
parser.add_argument('--txfasta', type = str, help = 'Fasta file of sequences to quantitate with salmon. Often the output of makeTFfasta mode.')
parser.add_argument('--reads1', type = str, help = 'Comma separated list of forward read fastq files. Needed for runSalmon.')
parser.add_argument('--reads2', type = str, help = 'Comma separated list of reverse read fastq files. Needed for runSalmon.')
parser.add_argument('--samplename', type = str, help = 'Comma separated list of samplenames. Needed for runSalmon.')
parser.add_argument('--threads', type = str, help = 'Number of threads to use. Needed for runSalmon.')
parser.add_argument('--salmondir', type = str, help = 'Salmon output directory. Needed for calculatepsi.')
parser.add_argument('--psifile', type = str, help = 'Psi value table. Needed for LME.')
parser.add_argument('--sampconds', type = str, help = 'File containing sample names split by condition. Two column, tab delimited text file. Condition 1 samples in first column, condition2 samples in second column.')
args = parser.parse_args()
if args.mode == 'makeTFfasta':
if not args.gff or not args.genomefasta:
print('You have not supplied all the required arguments! See the --help for more info.')
sys.exit()
makeTFfasta(args.gff, args.genomefasta, args.lasttwoexons)
elif args.mode == 'runSalmon':
if not args.txfasta or not args.reads1 or not args.samplename or not args.threads:
print('You have not supplied all the required arguments! See the --help for more info.')
sys.exit()
forreads = args.reads1.split(',')
if args.reads2:
revreads = args.reads2.split(',')
samplenames = args.samplename.split(',')
command = ['salmon', 'index', '-t', args.txfasta, '-i', 'txfasta.idx', '--type', 'quasi', '-k', '31', '--keepDuplicates']
print('Indexing transcripts...')
subprocess.call(command)
print('Done indexing!')
if args.reads2:
if len(forreads) != len(revreads) or len(forreads) != len(samplenames) or len(revreads) != len(samplenames):
print('ERROR: The number of forward read files, reverse read files, and sample names must match!')
sys.exit()
elif not args.reads2:
if len(forreads) != len(samplenames):
print('ERROR: The number of forward read files and sample names must match!')
sys.exit()
if args.reads2:
for i in range(len(forreads)):
freads = forreads[i]
rreads = revreads[i]
samplename = samplenames[i]
runSalmon(args.threads, freads, rreads, samplename)
elif not args.reads2:
for i in range(len(forreads)):
freads = forreads[i]
samplename = samplenames[i]
runSalmon(args.threads, freads, None, samplename)
elif args.mode == 'calculatepsi':
if not args.gff or not args.salmondir:
print('You have not supplied all the required arguments! See the --help for more info.')
sys.exit()
print('Calculating position factors for every transcript...')
positionfactors = getpositionfactors(args.gff, 25)
print('Done with position factors!')
salmondirs = [os.path.join(os.path.abspath(args.salmondir), d) for d in os.listdir(args.salmondir) if os.path.isdir(os.path.join(os.path.abspath(args.salmondir), d)) and d != 'txfasta.idx']
psidfs = []
for sd in salmondirs:
samplename = os.path.basename(sd)
#For ENCODE
#samplename = os.path.basename(sd).split('_')
#samplename = ('_rep').join([samplename[0], samplename[1]])
psis = calculatepsi(positionfactors, sd)
psidf = pd.DataFrame.from_dict(psis, orient = 'index')
psidf.reset_index(level = 0, inplace = True)
psidf.columns = ['Gene', samplename]
psidfs.append(psidf)
bigpsidf = reduce(lambda x, y: pd.merge(x, y, on = 'Gene'), psidfs)
#Add in genetypes (ALE or TUTR or mixed)
exoniccoords = getexoniccoords(positionfactors, args.gff)
genetypes = classifygenes(exoniccoords, args.gff)
genetypedf = pd.DataFrame.from_dict(genetypes, orient = 'index')
genetypedf.reset_index(level = 0, inplace = True)
genetypedf.columns = ['Gene', 'genetype']
finalpsidf = reduce(lambda x, y: pd.merge(x, y, on = 'Gene'), [bigpsidf, genetypedf])
finalpsidf.to_csv('LABRATpsis.3end.python3.txt', sep = '\t', index = False)
elif args.mode == 'LME':
getdpsis(args.psifile, args.sampconds)
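#A sketch of typical invocations for each mode (file names below are placeholders):
#python LABRAT_rn6annotation.py --mode makeTFfasta --gff annotation.gff --genomefasta genome.fa --lasttwoexons
#python LABRAT_rn6annotation.py --mode runSalmon --txfasta TFseqs.fasta --reads1 s1_1.fq.gz,s2_1.fq.gz --reads2 s1_2.fq.gz,s2_2.fq.gz --samplename s1,s2 --threads 8
#python LABRAT_rn6annotation.py --mode calculatepsi --gff annotation.gff --salmondir salmonouts
#python LABRAT_rn6annotation.py --mode LME --psifile LABRATpsis.3end.python3.txt --sampconds sampconds.txt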