-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathLABRATsc_dm6.py
847 lines (704 loc) · 30.1 KB
/
LABRATsc_dm6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
#Application of LABRAT to alevin-quantified scRNAseq data
from struct import Struct
import numpy as np
import pandas as pd
import gzip
import sys
import os
from scipy.io import mmread
from scipy.sparse import csr_matrix
from scipy.stats import ranksums
from statsmodels.stats.multitest import multipletests
import gffutils
import pickle
import argparse
def read_quants_bin(base_dir, clipped=False, mtype="data"):
'''
Read the quants Sparse Binary output of Alevin and generates a dataframe
Parameters
----------
base_location: string
Path to the folder containing the output of the alevin run
clipped: bool (default False)
Clip off all zero rows and columns
mtype: "[data(default), tier, var, mean]"
Alevin's matrix type to load into memory
This function is heavily based on a function in https://github.com/k3yavi/vpolo/blob/master/vpolo/alevin/parser.py
'''
if not os.path.isdir(base_dir):
print("{} is not a directory".format( base_dir ))
sys.exit(1)
base_location = os.path.join(base_dir, "alevin")
#print(base_location)
if not os.path.exists(base_location):
print("{} directory doesn't exist".format( base_location ))
sys.exit(1)
data_type = "f"
if mtype == "data":
quant_file = os.path.join(base_location, "quants_mat.gz")
elif mtype == "tier":
data_type = "B"
quant_file = os.path.join(base_location, "quants_tier_mat.gz")
elif mtype == "mean":
quant_file = os.path.join(base_location, "quants_mean_mat.gz")
elif mtype == "var":
quant_file = os.path.join(base_location, "quants_var_mat.gz")
else:
print("wrong mtype: {0}".format( mtype ))
sys.exit(1)
if not os.path.exists(quant_file):
print("quant file {} doesn't exist".format( quant_file ))
sys.exit(1)
if mtype in ["mean", "var"]:
cb_file = os.path.join(base_location, "quants_boot_rows.txt")
else:
cb_file = os.path.join(base_location, "quants_mat_rows.txt")
if not os.path.exists(cb_file):
print("quant file's index: {} doesn't exist".format( cb_file ))
sys.exit(1)
gene_file = os.path.join(base_location, "quants_mat_cols.txt")
if not os.path.exists(gene_file):
print("quant file's header: {} doesn't exist".format(gene_file))
sys.exit(1)
cb_names = pd.read_csv(cb_file, header=None)[0].values
gene_names = pd.read_csv(gene_file, header=None)[0].values
num_genes = len(gene_names)
num_cbs = len(cb_names)
num_entries = int(np.ceil(num_genes/8))
with gzip.open( quant_file ) as f:
line_count = 0
tot_umi_count = 0
umi_matrix = []
header_struct = Struct( "B" * num_entries)
while True:
line_count += 1
if line_count%100 == 0:
print ("\r Done reading " + str(line_count) + " cells", end= "")
sys.stdout.flush()
try:
num_exp_genes = 0
exp_counts = header_struct.unpack_from( f.read(header_struct.size) )
for exp_count in exp_counts:
num_exp_genes += bin(exp_count).count("1")
data_struct = Struct( data_type * num_exp_genes)
sparse_cell_counts_vec = list(data_struct.unpack_from( f.read(data_struct.size) ))[::-1]
cell_umi_counts = sum(sparse_cell_counts_vec)
except:
print ("\nRead total " + str(line_count-1) + " cells")
print ("Found total " + str(round(tot_umi_count)) + " reads")
break
if cell_umi_counts > 0.0:
tot_umi_count += cell_umi_counts
cell_counts_vec = []
for exp_count in exp_counts:
for bit in format(exp_count, '08b'):
if len(cell_counts_vec) >= num_genes:
break
if bit == '0':
cell_counts_vec.append(0.0)
else:
abund = sparse_cell_counts_vec.pop()
cell_counts_vec.append(abund)
if len(sparse_cell_counts_vec) > 0:
print("Failure in consumption of data")
print("left with {} entry(ies)".format(len(sparse_cell_counts_vec)))
umi_matrix.append( cell_counts_vec )
else:
print("Found a CB with no read count, something is wrong")
sys.exit(1)
alv = pd.DataFrame(umi_matrix)
alv.columns = gene_names
alv.index = cb_names
if clipped:
alv = alv.loc[:, (alv != 0).any(axis=0)]
barcodes = list(alv.index.values)
barcodes = [os.path.basename(os.path.abspath(base_dir)) + '_' + b for b in barcodes]
alv = alv.set_index([barcodes])
#for testing, can use alv.head() here to only return first n cells
return alv
def createbigdf(alevindir):
#Given a directory (alevindir) that contains alevin output subdirectories
#concatentate all alevin outputs into one big df
#directory structure:
# alevindir
# |
# |---sample1
# | |
# | |----alevin
# | |----quants_mat.gz, etc.
# |---sample2
# |---sample3
# etc.
#index of big df will be sampleid + '_' + cell barcode
# e.g. sample1_CAGTCTCGTTGCTCAG
alevindir = os.path.abspath(alevindir)
dfs = [] #list of sample dfs
samples = os.listdir(alevindir)
for samp in samples:
samppath = os.path.join(alevindir, samp)
if not os.path.isdir(samppath):
continue
print('Reading data in {0}...'.format(samppath))
p = os.path.join(alevindir, samp)
df = read_quants_bin(p, False, 'data')
dfs.append(df)
bigdf = pd.concat(dfs)
bigdf = bigdf.reset_index()
bigdf = bigdf.rename(columns = {'index': 'cellid'})
return bigdf
def addclusters(bigdf, clusters):
#Add labels of cluster ID to df of counts
clusterdf = pd.read_csv(clusters, sep = '\t', header = 0, index_col = None)
clusterdf = clusterdf.rename(columns = {'sample' : 'cellid'})
newbigdf = pd.merge(clusterdf, bigdf, how = 'inner', on = 'cellid')
return newbigdf
def collapseclusters(bigdf):
#Sum transcript counts across every cell in a cluster
conditions = list(set(bigdf['condition'].tolist()))
allcountsums = [] #list of countsums dictionaries, in order of conditions
for condition in conditions:
countsums = {} #{transcriptid : countsum}
countsums['condition'] = condition
df = bigdf.loc[bigdf['condition'] == condition]
colnames = list(df)
#select only transcriptID columns
colnames = [c for c in colnames if c != 'cellid' and c != 'condition']
for c in colnames:
counts = df[c].tolist()
sumcounts = sum(counts)
countsums[c] = [sumcounts]
allcountsums.append(countsums)
#Turn the countsum dictionaries into dfs
dfs = []
for countsum in allcountsums:
df = pd.DataFrame.from_dict(countsum)
dfs.append(df)
#Concatenate dfs
countsumdf = pd.concat(dfs)
return countsumdf
def calculatepsi_fromcollapsedclusters(countsumdf, posfactors, readcountfilter):
#Calculate psi values for every gene in posfactors
#This function takes a df that has one row per cluster/condition
#that was produced by collapseclusters().
conditions = countsumdf['condition'].tolist()
allpsidicts = [] #list of all psidicts
for condition in conditions:
df = countsumdf.loc[countsumdf['condition'] == condition]
psidict = {} #{gene : psi}
psidict['condition'] = condition
for gene in posfactors:
geneid = gene.split('.')[0]
rawcounts = []
scaledcounts = []
for tx in posfactors[gene]:
txid = tx.split('.')[0]
m = posfactors[gene][tx]
counts = df[txid].tolist()[0]
scaled_counts = counts * m
rawcounts.append(counts)
scaledcounts.append(scaled_counts)
if sum(rawcounts) < readcountfilter: #add read count filter here. this is the summed number of reads across all cells in the cluster
psi = 'NA'
else:
psi = sum(scaledcounts) / sum(rawcounts)
psidict[geneid] = [psi]
allpsidicts.append(psidict)
#Turn the psidicts into dfs
dfs = []
for psidict in allpsidicts:
df = pd.DataFrame.from_dict(psidict)
dfs.append(df)
#Concatenate dfs
psidf = pd.concat(dfs)
return psidf
def calculatedpsi_fromcollapsedclusters(psidf, conditionA, conditionB):
#Given a dataframe of psi values (from calculatepsi_fromcollapsedclusters()),
#calculate a dpsi value for each gene
#dpsi is defined as conditionB - conditionA
dpsis = {} #{gene : [condApsi, condBpsi, dpsi]}
colnames = list(psidf.columns.values)
colnames = [c for c in colnames if c != 'condition']
for gene in colnames:
condApsi = psidf.loc[psidf['condition'] == conditionA].at[0, gene]
condBpsi = psidf.loc[psidf['condition'] == conditionB].at[0, gene]
if condApsi == 'NA' or condBpsi == 'NA':
dpsi = 'NA'
else:
condApsi = round(condApsi, 3)
condBpsi = round(condBpsi, 3)
dpsi = round(condBpsi - condApsi, 3)
dpsis[gene] = [condApsi, condBpsi, dpsi]
#Turn into df
df = pd.DataFrame.from_dict(dpsis, orient = 'index')
df = df.reset_index()
df.columns = ['gene', '{0}_psi'.format(conditionA), '{0}_psi'.format(conditionB), 'deltapsi']
return df
def dotest_bootstrapclusters(bigdf, posfactors, fractosample, numberofsamples, conditionA, conditionB, readcountfilter):
#Given a dataframe of counts per cell (usually bigdf, produced by addclusters())
#for each condition (cluster), randomly select x% of the cells. Then collapse the clusters
#and calculate psivalues using collapseclusters() and calculatepsi_fromcollapsedclusters()
#Record psi for each gene in each cluster.
#Repeat n times
#Compare psi values across clusters to derive p value
condAdf = bigdf.loc[bigdf['condition'] == conditionA]
condBdf = bigdf.loc[bigdf['condition'] == conditionB]
condApsis = {} #{gene : [psi values from samples]}
condBpsis = {} #{gene : [psi values from samples]}
for i in range(numberofsamples):
print('Sample ' + str(i + 1))
#sample rows (cells)
condAsamp = condAdf.sample(frac = fractosample, replace = False, axis = 'index')
condBsamp = condBdf.sample(frac = fractosample, replace = False, axis = 'index')
#Put the condA and condB back together again
df = pd.concat([condAsamp, condBsamp], axis = 0, ignore_index = True)
collapseddf = collapseclusters(df)
psidf = calculatepsi_fromcollapsedclusters(collapseddf, posfactors, readcountfilter)
#Record psi values for each gene
colnames = list(psidf.columns.values)
genes = [c for c in colnames if c != 'condition']
for gene in genes:
condApsi = psidf.loc[psidf['condition'] == conditionA].at[0, gene]
condBpsi = psidf.loc[psidf['condition'] == conditionB].at[0, gene]
if gene not in condApsis:
condApsis[gene] = [condApsi]
else:
condApsis[gene].append(condApsi)
if gene not in condBpsis:
condBpsis[gene] = [condBpsi]
else:
condBpsis[gene].append(condBpsi)
#Test if sampled psis are different across conditions
#wilcoxon rank sum test
pvals = {} #{gene : pval}
for gene in genes:
Apsis = condApsis[gene]
Bpsis = condBpsis[gene]
Apsis = [psi for psi in Apsis if psi != 'NA']
Bpsis = [psi for psi in Bpsis if psi != 'NA']
#if either is just NAs, p = NA
if not Apsis or not Bpsis:
p = 'NA'
else:
p = ranksums(Apsis, Bpsis)[1]
p = round(float('{:.2e}'.format(p)), 3)
pvals[gene] = p
#Correct pvalues using BH method, but only using pvalues that are not NA
genes = list(pvals.keys())
pvalues = list(pvals.values())
pvaluesformultitest = [pval for pval in pvalues if str(pval) != 'NA']
fdrs = list(multipletests(pvaluesformultitest, method = 'fdr_bh')[1])
fdrs = [float('{:.2e}'.format(fdr)) for fdr in fdrs]
#Now we need to incorporate the places where p = NA into the list of FDRs (also as NA)
fdrswithnas = []
fdrindex = 0
for pvalue in pvalues:
if str(pvalue) != 'NA':
fdrswithnas.append(fdrs[fdrindex])
fdrindex +=1
elif str(pvalue) == 'NA':
fdrswithnas.append('NA')
fdrswithnas = [[fdr] for fdr in fdrswithnas] #fdr must be in list for conversion to df
correctedpvals = dict(zip(genes, fdrswithnas))
#make df of pval and FDR
pvalues = [[p] for p in pvalues] #p must be in list for conversion to df
pvals = dict(zip(genes, pvalues))
rawpdf = pd.DataFrame.from_dict(pvals, orient = 'index')
rawpdf.columns = ['raw_pval']
fdrdf = pd.DataFrame.from_dict(correctedpvals, orient = 'index')
fdrdf.columns = ['fdr']
#Merge rawpdf and fdrdf
df = rawpdf.merge(fdrdf, how = 'inner', left_index = True, right_index = True)
df = df.reset_index()
df.columns = ['gene', 'raw_pval', 'FDR']
return df
def calculatepsi_cellbycell(bigdf, posfactors, readcountfilter):
#Calculate psi values for every gene in every cell
#In practice, this is not going to be super useful because many
#gene level counts are going to be 0, giving a psi of NA
#This function takes a df of transcript counts per cell (produced by createbigdf() and addclusters())
#Make a small df of just cellid and condition
conditiondf = bigdf[['cellid', 'condition']]
allpsidicts = {} #{cellid : {gene : psi}}
cellcount = 0
for idx, row in bigdf.iterrows():
cellcount +=1
if cellcount % 10 == 0:
print('Calculating psi for cell {0}'.format(cellcount))
cellid = row['cellid']
allpsidicts[cellid] = {}
for gene in posfactors:
geneid = gene.split('.')[0]
rawcounts = []
scaledcounts = []
for tx in posfactors[gene]:
txid = tx.split('.')[0]
m = posfactors[gene][tx]
counts = row[txid]
scaled_counts = counts * m
rawcounts.append(counts)
scaledcounts.append(scaled_counts)
if sum(rawcounts) < readcountfilter: #add read count filter here
psi = 'NA'
else:
psi = sum(scaledcounts) / sum(rawcounts)
psi = round(psi, 3)
allpsidicts[cellid][geneid] = psi
#Turn nested dict into df
df = pd.concat({k: pd.DataFrame.from_dict(v, 'index') for k, v in allpsidicts.items()}, axis = 1)
df = df.transpose()
#drop first index and get second (cellid) as column name
df.reset_index(level = 1, inplace = True, drop = True)
df.index.name = 'cellid'
df.reset_index(level = 0, inplace = True)
#Merge condition column back in
df = pd.merge(conditiondf, df, how = 'inner', on = 'cellid')
return df
def calculatedpsi_cellbycell(psidf, conditionA, conditionB):
#Given a dataframe a cell-level psi values (from calculatepsi_cellbycell())
#calculate deltapsi (conditionB - conditionA)
#This is defined as the difference in median psi values between conditions
#Also report the number of cells in each condition that had psi values and went
#into the calculation.
dpsis = {} #{gene : [dpsi, number of cells with psi for condA, number of cells with psi for condB]}
colnames = list(psidf.columns.values)
genes = [c for c in colnames if c != 'cellid' and c != 'condition']
condAdf = psidf.loc[psidf['condition'] == conditionA]
condBdf = psidf.loc[psidf['condition'] == conditionB]
for gene in genes:
condApsis = condAdf[gene].tolist()
condBpsis = condBdf[gene].tolist()
condApsis = [psi for psi in condApsis if psi != 'NA']
condBpsis = [psi for psi in condBpsis if psi != 'NA']
condAcells = len(condApsis)
condBcells = len(condBpsis)
#if either all condA psis are NA or all condB psis are NA, delta psi = NA
if condBpsis and not condApsis:
condAmed = 'NA'
condBmed = round(np.median(condBpsis), 3)
dpsi = 'NA'
elif condApsis and not condBpsis:
condAmed = round(np.median(condApsis), 3)
condBmed = 'NA'
dpsi = 'NA'
elif not condApsis and not condBpsis:
condAmed = 'NA'
condBmed = 'NA'
dpsi = 'NA'
elif condApsis and condBpsis:
condAmed = round(np.median(condApsis), 3)
condBmed = round(np.median(condBpsis), 3)
dpsi = round(condBmed - condAmed, 3)
dpsis[gene] = [condAmed, condBmed, dpsi, condAcells, condBcells]
#Turn into df
df = pd.DataFrame.from_dict(dpsis, orient = 'index')
df = df.reset_index()
df.columns = ['gene', '{0}_psi'.format(conditionA), '{0}_psi'.format(conditionB), 'deltapsi', '{0}_cells'.format(conditionA), '{0}_cells'.format(conditionB)]
return df
def dotest_cellbycell(psidf):
#Given a matrix of psi values for every gene in every cell (psidf)
#in which the cells have been split into 2 conditions (column condition)
#perform a statistical test to ask whether the psi values for a gene are
#different across conditons. This is a bit tricky because most psi values
#will be NA, due to the fact that most genes have 0 counts in any given cell
pvals = {} #{gene : pval}
conditions = psidf['condition'].tolist()
conditions = list(set(conditions))
conditionA = conditions[0]
conditionB = conditions[1]
condAdf = psidf.loc[psidf['condition'] == conditionA]
condBdf = psidf.loc[psidf['condition'] == conditionB]
colnames = list(psidf.columns.values)
genes = [c for c in colnames if c != 'cellid' and c != 'condition']
for gene in genes:
condApsis = condAdf[gene].tolist()
condApsis = [p for p in condApsis if p != 'NA']
condBpsis = condBdf[gene].tolist()
condBpsis = [p for p in condBpsis if p != 'NA']
#wilcoxon rank-sum test
#If either condApsis or condBpsis is empty, p = NA
if not condApsis or not condBpsis:
p = 'NA'
else:
p = ranksums(condApsis, condBpsis)[1]
p = round(float('{:.2e}'.format(p)), 3)
pvals[gene] = p
#Correct pvalues using BH method, but only using pvalues that are not NA
genes = list(pvals.keys())
pvalues = list(pvals.values())
pvaluesformultitest = [pval for pval in pvalues if str(pval) != 'NA']
fdrs = list(multipletests(pvaluesformultitest, method = 'fdr_bh')[1])
fdrs = [float('{:.2e}'.format(fdr)) for fdr in fdrs]
#Now we need to incorporate the places where p = NA into the list of FDRs (also as NA)
fdrswithnas = []
fdrindex = 0
for pvalue in pvalues:
if str(pvalue) != 'NA':
fdrswithnas.append(fdrs[fdrindex])
fdrindex +=1
elif str(pvalue) == 'NA':
fdrswithnas.append(np.nan)
fdrswithnas = [[fdr] for fdr in fdrswithnas] #fdr must be in list for conversion to df
correctedpvals = dict(zip(genes, fdrswithnas))
#Make df
pvalues = [[p] for p in pvalues] #p must be in list for conversion to df
pvals = dict(zip(genes, pvalues))
rawpdf = pd.DataFrame.from_dict(pvals, orient = 'index')
rawpdf.columns = ['raw_pval']
fdrdf = pd.DataFrame.from_dict(correctedpvals, orient = 'index')
fdrdf.columns = ['fdr']
#Merge rawpdf and fdrdf
df = rawpdf.merge(fdrdf, how = 'inner', left_index = True, right_index = True)
df = df.reset_index()
df.columns = ['gene', 'raw_pval', 'FDR']
return df
#For every gene, the goal is to define what the overall usage of proximal/distal polyA sites is. This is done by defining
#a "psi" value for every gene. For each transcript in a gene, the "terminal fragment" (TF) is the last two
#exons of the transcript concatenated together. For each transcript, a position factor (PF) is calculated as (m / n + 1; will range
#between 0 and 1).
#The tpm for its TF is multiplied by its PF. This is then done for all TFs and their scaled values are summed and divided
#by the unscaled sum of tpms for all TFs to give psi, which will also range between 0 (exculsive usage of the proximal polyA site)
#and 1 (exclusive usage of the distal polyA site).
#Does a gene pass filters?
def genefilters(gene, db):
proteincoding = False
if 'protein_coding' in gene.attributes['biotype']:
proteincoding = True
if proteincoding == True:
return True
else:
return False
#Does a transcript pass filters?
def transcriptfilters(transcript, db):
exonnumberpass = False
TFlengthpass = False
proteincoding = False
mrnaendpass = False
#How many exons does it have
if len(list(db.children(transcript, featuretype = 'exon'))) >= 2:
exonnumberpass = True
else:
return False
#What is the length of the terminal fragment
exons = []
if transcript.strand == '+':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start'):
exons.append([exon.start, exon.end + 1])
elif transcript.strand == '-':
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start', reverse = True):
exons.append([exon.start, exon.end + 1])
penultimateexonlen = len(range(exons[-2][0], exons[-2][1]))
lastexonlen = len(range(exons[-1][0], exons[-1][1]))
TFlength = penultimateexonlen + lastexonlen
if TFlength > 200:
TFlengthpass = True
#Is this transcript protein coding
if 'protein_coding' in transcript.attributes['biotype']:
proteincoding = True
#Are we confident in the 3' end of this mrnaendpass
if 'tag' not in transcript.attributes or 'mRNA_end_NF' not in transcript.attributes['tag']:
mrnaendpass = True
if exonnumberpass and TFlengthpass and proteincoding and mrnaendpass:
return True
else:
return False
#Given an annotation in gff format, get the position factors for all transcripts.
#It merges any two transcript ends that are less than <lengthfilter> away from each other into a single end.
#This is so that you dont end up with unique regions that are like 4 nt long.
#They might causes issues when it comes to counting kmers or reads that map to a given region.
def getpositionfactors(gff, lengthfilter):
lengthfilter = int(lengthfilter)
genecount = 0
txends = {} #{ENSMUSG : [strand, [list of distinct transcript end coords]]}
posfactors = {} #{ENSMUSG : {ENSMUST : positionfactor}}
#Make gff database
print('Indexing gff...')
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print('Done indexing!')
#Get number of distinct transcript ends for each gene
genes = db.features_of_type('gene')
for gene in genes:
#Only protein coding genes
passgenefilters = genefilters(gene, db)
if passgenefilters == False:
continue
genename = str(gene.id).replace('gene:', '')
ends = []
if gene.strand == '+':
for transcript in db.children(gene, featuretype = 'transcript', level = 1, order_by = 'end'):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
if transcript.end not in ends:
ends.append(transcript.end)
elif gene.strand == '-':
for transcript in db.children(gene, featuretype = 'transcript', level = 1, order_by = 'start', reverse = True):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
if transcript.start not in ends:
ends.append(transcript.start)
if ends: #Sometimes there are no 'transcripts' for a gene, like with pseudogenes, etc.
txends[genename] = [gene.strand, ends]
#Sort transcript end coords
s_txends = {} #{ENSMUSG : [sorted (most upstream to most downstream) tx end coords]}
for gene in txends:
strand = txends[gene][0]
coords = txends[gene][1]
if strand == '+':
sortedcoords = sorted(coords)
elif strand == '-':
sortedcoords = sorted(coords, reverse = True)
s_txends[gene] = sortedcoords
#Get m values (the numerator of the position factor fraction), combining an end that is less than <lengthfilter> nt away from
#the previous utr into the same m value as the previous utr
mvalues = {} #{ENSMUSG : {txendcoord : mvalue}}
for gene in s_txends:
mvalues[gene] = {}
currentendcoord = s_txends[gene][0]
currentmvalue = 0
mvalues[gene][currentendcoord] = 0 #the first one has to have m = 0
for endcoord in s_txends[gene][1:]:
#If this endcoord is too close to the last one
if abs(endcoord - currentendcoord) <= lengthfilter:
#this end gets the current m value
mvalues[gene][endcoord] = currentmvalue
#we stay on this m value for the next round
currentmvalue = currentmvalue
#update currentendcoord
currentendcoord = endcoord
#If this endcoord is sufficiently far away from the last one
elif abs(endcoord - currentendcoord) > lengthfilter:
#this end coord gets the next m value
mvalues[gene][endcoord] = currentmvalue + 1
#we move on to the next m value for the next round
currentmvalue = currentmvalue + 1
#update currentendcoord
currentendcoord = endcoord
#Figure out postion scaling factor for each transcript (position / (number of total positions - 1)) (m / (n - 1))
genes = db.features_of_type('gene')
for gene in genes:
genecount +=1
genename = str(gene.id).replace('gene:', '')
#Only protein coding genes
passgenefilters = genefilters(gene, db)
if passgenefilters == False:
continue
#If this gene isnt in mvalues or there is only one m value for the entire gene, skip it
if genename not in mvalues:
continue
if len(set(mvalues[genename].values())) == 1:
continue
#Get number of different m values for this gene
n = len(set(mvalues[genename].values()))
posfactors[genename] = {}
for transcript in db.children(gene, featuretype = 'transcript', level = 1, order_by = 'end'):
#Skip transcripts that do not pass filters
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
txname = str(transcript.id).replace('transcript:', '')
if gene.strand == '+':
m = mvalues[genename][transcript.end]
elif gene.strand == '-':
m = mvalues[genename][transcript.start]
posfactor = m / float(n - 1)
posfactors[genename][txname] = posfactor
#Output file of the number of posfactors for each gene
with open('numberofposfactors.txt', 'w') as outfh:
outfh.write(('\t').join(['Gene', 'numberofposfactors', 'txids', 'interpolyAdist']) + '\n')
for gene in posfactors: #{ENSMUSG : {ENSMUST : positionfactor}}
pfs = []
for tx in posfactors[gene]:
pfs.append(posfactors[gene][tx])
pfs = list(set(pfs))
#write distance between polyA sites for those genes that only have 2 pfs
if len(pfs) == 2:
#Have to put the stupid 'gene:' back on and the 'transcript:' back on
dbgenename = 'gene:' + gene
g = db[dbgenename]
for tx in posfactors[gene]:
if posfactors[gene][tx] == 0:
txpf1 = 'transcript:' + tx
elif posfactors[gene][tx] == 1:
txpf2 = 'transcript:' + tx
t1 = db[txpf1]
t2 = db[txpf2]
if g.strand == '+':
interpolyAdist = t2.end - t1.end
elif g.strand == '-':
interpolyAdist = t1.start - t2.start
elif len(pfs) != 2:
interpolyAdist = 'NA'
#Get list of txs that belong to each pf
txids = {} #{positionfactor : [list of transcriptIDs]}
for pf in sorted(pfs):
txids[pf] = []
for tx in posfactors[gene]:
if posfactors[gene][tx] == pf:
txids[float(pf)].append(tx)
alltxids = []
for pf in sorted(list(txids.keys())):
alltxids.append((',').join(txids[pf]))
outfh.write(('\t').join([gene, str(len(pfs)), ('_').join(alltxids), str(interpolyAdist)]) + '\n')
return posfactors
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--mode', type = str, choices = ['cellbycell', 'subsampleClusters'], help = 'How to perform tests? Either compare psi values of individual cells or subsample cells from clusters.')
parser.add_argument('--gff', type = str, help = 'GFF of transcript annotation.')
parser.add_argument('--alevindir', type = str, help = 'Directory containing subdirectories of alevin output.')
parser.add_argument('--readcountfilter', type = int, help = 'Minimum read count necessary for calculation of psi values. Genes that do not pass this filter \
will have reported psi values of NA. If mode == \'cellbycell\', then this is the number of reads mapping to a gene in that single cell. If mode == \'subsampleClusters\' \
then this is the summed number of reads across all cells in a predefined cluster.')
parser.add_argument('--conditions', type = str, help = 'Two column, tab-delimited file with column names \'sample\' and \'condition\'. First column contains cell ids and second column contains cell condition or cluster.')
parser.add_argument('--conditionA', type = str, help = 'Must be found in the second column of the conditions file. Delta psi is calculated as conditionB - conditionA.')
parser.add_argument('--conditionB', type = str, help = 'Must be found in the second column of the conditions file. Delta psi is calculated as conditionB - conditionA.')
args = parser.parse_args()
#bigdf is a df (obv) of counts with transcripts as columns and cell ids as rows
print('Collecting count data...')
bigdf = createbigdf(args.alevindir)
print('Adding condition/cluster info...')
bigdf = addclusters(bigdf, args.conditions)
print('Assigning transcripts to polyA sites...')
pklfile = os.path.abspath(args.gff) + '.posfactors.pkl'
#Check to see if there is a pickled file a position factors
if os.path.exists(pklfile):
with open(pklfile, 'rb') as infh:
posfactors = pickle.load(infh)
else:
posfactors = getpositionfactors(args.gff, 25)
with open(pklfile, 'wb') as outfh:
pickle.dump(posfactors, outfh)
if args.mode == 'cellbycell':
print('Calculating psi values for each cell...')
psidf = calculatepsi_cellbycell(bigdf, posfactors, args.readcountfilter)
print('Writing table of psi values...')
psidf.to_csv('psis.cellbycell.txt.gz', sep = '\t', header = True, index = False, compression = 'gzip')
print('Calculating delta psi values...')
deltapsidf = calculatedpsi_cellbycell(psidf, args.conditionA, args.conditionB)
print('Performing statistical tests...')
fdrdf = dotest_cellbycell(psidf)
#Merge deltapsidf and fdrdf
df = deltapsidf.merge(fdrdf, how = 'inner', on = 'gene', left_index = False, right_index = False)
#Write df
df.to_csv('results.cellbycell.txt', sep = '\t', header = True, index = False)
print('Done!')
elif args.mode == 'subsampleClusters':
print('Collapsing counts across all cells within a cluster...')
countsumdf = collapseclusters(bigdf)
print('Calculating psi values...')
psidf = calculatepsi_fromcollapsedclusters(countsumdf, posfactors, args.readcountfilter)
print('Calculating delta psi values...')
deltapsidf = calculatedpsi_fromcollapsedclusters(psidf, args.conditionA, args.conditionB)
print('Performing statistical tests...')
fdrdf = dotest_bootstrapclusters(bigdf, posfactors, 0.4, 5, args.conditionA, args.conditionB, args.readcountfilter)
#Merge deltapsidf and fdrdf
df = deltapsidf.merge(fdrdf, how = 'inner', on = 'gene', left_index = False, right_index = False)
#Write df
df.to_csv('results.subsampleclusters.txt', sep = '\t', header = True, index = False)
print('Done!')