-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplos_one.Rmd
614 lines (508 loc) · 42.7 KB
/
plos_one.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
---
title: "PLOS ONE figures"
output:
pdf_document: default
html_notebook: default
---
# Overview
This notebook replicates figures in our PLOS One submission. Some of them are essentially duplicates of other notebooks (i.e. `supplementary_revision.Rmd`), with most of the text left as is, so the text will not necessarily be very coherent. However all of the code should run and replicate the figures just fine. Figures were left to float in the pdf, but code for each figure or table is under the appropriate heading in the `.Rmd` document.
```{r setup, include=FALSE}
library(ape)
library(dplyr)
library(parallel)
library(ggtree)
library(phangorn)
library(Biostrings)
library(BiocGenerics)
library(scales)
library(ggplot2)
library(reshape2)
library(hutan)
library(treeio)
library(knitr)
library(runjags)
library(entropy)
library(treeinform)
cores = detectCores()
if (cores < 1) { cores = 1 }
set.seed(1287234)
source("code/functions.R")
source("code/summary_stats.R")
```
```{r, parse_trees, include=FALSE, warnings=FALSE}
# Parse trees before everything else
# make speciation calibration times (taken from chronos)
age <- c(0.1718036, 0.1430510, 0.2490215, 0.3227746, 0.5157844, 1.0000000)
clade <- c("Calycophora", "Agalmatidae", "Chodorophora", "Siphonophora", "Hydrozoa", "Cnidaria")
calibration_times = data.frame(age, clade, stringsAsFactors = FALSE)
#s <- 1:5606
#before <- dt_phyldog(s, "data/revisions/phyldog/before/", calibration_times)
#trees_before <- before[[1]]
#calibrated_before <- before[[2]]
#dt_before <- before[[3]]
#save(dt_before, file="data/revisions/dt_before.Rdata")
#save(trees_before, file="data/revisions/trees_before.Rdata")
#save(calibrated_before, file="data/revisions/calibrated_before.Rdata")
load("data/revisions/dt_before.Rdata")
load("data/revisions/trees_before.Rdata")
load("data/revisions/calibrated_before.Rdata")
#threshold_4 <- dt_phyldog("data/revisions/phyldog/threshold-4/",calibration_times)[[3]]
#save(threshold_4,file="data/revisions/dt_threshold-4.Rdata")
load("data/revisions/dt_threshold-4.Rdata")
#l <- 36523:43609
#threshold_5 <- dt_phyldog(l, "data/revisions/phyldog/threshold-5/", calibration_times)[[3]]
#save(threshold_5, file="data/revisions/dt_threshold-5.Rdata")
load("data/revisions/dt_threshold-5.Rdata")
#threshold_6 <- dt_phyldog("data/revisions/phyldog/threshold-6/",calibration_times)[[3]]
#save(threshold_6,file="data/revisions/dt_threshold-6.Rdata")
load("data/revisions/dt_threshold-6.Rdata")
#l <- 51176:56944
#threshold_7 <- dt_phyldog(l, "data/revisions/phyldog/threshold-7/", calibration_times)
#dt_7 <- threshold_7[[3]]
#save(dt_7, file="data/revisions/dt_threshold-7.Rdata")
load("data/revisions/dt_threshold-7.Rdata")
#l <- 58509:65608
#threshold_8 <- dt_phyldog(l, "data/revisions/phyldog/threshold-8/", calibration_times)[[3]]
#save(threshold_8, file="data/revisions/dt_threshold-8.Rdata")
load("data/revisions/dt_threshold-8.Rdata")
#l <- 88484:94021
#threshold_mix <- dt_phyldog(l, "data/revisions/phyldog/mix/", calibration_times)[[3]]
#save(threshold_mix, file="data/revisions/dt_threshold-mix.Rdata")
load("data/revisions/dt_threshold-mix.Rdata")
```
# Figure 1: Example Siphonophora gene tree pre-treeinform
Data consists of:
- **data/revisions/phyldog/before/7.ReconciledTree**: Gene tree generated by Agalma1.1.0 and phyldog before running treeinform. This can be regenerated using our scripts in **code/2_agalma_analysis**.
- **data/revisions/7.fa**: Multiple sequence alignment 7 from Agalma1.1.0 which corresponds to gene tree 7. This gene tree was selected through a manual search through the set of pre-treeinform gene trees for a calde of genes from the same species with improbably short terminal branches.
```{r fig1a, echo=FALSE, include=FALSE}
fig1a_tree = read.nhx("data/revisions/phyldog/before/7.ReconciledTree")@phylo
fig1a_colors = rep(1, length(fig1a_tree$tip.label)+fig1a_tree$Nnode)
#32566, 32567, 32568, 32569 are the model IDs we're looking for
# here they are 30,31,32,33
fig1a_colors[30:33] = 2
hydra_ancestor = getMRCA(fig1a_tree, 30:33)
fig1a_colors[hydra_ancestor] = 2
phyDat = read.phyDat("data/revisions/7.fa", format="fasta", type="AA")
fit = pml(fig1a_tree, phyDat)
#anc.ml = ancestral.pml(fit, "ml")
anc.ml = ancestral.pml(fit, "ml", return="phyDat")
# root is 34
test = as.character(anc.ml)
anc.seq = test[34,]
diff = sweep(test, 2, anc.seq, "==")
test_sum = apply(diff*1, 2, sum)
test_order = order(test_sum, decreasing=TRUE)
ordered_diff = diff[,test_order]
test_diff = apply(ordered_diff, c(1,2), function(x) if(x) { return("A") } else { return("C") })
test_DNAbin = as.DNAbin(test_diff)
#write.dna(test_DNAbin, "data/plos_msa.fasta", format="fasta", nbcol=-6, colsep='')
label = sapply(strsplit(sapply(strsplit(fig1a_tree$tip.label, "\\."), function(x) x[1]), "_"), function(x) x[1])
new_tree = chronos(fig1a_tree)
class(new_tree) = "phylo"
ggnt <- ggtree(new_tree)
ggnt$data$label[34:65] <- 34:65 # to deal with the fact that test_DNAbin labels are 34:65 instead of NA
```
```{r plot, echo=FALSE, warning=FALSE, fig.cap='An example gene phylogeny from the test dataset before running treeinform. Each tip is an exemplar transcript that was initially assigned to a different gene. In front, corresponding multiple sequence alignment, with sites ordered from highest to lowest identity to the inferred ancestral site for clarity on sequence diversity. Black indicates a difference from the ancestral sequence. The four Hydra transcripts in color were assigned to different genes by Trinity despite two of the transcripts sharing the exact same sequences, and the two other transcripts differing by a small gap. After treeinform, all transcripts from these four genes are reassigned to a single gene.'}
#tiff(filename="Figures/figure1.tiff",width=7,height=5,units="in",pointsize=8,res=350)
#postscript(file="Figures/figure1_revision.eps",width=7,height=5,fonts=c("sans"))
#cairo_ps(file="Figures/figure1_revision.eps",width=7,height=5,pointsize=8)
msaplot(ggnt, test_DNAbin, color=c("white","black","white","white"), offset=0.3, width=0.8) + geom_tiplab(size=4, aes(color=fig1a_colors), label=label) + theme(legend.position="none")
#dev.off()
```
# Table 1: Summary of parameter estimates from JAGS
* The code to run JAGS is commented out as it takes ~3 hours on a 2018 MacBook Pro. Instead we load it from **data/revisions/runjags_results.Rdata**.
```{r, runjags, include=FALSE}
# Data
Y = dt_before
N = length(dt_before)
# Initial values to get the chains started:
alpha <- 1
beta <- 1
lambda <- 0.571990480546
mu <- 0.544504933034
t <- 1
Constant <- 10
Ones <- rep(1, N)
#results <- run.jags(model="code/runjags.txt", n.chains=3, thin=1) # will take ~1hr
#results <- autorun.jags(model="code/runjags_revision.txt", n.chains=3, thin=1) # will take ~3hrs
#results_extend <- extend.jags(results, add.monitor=c("component_chosen")) # will take ~24hrs
#save(results, file="data/revisions/runjags_results2.Rdata")
load("data/revisions/runjags_results2.Rdata")
sum_results = results$summaries
alpha = sum_results[1,4]
beta = sum_results[2,4]
mu = sum_results[3,4]
lambda = sum_results[4,4]
p1 = sum_results[5,4]
p2 = sum_results[6,4]
mix1 = function(x, p1=p1, alpha=alpha, beta=beta) { p1*dgamma(x,shape=alpha,scale=beta)}
mix2 = function(x, p2=p2, lambda=lambda,mu=mu) { p2*dup_pdf(x,1,lambda,mu)}
load("data/revisions/posterior_component1.Rdata")
df_post = suppressWarnings(data.frame(dt=dt_before,pc1=posterior_component1) %>% filter(posterior_component1>0))
intersection = filter(df_post, pc1 >= 0.95) %>% select(dt) %>% max
#intersection = uniroot(function(x) mix1(x,p1,alpha,beta)-mix2(x,p2,lambda,mu), c(0,0.25))$root
```
We expected the implied duplication events of transcripts of the same gene that were misassigned to different genes to have very short implied duplication times approaching 0, and thus chose to model that component (Component 1) as a gamma distribution with parameters shape$=\alpha$ and rate$=\beta$. To model duplication events and associated times arising from the correctly assigned transcripts (Component 2), we used a birth-death process [@gernhard2008conditioned], which is well studied and often applied to gene analyses of duplication and loss. The probability distribution function in the model we used has parameters birth rate $\lambda$, death rate $\mu$, and tree time of origin $t_{or}$. Because we fitted a chronogram with time of origin 1 onto the gene trees $G = \{ G_1, G_2, \ldots, G_K \}$, we made the assumption that all gene tree times of origin are $t_{or} = 1$. Some gene trees have duplication events predating the first speciation event, thus when we fitted chronograms onto those gene trees they had times of origin greater than 1. We chose to filter these gene trees out of the mixture model and subsequent analyses.
Let $x_{i, k}$ represent duplication time $i$ from gene tree $G_k$, with $z_{i} \in \{ 1, 2 \}$ representing whether $x_{i,k}$ is drawn from the 1st component $(z_i = 1)$ or the 2nd component $(z_i = 2)$. Then if $\pi_1$ and $\pi_2$ denote the overall probability that a duplication time belongs to the 1st and 2nd component respectively, $\Gamma(x_{i,k} | \alpha, \beta)$ is the probability density function for the gamma distribution, and $f(x_{i,k} | t_{or,k} = 1, \lambda, \mu)$, we get the expression
$$ P(x_{i,k}) = \pi_1 \Gamma(x_{i,k}|\alpha, \beta) + \pi_2 f(x_{i,k} | t_{or,k}=t, \lambda, \mu)$$
We used Just Another Gibbs Sampler (JAGS) [@Plummer03jags] to perform Bayesian Gibbs Sampling in order to infer the parameters $\alpha$, $\beta$, $\lambda$, and $\mu$ as well as the mixing proportions $\pi_1$ and $\pi_2$. This gave us the parameter estimates in Table 2.
```{r, runjags_table, echo=FALSE}
sum_format = sum_results[,c("Lower95", "Mean", "Upper95", "MCerr")]
rownames(sum_format) <- c("$\\alpha$", "$\\beta$", "$\\mu$", "$\\lambda$", "$\\pi_1$", "$\\pi_2$")
kable(sum_format, format='pandoc', caption='Summary of parameter estimates from JAGS.')
#kable(sum_format, format='latex', caption='Summary of parameter estimates from JAGS.', booktabs = T)
```
# Table 2: Adjusted Rand Index between Trinity and Corset
# Figure 2: Subtree lengths for Siphonophora pre-treeinform
We first examined the prevalence of transcript misassignment. For each node in each of the `r toString(length(trees_before))` gene phylogenies, we calculated the length of the corresponding subtree. This is the sum of the length of all branches in the subtree defined by the node. An excess of very short subtrees would be a strong indication of assigning different transcripts of the same gene, which have very similar sequences and therefore short branches connecting them in phylogenetic trees, to different genes. This is the pattern we found (Supplementary Figure 1).
Two issues could create a misleading impression in the histogram of subtree lengths for internal nodes (Supplementary Figure 1). First, it considers all subtrees, including those defined by both speciation and duplication nodes. Misassigning transcripts from the same gene to multiple genes will artificially inflate only the number of duplication nodes, since variation across transcripts within a gene are essentially misassigned to gene duplication events. Examining just the duplication events in the gene trees therefore provides a more direct perspective on the problem we investigate here. Second, subtree lengths are in units of expected numbers of substitution, which depend on both rates of molecular evolution and time. Because the rates of evolution can vary within and between gene phylogenies, variation in rates could confound the interpretation of gene tree sublength.
We therefore performed a calibrated analysis and focused only on duplication nodes. We first created a time calibrated species tree, with all tips with age 0 and the root node with age 1. We then transformed the branch lengths of the gene trees so that each speciation node in each gene tree had the same age as the corresponding node in the species tree (see source code for this document). A histogram of the calibrated duplication times (Supplementary Figure 2) indicates there is a large excess of recent duplications. This provides additional evidence for the frequent misassignment of transcripts from the same gene to artefactual recent gene duplicates.
```{r, subtree_histo, echo=FALSE, warning=FALSE, fig.cap="Histogram of subtree lengths for internal nodes in each Siphonophora subset gene tree from Agalma containing tip descendants from the same species. Subtree lengths greater than 1 were filtered out for clarity."}
lengths = unlist(multi_subtree_lengths(trees_before, cores))
lengths_lim = lengths[which(lengths<1)]
#tiff(filename="Figures/subtree_lengths.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=350)
#pdf(file="Figures/subtree_lengths.eps",width=7,height=5,fonts=c("sans"))
ggplot(data=data.frame(x=lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency") #+ geom_vline(xintercept=0.02, linetype='dashed', color='#56B4E9') + geom_text(data=data.frame(x=0.02,y=0.08), map=aes(x=x, y=y), label="0.02", vjust=1.4, angle=90, colour="#56B4E9", size=3)
#dev.off()
```
# Figure 3: Corset clusterings
In order to get a sense of whether the transcript misassignment errors were localized to Trinity or are a more general problem to transcriptome assembly, we compared Trinity transcript clustering results with another transcript clustering tool, Corset [@Davidson2014] for the 5 species that had to be assembled. For 3 of the samples (SRX288285, SRX288430, SRX288431) we also ran cd-hit [@Fu2012] to remove transcripts with 100% identity in order to address some speed issues in Corset. The distribution of cluster sizes (Supplementary Figure 2) suggests that Corset tends to overcluster compared to Trinity, which would lead to similar misassignment errors.
```{r, corset_compare, echo=FALSE, warnings=FALSE, fig.cap="Cluster size counts for Trinity assembly and Corset clustering algorithm on Trinity contigs. There are 3 Trinity clusters with size greater than 30, while there are 20 Corset clusters with size greater than 30. This suggests that the same misassignment errors are generated by other transcriptome assemblers and clustering algorithms as well."}
#clusters.mod.txt <- (Sys.glob("data/revisions/corset/*-clusters.mod.txt"))
#corset_files <- lapply(clusters.mod.txt, function(x) read.csv(x, sep="\t", col.names = c("Trinity gene", "Trinity isoform", "Corset gene")))
#save(corset_files, file="data/revisions/corset_clusters.Rdata")
load("data/revisions/corset_clusters.Rdata")
trinity_clusters <- lapply(corset_files, function(x) cluster_size_distribution(x$Trinity.gene))
corset_clusters <- lapply(corset_files, function(x) cluster_size_distribution(x$Corset.gene))
catalog_ids=c('SRX288276','SRX288285','SRX288430','SRX288431','SRX288432')
suppressWarnings(size_dist_dfs<-lapply(1:5, function(x)
{
nrx<-nrow(trinity_clusters[[x]])
nry<-nrow(corset_clusters[[x]])
nrt<-nrx+nry
df<-rbind(trinity_clusters[[x]],corset_clusters[[x]])
tmp <- rep("Trinity", nrt)
tmp[nrx+1:nry]<-"Corset"
df$'Method' <- tmp
df$'ID' <- catalog_ids[x]
return(df)
}))
suppressWarnings(new_df <- do.call(rbind, size_dist_dfs))
# basic stats about corset & trinity
trinity_over_30<-filter(new_df, Method=='Trinity' & as.numeric(levels(size))[size]>30)
num_trinity_over_30<-sum(trinity_over_30$freq) # returns 3
corset_over_30<-filter(new_df, Method=='Corset' & as.numeric(levels(size))[size]>30)
num_corset_over_30<-sum(corset_over_30$freq) # returns 20
#tiff(filename="Figures/corset_cluster.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#pdf(file="Figures/corset_cluster.pdf",width=7,height=5,fonts=c("sans"))
suppressWarnings(ggplot(data=new_df,aes(x=size,y=freq,color=Method,group=Method))+geom_point(alpha=0.7)+scale_y_log10()+xlab("Cluster Size")+ylab("Count") + theme_classic() + geom_line() + facet_wrap(~ID, ncol=2) + scale_x_discrete(breaks=seq(0,70,by=10)))
# no titles for plos
#+ggtitle("Cluster size counts for Trinity & Corset by sample")
#dev.off()
```
# Figure 4: Subtree lengths after corset
```{r, corset_trees, echo=FALSE, warning=FALSE, fig.cap="Histogram of subtree lengths for internal nodes in each Siphonophora subset gene tree from Agalma with Corset clusterings containing tip descendants from the same species. Subtree lengths greater than 1 were filtered out for clarity."}
#k<-1:5644
#corset_tree_files <- "data/revisions/corset/phyldog/"
#corset_trees <- mclapply(k, function(x) parse_gene_trees(processTree(paste0(corset_tree_files, x, ".ReconciledTree"))), mc.cores = cores)
#corset_trees <- corset_trees[which(!unlist(mclapply(corset_trees, is.null)))]
#save(corset_trees, file="data/revisions/corset_trees.Rdata")
load("data/revisions/corset_trees.Rdata")
corset_lengths <- unlist(multi_subtree_lengths(corset_trees,cores))
corset_lengths_lim = corset_lengths[which(corset_lengths<1)]
# are corset subtree lengths different from trinity...?
trinity_corset_ks <- ks.test(corset_lengths,lengths)$statistic
rejection_level_tc <- 1.36*sqrt((length(corset_lengths)+length(lengths))/(length(corset_lengths)*length(lengths)))
#tiff(filename="Figures/corset_lengths.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#postscript(file="Figures/corset_lengths.eps",width=7,height=5,fonts=c("sans"))
ggplot(data=data.frame(x=corset_lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency") #+ geom_vline(xintercept=0.02, linetype='dashed', color='#56B4E9') + geom_text(data=data.frame(x=0.02,y=0.08), map=aes(x=x, y=y), label="0.02", vjust=1.4, angle=90, colour="#56B4E9", size=3)
#dev.off()
```
# Figure 5: Mixture model
```{r, runjags_viz, echo=FALSE, warnings=FALSE, fig.cap='Histogram of the inferred duplication times with an overlaid mixture model. Component 1 of the mixture model captures the technical issues we address where transcripts from the same gene are assigned to different genes, and component 2 captures the biological pattern, i.e. transcripts from different genes correctly assigned so. We first ran phyldog [@boussau2013genome] on the Siphonophora subset multiple sequence alignments from Agalma and a user-inputted species tree. This provided gene trees with internal nodes annotated as duplication or speciation events. We then fitted chronograms onto these gene trees with our user-inputted species tree.'}
#tiff(filename="Figures/mixmodel.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=350)
#pdf(file="Figures/mixmodel.pdf",width=7,height=5,fonts=c("sans"))
ggplot(data.frame(x=dt_before)) + geom_histogram(aes(x=x,y=..density..), fill='white', color='black', binwidth=0.01) + stat_function(geom="line",fun=mix1,args=list(p1,alpha,beta),xlim=c(0.005,1), aes(color="line1"))+stat_function(geom="line",fun=mix2,args=list(p2,lambda,mu),aes(color="line2"))+ggtitle("Density Curves of Mixture Model Plotted on Histogram of\n Inferred Duplication Times Before Treeinform") + scale_color_manual("Components of Mixture Model", labels=c('line1' = expression("Component 1, " * pi[1]*Gamma(x, alpha, beta)), 'line2' = expression("Component 2, "*pi[2]*f(x,1,lambda,mu))), values=c("#FF000080", "blue")) + xlab("Duplication Times") + ylab("Density") + theme_classic() + theme(legend.position = c(0.8, 0.6))
#dev.off()
#z = theme(legend.position = "none", text=element_text(colour='#838787', size = 20), plot.background=element_rect(fill ='#222222'), panel.background=element_rect(fill='#222222'), legend.background=element_rect(fill='#222222'), axis.line=element_line(colour='#838787'), axis.text=element_text(colour='#838787', size=15), axis.ticks = element_line(colour='#838787'), line=element_line(size=5))
#ggplot(data.frame(x=dt_before)) + geom_histogram(aes(x=x,y=..density..), fill='#838787', color='black', binwidth=0.01) + geom_vline(xintercept=0.01, linetype='dashed', color='yellow') + geom_text(data=data.frame(x=0.01,y=5), map=aes(x=x, y=y), label="0.01", vjust=1.4, angle=90, colour="yellow", size=8) + stat_function(geom="line",fun=mix1,args=list(p1,alpha,beta),xlim=c(0.003,1), colour="#E42832", size=1)+stat_function(geom="line",fun=mix2,args=list(p2,lambda,mu),colour="#34A5DA", size=1) + ylab("Density") + xlab("Time") + theme_classic() + z
```
# Figure 6: Reassigned percentage
```{r, fused_percentage_pre, echo=FALSE, warnings=FALSE}
genetree_path <- "data/revisions/genetrees/"
thresholds<-head(list.files(genetree_path), -1)
before_genetrees<-read_trees(paste0(genetree_path,"before"),"newick",cores)
phylos<-lapply(before_genetrees, function(x) x@phylo)
before_counts<-count_all_transcripts(phylos,cores)
orig_assembly = sum(before_counts)
#genetrees<-lapply(thresholds, function(x) read_trees(paste0(genetree_path,x), "newick",cores))
#counts<-lapply(genetrees, function(x) count_all_transcripts(x,cores))
#save(counts,file="data/revisions/counts.Rdata")
load(file="data/revisions/counts.Rdata")
Total_percentage<-sapply(counts, function(x) sum(before_counts-x)/orig_assembly)
percentages<-mclapply(counts, function(x) (before_counts-x)/before_counts,mc.cores=cores)
size<-rep(1,8*length(percentages))
size[(7*length(percentages)+1):(8*length(percentages))]<-2
size<-as.factor(size)
thresholds<-as.numeric(thresholds)
wide_table<-data.frame(cbind(do.call(rbind,percentages),thresholds,Total_percentage))
colnames(wide_table) <- sapply(strsplit(colnames(wide_table),"_"),function(x) x[1])
long_table<-melt(wide_table,id.vars=c("thresholds"))
long_table<-cbind(long_table,size)
long_table$value<-as.numeric(long_table$value)
long_table$thresholds<-as.numeric(long_table$thresholds)
# coloring for threshold selection & mixture model selection
#grp <- c(rep(1, 18), 2)
#fused.df.percents <- cbind(fused.df.percents, grp)
cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")
hydra_percentage<-sapply(1:4, function(x) (before_counts-counts[[x]])[4]/sum(before_counts-counts[[x]]))
default_percentages<-filter(long_table, thresholds==5e-04)
```
In order to validate that treeinform improves the accuracy of assigning transcripts to genes under the specified threshold, we performed two analyses. First, we plotted the percentage of reassigned genes at different thresholds to assess the performance of the default threshold value of 0.0005 (Supplementary Figure 5). Below the default value, the percentage of reassigned genes begins to plateau, while above the default value the percentage of reassigned genes increases very quickly, increasing the likelihood of treeinform to reassign transcripts from different genes to the same gene in addition to reassign transcripts from the same gene together.
```{r, fused_percentage, echo=FALSE, warnings=FALSE, fig.cap="The percentage of reassigned tips is plotted above on a log scale. The original assembly had 433,071 genes, with 47,688 included in the gene trees after Agalma filtering criteria, and at most 23,396 possible candidates (49.06% of genes) for reassignment. The default threshold for treeinform is marked by the grey vertical dashed line."}
#tiff(filename="Figures/reassigned.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#pdf(file="Figures/reassigned.pdf",width=7,height=5,fonts=c("sans"))
ggplot(long_table,aes(thresholds,value*100)) + geom_vline(xintercept=0.0005, linetype='dashed', color='grey') + geom_text(data=data.frame(x=0.0005,y=0.25), map=aes(x=x, y=y), label="0.0005", vjust=1.4, angle=90, colour="grey", size=3) + geom_point(aes(group=variable,color=variable,shape=size), alpha=0.5) + geom_line(aes(group=variable,color=variable,linetype=size)) + xlab("Threshold") + ylab("Percentage") + scale_x_log10() + theme_classic() + scale_shape(guide="none") + scale_linetype(guide="none") + scale_colour_manual(name="Species",labels=c("Abylopsis","Agalma","Craseoa","Hydra","Nanomia","Nematostella","Physalia","Total"),values=cbbPalette)
#+ ggtitle("Percentage of reassigned genes by species + total percentage vs threshold")
#dev.off()
```
We also looked at the percentage of reassigned genes for each species in order to get a sense of how variable by species transcript misassignment was. The percentage of reassigned genes for each species was rather variable, with Hydra magnipapillata having a much higher proportion of reassigned genes (16.03%) at the threshold. This affected the total proportion of reassigned genes as well, with the majority (46-47%) of reassigned genes at and around the treeinform threshold coming from Hydra magnapapillata. However, even for the other species, 1.88-6.18% of genes were reassigned at the threshold.
# Figure 7 - Density of duplication times
Second, we compared the density of duplication times under the model provided for Component 2 of the mixture model to the distribution of estimated duplication times for gene trees from Agalma before treeinform, and gene trees from Agalma after treeinform under 3 different thresholds: 0.05, 0.0005, and 0.07 (Supplementary Figure 6). We again fitted chronograms with the same Siphonophora species tree onto all gene trees from Agalma and filtered out those gene trees with time of origin greater than 1, so that duplication times were comparable between trees. Visually, the analyses with the 0.0005 threshold comes closest to the theoretical.
```{r, duplication_theoretical, echo=FALSE, warnings=FALSE, fig.cap="Density from theoretical and the empirical density under the 3 different thresholds as well as before treeinform was run. The distribution before treeinform has a large peak on the left that is removed by treeinform with all examined thresholds."}
dt <- c(dt_before, threshold_mix, threshold_5, dt_7)
l1 = length(dt_before)
l2 = length(threshold_mix)
l3 = length(threshold_5)
l4 = length(dt_7)
Threshold <- rep(NA, l1+l2+l3+l4)
Threshold[1:l1] <- "Before"
Threshold[l1+1:l2] <- "0.07"
Threshold[l1+l2+1:l3] <- "0.05"
Threshold[l1+l2+l3+1:l4] <- "0.0005"
data = data.frame(dt, Threshold)
f = function(x) dup_pdf(x, 1, lambda, mu)
#tiff(filename="Figures/validate.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#pdf(file="Figures/validate.pdf",width=7,height=5,fonts=c("sans"))
ggplot(data, aes(dt)) + geom_density(aes(group=Threshold, color=Threshold), bw=0.02) + stat_function(fun=f) + xlab("Duplication Times") + ylab("Density") + theme_classic() + theme(legend.position = c(0.8, 0.6)) + geom_hline(yintercept=0, color="white") + geom_vline(xintercept=0, color="white")
#+ ggtitle("Density of inferred and theoretical duplication times\nunder different treeinform thresholds")
#dev.off()
# FOR PRES
#ggplot(data, aes(dt)) + geom_density(aes(group=Threshold, color=Threshold), bw=0.02) + stat_function(fun=f, color='white') + ggtitle("Density of inferred and theoretical duplication times\nunder different treeinform thresholds") + xlab("Duplication Times") + ylab("Density") + theme_classic() + geom_hline(yintercept=0, color="#222222") + geom_vline(xintercept=0, color="#222222") + z
```
# Table 3 - KL Divergences
Additionally, we computed the Kullback-Leibler distance [@kullback1951] between the distributions of duplication times under different thresholds and the theoretical distribution of duplication times (Table 3). Kullback-Leibler distance, otherwise known as relative entropy, measures the distance between two distributions. The KL distance between the distribution of duplication times after running treeinform with the default threshold value of 0.0005 is not minimal, but is robust as compared to both threshold levels below and above the default value. This indicates that treeinform produces more accurate gene trees with appropriate threshold selection.
```{r, kl, echo=FALSE, warning=FALSE, fig.cap=''}
z.pre <- rep(NA, 10000)
u <- runif(10000)
theoretical = function(x, u) dup_cdf_sampler(x, 1, lambda, mu, u)
my.uniroot <- function(x) uniroot(theoretical, c(0, 1), u=x)$root
r <- vapply(u, my.uniroot, numeric(1))
d_theo <- discretize(r, numBins=100)
d_before <- discretize(dt_before, numBins=100)
d_mix <- discretize(threshold_mix, numBins=100)
#d_2 <- discretize(threshold_2, numBins=100)
d_5 <- discretize(threshold_5, numBins=100)
d_6 <- discretize(threshold_6, numBins=100)
d_7 <- discretize(dt_7, numBins=100)
d_8 <- discretize(threshold_8, numBins=100)
kl_dist <- c(KL.empirical(d_before, d_theo), KL.empirical(d_mix, d_theo), KL.empirical(d_5, d_theo), KL.empirical(d_6,d_theo), KL.empirical(d_7,d_theo), KL.empirical(d_8, d_theo))
kl <- data.frame("KL Distance" = kl_dist)
rownames(kl) <- c("Before", "0.07", "0.05", "0.005", "0.0005", "5e-05")
kable(kl, format='pandoc', caption='Kullback-Leibler distances between duplication times after running treeinform with different thresholds and theoretical duplication times.')
#kable(kl, format='latex', caption='Kullback-Leibler distances between duplication times after running treeinform with different thresholds and theoretical duplication times.', booktabs = T)
```
# Figures 8 and 9 - Echinoderm and Drosophila subtree lengths and CDS precision/recall
```{r, isoform_num, echo=FALSE, warning=FALSE}
PATH="data/drosophila_analysis"
TRINITY_PATH=file.path(PATH,"trinity")
thresholds=c(50,5,.5,.005,.0005,.00005,.0000005)
genetree_versions=c(18,22,26,30,34,38,42) # treeinform genetree versions
versions=c(15,19,23,27,31,35,39) # treeinform versions
csv_files<-c("trinity.csv",sapply(versions, function(x) paste0("treeinform",x,".csv")))
isoform_genes<-lapply(csv_files, function(x) read.csv(file.path(TRINITY_PATH,x)))
isoform_num<-mclapply(isoform_genes, function(x) cluster_size_distribution(x$gene), mc.cores=cores)
cds<-read.table(file.path(PATH,"dmel_unique_protein_isoforms_fb_2018_04.tsv"),sep="\t",skip=4,col.names=c("FBgn","FB_gene_symbol","representative_protein","identical_protein(s)"),header=FALSE,fill=TRUE)
cds_isoform_num<-cluster_size_distribution(cds$FBgn)
genetrees <- read_trees(file.path(PATH,"genetree-10"),"newick",cores=cores)
```
## Data
Data for Drosophila is in:
* `PATH = "data/drosophila_analysis"`
* genetrees from Trinity and treeinform are stored under `genetree-x` folders in $PATH, where `x` corresponds to their run id in the agalma database.
* blast alignments are in `blast`
Data for S. Purpuratus is in:
* `PATH="data/echinoderm_analysis"`
* CDS isoform data for S. purpuratus is
```{r, subtree, echo=FALSE, warning=FALSE, fig.cap="The subtree branch lengths for Drosophila actually look pretty good."}
# * Trinity and treeinform isoform to gene mappings for Drosophila melanogaster are stored in the `trinity` folder in $PATH. They were generated from the agalma database as `.csv` files with the following command:
# `python dump_sqlite.py agalma_trinity.sqlite 0 15 19 23 27 31 35 39`.
#tiff(filename="Figures/drosophila_subtree.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
lengths = unlist(multi_subtree_lengths(genetrees, cores))
lengths_lim = lengths[which(lengths<1)]
ggplot(data=data.frame(x=lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency") #+ geom_vline(xintercept=0.02, linetype='dashed', color='#56B4E9') + geom_text(data=data.frame(x=0.02,y=0.08), map=aes(x=x, y=y), label="0.02", vjust=1.4, angle=90, colour="#56B4E9", size=3)
#dev.off()
```
```{r, echino_subtree, echo=FALSE, warning=FALSE, fig.cap="Not a big peak again."}
# don't have this data...
PATH="/Users/aguang/Dropbox\ (Brown)/treeinform/echinoderm_analysis"
#echino_lengths2<-multi_subtree_lengths2(echino_genetrees,4)
#echino_lengths2 <- unlist(echino_lengths2)
#save(echino_lengths2,file=file.path(PATH,"echino_lengths2.Rdata"))
load(file.path(PATH,"echino_lengths2.Rdata"))
#load(file.path(PATH,"echino_lengths.Rdata"))
#echino_genetrees <- read_trees(file.path(PATH,"genetrees"),"newick",cores=cores)
echino_genetrees <- read.tree(file.path(PATH,"genetrees.newick"))
#echino_lengths = unlist(multi_subtree_lengths(echino_genetrees, cores))
echino_lengths_lim = echino_lengths2[which(echino_lengths2<1)]
#tiff(filename="Figures/echinoderm_subtree.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
ggplot(data=data.frame(x=echino_lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency") #+ geom_vline(xintercept=0.02, linetype='dashed', color='#56B4E9') + geom_text(data=data.frame(x=0.02,y=0.08), map=aes(x=x, y=y), label="0.02", vjust=1.4, angle=90, colour="#56B4E9", size=3)
#test<-ast$literal_eval(readLines(file.path(PATH,"histo.txt")))
```
There were `r toString(length(echino_genetrees))` gene trees from Agalma for the Echinoderm dataset pre-treeinform. We plotted 2 figures here: subtree branch lengths across all gene trees (Figure 5), and correct clusterings vs transcripts incorrectly assigned to the same gene (Figure 7).
The CDS comparisons are actually pretty interesting.
```{r, cds_comparisons, echo=FALSE, warnings=FALSE}
run_ids=c(2,3,4,5,6)
treeinform_ids=c(11,15,19,23,27,31,35,39,43,47,58,59,60,61,62,63,64,65)
srx_ids=c("SRX247001","SRX247003","SRX054483","SRX054470","SRX054462")
# melanogaster is 6 / SRX054470
#files <- grep(".tsv",list.files(file.path(PATH,"blast")),value=TRUE)
blast <- mclapply(srx_ids,
function(x) read.delim(file.path("data","drosophila_analysis","blast",paste0(x,".tsv")),header=FALSE,col.names=c("trinity","CDS","percent_identity","align_len","mismatch","gapopen","qstart","qned","sstart","send","evalue","bitscore")),
mc.cores=cores)
gn_tr <- read.delim(file.path("data","drosophila_analysis","fbgn_fbtr_fbpp_fb_2018_04.tsv"),skip=4,col.names=c("FBgn","FBtr","FBpp"))[,2:1]
replaced_blast <- mclapply(blast,
function(x) {
df <- replace_values(x, gn_tr, 2)
df <- best_unique(df, 'trinity')
trinity_split(ungroup(df))
},
mc.cores=cores)
cs <- mclapply(replaced_blast, cluster_stats, mc.cores=cores)
# 1=number of singletons, 2=number of pairs with same clusterings, 3=number of pairs with same clustering in 2 and diff in 1, 4=number of pairs with same clustering in 1 and diff in 2
agg <- Reduce(function(x,y) mapply(function(a,b) a+b, x, y),cs)
#treeinform_csv <- grep(".csv", list.files(file.path(PATH,"blast")),value=TRUE)
trinitys <- mclapply(run_ids, function(x) {
csvs <- read.csv(file.path("data","drosophila_analysis","blast",paste0("trinity_",x,".csv")))
return(csvs) }, mc.cores=cores)
replaced_blast_modelids <- mclapply(1:5,function(x) {
left_join(replaced_blast[[x]],trinitys[[x]],by=c("gene","isoform"))
}, mc.cores=cores)
# THESE ARE BY RUNID
#treeinforms <- mclapply(run_ids,
# function(x) {
# csvs <- lapply(treeinform_ids, function(y) {
# csv <- read.csv(file.path(PATH,"blast",paste0("treeinform_",y,"_",x,".csv")))
# colnames(csv) <- #c("model_id",paste0("treeinform_",y,"_gene"),paste0("treeinform_",y,"_isoform"))
# return(csv)
# })
# csvs[[length(csvs)+1]] = replaced_blast_modelids[[x-1]] #run_ids - 1
# df <- Reduce(function(x,y) left_join(x,y,by="model_id"), csvs)
#df[complete.cases(df),]
# },
# mc.cores=cores
#)
#cs_treeinforms <- mclapply(treeinforms,
# function(x) {
# lapply(treeinform_ids,
# function(y) cluster_stats(select(x,paste0("treeinform_",y,"_gene"),CDS))
# )
# }, mc.cores=cores)
#save(cs_treeinforms,file=file.path(PATH,"cs_treeinforms.Rdata"))
load(file.path("data","drosophila_analysis","cs_treeinforms.Rdata"))
# the cs_treeinforms are then by... treeinform_ids
# we want to tally across species/run ids
list_lists <- lapply(cs_treeinforms, function(x) do.call(cbind, x))
agg_treeinforms <- matrix(Reduce(function(x,y) mapply(function(a,b) a + b, x, y), list_lists), nrow=4)
#jaccard <- c(agg[2]/(agg[2]+agg[3]+agg[4]),agg_treeinforms[2,]/(agg_treeinforms[2,]+agg_treeinforms[3,]+agg_treeinforms[4,]))
```
```{r, cds_plots}
thresholds=c(0,500,50,5,.5,.05,.005,.0005,.00005,.000005,.0000005, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06)
correct <- c(agg[2],agg_treeinforms[2,])
diff_Y_same_X <- c(agg[4],agg_treeinforms[4,])
diff_X_same_Y <- c(agg[3],agg_treeinforms[3,])
cds_table <- data.frame(correct,diff_Y_same_X,diff_X_same_Y,thresholds)
cds_table$precision <- cds_table$correct/(cds_table$correct+cds_table$diff_Y_same_X)
cds_table$recall <- cds_table$correct/(cds_table$correct+cds_table$diff_X_same_Y)
#tiff(filename="figures/drosophila_pr.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#ggplot(cds_table, aes(x=precision,y=recall)) + geom_point() + geom_line() + xlab("Recall") + ylab("Precision") + theme_classic() + ggrepel::geom_label_repel(aes(label = thresholds))
#ggplot(cds_table, aes(x=diff_Y_same_X,y=correct)) + geom_point() + geom_line() + xlab("Recall") + ylab("Precision") + theme_classic() + ggrepel::geom_label_repel(aes(label = thresholds)) #+ ggtitle("Pairs of orrectly assigned transcripts vs\n pairs of transcripts incorrectly assigned to the same gene\n as threshold increases for 6 species of Drosophila")
# Precision is correctly assigned transcripts/correct + incorrectly assigned to same gene. Recall is correctly assigned/correct + incorrectly assigned to different genes.
```
```{r, cds_comparisons_echino, echo=FALSE, warnings=FALSE}
treeinform_ids=176:194
echino_blast <- read.delim(file.path("data","echinoderm_analysis","spur.tsv"),header=FALSE,col.names=c("trinity","CDS","percent_identity","align_len","mismatch","gapopen","qstart","qned","sstart","send","evalue","bitscore"))
transcriptome_headers <- fasta.index(file.path(PATH,"sp","Transcriptome.fasta"))$desc
splits <- lapply(transcriptome_headers, function(x) strsplit(x,split=" TSA: Strongylocentrotus purpuratus ")[[1]])
echino_header <- sapply(splits, function(x) x[1])
echino_gene <- sapply(splits, function(x) {
tmp <- strsplit(x[2],split="\\.|\\_")[[1]]
paste(tmp[1],tmp[2],sep=".")})
gn_tr_echino <- data.frame(CDS=echino_header,gene=echino_gene)
replaced_blast_df_echino <- replace_values(echino_blast, gn_tr_echino, 2)
replaced_blast_df_echino <- best_unique(replaced_blast_df_echino, 'trinity')
trinity_csv <- read.csv(file.path("data","echinoderm_analysis","treeinforms","trinity_64.csv"))
colnames(trinity_csv) <- c("model_id", "gene", "isoform")
colnames(replaced_blast_df_echino)[1] <- "model_id"
combined_blast_echino <- left_join(replaced_blast_df_echino, trinity_csv, by="model_id")
combined_blast_echino <- ungroup(combined_blast_echino)
echino_cs <- cluster_stats(combined_blast_echino[,c("gene","CDS")])
# 1=number of singletons, 2=number of pairs with same clusterings, 3=number of pairs with same clustering in 2 and diff in 1, 4=number of pairs with same clustering in 1 and diff in 2
# don't need the further Reduce step because we only have ONE species
#echino_agg <- Reduce(function(x,y) mapply(function(a,b) a+b, x, y),echino_cs)
#treeinforms <- mclapply(treeinform_ids, function(y) {
# csv <- #read.csv(file.path(PATH,"treeinforms_with_spur",paste0("treeinform_",y,"_64.csv")))
# colnames(csv) <- #c("model_id",paste0("treeinform_",y,"_gene"),paste0("treeinform_",y,"_isoform"))
# return(csv)
# }, mc.cores=cores)
#treeinforms_all <- Reduce(function(x,y) left_join(x,y,by="model_id"), treeinforms)
#save(file="treeinforms_withspur_all.Rdata",treeinforms_all)
#load("treeinforms_all.Rdata")
load("data/echinoderm_analysis/treeinforms_withspur_all.Rdata")
#echino_treeinform_genes <- left_join(combined_blast_echino, treeinforms_all, by="model_id")
#echino_cs_treeinforms <- mclapply(treeinform_ids, function(y)
# cluster_stats(select(echino_treeinform_genes,paste0("treeinform_",y,"_gene"),CDS)),
# mc.cores=cores)
#save(file="echino_cs_treeinforms_withspur.Rdata",echino_cs_treeinforms)
#load("echino_cs_treeinforms.Rdata")
load("data/echinoderm_analysis/echino_cs_treeinforms_withspur.Rdata")
echino_agg_treeinforms <- matrix(unlist(echino_cs_treeinforms),nrow=4)[,1:19]
#echino_jaccard <- c(echino_cs[[2]]/(echino_cs[[2]]+echino_cs[[3]]+echino_cs[[4]]),echino_agg_treeinforms[2,]/(echino_agg_treeinforms[2,]+echino_agg_treeinforms[3,]+echino_agg_treeinforms[4,]))
```
```{r, cds_plots_echino}
echino_thresholds=c(0,500,50,5,.5,.05,.005,.0005,.00005,.000005,.0000005, 0.01, 0.02, 0.035, 0.15, 0.25, 0.35, 1.5, 2.5, 3.5)
echino_correct <- c(echino_cs[[2]],echino_agg_treeinforms[2,])
echino_diff_Y_same_X <- c(echino_cs[[4]],echino_agg_treeinforms[4,])
echino_diff_X_same_Y <- c(echino_cs[[3]],echino_agg_treeinforms[3,])
echino_cds_table <- data.frame(correct=echino_correct,diff_Y_same_X=echino_diff_Y_same_X,diff_X_same_Y=echino_diff_X_same_Y,thresholds=echino_thresholds)
echino_cds_table$precision <- echino_cds_table$correct/(echino_cds_table$correct+echino_cds_table$diff_Y_same_X)
echino_cds_table$recall <- echino_cds_table$correct/(echino_cds_table$correct+echino_cds_table$diff_X_same_Y)
#tiff(filename="figures/echinoderm_cds.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
#ggplot(echino_cds_table, aes(x=recall,y=precision)) +
# geom_point() + geom_line() +
# xlab("Recall") +
# ylab("Precision") + theme_classic() +
# ggrepel::geom_label_repel(aes(label = thresholds))
#dev.off()
#+ggtitle("Pairs of correctly assigned transcripts vs\n pairs of transcripts incorrectly assigned to the same gene\n as threshold increases for S. purpuratus")
#echino_ratio_table <- data.frame(echino_ratio, echino_thresholds)
#ggplot(echino_ratio_table, aes(x=echino_thresholds,y=echino_ratio)) + geom_point() + ggtitle("Jaccard index of correctly assigned transcripts vs treeinform threshold") + xlab("Threshold") + ylab("Jaccard") + theme_classic()
```
For the manuscript submission we also put the plots together using patchwork.
```{r togtherplots}
library(patchwork)
p_dcds<-ggplot(cds_table, aes(x=recall,y=precision)) + geom_point() + geom_line() + xlab("Recall") + ylab("Precision") + theme_classic() + ggrepel::geom_label_repel(aes(label = thresholds))
p_ecds<-ggplot(echino_cds_table, aes(x=recall,y=precision)) +
geom_point() + geom_line() +
xlab("Recall") +
ylab("Precision") + theme_classic() +
ggrepel::geom_label_repel(aes(label = thresholds))
#tiff(filename="figures/cds.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
p_dcds / p_ecds
#dev.off()
p_dlen<-ggplot(data=data.frame(x=lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency")
p_elen<-ggplot(data=data.frame(x=echino_lengths_lim)) + geom_histogram(aes(x=x,y=..count../sum(..count..)), binwidth=0.01, fill='white', color='black') + theme_classic() + xlab("Subtree length") + ylab("Frequency")
#tiff(filename="figures/model_subtree.tiff",width=7,height=5,family="Arial",units="in",pointsize=8,res=300)
p_dlen / p_elen
#dev.off()
```