Skip to content

Commit

Permalink
feat: compute and plot cluster information distance for all relevant …
Browse files Browse the repository at this point in the history
…tree sets
  • Loading branch information
dlaehnemann committed Dec 12, 2024
1 parent 89183f7 commit 9917ebb
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 15 deletions.
6 changes: 6 additions & 0 deletions workflow/envs/treedist.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- nodefaults
dependencies:
- r-treedist =2.9
- r-tidyverse =2.0
2 changes: 1 addition & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def get_final_output():
expand(
[
"results/trees/{ind}.{software}.max_missing_stable_topology_selection.pdf",
"results/trees/{ind}.{software}.support_across_missingness.pdf",
"results/trees/{ind}.{software}.tree_values_across_missingness.pdf",
"results/trees/{ind}.{software}.support_vs_branch_length.full_data.pdf",
"results/trees/{ind}.{software}.support_vs_branch_length.summary.pdf",
"results/trees/{ind}/{model}/max_{max_missing}_missing/{ind}.{model}.max_{max_missing}_missing.{software}.support.pdf",
Expand Down
35 changes: 30 additions & 5 deletions workflow/rules/phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,23 @@ rule gotree_support_collapsed_trees:
") 2>{log}"


rule cluster_info_dist_across_trees:
input:
trees=lambda wc: expand(
"results/{{software}}/{{individual}}/results/{{model}}/max_{{n_missing_cells}}_missing/{{individual}}.{{model}}.max_{{n_missing_cells}}_missing.{infix}.raxml.{{type}}",
infix="bootstraps" if wc.type == "bootstraps" else "search",
),
output:
cluster_info_dist="results/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{type}.cluster_info_dist.tsv",
log:
"logs/{software}/{individual}/results/{model}/max_{n_missing_cells}_missing/{individual}.{model}.max_{n_missing_cells}_missing.{type}.cluster_info_dist.log",
conda:
"../envs/treedist.yaml"
threads: 1
script:
"../scripts/cluster_info_dist.R"


rule raxml_ng_rfdist_across_trees:
input:
trees=lambda wc: expand(
Expand Down Expand Up @@ -597,21 +614,29 @@ rule plot_support_tree:
"../scripts/plot_support_tree.R"


rule plot_support_values_across_missingness:
rule plot_values_across_missingness:
input:
support_trees=expand(
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.raxml.support",
model=lookup("raxml_ng/models", within=config),
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config)
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
),
cluster_info_dist=expand(
"results/{{software}}/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.{type}.cluster_info_dist.tsv",
model=lookup("raxml_ng/models", within=config),
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
type=["startTree", "mlTrees", "bootstraps"],
)
output:
support_plot="results/trees/{individual}.{software}.support_across_missingness.pdf",
support_plot="results/trees/{individual}.{software}.tree_values_across_missingness.pdf",
log:
"logs/trees/{individual}.{software}.support_across_missingness_plot.log",
"logs/trees/{individual}.{software}.tree_values_across_missingness_plot.log",
conda:
"../envs/ggtree.yaml"
resources:
mem_mb=lambda wc, input: input.size_mb * 1.2
script:
"../scripts/plot_bootstrap_support_values_across_missingness.R"
"../scripts/plot_tree_values_across_missingness.R"


rule plot_support_values_and_branch_lengths:
Expand Down
28 changes: 28 additions & 0 deletions workflow/scripts/cluster_info_dist.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")


library("TreeDist")
library("tidyverse")

trees <- ape::read.tree(snakemake@input[["trees"]])

pairwise_cid <- ClusteringInfoDistance(trees, trees, normalize=TRUE)

dedup_cids <- pairwise_cid[lower.tri(pairwise_cid)]

annotated_tibble <- dedup_cids |>
as_tibble_col(column_name = "cluster_information_distance") |>
add_column(
model=snakemake@wildcards[["model"]],
max_missing=snakemake@wildcards[["n_missing_cells"]],
tree_type=snakemake@wildcards[["type"]],
software=snakemake@wildcards[["software"]]
)

write_tsv(
x = annotated_tibble,
file = snakemake@output[["cluster_info_dist"]]
)

Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,47 @@ best_trees_support_values <- snakemake@input[["support_trees"]] |>
group = 1
)
) |>
mutate(max_missing = as.integer(max_missing)) |>
select(support, max_missing, model)) |>
list_rbind()
mutate(max_missing = as.integer(max_missing)) |>
select(support, max_missing, model)) |>
list_rbind() |>
add_column(
value_type = "bootstrap support"
) |>
rename(
value = support
)

cluster_info_dist <- read_tsv(snakemake@input[["cluster_info_dist"]]) |>
mutate(
tree_type = str_c(tree_type, " cluster info dist")
) |>
rename(
value = cluster_information_distance,
value_type = tree_type
)

support_across_missingness <- ggplot(
all_values <- bind_rows(
best_trees_support_values,
cluster_info_dist
)

support_across_missingness <- ggplot(
all_values,
aes(
x=model,
y=support
y=value
)
) +
geom_violin() +
geom_jitter(width=0.25) +
stat_summary(color="red") +
geom_violin(
draw_quantiles = c(0.25, 0.5, 0.75)
) +
stat_summary(
fun.data="mean_se",
color="red"
) +
facet_grid(
cols = vars(factor(max_missing))
cols = vars(factor(max_missing)),
rows = vars(factor(value_type))
) +
theme_bw() +
theme(
Expand All @@ -64,5 +89,6 @@ ggsave(
filename = snakemake@output[["support_plot"]],
plot = support_across_missingness,
width = plot_width,
height = 4 * 5 + 1,
limitsize = FALSE
)

0 comments on commit 9917ebb

Please sign in to comment.