Skip to content

Commit

Permalink
feat: overhaul summary plots (add information criteria, add ecdf and …
Browse files Browse the repository at this point in the history
…histogram plots)
  • Loading branch information
dlaehnemann committed Dec 2, 2024
1 parent e79a8ce commit 7f111f6
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 25 deletions.
23 changes: 20 additions & 3 deletions workflow/rules/phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,9 @@ rule raxml_ng_bsconverge_to_tsv:
for line in f:
if line.startswith("Alignment sites: "):
usable_sites = line.split("Alignment sites: ")[1].rstrip()
elif line.startswith("AIC score: "):
scores = line.rstrip().split(" / ")
[ aic, aicc, bic ] = [ s.split(" score: ")[1] for s in scores]
break
with open(output.tsv, mode="w") as o:
o.write("\t".join(
Expand All @@ -448,6 +451,9 @@ rule raxml_ng_bsconverge_to_tsv:
"tree_type",
"total_trees",
"convergence",
"aic_score",
"aicc_score",
"bic_score",
]
))
o.write("\n")
Expand All @@ -460,6 +466,9 @@ rule raxml_ng_bsconverge_to_tsv:
"bootstraps",
total_trees,
convergence,
aic,
aicc,
bic,
]
))

Expand Down Expand Up @@ -553,14 +562,22 @@ rule plot_support_values_across_missingness:
"../scripts/plot_bootstrap_support_values_across_missingness.R"


rule plot_support_values_vs_branch_lengths:
rule plot_support_values_and_branch_lengths:
input:
support_trees=expand(
"results/raxml_ng/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.support.raxml.support",
model=lookup("raxml_ng/models", within=config),
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config)
)
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
),
tsv=expand(
"results/raxml_ng/{{individual}}/results/{model}/max_{n_missing_cells}_missing/{{individual}}.{model}.max_{n_missing_cells}_missing.bootstraps.raxml.bsconverge.tsv",
model=lookup("raxml_ng/models", within=config),
n_missing_cells=lookup(dpath="raxml_ng/max_missing", within=config),
),
output:
support_hist="results/trees/{individual}.raxml.support.histogram.pdf",
branch_length_hist="results/trees/{individual}.raxml.branch_length.histogram.pdf",
branch_length_ecdf="results/trees/{individual}.raxml.branch_length.ecdf.pdf",
data_plot="results/trees/{individual}.raxml.support_vs_branch_length.full_data.pdf",
summary_plot="results/trees/{individual}.raxml.support_vs_branch_length.summary.pdf",
log:
Expand Down
119 changes: 101 additions & 18 deletions workflow/scripts/plot_bootstrap_support_values_vs_branch_lengths.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ best_trees_support_vs_branch_length <- snakemake@input[["support_trees"]] |>
node.label = "support"
) |>
as_tibble() |>
drop_na(
branch.length,
support
) |>
add_column(
max_missing = str_extract(
filename,
Expand All @@ -41,15 +37,100 @@ best_trees_support_vs_branch_length <- snakemake@input[["support_trees"]] |>
) |>
list_rbind()

usable_sites <- read_tsv(
snakemake@input[["tsv"]]
) |>
select(model, max_missing, usable_sites) |>
distinct()


number_of_models <- best_trees_support_vs_branch_length |> distinct(model) |> count() |> pull(n)
plot_height = number_of_models * 3
plot_width = 2 + plot_height

non_na_branch_lengths <- best_trees_support_vs_branch_length |>
drop_na(branch.length)

branch_length_hist <- ggplot(
data = non_na_branch_lengths
) +
geom_histogram(
aes(x = branch.length)
) +
facet_grid(
cols=vars(model),
rows=vars(max_missing)
)

ggsave(
filename = snakemake@output[["branch_length_hist"]],
plot = branch_length_hist,
width = plot_width,
height = plot_height * 0.75,
limitsize = FALSE
)

normalized_branch_lengths <- non_na_branch_lengths |>
left_join(
usable_sites,
by = join_by(
model,
max_missing
)
)

branch_length_ecdf <- ggplot(
data = normalized_branch_lengths,
aes(
x = branch.length
)
) +
stat_ecdf(geom = "point") +
facet_grid(
cols=vars(model),
rows=vars(max_missing)
)

ggsave(
filename = snakemake@output[["branch_length_ecdf"]],
plot = branch_length_ecdf,
width = plot_width,
height = plot_height * 0.75,
limitsize = FALSE
)



support_hist <- ggplot(
data = best_trees_support_vs_branch_length |>
drop_na(support)
) +
geom_histogram(
aes(x = support)
) +
facet_grid(
cols=vars(model),
rows=vars(max_missing)
)

ggsave(
filename = snakemake@output[["support_hist"]],
plot = support_hist,
width = plot_width,
height = plot_height * 0.75,
limitsize = FALSE
)


data_plot <- ggplot(
data = best_trees_support_vs_branch_length,
data = best_trees_support_vs_branch_length |>
drop_na(
branch.length,
support
),
mapping = aes(
x=branch.length,
y=support
x=support,
y=branch.length
)
) +
geom_hex(
Expand Down Expand Up @@ -81,7 +162,8 @@ ggsave(
filename = snakemake@output[["data_plot"]],
plot = data_plot,
width = plot_width,
height = plot_height
height = plot_height,
limitsize = FALSE
)


Expand All @@ -102,25 +184,25 @@ best_trees_2d_summary <- best_trees_support_vs_branch_length |>

summary_plot <- ggplot(
data = best_trees_2d_summary,
mapping = aes(
x=branch_length_summary_y,
y=support_summary_y
mapping = aes(
x=support_summary_y,
y=branch_length_summary_y
)
) +
geom_point(
color="red"
) +
geom_errorbarh(
geom_errorbar(
mapping = aes(
xmin=branch_length_summary_ymin,
xmax=branch_length_summary_ymax
ymin=branch_length_summary_ymin,
ymax=branch_length_summary_ymax
),
color="red"
) +
geom_errorbar(
geom_errorbarh(
mapping = aes(
ymin=support_summary_ymin,
ymax=support_summary_ymax
xmin=support_summary_ymin,
xmax=support_summary_ymax
),
color="red"
) +
Expand All @@ -144,5 +226,6 @@ ggsave(
filename = snakemake@output[["summary_plot"]],
plot = summary_plot,
width = plot_width,
height = plot_height
height = plot_height,
limitsize = FALSE
)
38 changes: 34 additions & 4 deletions workflow/scripts/plot_max_missing_topologies.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ rfs_and_topos <- read_tsv(
convergence,
relative_rf_distance,
absolute_rf_distance,
usable_sites
usable_sites,
aic_score,
aicc_score,
bic_score
),
names_to = "label",
values_to = "value"
Expand All @@ -26,6 +29,7 @@ rfs_and_topos <- read_tsv(
value_type = case_match(
label,
"usable_sites" ~ "usable sites",
c("aic_score", "aicc_score", "bic_score") ~ "information criteria scores",
c("total_trees", "unique_topologies", "convergence") ~ "number of trees",
# "relative_rf_distance" ~ "Robinson-Foulds distance, relative",
"absolute_rf_distance" ~ "Robinson-Foulds distance"
Expand All @@ -34,13 +38,21 @@ rfs_and_topos <- read_tsv(
value_type,
levels = c(
"usable sites",
"information criteria scores",
"Robinson-Foulds distance",
"number of trees"
)
),
`number of trees` = case_match(
label,
c("total_trees", "absolute_rf_distance", "usable_sites") ~ "total",
c(
"total_trees",
"absolute_rf_distance",
"usable_sites",
"aic_score",
"aicc_score",
"bic_score"
) ~ "total",
"unique_topologies" ~ "distinct topologies",
"convergence" ~ "bootstrapping converges"
),
Expand All @@ -49,6 +61,22 @@ rfs_and_topos <- read_tsv(
"startTree" ~ "start of ML search",
"mlTrees" ~ "result of ML search",
"bootstraps" ~ "bootstrapping"
),
`information criteria scores` = case_match(
label,
"aic_score" ~ "Akaike",
"aicc_score" ~ "Akaike corrected",
"bic_score" ~ "Bayesian",
.default = "non-score lines"
),
`information criteria scores` = factor(
`information criteria scores`,
levels = c(
"non-score lines",
"bic_score" ~ "Bayesian",
"aicc_score" ~ "Akaike corrected",
"aic_score" ~ "Akaike"
)
)
)

Expand All @@ -58,7 +86,8 @@ plot <- ggplot(
x = max_missing,
y = value,
color = `number of trees`,
shape = `trees from`
shape = `trees from`,
linetype = `information criteria scores`
)
) +
scale_color_brewer(
Expand All @@ -80,5 +109,6 @@ plot_width = 3 + number_of_models * 3.5
ggsave(
snakemake@output[["plot"]],
plot = plot,
width = plot_width
width = plot_width,
limitsize = FALSE
)

0 comments on commit 7f111f6

Please sign in to comment.