Skip to content

Commit

Permalink
fixed bugs and added scale to tree
Browse files Browse the repository at this point in the history
  • Loading branch information
DOH-JDJ0303 committed Mar 29, 2024
1 parent 96bef12 commit 6c37a56
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 26 deletions.
38 changes: 33 additions & 5 deletions bin/cluster.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,44 @@
#!/usr/bin/env Rscript

library(tidyverse)
library(ggtree)
#!/usr/bin/env Rscript
version <- "1.0"

# cluster.R
# Author: Jared Johnson, [email protected]

#---- ARGUMENTS ----#
args <- commandArgs(trailingOnly = T)
dist_path <- args[1]
taxa_name <- args[2]
segment_name <- args[3]
threshold <- args[4]

file.name <- paste(taxa_name,segment_name,sep="-")
#---- VERSION ----#
if(dist_path == "version"){
cat(version, sep = "\n")
quit(status=0)
}

#---- LIBRARY ----#
library(tidyverse)
library(ggtree)
library(ape)

# set output file name
file.name <- paste(taxa_name,segment_name,sep="-")

#---- LOAD PAIRWISE DISTANCES ----#
dist.mat <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST","PVAL","HASHES")) %>%
select(ID1, ID2, DIST) %>%
pivot_wider(names_from="ID2", values_from="DIST") %>%
column_to_rownames(var="ID1") %>%
as.matrix() %>%
as.dist()

#---- HIERARCHICAL CLUSTER ----#
tree <- hclust(dist.mat, method = "complete")

#---- CUT DENDROGRAM AT DISTANCE THRESHOLD ----#
clusters <- cutree(tree, h = as.numeric(threshold)) %>%
data.frame() %>%
rownames_to_column(var = "seq") %>%
Expand All @@ -29,14 +47,23 @@ clusters <- cutree(tree, h = as.numeric(threshold)) %>%
segment = segment_name) %>%
select(seq, taxa, segment, cluster)

write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F)
# adjust height to percentage for more intuitive interpretation
tree$height <- 100*tree$height

# plot tree
p <- ggtree(tree)%<+%clusters+
geom_tippoint(aes(color = as.character(cluster)))+
labs(color = "Cluster")
geom_vline(xintercept = -100*as.numeric(threshold), linetype = "dashed", color = "#E35335")+
theme_tree2()+
labs(color = "Cluster", x = "Approx. Nucleotide Difference (%)")

#---- SAVE OUTPUTS ----#
# cluster info
write.csv(clusters, file = paste0(file.name,'.csv'), quote = F, row.names = F)
n_iso <- p$data %>%
drop_na() %>%
nrow()

# set image dimensions
wdth <- n_iso/100
if(wdth < 200){
Expand All @@ -46,4 +73,5 @@ hght=n_iso/100
if(hght < 200){
hght <- 10
}
# plot
ggsave(plot = p, filename = paste0(file.name,'.jpg'), dpi = 300, width = wdth, height = hght, limitsize=F)
21 changes: 17 additions & 4 deletions modules/local/cluster.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ process CLUSTER {
tuple val(taxa), val(segment), path(dist), val(seq_count)

output:
path "*.csv", emit: results
path "*.jpg", emit: plot
path "*.csv", emit: results
path "*.jpg", emit: plot
path "versions.yml", emit: versions


when:
task.ext.when == null || task.ext.when
Expand All @@ -20,6 +22,11 @@ process CLUSTER {
gzip -d ${dist}
# run script
cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cluster: \$(cluster.R version)
END_VERSIONS
"""
}

Expand All @@ -32,8 +39,9 @@ process CLUSTER_LARGE {
tuple val(taxa), val(segment), path(dist), val(seq_count)

output:
path "*.csv", emit: results
path "*.jpg", emit: plot
path "*.csv", emit: results
path "*.jpg", emit: plot
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -44,5 +52,10 @@ process CLUSTER_LARGE {
gzip -d ${dist}
# run script
cluster.R *.txt "${taxa}" "${segment}" ${params.dist_threshold}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cluster: \$(cluster.R version)
END_VERSIONS
"""
}
9 changes: 5 additions & 4 deletions modules/local/consensus.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ process CONSENSUS {

output:
tuple val(taxa), val(segment), val(cluster), path("${prefix}.fa"), env(length), emit: fa
path "${prefix}_length.csv", emit: len
path "${prefix}_length.csv", emit: len
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -24,8 +25,8 @@ process CONSENSUS {
echo "${prefix},\${length}" > ${prefix}_length.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
consensus: \$(consensus.sh version)
END_VERSIONS
"${task.process}":
consensus: \$(consensus.sh version)
END_VERSIONS
"""
}
6 changes: 2 additions & 4 deletions modules/local/input-qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ process INPUT_QC {
# set sequence count
seq_count=\$(cat ${prefix}-qc-summary.csv | cut -f 5 -d ',' | grep -v 'filter4' | tr -d '\t\r\n ')
cat <<-END_VERSIONS > versions.yml
"${task.process}":
input-qc: \$(input-qc.sh version)
END_VERSIONS
# something about the normal way of getting version info messes with the creations of .command.env
echo "${task.process}:\n\tinput-qc: \$(input-qc.sh version | tr -d '\t\r\')" > versions.yml
"""
}
12 changes: 7 additions & 5 deletions modules/local/summary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ process SUMMARY {


output:
path "*.csv", emit: summary
path "*.jpg", emit: plots
path "*.csv", emit: summary
path "*.jpg", emit: plots
path "versions.yml", emit: versions


when:
task.ext.when == null || task.ext.when
Expand All @@ -25,8 +27,8 @@ process SUMMARY {
summary.R clusters-no-header.csv ${lengths} ${ani_ava} ${ani_seeds} ${seeds}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
summary: \$(summary.R version)
END_VERSIONS
"${task.process}":
summary: \$(summary.R version)
END_VERSIONS
"""
}
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ params {
// Input options
input = null
seeds = null
dist_threshold = 0.2
len_threshold = 0.2
dist_threshold = 0.15
len_threshold = 0.15

// References
genome = null
Expand Down
4 changes: 2 additions & 2 deletions workflows/epitome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ workflow EPITOME {
CLUSTER (
MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() <= 2000 }
)
ch_versions = ch_versions.mix(CLUSTERS.out.versions.first())
ch_versions = ch_versions.mix(CLUSTER.out.versions.first())

// Large datasets - requires much more memory!
CLUSTER_LARGE (
MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() > 2000 }
)
ch_versions = ch_versions.mix(CLUSTERS_LARGE.out.versions.first())
ch_versions = ch_versions.mix(CLUSTER_LARGE.out.versions.first())

// Combine small and large dataset cluster results and add clean sequence paths
CLUSTER
Expand Down

0 comments on commit 6c37a56

Please sign in to comment.