Skip to content

Commit

Permalink
Write metadata TSV file when clustering
Browse files Browse the repository at this point in the history
The metadata file records per-cluster information (radius, peak valley ratio, kind, total bp
and number of contigs), which can help users judge which clusters, and hence which contigs, might be good or bad.
  • Loading branch information
jakobnissen committed Nov 14, 2023
1 parent 374e912 commit ad42dbf
Showing 1 changed file with 25 additions and 12 deletions.
37 changes: 25 additions & 12 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pycoverm
import itertools
from math import isfinite
from typing import Optional, Tuple, Union
from typing import Optional, Tuple, Union, cast
from pathlib import Path
from collections.abc import Sequence
from collections import defaultdict
Expand Down Expand Up @@ -757,17 +757,30 @@ def cluster_and_write_files(
cuda=vamb_options.cuda,
rng_seed=vamb_options.seed,
)

renamed = (
(str(cluster_index + 1), {sequence_names[i] for i in members})
for (cluster_index, (_, members)) in enumerate(
map(lambda x: x.as_tuple(), cluster_generator)
)
)

# This also works correctly when max_clusters is None
first_clusters = itertools.islice(renamed, cluster_options.max_clusters)
unsplit_clusters = dict(first_clusters)
clusters = itertools.islice(cluster_generator, cluster_options.max_clusters)
cluster_dict: dict[str, set[str]] = dict()

# Write the cluster metadata to file
with open(Path(base_clusters_name + "_metadata.tsv"), "w") as file:
print("name\tradius\tpeak valley ratio\tkind\tbp\tncontigs", file=file)
for i, cluster in enumerate(clusters):
cluster_dict[str(i + 1)] = {
sequence_names[cast(int, i)] for i in cluster.members
}
print(
str(i + 1),
None if cluster.radius is None else round(cluster.radius, 3),
None
if cluster.observed_pvr is None
else round(cluster.observed_pvr, 2),
cluster.kind_str,
sum(sequence_lens[i] for i in cluster.members),
len(cluster.members),
file=file,
sep="\t",
)

elapsed = round(time.time() - begintime, 2)

write_clusters_and_bins(
Expand All @@ -776,7 +789,7 @@ def cluster_and_write_files(
base_clusters_name,
bins_dir,
fasta_catalogue,
unsplit_clusters,
cluster_dict,
sequence_names,
sequence_lens,
)
Expand Down

0 comments on commit ad42dbf

Please sign in to comment.