Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
theosanderson committed Oct 13, 2021
1 parent b8116f3 commit d9b40b8
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 30 deletions.
19 changes: 19 additions & 0 deletions data_processing/compare_trees.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from Bio import Phylo

tree1_path = "new2.nwk"
tree2_path = "done_bp.newick"
print("B")
tree1 = Phylo.read(tree1_path, "newick")
print("B")
tree2 = Phylo.read(tree2_path, "newick")
print("B")
tree1_preorder = tree1.find_clades()
tree2_preorder = tree2.find_clades()
print("B")
for node1 in tree1_preorder:
node2 = next(tree2_preorder)
if node1.name != node2.name:
# print(node1.name, node2.name)
pass
if node1.branch_length != node2.branch_length and node2.branch_length is not None:
print(node1.name, node2.name, node1.branch_length, node2.branch_length)
114 changes: 114 additions & 0 deletions data_processing/get_tree_from_proto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import dendropy
import parsimony_pb2, tqdm

from Bio import Phylo


class UsherMutationAnnotatedTree:
def __init__(self, tree_file):
self.data = parsimony_pb2.data()
self.data.ParseFromString(tree_file.read())
self.condensed_nodes_dict = self.get_condensed_nodes_dict(
self.data.condensed_nodes)

self.tree = dendropy.Tree.get(data=self.data.newick, schema="newick")
self.annotate_mutations()
self.set_branch_lengths()
self.expand_condensed_nodes()

def annotate_mutations(self):
for i, node in enumerate(self.tree.preorder_node_iter()):
node.nuc_mutations = self.data.node_mutations[i]

def expand_condensed_nodes(self):
for i, node in tqdm.tqdm(enumerate(self.tree.leaf_nodes()),
desc="Expanding condensed nodes"):

if node.taxon and node.taxon.label in self.condensed_nodes_dict:
assert node.edge_length == 0
for new_node_label in self.condensed_nodes_dict[
node.taxon.label]:
new_node = dendropy.Node(
taxon=dendropy.Taxon(new_node_label))

node.parent_node.add_child(new_node)
node.parent_node.remove_child(node)

def get_condensed_nodes_dict(self, condensed_nodes_dict):
output_dict = {}
for condensed_node in tqdm.tqdm(condensed_nodes_dict,
desc="Reading condensed nodes dict"):
output_dict[condensed_node.node_name.replace(
"_", " ")] = condensed_node.condensed_leaves
return output_dict

def set_branch_lengths(self):
for i, node in enumerate(self.tree.preorder_node_iter()):
node.edge_length = len(node.nuc_mutations.mutation)


import io


def get_parent(tree, child_clade):
node_path = tree.get_path(child_clade)
return node_path[-2]


# The same but using BioPython
class UsherMutationAnnotatedTreeBioPython:
def __init__(self, tree_file):
self.data = parsimony_pb2.data()
self.data.ParseFromString(tree_file.read())
self.condensed_nodes_dict = self.get_condensed_nodes_dict(
self.data.condensed_nodes)
self.tree = Phylo.read(io.StringIO(self.data.newick), "newick")
#print("aa", self.condensed_nodes_dict)
self.annotate_mutations()
self.set_branch_lengths()
self.name_nodes()
self.expand_condensed_nodes()

def annotate_mutations(self):
for i, node in enumerate(self.tree.find_clades()):
node.nuc_mutations = self.data.node_mutations[i]

def name_nodes(self):
for i, node in enumerate(self.tree.find_clades()):
if not node.name:
node.name = f"node_{i}"

def expand_condensed_nodes(self):
for i, parent in tqdm.tqdm(enumerate(self.tree.find_clades()),
desc="Expanding condensed nodes"):
for node in parent.clades:
if node.name in self.condensed_nodes_dict:
assert node.branch_length == 0
for new_node_label in self.condensed_nodes_dict[node.name]:
new_node = Phylo.BaseTree.Clade(name=new_node_label)
parent.clades.append(new_node)
parent.clades.remove(node)
else:
# print(node.name)
pass

def get_condensed_nodes_dict(self, condensed_nodes_dict):
output_dict = {}
for condensed_node in tqdm.tqdm(condensed_nodes_dict,
desc="Reading condensed nodes dict"):
output_dict[
condensed_node.node_name] = condensed_node.condensed_leaves
return output_dict

def set_branch_lengths(self):
for i, node in enumerate(self.tree.find_clades()):
node.branch_length = len(node.nuc_mutations.mutation)


# f = open("./public-2021-09-15.all.masked.pb", "rb")
# mat = UsherMutationAnnotatedTree(f)
# mat.tree.write(path="./done.newick", schema="newick")

f = open("./public-2021-09-15.all.masked.pb", "rb")
mat = UsherMutationAnnotatedTreeBioPython(f)
Phylo.write(mat.tree, "./done_bp.newick", "newick")
49 changes: 35 additions & 14 deletions data_processing/usher_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ def get_aa_ref(pos):
return None


def get_nuc_ref(pos):

return f"nuc:X_{pos}_{cov2_genome.seq[pos-1]}"
return None


def get_aa_sub(pos, par, alt):

for gene_name, gene_range in cov2_genome.genes.items():
Expand Down Expand Up @@ -103,7 +109,7 @@ def __init__(self, tree_file):
self.annotate_mutations()
self.set_branch_lengths()
self.annotate_aa_mutations()
#self.annotate_nuc_mutations()
self.annotate_nuc_mutations()
self.expand_condensed_nodes()

def annotate_mutations(self):
Expand Down Expand Up @@ -134,7 +140,7 @@ def annotate_nuc_mutations(self):
ref = NUC_ENUM[mut.ref_nuc]
alt = NUC_ENUM[mut.mut_nuc[0]]
par = NUC_ENUM[mut.par_nuc]
aa_style_sub = f"{par}{mut.position}{alt}"
aa_style_sub = f"nuc:{par}_{mut.position}_{alt}"
node.aa_subs.append(aa_style_sub)

def expand_condensed_nodes(self):
Expand Down Expand Up @@ -162,23 +168,32 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict):


# In[19]:
input_file = "public-latest.all.masked.pb.gz"

f = open("./public-latest.all.masked.pb", "rb")
if input_file.endswith(".gz"):
f = gzip.open(input_file, "rb")
else:
f = open(input_file, "rb")

mat = UsherMutationAnnotatedTree(f)
print("Ladderizing tree")
mat.tree.ladderize()
print("Writing distance tree")

#Name any unnamed nodes:
for i, node in enumerate(mat.tree.preorder_node_iter()):
if not node.taxon:
node.taxon = dendropy.Taxon(f"temporaryname__{i}")
mat.tree.write(path="./distance.nwk",
schema="newick",
unquoted_underscores=True)

print("Launching chronumental")
import os

os.system(
"chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 200"
)
# os.system(
# "chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 4000"
# )

print("Reading time tree")
time_tree = dendropy.Tree.get(path="./timetree__distance.nwk", schema="newick")
Expand All @@ -187,11 +202,16 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict):
desc="Adding time tree"):
time_tree_node = next(time_tree_iter)
node.time = time_tree_node.edge_length
if node.taxon.label.startswith("temporaryname__"):
node.taxon.label = ""
del time_tree
del time_tree_iter

all_ref_muts = set(get_aa_ref(x) for x in range(len(cov2_genome.seq)))
all_ref_muts = [x for x in all_ref_muts if x is not None]

all_ref_nuc_muts = set(get_nuc_ref(x) for x in range(len(cov2_genome.seq)))
all_ref_muts.extend(all_ref_nuc_muts)
mat.tree.seed_node.aa_subs = all_ref_muts

# In[14]:
Expand Down Expand Up @@ -299,13 +319,13 @@ def add_paths(tree_by_level):

print("B")

alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False)
#alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False)

alt_genbank_lookup = {}
for i, row in tqdm.tqdm(alt_metadata.iterrows()):
# for i, row in tqdm.tqdm(alt_metadata.iterrows()):

name = row['strain'] #.split("|")[0]
alt_genbank_lookup[name] = str(row['genbank_accession'])
# name = row['strain'] #.split("|")[0]
# alt_genbank_lookup[name] = str(row['genbank_accession'])


def make_mapping(list_of_strings):
Expand Down Expand Up @@ -347,11 +367,11 @@ def make_mapping(list_of_strings):
print("C")

del metadata
del alt_metadata
#del alt_metadata

for i, x in tqdm.tqdm(enumerate(all_nodes)):
xes.append(x.x * 0.2)
time_xes.append(x.x * 0.02)
time_xes.append(x.x_time * 0.02)
yes.append(x.y / 40000)
path_list_rev = x.path_list[::-1]
if len(path_list_rev) > 0:
Expand Down Expand Up @@ -404,7 +424,7 @@ def make_mapping(list_of_strings):
all_node_data = taxonium_pb2.AllNodeData(
genbanks=genbanks,
names=names,
x=time_xes,
x=xes,
time_x=time_xes,
y=yes,
dates=dates,
Expand All @@ -421,6 +441,7 @@ def make_mapping(list_of_strings):
mutation_mapping=mutation_mapping_list,
date_mapping=date_mapping_list)

f = gzip.open("../public/nodelist.pb.gz", "wb")
#f = gzip.open("../public/nodelist.pb.gz", "wb")
f = open("/mnt/d/timetree.pb", "wb")
f.write(all_data.SerializeToString())
f.close()
Binary file modified public/nodelist.pb.gz
Binary file not shown.
1 change: 1 addition & 0 deletions public/taxonium.proto
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ repeated int32 epi_isl_numbers = 10; // Deprecated: DO NOT USE
repeated int32 num_tips = 11;
repeated MetadataSingleValuePerNode metadata_singles = 12;
repeated MetadataUniqueStringPerNode metadata_unique_strings=13;
repeated float time_x = 14;


}
Expand Down
2 changes: 2 additions & 0 deletions src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ function App() {
}
const [query, setQuery] = useQueryAsState({
blinking:"false",
xAccessor: "x",
search: JSON.stringify([
{
id: 0.123,

category: "lineage",
value: "",
enabled: true,
Expand Down
23 changes: 12 additions & 11 deletions src/Deck.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) {
const filtered = nodeIds.filter((node) => {
if (line_mode) {
if (
(Math.abs(node_data.x[node] - node_data.x[node_data.parents[node]]) >
(Math.abs(node_data.x_set[node] - node_data.x_set[node_data.parents[node]]) >
1) |
(Math.abs(node_data.y[node] - node_data.y[node_data.parents[node]]) >
0.5)
Expand All @@ -74,7 +74,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) {
}
}

const rounded_x = Math.round(node_data.x[node] * precision) / precision;
const rounded_x = Math.round(node_data.x_set[node] * precision) / precision;
const rounded_y = Math.round(node_data.y[node] * precision) / precision;
if (included_points[rounded_x]) {
if (included_points[rounded_x][rounded_y]) {
Expand Down Expand Up @@ -453,6 +453,7 @@ function Deck({
}, [node_data, data, colourBy]);

const coarseScatterIds = useMemo(() => {
console.log("calcing coarse scatter")
return reduceOverPlotting(scatterIds, node_data, 100, false);
}, [scatterIds, node_data]);

Expand Down Expand Up @@ -487,7 +488,7 @@ function Deck({

onHover: (info) => setHoverInfo(info),
getPosition: (d) => {
return [node_data.x[d], node_data.y[d]];
return [node_data.x_set[d], node_data.y[d]];
},
updateTriggers: {
getFillColor: scatterFillFunction,
Expand Down Expand Up @@ -532,7 +533,7 @@ function Deck({

getLineColor: [0, 0, 0],
getPosition: (d) => {
return [node_data.x[d], node_data.y[d]];
return [node_data.x_set[d], node_data.y[d]];
},
lineWidthUnits: "pixels",
lineWidthScale: 2,
Expand Down Expand Up @@ -592,10 +593,10 @@ function Deck({
pickable: true,
onHover: (info) => setHoverInfo(info),
getTargetPosition: (d) => [
node_data.x[node_data.parents[d]],
node_data.x_set[node_data.parents[d]],
node_data.y[d],
],
getSourcePosition: (d) => [node_data.x[d], node_data.y[d]],
getSourcePosition: (d) => [node_data.x_set[d], node_data.y[d]],
getColor: [150, 150, 150],
}),
[node_data]
Expand All @@ -608,11 +609,11 @@ function Deck({
pickable: false,
getWidth: 1,
getTargetPosition: (d) => [
node_data.x[node_data.parents[d]],
node_data.x_set[node_data.parents[d]],
node_data.y[node_data.parents[d]],
],
getSourcePosition: (d) => [
node_data.x[node_data.parents[d]],
node_data.x_set[node_data.parents[d]],
node_data.y[d],
],
getColor: [150, 150, 150],
Expand Down Expand Up @@ -722,7 +723,7 @@ function Deck({
() => ({
id: "main-text-layer",
data: textInfo.ids,
getPosition: (d) => [node_data.x[d] + 0.3, node_data.y[d]],
getPosition: (d) => [node_data.x_set[d] + 0.3, node_data.y[d]],
getText: (d) => node_data.names[d],
getColor: [180, 180, 180],
getAngle: 0,
Expand All @@ -738,7 +739,7 @@ function Deck({
() => ({
id: "main-text-muts-layer",
data: mutTextIds.filter(() => true),
getPosition: (d) => [node_data.x[d], node_data.y[d]],
getPosition: (d) => [node_data.x_set[d], node_data.y[d]],
getText: (d) =>
node_data.mutations[d].mutation
? node_data.mutations[d].mutation
Expand All @@ -761,7 +762,7 @@ function Deck({
data.mutation_mapping,
mutTextIds,
node_data.mutations,
node_data.x,
node_data.x_set,
node_data.y,
]
);
Expand Down
Loading

0 comments on commit d9b40b8

Please sign in to comment.