diff --git a/data_processing/compare_trees.py b/data_processing/compare_trees.py new file mode 100644 index 00000000..8045d463 --- /dev/null +++ b/data_processing/compare_trees.py @@ -0,0 +1,19 @@ +from Bio import Phylo + +tree1_path = "new2.nwk" +tree2_path = "done_bp.newick" +print("B") +tree1 = Phylo.read(tree1_path, "newick") +print("B") +tree2 = Phylo.read(tree2_path, "newick") +print("B") +tree1_preorder = tree1.find_clades() +tree2_preorder = tree2.find_clades() +print("B") +for node1 in tree1_preorder: + node2 = next(tree2_preorder) + if node1.name != node2.name: + # print(node1.name, node2.name) + pass + if node1.branch_length != node2.branch_length and node2.branch_length is not None: + print(node1.name, node2.name, node1.branch_length, node2.branch_length) diff --git a/data_processing/get_tree_from_proto.py b/data_processing/get_tree_from_proto.py new file mode 100644 index 00000000..88d7d264 --- /dev/null +++ b/data_processing/get_tree_from_proto.py @@ -0,0 +1,114 @@ +import dendropy +import parsimony_pb2, tqdm + +from Bio import Phylo + + +class UsherMutationAnnotatedTree: + def __init__(self, tree_file): + self.data = parsimony_pb2.data() + self.data.ParseFromString(tree_file.read()) + self.condensed_nodes_dict = self.get_condensed_nodes_dict( + self.data.condensed_nodes) + + self.tree = dendropy.Tree.get(data=self.data.newick, schema="newick") + self.annotate_mutations() + self.set_branch_lengths() + self.expand_condensed_nodes() + + def annotate_mutations(self): + for i, node in enumerate(self.tree.preorder_node_iter()): + node.nuc_mutations = self.data.node_mutations[i] + + def expand_condensed_nodes(self): + for i, node in tqdm.tqdm(enumerate(self.tree.leaf_nodes()), + desc="Expanding condensed nodes"): + + if node.taxon and node.taxon.label in self.condensed_nodes_dict: + assert node.edge_length == 0 + for new_node_label in self.condensed_nodes_dict[ + node.taxon.label]: + new_node = dendropy.Node( + taxon=dendropy.Taxon(new_node_label)) + + node.parent_node.add_child(new_node) + node.parent_node.remove_child(node) + + def get_condensed_nodes_dict(self, condensed_nodes_dict): + output_dict = {} + for condensed_node in tqdm.tqdm(condensed_nodes_dict, + desc="Reading condensed nodes dict"): + output_dict[condensed_node.node_name.replace( + "_", " ")] = condensed_node.condensed_leaves + return output_dict + + def set_branch_lengths(self): + for i, node in enumerate(self.tree.preorder_node_iter()): + node.edge_length = len(node.nuc_mutations.mutation) + + +import io + + +def get_parent(tree, child_clade): + node_path = tree.get_path(child_clade) + return node_path[-2] + + +# The same but using BioPython +class UsherMutationAnnotatedTreeBioPython: + def __init__(self, tree_file): + self.data = parsimony_pb2.data() + self.data.ParseFromString(tree_file.read()) + self.condensed_nodes_dict = self.get_condensed_nodes_dict( + self.data.condensed_nodes) + self.tree = Phylo.read(io.StringIO(self.data.newick), "newick") + #print("aa", self.condensed_nodes_dict) + self.annotate_mutations() + self.set_branch_lengths() + self.name_nodes() + self.expand_condensed_nodes() + + def annotate_mutations(self): + for i, node in enumerate(self.tree.find_clades()): + node.nuc_mutations = self.data.node_mutations[i] + + def name_nodes(self): + for i, node in enumerate(self.tree.find_clades()): + if not node.name: + node.name = f"node_{i}" + + def expand_condensed_nodes(self): + for i, parent in tqdm.tqdm(enumerate(self.tree.find_clades()), + desc="Expanding condensed nodes"): + for node in parent.clades: + if node.name in self.condensed_nodes_dict: + assert node.branch_length == 0 + for new_node_label in self.condensed_nodes_dict[node.name]: + new_node = Phylo.BaseTree.Clade(name=new_node_label) + parent.clades.append(new_node) + parent.clades.remove(node) + else: + # print(node.name) + pass + + def get_condensed_nodes_dict(self, condensed_nodes_dict): + output_dict = {} + for condensed_node in tqdm.tqdm(condensed_nodes_dict, + desc="Reading condensed nodes dict"): + output_dict[ + condensed_node.node_name] = condensed_node.condensed_leaves + return output_dict + + def set_branch_lengths(self): + for i, node in enumerate(self.tree.find_clades()): + node.branch_length = len(node.nuc_mutations.mutation) + + +# f = open("./public-2021-09-15.all.masked.pb", "rb") +# mat = UsherMutationAnnotatedTree(f) +# mat.tree.write(path="./done.newick", schema="newick") + +f = open("./public-2021-09-15.all.masked.pb", "rb") +mat = UsherMutationAnnotatedTreeBioPython(f) +Phylo.write(mat.tree, "./done_bp.newick", "newick") diff --git a/data_processing/usher_processing.py b/data_processing/usher_processing.py index 2dced7ae..c6934b59 100644 --- a/data_processing/usher_processing.py +++ b/data_processing/usher_processing.py @@ -64,6 +64,12 @@ def get_aa_ref(pos): return None +def get_nuc_ref(pos): + + return f"nuc:X_{pos}_{cov2_genome.seq[pos-1]}" + return None + + def get_aa_sub(pos, par, alt): for gene_name, gene_range in cov2_genome.genes.items(): @@ -103,7 +109,7 @@ def __init__(self, tree_file): self.annotate_mutations() self.set_branch_lengths() self.annotate_aa_mutations() - #self.annotate_nuc_mutations() + self.annotate_nuc_mutations() self.expand_condensed_nodes() def annotate_mutations(self): @@ -134,7 +140,7 @@ def annotate_nuc_mutations(self): ref = NUC_ENUM[mut.ref_nuc] alt = NUC_ENUM[mut.mut_nuc[0]] par = NUC_ENUM[mut.par_nuc] - aa_style_sub = f"{par}{mut.position}{alt}" + aa_style_sub = f"nuc:{par}_{mut.position}_{alt}" node.aa_subs.append(aa_style_sub) def expand_condensed_nodes(self): @@ -162,13 +168,22 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict): # In[19]: +input_file = "public-latest.all.masked.pb.gz" -f = open("./public-latest.all.masked.pb", "rb") +if input_file.endswith(".gz"): + f = gzip.open(input_file, "rb") +else: + f = open(input_file, "rb") mat = UsherMutationAnnotatedTree(f) print("Ladderizing tree") mat.tree.ladderize() print("Writing distance tree") + +#Name any unnamed nodes: +for i, node in enumerate(mat.tree.preorder_node_iter()): + if not node.taxon: + node.taxon = dendropy.Taxon(f"temporaryname__{i}") mat.tree.write(path="./distance.nwk", schema="newick", unquoted_underscores=True) @@ -176,9 +191,9 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict): print("Launching chronumental") import os -os.system( - "chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 200" -) +# os.system( +# "chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 4000" +# ) print("Reading time tree") time_tree = dendropy.Tree.get(path="./timetree__distance.nwk", schema="newick") @@ -187,11 +202,16 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict): desc="Adding time tree"): time_tree_node = next(time_tree_iter) node.time = time_tree_node.edge_length + if node.taxon.label.startswith("temporaryname__"): + node.taxon.label = "" del time_tree del time_tree_iter all_ref_muts = set(get_aa_ref(x) for x in range(len(cov2_genome.seq))) all_ref_muts = [x for x in all_ref_muts if x is not None] + +all_ref_nuc_muts = set(get_nuc_ref(x) for x in range(len(cov2_genome.seq))) +all_ref_muts.extend(all_ref_nuc_muts) mat.tree.seed_node.aa_subs = all_ref_muts # In[14]: @@ -299,13 +319,13 @@ def add_paths(tree_by_level): print("B") -alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False) +#alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False) alt_genbank_lookup = {} -for i, row in tqdm.tqdm(alt_metadata.iterrows()): +# for i, row in tqdm.tqdm(alt_metadata.iterrows()): - name = row['strain'] #.split("|")[0] - alt_genbank_lookup[name] = str(row['genbank_accession']) +# name = row['strain'] #.split("|")[0] +# alt_genbank_lookup[name] = str(row['genbank_accession']) def make_mapping(list_of_strings): @@ -347,11 +367,11 @@ def make_mapping(list_of_strings): print("C") del metadata -del alt_metadata +#del alt_metadata for i, x in tqdm.tqdm(enumerate(all_nodes)): xes.append(x.x * 0.2) - time_xes.append(x.x * 0.02) + time_xes.append(x.x_time * 0.02) yes.append(x.y / 40000) path_list_rev = x.path_list[::-1] if len(path_list_rev) > 0: @@ -404,7 +424,7 @@ def make_mapping(list_of_strings): all_node_data = taxonium_pb2.AllNodeData( genbanks=genbanks, names=names, - x=time_xes, + x=xes, time_x=time_xes, y=yes, dates=dates, @@ -421,6 +441,7 @@ def make_mapping(list_of_strings): mutation_mapping=mutation_mapping_list, date_mapping=date_mapping_list) -f = gzip.open("../public/nodelist.pb.gz", "wb") +#f = gzip.open("../public/nodelist.pb.gz", "wb") +f = open("/mnt/d/timetree.pb", "wb") f.write(all_data.SerializeToString()) f.close() diff --git a/public/nodelist.pb.gz b/public/nodelist.pb.gz index 5fe85a87..a52698a5 100644 Binary files a/public/nodelist.pb.gz and b/public/nodelist.pb.gz differ diff --git a/public/taxonium.proto b/public/taxonium.proto index d45b493e..6931c3de 100644 --- a/public/taxonium.proto +++ b/public/taxonium.proto @@ -33,6 +33,7 @@ repeated int32 epi_isl_numbers = 10; // Deprecated: DO NOT USE repeated int32 num_tips = 11; repeated MetadataSingleValuePerNode metadata_singles = 12; repeated MetadataUniqueStringPerNode metadata_unique_strings=13; +repeated float time_x = 14; } diff --git a/src/App.jsx b/src/App.jsx index 9f16ac23..4adaeedd 100644 --- a/src/App.jsx +++ b/src/App.jsx @@ -20,9 +20,11 @@ function App() { } const [query, setQuery] = useQueryAsState({ blinking:"false", + xAccessor: "x", search: JSON.stringify([ { id: 0.123, + category: "lineage", value: "", enabled: true, diff --git a/src/Deck.jsx b/src/Deck.jsx index cb71308d..c16d418d 100644 --- a/src/Deck.jsx +++ b/src/Deck.jsx @@ -65,7 +65,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) { const filtered = nodeIds.filter((node) => { if (line_mode) { if ( - (Math.abs(node_data.x[node] - node_data.x[node_data.parents[node]]) > + (Math.abs(node_data.x_set[node] - node_data.x_set[node_data.parents[node]]) > 1) | (Math.abs(node_data.y[node] - node_data.y[node_data.parents[node]]) > 0.5) @@ -74,7 +74,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) { } } - const rounded_x = Math.round(node_data.x[node] * precision) / precision; + const rounded_x = Math.round(node_data.x_set[node] * precision) / precision; const rounded_y = Math.round(node_data.y[node] * precision) / precision; if (included_points[rounded_x]) { if (included_points[rounded_x][rounded_y]) { @@ -453,6 +453,7 @@ function Deck({ }, [node_data, data, colourBy]); const coarseScatterIds = useMemo(() => { + console.log("calcing coarse scatter") return reduceOverPlotting(scatterIds, node_data, 100, false); }, [scatterIds, node_data]); @@ -487,7 +488,7 @@ function Deck({ onHover: (info) => setHoverInfo(info), getPosition: (d) => { - return [node_data.x[d], node_data.y[d]]; + return [node_data.x_set[d], node_data.y[d]]; }, updateTriggers: { getFillColor: scatterFillFunction, @@ -532,7 +533,7 @@ function Deck({ getLineColor: [0, 0, 0], getPosition: (d) => { - return [node_data.x[d], node_data.y[d]]; + return [node_data.x_set[d], node_data.y[d]]; }, lineWidthUnits: "pixels", lineWidthScale: 2, @@ -592,10 +593,10 @@ function Deck({ pickable: true, onHover: (info) => setHoverInfo(info), getTargetPosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[d], ], - getSourcePosition: (d) => [node_data.x[d], node_data.y[d]], + getSourcePosition: (d) => [node_data.x_set[d], node_data.y[d]], getColor: [150, 150, 150], }), [node_data] @@ -608,11 +609,11 @@ function Deck({ pickable: false, getWidth: 1, getTargetPosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[node_data.parents[d]], ], getSourcePosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[d], ], getColor: [150, 150, 150], @@ -722,7 +723,7 @@ function Deck({ () => ({ id: "main-text-layer", data: textInfo.ids, - getPosition: (d) => [node_data.x[d] + 0.3, node_data.y[d]], + getPosition: (d) => [node_data.x_set[d] + 0.3, node_data.y[d]], getText: (d) => node_data.names[d], getColor: [180, 180, 180], getAngle: 0, @@ -738,7 +739,7 @@ function Deck({ () => ({ id: "main-text-muts-layer", data: mutTextIds.filter(() => true), - getPosition: (d) => [node_data.x[d], node_data.y[d]], + getPosition: (d) => [node_data.x_set[d], node_data.y[d]], getText: (d) => node_data.mutations[d].mutation ? node_data.mutations[d].mutation @@ -761,7 +762,7 @@ function Deck({ data.mutation_mapping, mutTextIds, node_data.mutations, - node_data.x, + node_data.x_set, node_data.y, ] ); diff --git a/src/Taxonium.jsx b/src/Taxonium.jsx index 04c58837..9186f79d 100644 --- a/src/Taxonium.jsx +++ b/src/Taxonium.jsx @@ -61,6 +61,13 @@ function Taxonium({protoUrl,uploadedData, query,setQuery}) { [setQuery, query] ); + const setXaccessor = useCallback( + (acc) => { + setQuery({ ...query, xAccessor: acc }); + }, + [setQuery, query] + ); + const setBlinkingEnabled = useCallback( (enabled) => { setQuery({ ...query, blinking:enabled? "true":"false" }); @@ -173,7 +180,7 @@ function getRawfile(protoUrl, uploadedData) { - result.node_data.ids = [...Array(result.node_data.x.length).keys()]; + result.node_data.ids = [...Array(result.node_data.y.length).keys()]; const all_genes = new Set(); result.mutation_mapping = result.mutation_mapping.map((x, i) => { @@ -200,9 +207,22 @@ function getRawfile(protoUrl, uploadedData) { }, [nodeData.status, query.protoUrl, uploadedData]); const data = useMemo( - () => - nodeData.status === "loaded" ? nodeData.data : { node_data: { ids: [] } }, - [nodeData] + () =>{ + if( nodeData.status === "loaded") { + + console.log("xaccessor isA"+query.xAccessor) + const new_data = {...nodeData.data } + new_data.node_data.x_set = nodeData.data.node_data[query.xAccessor] + window.NDD = new_data.node_data + return new_data; + + } else{ + + return{node_data:{ids:[],x_set:[],y:[],metadata_singles:[]}} + } + + }, + [nodeData,query.xAccessor] ); const scatterIds = useMemo( @@ -304,7 +324,7 @@ function getRawfile(protoUrl, uploadedData) { lineWidthScale: 1, getPosition: (d) => { - return [data.node_data.x[d], data.node_data.y[d]]; + return [data.node_data.x_set[d], data.node_data.y[d]]; }, getFillColor: (d) => [0, 0, 0], getLineColor: (d) => searchColors[counter % searchColors.length], @@ -341,6 +361,8 @@ function getRawfile(protoUrl, uploadedData) {
)} +

+ + + Tree type{" "} +

+ {/*} */} + +
{selectedNode && (