diff --git a/data_processing/compare_trees.py b/data_processing/compare_trees.py new file mode 100644 index 00000000..8045d463 --- /dev/null +++ b/data_processing/compare_trees.py @@ -0,0 +1,19 @@ +from Bio import Phylo + +tree1_path = "new2.nwk" +tree2_path = "done_bp.newick" +print("B") +tree1 = Phylo.read(tree1_path, "newick") +print("B") +tree2 = Phylo.read(tree2_path, "newick") +print("B") +tree1_preorder = tree1.find_clades() +tree2_preorder = tree2.find_clades() +print("B") +for node1 in tree1_preorder: + node2 = next(tree2_preorder) + if node1.name != node2.name: + # print(node1.name, node2.name) + pass + if node1.branch_length != node2.branch_length and node2.branch_length is not None: + print(node1.name, node2.name, node1.branch_length, node2.branch_length) diff --git a/data_processing/get_tree_from_proto.py b/data_processing/get_tree_from_proto.py new file mode 100644 index 00000000..88d7d264 --- /dev/null +++ b/data_processing/get_tree_from_proto.py @@ -0,0 +1,114 @@ +import dendropy +import parsimony_pb2, tqdm + +from Bio import Phylo + + +class UsherMutationAnnotatedTree: + def __init__(self, tree_file): + self.data = parsimony_pb2.data() + self.data.ParseFromString(tree_file.read()) + self.condensed_nodes_dict = self.get_condensed_nodes_dict( + self.data.condensed_nodes) + + self.tree = dendropy.Tree.get(data=self.data.newick, schema="newick") + self.annotate_mutations() + self.set_branch_lengths() + self.expand_condensed_nodes() + + def annotate_mutations(self): + for i, node in enumerate(self.tree.preorder_node_iter()): + node.nuc_mutations = self.data.node_mutations[i] + + def expand_condensed_nodes(self): + for i, node in tqdm.tqdm(enumerate(self.tree.leaf_nodes()), + desc="Expanding condensed nodes"): + + if node.taxon and node.taxon.label in self.condensed_nodes_dict: + assert node.edge_length == 0 + for new_node_label in self.condensed_nodes_dict[ + node.taxon.label]: + new_node = dendropy.Node( + taxon=dendropy.Taxon(new_node_label)) + + node.parent_node.add_child(new_node) + node.parent_node.remove_child(node) + + def get_condensed_nodes_dict(self, condensed_nodes_dict): + output_dict = {} + for condensed_node in tqdm.tqdm(condensed_nodes_dict, + desc="Reading condensed nodes dict"): + output_dict[condensed_node.node_name.replace( + "_", " ")] = condensed_node.condensed_leaves + return output_dict + + def set_branch_lengths(self): + for i, node in enumerate(self.tree.preorder_node_iter()): + node.edge_length = len(node.nuc_mutations.mutation) + + +import io + + +def get_parent(tree, child_clade): + node_path = tree.get_path(child_clade) + return node_path[-2] + + +# The same but using BioPython +class UsherMutationAnnotatedTreeBioPython: + def __init__(self, tree_file): + self.data = parsimony_pb2.data() + self.data.ParseFromString(tree_file.read()) + self.condensed_nodes_dict = self.get_condensed_nodes_dict( + self.data.condensed_nodes) + self.tree = Phylo.read(io.StringIO(self.data.newick), "newick") + #print("aa", self.condensed_nodes_dict) + self.annotate_mutations() + self.set_branch_lengths() + self.name_nodes() + self.expand_condensed_nodes() + + def annotate_mutations(self): + for i, node in enumerate(self.tree.find_clades()): + node.nuc_mutations = self.data.node_mutations[i] + + def name_nodes(self): + for i, node in enumerate(self.tree.find_clades()): + if not node.name: + node.name = f"node_{i}" + + def expand_condensed_nodes(self): + for i, parent in tqdm.tqdm(enumerate(self.tree.find_clades()), + desc="Expanding condensed nodes"): + for node in parent.clades: + if node.name in self.condensed_nodes_dict: + assert node.branch_length == 0 + for new_node_label in self.condensed_nodes_dict[node.name]: + new_node = Phylo.BaseTree.Clade(name=new_node_label) + parent.clades.append(new_node) + parent.clades.remove(node) + else: + # print(node.name) + pass + + def get_condensed_nodes_dict(self, condensed_nodes_dict): + output_dict = {} + for condensed_node in tqdm.tqdm(condensed_nodes_dict, + desc="Reading condensed nodes dict"): + output_dict[ + condensed_node.node_name] = condensed_node.condensed_leaves + return output_dict + + def set_branch_lengths(self): + for i, node in enumerate(self.tree.find_clades()): + node.branch_length = len(node.nuc_mutations.mutation) + + +# f = open("./public-2021-09-15.all.masked.pb", "rb") +# mat = UsherMutationAnnotatedTree(f) +# mat.tree.write(path="./done.newick", schema="newick") + +f = open("./public-2021-09-15.all.masked.pb", "rb") +mat = UsherMutationAnnotatedTreeBioPython(f) +Phylo.write(mat.tree, "./done_bp.newick", "newick") diff --git a/data_processing/parsimony_pb2.py b/data_processing/parsimony_pb2.py index 71efa094..45b4ea8d 100644 --- a/data_processing/parsimony_pb2.py +++ b/data_processing/parsimony_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: parsimony.proto - +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection @@ -18,6 +18,7 @@ package='Parsimony', syntax='proto3', serialized_options=None, + create_key=_descriptor._internal_create_key, serialized_pb=b'\n\x0fparsimony.proto\x12\tParsimony\"^\n\x03mut\x12\x10\n\x08position\x18\x01 \x01(\x05\x12\x0f\n\x07ref_nuc\x18\x02 \x01(\x05\x12\x0f\n\x07par_nuc\x18\x03 \x01(\x05\x12\x0f\n\x07mut_nuc\x18\x04 \x03(\x05\x12\x12\n\nchromosome\x18\x05 \x01(\t\"1\n\rmutation_list\x12 \n\x08mutation\x18\x01 \x03(\x0b\x32\x0e.Parsimony.mut\"=\n\x0e\x63ondensed_node\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x18\n\x10\x63ondensed_leaves\x18\x02 \x03(\t\"*\n\rnode_metadata\x12\x19\n\x11\x63lade_annotations\x18\x01 \x03(\t\"\xa8\x01\n\x04\x64\x61ta\x12\x0e\n\x06newick\x18\x01 \x01(\t\x12\x30\n\x0enode_mutations\x18\x02 \x03(\x0b\x32\x18.Parsimony.mutation_list\x12\x32\n\x0f\x63ondensed_nodes\x18\x03 \x03(\x0b\x32\x19.Parsimony.condensed_node\x12*\n\x08metadata\x18\x04 \x03(\x0b\x32\x18.Parsimony.node_metadatab\x06proto3' ) @@ -30,6 +31,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='position', full_name='Parsimony.mut.position', index=0, @@ -37,35 +39,35 @@ has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='ref_nuc', full_name='Parsimony.mut.ref_nuc', index=1, number=2, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='par_nuc', full_name='Parsimony.mut.par_nuc', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='mut_nuc', full_name='Parsimony.mut.mut_nuc', index=3, number=4, type=5, cpp_type=1, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='chromosome', full_name='Parsimony.mut.chromosome', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -89,6 +91,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='mutation', full_name='Parsimony.mutation_list.mutation', index=0, @@ -96,7 +99,7 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -120,6 +123,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='node_name', full_name='Parsimony.condensed_node.node_name', index=0, @@ -127,14 +131,14 @@ has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='condensed_leaves', full_name='Parsimony.condensed_node.condensed_leaves', index=1, number=2, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -158,6 +162,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='clade_annotations', full_name='Parsimony.node_metadata.clade_annotations', index=0, @@ -165,7 +170,7 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -189,6 +194,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='newick', full_name='Parsimony.data.newick', index=0, @@ -196,28 +202,28 @@ has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='node_mutations', full_name='Parsimony.data.node_mutations', index=1, number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='condensed_nodes', full_name='Parsimony.data.condensed_nodes', index=2, number=3, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='metadata', full_name='Parsimony.data.metadata', index=3, number=4, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], diff --git a/data_processing/taxonium_pb2.py b/data_processing/taxonium_pb2.py index 46c27b4d..70b42d96 100644 --- a/data_processing/taxonium_pb2.py +++ b/data_processing/taxonium_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: taxonium.proto - +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection @@ -10,532 +10,414 @@ _sym_db = _symbol_database.Default() + + + DESCRIPTOR = _descriptor.FileDescriptor( - name='taxonium.proto', - package='', - syntax='proto3', - serialized_options=None, - serialized_pb= - b'\n\x0etaxonium.proto\" \n\x0cMutationList\x12\x10\n\x08mutation\x18\x01 \x03(\x05\"q\n\x1aMetadataSingleValuePerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x0f\n\x07mapping\x18\x03 \x03(\t\x12\x13\n\x0bnode_values\x18\x04 \x03(\x05\"\x8d\x02\n\x0b\x41llNodeData\x12\r\n\x05names\x18\x01 \x03(\t\x12\t\n\x01x\x18\x02 \x03(\x02\x12\t\n\x01y\x18\x03 \x03(\x02\x12\x11\n\tcountries\x18\x04 \x03(\x05\x12\x10\n\x08lineages\x18\x05 \x03(\x05\x12 \n\tmutations\x18\x06 \x03(\x0b\x32\r.MutationList\x12\r\n\x05\x64\x61tes\x18\x07 \x03(\x05\x12\x0f\n\x07parents\x18\x08 \x03(\x05\x12\x10\n\x08genbanks\x18\t \x03(\t\x12\x17\n\x0f\x65pi_isl_numbers\x18\n \x03(\x05\x12\x10\n\x08num_tips\x18\x0b \x03(\x05\x12\x35\n\x10metadata_singles\x18\x0c \x03(\x0b\x32\x1b.MetadataSingleValuePerNode\"\xba\x01\n\x07\x41llData\x12\x1f\n\tnode_data\x18\x01 \x01(\x0b\x32\x0c.AllNodeData\x12\x17\n\x0f\x63ountry_mapping\x18\x02 \x03(\t\x12\x17\n\x0flineage_mapping\x18\x03 \x03(\t\x12\x18\n\x10mutation_mapping\x18\x04 \x03(\t\x12\x14\n\x0c\x64\x61te_mapping\x18\x05 \x03(\t\x12\x18\n\x10tree_description\x18\x06 \x01(\t\x12\x12\n\ntree_title\x18\x07 \x01(\tb\x06proto3' + name='taxonium.proto', + package='', + syntax='proto3', + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x0etaxonium.proto\" \n\x0cMutationList\x12\x10\n\x08mutation\x18\x01 \x03(\x05\"a\n\x1bMetadataUniqueStringPerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x13\n\x0bnode_values\x18\x03 \x03(\t\"q\n\x1aMetadataSingleValuePerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x0f\n\x07mapping\x18\x03 \x03(\t\x12\x13\n\x0bnode_values\x18\x04 \x03(\x05\"\xdc\x02\n\x0b\x41llNodeData\x12\r\n\x05names\x18\x01 \x03(\t\x12\t\n\x01x\x18\x02 \x03(\x02\x12\t\n\x01y\x18\x03 \x03(\x02\x12\x11\n\tcountries\x18\x04 \x03(\x05\x12\r\n\x05\x64\x61tes\x18\x07 \x03(\x05\x12\x10\n\x08lineages\x18\x05 \x03(\x05\x12 \n\tmutations\x18\x06 \x03(\x0b\x32\r.MutationList\x12\x0f\n\x07parents\x18\x08 \x03(\x05\x12\x10\n\x08genbanks\x18\t \x03(\t\x12\x17\n\x0f\x65pi_isl_numbers\x18\n \x03(\x05\x12\x10\n\x08num_tips\x18\x0b \x03(\x05\x12\x35\n\x10metadata_singles\x18\x0c \x03(\x0b\x32\x1b.MetadataSingleValuePerNode\x12=\n\x17metadata_unique_strings\x18\r \x03(\x0b\x32\x1c.MetadataUniqueStringPerNode\x12\x0e\n\x06time_x\x18\x0e \x03(\x02\"\x8c\x02\n\x07\x41llData\x12\x1f\n\tnode_data\x18\x01 \x01(\x0b\x32\x0c.AllNodeData\x12\x17\n\x0f\x63ountry_mapping\x18\x02 \x03(\t\x12\x17\n\x0flineage_mapping\x18\x03 \x03(\t\x12\x18\n\x10mutation_mapping\x18\x04 \x03(\t\x12\x14\n\x0c\x64\x61te_mapping\x18\x05 \x03(\t\x12\x18\n\x10tree_description\x18\x06 \x01(\t\x12\x12\n\ntree_title\x18\x07 \x01(\t\x12\x16\n\x0e\x64\x65\x66\x61ult_search\x18\x08 \x01(\t\x12\x1e\n\x16\x64\x65\x66\x61ult_seq_name_field\x18\t \x01(\t\x12\x18\n\x10\x64\x65\x66\x61ult_colourby\x18\n \x01(\tb\x06proto3' ) + + + _MUTATIONLIST = _descriptor.Descriptor( - name='MutationList', - full_name='MutationList', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='mutation', - full_name='MutationList.mutation', - index=0, - number=1, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=18, - serialized_end=50, + name='MutationList', + full_name='MutationList', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='mutation', full_name='MutationList.mutation', index=0, + number=1, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=18, + serialized_end=50, ) + +_METADATAUNIQUESTRINGPERNODE = _descriptor.Descriptor( + name='MetadataUniqueStringPerNode', + full_name='MetadataUniqueStringPerNode', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='metadata_name', full_name='MetadataUniqueStringPerNode.metadata_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_title', full_name='MetadataUniqueStringPerNode.metadata_title', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='node_values', full_name='MetadataUniqueStringPerNode.node_values', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=52, + serialized_end=149, +) + + _METADATASINGLEVALUEPERNODE = _descriptor.Descriptor( - name='MetadataSingleValuePerNode', - full_name='MetadataSingleValuePerNode', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='metadata_name', - full_name='MetadataSingleValuePerNode.metadata_name', - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='metadata_title', - full_name='MetadataSingleValuePerNode.metadata_title', - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='mapping', - full_name='MetadataSingleValuePerNode.mapping', - index=2, - number=3, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='node_values', - full_name='MetadataSingleValuePerNode.node_values', - index=3, - number=4, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=52, - serialized_end=165, + name='MetadataSingleValuePerNode', + full_name='MetadataSingleValuePerNode', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='metadata_name', full_name='MetadataSingleValuePerNode.metadata_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_title', full_name='MetadataSingleValuePerNode.metadata_title', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mapping', full_name='MetadataSingleValuePerNode.mapping', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='node_values', full_name='MetadataSingleValuePerNode.node_values', index=3, + number=4, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=151, + serialized_end=264, ) + _ALLNODEDATA = _descriptor.Descriptor( - name='AllNodeData', - full_name='AllNodeData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='names', - full_name='AllNodeData.names', - index=0, - number=1, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='x', - full_name='AllNodeData.x', - index=1, - number=2, - type=2, - cpp_type=6, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='y', - full_name='AllNodeData.y', - index=2, - number=3, - type=2, - cpp_type=6, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='countries', - full_name='AllNodeData.countries', - index=3, - number=4, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='lineages', - full_name='AllNodeData.lineages', - index=4, - number=5, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='mutations', - full_name='AllNodeData.mutations', - index=5, - number=6, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='dates', - full_name='AllNodeData.dates', - index=6, - number=7, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='parents', - full_name='AllNodeData.parents', - index=7, - number=8, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='genbanks', - full_name='AllNodeData.genbanks', - index=8, - number=9, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='epi_isl_numbers', - full_name='AllNodeData.epi_isl_numbers', - index=9, - number=10, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='num_tips', - full_name='AllNodeData.num_tips', - index=10, - number=11, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='metadata_singles', - full_name='AllNodeData.metadata_singles', - index=11, - number=12, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=168, - serialized_end=437, + name='AllNodeData', + full_name='AllNodeData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='names', full_name='AllNodeData.names', index=0, + number=1, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='x', full_name='AllNodeData.x', index=1, + number=2, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='y', full_name='AllNodeData.y', index=2, + number=3, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='countries', full_name='AllNodeData.countries', index=3, + number=4, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='dates', full_name='AllNodeData.dates', index=4, + number=7, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='lineages', full_name='AllNodeData.lineages', index=5, + number=5, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mutations', full_name='AllNodeData.mutations', index=6, + number=6, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='parents', full_name='AllNodeData.parents', index=7, + number=8, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='genbanks', full_name='AllNodeData.genbanks', index=8, + number=9, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='epi_isl_numbers', full_name='AllNodeData.epi_isl_numbers', index=9, + number=10, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='num_tips', full_name='AllNodeData.num_tips', index=10, + number=11, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_singles', full_name='AllNodeData.metadata_singles', index=11, + number=12, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_unique_strings', full_name='AllNodeData.metadata_unique_strings', index=12, + number=13, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='time_x', full_name='AllNodeData.time_x', index=13, + number=14, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=267, + serialized_end=615, ) + _ALLDATA = _descriptor.Descriptor( - name='AllData', - full_name='AllData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='node_data', - full_name='AllData.node_data', - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='country_mapping', - full_name='AllData.country_mapping', - index=1, - number=2, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='lineage_mapping', - full_name='AllData.lineage_mapping', - index=2, - number=3, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='mutation_mapping', - full_name='AllData.mutation_mapping', - index=3, - number=4, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='date_mapping', - full_name='AllData.date_mapping', - index=4, - number=5, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='tree_description', - full_name='AllData.tree_description', - index=5, - number=6, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='tree_title', - full_name='AllData.tree_title', - index=6, - number=7, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=440, - serialized_end=626, + name='AllData', + full_name='AllData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='node_data', full_name='AllData.node_data', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='country_mapping', full_name='AllData.country_mapping', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='lineage_mapping', full_name='AllData.lineage_mapping', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mutation_mapping', full_name='AllData.mutation_mapping', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='date_mapping', full_name='AllData.date_mapping', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='tree_description', full_name='AllData.tree_description', index=5, + number=6, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='tree_title', full_name='AllData.tree_title', index=6, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_search', full_name='AllData.default_search', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_seq_name_field', full_name='AllData.default_seq_name_field', index=8, + number=9, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_colourby', full_name='AllData.default_colourby', index=9, + number=10, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=618, + serialized_end=886, ) _ALLNODEDATA.fields_by_name['mutations'].message_type = _MUTATIONLIST -_ALLNODEDATA.fields_by_name[ - 'metadata_singles'].message_type = _METADATASINGLEVALUEPERNODE +_ALLNODEDATA.fields_by_name['metadata_singles'].message_type = _METADATASINGLEVALUEPERNODE +_ALLNODEDATA.fields_by_name['metadata_unique_strings'].message_type = _METADATAUNIQUESTRINGPERNODE _ALLDATA.fields_by_name['node_data'].message_type = _ALLNODEDATA DESCRIPTOR.message_types_by_name['MutationList'] = _MUTATIONLIST -DESCRIPTOR.message_types_by_name[ - 'MetadataSingleValuePerNode'] = _METADATASINGLEVALUEPERNODE +DESCRIPTOR.message_types_by_name['MetadataUniqueStringPerNode'] = _METADATAUNIQUESTRINGPERNODE +DESCRIPTOR.message_types_by_name['MetadataSingleValuePerNode'] = _METADATASINGLEVALUEPERNODE DESCRIPTOR.message_types_by_name['AllNodeData'] = _ALLNODEDATA DESCRIPTOR.message_types_by_name['AllData'] = _ALLDATA _sym_db.RegisterFileDescriptor(DESCRIPTOR) -MutationList = _reflection.GeneratedProtocolMessageType( - 'MutationList', - (_message.Message, ), - { - 'DESCRIPTOR': _MUTATIONLIST, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:MutationList) - }) +MutationList = _reflection.GeneratedProtocolMessageType('MutationList', (_message.Message,), { + 'DESCRIPTOR' : _MUTATIONLIST, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MutationList) + }) _sym_db.RegisterMessage(MutationList) -MetadataSingleValuePerNode = _reflection.GeneratedProtocolMessageType( - 'MetadataSingleValuePerNode', - (_message.Message, ), - { - 'DESCRIPTOR': _METADATASINGLEVALUEPERNODE, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:MetadataSingleValuePerNode) - }) +MetadataUniqueStringPerNode = _reflection.GeneratedProtocolMessageType('MetadataUniqueStringPerNode', (_message.Message,), { + 'DESCRIPTOR' : _METADATAUNIQUESTRINGPERNODE, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MetadataUniqueStringPerNode) + }) +_sym_db.RegisterMessage(MetadataUniqueStringPerNode) + +MetadataSingleValuePerNode = _reflection.GeneratedProtocolMessageType('MetadataSingleValuePerNode', (_message.Message,), { + 'DESCRIPTOR' : _METADATASINGLEVALUEPERNODE, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MetadataSingleValuePerNode) + }) _sym_db.RegisterMessage(MetadataSingleValuePerNode) -AllNodeData = _reflection.GeneratedProtocolMessageType( - 'AllNodeData', - (_message.Message, ), - { - 'DESCRIPTOR': _ALLNODEDATA, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:AllNodeData) - }) +AllNodeData = _reflection.GeneratedProtocolMessageType('AllNodeData', (_message.Message,), { + 'DESCRIPTOR' : _ALLNODEDATA, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:AllNodeData) + }) _sym_db.RegisterMessage(AllNodeData) -AllData = _reflection.GeneratedProtocolMessageType( - 'AllData', - (_message.Message, ), - { - 'DESCRIPTOR': _ALLDATA, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:AllData) - }) +AllData = _reflection.GeneratedProtocolMessageType('AllData', (_message.Message,), { + 'DESCRIPTOR' : _ALLDATA, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:AllData) + }) _sym_db.RegisterMessage(AllData) + # @@protoc_insertion_point(module_scope) diff --git a/data_processing/usher_processing.py b/data_processing/usher_processing.py index ea19e0e4..c6934b59 100644 --- a/data_processing/usher_processing.py +++ b/data_processing/usher_processing.py @@ -64,6 +64,12 @@ def get_aa_ref(pos): return None +def get_nuc_ref(pos): + + return f"nuc:X_{pos}_{cov2_genome.seq[pos-1]}" + return None + + def get_aa_sub(pos, par, alt): for gene_name, gene_range in cov2_genome.genes.items(): @@ -103,6 +109,7 @@ def __init__(self, tree_file): self.annotate_mutations() self.set_branch_lengths() self.annotate_aa_mutations() + self.annotate_nuc_mutations() self.expand_condensed_nodes() def annotate_mutations(self): @@ -115,7 +122,7 @@ def set_branch_lengths(self): def annotate_aa_mutations(self): for i, node in tqdm.tqdm(enumerate(self.tree.preorder_node_iter()), - desc="Annotating mutations"): + desc="Annotating AA mutations"): node.aa_subs = [] for mut in node.nuc_mutations.mutation: ref = NUC_ENUM[mut.ref_nuc] @@ -126,6 +133,16 @@ def annotate_aa_mutations(self): if aa_sub: node.aa_subs.append(aa_sub) + def annotate_nuc_mutations(self): + for i, node in tqdm.tqdm(enumerate(self.tree.preorder_node_iter()), + desc="Annotating nucleotide mutations"): + for mut in node.nuc_mutations.mutation: + ref = NUC_ENUM[mut.ref_nuc] + alt = NUC_ENUM[mut.mut_nuc[0]] + par = NUC_ENUM[mut.par_nuc] + aa_style_sub = f"nuc:{par}_{mut.position}_{alt}" + node.aa_subs.append(aa_style_sub) + def expand_condensed_nodes(self): for i, node in tqdm.tqdm(enumerate(self.tree.leaf_nodes()), desc="Expanding condensed nodes"): @@ -151,14 +168,50 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict): # In[19]: +input_file = "public-latest.all.masked.pb.gz" -f = open("./public-latest.all.masked.pb", "rb") +if input_file.endswith(".gz"): + f = gzip.open(input_file, "rb") +else: + f = open(input_file, "rb") mat = UsherMutationAnnotatedTree(f) +print("Ladderizing tree") mat.tree.ladderize() +print("Writing distance tree") + +#Name any unnamed nodes: +for i, node in enumerate(mat.tree.preorder_node_iter()): + if not node.taxon: + node.taxon = dendropy.Taxon(f"temporaryname__{i}") +mat.tree.write(path="./distance.nwk", + schema="newick", + unquoted_underscores=True) + +print("Launching chronumental") +import os + +# os.system( +# "chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 4000" +# ) + +print("Reading time tree") +time_tree = dendropy.Tree.get(path="./timetree__distance.nwk", schema="newick") +time_tree_iter = time_tree.preorder_node_iter() +for i, node in tqdm.tqdm(enumerate(mat.tree.preorder_node_iter()), + desc="Adding time tree"): + time_tree_node = next(time_tree_iter) + node.time = time_tree_node.edge_length + if node.taxon.label.startswith("temporaryname__"): + node.taxon.label = "" +del time_tree +del time_tree_iter all_ref_muts = set(get_aa_ref(x) for x in range(len(cov2_genome.seq))) all_ref_muts = [x for x in all_ref_muts if x is not None] + +all_ref_nuc_muts = set(get_nuc_ref(x) for x in range(len(cov2_genome.seq))) +all_ref_muts.extend(all_ref_nuc_muts) mat.tree.seed_node.aa_subs = all_ref_muts # In[14]: @@ -188,6 +241,18 @@ def assign_x(tree, current_branch_length=0, current_level=0): assign_x(clade, current_branch_length, current_level) +def assign_x_time(tree, current_branch_length=0, current_level=0): + + by_level[current_level].append(tree) + + if tree.time: + current_branch_length = current_branch_length + tree.time + current_level += 1 + tree.x_time = current_branch_length + for clade in tree.child_nodes(): + assign_x_time(clade, current_branch_length, current_level) + + def assign_terminal_y(terminals): for i, node in enumerate(terminals): node.y = i @@ -203,6 +268,7 @@ def align_parents(tree_by_level): root = mat.tree.seed_node assign_x(root) +assign_x_time(root) terminals = mat.tree.leaf_nodes() assign_terminal_y(terminals) align_parents(by_level) @@ -253,13 +319,13 @@ def add_paths(tree_by_level): print("B") -alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False) +#alt_metadata = pd.read_csv("metadata.tsv.gz", sep="\t", low_memory=False) alt_genbank_lookup = {} -for i, row in tqdm.tqdm(alt_metadata.iterrows()): +# for i, row in tqdm.tqdm(alt_metadata.iterrows()): - name = row['strain'] #.split("|")[0] - alt_genbank_lookup[name] = str(row['genbank_accession']) +# name = row['strain'] #.split("|")[0] +# alt_genbank_lookup[name] = str(row['genbank_accession']) def make_mapping(list_of_strings): @@ -286,6 +352,7 @@ def make_mapping(list_of_strings): mutation_mapping_list, mutation_mapping_lookup = make_mapping(all_genotypes) xes = [] +time_xes = [] yes = [] parents = [] names = [] @@ -300,10 +367,11 @@ def make_mapping(list_of_strings): print("C") del metadata -del alt_metadata +#del alt_metadata for i, x in tqdm.tqdm(enumerate(all_nodes)): xes.append(x.x * 0.2) + time_xes.append(x.x_time * 0.02) yes.append(x.y / 40000) path_list_rev = x.path_list[::-1] if len(path_list_rev) > 0: @@ -340,6 +408,8 @@ def make_mapping(list_of_strings): epi_isls.append( get_epi_isl(genbank_lookup[name], final_name, date_lookup[name])) +print("D") + country_metadata_obj = taxonium_pb2.MetadataSingleValuePerNode( metadata_name="Country", mapping=country_mapping_list, @@ -349,23 +419,29 @@ def make_mapping(list_of_strings): metadata_name="Lineage", mapping=lineage_mapping_list, node_values=lineages) +print("E") all_node_data = taxonium_pb2.AllNodeData( genbanks=genbanks, names=names, x=xes, + time_x=time_xes, y=yes, dates=dates, - mutations=[taxonium_pb2.MutationList(mutation=x) for x in mutations], + mutations=[ + taxonium_pb2.MutationList(mutation=x) for x in tqdm.tqdm(mutations) + ], parents=parents, num_tips=num_tips, epi_isl_numbers=epi_isls, metadata_singles=[country_metadata_obj, lineage_metadata_obj]) +print("F") all_data = taxonium_pb2.AllData(node_data=all_node_data, mutation_mapping=mutation_mapping_list, date_mapping=date_mapping_list) -f = gzip.open("../public/nodelist.pb.gz", "wb") +#f = gzip.open("../public/nodelist.pb.gz", "wb") +f = open("/mnt/d/timetree.pb", "wb") f.write(all_data.SerializeToString()) f.close() diff --git a/public/nodelist.pb.gz.REMOVED.git-id b/public/nodelist.pb.gz.REMOVED.git-id index c4381156..0ba373ea 100644 --- a/public/nodelist.pb.gz.REMOVED.git-id +++ b/public/nodelist.pb.gz.REMOVED.git-id @@ -1 +1 @@ -5fe85a8795e1b2d121b58225ac162e8cd675d697 \ No newline at end of file +a52698a5f3395efe2fdd76fade947573e09cae11 \ No newline at end of file diff --git a/public/taxonium.proto b/public/taxonium.proto index d45b493e..6931c3de 100644 --- a/public/taxonium.proto +++ b/public/taxonium.proto @@ -33,6 +33,7 @@ repeated int32 epi_isl_numbers = 10; // Deprecated: DO NOT USE repeated int32 num_tips = 11; repeated MetadataSingleValuePerNode metadata_singles = 12; repeated MetadataUniqueStringPerNode metadata_unique_strings=13; +repeated float time_x = 14; } diff --git a/src/App.jsx b/src/App.jsx index 9f16ac23..4adaeedd 100644 --- a/src/App.jsx +++ b/src/App.jsx @@ -20,9 +20,11 @@ function App() { } const [query, setQuery] = useQueryAsState({ blinking:"false", + xAccessor: "x", search: JSON.stringify([ { id: 0.123, + category: "lineage", value: "", enabled: true, diff --git a/src/Deck.jsx b/src/Deck.jsx index cb71308d..c16d418d 100644 --- a/src/Deck.jsx +++ b/src/Deck.jsx @@ -65,7 +65,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) { const filtered = nodeIds.filter((node) => { if (line_mode) { if ( - (Math.abs(node_data.x[node] - node_data.x[node_data.parents[node]]) > + (Math.abs(node_data.x_set[node] - node_data.x_set[node_data.parents[node]]) > 1) | (Math.abs(node_data.y[node] - node_data.y[node_data.parents[node]]) > 0.5) @@ -74,7 +74,7 @@ function reduceOverPlotting(nodeIds, node_data, precision, line_mode) { } } - const rounded_x = Math.round(node_data.x[node] * precision) / precision; + const rounded_x = Math.round(node_data.x_set[node] * precision) / precision; const rounded_y = Math.round(node_data.y[node] * precision) / precision; if (included_points[rounded_x]) { if (included_points[rounded_x][rounded_y]) { @@ -453,6 +453,7 @@ function Deck({ }, [node_data, data, colourBy]); const coarseScatterIds = useMemo(() => { + console.log("calcing coarse scatter") return reduceOverPlotting(scatterIds, node_data, 100, false); }, [scatterIds, node_data]); @@ -487,7 +488,7 @@ function Deck({ onHover: (info) => setHoverInfo(info), getPosition: (d) => { - return [node_data.x[d], node_data.y[d]]; + return [node_data.x_set[d], node_data.y[d]]; }, updateTriggers: { getFillColor: scatterFillFunction, @@ -532,7 +533,7 @@ function Deck({ getLineColor: [0, 0, 0], getPosition: (d) => { - return [node_data.x[d], node_data.y[d]]; + return [node_data.x_set[d], node_data.y[d]]; }, lineWidthUnits: "pixels", lineWidthScale: 2, @@ -592,10 +593,10 @@ function Deck({ pickable: true, onHover: (info) => setHoverInfo(info), getTargetPosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[d], ], - getSourcePosition: (d) => [node_data.x[d], node_data.y[d]], + getSourcePosition: (d) => [node_data.x_set[d], node_data.y[d]], getColor: [150, 150, 150], }), [node_data] @@ -608,11 +609,11 @@ function Deck({ pickable: false, getWidth: 1, getTargetPosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[node_data.parents[d]], ], getSourcePosition: (d) => [ - node_data.x[node_data.parents[d]], + node_data.x_set[node_data.parents[d]], node_data.y[d], ], getColor: [150, 150, 150], @@ -722,7 +723,7 @@ function Deck({ () => ({ id: "main-text-layer", data: textInfo.ids, - getPosition: (d) => [node_data.x[d] + 0.3, node_data.y[d]], + getPosition: (d) => [node_data.x_set[d] + 0.3, node_data.y[d]], getText: (d) => node_data.names[d], getColor: [180, 180, 180], getAngle: 0, @@ -738,7 +739,7 @@ function Deck({ () => ({ id: "main-text-muts-layer", data: mutTextIds.filter(() => true), - getPosition: (d) => [node_data.x[d], node_data.y[d]], + getPosition: (d) => [node_data.x_set[d], node_data.y[d]], getText: (d) => node_data.mutations[d].mutation ? node_data.mutations[d].mutation @@ -761,7 +762,7 @@ function Deck({ data.mutation_mapping, mutTextIds, node_data.mutations, - node_data.x, + node_data.x_set, node_data.y, ] ); diff --git a/src/Taxonium.jsx b/src/Taxonium.jsx index 04c58837..9186f79d 100644 --- a/src/Taxonium.jsx +++ b/src/Taxonium.jsx @@ -61,6 +61,13 @@ function Taxonium({protoUrl,uploadedData, query,setQuery}) { [setQuery, query] ); + const setXaccessor = useCallback( + (acc) => { + setQuery({ ...query, xAccessor: acc }); + }, + [setQuery, query] + ); + const setBlinkingEnabled = useCallback( (enabled) => { setQuery({ ...query, blinking:enabled? "true":"false" }); @@ -173,7 +180,7 @@ function getRawfile(protoUrl, uploadedData) { - result.node_data.ids = [...Array(result.node_data.x.length).keys()]; + result.node_data.ids = [...Array(result.node_data.y.length).keys()]; const all_genes = new Set(); result.mutation_mapping = result.mutation_mapping.map((x, i) => { @@ -200,9 +207,22 @@ function getRawfile(protoUrl, uploadedData) { }, [nodeData.status, query.protoUrl, uploadedData]); const data = useMemo( - () => - nodeData.status === "loaded" ? nodeData.data : { node_data: { ids: [] } }, - [nodeData] + () =>{ + if( nodeData.status === "loaded") { + + console.log("xaccessor isA"+query.xAccessor) + const new_data = {...nodeData.data } + new_data.node_data.x_set = nodeData.data.node_data[query.xAccessor] + window.NDD = new_data.node_data + return new_data; + + } else{ + + return{node_data:{ids:[],x_set:[],y:[],metadata_singles:[]}} + } + + }, + [nodeData,query.xAccessor] ); const scatterIds = useMemo( @@ -304,7 +324,7 @@ function getRawfile(protoUrl, uploadedData) { lineWidthScale: 1, getPosition: (d) => { - return [data.node_data.x[d], data.node_data.y[d]]; + return [data.node_data.x_set[d], data.node_data.y[d]]; }, getFillColor: (d) => [0, 0, 0], getLineColor: (d) => searchColors[counter % searchColors.length], @@ -341,6 +361,8 @@ function getRawfile(protoUrl, uploadedData) {