From 69f000c06c4ed64fabacb61f2d650290d6b70463 Mon Sep 17 00:00:00 2001 From: Theo Sanderson Date: Tue, 12 Oct 2021 21:05:54 +0100 Subject: [PATCH] updates Former-commit-id: b8116f340f885b96628d0c9d109b4567b6220c65 --- data_processing/parsimony_pb2.py | 34 +- data_processing/taxonium_pb2.py | 880 ++++++++++++---------------- data_processing/usher_processing.py | 61 +- 3 files changed, 459 insertions(+), 516 deletions(-) diff --git a/data_processing/parsimony_pb2.py b/data_processing/parsimony_pb2.py index 71efa094..45b4ea8d 100644 --- a/data_processing/parsimony_pb2.py +++ b/data_processing/parsimony_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: parsimony.proto - +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection @@ -18,6 +18,7 @@ package='Parsimony', syntax='proto3', serialized_options=None, + create_key=_descriptor._internal_create_key, serialized_pb=b'\n\x0fparsimony.proto\x12\tParsimony\"^\n\x03mut\x12\x10\n\x08position\x18\x01 \x01(\x05\x12\x0f\n\x07ref_nuc\x18\x02 \x01(\x05\x12\x0f\n\x07par_nuc\x18\x03 \x01(\x05\x12\x0f\n\x07mut_nuc\x18\x04 \x03(\x05\x12\x12\n\nchromosome\x18\x05 \x01(\t\"1\n\rmutation_list\x12 \n\x08mutation\x18\x01 \x03(\x0b\x32\x0e.Parsimony.mut\"=\n\x0e\x63ondensed_node\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x18\n\x10\x63ondensed_leaves\x18\x02 \x03(\t\"*\n\rnode_metadata\x12\x19\n\x11\x63lade_annotations\x18\x01 \x03(\t\"\xa8\x01\n\x04\x64\x61ta\x12\x0e\n\x06newick\x18\x01 \x01(\t\x12\x30\n\x0enode_mutations\x18\x02 \x03(\x0b\x32\x18.Parsimony.mutation_list\x12\x32\n\x0f\x63ondensed_nodes\x18\x03 \x03(\x0b\x32\x19.Parsimony.condensed_node\x12*\n\x08metadata\x18\x04 \x03(\x0b\x32\x18.Parsimony.node_metadatab\x06proto3' ) @@ -30,6 +31,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='position', full_name='Parsimony.mut.position', index=0, @@ -37,35 +39,35 @@ has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='ref_nuc', full_name='Parsimony.mut.ref_nuc', index=1, number=2, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='par_nuc', full_name='Parsimony.mut.par_nuc', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='mut_nuc', full_name='Parsimony.mut.mut_nuc', index=3, number=4, type=5, cpp_type=1, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='chromosome', full_name='Parsimony.mut.chromosome', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -89,6 +91,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='mutation', full_name='Parsimony.mutation_list.mutation', index=0, @@ -96,7 +99,7 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -120,6 +123,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='node_name', full_name='Parsimony.condensed_node.node_name', index=0, @@ -127,14 +131,14 @@ has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='condensed_leaves', full_name='Parsimony.condensed_node.condensed_leaves', index=1, number=2, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -158,6 +162,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='clade_annotations', full_name='Parsimony.node_metadata.clade_annotations', index=0, @@ -165,7 +170,7 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -189,6 +194,7 @@ filename=None, file=DESCRIPTOR, containing_type=None, + create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( name='newick', full_name='Parsimony.data.newick', index=0, @@ -196,28 +202,28 @@ has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='node_mutations', full_name='Parsimony.data.node_mutations', index=1, number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='condensed_nodes', full_name='Parsimony.data.condensed_nodes', index=2, number=3, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='metadata', full_name='Parsimony.data.metadata', index=3, number=4, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], diff --git a/data_processing/taxonium_pb2.py b/data_processing/taxonium_pb2.py index 46c27b4d..70b42d96 100644 --- a/data_processing/taxonium_pb2.py +++ b/data_processing/taxonium_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: taxonium.proto - +"""Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection @@ -10,532 +10,414 @@ _sym_db = _symbol_database.Default() + + + DESCRIPTOR = _descriptor.FileDescriptor( - name='taxonium.proto', - package='', - syntax='proto3', - serialized_options=None, - serialized_pb= - b'\n\x0etaxonium.proto\" \n\x0cMutationList\x12\x10\n\x08mutation\x18\x01 \x03(\x05\"q\n\x1aMetadataSingleValuePerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x0f\n\x07mapping\x18\x03 \x03(\t\x12\x13\n\x0bnode_values\x18\x04 \x03(\x05\"\x8d\x02\n\x0b\x41llNodeData\x12\r\n\x05names\x18\x01 \x03(\t\x12\t\n\x01x\x18\x02 \x03(\x02\x12\t\n\x01y\x18\x03 \x03(\x02\x12\x11\n\tcountries\x18\x04 \x03(\x05\x12\x10\n\x08lineages\x18\x05 \x03(\x05\x12 \n\tmutations\x18\x06 \x03(\x0b\x32\r.MutationList\x12\r\n\x05\x64\x61tes\x18\x07 \x03(\x05\x12\x0f\n\x07parents\x18\x08 \x03(\x05\x12\x10\n\x08genbanks\x18\t \x03(\t\x12\x17\n\x0f\x65pi_isl_numbers\x18\n \x03(\x05\x12\x10\n\x08num_tips\x18\x0b \x03(\x05\x12\x35\n\x10metadata_singles\x18\x0c \x03(\x0b\x32\x1b.MetadataSingleValuePerNode\"\xba\x01\n\x07\x41llData\x12\x1f\n\tnode_data\x18\x01 \x01(\x0b\x32\x0c.AllNodeData\x12\x17\n\x0f\x63ountry_mapping\x18\x02 \x03(\t\x12\x17\n\x0flineage_mapping\x18\x03 \x03(\t\x12\x18\n\x10mutation_mapping\x18\x04 \x03(\t\x12\x14\n\x0c\x64\x61te_mapping\x18\x05 \x03(\t\x12\x18\n\x10tree_description\x18\x06 \x01(\t\x12\x12\n\ntree_title\x18\x07 \x01(\tb\x06proto3' + name='taxonium.proto', + package='', + syntax='proto3', + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x0etaxonium.proto\" \n\x0cMutationList\x12\x10\n\x08mutation\x18\x01 \x03(\x05\"a\n\x1bMetadataUniqueStringPerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x13\n\x0bnode_values\x18\x03 \x03(\t\"q\n\x1aMetadataSingleValuePerNode\x12\x15\n\rmetadata_name\x18\x01 \x01(\t\x12\x16\n\x0emetadata_title\x18\x02 \x01(\t\x12\x0f\n\x07mapping\x18\x03 \x03(\t\x12\x13\n\x0bnode_values\x18\x04 \x03(\x05\"\xdc\x02\n\x0b\x41llNodeData\x12\r\n\x05names\x18\x01 \x03(\t\x12\t\n\x01x\x18\x02 \x03(\x02\x12\t\n\x01y\x18\x03 \x03(\x02\x12\x11\n\tcountries\x18\x04 \x03(\x05\x12\r\n\x05\x64\x61tes\x18\x07 \x03(\x05\x12\x10\n\x08lineages\x18\x05 \x03(\x05\x12 \n\tmutations\x18\x06 \x03(\x0b\x32\r.MutationList\x12\x0f\n\x07parents\x18\x08 \x03(\x05\x12\x10\n\x08genbanks\x18\t \x03(\t\x12\x17\n\x0f\x65pi_isl_numbers\x18\n \x03(\x05\x12\x10\n\x08num_tips\x18\x0b \x03(\x05\x12\x35\n\x10metadata_singles\x18\x0c \x03(\x0b\x32\x1b.MetadataSingleValuePerNode\x12=\n\x17metadata_unique_strings\x18\r \x03(\x0b\x32\x1c.MetadataUniqueStringPerNode\x12\x0e\n\x06time_x\x18\x0e \x03(\x02\"\x8c\x02\n\x07\x41llData\x12\x1f\n\tnode_data\x18\x01 \x01(\x0b\x32\x0c.AllNodeData\x12\x17\n\x0f\x63ountry_mapping\x18\x02 \x03(\t\x12\x17\n\x0flineage_mapping\x18\x03 \x03(\t\x12\x18\n\x10mutation_mapping\x18\x04 \x03(\t\x12\x14\n\x0c\x64\x61te_mapping\x18\x05 \x03(\t\x12\x18\n\x10tree_description\x18\x06 \x01(\t\x12\x12\n\ntree_title\x18\x07 \x01(\t\x12\x16\n\x0e\x64\x65\x66\x61ult_search\x18\x08 \x01(\t\x12\x1e\n\x16\x64\x65\x66\x61ult_seq_name_field\x18\t \x01(\t\x12\x18\n\x10\x64\x65\x66\x61ult_colourby\x18\n \x01(\tb\x06proto3' ) + + + _MUTATIONLIST = _descriptor.Descriptor( - name='MutationList', - full_name='MutationList', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='mutation', - full_name='MutationList.mutation', - index=0, - number=1, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=18, - serialized_end=50, + name='MutationList', + full_name='MutationList', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='mutation', full_name='MutationList.mutation', index=0, + number=1, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=18, + serialized_end=50, ) + +_METADATAUNIQUESTRINGPERNODE = _descriptor.Descriptor( + name='MetadataUniqueStringPerNode', + full_name='MetadataUniqueStringPerNode', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='metadata_name', full_name='MetadataUniqueStringPerNode.metadata_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_title', full_name='MetadataUniqueStringPerNode.metadata_title', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='node_values', full_name='MetadataUniqueStringPerNode.node_values', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=52, + serialized_end=149, +) + + _METADATASINGLEVALUEPERNODE = _descriptor.Descriptor( - name='MetadataSingleValuePerNode', - full_name='MetadataSingleValuePerNode', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='metadata_name', - full_name='MetadataSingleValuePerNode.metadata_name', - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='metadata_title', - full_name='MetadataSingleValuePerNode.metadata_title', - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='mapping', - full_name='MetadataSingleValuePerNode.mapping', - index=2, - number=3, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='node_values', - full_name='MetadataSingleValuePerNode.node_values', - index=3, - number=4, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=52, - serialized_end=165, + name='MetadataSingleValuePerNode', + full_name='MetadataSingleValuePerNode', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='metadata_name', full_name='MetadataSingleValuePerNode.metadata_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_title', full_name='MetadataSingleValuePerNode.metadata_title', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mapping', full_name='MetadataSingleValuePerNode.mapping', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='node_values', full_name='MetadataSingleValuePerNode.node_values', index=3, + number=4, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=151, + serialized_end=264, ) + _ALLNODEDATA = _descriptor.Descriptor( - name='AllNodeData', - full_name='AllNodeData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='names', - full_name='AllNodeData.names', - index=0, - number=1, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='x', - full_name='AllNodeData.x', - index=1, - number=2, - type=2, - cpp_type=6, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='y', - full_name='AllNodeData.y', - index=2, - number=3, - type=2, - cpp_type=6, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='countries', - full_name='AllNodeData.countries', - index=3, - number=4, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='lineages', - full_name='AllNodeData.lineages', - index=4, - number=5, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='mutations', - full_name='AllNodeData.mutations', - index=5, - number=6, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='dates', - full_name='AllNodeData.dates', - index=6, - number=7, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='parents', - full_name='AllNodeData.parents', - index=7, - number=8, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='genbanks', - full_name='AllNodeData.genbanks', - index=8, - number=9, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='epi_isl_numbers', - full_name='AllNodeData.epi_isl_numbers', - index=9, - number=10, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='num_tips', - full_name='AllNodeData.num_tips', - index=10, - number=11, - type=5, - cpp_type=1, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='metadata_singles', - full_name='AllNodeData.metadata_singles', - index=11, - number=12, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=168, - serialized_end=437, + name='AllNodeData', + full_name='AllNodeData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='names', full_name='AllNodeData.names', index=0, + number=1, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='x', full_name='AllNodeData.x', index=1, + number=2, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='y', full_name='AllNodeData.y', index=2, + number=3, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='countries', full_name='AllNodeData.countries', index=3, + number=4, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='dates', full_name='AllNodeData.dates', index=4, + number=7, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='lineages', full_name='AllNodeData.lineages', index=5, + number=5, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mutations', full_name='AllNodeData.mutations', index=6, + number=6, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='parents', full_name='AllNodeData.parents', index=7, + number=8, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='genbanks', full_name='AllNodeData.genbanks', index=8, + number=9, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='epi_isl_numbers', full_name='AllNodeData.epi_isl_numbers', index=9, + number=10, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='num_tips', full_name='AllNodeData.num_tips', index=10, + number=11, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_singles', full_name='AllNodeData.metadata_singles', index=11, + number=12, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metadata_unique_strings', full_name='AllNodeData.metadata_unique_strings', index=12, + number=13, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='time_x', full_name='AllNodeData.time_x', index=13, + number=14, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=267, + serialized_end=615, ) + _ALLDATA = _descriptor.Descriptor( - name='AllData', - full_name='AllData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor(name='node_data', - full_name='AllData.node_data', - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='country_mapping', - full_name='AllData.country_mapping', - index=1, - number=2, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='lineage_mapping', - full_name='AllData.lineage_mapping', - index=2, - number=3, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='mutation_mapping', - full_name='AllData.mutation_mapping', - index=3, - number=4, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='date_mapping', - full_name='AllData.date_mapping', - index=4, - number=5, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='tree_description', - full_name='AllData.tree_description', - index=5, - number=6, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - _descriptor.FieldDescriptor(name='tree_title', - full_name='AllData.tree_title', - index=6, - number=7, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=440, - serialized_end=626, + name='AllData', + full_name='AllData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='node_data', full_name='AllData.node_data', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='country_mapping', full_name='AllData.country_mapping', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='lineage_mapping', full_name='AllData.lineage_mapping', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mutation_mapping', full_name='AllData.mutation_mapping', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='date_mapping', full_name='AllData.date_mapping', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='tree_description', full_name='AllData.tree_description', index=5, + number=6, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='tree_title', full_name='AllData.tree_title', index=6, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_search', full_name='AllData.default_search', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_seq_name_field', full_name='AllData.default_seq_name_field', index=8, + number=9, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='default_colourby', full_name='AllData.default_colourby', index=9, + number=10, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=618, + serialized_end=886, ) _ALLNODEDATA.fields_by_name['mutations'].message_type = _MUTATIONLIST -_ALLNODEDATA.fields_by_name[ - 'metadata_singles'].message_type = _METADATASINGLEVALUEPERNODE +_ALLNODEDATA.fields_by_name['metadata_singles'].message_type = _METADATASINGLEVALUEPERNODE +_ALLNODEDATA.fields_by_name['metadata_unique_strings'].message_type = _METADATAUNIQUESTRINGPERNODE _ALLDATA.fields_by_name['node_data'].message_type = _ALLNODEDATA DESCRIPTOR.message_types_by_name['MutationList'] = _MUTATIONLIST -DESCRIPTOR.message_types_by_name[ - 'MetadataSingleValuePerNode'] = _METADATASINGLEVALUEPERNODE +DESCRIPTOR.message_types_by_name['MetadataUniqueStringPerNode'] = _METADATAUNIQUESTRINGPERNODE +DESCRIPTOR.message_types_by_name['MetadataSingleValuePerNode'] = _METADATASINGLEVALUEPERNODE DESCRIPTOR.message_types_by_name['AllNodeData'] = _ALLNODEDATA DESCRIPTOR.message_types_by_name['AllData'] = _ALLDATA _sym_db.RegisterFileDescriptor(DESCRIPTOR) -MutationList = _reflection.GeneratedProtocolMessageType( - 'MutationList', - (_message.Message, ), - { - 'DESCRIPTOR': _MUTATIONLIST, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:MutationList) - }) +MutationList = _reflection.GeneratedProtocolMessageType('MutationList', (_message.Message,), { + 'DESCRIPTOR' : _MUTATIONLIST, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MutationList) + }) _sym_db.RegisterMessage(MutationList) -MetadataSingleValuePerNode = _reflection.GeneratedProtocolMessageType( - 'MetadataSingleValuePerNode', - (_message.Message, ), - { - 'DESCRIPTOR': _METADATASINGLEVALUEPERNODE, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:MetadataSingleValuePerNode) - }) +MetadataUniqueStringPerNode = _reflection.GeneratedProtocolMessageType('MetadataUniqueStringPerNode', (_message.Message,), { + 'DESCRIPTOR' : _METADATAUNIQUESTRINGPERNODE, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MetadataUniqueStringPerNode) + }) +_sym_db.RegisterMessage(MetadataUniqueStringPerNode) + +MetadataSingleValuePerNode = _reflection.GeneratedProtocolMessageType('MetadataSingleValuePerNode', (_message.Message,), { + 'DESCRIPTOR' : _METADATASINGLEVALUEPERNODE, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:MetadataSingleValuePerNode) + }) _sym_db.RegisterMessage(MetadataSingleValuePerNode) -AllNodeData = _reflection.GeneratedProtocolMessageType( - 'AllNodeData', - (_message.Message, ), - { - 'DESCRIPTOR': _ALLNODEDATA, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:AllNodeData) - }) +AllNodeData = _reflection.GeneratedProtocolMessageType('AllNodeData', (_message.Message,), { + 'DESCRIPTOR' : _ALLNODEDATA, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:AllNodeData) + }) _sym_db.RegisterMessage(AllNodeData) -AllData = _reflection.GeneratedProtocolMessageType( - 'AllData', - (_message.Message, ), - { - 'DESCRIPTOR': _ALLDATA, - '__module__': 'taxonium_pb2' - # @@protoc_insertion_point(class_scope:AllData) - }) +AllData = _reflection.GeneratedProtocolMessageType('AllData', (_message.Message,), { + 'DESCRIPTOR' : _ALLDATA, + '__module__' : 'taxonium_pb2' + # @@protoc_insertion_point(class_scope:AllData) + }) _sym_db.RegisterMessage(AllData) + # @@protoc_insertion_point(module_scope) diff --git a/data_processing/usher_processing.py b/data_processing/usher_processing.py index ea19e0e4..2dced7ae 100644 --- a/data_processing/usher_processing.py +++ b/data_processing/usher_processing.py @@ -103,6 +103,7 @@ def __init__(self, tree_file): self.annotate_mutations() self.set_branch_lengths() self.annotate_aa_mutations() + #self.annotate_nuc_mutations() self.expand_condensed_nodes() def annotate_mutations(self): @@ -115,7 +116,7 @@ def set_branch_lengths(self): def annotate_aa_mutations(self): for i, node in tqdm.tqdm(enumerate(self.tree.preorder_node_iter()), - desc="Annotating mutations"): + desc="Annotating AA mutations"): node.aa_subs = [] for mut in node.nuc_mutations.mutation: ref = NUC_ENUM[mut.ref_nuc] @@ -126,6 +127,16 @@ def annotate_aa_mutations(self): if aa_sub: node.aa_subs.append(aa_sub) + def annotate_nuc_mutations(self): + for i, node in tqdm.tqdm(enumerate(self.tree.preorder_node_iter()), + desc="Annotating nucleotide mutations"): + for mut in node.nuc_mutations.mutation: + ref = NUC_ENUM[mut.ref_nuc] + alt = NUC_ENUM[mut.mut_nuc[0]] + par = NUC_ENUM[mut.par_nuc] + aa_style_sub = f"{par}{mut.position}{alt}" + node.aa_subs.append(aa_style_sub) + def expand_condensed_nodes(self): for i, node in tqdm.tqdm(enumerate(self.tree.leaf_nodes()), desc="Expanding condensed nodes"): @@ -155,7 +166,29 @@ def get_condensed_nodes_dict(self, condensed_nodes_dict): f = open("./public-latest.all.masked.pb", "rb") mat = UsherMutationAnnotatedTree(f) +print("Ladderizing tree") mat.tree.ladderize() +print("Writing distance tree") +mat.tree.write(path="./distance.nwk", + schema="newick", + unquoted_underscores=True) + +print("Launching chronumental") +import os + +os.system( + "chronumental --tree distance.nwk --dates ./public-latest.metadata.tsv.gz -s 200" +) + +print("Reading time tree") +time_tree = dendropy.Tree.get(path="./timetree__distance.nwk", schema="newick") +time_tree_iter = time_tree.preorder_node_iter() +for i, node in tqdm.tqdm(enumerate(mat.tree.preorder_node_iter()), + desc="Adding time tree"): + time_tree_node = next(time_tree_iter) + node.time = time_tree_node.edge_length +del time_tree +del time_tree_iter all_ref_muts = set(get_aa_ref(x) for x in range(len(cov2_genome.seq))) all_ref_muts = [x for x in all_ref_muts if x is not None] @@ -188,6 +221,18 @@ def assign_x(tree, current_branch_length=0, current_level=0): assign_x(clade, current_branch_length, current_level) +def assign_x_time(tree, current_branch_length=0, current_level=0): + + by_level[current_level].append(tree) + + if tree.time: + current_branch_length = current_branch_length + tree.time + current_level += 1 + tree.x_time = current_branch_length + for clade in tree.child_nodes(): + assign_x_time(clade, current_branch_length, current_level) + + def assign_terminal_y(terminals): for i, node in enumerate(terminals): node.y = i @@ -203,6 +248,7 @@ def align_parents(tree_by_level): root = mat.tree.seed_node assign_x(root) +assign_x_time(root) terminals = mat.tree.leaf_nodes() assign_terminal_y(terminals) align_parents(by_level) @@ -286,6 +332,7 @@ def make_mapping(list_of_strings): mutation_mapping_list, mutation_mapping_lookup = make_mapping(all_genotypes) xes = [] +time_xes = [] yes = [] parents = [] names = [] @@ -304,6 +351,7 @@ def make_mapping(list_of_strings): for i, x in tqdm.tqdm(enumerate(all_nodes)): xes.append(x.x * 0.2) + time_xes.append(x.x * 0.02) yes.append(x.y / 40000) path_list_rev = x.path_list[::-1] if len(path_list_rev) > 0: @@ -340,6 +388,8 @@ def make_mapping(list_of_strings): epi_isls.append( get_epi_isl(genbank_lookup[name], final_name, date_lookup[name])) +print("D") + country_metadata_obj = taxonium_pb2.MetadataSingleValuePerNode( metadata_name="Country", mapping=country_mapping_list, @@ -349,18 +399,23 @@ def make_mapping(list_of_strings): metadata_name="Lineage", mapping=lineage_mapping_list, node_values=lineages) +print("E") all_node_data = taxonium_pb2.AllNodeData( genbanks=genbanks, names=names, - x=xes, + x=time_xes, + time_x=time_xes, y=yes, dates=dates, - mutations=[taxonium_pb2.MutationList(mutation=x) for x in mutations], + mutations=[ + taxonium_pb2.MutationList(mutation=x) for x in tqdm.tqdm(mutations) + ], parents=parents, num_tips=num_tips, epi_isl_numbers=epi_isls, metadata_singles=[country_metadata_obj, lineage_metadata_obj]) +print("F") all_data = taxonium_pb2.AllData(node_data=all_node_data, mutation_mapping=mutation_mapping_list,