Skip to content

Commit

Permalink
Merge pull request #205 from nextstrain/feat/dataset-capabilities
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Jun 4, 2024
2 parents 783b312 + 573507d commit 23f88ed
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 5 deletions.
96 changes: 96 additions & 0 deletions data_output/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 43,
"customClades": {
"Nextclade_pango": 2969,
"partiallyAliased": 2969,
"clade_nextstrain": 43,
"clade_who": 13,
"clade_display": 43
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -168,6 +176,14 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 43,
"customClades": {
"Nextclade_pango": 2969,
"partiallyAliased": 2969,
"clade_nextstrain": 43,
"clade_who": 13,
"clade_display": 43
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -267,6 +283,14 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 20,
"customClades": {
"Nextclade_pango": 2232,
"partiallyAliased": 2232,
"clade_nextstrain": 20,
"clade_who": 4,
"clade_display": 20
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -364,6 +388,14 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 20,
"customClades": {
"Nextclade_pango": 2232,
"partiallyAliased": 2232,
"clade_nextstrain": 20,
"clade_who": 4,
"clade_display": 20
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -461,6 +493,14 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 20,
"customClades": {
"Nextclade_pango": 2232,
"partiallyAliased": 2232,
"clade_nextstrain": 20,
"clade_who": 4,
"clade_display": 20
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -561,6 +601,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 21,
"customClades": {
"short-clade": 15,
"subclade": 21
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -644,6 +689,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 16,
"customClades": {
"short-clade": 13,
"subclade": 21
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -726,6 +776,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 23,
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -807,6 +858,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 37,
"customClades": {
"subclade": 42,
"short-clade": 37
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -898,6 +954,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 30,
"customClades": {
"subclade": 36,
"short-clade": 30
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -989,6 +1050,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 17,
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1072,6 +1134,10 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 8,
"customClades": {
"subclade": 22
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1154,6 +1220,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 19,
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1237,6 +1304,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 3,
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1318,6 +1386,10 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 24,
"customClades": {
"G_clade": 15
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1396,6 +1468,10 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 17,
"customClades": {
"G_clade": 9
},
"qc": [
"privateMutations",
"mixedSites",
Expand Down Expand Up @@ -1452,6 +1528,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 5,
"customClades": {
"outbreak": 1,
"lineage": 33
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -1508,6 +1589,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 2,
"customClades": {
"outbreak": 1,
"lineage": 33
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -1565,6 +1651,11 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 2,
"customClades": {
"outbreak": 1,
"lineage": 25
},
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -2469,6 +2560,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 33,
"qc": [
"frameShifts",
"mixedSites",
Expand Down Expand Up @@ -2525,6 +2617,7 @@
"treeJson": "tree.json"
},
"capabilities": {
"clades": 100,
"qc": [
"frameShifts",
"missingData",
Expand Down Expand Up @@ -2572,6 +2665,7 @@
"examples": "example_sequences.fasta"
},
"capabilities": {
"clades": 11,
"qc": [
"frameShifts",
"mixedSites",
Expand Down Expand Up @@ -2621,6 +2715,7 @@
"examples": "example_sequences.fasta"
},
"capabilities": {
"clades": 57,
"qc": [
"frameShifts",
"mixedSites",
Expand Down Expand Up @@ -2670,6 +2765,7 @@
"examples": "example_sequences.fasta"
},
"capabilities": {
"clades": 10,
"qc": [
"frameShifts",
"mixedSites",
Expand Down
58 changes: 53 additions & 5 deletions scripts/rebuild
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from os.path import dirname, realpath, join, relpath, isfile

from lib.changelog import changelog_prepare, changelog_get_unreleased_section
from lib.container import dict_get, dict_get_required, find_index_by, first, format_list, \
dict_remove_many, find_duplicates, dict_cleanup, find
dict_remove_many, find_duplicates, dict_cleanup, find, unique
from lib.date import now_iso, iso_to_iso_safe
from lib.fasta import fasta_read_exactly_one_seq
from lib.fs import json_read, find_files, json_write, copy, make_zip, file_write, rmrf
Expand All @@ -33,16 +33,19 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str):
filepath = join(dataset_dir, filename)
if not isfile(filepath):
raise FileNotFoundError(
f"'Filename '{filename}' is declared in `.files.{name}` field of pathogen.json, but the actual file is not found: '{filepath}'")
f"'Filename '{filename}' is declared in `.files.{name}` field of pathogen.json, but the actual file is not "
f"found: '{filepath}'")

other = []

tree_filename = dict_get(pathogen_json, ["files", "tree"])
tree_filename = dict_get(pathogen_json, ["files", "treeJson"])
tree_json_path = join(dataset_dir, tree_filename) if tree_filename else None
clades = []
custom_clades = {}
if tree_json_path is not None and isfile(tree_json_path):
tree_json = json_read(tree_json_path)
if dict_get(tree_json, ["extensions", "nextclade", "clade_node_attrs"]) is not None:
other.append("customClades")
clades = tree_find_clades(tree_json)
custom_clades = tree_find_clade_like_attrs(tree_json)

if dict_get(pathogen_json, ["mutLabels"]) is not None:
other.append("mutLabels")
Expand All @@ -58,13 +61,58 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str):
if dict_get(q, ["enabled"]):
qc.append(k)

custom_clades = dict_cleanup({attr: len(values) for attr, values in custom_clades.items() if len(values) > 0})

return dict_cleanup({
"clades": len(clades) if len(clades) > 0 else None,
"customClades": custom_clades,
"qc": qc,
"primers": True if len(dict_get(pathogen_json, ["primers"]) or []) > 0 else None,
"other": other
})


def tree_find_clades(auspice_json):
    """Return the sorted, de-duplicated clade names found in an Auspice tree.

    Every node reachable from ``auspice_json["tree"]`` through ``children``
    is visited and its ``node_attrs.clade_membership.value`` is collected.
    Nodes that lack the attribute, or whose value is falsy (e.g. an empty
    string), are ignored.

    Args:
        auspice_json: Parsed Auspice JSON. Must contain a ``"tree"`` key
            holding the root node (a dict).

    Returns:
        A sorted list of the distinct clade names.

    Raises:
        KeyError: If ``auspice_json`` has no ``"tree"`` key.
    """
    clades = set()

    # Walk the tree with an explicit stack instead of recursion: the depth of
    # a phylogenetic tree is not bounded, and a ladder-like tree deeper than
    # the interpreter's recursion limit would raise RecursionError.
    pending = [auspice_json["tree"]]
    while pending:
        node = pending.pop()

        clade_membership = node.get('node_attrs', {}).get('clade_membership', {}).get('value')
        if clade_membership:
            clades.add(clade_membership)

        pending.extend(node.get('children', []))

    # Visiting order does not matter because the result is sorted.
    return sorted(clades)


def tree_find_clade_like_attrs(auspice_json):
    """Collect the distinct values of each clade-like node attribute in an Auspice tree.

    The attribute names are taken from the ``"name"`` field of every entry in
    ``meta.extensions.nextclade.clade_node_attrs``. Every node reachable from
    ``auspice_json["tree"]`` through ``children`` is visited, and for each of
    those attribute names the node's ``node_attrs.<name>.value`` is collected
    when it is not ``None``.

    Args:
        auspice_json: Parsed Auspice JSON. Must contain a ``"tree"`` key
            holding the root node (a dict).

    Returns:
        A dict mapping attribute name to the sorted list of its distinct
        values. Attributes for which no value was found are omitted, so the
        dict is empty when the tree declares no clade-like attributes.

    Raises:
        KeyError: If ``auspice_json`` has no ``"tree"`` key, or if an entry of
            ``clade_node_attrs`` has no ``"name"`` key.
    """
    clade_node_attrs = dict_get(auspice_json, ["meta", "extensions", "nextclade", "clade_node_attrs"]) or []
    attr_names = [attr["name"] for attr in clade_node_attrs]
    values_by_attr = {attr: set() for attr in attr_names}

    # Walk the tree with an explicit stack instead of recursion: the depth of
    # a phylogenetic tree is not bounded, and a ladder-like tree deeper than
    # the interpreter's recursion limit would raise RecursionError.
    pending = [auspice_json["tree"]]
    while pending:
        node = pending.pop()

        node_attrs = node.get('node_attrs', {})
        for attr in attr_names:
            attr_value = node_attrs.get(attr, {}).get('value')
            # Unlike `clade_membership`, only None is skipped here; other
            # falsy values (e.g. an empty string) are counted as values.
            if attr_value is not None:
                values_by_attr[attr].add(attr_value)

        pending.extend(node.get('children', []))

    # Visiting order does not matter because each value list is sorted.
    return {attr: sorted(values) for attr, values in values_by_attr.items() if values}


def dataset_get_versions(dataset):
versions = dict_get(dataset, ["versions"]) or []
versions = list(filter(lambda version: version["tag"] != "unreleased", versions))
Expand Down

0 comments on commit 23f88ed

Please sign in to comment.