Skip to content

Commit 6817ece

Browse files
d-callandannonhunterckx
authored
feat: landing page organism viz - data (#289) (#290)
* feat: add data for landing page organism viz * removing strains from sunburst * add taxonomic rank to taxa tree data * fix up tree data w more organisms * chore: increase feedback while fetching gene model urls (#290) * fix: use a post instead of a giant GET for filtered_subtree * Rebuild genomes, tree * bump: update catalog version to 2.0.0 * Apply suggestions from code review Co-authored-by: Hunter Craft <[email protected]> * chore: rebuild catalog --------- Co-authored-by: Dannon Baker <[email protected]> Co-authored-by: Hunter Craft <[email protected]>
1 parent dacdbb2 commit 6817ece

File tree

6 files changed

+8766
-7
lines changed

6 files changed

+8766
-7
lines changed

catalog/build/intermediate/genomes-from-ncbi.tsv

+1-2
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ Indian Wild Type (Walter Reed) 30069 GCA_000300775.2 False Scaffold 221324304 2
7171
P15 658858 GCA_000182665.1 False Contig 11522052 47.0x 47.0 Giardia intestinalis 5741 1,131567,2759,2611341,207245,5738,5739,68459,5740,5741,658858 Metamonada Eukaryota Fornicata Diplomonadida Hexamitidae Giardia Giardia intestinalis Giardia lamblia P15 https://genome.ucsc.edu/h/GCA_000182665.1 GCA_000182665.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/182/665/GCA_000182665.1/genes/GCA_000182665.1_ASM18266v1.augustus.gtf.gz
7272
GS/M clone H7 598745 GCA_000182405.1 False Contig 11001532 16.0x 47.5 Giardia intestinalis 5741 1,131567,2759,2611341,207245,5738,5739,68459,5740,5741,598745 Metamonada Eukaryota Fornicata Diplomonadida Hexamitidae Giardia Giardia intestinalis Giardia intestinalis ATCC 50581 https://genome.ucsc.edu/h/GCA_000182405.1 GCA_000182405.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/182/405/GCA_000182405.1/genes/GCA_000182405.1_ASM18240v1.augustus.gtf.gz
7373
G3 412133 GCA_000002825.3 False Scaffold 176420065 64769.0 27122.0 1020.0 7.2x 33.0 Trichomonas vaginalis 5722 1,131567,2759,2611341,5719,37104,181550,5721,5722,412133 Metamonada Eukaryota Parabasalia Trichomonadida Trichomonadidae Trichomonas Trichomonas vaginalis https://genome.ucsc.edu/h/GCA_000002825.3 GCA_000002825.3 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/002/825/GCA_000002825.3/genes/GCA_000002825.3_ASM282v3.augustus.gtf.gz
74+
MS6 1126212 GCA_000302655.1 False Contig 48882845 66.0x 52.5 Macrophomina phaseolina 35725 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715962,147541,159987,451869,45131,35724,35725,1126212 Ascomycota Eukaryota Fungi Ascomycota Dothideomycetes Botryosphaeriales Botryosphaeriaceae Macrophomina Macrophomina phaseolina Macrophomina phaseolina MS6 https://genome.ucsc.edu/h/GCA_000302655.1 GCA_000302655.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/302/655/GCA_000302655.1/genes/GCA_000302655.1_MphMS6_1.0.augustus.gtf.gz
7475
RP-78 273507 GCA_000167175.1 False Contig 29842556 57.0 Phanerodontia chrysosporium 2822231 1,131567,2759,33154,4751,451864,5204,5302,155619,355688,5303,396331,2492674,2822231,273507 Basidiomycota Eukaryota Fungi Basidiomycota Agaricomycetes Polyporales Phanerochaetaceae Phanerodontia Phanerodontia chrysosporium Phanerochaete chrysosporium RP-78 https://genome.ucsc.edu/h/GCA_000167175.1 GCA_000167175.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/167/175/GCA_000167175.1/genes/GCA_000167175.1_ASM16717v1.augustus.gtf.gz
7576
1006PhL 1220926 GCA_000401635.1 False Scaffold 36348485 470.0 140649.0 82.0 45.0x 39.5 Mucor circinelloides 36080 1,131567,2759,33154,4751,112252,1913637,451507,2212703,4827,1344963,34489,4830,36080,1220926 Mucoromycota Eukaryota Fungi Mucoromycota Mucoromycetes Mucorales Mucoraceae Mucor Mucor circinelloides Mucor circinelloides 1006PhL https://genome.ucsc.edu/h/GCA_000401635.1 GCA_000401635.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/401/635/GCA_000401635.1/genes/GCA_000401635.1_Muco_sp_1006Ph_V1.augustus.gtf.gz
7677
H143 544712 GCA_000151035.1 False Scaffold 38922587 48.0 1740271.0 7.0 41.5 Histoplasma capsulatum 5037 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,147545,451871,33183,299071,5036,5037,544712 Ascomycota Eukaryota Fungi Ascomycota Eurotiomycetes Onygenales Ajellomycetaceae Histoplasma Histoplasma capsulatum Histoplasma capsulatum H143 https://genome.ucsc.edu/h/GCA_000151035.1 GCA_000151035.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/151/035/GCA_000151035.1/genes/GCA_000151035.1_ASM15103v1.augustus.gtf.gz
77-
MS6 1126212 GCA_000302655.1 False Contig 48882845 66.0x 52.5 Macrophomina phaseolina 35725 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715962,147541,159987,451869,45131,35724,35725,1126212 Ascomycota Eukaryota Fungi Ascomycota Dothideomycetes Botryosphaeriales Botryosphaeriaceae Macrophomina Macrophomina phaseolina Macrophomina phaseolina MS6 https://genome.ucsc.edu/h/GCA_000302655.1 GCA_000302655.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/302/655/GCA_000302655.1/genes/GCA_000302655.1_MphMS6_1.0.augustus.gtf.gz
7878
JEL423 403673 GCA_000149865.1 False Scaffold 23722384 69.0 1707251.0 5.0 7.4x 39.5 Batrachochytrium dendrobatidis 109871 1,131567,2759,33154,4751,112252,4761,2683659,451435,451442,1142503,100474,109871,403673 Chytridiomycota Eukaryota Fungi Chytridiomycota Chytridiomycetes Rhizophydiales Batrachochytrium Batrachochytrium dendrobatidis Batrachochytrium dendrobatidis JEL423 https://genome.ucsc.edu/h/GCA_000149865.1 GCA_000149865.1 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/149/865/GCA_000149865.1/genes/GCA_000149865.1_BD_JEL423.augustus.gtf.gz
7979
43151 GCA_000211455.3 False Contig 136935538 20.0x 48.5 Anopheles darlingi 43151 1,131567,2759,33154,33208,6072,33213,33317,1206794,88770,6656,197563,197562,6960,50557,85512,7496,33340,33392,7147,7148,43786,41827,7157,43816,7164,44543,44545,44546,44552,43151 Arthropoda Eukaryota Metazoa Arthropoda Insecta Diptera Culicidae Anopheles Anopheles darlingi https://genome.ucsc.edu/h/GCA_000211455.3 GCA_000211455.3 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/211/455/GCA_000211455.3/genes/GCA_000211455.3_A_darlingi_v1.augustus.gtf.gz
8080
1-1 BBBD Race 1 630390 GCA_000151525.2 False Scaffold 135343689 14818.0 544256.0 68.0 31.0x 46.5 Puccinia triticina 208348 1,131567,2759,33154,4751,451864,5204,29000,162484,5258,5262,5296,208348,630390 Basidiomycota Eukaryota Fungi Basidiomycota Pucciniomycetes Pucciniales Pucciniaceae Puccinia Puccinia triticina Puccinia triticina 1-1 BBBD Race 1 https://genome.ucsc.edu/h/GCA_000151525.2 GCA_000151525.2 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/151/525/GCA_000151525.2/genes/GCA_000151525.2_P_triticina_1_1_V2.augustus.gtf.gz
@@ -754,7 +754,6 @@ aabys 7370 GCF_030504385.1 True Scaffold 1032604506 339.0 12461067.0 23.0 135.0
754754
ET1 1227346 GCF_900067095.1 True Scaffold 45210324 32.0 3311891.0 5.0 100.0x 48.5 Full annotation GCA_900067095.1 Fusarium proliferatum 948311 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715989,147550,222543,5125,110618,5506,171627,948311,1227346 Ascomycota Eukaryota Fungi Ascomycota Sordariomycetes Hypocreales Nectriaceae Fusarium Fusarium proliferatum Fusarium proliferatum ET1 https://genome.ucsc.edu/h/GCF_900067095.1 GCA_900067095.1 GCF_900067095.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/067/095/GCF_900067095.1/genes/GCF_900067095.1_F._proliferatum_ET1_version_1.ncbiRefSeq.gtf.gz
755755
164328 GCF_020800215.1 True Scaffold 57451392 28.0 3611744.0 6.0 340.0x 54.5 Full annotation GCA_020800215.1 Phytophthora ramorum 164328 1,131567,2759,2698737,33634,4762,4776,4777,4783,164328 Oomycota Eukaryota Oomycota Peronosporales Peronosporaceae Phytophthora Phytophthora ramorum https://genome.ucsc.edu/h/GCA_020800215.1 GCA_020800215.1 GCF_020800215.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/020/800/215/GCF_020800215.1/genes/GCF_020800215.1_PR-102_v3_p.ncbiRefSeq.gtf.gz
756756
GKB4 4785 GCF_018691715.1 True Scaffold 109702272 133.0 1187988.0 30.0 100.0x 54.0 Full annotation GCA_018691715.1 Phytophthora cinnamomi 4785 1,131567,2759,2698737,33634,4762,4776,4777,4783,4785 Oomycota Eukaryota Oomycota Peronosporales Peronosporaceae Phytophthora Phytophthora cinnamomi https://genome.ucsc.edu/h/GCA_018691715.1 GCA_018691715.1 GCF_018691715.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/018/691/715/GCF_018691715.1/genes/GCF_018691715.1_ASM1869171v1.ncbiRefSeq.gtf.gz
757-
34620 GCF_023375885.1 True Scaffold 2485971885 3118.0 56155975.0 12.0 38.0x 46.5 Full annotation GCA_023375885.2 Dermacentor andersoni 34620 1,131567,2759,33154,33208,6072,33213,33317,1206794,88770,6656,6843,6854,6933,6934,6935,297308,6939,426437,34619,34620 Arthropoda Eukaryota Metazoa Arthropoda Arachnida Ixodida Ixodidae Dermacentor Dermacentor andersoni https://genome.ucsc.edu/h/GCF_023375885.1 GCA_023375885.2 GCF_023375885.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/023/375/885/GCF_023375885.1/genes/GCF_023375885.1_qqDerAnde1.2.ncbiRefSeq.gtf.gz
758757
BP57 273372 GCF_017655625.1 True Scaffold 13093275 9.0 2138570.0 3.0 150.0x 38.0 Full annotation GCA_017655625.1 Candida metapsilosis 273372 1,131567,2759,33154,4751,451864,4890,716545,147537,3239874,2916678,766764,1535325,5475,273372 Ascomycota Eukaryota Fungi Ascomycota Pichiomycetes Serinales Debaryomycetaceae Candida Candida metapsilosis https://genome.ucsc.edu/h/GCA_017655625.1 GCA_017655625.1 GCF_017655625.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/017/655/625/GCF_017655625.1/genes/GCF_017655625.1_BP57.ncbiRefSeq.gtf.gz
759758
8A 5849 GCF_900005855.1 True Scaffold 24997804 152.0 1343538.0 7.0 150.0x 18.0 Full annotation GCA_900005855.1 Plasmodium gallinaceum 5849 1,131567,2759,2698737,33630,5794,422676,5819,1639119,5820,418104,5849 Apicomplexa Eukaryota Apicomplexa Aconoidasida Haemosporida Plasmodiidae Plasmodium Plasmodium gallinaceum https://genome.ucsc.edu/h/GCF_900005855.1 GCA_900005855.1 GCF_900005855.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/005/855/GCF_900005855.1/genes/GCF_900005855.1_PGAL8A.ncbiRefSeq.gtf.gz
760759
5871 GCF_024862765.1 True Contig 12779071 186.0x 53.5 Full annotation GCA_024862765.1 Babesia caballi 5871 1,131567,2759,2698737,33630,5794,422676,5863,32594,5864,5871 Apicomplexa Eukaryota Apicomplexa Aconoidasida Piroplasmida Babesiidae Babesia Babesia caballi https://genome.ucsc.edu/h/GCF_024862765.1 GCA_024862765.1 GCF_024862765.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/024/862/765/GCF_024862765.1/genes/GCF_024862765.1_Bcaballi_D6B2_v1.0.ncbiRefSeq.gtf.gz

catalog/build/py/build-files-from-ncbi.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
GENOMES_OUTPUT_PATH = "catalog/build/intermediate/genomes-from-ncbi.tsv"
88

99
QC_REPORT_PATH = "catalog/output/qc-report.md"
10+
TREE_OUTPUT_PATH = "catalog/output/ncbi-taxa-tree.json"
1011

1112
TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
1213
2: "Bacteria",
@@ -43,6 +44,7 @@
4344
ASSEMBLIES_PATH,
4445
GENOMES_OUTPUT_PATH,
4546
UCSC_ASSEMBLIES_URL,
47+
TREE_OUTPUT_PATH,
4648
TAXANOMIC_LEVELS_FOR_TREE,
4749
{"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID},
4850
qc_report_path=QC_REPORT_PATH

catalog/build/py/package/catalog_build/build.py

+180-4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import yaml
33
import requests
44
import urllib
5+
import re
6+
import json
57
import time
68
from functools import partial
79
from bs4 import BeautifulSoup
@@ -127,15 +129,175 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
127129
}
128130

129131

130-
def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels):
131-
species_info = get_batched_ncbi_results(
132+
def get_species_info(taxonomy_ids):
    """
    Fetches species information from NCBI API for the given taxonomy IDs.

    Duplicate IDs are collapsed and every ID is converted to a string before
    being handed to get_batched_ncbi_results together with a URL builder for
    the taxonomy dataset_report endpoint.

    Args:
        taxonomy_ids: Iterable of taxonomy IDs to fetch information for

    Returns:
        The result of get_batched_ncbi_results for the "taxa" request
        (the callers in this module iterate it as a list of species
        information dictionaries)
    """
    def build_report_url(ids):
        # The join separator uses single quotes on purpose: reusing the
        # enclosing double quote inside an f-string expression is only valid
        # syntax on Python 3.12+.
        return f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{','.join(ids)}/dataset_report"

    # set() removes repeated IDs so each taxon is requested once.
    unique_ids = [str(taxonomy_id) for taxonomy_id in set(taxonomy_ids)]
    return get_batched_ncbi_results(build_report_url, unique_ids, "taxa")
147+
148+
149+
def get_species_df(species_info, taxonomic_group_sets, taxonomic_levels):
    """
    Builds a DataFrame with one row per entry of species_info.

    Args:
        species_info: Iterable of species information dictionaries from NCBI
        taxonomic_group_sets: Dictionary of taxonomic group sets, forwarded to get_species_row
        taxonomic_levels: List of taxonomic levels, forwarded to get_species_row

    Returns:
        DataFrame whose rows are the get_species_row results, in input order
    """
    rows = []
    for taxon_info in species_info:
        rows.append(get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels))
    return pd.DataFrame(rows)
137162

138163

164+
def get_species_tree(taxonomy_ids, taxonomic_levels, species_info=None):
    """
    Builds a species tree from taxonomy IDs and taxonomic levels.

    The tree shape comes from NCBI's filtered_subtree endpoint (sent as a POST
    so the ID list travels in the request body). Node names and ranks are taken
    from species_info where possible; anything still unnamed is fetched with
    fetch_taxa_info_in_batches.

    Args:
        taxonomy_ids: List of taxonomy IDs to include in the tree
        taxonomic_levels: List of taxonomic levels to include in the tree
        species_info: Optional pre-fetched species information to avoid additional API calls

    Returns:
        A nested tree structure of species rooted at taxon "1", or an empty
        dict when NCBI returns no edges or nothing survives pruning

    Raises:
        requests.HTTPError: If the filtered_subtree request returns an error status
    """
    response = requests.post(
        "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/filtered_subtree",
        json={
            "taxons": [str(int(t)) for t in taxonomy_ids],
            "rank_limits": [level.upper() for level in taxonomic_levels],
        },
    )
    # Without this an error body would parse as "no edges" and an empty tree
    # would be written out silently.
    response.raise_for_status()
    edges = response.json().get("edges", {})

    # Every node that is somebody's child; any other key of `edges` is a root.
    child_ids = {
        str(child)
        for edge in edges.values()
        for child in edge.get("visible_children", [])
    }
    root_ids = [str(node_id) for node_id in edges if str(node_id) not in child_ids]

    if not root_ids:
        return {}

    # this bc the ncbi result is odd, multi-root: hang every extra root under
    # taxon "1". setdefault covers responses that have no entry (or no
    # "visible_children" list) for "1", which previously raised KeyError.
    root_id = "1"
    root_children = edges.setdefault(root_id, {}).setdefault("visible_children", [])
    for root_id_candidate in root_ids:
        if root_id_candidate != root_id:
            root_children.append(root_id_candidate)

    # A set keeps the per-node "is this a requested taxon" check O(1).
    requested_ids = {int(t) for t in taxonomy_ids}
    species_tree = ncbi_tree_to_nested_tree(root_id, edges, requested_ids)
    if species_tree is None:
        # The root had no children and was not itself requested.
        return {}

    # All unique tax_ids that may need a display name
    tax_ids = child_ids | set(root_ids)

    # Initialize maps with root node
    taxon_name_map = {"1": "root"}
    taxon_rank_map = {"1": "NA"}

    # If we have pre-fetched species_info, use it to populate the name and rank maps
    for info in species_info or []:
        taxonomy = info["taxonomy"]
        tax_id = str(taxonomy["tax_id"])
        if tax_id in tax_ids:
            taxon_name_map[tax_id] = taxonomy["current_scientific_name"]["name"]
            if "rank" in taxonomy:
                taxon_rank_map[tax_id] = taxonomy["rank"]
            else:
                print(f"rank not found for tax_id: {tax_id}")

        # Also extract parent taxa information if available.
        # NOTE(review): the classification key is stored verbatim as the rank;
        # whether its casing matches taxonomy["rank"] is not visible here - verify.
        for rank_level, rank_info in taxonomy.get("classification", {}).items():
            if isinstance(rank_info, dict) and "id" in rank_info and "name" in rank_info:
                parent_id = str(rank_info["id"])
                if parent_id in tax_ids and parent_id not in taxon_name_map:
                    taxon_name_map[parent_id] = rank_info["name"]
                    taxon_rank_map[parent_id] = rank_level

    # Fetch any missing taxa information
    fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, "missing parent taxa")

    return update_species_tree_names(species_tree, taxon_name_map, taxon_rank_map)
236+
237+
238+
def fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, description="taxa"):
    """
    Fetches taxonomic information in batches and updates the provided name and rank maps.

    IDs are normalised to strings and de-duplicated first. The root ("1") and
    IDs already present in taxon_name_map are never requested.

    Args:
        tax_ids: List or set of taxonomy IDs to fetch (strings or integers)
        taxon_name_map: Dictionary to update with taxon ID to name mappings
        taxon_rank_map: Dictionary to update with taxon ID to rank mappings
        description: Description of the taxa being fetched for logging

    Returns:
        None (updates the provided maps in-place)
    """
    # Both maps are keyed by string IDs and the URL is built with str.join, so
    # an integer ID would neither match an existing entry nor join cleanly.
    # dict.fromkeys drops repeats while keeping first-seen order.
    missing_tax_ids = [
        tid
        for tid in dict.fromkeys(str(raw_id) for raw_id in tax_ids)
        if tid != "1" and tid not in taxon_name_map
    ]

    if not missing_tax_ids:
        return

    print(f"Fetching information for {len(missing_tax_ids)} {description}")

    # Process in batches of 100 to avoid API limitations
    batch_size = 100
    total_batches = (len(missing_tax_ids) + batch_size - 1) // batch_size
    for batch_number, start in enumerate(range(0, len(missing_tax_ids), batch_size), start=1):
        batch = missing_tax_ids[start:start + batch_size]
        print(f"Fetching batch {batch_number} of {total_batches} ({len(batch)} taxa)")

        taxa_info = get_paginated_ncbi_results(
            f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{','.join(batch)}/dataset_report",
            f"{description} batch {batch_number}"
        )

        for report in taxa_info:
            taxonomy = report["taxonomy"]
            tax_id = str(taxonomy["tax_id"])
            taxon_name_map[tax_id] = taxonomy["current_scientific_name"]["name"]
            if "rank" in taxonomy:
                taxon_rank_map[tax_id] = taxonomy["rank"]
            else:
                print(f"rank not found for tax_id: {tax_id}")
277+
278+
279+
def ncbi_tree_to_nested_tree(node_id, edges, taxonomy_ids):
    """
    Converts NCBI's flat edge map into a nested tree rooted at node_id.

    A node is kept when it has visible children or is one of the requested
    taxonomy IDs; any other node is pruned.

    Args:
        node_id: ID of the node to build the subtree for
        edges: Mapping of node ID (string) to a dict that may hold a "visible_children" list
        taxonomy_ids: Collection of integer taxonomy IDs that must be kept even as leaves

    Returns:
        A dict with "name", "ncbi_tax_id" (both node_id) and "children"
        (list of kept child subtrees), or None when the node is pruned
    """
    raw_children = edges.get(str(node_id), {}).get("visible_children", [])
    # ncbi results odd again, dup children. dict.fromkeys removes the repeats
    # but keeps first-seen order; a set of strings iterates in an order that
    # changes with hash randomisation, which made the output differ per run.
    children = list(dict.fromkeys(str(child) for child in raw_children))

    if not children and int(node_id) not in taxonomy_ids:
        return None

    child_trees = []
    for child in children:
        subtree = ncbi_tree_to_nested_tree(child, edges, taxonomy_ids)
        if subtree is not None:
            child_trees.append(subtree)

    return {
        "name": node_id,
        "ncbi_tax_id": node_id,
        "children": child_trees
    }
292+
293+
def update_species_tree_names(tree, taxon_name_map, taxon_rank_map):
    """
    Replaces the ID stored in each node's "name" with its display name and adds a "rank".

    The tree is modified in place. A node whose ID is missing from
    taxon_name_map keeps its current name; one missing from taxon_rank_map
    gets the rank "Unknown".

    Args:
        tree: Nested tree node with a "name" and an optional "children" list
        taxon_name_map: Dictionary of taxon ID to display name
        taxon_rank_map: Dictionary of taxon ID to rank

    Returns:
        The same tree object that was passed in
    """
    pending = [tree]
    while pending:
        node = pending.pop()
        tax_id = node["name"]
        # Look up the rank before the name is overwritten: both maps are keyed by the ID.
        node["rank"] = taxon_rank_map.get(tax_id, "Unknown")
        node["name"] = taxon_name_map.get(tax_id, tax_id)
        pending.extend(node.get("children", []))
    return tree
300+
139301
def get_genome_row(genome_info):
140302
refseq_category = genome_info["assembly_info"].get("refseq_category")
141303
return {
@@ -177,6 +339,7 @@ def get_genomes_and_primarydata_df(accessions):
177339

178340

179341
def _id_to_gene_model_url(asm_id: str, session: requests.Session):
342+
print(f"finding gene model url for: {asm_id}")
180343
ucsc_files_endpoint = "https://genome.ucsc.edu/list/files"
181344
download_base_url = "https://hgdownload.soe.ucsc.edu"
182345
response = session.get(ucsc_files_endpoint, params={"genome": asm_id})
@@ -415,6 +578,7 @@ def build_files(
415578
assemblies_path,
416579
genomes_output_path,
417580
ucsc_assemblies_url,
581+
tree_output_path,
418582
taxonomic_levels_for_tree,
419583
taxonomic_group_sets={},
420584
do_gene_model_urls=True,
@@ -442,7 +606,11 @@ def build_files(
442606

443607
qc_report_params["missing_ncbi_assemblies"] = report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
444608

445-
species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets, taxonomic_levels_for_tree)
609+
# Fetch species information once to be used by both species_df and species_tree
610+
species_info = get_species_info(base_genomes_df["taxonomyId"])
611+
612+
# Create species DataFrame using the fetched species_info
613+
species_df = get_species_df(species_info, taxonomic_group_sets, taxonomic_levels_for_tree)
446614

447615
report_missing_values_from("species", "found on NCBI", base_genomes_df["taxonomyId"], species_df["taxonomyId"])
448616

@@ -471,10 +639,18 @@ def build_files(
471639

472640
if extract_primary_data:
473641
primarydata_df.to_csv(primary_output_path, index=False, sep="\t")
474-
475642
print(f"Wrote to {primary_output_path}")
476643

477644
if qc_report_path is not None:
478645
qc_report_text = make_qc_report(**qc_report_params)
479646
with open(qc_report_path, "w") as file:
480647
file.write(qc_report_text)
648+
649+
if len(taxonomic_levels_for_tree) > 0:
650+
# Use the taxonomy IDs from the genomes_df to build the species tree
651+
# Pass the previously fetched species_info to avoid another API call
652+
species_tree = get_species_tree(list(genomes_df["taxonomyId"]), taxonomic_levels_for_tree, species_info)
653+
with open(tree_output_path, 'w') as outfile:
654+
json.dump(species_tree, outfile, indent=4)
655+
print(f"Wrote to {tree_output_path}")
656+

catalog/build/py/package/setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="catalog_build",
5-
version="1.5.1",
5+
version="2.0.0",
66
packages=["catalog_build"],
77
install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],
88
)

0 commit comments

Comments
 (0)