feat: landing page organism viz - data (#289) (#290)

d-callan · dannon · hunterckx · web-flow · commit 6817ece12b0f · 2025-03-21T20:31:59.000-07:00
* feat: add data for landing page organism viz * removing strains from sunburst * add taxonomic rank to taxa tree data * fix up tree data w more organisms * chore: increase feedback while fetching gene model urls (#290) * fix: use a post instead of a giant GET for filtered_subtree * Rebuild genomes, tree * bump: update catalog version to 2.0.0 * Apply suggestions from code review Co-authored-by: Hunter Craft <118154470+hunterckx@users.noreply.github.com> * chore: rebuild catalog --------- Co-authored-by: Dannon Baker <dannon.baker@gmail.com> Co-authored-by: Hunter Craft <118154470+hunterckx@users.noreply.github.com>
diff --git a/catalog/build/intermediate/genomes-from-ncbi.tsv b/catalog/build/intermediate/genomes-from-ncbi.tsv
@@ -71,10 +71,10 @@ Indian Wild Type (Walter Reed)	30069	GCA_000300775.2	False	Scaffold		221324304	2
 P15	658858	GCA_000182665.1	False	Contig		11522052				47.0x	47.0			Giardia intestinalis	5741	1,131567,2759,2611341,207245,5738,5739,68459,5740,5741,658858	Metamonada	Eukaryota			Fornicata		Diplomonadida	Hexamitidae	Giardia	Giardia intestinalis	Giardia lamblia P15	https://genome.ucsc.edu/h/GCA_000182665.1	GCA_000182665.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/182/665/GCA_000182665.1/genes/GCA_000182665.1_ASM18266v1.augustus.gtf.gz
 GS/M clone H7	598745	GCA_000182405.1	False	Contig		11001532				16.0x	47.5			Giardia intestinalis	5741	1,131567,2759,2611341,207245,5738,5739,68459,5740,5741,598745	Metamonada	Eukaryota			Fornicata		Diplomonadida	Hexamitidae	Giardia	Giardia intestinalis	Giardia intestinalis ATCC 50581	https://genome.ucsc.edu/h/GCA_000182405.1	GCA_000182405.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/182/405/GCA_000182405.1/genes/GCA_000182405.1_ASM18240v1.augustus.gtf.gz
 G3	412133	GCA_000002825.3	False	Scaffold		176420065	64769.0	27122.0	1020.0	7.2x	33.0			Trichomonas vaginalis	5722	1,131567,2759,2611341,5719,37104,181550,5721,5722,412133	Metamonada	Eukaryota			Parabasalia		Trichomonadida	Trichomonadidae	Trichomonas	Trichomonas vaginalis		https://genome.ucsc.edu/h/GCA_000002825.3	GCA_000002825.3		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/002/825/GCA_000002825.3/genes/GCA_000002825.3_ASM282v3.augustus.gtf.gz
+MS6	1126212	GCA_000302655.1	False	Contig		48882845				66.0x	52.5			Macrophomina phaseolina	35725	1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715962,147541,159987,451869,45131,35724,35725,1126212	Ascomycota	Eukaryota		Fungi	Ascomycota	Dothideomycetes	Botryosphaeriales	Botryosphaeriaceae	Macrophomina	Macrophomina phaseolina	Macrophomina phaseolina MS6	https://genome.ucsc.edu/h/GCA_000302655.1	GCA_000302655.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/302/655/GCA_000302655.1/genes/GCA_000302655.1_MphMS6_1.0.augustus.gtf.gz
 RP-78	273507	GCA_000167175.1	False	Contig		29842556					57.0			Phanerodontia chrysosporium	2822231	1,131567,2759,33154,4751,451864,5204,5302,155619,355688,5303,396331,2492674,2822231,273507	Basidiomycota	Eukaryota		Fungi	Basidiomycota	Agaricomycetes	Polyporales	Phanerochaetaceae	Phanerodontia	Phanerodontia chrysosporium	Phanerochaete chrysosporium RP-78	https://genome.ucsc.edu/h/GCA_000167175.1	GCA_000167175.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/167/175/GCA_000167175.1/genes/GCA_000167175.1_ASM16717v1.augustus.gtf.gz
 1006PhL	1220926	GCA_000401635.1	False	Scaffold		36348485	470.0	140649.0	82.0	45.0x	39.5			Mucor circinelloides	36080	1,131567,2759,33154,4751,112252,1913637,451507,2212703,4827,1344963,34489,4830,36080,1220926	Mucoromycota	Eukaryota		Fungi	Mucoromycota	Mucoromycetes	Mucorales	Mucoraceae	Mucor	Mucor circinelloides	Mucor circinelloides 1006PhL	https://genome.ucsc.edu/h/GCA_000401635.1	GCA_000401635.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/401/635/GCA_000401635.1/genes/GCA_000401635.1_Muco_sp_1006Ph_V1.augustus.gtf.gz
 H143	544712	GCA_000151035.1	False	Scaffold		38922587	48.0	1740271.0	7.0		41.5			Histoplasma capsulatum	5037	1,131567,2759,33154,4751,451864,4890,716545,147538,716546,147545,451871,33183,299071,5036,5037,544712	Ascomycota	Eukaryota		Fungi	Ascomycota	Eurotiomycetes	Onygenales	Ajellomycetaceae	Histoplasma	Histoplasma capsulatum	Histoplasma capsulatum H143	https://genome.ucsc.edu/h/GCA_000151035.1	GCA_000151035.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/151/035/GCA_000151035.1/genes/GCA_000151035.1_ASM15103v1.augustus.gtf.gz
-MS6	1126212	GCA_000302655.1	False	Contig		48882845				66.0x	52.5			Macrophomina phaseolina	35725	1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715962,147541,159987,451869,45131,35724,35725,1126212	Ascomycota	Eukaryota		Fungi	Ascomycota	Dothideomycetes	Botryosphaeriales	Botryosphaeriaceae	Macrophomina	Macrophomina phaseolina	Macrophomina phaseolina MS6	https://genome.ucsc.edu/h/GCA_000302655.1	GCA_000302655.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/302/655/GCA_000302655.1/genes/GCA_000302655.1_MphMS6_1.0.augustus.gtf.gz
 JEL423	403673	GCA_000149865.1	False	Scaffold		23722384	69.0	1707251.0	5.0	7.4x	39.5			Batrachochytrium dendrobatidis	109871	1,131567,2759,33154,4751,112252,4761,2683659,451435,451442,1142503,100474,109871,403673	Chytridiomycota	Eukaryota		Fungi	Chytridiomycota	Chytridiomycetes	Rhizophydiales		Batrachochytrium	Batrachochytrium dendrobatidis	Batrachochytrium dendrobatidis JEL423	https://genome.ucsc.edu/h/GCA_000149865.1	GCA_000149865.1		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/149/865/GCA_000149865.1/genes/GCA_000149865.1_BD_JEL423.augustus.gtf.gz
 	43151	GCA_000211455.3	False	Contig		136935538				20.0x	48.5			Anopheles darlingi	43151	1,131567,2759,33154,33208,6072,33213,33317,1206794,88770,6656,197563,197562,6960,50557,85512,7496,33340,33392,7147,7148,43786,41827,7157,43816,7164,44543,44545,44546,44552,43151	Arthropoda	Eukaryota		Metazoa	Arthropoda	Insecta	Diptera	Culicidae	Anopheles	Anopheles darlingi		https://genome.ucsc.edu/h/GCA_000211455.3	GCA_000211455.3		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/211/455/GCA_000211455.3/genes/GCA_000211455.3_A_darlingi_v1.augustus.gtf.gz
 1-1 BBBD Race 1	630390	GCA_000151525.2	False	Scaffold		135343689	14818.0	544256.0	68.0	31.0x	46.5			Puccinia triticina	208348	1,131567,2759,33154,4751,451864,5204,29000,162484,5258,5262,5296,208348,630390	Basidiomycota	Eukaryota		Fungi	Basidiomycota	Pucciniomycetes	Pucciniales	Pucciniaceae	Puccinia	Puccinia triticina	Puccinia triticina 1-1 BBBD Race 1	https://genome.ucsc.edu/h/GCA_000151525.2	GCA_000151525.2		https://hgdownload.soe.ucsc.edu/hubs/GCA/000/151/525/GCA_000151525.2/genes/GCA_000151525.2_P_triticina_1_1_V2.augustus.gtf.gz
@@ -754,7 +754,6 @@ aabys	7370	GCF_030504385.1	True	Scaffold		1032604506	339.0	12461067.0	23.0	135.0
 ET1	1227346	GCF_900067095.1	True	Scaffold		45210324	32.0	3311891.0	5.0	100.0x	48.5	Full annotation	GCA_900067095.1	Fusarium proliferatum	948311	1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715989,147550,222543,5125,110618,5506,171627,948311,1227346	Ascomycota	Eukaryota		Fungi	Ascomycota	Sordariomycetes	Hypocreales	Nectriaceae	Fusarium	Fusarium proliferatum	Fusarium proliferatum ET1	https://genome.ucsc.edu/h/GCF_900067095.1	GCA_900067095.1	GCF_900067095.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/900/067/095/GCF_900067095.1/genes/GCF_900067095.1_F._proliferatum_ET1_version_1.ncbiRefSeq.gtf.gz
 	164328	GCF_020800215.1	True	Scaffold		57451392	28.0	3611744.0	6.0	340.0x	54.5	Full annotation	GCA_020800215.1	Phytophthora ramorum	164328	1,131567,2759,2698737,33634,4762,4776,4777,4783,164328	Oomycota	Eukaryota			Oomycota		Peronosporales	Peronosporaceae	Phytophthora	Phytophthora ramorum		https://genome.ucsc.edu/h/GCA_020800215.1	GCA_020800215.1	GCF_020800215.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/020/800/215/GCF_020800215.1/genes/GCF_020800215.1_PR-102_v3_p.ncbiRefSeq.gtf.gz
 GKB4	4785	GCF_018691715.1	True	Scaffold		109702272	133.0	1187988.0	30.0	100.0x	54.0	Full annotation	GCA_018691715.1	Phytophthora cinnamomi	4785	1,131567,2759,2698737,33634,4762,4776,4777,4783,4785	Oomycota	Eukaryota			Oomycota		Peronosporales	Peronosporaceae	Phytophthora	Phytophthora cinnamomi		https://genome.ucsc.edu/h/GCA_018691715.1	GCA_018691715.1	GCF_018691715.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/018/691/715/GCF_018691715.1/genes/GCF_018691715.1_ASM1869171v1.ncbiRefSeq.gtf.gz
-	34620	GCF_023375885.1	True	Scaffold		2485971885	3118.0	56155975.0	12.0	38.0x	46.5	Full annotation	GCA_023375885.2	Dermacentor andersoni	34620	1,131567,2759,33154,33208,6072,33213,33317,1206794,88770,6656,6843,6854,6933,6934,6935,297308,6939,426437,34619,34620	Arthropoda	Eukaryota		Metazoa	Arthropoda	Arachnida	Ixodida	Ixodidae	Dermacentor	Dermacentor andersoni		https://genome.ucsc.edu/h/GCF_023375885.1	GCA_023375885.2	GCF_023375885.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/023/375/885/GCF_023375885.1/genes/GCF_023375885.1_qqDerAnde1.2.ncbiRefSeq.gtf.gz
 BP57	273372	GCF_017655625.1	True	Scaffold		13093275	9.0	2138570.0	3.0	150.0x	38.0	Full annotation	GCA_017655625.1	Candida metapsilosis	273372	1,131567,2759,33154,4751,451864,4890,716545,147537,3239874,2916678,766764,1535325,5475,273372	Ascomycota	Eukaryota		Fungi	Ascomycota	Pichiomycetes	Serinales	Debaryomycetaceae	Candida	Candida metapsilosis		https://genome.ucsc.edu/h/GCA_017655625.1	GCA_017655625.1	GCF_017655625.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/017/655/625/GCF_017655625.1/genes/GCF_017655625.1_BP57.ncbiRefSeq.gtf.gz
 8A	5849	GCF_900005855.1	True	Scaffold		24997804	152.0	1343538.0	7.0	150.0x	18.0	Full annotation	GCA_900005855.1	Plasmodium gallinaceum	5849	1,131567,2759,2698737,33630,5794,422676,5819,1639119,5820,418104,5849	Apicomplexa	Eukaryota			Apicomplexa	Aconoidasida	Haemosporida	Plasmodiidae	Plasmodium	Plasmodium gallinaceum		https://genome.ucsc.edu/h/GCF_900005855.1	GCA_900005855.1	GCF_900005855.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/900/005/855/GCF_900005855.1/genes/GCF_900005855.1_PGAL8A.ncbiRefSeq.gtf.gz
 	5871	GCF_024862765.1	True	Contig		12779071				186.0x	53.5	Full annotation	GCA_024862765.1	Babesia caballi	5871	1,131567,2759,2698737,33630,5794,422676,5863,32594,5864,5871	Apicomplexa	Eukaryota			Apicomplexa	Aconoidasida	Piroplasmida	Babesiidae	Babesia	Babesia caballi		https://genome.ucsc.edu/h/GCF_024862765.1	GCA_024862765.1	GCF_024862765.1	https://hgdownload.soe.ucsc.edu/hubs/GCF/024/862/765/GCF_024862765.1/genes/GCF_024862765.1_Bcaballi_D6B2_v1.0.ncbiRefSeq.gtf.gz
diff --git a/catalog/build/py/build-files-from-ncbi.py b/catalog/build/py/build-files-from-ncbi.py
@@ -7,6 +7,7 @@
 GENOMES_OUTPUT_PATH = "catalog/build/intermediate/genomes-from-ncbi.tsv"
 
 QC_REPORT_PATH = "catalog/output/qc-report.md"
+TREE_OUTPUT_PATH = "catalog/output/ncbi-taxa-tree.json"
 
 TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
   2: "Bacteria",
@@ -43,6 +44,7 @@
     ASSEMBLIES_PATH,
     GENOMES_OUTPUT_PATH,
     UCSC_ASSEMBLIES_URL,
+    TREE_OUTPUT_PATH,
     TAXANOMIC_LEVELS_FOR_TREE,
     {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID},
     qc_report_path=QC_REPORT_PATH
diff --git a/catalog/build/py/package/catalog_build/build.py b/catalog/build/py/package/catalog_build/build.py
@@ -2,6 +2,8 @@
 import yaml
 import requests
 import urllib
+import re
+import json
 import time
 from functools import partial
 from bs4 import BeautifulSoup
@@ -127,15 +129,175 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
   }
 
 
-def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels):
-  species_info = get_batched_ncbi_results(
+def get_species_info(taxonomy_ids):
+  """
+  Fetches species information from NCBI API for the given taxonomy IDs.
+  
+  Duplicate IDs are collapsed with set() and every ID is converted to a string
+  before being handed to get_batched_ncbi_results, which receives a URL-building
+  callback, the ID list, and the "taxa" label.
+  
+  Args:
+    taxonomy_ids: List of taxonomy IDs to fetch information for
+    
+  Returns:
+    List of species information dictionaries from NCBI
+  """
+  # NOTE: the URL lambda below nests double quotes inside a double-quoted
+  # f-string, which is only valid syntax on Python 3.12+ (PEP 701).
+  return get_batched_ncbi_results(
     lambda ids: f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join(ids)}/dataset_report",
     [str(id) for id in set(taxonomy_ids)],
     "taxa"
   )
+
+
+def get_species_df(species_info, taxonomic_group_sets, taxonomic_levels):
+  """
+  Converts species information into a DataFrame.
+  
+  Each entry of species_info becomes one row, built by get_species_row. No
+  network request is made here; the caller supplies the already-fetched data.
+  
+  Args:
+    species_info: List of species information dictionaries from NCBI
+    taxonomic_group_sets: Dictionary of taxonomic group sets
+    taxonomic_levels: List of taxonomic levels to include
+    
+  Returns:
+    DataFrame containing species information
+  """
   return pd.DataFrame([get_species_row(info, taxonomic_group_sets, taxonomic_levels) for info in species_info])
 
 
+def get_species_tree(taxonomy_ids, taxonomic_levels, species_info=None):
+  """
+  Builds a named, nested species tree from taxonomy IDs and taxonomic levels.
+
+  The tree shape comes from a POST to the NCBI filtered_subtree endpoint. Node
+  names and ranks are taken from species_info where possible, and any taxa that
+  are still unnamed are looked up through fetch_taxa_info_in_batches.
+
+  Args:
+    taxonomy_ids: List of taxonomy IDs to include in the tree
+    taxonomic_levels: List of taxonomic levels to include in the tree
+    species_info: Optional pre-fetched species information to avoid additional API calls
+
+  Returns:
+    A nested tree structure of species rooted at taxon "1", or an empty dict
+    when the response contains no edges or nothing survives pruning
+
+  Raises:
+    requests.HTTPError: If NCBI answers the subtree request with an error status
+  """
+  response = requests.post(
+    "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/filtered_subtree",
+    json={"taxons": [str(int(t)) for t in taxonomy_ids], "rank_limits": [t.upper() for t in taxonomic_levels]},
+  )
+  # Fail loudly on an HTTP error instead of reading an error payload as "no edges"
+  # and silently writing an empty tree.
+  response.raise_for_status()
+  species_tree_response = response.json()
+
+  # Build a tree from the response
+  edges = species_tree_response.get("edges", {})
+  # A set keeps the root detection below linear in the number of nodes.
+  child_ids = {str(child) for edge in edges.values() for child in edge.get("visible_children", [])}
+  root_ids = [str(node_id) for node_id in edges if str(node_id) not in child_ids]
+
+  if not root_ids:
+    return {}
+
+  # The NCBI result can contain several roots; hang every extra root under
+  # taxon "1". setdefault covers a response where "1" is missing from the edges
+  # or has no child list yet.
+  root_id = "1"
+  root_children = edges.setdefault(root_id, {}).setdefault("visible_children", [])
+  for root_id_candidate in root_ids:
+    if root_id_candidate != root_id:
+      root_children.append(root_id_candidate)
+
+  # The nested-tree builder tests int(node_id) membership once per node, so give
+  # it a set of ints rather than the original list.
+  requested_ids = {int(t) for t in taxonomy_ids}
+  species_tree = ncbi_tree_to_nested_tree(root_id, edges, requested_ids)
+  if species_tree is None:
+    # The root had no children and was not itself requested.
+    return {}
+
+  # Find the set of all unique tax_ids that need a display name
+  tax_ids = child_ids | set(root_ids)
+
+  # Initialize maps with root node
+  taxon_name_map = {"1": "root"}
+  taxon_rank_map = {"1": "NA"}
+
+  # If we have pre-fetched species_info, use it to populate the name and rank maps
+  if species_info:
+    for info in species_info:
+      taxonomy = info["taxonomy"]
+      tax_id = str(taxonomy["tax_id"])
+      if tax_id in tax_ids:
+        taxon_name_map[tax_id] = taxonomy["current_scientific_name"]["name"]
+        if "rank" in taxonomy:
+          taxon_rank_map[tax_id] = taxonomy["rank"]
+        else:
+          print(f"rank not found for tax_id: {tax_id}")
+
+      # Also extract parent taxa information if available
+      for rank_level, rank_info in taxonomy.get("classification", {}).items():
+        if isinstance(rank_info, dict) and "id" in rank_info and "name" in rank_info:
+          parent_id_str = str(rank_info["id"])
+          if parent_id_str in tax_ids and parent_id_str not in taxon_name_map:
+            taxon_name_map[parent_id_str] = rank_info["name"]
+            taxon_rank_map[parent_id_str] = rank_level
+
+  # Fetch any missing taxa information
+  fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, "missing parent taxa")
+
+  return update_species_tree_names(species_tree, taxon_name_map, taxon_rank_map)
+
+
+def fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, description="taxa"):
+  """
+  Looks up names and ranks for taxa that the name map does not know yet.
+
+  IDs already present in taxon_name_map, and the root ID "1", are skipped. The
+  remaining IDs are requested from the NCBI taxonomy dataset_report endpoint in
+  groups of at most 100, and both maps are filled in from the returned reports.
+
+  Args:
+    tax_ids: Iterable of taxonomy ID strings to resolve
+    taxon_name_map: Dictionary of taxon ID to name, updated in place
+    taxon_rank_map: Dictionary of taxon ID to rank, updated in place
+    description: Label for the taxa being fetched, used in progress messages
+
+  Returns:
+    None (the two maps are mutated)
+  """
+  # Keep requests small to stay within API limitations
+  batch_size = 100
+
+  pending_ids = [tax_id for tax_id in tax_ids if not (tax_id in taxon_name_map or tax_id == "1")]
+  if len(pending_ids) == 0:
+    return
+
+  print(f"Fetching information for {len(pending_ids)} {description}")
+
+  batch_starts = range(0, len(pending_ids), batch_size)
+  batch_count = len(batch_starts)
+  for batch_number, start in enumerate(batch_starts, start=1):
+    batch = pending_ids[start:start + batch_size]
+    print(f"Fetching batch {batch_number} of {batch_count} ({len(batch)} taxa)")
+
+    report_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{','.join(batch)}/dataset_report"
+    reports = get_paginated_ncbi_results(report_url, f"{description} batch {batch_number}")
+
+    for report in reports:
+      taxonomy = report["taxonomy"]
+      tax_id = str(taxonomy["tax_id"])
+      taxon_name_map[tax_id] = taxonomy["current_scientific_name"]["name"]
+      if "rank" not in taxonomy:
+        print(f"rank not found for tax_id: {tax_id}")
+        continue
+      taxon_rank_map[tax_id] = taxonomy["rank"]
+
+
+def ncbi_tree_to_nested_tree(node_id, edges, taxonomy_ids):
+  """
+  Recursively converts the NCBI edge map into a nested dict tree.
+
+  A node is kept when it has at least one visible child listed in edges, or when
+  int(node_id) is one of taxonomy_ids. Children that are not kept are dropped
+  from their parent's "children" list.
+
+  Args:
+    node_id: ID of the node to expand
+    edges: Mapping of node ID string to a dict that may hold "visible_children"
+    taxonomy_ids: Container of integer taxonomy IDs that must stay in the tree
+
+  Returns:
+    A dict with "name", "ncbi_tax_id" and "children" keys, or None when the
+    node is pruned
+  """
+  raw_children = edges.get(str(node_id), {}).get("visible_children", [])
+  # NCBI results can list the same child twice. dict.fromkeys removes the
+  # duplicates while keeping first-seen order, so the generated tree (and the
+  # JSON written from it) is the same on every run; a set of strings would
+  # iterate in a hash-dependent order.
+  children = list(dict.fromkeys(str(child) for child in raw_children))
+
+  if not children and int(node_id) not in taxonomy_ids:
+    return None
+
+  child_trees = [ncbi_tree_to_nested_tree(child, edges, taxonomy_ids) for child in children]
+  return {
+    "name": node_id,
+    "ncbi_tax_id": node_id,
+    "children": [subtree for subtree in child_trees if subtree is not None]
+  }
+
+def update_species_tree_names(tree, taxon_name_map, taxon_rank_map):
+  """
+  Replaces the ID stored under "name" with a display name and adds a "rank" to every node.
+
+  Args:
+    tree: Nested tree node whose "name" holds a taxonomy ID; modified in place
+    taxon_name_map: Dictionary of taxon ID to display name; IDs without an entry keep the ID as name
+    taxon_rank_map: Dictionary of taxon ID to rank; IDs without an entry get "Unknown"
+
+  Returns:
+    The same tree object that was passed in
+  """
+  pending_nodes = [tree]
+  while pending_nodes:
+    node = pending_nodes.pop()
+    tax_id = node["name"]
+    node["rank"] = taxon_rank_map.get(tax_id, "Unknown")
+    node["name"] = taxon_name_map.get(tax_id, tax_id)
+    pending_nodes.extend(node.get("children", []))
+  return tree
+
 def get_genome_row(genome_info):
   refseq_category = genome_info["assembly_info"].get("refseq_category")
   return {
@@ -177,6 +339,7 @@ def get_genomes_and_primarydata_df(accessions):
 
 
 def _id_to_gene_model_url(asm_id: str, session: requests.Session):
+  print(f"finding gene model url for: {asm_id}")
   ucsc_files_endpoint = "https://genome.ucsc.edu/list/files"
   download_base_url = "https://hgdownload.soe.ucsc.edu"
   response = session.get(ucsc_files_endpoint, params={"genome": asm_id})
@@ -415,6 +578,7 @@ def build_files(
   assemblies_path,
   genomes_output_path,
   ucsc_assemblies_url,
+  tree_output_path,
   taxonomic_levels_for_tree, 
   taxonomic_group_sets={},
   do_gene_model_urls=True,
@@ -442,7 +606,11 @@ def build_files(
   
   qc_report_params["missing_ncbi_assemblies"] = report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
 
-  species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets, taxonomic_levels_for_tree)
+  # Fetch species information once to be used by both species_df and species_tree
+  species_info = get_species_info(base_genomes_df["taxonomyId"])
+  
+  # Create species DataFrame using the fetched species_info
+  species_df = get_species_df(species_info, taxonomic_group_sets, taxonomic_levels_for_tree)
 
   report_missing_values_from("species", "found on NCBI", base_genomes_df["taxonomyId"], species_df["taxonomyId"])
 
@@ -471,10 +639,18 @@ def build_files(
 
   if extract_primary_data:
     primarydata_df.to_csv(primary_output_path, index=False, sep="\t")
-
     print(f"Wrote to {primary_output_path}")
   
   if qc_report_path is not None:
     qc_report_text = make_qc_report(**qc_report_params)
     with open(qc_report_path, "w") as file:
       file.write(qc_report_text)
+
+  if len(taxonomic_levels_for_tree) > 0:
+    # Use the taxonomy IDs from the genomes_df to build the species tree
+    # Pass the previously fetched species_info to avoid another API call
+    species_tree = get_species_tree(list(genomes_df["taxonomyId"]), taxonomic_levels_for_tree, species_info)
+    with open(tree_output_path, 'w') as outfile:
+      json.dump(species_tree, outfile, indent=4)
+    print(f"Wrote to {tree_output_path}")
+  
diff --git a/catalog/build/py/package/setup.py b/catalog/build/py/package/setup.py
@@ -2,7 +2,7 @@
 
 setup(
   name="catalog_build",
-  version="1.5.1",
+  version="2.0.0",
   packages=["catalog_build"],
   install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],
 )
diff --git a/catalog/output/ncbi-taxa-tree.json b/catalog/output/ncbi-taxa-tree.json
diff --git a/catalog/output/qc-report.md b/catalog/output/qc-report.md

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`
`3`	`3`	`setup(`
`4`	`4`	`name="catalog_build",`
`5`		`- version="1.5.1",`
	`5`	`+ version="2.0.0",`
`6`	`6`	`packages=["catalog_build"],`
`7`	`7`	`install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],`
`8`	`8`	`)`