|
2 | 2 | import yaml
|
3 | 3 | import requests
|
4 | 4 | import urllib
|
| 5 | +import re |
| 6 | +import json |
5 | 7 | import time
|
6 | 8 | from functools import partial
|
7 | 9 | from bs4 import BeautifulSoup
|
@@ -127,15 +129,175 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
|
127 | 129 | }
|
128 | 130 |
|
129 | 131 |
|
130 |
| -def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels): |
131 |
| - species_info = get_batched_ncbi_results( |
| 132 | +def get_species_info(taxonomy_ids): |
| 133 | + """ |
| 134 | + Fetches species information from NCBI API for the given taxonomy IDs. |
| 135 | + |
| 136 | + Args: |
| 137 | + taxonomy_ids: List of taxonomy IDs to fetch information for |
| 138 | + |
| 139 | + Returns: |
| 140 | + List of species information dictionaries from NCBI |
| 141 | + """ |
| 142 | + return get_batched_ncbi_results( |
132 | 143 | lambda ids: f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join(ids)}/dataset_report",
|
133 | 144 | [str(id) for id in set(taxonomy_ids)],
|
134 | 145 | "taxa"
|
135 | 146 | )
|
| 147 | + |
| 148 | + |
| 149 | +def get_species_df(species_info, taxonomic_group_sets, taxonomic_levels): |
| 150 | + """ |
| 151 | + Converts species information into a DataFrame. |
| 152 | + |
| 153 | + Args: |
| 154 | + species_info: List of species information dictionaries from NCBI |
| 155 | + taxonomic_group_sets: Dictionary of taxonomic group sets |
| 156 | + taxonomic_levels: List of taxonomic levels to include |
| 157 | + |
| 158 | + Returns: |
| 159 | + DataFrame containing species information |
| 160 | + """ |
136 | 161 | return pd.DataFrame([get_species_row(info, taxonomic_group_sets, taxonomic_levels) for info in species_info])
|
137 | 162 |
|
138 | 163 |
|
| 164 | +def get_species_tree(taxonomy_ids, taxonomic_levels, species_info=None): |
| 165 | + """ |
| 166 | + Builds a species tree from taxonomy IDs and taxonomic levels. |
| 167 | + |
| 168 | + Args: |
| 169 | + taxonomy_ids: List of taxonomy IDs to include in the tree |
| 170 | + taxonomic_levels: List of taxonomic levels to include in the tree |
| 171 | + species_info: Optional pre-fetched species information to avoid additional API calls |
| 172 | + |
| 173 | + Returns: |
| 174 | + A nested tree structure of species |
| 175 | + """ |
| 176 | + species_tree_response = requests.post( |
| 177 | + "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/filtered_subtree", |
| 178 | + json={"taxons": [str(int(t)) for t in taxonomy_ids], "rank_limits": [t.upper() for t in taxonomic_levels]}, |
| 179 | + ).json() |
| 180 | + |
| 181 | + # Build a tree from the response |
| 182 | + edges = species_tree_response.get("edges", {}) |
| 183 | + all_children = {child for edge in edges.values() for child in edge.get("visible_children", [])} |
| 184 | + all_children = [str(num) for num in all_children] |
| 185 | + root_ids = [node_id for node_id in edges if node_id not in all_children] |
| 186 | + root_ids = [str(num) for num in root_ids] |
| 187 | + |
| 188 | + if not root_ids: |
| 189 | + return {} |
| 190 | + |
| 191 | + # this bc the ncbi result is odd, multi-root |
| 192 | + root_id = "1" |
| 193 | + for root_id_candidate in root_ids: |
| 194 | + if root_id_candidate != root_id: |
| 195 | + edges[root_id]["visible_children"].append(root_id_candidate) |
| 196 | + |
| 197 | + species_tree = ncbi_tree_to_nested_tree(root_id, edges, taxonomy_ids) |
| 198 | + |
| 199 | + # Find the set of all unique tax_ids and their display names |
| 200 | + tax_ids = all_children + root_ids |
| 201 | + tax_ids = set(tax_ids) |
| 202 | + |
| 203 | + # Initialize maps with root node |
| 204 | + taxon_name_map = {"1": "root"} |
| 205 | + taxon_rank_map = {"1": "NA"} |
| 206 | + |
| 207 | + # If we have pre-fetched species_info, use it to populate the name and rank maps |
| 208 | + if species_info: |
| 209 | + # Extract taxon names and ranks from species_info |
| 210 | + for info in species_info: |
| 211 | + tax_id = str(info["taxonomy"]["tax_id"]) |
| 212 | + if tax_id in tax_ids: |
| 213 | + taxon_name_map[tax_id] = info["taxonomy"]["current_scientific_name"]["name"] |
| 214 | + if "rank" in info["taxonomy"]: |
| 215 | + taxon_rank_map[tax_id] = info["taxonomy"]["rank"] |
| 216 | + else: |
| 217 | + print(f"rank not found for tax_id: {tax_id}") |
| 218 | + |
| 219 | + # Also extract parent taxa information if available |
| 220 | + if "classification" in info["taxonomy"]: |
| 221 | + for rank_level, rank_info in info["taxonomy"]["classification"].items(): |
| 222 | + if isinstance(rank_info, dict) and "id" in rank_info and "name" in rank_info: |
| 223 | + parent_name = rank_info["name"] |
| 224 | + parent_id = rank_info["id"] |
| 225 | + parent_id_str = str(parent_id) |
| 226 | + if parent_id_str in tax_ids and parent_id_str not in taxon_name_map: |
| 227 | + taxon_name_map[parent_id_str] = parent_name |
| 228 | + taxon_rank_map[parent_id_str] = rank_level |
| 229 | + |
| 230 | + # Fetch any missing taxa information |
| 231 | + fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, "missing parent taxa") |
| 232 | + |
| 233 | + named_species_tree = update_species_tree_names(species_tree, taxon_name_map, taxon_rank_map) |
| 234 | + |
| 235 | + return named_species_tree |
| 236 | + |
| 237 | + |
| 238 | +def fetch_taxa_info_in_batches(tax_ids, taxon_name_map, taxon_rank_map, description="taxa"): |
| 239 | + """ |
| 240 | + Fetches taxonomic information in batches and updates the provided name and rank maps. |
| 241 | + |
| 242 | + Args: |
| 243 | + tax_ids: List or set of taxonomy IDs to fetch |
| 244 | + taxon_name_map: Dictionary to update with taxon ID to name mappings |
| 245 | + taxon_rank_map: Dictionary to update with taxon ID to rank mappings |
| 246 | + description: Description of the taxa being fetched for logging |
| 247 | + |
| 248 | + Returns: |
| 249 | + None (updates the provided maps in-place) |
| 250 | + """ |
| 251 | + # Filter out tax_ids that are already in the map |
| 252 | + missing_tax_ids = [tid for tid in tax_ids if tid not in taxon_name_map and tid != "1"] |
| 253 | + |
| 254 | + if not missing_tax_ids: |
| 255 | + return |
| 256 | + |
| 257 | + print(f"Fetching information for {len(missing_tax_ids)} {description}") |
| 258 | + |
| 259 | + # Process in batches of 100 to avoid API limitations |
| 260 | + batch_size = 100 |
| 261 | + for i in range(0, len(missing_tax_ids), batch_size): |
| 262 | + batch = missing_tax_ids[i:i+batch_size] |
| 263 | + print(f"Fetching batch {i//batch_size + 1} of {(len(missing_tax_ids) + batch_size - 1)//batch_size} ({len(batch)} taxa)") |
| 264 | + |
| 265 | + taxa_info = get_paginated_ncbi_results( |
| 266 | + f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{','.join(batch)}/dataset_report", |
| 267 | + f"{description} batch {i//batch_size + 1}" |
| 268 | + ) |
| 269 | + |
| 270 | + for report in taxa_info: |
| 271 | + tax_id = str(report["taxonomy"]["tax_id"]) |
| 272 | + taxon_name_map[tax_id] = report["taxonomy"]["current_scientific_name"]["name"] |
| 273 | + if "rank" in report["taxonomy"]: |
| 274 | + taxon_rank_map[tax_id] = report["taxonomy"]["rank"] |
| 275 | + else: |
| 276 | + print(f"rank not found for tax_id: {tax_id}") |
| 277 | + |
| 278 | + |
| 279 | +def ncbi_tree_to_nested_tree(node_id, edges, taxonomy_ids): |
| 280 | + children = edges.get(str(node_id), {}).get("visible_children", []) |
| 281 | + children = [str(num) for num in children] |
| 282 | + # ncbi results odd again, dup children |
| 283 | + children = set(children) |
| 284 | + if (len(children) > 0 or int(node_id) in taxonomy_ids): |
| 285 | + child_trees = [ncbi_tree_to_nested_tree(child, edges, taxonomy_ids) for child in children] |
| 286 | + child_trees = [item for item in child_trees if item is not None] |
| 287 | + return { |
| 288 | + "name": node_id, |
| 289 | + "ncbi_tax_id": node_id, |
| 290 | + "children": child_trees |
| 291 | + } |
| 292 | + |
| 293 | +def update_species_tree_names(tree, taxon_name_map, taxon_rank_map): |
| 294 | + tree["rank"] = taxon_rank_map.get(tree["name"], "Unknown") |
| 295 | + tree["name"] = taxon_name_map.get(tree["name"], tree["name"]) |
| 296 | + |
| 297 | + for child in tree.get("children", []): |
| 298 | + update_species_tree_names(child, taxon_name_map, taxon_rank_map) |
| 299 | + return tree |
| 300 | + |
139 | 301 | def get_genome_row(genome_info):
|
140 | 302 | refseq_category = genome_info["assembly_info"].get("refseq_category")
|
141 | 303 | return {
|
@@ -177,6 +339,7 @@ def get_genomes_and_primarydata_df(accessions):
|
177 | 339 |
|
178 | 340 |
|
179 | 341 | def _id_to_gene_model_url(asm_id: str, session: requests.Session):
|
| 342 | + print(f"finding gene model url for: {asm_id}") |
180 | 343 | ucsc_files_endpoint = "https://genome.ucsc.edu/list/files"
|
181 | 344 | download_base_url = "https://hgdownload.soe.ucsc.edu"
|
182 | 345 | response = session.get(ucsc_files_endpoint, params={"genome": asm_id})
|
@@ -415,6 +578,7 @@ def build_files(
|
415 | 578 | assemblies_path,
|
416 | 579 | genomes_output_path,
|
417 | 580 | ucsc_assemblies_url,
|
| 581 | + tree_output_path, |
418 | 582 | taxonomic_levels_for_tree,
|
419 | 583 | taxonomic_group_sets={},
|
420 | 584 | do_gene_model_urls=True,
|
@@ -442,7 +606,11 @@ def build_files(
|
442 | 606 |
|
443 | 607 | qc_report_params["missing_ncbi_assemblies"] = report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
|
444 | 608 |
|
445 |
| - species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets, taxonomic_levels_for_tree) |
| 609 | + # Fetch species information once to be used by both species_df and species_tree |
| 610 | + species_info = get_species_info(base_genomes_df["taxonomyId"]) |
| 611 | + |
| 612 | + # Create species DataFrame using the fetched species_info |
| 613 | + species_df = get_species_df(species_info, taxonomic_group_sets, taxonomic_levels_for_tree) |
446 | 614 |
|
447 | 615 | report_missing_values_from("species", "found on NCBI", base_genomes_df["taxonomyId"], species_df["taxonomyId"])
|
448 | 616 |
|
@@ -471,10 +639,18 @@ def build_files(
|
471 | 639 |
|
472 | 640 | if extract_primary_data:
|
473 | 641 | primarydata_df.to_csv(primary_output_path, index=False, sep="\t")
|
474 |
| - |
475 | 642 | print(f"Wrote to {primary_output_path}")
|
476 | 643 |
|
477 | 644 | if qc_report_path is not None:
|
478 | 645 | qc_report_text = make_qc_report(**qc_report_params)
|
479 | 646 | with open(qc_report_path, "w") as file:
|
480 | 647 | file.write(qc_report_text)
|
| 648 | + |
| 649 | + if len(taxonomic_levels_for_tree) > 0: |
| 650 | + # Use the taxonomy IDs from the genomes_df to build the species tree |
| 651 | + # Pass the previously fetched species_info to avoid another API call |
| 652 | + species_tree = get_species_tree(list(genomes_df["taxonomyId"]), taxonomic_levels_for_tree, species_info) |
| 653 | + with open(tree_output_path, 'w') as outfile: |
| 654 | + json.dump(species_tree, outfile, indent=4) |
| 655 | + print(f"Wrote to {tree_output_path}") |
| 656 | + |
0 commit comments