diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e8808a..bedc6fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,17 @@ All notable changes to this project will be documented in this file. +## Unreleased + ## [0.10.0] +### Feature + +- Add Orphante diseases (`OrphaDisease`) to Ontology +- Filter gene and disease annotations in subontology based on association with phenotypes +- Add binary version 3 +- Add new example ontology + ### Documentation - Change orders of methods in `Ontology` to clean up the documentation. @@ -11,6 +20,9 @@ All notable changes to this project will be documented in this file. ### Refactor - Improve the OBO parser with better error handling +- [**breaking**] Add `Disease` trait that is needed to work with `OmimDisease` and `OrphaDisease` +- Update example ontology +- Update unit- and doctests to align with updated example ontology ## [0.9.1] - 2024-03-30 diff --git a/README.md b/README.md index ef43acf..6dc1cf0 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,10 @@ The main structs used in `hpo` are: - [`HpoSet`](https://docs.rs/hpo/latest/hpo/struct.HpoSet.html) is a collection of `HpoTerm`s, like a patient's clinical information. - [`Gene`](https://docs.rs/hpo/latest/hpo/annotations/struct.Gene.html) represents a single gene, including information about associated `HpoTerm`s. - [`OmimDisease`](https://docs.rs/hpo/latest/hpo/annotations/struct.OmimDisease.html) represents a single OMIM-diseases, including information about associated `HpoTerm`s. +- [`OrphaDisease`](https://docs.rs/hpo/latest/hpo/annotations/struct.OrphaDisease.html) represents a single ORPHA-diseases, including information about associated `HpoTerm`s. The most relevant modules are: -- [`annotations`](https://docs.rs/hpo/latest/hpo/annotations/index.html) contains the `Gene` and `OmimDisease` structs, and some related important types. +- [`annotations`](https://docs.rs/hpo/latest/hpo/annotations/index.html) contains the `Gene`, `OmimDisease` and `OrphaDisease` structs, and some related important types. - [`similarity`](https://docs.rs/hpo/latest/hpo/similarity/index.html) contains structs and helper functions for similarity comparisons for `HpoTerm` and `HpoSet`. - [`stats`](https://docs.rs/hpo/latest/hpo/stats/index.html) contains functions to calculate the hypergeometric enrichment score of genes or diseases. @@ -67,7 +68,7 @@ Finally, load the data using [`Ontology::from_binary`]: ### Ontology ```rust use hpo::{Ontology, HpoTermId}; -use hpo::annotations::{GeneId, OmimDiseaseId}; +use hpo::annotations::{GeneId, OmimDiseaseId, OrphaDiseaseId}; fn example() { let ontology = Ontology::from_standard("/path/to/master-data/").unwrap(); @@ -87,6 +88,11 @@ fn example() { // do something with disease } + // iterate orpha diseases + for disease in ontology.orpha_diseases() { + // do something with disease + } + // get a single HPO term using HPO ID let hpo_id = HpoTermId::try_from("HP:0000123").unwrap(); let term = ontology.hpo(hpo_id); @@ -98,6 +104,10 @@ fn example() { let disease_id = OmimDiseaseId::from(12345u32); let disease = ontology.omim_disease(&disease_id); + // get a single Orpha disease + let disease_id = OrphaDiseaseId::from(12345u32); + let disease = ontology.orpha_disease(&disease_id); + // get a single Gene let hgnc_id = GeneId::from(12345u32); let gene = ontology.gene(&hgnc_id); diff --git a/examples/bench_compare_gene_to_diseases.rs b/examples/bench_compare_gene_to_diseases.rs index 4691bfe..6e9d28c 100644 --- a/examples/bench_compare_gene_to_diseases.rs +++ b/examples/bench_compare_gene_to_diseases.rs @@ -2,6 +2,7 @@ use std::path::Path; use rayon::prelude::*; +use hpo::annotations::Disease; use hpo::similarity::GroupSimilarity; use hpo::{HpoSet, Ontology}; diff --git a/examples/compare_ontologies.rs b/examples/compare_ontologies.rs index f32dd3d..2dade3d 100644 --- a/examples/compare_ontologies.rs +++ b/examples/compare_ontologies.rs @@ -3,6 +3,7 @@ //! This is helpful for checking correctness of the parser modules //! or to see changes after a new HPO release +use hpo::annotations::Disease; use hpo::comparison::Comparison; use hpo::{HpoTermId, Ontology}; use std::{path::Path, process}; @@ -52,6 +53,14 @@ fn overview(diffs: &Comparison) { for disease in diffs.removed_omim_diseases() { println!("Removed\t{}\t{}", disease.id(), disease.name()); } + + for disease in diffs.added_orpha_diseases() { + println!("Added\t{}\t{}", disease.id(), disease.name()); + } + + for disease in diffs.removed_orpha_diseases() { + println!("Removed\t{}\t{}", disease.id(), disease.name()); + } } /// Prints info about Term-specific changes @@ -141,7 +150,11 @@ fn changed_genes(diffs: &Comparison) { /// Prints info about Gene-specific changes fn changed_diseases(diffs: &Comparison) { println!("#Disease Delta\tID\tOld Name:New Name\tn Terms Old\tn Terms New\tAdded Terms\tRemoved Terms"); - for disease in diffs.changed_omim_diseases() { + for disease in diffs + .changed_omim_diseases() + .iter() + .chain(diffs.changed_orpha_diseases().iter()) + { print!("Delta\t{}", disease.id()); if let Some(names) = disease.changed_name() { print!("\t{}:{}", names.0, names.1); diff --git a/examples/dendogram.rs b/examples/dendogram.rs index ea09b26..57289be 100644 --- a/examples/dendogram.rs +++ b/examples/dendogram.rs @@ -1,7 +1,7 @@ use std::env::Args; use std::path::Path; -use hpo::annotations::OmimDisease; +use hpo::annotations::{Disease, OmimDisease}; use rayon::prelude::*; use hpo::similarity::GroupSimilarity; diff --git a/examples/disease_similarity.rs b/examples/disease_similarity.rs index 67a27de..d48d805 100644 --- a/examples/disease_similarity.rs +++ b/examples/disease_similarity.rs @@ -4,7 +4,7 @@ use std::io::Write; use std::{env::Args, time::SystemTime}; use hpo::{ - annotations::{OmimDisease, OmimDiseaseId}, + annotations::{Disease, OmimDisease, OmimDiseaseId}, similarity::{GraphIc, GroupSimilarity, StandardCombiner}, term::HpoGroup, HpoSet, HpoTermId, Ontology, @@ -120,6 +120,29 @@ fn cross_compare_diseases( } } +fn compare_omim_to_orpha(ontology: &Ontology, sim: &GroupSimilarity) { + let omim: Vec<&OmimDisease> = ontology.omim_diseases().collect(); + + let omim_names: Vec<&str> = omim.iter().map(|d| d.name()).collect(); + + let omim_sets: Vec = omim.iter().map(|d| d.to_hpo_set(ontology)).collect(); + + println!("Orpha\\Omim\t{}", omim_names.join("\t")); + + ontology + .orpha_diseases() + .take(100) + .par_bridge() + .for_each(|orpha| { + let orpha_set = orpha.to_hpo_set(ontology); + let mut row = orpha.name().to_string(); + for omim_set in omim_sets.iter() { + row.push_str(&format!("\t{}", sim.calculate(&orpha_set, omim_set))); + } + println!("{row}"); + }) +} + fn main() { let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap(); let combiner = StandardCombiner::FunSimAvg; @@ -135,6 +158,12 @@ fn main() { if let Ok(num) = arg.parse::() { // integer provided, using disease x disease comparisons cross_compare_diseases(&ontology, &sim, num); + } else if arg == "orpha" { + let sim = GroupSimilarity::new( + StandardCombiner::FunSimAvg, + GraphIc::new(hpo::term::InformationContentKind::Gene), + ); + compare_omim_to_orpha(&ontology, &sim); } else { // List of HPO terms provided compare_custom_set_to_diseases(&ontology, &sim, arg); @@ -153,6 +182,9 @@ fn main() { - Cross compare N diseases\n\ disease_similarity \n\ disease_similarity 20 + - Compare all OMIM to all ORPHA diseases\n\ + disease_similarity orpha\n\ + disease_similarity orpha "); } } diff --git a/examples/enrichment.rs b/examples/enrichment.rs index adf4f9a..98d38e8 100644 --- a/examples/enrichment.rs +++ b/examples/enrichment.rs @@ -1,6 +1,10 @@ use std::{env::Args, process}; -use hpo::{annotations::OmimDiseaseId, term::HpoGroup, HpoResult, HpoSet, HpoTermId, Ontology}; +use hpo::{ + annotations::{Disease, OmimDiseaseId}, + term::HpoGroup, + HpoResult, HpoSet, HpoTermId, Ontology, +}; /// Tries to parse an HpoTermId from a string `HP:0007768` or a `u32` fn id_from_freetext(value: &str) -> HpoResult { diff --git a/examples/gene_similarity.rs b/examples/gene_similarity.rs new file mode 100644 index 0000000..e45307b --- /dev/null +++ b/examples/gene_similarity.rs @@ -0,0 +1,153 @@ +use rayon::prelude::*; +use std::io; +use std::io::Write; +use std::{env::Args, time::SystemTime}; + +use hpo::{ + annotations::{Gene, GeneId}, + similarity::{GraphIc, GroupSimilarity, StandardCombiner}, + term::HpoGroup, + HpoSet, HpoTermId, Ontology, +}; + +/// Calculates the similarity score of two diseases +/// The two diseases are specified as OMIM-ID via CLI arguments +fn compare_two_genes( + ontology: &Ontology, + sim: &GroupSimilarity, + mut args: Args, +) { + let symbol_a = args.nth(1).unwrap(); + let symbol_b = args.next().unwrap(); + + let gene_a = ontology + .gene_by_name(&symbol_a) + .expect("The first gene is not part of the Ontology"); + let gene_b = ontology + .gene_by_name(&symbol_b) + .expect("The second gene is not part of the Ontology"); + + let set_a = gene_a.to_hpo_set(ontology); + let set_b = gene_b.to_hpo_set(ontology); + + let res = sim.calculate(&set_a, &set_b); + println!("Similarity is {res}"); +} + +/// Calculates the similarity score of a custom HPO-Set +/// to all OMIM diseases +/// The HPO-Set is specified as a comma separated list of HPO-Term-IDs +fn compare_custom_set_to_genes( + ontology: &Ontology, + sim: &GroupSimilarity, + terms: String, +) { + let hpo_terms = terms.split(','); + let mut group = HpoGroup::default(); + for t in hpo_terms { + group.insert(HpoTermId::try_from(t).expect("Invalid HpoTermId")); + } + let set_a = HpoSet::new(ontology, group); + + let start = SystemTime::now(); + let mut results: Vec<(&Gene, f32)> = ontology + .genes() + .par_bridge() + .map(|gene| { + let res = sim.calculate(&set_a, &gene.to_hpo_set(ontology)); + (gene, res) + }) + .collect(); + let end = SystemTime::now(); + let duration = end.duration_since(start).unwrap(); + + println!( + "Number of comparisons: {} in {} sec", + results.len(), + duration.as_secs() + ); + results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + let mut stdout = io::stdout().lock(); + for x in results { + stdout + .write_all(format!("{}\t{}\t{}\n", x.0.id(), x.0.name(), x.1).as_bytes()) + .unwrap(); + } +} + +/// Calculate the pairwise similarity of all diseases to other +/// diseases. +fn cross_compare_genes( + ontology: &Ontology, + sim: &GroupSimilarity, + num: usize, +) { + let start = SystemTime::now(); + let results: Vec<(GeneId, GeneId, f32)> = ontology + .genes() + .par_bridge() + .flat_map(|gene_a| { + ontology + .genes() + .take(num) + .map(|gene_b| { + let res = + sim.calculate(&gene_a.to_hpo_set(ontology), &gene_b.to_hpo_set(ontology)); + (*gene_a.id(), *gene_b.id(), res) + }) + .collect::>() + }) + .collect(); + let end = SystemTime::now(); + let duration = end.duration_since(start).unwrap(); + + println!( + "Number of comparisons: {} in {} sec", + results.len(), + duration.as_secs() + ); + + let mut stdout = io::stdout().lock(); + for x in results { + stdout + .write_all(format!("{}\t{}\t{}\n", x.0, x.1, x.2).as_bytes()) + .unwrap(); + } +} + +fn main() { + let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + let sim = GroupSimilarity::default(); + + let mut args = std::env::args(); + + match args.len() { + 3 => compare_two_genes(&ontology, &sim, args), + 2 => { + let arg = args.nth(1).unwrap(); + if let Ok(num) = arg.parse::() { + // integer provided, using disease x disease comparisons + cross_compare_genes(&ontology, &sim, num); + } else { + // List of HPO terms provided + compare_custom_set_to_genes(&ontology, &sim, arg); + } + } + _ => { + println!("Calculate similarities of genes\n\n"); + println!("\ + There are 3 different options:\n\ + - Compare 2 genes\n\ + gene_similarity \n\ + gene_similarity 618395 615368\n\n\ + - Compare all genes to a custom HPO-Set\n\ + gene_similarity \n\ + gene_similarity HP:0000750,HP:0000752,HP:0001249,HP:0007018,HP:0010818,HP:0011463\n\n\ + - Cross compare N genes\n\ + gene_similarity \n\ + gene_similarity 20 + "); + } + } +} diff --git a/examples/search_by_name.rs b/examples/search_by_name.rs index 3d4ce9e..5025e16 100644 --- a/examples/search_by_name.rs +++ b/examples/search_by_name.rs @@ -1,12 +1,13 @@ //! Prints every term and its associated genes +use hpo::annotations::Disease; use hpo::Ontology; fn main() { let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap(); - let cystinosis = ontology.omim_disease_by_name("Cystinosis").unwrap(); - println!("first match: {:?}", cystinosis.name()); - for result in ontology.omim_diseases_by_name("Cystinosis") { + let congenital = ontology.omim_disease_by_name("congenital").unwrap(); + println!("first match: {:?}", congenital.name()); + for result in ontology.omim_diseases_by_name("congenital") { println!("{:?}", result.name()); } } diff --git a/examples/subontology.rs b/examples/subontology.rs new file mode 100644 index 0000000..408f3a7 --- /dev/null +++ b/examples/subontology.rs @@ -0,0 +1,46 @@ +use hpo::{HpoTermId, Ontology}; +use std::io::Write; +use std::process; +use std::{fs::File, path::Path}; + +fn ontology(path_arg: &str) -> Ontology { + let path = Path::new(path_arg); + + match path.is_file() { + true => Ontology::from_binary(path).unwrap(), + false => Ontology::from_standard(&path.to_string_lossy()).unwrap(), + } +} + +fn main() { + let mut args = std::env::args(); + + if args.len() != 3 { + println!("Create a subontology\n\n"); + println!("Usage:\nsubontology "); + println!("e.g.:\nsubontology tests/ontology.hpo HP:0000001,HP:0000005,HP:0000007,HP:0000118,HP:0000707,HP:0000818,HP:0000864,HP:0001939,HP:0002011,HP:0002715,HP:0003581,HP:0003674,HP:0010662,HP:0010978,HP:0011017,HP:0012285,HP:0012443,HP:0012638,HP:0012639,HP:0012647,HP:0012648,HP:0012823,HP:0025454,HP:0031797,HP:0034345,HP:0100547\n"); + process::exit(1) + } + let path = args.nth(1).unwrap(); + let terms = args.next().unwrap(); + let terms = terms + .split(',') + .map(|id| HpoTermId::try_from(id).expect("Term-ID must be properly formatted")); + + let ontology = ontology(&path); + + let sub = ontology + .sub_ontology( + ontology + .hpo(1u32) + .expect("Root term must exist in ontology"), + terms.map(|id| ontology.hpo(id).expect("Term must exist in ontology")), + ) + .expect("sub ontology must work"); + + let mut fh = File::create("out.hpo").expect("Cannot create file"); + match fh.write_all(&sub.as_bytes()) { + Ok(_) => println!("Saved subontology to out.hpo"), + Err(err) => println!("Error: {}", err), + }; +} diff --git a/examples/term_to_disease.rs b/examples/term_to_disease.rs new file mode 100644 index 0000000..365eace --- /dev/null +++ b/examples/term_to_disease.rs @@ -0,0 +1,59 @@ +//! Prints every term and its associated Omim and Orpha Disease + +use hpo::annotations::Disease; +use hpo::Ontology; + +fn main() { + let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap(); + + let mut diseases: Vec = Vec::new(); + for term in ontology.hpos() { + diseases.clear(); + for disease in term.omim_diseases() { + diseases.push(disease.id().to_string()); + } + diseases.sort(); + println!("{}\t{}", term.id(), diseases.join(",")); + } + + for term in ontology.hpos() { + diseases.clear(); + for disease in term.orpha_diseases() { + diseases.push(disease.id().to_string()); + } + diseases.sort(); + println!("{}\t{}", term.id(), diseases.join(",")); + } +} + +/* +Python comparison code +from pyhpo import Ontology + +_ = Ontology() + +with open("term2omim.py.txt", "w") as fh: + for term in Ontology: + omim_diseases = [] + for disease in term.omim_diseases: + omim_diseases.append(f"OMIM:{disease.id}") + omim_diseases = ",".join(sorted(omim_diseases)) + _ = fh.write(f"{term.id}\t{omim_diseases}\n") + + for term in Ontology: + orpha_diseases = [] + for disease in term.orpha_diseases: + orpha_diseases.append(f"ORPHA:{disease.id}") + orpha_diseases = ",".join(sorted(orpha_diseases)) + _ = fh.write(f"{term.id}\t{orpha_diseases}\n") + +*/ + +/* +diff-ing + +diff \ +<(sort example_data/term2omim.rs.txt) \ +<(sort example_data/term2omim.py.txt) + +*/ diff --git a/examples/term_to_ic.rs b/examples/term_to_ic.rs index 8be469d..8e22966 100644 --- a/examples/term_to_ic.rs +++ b/examples/term_to_ic.rs @@ -7,10 +7,11 @@ fn main() { for term in ontology.hpos() { println!( - "{}\t{}\t{}", + "{}\t{}\t{}\t{}", term.id(), term.information_content().gene(), term.information_content().omim_disease(), + term.information_content().orpha_disease(), ); } } @@ -23,15 +24,15 @@ _ = Ontology() with open("term2ic.py.txt", "w") as fh: for term in Ontology: - _ = fh.write(f"{term.id}\t{term.information_content.gene}\t{term.information_content.omim}\n") + _ = fh.write(f"{term.id}\t{term.information_content.gene}\t{term.information_content.omim}\t{term.information_content.orpha}\n") */ /* -Diff'ing (only use the first 3 decimal digits to ignore rounding errors: +Diff'ing (only use the first 2 decimal digits to ignore rounding errors: diff \ -<(awk '{print $1, substr($2,1,5), substr($3,1,5)}' example_data/term2ic.rs.txt) \ -<(awk '{print $1, substr($2,1,5), substr($3,1,5)}' example_data/term2ic.py.txt) +<(sort example_data/term2ic.rs.txt | awk '{print $1, substr($2,1,4), substr($3,1,4), substr($4,1,4)}') \ +<(sort example_data/term2ic.py.txt | awk '{print $1, substr($2,1,4), substr($3,1,4), substr($4,1,4)}') */ diff --git a/examples/term_to_omim.rs b/examples/term_to_omim.rs deleted file mode 100644 index 23d28e6..0000000 --- a/examples/term_to_omim.rs +++ /dev/null @@ -1,33 +0,0 @@ -//! Prints every term and its associated Omim Disease - -use hpo::Ontology; - -fn main() { - let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap(); - - let mut omim_diseases: Vec = Vec::new(); - for term in ontology.hpos() { - omim_diseases.clear(); - for disease in term.omim_diseases() { - omim_diseases.push(disease.id().to_string()); - } - omim_diseases.sort(); - println!("{}\t{}", term.id(), omim_diseases.join(",")); - } -} - -/* -Python comparison code -from pyhpo import Ontology - -_ = Ontology() - -with open("term2omim.py.txt", "w") as fh: - for term in Ontology: - omim_diseases = [] - for disease in term.omim_diseases: - omim_diseases.append(f"OMIM:{disease.id}") - omim_diseases = ",".join(sorted(omim_diseases)) - _ = fh.write(f"{term.id}\t{omim_diseases}\n") - -*/ diff --git a/src/annotations.rs b/src/annotations.rs index 3beaca9..910a5c2 100644 --- a/src/annotations.rs +++ b/src/annotations.rs @@ -11,16 +11,27 @@ //! For more information check out the individual sections for [`Gene`] and [`OmimDisease`] mod gene; +use core::fmt::Debug; +use core::hash::Hash; pub use gene::{Gene, GeneId, GeneIterator, Genes}; +use std::fmt::Display; +mod disease; mod omim_disease; -pub use omim_disease::{OmimDisease, OmimDiseaseId, OmimDiseaseIterator, OmimDiseases}; +mod orpha_disease; +pub use disease::Disease; +pub use omim_disease::{ + OmimDisease, OmimDiseaseFilter, OmimDiseaseId, OmimDiseaseIterator, OmimDiseases, +}; +pub use orpha_disease::{OrphaDisease, OrphaDiseaseId, OrphaDiseaseIterator, OrphaDiseases}; /// All annotations ([`Gene`]s, [`OmimDisease`]s) are defined by a unique ID that is constrained by this trait /// /// The ID must be unique only within the annotation type, i.e. a gene and a disease /// can have the same ID. -pub trait AnnotationId { +pub trait AnnotationId: + Clone + Copy + Debug + Hash + PartialEq + PartialOrd + Eq + Ord + Display + From +{ /// Return the integer representation of the annotation ID fn as_u32(&self) -> u32; diff --git a/src/annotations/disease.rs b/src/annotations/disease.rs new file mode 100644 index 0000000..8d695fe --- /dev/null +++ b/src/annotations/disease.rs @@ -0,0 +1,221 @@ +use std::collections::HashSet; +use std::fmt::Debug; +use std::hash::Hash; +use tracing::error; + +use crate::annotations::AnnotationId; +use crate::term::HpoGroup; +use crate::u32_from_bytes; +use crate::{HpoError, HpoSet, HpoTermId, Ontology}; + +/// Defines common methods that all types of diseases have +pub trait Disease: PartialEq + Eq + Hash + Sized { + /// The type of the corresponding ID + /// e.g. `OmimDisease` has `OmimDiseaseId` + type AnnoID: AnnotationId; + + /// Initializes a new disease + fn new(id: Self::AnnoID, name: &str) -> Self; + + /// Returns the ID of the disease + fn id(&self) -> &Self::AnnoID; + + /// Returns the name of the disease + fn name(&self) -> &str; + + /// Connect another [HPO term](`crate::HpoTerm`) to the disease + fn add_term>(&mut self, term_id: I) -> bool; + + /// Returns a reference to the associated [group of HPO terms](`crate::term::HpoGroup`) + fn hpo_terms(&self) -> &HpoGroup; + + /// Creates a new `crate::set::HpoSet` + fn to_hpo_set<'a>(&self, ontology: &'a Ontology) -> HpoSet<'a> { + HpoSet::new(ontology, self.hpo_terms().clone()) + } + + /// Returns a binary representation of the `Disease` + /// + /// The binary layout is defined as: + /// + /// | Byte offset | Number of bytes | Description | + /// | --- | --- | --- | + /// | 0 | 4 | The total length of the binary data blob as big-endian `u32` | + /// | 4 | 4 | The `OmimDiseaseId` as big-endian `u32` | + /// | 8 | 4 | The length of the `Disease` Name as big-endian `u32` | + /// | 12 | n | The Disease name as u8 vector | + /// | 12 + n | 4 | The number of associated HPO terms as big-endian `u32` | + /// | 16 + n | x * 4 | The [`HpoTermId`]s of the associated terms, each encoded as big-endian `u32` | + /// + /// # Examples + /// + /// ``` + /// use hpo::annotations::{Disease, OmimDisease}; + /// + /// let mut disease = OmimDisease::new(123.into(), "FooBar"); + /// let bytes = disease.as_bytes(); + /// + /// assert_eq!(bytes.len(), 4 + 4 + 4 + 6 + 4); + /// assert_eq!(bytes[4..8], [0u8, 0u8, 0u8, 123u8]); // ID of disease => 123 + /// assert_eq!(bytes[8..12], [0u8, 0u8, 0u8, 6u8]); // Length of Name => 6 + /// ``` + fn as_bytes(&self) -> Vec { + fn usize_to_u32(n: usize) -> u32 { + n.try_into().expect("unable to convert {n} to u32") + } + let name = self.name().as_bytes(); + let name_length = name.len(); + let size = 4 + 4 + 4 + name_length + 4 + self.hpo_terms().len() * 4; + + let mut res = Vec::new(); + + // 4 bytes for total length + res.append(&mut usize_to_u32(size).to_be_bytes().to_vec()); + + // 4 bytes for Disease-ID + res.append(&mut self.id().to_be_bytes().to_vec()); + + // 4 bytes for Length of Disease Name + res.append(&mut usize_to_u32(name_length).to_be_bytes().to_vec()); + + // Disease name (n bytes) + for c in name { + res.push(*c); + } + + // 4 bytes for number of HPO terms + res.append(&mut usize_to_u32(self.hpo_terms().len()).to_be_bytes().to_vec()); + + // HPO terms + res.append(&mut self.hpo_terms().as_bytes()); + + res + } + + /// Returns an [`Disease`] from a bytes vector + /// + /// The byte layout for this method is defined in + /// [`Disease::as_bytes`] + /// + /// # Errors + /// + /// This method can fail with a `ParseBinaryError` if the input + /// bytes don't have the correct length or the name is not UTF-8 + /// encoded. + /// + /// # Examples + /// + /// ``` + /// use hpo::annotations::{Disease, OmimDisease, OmimDiseaseId}; + /// + /// let bytes = vec![ + /// 0u8, 0u8, 0u8, 22u8, // Total size of Blop + /// 0u8, 0u8, 0u8, 123u8, // ID of the disease => 123 + /// 0u8, 0u8, 0u8, 6u8, // Length of name => 6 + /// b'F', b'o', b'o', b'b', b'a', b'r', // Foobar + /// 0u8, 0u8, 0u8, 0u8 // Number of associated HPO Terms => 0 + /// ]; + /// let disease = OmimDisease::try_from(&bytes[..]).unwrap(); + /// + /// assert_eq!(disease.name(), "Foobar"); + /// assert_eq!(disease.id(), &OmimDiseaseId::from(123u32)); + /// ``` + fn from_bytes(bytes: &[u8]) -> Result { + // minimum length for a Disease without name and no HPO terms + // This check is important because we're accessing the bytes + // for size and ID directly and don't want to panic + if bytes.len() < 4 + 4 + 4 + 4 { + error!("Too few bytes for a Disease"); + return Err(HpoError::ParseBinaryError); + } + + let total_len = u32_from_bytes(&bytes[0..]) as usize; + + if bytes.len() != total_len { + error!( + "Too few bytes to build a Disease. Expected {}, received {}", + total_len, + bytes.len() + ); + return Err(HpoError::ParseBinaryError); + } + + let id = u32_from_bytes(&bytes[4..]); + let name_len = u32_from_bytes(&bytes[8..]) as usize; + + // Minimum length considering the name + if bytes.len() < 16 + name_len { + error!("Too few bytes for a Disease (including the name)"); + return Err(HpoError::ParseBinaryError); + } + + let Ok(name) = String::from_utf8(bytes[12..12 + name_len].to_vec()) else { + error!("Unable to parse the name of the Disease. It should consist of only utf8 characters"); + return Err(HpoError::ParseBinaryError); + }; + + let mut disease = Self::new(id.into(), &name); + + // An index for accessing the bytes during the iteration + let mut idx_terms = 12 + name_len; + let n_terms = u32_from_bytes(&bytes[idx_terms..]); + + if bytes.len() < 16 + name_len + n_terms as usize * 4 { + error!( + "Too few bytes in {}. {} terms, but {} bytes", + name, + n_terms, + bytes.len() + ); + return Err(HpoError::ParseBinaryError); + } + + idx_terms += 4; + for _ in 0..n_terms { + let term_id = HpoTermId::from([ + bytes[idx_terms], + bytes[idx_terms + 1], + bytes[idx_terms + 2], + bytes[idx_terms + 3], + ]); + idx_terms += 4; + + disease.add_term(term_id); + } + + if idx_terms == total_len && idx_terms == bytes.len() { + Ok(disease) + } else { + error!( + "The length of the bytes blob did not match: {} vs {}", + total_len, idx_terms + ); + Err(HpoError::ParseBinaryError) + } + } +} + +/// [`Disease`] Iterator +pub struct DiseaseIterator<'a, DID> { + pub(crate) ontology: &'a Ontology, + pub(crate) diseases: std::collections::hash_set::Iter<'a, DID>, +} + +impl<'a, DID> DiseaseIterator<'a, DID> { + /// Initialize a new [`DiseaseIterator`] + /// + /// This method requires the [`Ontology`] as a parameter since + /// the actual [`Disease`] entities are stored in it. + pub fn new(diseases: &'a HashSet, ontology: &'a Ontology) -> Self { + DiseaseIterator { + diseases: diseases.iter(), + ontology, + } + } +} + +impl Debug for DiseaseIterator<'_, DID> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DiseaseIterator") + } +} diff --git a/src/annotations/omim_disease.rs b/src/annotations/omim_disease.rs index f08aa02..138151c 100644 --- a/src/annotations/omim_disease.rs +++ b/src/annotations/omim_disease.rs @@ -1,17 +1,13 @@ +use std::collections::hash_map::Values; use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; +use std::fmt::{Debug, Display}; use std::hash::Hash; -use tracing::error; - -use crate::annotations::AnnotationId; -use crate::set::HpoSet; +use crate::annotations::disease::DiseaseIterator; +use crate::annotations::{AnnotationId, Disease}; use crate::term::HpoGroup; -use crate::u32_from_bytes; use crate::HpoError; use crate::HpoTermId; -use crate::Ontology; /// A set of OMIM diseases /// @@ -71,13 +67,15 @@ pub struct OmimDisease { hpos: HpoGroup, } -impl OmimDisease { +impl Disease for OmimDisease { + type AnnoID = OmimDiseaseId; + /// Initializes a new OMIM disease /// /// This method should rarely, if ever, be used directly. The - /// preferred way to create new genes is through [`Ontology::add_omim_disease`] + /// preferred way to create new genes is through [`crate::Ontology::add_omim_disease`] /// to ensure that each disease exists only once. - pub fn new(id: OmimDiseaseId, name: &str) -> OmimDisease { + fn new(id: Self::AnnoID, name: &str) -> OmimDisease { Self { name: name.to_string(), id, @@ -86,17 +84,17 @@ impl OmimDisease { } /// The unique [`OmimDiseaseId`] of the disease, the OMIM MIM number - pub fn id(&self) -> &OmimDiseaseId { + fn id(&self) -> &Self::AnnoID { &self.id } /// The OMIM disease name - pub fn name(&self) -> &str { + fn name(&self) -> &str { &self.name } /// The set of connected HPO terms - pub fn hpo_terms(&self) -> &HpoGroup { + fn hpo_terms(&self) -> &HpoGroup { &self.hpos } @@ -179,6 +177,7 @@ impl PartialEq for OmimDisease { self.id == other.id } } + impl Eq for OmimDisease {} impl TryFrom<&[u8]> for OmimDisease { @@ -186,12 +185,12 @@ impl TryFrom<&[u8]> for OmimDisease { /// Returns an [`OmimDisease`] from a bytes vector /// /// The byte layout for this method is defined in - /// [`OmimDisease::as_bytes`] + /// [`Disease::as_bytes`] /// /// # Examples /// /// ``` - /// use hpo::annotations::{OmimDisease, OmimDiseaseId}; + /// use hpo::annotations::{Disease, OmimDisease, OmimDiseaseId}; /// /// let bytes = vec![ /// 0u8, 0u8, 0u8, 22u8, // Total size of Blop @@ -206,77 +205,7 @@ impl TryFrom<&[u8]> for OmimDisease { /// assert_eq!(disease.id(), &OmimDiseaseId::from(123u32)); /// ``` fn try_from(bytes: &[u8]) -> Result { - // minimum length for a Disease without name and no HPO terms - // This check is important because we're accessing the bytes - // for size and ID directly and don't want to panic - if bytes.len() < 4 + 4 + 4 + 4 { - error!("Too few bytes for an OmimDisease"); - return Err(HpoError::ParseBinaryError); - } - - let total_len = u32_from_bytes(&bytes[0..]) as usize; - - if bytes.len() != total_len { - error!( - "Too few bytes to build OmimDisease. Expected {}, received {}", - total_len, - bytes.len() - ); - return Err(HpoError::ParseBinaryError); - } - - let id = u32_from_bytes(&bytes[4..]); - let name_len = u32_from_bytes(&bytes[8..]) as usize; - - // Minimum length considering the name - if bytes.len() < 16 + name_len { - error!("Too few bytes for an OmimDisease (including the name)"); - return Err(HpoError::ParseBinaryError); - } - - let Ok(name) = String::from_utf8(bytes[12..12 + name_len].to_vec()) else { - error!("Unable to parse the name of the OmimDisease"); - return Err(HpoError::ParseBinaryError); - }; - - let mut gene = OmimDisease::new(id.into(), &name); - - // An index for accessing the bytes during the iteration - let mut idx_terms = 12 + name_len; - let n_terms = u32_from_bytes(&bytes[idx_terms..]); - - if bytes.len() < 16 + name_len + n_terms as usize * 4 { - error!( - "Too few bytes in {}. {} terms, but {} bytes", - name, - n_terms, - bytes.len() - ); - return Err(HpoError::ParseBinaryError); - } - - idx_terms += 4; - for _ in 0..n_terms { - let term_id = HpoTermId::from([ - bytes[idx_terms], - bytes[idx_terms + 1], - bytes[idx_terms + 2], - bytes[idx_terms + 3], - ]); - idx_terms += 4; - - gene.add_term(term_id); - } - - if idx_terms == total_len && idx_terms == bytes.len() { - Ok(gene) - } else { - error!( - "The length of the bytes blob did not match: {} vs {}", - total_len, idx_terms - ); - Err(HpoError::ParseBinaryError) - } + Self::from_bytes(bytes) } } @@ -286,39 +215,41 @@ impl Hash for OmimDisease { } } -/// [`OmimDisease`] Iterator -pub struct OmimDiseaseIterator<'a> { - ontology: &'a Ontology, - diseases: std::collections::hash_set::Iter<'a, OmimDiseaseId>, -} +/// Iterates [`OmimDisease`] +pub type OmimDiseaseIterator<'a> = DiseaseIterator<'a, OmimDiseaseId>; -impl<'a> OmimDiseaseIterator<'a> { - /// Initialize a new [`OmimDiseaseIterator`] - /// - /// This method requires the [`Ontology`] as a parameter since - /// the actual [`OmimDisease`] entities are stored in it and - /// not in [`OmimDiseases`] itself - pub fn new(diseases: &'a OmimDiseases, ontology: &'a Ontology) -> Self { - OmimDiseaseIterator { - diseases: diseases.iter(), - ontology, - } +impl<'a> std::iter::Iterator for DiseaseIterator<'a, OmimDiseaseId> { + type Item = &'a OmimDisease; + fn next(&mut self) -> Option { + self.diseases.next().map(|omim_id| { + self.ontology + .omim_disease(omim_id) + .expect("disease must exist in Ontology") + }) } } -impl<'a> std::iter::Iterator for OmimDiseaseIterator<'a> { - type Item = &'a OmimDisease; - fn next(&mut self) -> Option { - match self.diseases.next() { - Some(omim_id) => Some(self.ontology.omim_disease(omim_id).unwrap()), - None => None, - } +/// Iterates [`OmimDisease`] that match the query string +/// +/// This struct is returned by [`crate::Ontology::omim_diseases_by_name`] +pub struct OmimDiseaseFilter<'a> { + iter: Values<'a, OmimDiseaseId, OmimDisease>, + query: &'a str, +} + +impl<'a> OmimDiseaseFilter<'a> { + pub(crate) fn new(iter: Values<'a, OmimDiseaseId, OmimDisease>, query: &'a str) -> Self { + OmimDiseaseFilter { iter, query } } } -impl Debug for OmimDiseaseIterator<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "OmimDiseaseIterator") +impl<'a> Iterator for OmimDiseaseFilter<'a> { + type Item = &'a OmimDisease; + + fn next(&mut self) -> Option { + self.iter + .by_ref() + .find(|&item| item.name().contains(self.query)) } } diff --git a/src/annotations/orpha_disease.rs b/src/annotations/orpha_disease.rs new file mode 100644 index 0000000..4f073a9 --- /dev/null +++ b/src/annotations/orpha_disease.rs @@ -0,0 +1,161 @@ +use std::collections::HashSet; +use std::fmt::{Debug, Display}; +use std::hash::Hash; + +use crate::annotations::disease::DiseaseIterator; +use crate::annotations::{AnnotationId, Disease}; +use crate::term::HpoGroup; +use crate::HpoError; +use crate::HpoTermId; + +/// A set of Orpha diseases +/// +/// The set does not contain [`OrphaDisease`]s itself, but only +/// their [`OrphaDiseaseId`]s. +/// Currently implemented using [`HashSet`] but any other implementation +/// should work as well given that each [`OrphaDiseaseId`] must appear only once +/// and it provides an iterator of [`OrphaDiseaseId`] +pub type OrphaDiseases = HashSet; + +/// A unique identifier for an [`OrphaDisease`] +/// +/// This value can - in theory - represent any numerical unique value. +/// When using the default JAX provided masterdata, it represents +/// the actual Orpha MIM ID. +#[derive(Clone, Copy, Default, Debug, Hash, PartialEq, PartialOrd, Eq, Ord)] +pub struct OrphaDiseaseId { + inner: u32, +} + +impl AnnotationId for OrphaDiseaseId { + /// Convert `self` to `u32` + fn as_u32(&self) -> u32 { + self.inner + } +} + +impl TryFrom<&str> for OrphaDiseaseId { + type Error = HpoError; + fn try_from(value: &str) -> Result { + Ok(OrphaDiseaseId { + inner: value.parse::()?, + }) + } +} + +impl From for OrphaDiseaseId { + fn from(inner: u32) -> Self { + OrphaDiseaseId { inner } + } +} + +impl Display for OrphaDiseaseId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ORPHA:{}", self.inner) + } +} + +/// A single ORPHA disease +/// +/// A disease has a unique [`OrphaDiseaseId`] and a name and is +/// connected to a set of HPO terms +#[derive(Default, Debug, Clone)] +pub struct OrphaDisease { + id: OrphaDiseaseId, + name: String, + hpos: HpoGroup, +} + +impl Disease for OrphaDisease { + type AnnoID = OrphaDiseaseId; + + /// Initializes a new Orpha disease + /// + /// This method should rarely, if ever, be used directly. The + /// preferred way to create new genes is through [`crate::Ontology::add_orpha_disease`] + /// to ensure that each disease exists only once. + fn new(id: Self::AnnoID, name: &str) -> OrphaDisease { + Self { + name: name.to_string(), + id, + hpos: HpoGroup::default(), + } + } + + /// The unique [`OrphaDiseaseId`] of the disease, the Orpha MIM number + fn id(&self) -> &Self::AnnoID { + &self.id + } + + /// The Orpha disease name + fn name(&self) -> &str { + &self.name + } + + /// Connect another [HPO term](`crate::HpoTerm`) to the disease + fn add_term>(&mut self, term_id: I) -> bool { + self.hpos.insert(term_id) + } + + /// The set of connected HPO terms + fn hpo_terms(&self) -> &HpoGroup { + &self.hpos + } +} + +impl PartialEq for OrphaDisease { + fn eq(&self, other: &OrphaDisease) -> bool { + self.id == other.id + } +} + +impl Eq for OrphaDisease {} + +impl TryFrom<&[u8]> for OrphaDisease { + type Error = HpoError; + /// Returns an [`OrphaDisease`] from a bytes vector + /// + /// The byte layout for this method is defined in + /// [`Disease::as_bytes`] + /// + /// # Examples + /// + /// ``` + /// use hpo::annotations::{Disease, OrphaDisease, OrphaDiseaseId}; + /// + /// let bytes = vec![ + /// 0u8, 0u8, 0u8, 22u8, // Total size of Blop + /// 0u8, 0u8, 0u8, 123u8, // ID of the disease => 123 + /// 0u8, 0u8, 0u8, 6u8, // Length of name => 6 + /// b'F', b'o', b'o', b'b', b'a', b'r', // Foobar + /// 0u8, 0u8, 0u8, 0u8 // Number of associated HPO Terms => 0 + /// ]; + /// let disease = OrphaDisease::try_from(&bytes[..]).unwrap(); + /// + /// assert_eq!(disease.name(), "Foobar"); + /// assert_eq!(disease.id(), &OrphaDiseaseId::from(123u32)); + /// ``` + fn try_from(bytes: &[u8]) -> Result { + Self::from_bytes(bytes) + } +} + +impl Hash for OrphaDisease { + fn hash(&self, state: &mut H) { + self.id.hash(state); + } +} + +/// Iterates [`OrphaDisease`] +pub type OrphaDiseaseIterator<'a> = DiseaseIterator<'a, OrphaDiseaseId>; + +impl<'a> std::iter::Iterator for DiseaseIterator<'a, OrphaDiseaseId> { + type Item = &'a OrphaDisease; + fn next(&mut self) -> Option { + self.diseases.next().map(|orpha_id| { + self.ontology + .orpha_disease(orpha_id) + .expect("disease must exist in Ontology") + }) + } +} diff --git a/src/lib.rs b/src/lib.rs index 1ef6d1d..3d631da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ const DEFAULT_NUM_PARENTS: usize = 10; const DEFAULT_NUM_ALL_PARENTS: usize = 30; const DEFAULT_NUM_GENES: usize = 50; const DEFAULT_NUM_OMIM: usize = 20; +const DEFAULT_NUM_ORPHA: usize = 20; const MAX_HPO_ID_INTEGER: usize = 10_000_000; const OBO_FILENAME: &str = "hp.obo"; diff --git a/src/ontology.rs b/src/ontology.rs index 26f8ef0..dce474d 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1,5 +1,6 @@ +use crate::annotations::Disease; use core::fmt::Debug; -use std::collections::hash_map::{Entry, Values}; +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::Read; @@ -11,7 +12,8 @@ use tracing::debug; use crate::annotations::AnnotationId; use crate::annotations::{Gene, GeneId}; -use crate::annotations::{OmimDisease, OmimDiseaseId}; +use crate::annotations::{OmimDisease, OmimDiseaseFilter, OmimDiseaseId}; +use crate::annotations::{OrphaDisease, OrphaDiseaseId}; use crate::parser; use crate::parser::binary::{BinaryTermBuilder, BinaryVersion, Bytes}; use crate::term::internal::HpoTermInternal; @@ -38,7 +40,6 @@ use termarena::Arena; /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// -/// // get single terms from the ontology /// /// let absent_term = HpoTermId::try_from("HP:9999999").unwrap(); /// assert!(ontology.hpo(absent_term).is_none()); @@ -52,10 +53,13 @@ use termarena::Arena; /// assert_eq!(term.name(), "Phenotypic abnormality"); /// /// // get all genes of the ontology -/// assert_eq!(ontology.genes().count(), 4852); +/// assert_eq!(ontology.genes().count(), 615); /// -/// // get all diseases of the ontology -/// assert_eq!(ontology.omim_diseases().count(), 4431); +/// // get all OMIM diseases of the ontology +/// assert_eq!(ontology.omim_diseases().count(), 147); +/// +/// // get all ORPHA diseases of the ontology +/// assert_eq!(ontology.orpha_diseases().count(), 319); /// /// // Iterate all HPO terms /// for term in &ontology { @@ -321,6 +325,7 @@ pub struct Ontology { hpo_terms: Arena, genes: HashMap, omim_diseases: HashMap, + orpha_diseases: HashMap, hpo_version: (u16, u8, u8), categories: HpoGroup, modifier: HpoGroup, @@ -332,30 +337,6 @@ impl Debug for Ontology { } } -/// Iterates [`OmimDisease`] that match the query string -/// -/// This struct is returned by [`Ontology::omim_diseases_by_name`] -pub struct OmimDiseaseFilter<'a> { - iter: Values<'a, OmimDiseaseId, OmimDisease>, - query: &'a str, -} - -impl<'a> OmimDiseaseFilter<'a> { - fn new(iter: Values<'a, OmimDiseaseId, OmimDisease>, query: &'a str) -> Self { - OmimDiseaseFilter { iter, query } - } -} - -impl<'a> Iterator for OmimDiseaseFilter<'a> { - type Item = &'a OmimDisease; - - fn next(&mut self) -> Option { - self.iter - .by_ref() - .find(|&item| item.name().contains(self.query)) - } -} - /// Public API of the Ontology /// /// Those methods are all safe to use @@ -612,6 +593,14 @@ impl Ontology { ont.add_omim_disease_from_bytes(&bytes[section_start + 4..section_end])?; section_start += section_len + 4; + // Orpha Diseases + if bytes.version() > BinaryVersion::V2 { + section_len = u32_from_bytes(&bytes[section_start..]) as usize; + section_end += 4 + section_len; + ont.add_orpha_disease_from_bytes(&bytes[section_start + 4..section_end])?; + section_start += section_len + 4; + } + if section_start == bytes.len() { ont.calculate_information_content()?; ont.set_default_categories()?; @@ -622,6 +611,87 @@ impl Ontology { } } + /// Returns a binary representation of the Ontology + /// + /// The binary data is separated into sections: + /// + /// - Metadata (HPO and Bindat Version) (see `Ontology::metadata_as_bytes`) + /// - Terms (Names + IDs) (see `HpoTermInternal::as_bytes`) + /// - Term - Parent connection (Child ID - Parent ID) + /// (see `HpoTermInternal::parents_as_byte`) + /// - Genes (Names + IDs + Connected HPO Terms) ([`Gene::as_bytes`]) + /// - OMIM Diseases (Names + IDs + Connected HPO Terms) + /// ([`OmimDisease::as_bytes`]) + /// + /// Every section starts with 4 bytes to indicate its size + /// (big-endian encoded `u32`) + /// + /// This method is only useful if you use are modifying the ontology + /// and want to save data for later re-use. + /// + /// # Panics + /// + /// Panics when the buffer length of any subsegment larger than `u32::MAX` + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// let bytes = ontology.as_bytes(); + /// ``` + pub fn as_bytes(&self) -> Vec { + fn usize_to_u32(n: usize) -> u32 { + n.try_into().expect("unable to convert {n} to u32") + } + let mut res = Vec::new(); + + // Add metadata, version info + res.append(&mut self.metadata_as_bytes()); + + // All HPO Terms + let mut buffer = Vec::new(); + for term in self.hpo_terms.values() { + buffer.append(&mut term.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // All Term - Parent connections + buffer.clear(); + for term in self.hpo_terms.values() { + buffer.append(&mut term.parents_as_byte()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // Genes and Gene-Term connections + buffer.clear(); + for gene in self.genes.values() { + buffer.append(&mut gene.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // OMIM Disease and Disease-Term connections + buffer.clear(); + for omim_disease in self.omim_diseases.values() { + buffer.append(&mut omim_disease.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + // ORPHA Disease and Disease-Term connections + buffer.clear(); + for orpha_disease in self.orpha_diseases.values() { + buffer.append(&mut orpha_disease.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + + res + } + /// Returns the number of HPO-Terms in the Ontology /// /// # Examples @@ -684,8 +754,8 @@ impl Ontology { /// ``` /// use hpo::Ontology; /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// let gene = ontology.gene(&57505u32.into()).unwrap(); - /// assert_eq!(gene.name(), "AARS2"); + /// let gene = ontology.gene(&57492u32.into()).unwrap(); + /// assert_eq!(gene.name(), "ARID1B"); /// ``` pub fn gene(&self, gene_id: &GeneId) -> Option<&Gene> { self.genes.get(gene_id) @@ -706,8 +776,8 @@ impl Ontology { /// use hpo::Ontology; /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let gene = ontology.gene_by_name("AARS2").unwrap(); - /// assert_eq!(gene.name(), "AARS2"); + /// let gene = ontology.gene_by_name("ARID1B").unwrap(); + /// assert_eq!(gene.name(), "ARID1B"); /// /// assert!(ontology.gene_by_name("FOOBAR66").is_none()); /// ``` @@ -740,9 +810,11 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// let disease = ontology.omim_disease(&601495u32.into()).unwrap(); - /// assert_eq!(disease.name(), "Agammaglobulinemia 1, autosomal recessive"); + /// let disease = ontology.omim_disease(&269880u32.into()).unwrap(); + /// assert_eq!(disease.name(), "Short syndrome"); /// ``` pub fn omim_disease(&self, omim_disease_id: &OmimDiseaseId) -> Option<&OmimDisease> { self.omim_diseases.get(omim_disease_id) @@ -756,6 +828,8 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// for disease in ontology.omim_diseases() { /// println!("{}", disease.name()); @@ -774,6 +848,8 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// /// for result in ontology.omim_diseases_by_name("Cystinosis") { @@ -793,6 +869,8 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// /// let cystinosis = ontology.omim_disease_by_name("Cystinosis"); @@ -803,6 +881,53 @@ impl Ontology { .find(|&disease| disease.name().contains(substring)) } + /// Returns a reference to the [`OrphaDisease`] of the provided [`OrphaDiseaseId`] + /// + /// If no such disease is present, `None` is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// let disease = ontology.orpha_disease(&110u32.into()).unwrap(); + /// assert_eq!(disease.name(), "Bardet-Biedl syndrome"); + /// ``` + pub fn orpha_disease(&self, orpha_disease_id: &OrphaDiseaseId) -> Option<&OrphaDisease> { + self.orpha_diseases.get(orpha_disease_id) + } + + /// Returns an Iterator of all [`OrphaDisease`]s from the Ontology + /// + /// It is likely that the return type will change to a dedicated Iterator + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// for disease in ontology.orpha_diseases() { + /// println!("{}", disease.name()); + /// } + /// ``` + pub fn orpha_diseases( + &self, + ) -> std::collections::hash_map::Values<'_, OrphaDiseaseId, OrphaDisease> { + self.orpha_diseases.values() + } + + /// Returns the Jax-Ontology release version + /// + /// e.g. `2023-03-13` + pub fn hpo_version(&self) -> String { + format!( + "{:0>4}-{:0>2}-{:0>2}", + self.hpo_version.0, self.hpo_version.1, self.hpo_version.2, + ) + } + /// Compares `self` to another `Ontology` to identify added/removed terms, genes and diseases /// /// # Examples @@ -1000,6 +1125,10 @@ impl Ontology { terms.insert(self.get_unchecked(parent)); } } + + // The IDs of all Terms that will be present in the new ontology + // This list is used to identify which terms each disease or gene + // should be connected to let ids: HpoGroup = terms.iter().map(|term| *term.id()).collect(); let mut ont = Self::default(); @@ -1019,30 +1148,43 @@ impl Ontology { ont.create_cache(); - // Iterate all genes, check the associated terms and see if one - // is part of the `ids` set + // We only want to add genes and diseases to the ontology that are + // associated with an actual phenotype of the ontology and not just + // with a modifier terms. e.g. the modifier "Recessive inheritance" + // is linked to roughly half of all diseases + let phenotype_ids: HpoGroup = terms + .iter() + .filter(|term| (term.all_parents() & self.modifier()).is_empty()) + .map(|term| *term.id()) + .collect(); + + // Iterate all genes for gene in self.genes() { - let matched_terms = gene.hpo_terms() & &ids; - if matched_terms.is_empty() { + // check if the gene is linked to any phenotype terms + // --> don't include modifier terms here + if (gene.hpo_terms() & &phenotype_ids).is_empty() { continue; } - ont.add_gene( + let gene_id = ont.add_gene( self.gene(gene.id()).ok_or(HpoError::DoesNotExist)?.name(), - *gene.id(), - ); - for term in &matched_terms { - ont.link_gene_term(term, *gene.id())?; - ont.gene_mut(gene.id()) + &gene.id().as_u32().to_string() + )?; + + // Link the gene to every term in the new ontology + // --> also modifier terms + for term in &(gene.hpo_terms() & &ids) { + ont.link_gene_term(term, gene_id)?; + ont.gene_mut(&gene_id) .ok_or(HpoError::DoesNotExist)? .add_term(term); } } - // Iterate all genes, check the associated terms and see if one - // is part of the `ids` set + // Iterate all Omim diseases for omim_disease in self.omim_diseases() { - let matched_terms = omim_disease.hpo_terms() & &ids; - if matched_terms.is_empty() { + // check if the omim_disease is linked to any phenotype terms + // --> don't include modifier terms here + if (omim_disease.hpo_terms() & &phenotype_ids).is_empty() { continue; } let omim_disease_id = ont.add_omim_disease( @@ -1051,13 +1193,41 @@ impl Ontology { .name(), &omim_disease.id().as_u32().to_string(), )?; - for term in &matched_terms { + + // Link the omim_disease to every term in the new ontology + // --> also modifier terms + for term in &(omim_disease.hpo_terms() & &ids) { ont.link_omim_disease_term(term, omim_disease_id)?; ont.omim_disease_mut(&omim_disease_id) .ok_or(HpoError::DoesNotExist)? .add_term(term); } } + + // Iterate all Orpha diseases + for orpha_disease in self.orpha_diseases() { + // check if the orpha_disease is linked to any phenotype terms + // --> don't include modifier terms here + if (orpha_disease.hpo_terms() & &phenotype_ids).is_empty() { + continue; + } + let orpha_disease_id = ont.add_orpha_disease( + self.orpha_disease(orpha_disease.id()) + .ok_or(HpoError::DoesNotExist)? + .name(), + &orpha_disease.id().as_u32().to_string(), + )?; + + // Link the orpha_disease to every term in the new ontology + // --> also modifier terms + for term in &(orpha_disease.hpo_terms() & &ids) { + ont.link_orpha_disease_term(term, orpha_disease_id)?; + ont.orpha_disease_mut(&orpha_disease_id) + .ok_or(HpoError::DoesNotExist)? + .add_term(term); + } + } + ont.calculate_information_content()?; Ok(ont) @@ -1335,7 +1505,7 @@ impl Ontology { } } - /// Add a OMIM disease to the Ontology. and return the [`OmimDiseaseId`] + /// Add an OMIM disease to the Ontology and return the [`OmimDiseaseId`] /// /// If the disease does not yet exist, a new [`OmimDisease`] entity is /// created and stored in the Ontology. @@ -1354,6 +1524,7 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; /// /// let mut ontology = Ontology::default(); /// assert!(ontology.omim_disease(&1u32.into()).is_none()); @@ -1384,6 +1555,56 @@ impl Ontology { } } + /// Add an ORPHA disease to the Ontology and return the [`OrphaDiseaseId`] + /// + /// If the disease does not yet exist, a new [`OrphaDisease`] entity is + /// created and stored in the Ontology. + /// If the disease already exists in the ontology, it is not added again. + /// + /// # Note + /// + /// Adding a disease does not connect it to any HPO terms. + /// Use [`Ontology::link_orpha_disease_term`] for creating connections. + /// + /// # Errors + /// + /// If the `orpha_disease_id` is invalid, an [`HpoError::ParseIntError`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// + /// let mut ontology = Ontology::default(); + /// assert!(ontology.orpha_disease(&1u32.into()).is_none()); + /// + /// ontology.add_orpha_disease("Foo", "1"); + /// + /// // Diseases can be iterated... + /// let mut disease_iterator = ontology.orpha_diseases(); + /// let orpha_disease = disease_iterator.next().unwrap(); + /// assert_eq!(orpha_disease.name(), "Foo"); + /// assert!(disease_iterator.next().is_none()); + /// + /// // .. or accessed directly + /// assert!(ontology.orpha_disease(&1u32.into()).is_some()); + /// ``` + pub fn add_orpha_disease( + &mut self, + orpha_disease_name: &str, + orpha_disease_id: &str, + ) -> HpoResult { + let id = OrphaDiseaseId::try_from(orpha_disease_id)?; + match self.orpha_diseases.entry(id) { + std::collections::hash_map::Entry::Occupied(_) => Ok(id), + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(OrphaDisease::new(id, orpha_disease_name)); + Ok(id) + } + } + } + /// Add the [`Gene`] as annotation to the [`HpoTerm`] /// /// The gene will be recursively connected to all parent `HpoTerms` as well. @@ -1443,7 +1664,7 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; - /// use hpo::annotations::OmimDiseaseId; + /// use hpo::annotations::{Disease, OmimDiseaseId}; /// /// let mut ontology = Ontology::default(); /// ontology.insert_term("Term-Foo".into(), 1u32); @@ -1472,6 +1693,50 @@ impl Ontology { Ok(()) } + /// Add the [`OrphaDisease`] as annotation to the [`HpoTerm`] + /// + /// The disease will be recursively connected to all parent `HpoTerms` as well. + /// + /// This method does not add the HPO-term to the [`OrphaDisease`], this + /// must be handled by the client. + /// + /// # Errors + /// + /// If the HPO term is not present, an [`HpoError`] is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::{Disease, OrphaDiseaseId}; + /// + /// let mut ontology = Ontology::default(); + /// ontology.insert_term("Term-Foo".into(), 1u32); + /// ontology.add_orpha_disease("Foo", "5"); + /// ontology.link_orpha_disease_term(1u32, OrphaDiseaseId::from(5u32)).unwrap(); + /// + /// let term = ontology.hpo(1u32).unwrap(); + /// assert_eq!(term.orpha_diseases().next().unwrap().name(), "Foo"); + /// ``` + pub fn link_orpha_disease_term>( + &mut self, + term_id: I, + orpha_disease_id: OrphaDiseaseId, + ) -> HpoResult<()> { + let term = self.get_mut(term_id).ok_or(HpoError::DoesNotExist)?; + + if term.add_orpha_disease(orpha_disease_id) { + // If the disease is already associated to the term, this branch will + // be skipped. That is desired, because by definition + // all parent terms are already linked as well + let parents = term.all_parents().clone(); + for parent in &parents { + self.link_orpha_disease_term(parent, orpha_disease_id)?; + } + } + Ok(()) + } + /// Returns a mutable reference to the [`Gene`] of the provided [`GeneId`] /// /// If no such gene is present, `None` is returned @@ -1483,10 +1748,10 @@ impl Ontology { /// /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let mut gene = ontology.gene_mut(&57505u32.into()).unwrap(); - /// assert_eq!(gene.hpo_terms().len(), 10); + /// let mut gene = ontology.gene_mut(&2175u32.into()).unwrap(); + /// assert_eq!(gene.hpo_terms().len(), 3); /// gene.add_term(1u32); - /// assert_eq!(gene.hpo_terms().len(), 11); + /// assert_eq!(gene.hpo_terms().len(), 4); /// ``` pub fn gene_mut(&mut self, gene_id: &GeneId) -> Option<&mut Gene> { self.genes.get_mut(gene_id) @@ -1500,10 +1765,11 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::Disease; /// /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let mut disease = ontology.omim_disease_mut(&601495u32.into()).unwrap(); + /// let mut disease = ontology.omim_disease_mut(&269880u32.into()).unwrap(); /// assert_eq!(disease.hpo_terms().len(), 1); /// disease.add_term(1u32); /// assert_eq!(disease.hpo_terms().len(), 2); @@ -1515,6 +1781,27 @@ impl Ontology { self.omim_diseases.get_mut(omim_disease_id) } + /// Returns a mutable reference to the [`OrphaDisease`] of the provided [`OrphaDiseaseId`] + /// + /// If no such disease is present, `None` is returned + /// + /// # Examples + /// + /// ``` + /// use hpo::Ontology; + /// use hpo::annotations::Disease; + /// + /// let mut ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// + /// let mut disease = ontology.orpha_disease_mut(&110u32.into()).unwrap(); + /// assert_eq!(disease.hpo_terms().len(), 1); + /// disease.add_term(1u32); + /// assert_eq!(disease.hpo_terms().len(), 2); + /// ``` + pub fn orpha_disease_mut(&mut self, disease_id: &OrphaDiseaseId) -> Option<&mut OrphaDisease> { + self.orpha_diseases.get_mut(disease_id) + } + /// Calculates the [`crate::term::InformationContent`]s for every term /// /// This method should only be called **after** all terms are added, @@ -1543,6 +1830,7 @@ impl Ontology { pub fn calculate_information_content(&mut self) -> HpoResult<()> { self.calculate_gene_ic()?; self.calculate_omim_disease_ic()?; + self.calculate_orpha_disease_ic()?; Ok(()) } } @@ -1591,7 +1879,7 @@ impl Ontology { bytes.extend_from_slice(&[0x48, 0x50, 0x4f]); // Version - bytes.push(0x2); + bytes.push(0x3); bytes.extend_from_slice(&self.hpo_version.0.to_be_bytes()[..]); bytes.push(self.hpo_version.1); @@ -1703,6 +1991,32 @@ impl Ontology { Ok(()) } + /// Adds [`OrphaDisease`]s to the ontoloigy and connects them to connected terms + /// + /// This method is part of the Ontology-building, based on the binary + /// data format and requires a specified data layout. + /// + /// It connects all connected terms and their parents properly. The + /// method assumes that the bytes encode all Disease-term connections. + /// + /// See [`OrphaDisease::as_bytes`] for explanation of the binary layout + fn add_orpha_disease_from_bytes(&mut self, bytes: &[u8]) -> HpoResult<()> { + let mut idx: usize = 0; + loop { + if idx >= bytes.len() { + break; + } + let disease_len = u32_from_bytes(&bytes[idx..]) as usize; + let disease = OrphaDisease::try_from(&bytes[idx..idx + disease_len])?; + for term in disease.hpo_terms() { + self.link_orpha_disease_term(term, *disease.id())?; + } + self.orpha_diseases.insert(*disease.id(), disease); + idx += disease_len; + } + Ok(()) + } + /// This method is part of the cache creation to link all terms to their /// direct and indirect parents (grandparents) /// @@ -1809,6 +2123,20 @@ impl Ontology { } Ok(()) } + + /// Calculates the Orpha-Disease-specific Information Content for every term + /// + /// If no diseases are present in the Ontology, no IC are calculated + fn calculate_orpha_disease_ic(&mut self) -> HpoResult<()> { + let n_orpha_diseases = self.orpha_diseases.len(); + + for term in self.hpo_terms.values_mut() { + let current_diseases = term.orpha_diseases().len(); + term.information_content_mut() + .set_orpha_disease(n_orpha_diseases, current_diseases)?; + } + Ok(()) + } } /// Iterates the Ontology and yields [`HpoTerm`]s @@ -1911,7 +2239,7 @@ mod test { #[test] fn check_v2_parsing() { - let ont = Ontology::from_binary("tests/example.hpo").unwrap(); + let ont = Ontology::from_binary("tests/example_v2.hpo").unwrap(); assert_eq!(ont.hpo_version, (2023, 1, 31)); assert_eq!(ont.hpo_version(), "2023-01-31"); @@ -1920,7 +2248,7 @@ mod test { #[test] fn compare_v1_v2() { let ont1 = Ontology::from_binary("tests/example_v1.hpo").unwrap(); - let ont2 = Ontology::from_binary("tests/example.hpo").unwrap(); + let ont2 = Ontology::from_binary("tests/example_v2.hpo").unwrap(); let diff = ont1.compare(&ont2); assert_eq!(diff.added_hpo_terms().len(), 0); @@ -1939,9 +2267,9 @@ mod test { #[test] fn diseases_by_name() { let ont = Ontology::from_binary("tests/example.hpo").unwrap(); - assert_eq!(ont.omim_diseases_by_name("Cystinosis").count(), 3); + assert_eq!(ont.omim_diseases_by_name("dysplasia").count(), 10); assert_eq!( - ont.omim_diseases_by_name("Macdermot-Winter syndrome") + ont.omim_diseases_by_name("Fetal iodine deficiency disorder") .count(), 1 ); @@ -1950,18 +2278,23 @@ mod test { 0 ); - let cystinosis = [ - "Cystinosis, adult nonnephropathic", - "Cystinosis, late-onset juvenile or adolescent nephropathic", - "Cystinosis, nephropathic", + let susceptibility = [ + "Encephalopathy, acute, infection-induced, susceptibility to, 3", + "Epidermodysplasia verruciformis, susceptibility to, 1", + "Leprosy, susceptibility to", + "Spondyloarthropathy, susceptibility to, 2", + "Systemic lupus erythematosus, susceptibility to, 6", + "Tinea imbricata, susceptibility to", ]; - assert!(cystinosis.contains(&ont.omim_disease_by_name("Cystinosis").unwrap().name())); + assert!( + susceptibility.contains(&ont.omim_disease_by_name("susceptibility").unwrap().name()) + ); assert_eq!( - ont.omim_disease_by_name("Macdermot-Winter syndrome") + ont.omim_disease_by_name("Fetal iodine deficiency disorder") .unwrap() .name(), - "Macdermot-Winter syndrome" + "Fetal iodine deficiency disorder" ); assert!(ont.omim_disease_by_name("anergictcell syndrome").is_none()); diff --git a/src/ontology/comparison.rs b/src/ontology/comparison.rs index f7dab4b..584ad53 100644 --- a/src/ontology/comparison.rs +++ b/src/ontology/comparison.rs @@ -22,7 +22,7 @@ use std::collections::HashSet; use std::fmt::Display; -use crate::annotations::{Gene, OmimDisease}; +use crate::annotations::{Disease, Gene, OmimDisease, OrphaDisease}; use crate::term::HpoGroup; use crate::{HpoTerm, HpoTermId, Ontology}; @@ -40,7 +40,7 @@ impl<'a> Display for Comparison<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "Version\t{}\t{}\nTerms\t{}\t{}\nGenes\t{}\t{}\nOmim Diseases\t{}\t{}", + "Version\t{}\t{}\nTerms\t{}\t{}\nGenes\t{}\t{}\nOmim Diseases\t{}\t{}\nOrpha Diseases\t{}\t{}", self.lhs.hpo_version(), self.rhs.hpo_version(), self.lhs.len(), @@ -48,7 +48,9 @@ impl<'a> Display for Comparison<'a> { self.lhs.genes().count(), self.rhs.genes().count(), self.lhs.omim_diseases().count(), - self.rhs.omim_diseases().count() + self.rhs.omim_diseases().count(), + self.lhs.orpha_diseases().count(), + self.rhs.orpha_diseases().count() ) } } @@ -167,6 +169,41 @@ impl<'a> Comparison<'a> { }) .collect() } + + /// Returns all [`OrphaDisease`]s that are exclusively in the `new` Ontology + pub fn added_orpha_diseases(&self) -> Vec<&OrphaDisease> { + self.rhs + .orpha_diseases() + .filter(|disease| self.lhs.orpha_disease(disease.id()).is_none()) + .collect() + } + + /// Returns all [`OrphaDisease`]s that are exclusively in the `old` Ontology + pub fn removed_orpha_diseases(&self) -> Vec<&OrphaDisease> { + self.lhs + .orpha_diseases() + .filter(|disease| self.rhs.orpha_disease(disease.id()).is_none()) + .collect() + } + + /// Returns an [`AnnotationDelta`] struct for every [`OrphaDisease`] that is different + /// between the `old` and `new` Ontology. + /// + /// Differences are defined as either: + /// - Changed name + /// - Changed direct associated `HpoTerm`s + pub fn changed_orpha_diseases(&self) -> Vec { + self.lhs + .orpha_diseases() + .filter_map(|disease| { + if let Some(rhs) = self.rhs.orpha_disease(disease.id()) { + AnnotationDelta::disease(disease, rhs) + } else { + None + } + }) + .collect() + } } /// Differences between two [`HpoTerm`]s @@ -307,7 +344,7 @@ impl<'a> AnnotationDelta { /// Constructs a new [`AnnotationDelta`] by comparing two [`OmimDisease`]s /// /// Returns `None` if both are identical - pub fn disease(lhs: &OmimDisease, rhs: &OmimDisease) -> Option { + pub fn disease(lhs: &D, rhs: &D) -> Option { let lhs_terms = lhs.hpo_terms(); let rhs_terms = rhs.hpo_terms(); diff --git a/src/parser.rs b/src/parser.rs index a2d3c4c..82821fe 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -342,6 +342,7 @@ pub(crate) mod gene_to_hpo { /// ``` /// pub(crate) mod disease_to_hpo { + use crate::annotations::Disease; use crate::HpoError; use crate::HpoResult; use crate::HpoTermId; @@ -352,50 +353,48 @@ pub(crate) mod disease_to_hpo { use crate::Ontology; - struct Omim<'a> { + enum DiseaseKind<'a> { + Omim(DiseaseComponents<'a>), + Orpha(DiseaseComponents<'a>), + } + + struct DiseaseComponents<'a> { id: &'a str, name: &'a str, hpo_id: HpoTermId, } - fn parse_line(line: &str) -> HpoResult>> { - // TODO (nice to have): Add check to skip `database_id` header row - // It is not strictly needed, because we're discarding non-OMIM rows - if line.starts_with('#') { - return Ok(None); + fn parse_line(line: &str) -> HpoResult>> { + if line.starts_with("OMIM") { + parse_disease_components(line).map(DiseaseKind::Omim) + } else if line.starts_with("ORPHA") { + parse_disease_components(line).map(DiseaseKind::Orpha) + } else { + Ok(None) } - if !line.starts_with("OMIM") { + } + + fn parse_disease_components(line: &str) -> HpoResul>> { + let cols: Vec<&str> = line.trim().split('\t').collect(); + if cols[2] == "NOT" { return Ok(None); } - let mut cols = line.trim().splitn(5, '\t'); - - let Some(id_col) = cols.next() else { - return Err(HpoError::InvalidInput(line.to_string())); - }; - let Some((_, omim_id)) = id_col.split_once(':') else { - return Err(HpoError::InvalidInput(line.to_string())); - }; - - let Some(omim_name) = cols.next() else { + let Some((_, omim_id)) = cols[0].split_once(':') else { + error!("cannot parse Disease ID from {}", cols[0]); return Err(HpoError::InvalidInput(line.to_string())); }; - - if let Some("NOT") = cols.next() { - return Ok(None); - }; - - let hpo_id = if let Some(id) = cols.next() { - HpoTermId::try_from(id)? - } else { + + let Ok(hpo_id) = HpoTermId::try_from(cols[3]) else { + error!("invalid HPO ID: {}", cols[3]); return Err(HpoError::InvalidInput(line.to_string())); }; - - Ok(Some(Omim { + + Ok(Some(DiseaseComponents { id: omim_id, name: omim_name, hpo_id, - })) + }))) } /// Quick and dirty parser for development and debugging @@ -411,14 +410,26 @@ pub(crate) mod disease_to_hpo { let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); - if let Some(omim) = parse_line(&line)? { - let omim_disease_id = ontology.add_omim_disease(omim.name, omim.id)?; - ontology.link_omim_disease_term(omim.hpo_id, omim_disease_id)?; - - ontology - .omim_disease_mut(&omim_disease_id) - .expect("Omim disease was just added and cannot be missing") - .add_term(omim.hpo_id); + match parse_line(&line)? { + Some(DiseaseKind::Omim(omim)) => { + let omim_disease_id = ontology.add_omim_disease(omim.name, omim.id)?; + ontology.link_omim_disease_term(omim.hpo_id, omim_disease_id)?; + + ontology + .omim_disease_mut(&omim_disease_id) + .ok_or(HpoError::DoesNotExist)? + .add_term(omim.hpo_id); + } + Some(DiseaseKind::Orpha(orpha)) => { + let orpha_disease_id = ontology.add_orpha_disease(orpha.name, orpha.id)?; + ontology.link_orpha_disease_term(orpha.hpo_id, orpha_disease_id)?; + + ontology + .orpha_disease_mut(&orpha_disease_id) + .ok_or(HpoError::DoesNotExist)? + .add_term(orpha.hpo_id); + } + _ => {} } } Ok(()) diff --git a/src/parser/binary.rs b/src/parser/binary.rs index 4975d7b..d6174dc 100644 --- a/src/parser/binary.rs +++ b/src/parser/binary.rs @@ -6,7 +6,7 @@ //! to update their versions to the newest one. pub(crate) mod ontology; pub(crate) mod term; -use std::fmt::Display; +use std::{cmp::Ordering, fmt::Display}; use crate::{term::internal::HpoTermInternal, HpoError}; @@ -14,6 +14,7 @@ use crate::{term::internal::HpoTermInternal, HpoError}; pub(crate) enum BinaryVersion { V1, V2, + V3, } impl TryFrom for BinaryVersion { @@ -22,6 +23,7 @@ impl TryFrom for BinaryVersion { match value { 1u8 => Ok(BinaryVersion::V1), 2u8 => Ok(BinaryVersion::V2), + 3u8 => Ok(BinaryVersion::V3), _ => Err(HpoError::NotImplemented), } } @@ -35,11 +37,34 @@ impl Display for BinaryVersion { match self { BinaryVersion::V1 => "1", BinaryVersion::V2 => "2", + BinaryVersion::V3 => "3", } ) } } +impl From<&BinaryVersion> for u8 { + fn from(value: &BinaryVersion) -> Self { + match *value { + BinaryVersion::V1 => 1, + BinaryVersion::V2 => 2, + BinaryVersion::V3 => 3, + } + } +} + +impl PartialOrd for BinaryVersion { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for BinaryVersion { + fn cmp(&self, other: &Self) -> Ordering { + u8::from(self).cmp(&u8::from(other)) + } +} + #[derive(Debug, Clone, Copy)] pub(crate) struct Bytes<'a> { data: &'a [u8], diff --git a/src/parser/binary/ontology.rs b/src/parser/binary/ontology.rs index 77b906a..4e01d84 100644 --- a/src/parser/binary/ontology.rs +++ b/src/parser/binary/ontology.rs @@ -9,6 +9,7 @@ pub(crate) fn version(bytes: &[u8]) -> HpoResult { if bytes[0..3] == [0x48, 0x50, 0x4f] { match bytes[3] { + 3u8 => Ok(Bytes::new(&bytes[4..], super::BinaryVersion::V3)), 2u8 => Ok(Bytes::new(&bytes[4..], super::BinaryVersion::V2)), _ => Err(HpoError::NotImplemented), } diff --git a/src/set.rs b/src/set.rs index 2fec0f2..1fc5485 100644 --- a/src/set.rs +++ b/src/set.rs @@ -59,7 +59,7 @@ use crate::Ontology; /// StandardCombiner::default() /// ); /// -/// assert_eq!(similarity, 0.8177036); +/// assert_eq!(similarity, 0.695935); /// ``` #[must_use] pub struct HpoSet<'a> { @@ -468,8 +468,8 @@ impl<'a> HpoSet<'a> { /// hpos.insert(12639u32); /// hpos.insert(818u32); /// let set = HpoSet::new(&ontology, hpos); - /// // `ABCD1 (HGNC:215)` is linked to `HP:0012639` - /// assert!(set.gene_ids().contains(&215u32.into())); + /// // `KRAS (HGNC:3845)` is linked to `HP:0012639` + /// assert!(set.gene_ids().contains(&3845u32.into())); /// ``` pub fn gene_ids(&self) -> Genes { self.group @@ -583,7 +583,7 @@ impl<'a> HpoSet<'a> { /// hpos.insert(818u32); /// hpos.insert(2715u32); /// let set = HpoSet::new(&ontology, hpos); - /// assert_eq!(set.information_content().unwrap().gene(), 0.14216587); + /// assert_eq!(set.information_content().unwrap().gene(), 0.22111309); /// ``` pub fn information_content(&self) -> HpoResult { let n_diseases = self.ontology.omim_diseases().len(); @@ -640,7 +640,7 @@ impl<'a> HpoSet<'a> { /// StandardCombiner::default() /// ); /// - /// assert_eq!(similarity, 0.8177036); + /// assert_eq!(similarity, 0.695935); /// ``` pub fn similarity( &self, @@ -759,8 +759,7 @@ mod test { Builtins::GraphIc(InformationContentKind::Omim), StandardCombiner::default(), ); - - assert!((similarity - 0.817_703_6).abs() < f32::EPSILON); + assert!((similarity - 0.695_935).abs() < f32::EPSILON); } #[test] diff --git a/src/similarity.rs b/src/similarity.rs index 6910b03..1598662 100644 --- a/src/similarity.rs +++ b/src/similarity.rs @@ -312,7 +312,7 @@ impl SimilarityCombiner for StandardCombiner { /// StandardCombiner::default() /// ); /// -/// assert_eq!(similarity, 0.8177036); +/// assert_eq!(similarity, 0.695935); /// ``` /// /// ## Using `GroupSimilarity` directly @@ -354,7 +354,7 @@ impl SimilarityCombiner for StandardCombiner { /// Builtins::GraphIc(InformationContentKind::Omim) /// ); /// -/// assert_eq!(sim.calculate(&set_1, &set_2), 0.8177036); +/// assert_eq!(sim.calculate(&set_1, &set_2), 0.695935); /// ``` pub struct GroupSimilarity { combiner: C, @@ -456,9 +456,9 @@ impl Default for GroupSimilarity { pub enum Builtins { /// [Distance](`Distance`) - based similarity Distance(InformationContentKind), - /// [GraphIc](`GraphIc`) - based similarity + /// [`GraphIc`](`GraphIc`) - based similarity GraphIc(InformationContentKind), - /// [InformationCoefficient](`InformationCoefficient`) - based similarity + /// [`InformationCoefficient`](`InformationCoefficient`) - based similarity InformationCoefficient(InformationContentKind), /// [Jiang & Conrath](`Jc`) - based similarity Jc(InformationContentKind), diff --git a/src/similarity/defaults.rs b/src/similarity/defaults.rs index a3f6a08..b8ccb3c 100644 --- a/src/similarity/defaults.rs +++ b/src/similarity/defaults.rs @@ -3,6 +3,9 @@ //! //! All of the algorithms can also be accessed via [`crate::similarity::Builtins`] +use std::collections::HashSet; +use std::hash::Hash; + use crate::similarity::{usize_to_f32, Similarity}; use crate::term::InformationContentKind; use crate::HpoTerm; @@ -342,11 +345,25 @@ impl Mutation { } fn omim_disease_similarity(a: &HpoTerm, b: &HpoTerm) -> f32 { - let omim_diseases_a = a.omim_disease_ids(); - let omim_diseases_b = b.omim_disease_ids(); + let diseases_a = a.omim_disease_ids(); + let diseases_b = b.omim_disease_ids(); + + Self::disease_similarity(diseases_a, diseases_b) + } + + fn orpha_disease_similarity(a: &HpoTerm, b: &HpoTerm) -> f32 { + let diseases_a = a.orpha_disease_ids(); + let diseases_b = b.orpha_disease_ids(); + + Self::disease_similarity(diseases_a, diseases_b) + } - let all = omim_diseases_a | omim_diseases_b; - let common = omim_diseases_a & omim_diseases_b; + fn disease_similarity( + disease_a: &HashSet, + disease_b: &HashSet, + ) -> f32 { + let all = disease_a | disease_b; + let common = disease_a & disease_b; if all.is_empty() { return 0.0; @@ -364,6 +381,7 @@ impl Similarity for Mutation { match self.kind { InformationContentKind::Gene => Mutation::gene_similarity(a, b), InformationContentKind::Omim => Mutation::omim_disease_similarity(a, b), + InformationContentKind::Orpha => Mutation::orpha_disease_similarity(a, b), } } } diff --git a/src/stats.rs b/src/stats.rs index d3c7f4d..df0688d 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -12,9 +12,9 @@ //! In addition, it provides methods for [hierarchical clustering](`linkage::Linkage`) of `HpoSet`s. use std::collections::HashMap; -use std::hash::Hash; +use std::marker::PhantomData; -use crate::annotations::{GeneId, OmimDiseaseId}; +use crate::annotations::{AnnotationId, Disease, GeneId, OmimDiseaseId}; use crate::HpoTerm; pub mod hypergeom; @@ -84,50 +84,77 @@ impl Enrichment { } } +impl Enrichment { + /// Constructs an `Enrichment` for any annotated item (Gene, Disease) + pub fn annotation(annotation: T, pvalue: f64, count: u64, enrichment: f64) -> Self { + Self { + annotation, + pvalue, + count, + enrichment, + } + } +} + struct SampleSet { - /// The total number of HpoTerms in the full set + /// The total number of `HpoTerms` in the full set size: u64, - /// A map containing the counts of each Gene/Disease in the SampleSet - counts: HashMap, + /// A map containing the counts of each Gene/Disease in the `SampleSet` + counts: HashMap, + phantom: PhantomData, +} + +fn calculate_counts< + 'a, + U: FnMut(HpoTerm<'a>) -> IT, + I: IntoIterator>, + IT: IntoIterator, +>( + terms: I, + mut iter: U, +) -> (u64, HashMap) { + let mut size = 0u64; + let mut counts: HashMap = HashMap::new(); + for term in terms { + size += 1; + for id in iter(term) { + counts + .entry(id) + .and_modify(|count| *count += 1) + .or_insert(1); + } + } + (size, counts) } impl<'a> SampleSet { /// Constructs a new [`SampleSet`] with gene counts from an iterator of [`HpoTerm`]s - pub fn gene>>(iter: I) -> Self { - let mut size = 0u64; - let mut counts: HashMap = HashMap::new(); - for term in iter { - size += 1; - for gene in term.genes() { - counts - .entry(*gene.id()) - .and_modify(|count| *count += 1) - .or_insert(1); - } + pub fn gene>>(terms: I) -> Self { + let term2geneid = |term: HpoTerm<'a>| term.genes().map(|d| d.id().as_u32()); + + let (size, counts) = calculate_counts(terms, term2geneid); + Self { + size, + counts, + phantom: PhantomData, } - Self { size, counts } } } impl<'a> SampleSet { /// Constructs a new `SampleSet` with disease counts from an iterator of [`HpoTerm`]s - pub fn disease>>(ontology: I) -> Self { - let mut size = 0u64; - let mut counts: HashMap = HashMap::new(); - for term in ontology { - size += 1; - for disease in term.omim_diseases() { - counts - .entry(*disease.id()) - .and_modify(|count| *count += 1) - .or_insert(1); - } + pub fn disease>>(terms: I) -> Self { + let term2omimid = |term: HpoTerm<'a>| term.omim_diseases().map(|d| d.id().as_u32()); + let (size, counts) = calculate_counts(terms, term2omimid); + Self { + size, + counts, + phantom: PhantomData, } - Self { size, counts } } } -impl SampleSet { +impl SampleSet { /// Returns the total number of [`HpoTerm`]s in the [`SampleSet`] fn len(&self) -> u64 { self.size @@ -140,16 +167,13 @@ impl SampleSet { fn is_empty(&self) -> bool { self.size == 0 } -} - -impl SampleSet { /// The number of terms in the `SampleSet` that are connected to the given gene or disease /// /// Terms can be directly or indirectly connected to the gene/disease. /// /// Returns `None` if the key is not present fn get(&self, key: &T) -> Option<&u64> { - self.counts.get(key) + self.counts.get(&key.as_u32()) } /// Returns the frequency of terms that are connected to the given gene or disease @@ -161,61 +185,74 @@ impl SampleSet { #[allow(dead_code)] fn frequency(&self, key: &T) -> Option { self.counts - .get(key) + .get(&key.as_u32()) .map(|count| f64_from_u64(*count) / f64_from_u64(self.size)) } /// An iterator of [`SampleSet::frequency`] values, along with their key #[allow(dead_code)] fn frequencies(&self) -> Frequencies { - Frequencies::new(self.counts.iter(), self.size) + Frequencies::new(self.counts.iter(), self.size, self.phantom) } } -impl<'a, T: Clone> IntoIterator for &'a SampleSet { +impl<'a, T: AnnotationId> IntoIterator for &'a SampleSet { type Item = (T, u64); type IntoIter = Counts<'a, T>; fn into_iter(self) -> Self::IntoIter { - Counts::new(self.counts.iter()) + Counts::new(self.counts.iter(), self.phantom) } } /// An iterator of [`SampleSet::frequency`] values, along with their key struct Frequencies<'a, K> { - inner: std::collections::hash_map::Iter<'a, K, u64>, + inner: std::collections::hash_map::Iter<'a, u32, u64>, total: u64, + phantom: PhantomData, } impl<'a, K> Frequencies<'a, K> { - pub fn new(inner: std::collections::hash_map::Iter<'a, K, u64>, total: u64) -> Self { - Self { inner, total } + pub fn new( + inner: std::collections::hash_map::Iter<'a, u32, u64>, + total: u64, + phantom: PhantomData, + ) -> Self { + Self { + inner, + total, + phantom, + } } } -impl Iterator for Frequencies<'_, K> { +impl Iterator for Frequencies<'_, K> { type Item = (K, f64); fn next(&mut self) -> Option { self.inner .next() - .map(|(k, v)| (k.clone(), f64_from_u64(*v) / f64_from_u64(self.total))) + .map(|(k, v)| (K::from(*k), f64_from_u64(*v) / f64_from_u64(self.total))) } } /// An iterator of [`SampleSet::frequency`] values, along with their key struct Counts<'a, K> { - inner: std::collections::hash_map::Iter<'a, K, u64>, + inner: std::collections::hash_map::Iter<'a, u32, u64>, + phantom: PhantomData, } impl<'a, K> Counts<'a, K> { - pub fn new(inner: std::collections::hash_map::Iter<'a, K, u64>) -> Self { - Self { inner } + pub fn new( + inner: std::collections::hash_map::Iter<'a, u32, u64>, + phantom: PhantomData, + ) -> Self { + Self { inner, phantom } } } -impl Iterator for Counts<'_, K> { +impl Iterator for Counts<'_, K> { type Item = (K, u64); fn next(&mut self) -> Option { - self.inner.next().map(|(k, v)| (k.clone(), *v)) + self.inner.next().map(|(k, v)| (K::from(*k), *v)) } } @@ -246,18 +283,26 @@ mod test { #[test] fn iterate_frequencies() { let mut map = HashMap::new(); - map.insert(String::from("foo"), 12u64); - map.insert(String::from("bar"), 21u64); + map.insert(12u32, 12u64); + map.insert(21u32, 21u64); - let mut iter = Frequencies::new(map.iter(), 3); + let mut iter: Frequencies<'_, OmimDiseaseId> = Frequencies::new(map.iter(), 3, PhantomData); match iter.next() { - Some((key, x)) if key == "foo" => assert!((x - 4.0).abs() < f64::EPSILON), - Some((key, x)) if key == "bar" => assert!((x - 7.0).abs() < f64::EPSILON), + Some((key, x)) if key == OmimDiseaseId::from(12) => { + assert!((x - 4.0).abs() < f64::EPSILON); + } + Some((key, x)) if key == OmimDiseaseId::from(21) => { + assert!((x - 7.0).abs() < f64::EPSILON); + } _ => panic!("invalid"), } match iter.next() { - Some((key, x)) if key == "foo" => assert!((x - 4.0).abs() < f64::EPSILON), - Some((key, x)) if key == "bar" => assert!((x - 7.0).abs() < f64::EPSILON), + Some((key, x)) if key == OmimDiseaseId::from(12) => { + assert!((x - 4.0).abs() < f64::EPSILON); + } + Some((key, x)) if key == OmimDiseaseId::from(21) => { + assert!((x - 7.0).abs() < f64::EPSILON); + } _ => panic!("invalid"), } assert!(iter.next().is_none()); @@ -266,18 +311,18 @@ mod test { #[test] fn iterate_counts() { let mut map = HashMap::new(); - map.insert(String::from("foo"), 12u64); - map.insert(String::from("bar"), 21u64); + map.insert(12u32, 12u64); + map.insert(21u32, 21u64); - let mut iter = Counts::new(map.iter()); + let mut iter: Counts<'_, OmimDiseaseId> = Counts::new(map.iter(), PhantomData); match iter.next() { - Some((key, x)) if key == "foo" => assert_eq!(x, 12), - Some((key, x)) if key == "bar" => assert_eq!(x, 21), + Some((key, x)) if key == OmimDiseaseId::from(12) => assert_eq!(x, 12), + Some((key, x)) if key == OmimDiseaseId::from(21) => assert_eq!(x, 21), _ => panic!("invalid"), } match iter.next() { - Some((key, x)) if key == "foo" => assert_eq!(x, 12), - Some((key, x)) if key == "bar" => assert_eq!(x, 21), + Some((key, x)) if key == OmimDiseaseId::from(12) => assert_eq!(x, 12), + Some((key, x)) if key == OmimDiseaseId::from(21) => assert_eq!(x, 21), _ => panic!("invalid"), } assert!(iter.next().is_none()); diff --git a/src/stats/hypergeom.rs b/src/stats/hypergeom.rs index cd92354..620cda6 100644 --- a/src/stats/hypergeom.rs +++ b/src/stats/hypergeom.rs @@ -49,7 +49,7 @@ //! //! let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); //! -//! let gene = ontology.gene_by_name("EZH2").unwrap(); +//! let gene = ontology.gene_by_name("KRAS").unwrap(); //! let gene_hpo_set = gene.to_hpo_set(&ontology); //! //! let mut enrichments = gene_enrichment(&ontology, &gene_hpo_set); diff --git a/src/stats/hypergeom/disease.rs b/src/stats/hypergeom/disease.rs index f29a86c..14e7d2a 100644 --- a/src/stats/hypergeom/disease.rs +++ b/src/stats/hypergeom/disease.rs @@ -16,7 +16,7 @@ use crate::HpoTerm; /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// -/// let gene = ontology.gene_by_name("EZH2").unwrap(); +/// let gene = ontology.gene_by_name("KRAS").unwrap(); /// let gene_hpo_set = gene.to_hpo_set(&ontology); /// /// let mut enrichments = disease_enrichment(&ontology, &gene_hpo_set); @@ -63,7 +63,7 @@ where let pvalue = hyper.sf(observed_successes - 1); let enrichment = (f64_from_u64(observed_successes) / f64_from_u64(sample_set.len())) / (f64_from_u64(*successes) / f64_from_u64(background.len())); - res.push(Enrichment::disease( + res.push(Enrichment::annotation( disease, pvalue, observed_successes, diff --git a/src/stats/hypergeom/gene.rs b/src/stats/hypergeom/gene.rs index 87432c7..4d8c90d 100644 --- a/src/stats/hypergeom/gene.rs +++ b/src/stats/hypergeom/gene.rs @@ -16,7 +16,7 @@ use crate::HpoTerm; /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// -/// let gene = ontology.gene_by_name("EZH2").unwrap(); +/// let gene = ontology.gene_by_name("KRAS").unwrap(); /// let gene_hpo_set = gene.to_hpo_set(&ontology); /// /// let mut enrichments = gene_enrichment(&ontology, &gene_hpo_set); diff --git a/src/stats/linkage.rs b/src/stats/linkage.rs index bda60a9..f4d584d 100644 --- a/src/stats/linkage.rs +++ b/src/stats/linkage.rs @@ -75,18 +75,18 @@ impl<'a> DistanceMatrix { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// let sets = vec![ -/// ontology.gene_by_name("GBA1").unwrap().to_hpo_set(&ontology), -/// ontology.gene_by_name("BRCA2").unwrap().to_hpo_set(&ontology), -/// ontology.gene_by_name("EZH2").unwrap().to_hpo_set(&ontology), -/// ontology.gene_by_name("DMD").unwrap().to_hpo_set(&ontology), +/// ontology.gene_by_name("KRAS").unwrap().to_hpo_set(&ontology), +/// ontology.gene_by_name("WDR45").unwrap().to_hpo_set(&ontology), +/// ontology.gene_by_name("TP53").unwrap().to_hpo_set(&ontology), +/// ontology.gene_by_name("CLCN7").unwrap().to_hpo_set(&ontology), /// ]; /// /// /// let mut cluster = Linkage::union(sets, distance).into_cluster(); /// let first = cluster.next().unwrap(); -/// println!("{:?}", first); +/// // println!("{:?}", first); /// // Cluster { idx1: 0, idx2: 3, distance: 0.008127391, size: 2 } -/// assert_eq!(cluster.next().unwrap().len(), 3); +/// assert_eq!(cluster.next().unwrap().len(), 2); /// assert_eq!(cluster.next().unwrap().len(), 4); /// assert!(cluster.next().is_none()); /// ``` @@ -129,18 +129,18 @@ impl<'a> Linkage<'a> { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// let sets = vec![ - /// ontology.gene_by_name("GBA1").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("BRCA2").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("EZH2").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("DMD").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("KRAS").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("WDR45").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("TP53").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("CLCN7").unwrap().to_hpo_set(&ontology), /// ]; /// /// /// let mut cluster = Linkage::union(sets, distance).into_cluster(); /// let first = cluster.next().unwrap(); /// println!("{:?}", first); - /// // Cluster { idx1: 0, idx2: 3, distance: 0.008127391, size: 2 } - /// assert_eq!(cluster.next().unwrap().len(), 3); + /// // Cluster { idx1: 0, idx2: 3, distance: 0.16666663, size: 2 } + /// assert_eq!(cluster.next().unwrap().len(), 2); /// assert_eq!(cluster.next().unwrap().len(), 4); /// assert!(cluster.next().is_none()); /// ``` @@ -182,18 +182,18 @@ impl<'a> Linkage<'a> { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// let sets = vec![ - /// ontology.gene_by_name("GBA1").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("BRCA2").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("EZH2").unwrap().to_hpo_set(&ontology), - /// ontology.gene_by_name("DMD").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("KRAS").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("WDR45").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("TP53").unwrap().to_hpo_set(&ontology), + /// ontology.gene_by_name("CLCN7").unwrap().to_hpo_set(&ontology), /// ]; /// /// /// let mut cluster = Linkage::single(sets, distance).into_cluster(); /// let first = cluster.next().unwrap(); /// println!("{:?}", first); - /// // Cluster { idx1: 0, idx2: 3, distance: 0.008127391, size: 2 } - /// assert_eq!(cluster.next().unwrap().len(), 3); + /// // Cluster { idx1: 0, idx2: 3, distance: 0.16666663, size: 2 } + /// assert_eq!(cluster.next().unwrap().len(), 2); /// assert_eq!(cluster.next().unwrap().len(), 4); /// assert!(cluster.next().is_none()); /// ``` @@ -297,16 +297,25 @@ impl<'a> Linkage<'a> { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let genes = vec!["GBA1", "BRCA2", "EZH2", "DMD"]; + /// let genes = vec!["KRAS", "WDR45", "ZNRF3", "CLCN7"]; /// let sets = genes.iter().map(|gene| ontology.gene_by_name(gene).unwrap().to_hpo_set(&ontology)); /// - /// let linkage = Linkage::union(sets, distance); + /// let linkage = Linkage::single(sets, distance); /// let indicies = linkage.indicies(); - /// assert_eq!(indicies, vec![0usize, 3usize, 2usize, 1usize]); + /// + /// // Similarities: + /// // CLCN7 WDR45 0.8333334 + /// // CLCN7 ZNRF3 0.66666 + /// // KRAS WDR45 0.5 + /// // KRAS CLCN7 0.4166667 + /// // KRAS ZNRF3 0 + /// // WDR45 ZNRF3 0 + /// + /// assert_eq!(indicies, vec![1usize, 3usize, 2usize, 0usize]); /// for idx in indicies { /// print!("{} ", genes[idx]); /// } - /// // "GBA1 DMD EZH2 BRCA2" + /// // "KRAS ZNRF3 WDR45 CLCN7" /// ``` pub fn indicies(&self) -> Vec { let mut res = Vec::with_capacity(self.initial_len); diff --git a/src/term/hpoterm.rs b/src/term/hpoterm.rs index de9d673..9642d0a 100644 --- a/src/term/hpoterm.rs +++ b/src/term/hpoterm.rs @@ -2,6 +2,8 @@ use crate::annotations::GeneIterator; use crate::annotations::Genes; use crate::annotations::OmimDiseaseIterator; use crate::annotations::OmimDiseases; +use crate::annotations::OrphaDiseaseIterator; +use crate::annotations::OrphaDiseases; use crate::similarity::Similarity; use crate::term::internal::HpoTermInternal; use crate::term::HpoGroup; @@ -30,6 +32,7 @@ pub struct HpoTerm<'a> { children: &'a HpoGroup, genes: &'a Genes, omim_diseases: &'a OmimDiseases, + orpha_diseases: &'a OrphaDiseases, information_content: &'a InformationContent, obsolete: bool, replaced_by: Option, @@ -80,6 +83,7 @@ impl<'a> HpoTerm<'a> { children: term.children(), genes: term.genes(), omim_diseases: term.omim_diseases(), + orpha_diseases: term.orpha_diseases(), information_content: term.information_content(), obsolete: term.obsolete(), replaced_by: term.replacement(), @@ -685,8 +689,8 @@ impl<'a> HpoTerm<'a> { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let term = ontology.hpo(11017u32).unwrap(); - /// assert_eq!(term.gene_ids().len(), 575); + /// let term = ontology.hpo(12638u32).unwrap(); + /// assert_eq!(term.gene_ids().len(), 41); /// ``` pub fn gene_ids(&self) -> &Genes { self.genes @@ -698,6 +702,7 @@ impl<'a> HpoTerm<'a> { /// /// ``` /// use hpo::{HpoTerm, Ontology}; + /// use hpo::annotations::Disease; /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// @@ -720,12 +725,48 @@ impl<'a> HpoTerm<'a> { /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// /// let term = ontology.hpo(1939u32).unwrap(); - /// assert_eq!(term.omim_disease_ids().len(), 143); + /// assert_eq!(term.omim_disease_ids().len(), 93); /// ``` pub fn omim_disease_ids(&self) -> &OmimDiseases { self.omim_diseases } + /// Returns an iterator of all associated [`crate::annotations::OrphaDisease`]s + /// + /// # Examples + /// + /// ``` + /// use hpo::{HpoTerm, Ontology}; + /// use hpo::annotations::Disease; + /// + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// + /// let term = ontology.hpo(11017u32).unwrap(); + /// for disease in term.orpha_diseases() { + /// println!("{}", disease.name()); + /// } + /// ``` + pub fn orpha_diseases(&self) -> OrphaDiseaseIterator<'a> { + OrphaDiseaseIterator::new(self.orpha_diseases, self.ontology) + } + + /// Returns the set of all associated [`crate::annotations::OrphaDisease`]s + /// + /// # Examples + /// + /// ``` + /// use hpo::{HpoTerm, Ontology}; + /// use hpo::annotations::Disease; + /// + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// + /// let term = ontology.hpo(1939u32).unwrap(); + /// assert_eq!(term.orpha_disease_ids().len(), 32); + /// ``` + pub fn orpha_disease_ids(&self) -> &OrphaDiseases { + self.orpha_diseases + } + /// Returns the [`InformationContent`] of the term /// /// # Examples @@ -737,8 +778,9 @@ impl<'a> HpoTerm<'a> { /// /// let term = ontology.hpo(1939u32).unwrap(); /// let ic = term.information_content(); - /// assert_eq!(ic.gene(), 0.6816717); - /// assert_eq!(ic.omim_disease(), 3.4335358); + /// assert_eq!(ic.gene(), 1.9442855); + /// assert_eq!(ic.omim_disease(), 0.4578331); + /// assert_eq!(ic.orpha_disease(), 2.2994552); /// ``` pub fn information_content(&self) -> &InformationContent { self.information_content @@ -755,11 +797,11 @@ impl<'a> HpoTerm<'a> { /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// - /// let term1 = ontology.hpo(11017u32).unwrap(); - /// let term2 = ontology.hpo(12639u32).unwrap(); + /// let term1 = ontology.hpo(12638u32).unwrap(); // Abnormal nervous system physiology + /// let term2 = ontology.hpo(100547u32).unwrap(); // Abnormal forebrain morphology /// /// let sim = term1.similarity_score(&term2, &Builtins::GraphIc(InformationContentKind::Omim)); - /// assert_eq!(sim, 0.2765914); + /// assert_eq!(sim, 0.112043366); /// ``` pub fn similarity_score(&self, other: &HpoTerm, similarity: &impl Similarity) -> f32 { similarity.calculate(self, other) diff --git a/src/term/information_content.rs b/src/term/information_content.rs index 8897a0e..2d5a387 100644 --- a/src/term/information_content.rs +++ b/src/term/information_content.rs @@ -13,6 +13,7 @@ use crate::{f32_from_usize, HpoResult}; pub struct InformationContent { gene: f32, omim: f32, + orpha: f32, } impl InformationContent { @@ -36,11 +37,22 @@ impl InformationContent { &mut self.omim } + /// The ORPHA-disease-specific information content + pub fn orpha_disease(&self) -> f32 { + self.orpha + } + + /// A mutable reference to the ORPHA-disease-specific information content + pub fn orpha_disease_mut(&mut self) -> &mut f32 { + &mut self.orpha + } + /// Returns the information content of the provided kind pub fn get_kind(&self, kind: &InformationContentKind) -> f32 { match kind { InformationContentKind::Gene => self.gene(), InformationContentKind::Omim => self.omim_disease(), + InformationContentKind::Orpha => self.orpha_disease(), } } @@ -65,7 +77,7 @@ impl InformationContent { Ok(()) } - /// Calculates and caches the gene `InformationContent` + /// Calculates and caches the OMIM `InformationContent` /// /// # Errors /// @@ -75,6 +87,17 @@ impl InformationContent { self.omim = Self::calculate(total, current)?; Ok(()) } + + /// Calculates and caches the ORPHA `InformationContent` + /// + /// # Errors + /// + /// This method returns an error if there are more ORPHA diseases than `u16::MAX` + /// because larger numbers can't be safely converted to `f32` + pub fn set_orpha_disease(&mut self, total: usize, current: usize) -> HpoResult<()> { + self.orpha = Self::calculate(total, current)?; + Ok(()) + } } /// Different types of information contents @@ -84,4 +107,6 @@ pub enum InformationContentKind { Gene, /// Information content related to the associated OMIM-diseases Omim, + /// Information content related to the associated ORPHA-diseases + Orpha, } diff --git a/src/term/internal.rs b/src/term/internal.rs index 23e6d7f..b9172a6 100644 --- a/src/term/internal.rs +++ b/src/term/internal.rs @@ -1,4 +1,5 @@ -use crate::annotations::AnnotationId; +use crate::annotations::OrphaDiseases; +use crate::annotations::{AnnotationId, OrphaDiseaseId}; use crate::parser::binary::term::{from_bytes_v1, from_bytes_v2}; use crate::parser::binary::{BinaryVersion, Bytes}; use std::hash::Hash; @@ -9,7 +10,7 @@ use crate::term::{HpoGroup, HpoTermId, InformationContent}; use crate::DEFAULT_NUM_PARENTS; use crate::{HpoError, DEFAULT_NUM_GENES}; use crate::{HpoResult, DEFAULT_NUM_ALL_PARENTS}; -use crate::{HpoTerm, DEFAULT_NUM_OMIM}; +use crate::{HpoTerm, DEFAULT_NUM_OMIM, DEFAULT_NUM_ORPHA}; #[derive(Clone, Debug)] pub(crate) struct HpoTermInternal { @@ -20,6 +21,7 @@ pub(crate) struct HpoTermInternal { children: HpoGroup, genes: Genes, omim_diseases: OmimDiseases, + orpha_diseases: OrphaDiseases, ic: InformationContent, obsolete: bool, replacement: Option, @@ -47,6 +49,7 @@ impl HpoTermInternal { children: HpoGroup::with_capacity(DEFAULT_NUM_PARENTS), genes: Genes::with_capacity(DEFAULT_NUM_GENES), omim_diseases: OmimDiseases::with_capacity(DEFAULT_NUM_OMIM), + orpha_diseases: OrphaDiseases::with_capacity(DEFAULT_NUM_ORPHA), ic: InformationContent::default(), obsolete: false, replacement: None, @@ -90,6 +93,10 @@ impl HpoTermInternal { &self.omim_diseases } + pub fn orpha_diseases(&self) -> &OrphaDiseases { + &self.orpha_diseases + } + pub fn parents_cached(&self) -> bool { if self.parents.is_empty() { true @@ -114,6 +121,10 @@ impl HpoTermInternal { self.omim_diseases.insert(omim_disease_id) } + pub fn add_orpha_disease(&mut self, orpha_disease_id: OrphaDiseaseId) -> bool { + self.orpha_diseases.insert(orpha_disease_id) + } + pub fn information_content(&self) -> &InformationContent { &self.ic } @@ -243,7 +254,7 @@ impl TryFrom> for HpoTermInternal { fn try_from(bytes: Bytes) -> Result { match bytes.version() { BinaryVersion::V1 => from_bytes_v1(bytes), - BinaryVersion::V2 => from_bytes_v2(bytes), + _ => from_bytes_v2(bytes), } } } diff --git a/tests/example.hpo b/tests/example.hpo index c3d8f3c..98d2d2c 100644 Binary files a/tests/example.hpo and b/tests/example.hpo differ diff --git a/tests/example_v2.hpo b/tests/example_v2.hpo new file mode 100644 index 0000000..c3d8f3c Binary files /dev/null and b/tests/example_v2.hpo differ diff --git a/tests/ontology.hpo b/tests/ontology.hpo index 9c5dfae..125a690 100644 Binary files a/tests/ontology.hpo and b/tests/ontology.hpo differ