diff --git a/CHANGELOG.md b/CHANGELOG.md index bedc6fe..00fb226 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. ## Unreleased +## [0.10.1] + +### Refactor + +- Add some missing methods for Orpha diseases + + ## [0.10.0] ### Feature diff --git a/Cargo.toml b/Cargo.toml index dde9d34..31894f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hpo" -version = "0.10.0" +version = "0.10.1" edition = "2021" authors = ["Jonas Marcello "] description = "Human Phenotype Ontology Similarity" diff --git a/examples/bench_disease_enrichment.rs b/examples/bench_disease_enrichment.rs index c7ef249..0b13c54 100644 --- a/examples/bench_disease_enrichment.rs +++ b/examples/bench_disease_enrichment.rs @@ -5,7 +5,7 @@ use std::path::Path; use rayon::prelude::*; -use hpo::stats::hypergeom::disease_enrichment; +use hpo::stats::hypergeom::omim_disease_enrichment; use hpo::Ontology; const GENES: [&str; 6] = ["GBA1", "NPC1", "EZH2", "DMD", "MUC7", "ARID1B"]; @@ -34,7 +34,7 @@ fn multi_threaded_enrichment(ontology: &Ontology) -> usize { .unwrap_or_else(|| panic!("{gene} does not exist")) .to_hpo_set(ontology); total - + disease_enrichment(ontology, &set) + + omim_disease_enrichment(ontology, &set) .iter() .fold(0, |count, enrichment| { if enrichment.pvalue() < 0.0000005 && enrichment.enrichment() > 100.0 { @@ -55,7 +55,7 @@ fn single_threaded_enrichment(ontology: &Ontology) -> usize { .unwrap_or_else(|| panic!("{gene} does not exist")) .to_hpo_set(ontology); total - + disease_enrichment(ontology, &set) + + omim_disease_enrichment(ontology, &set) .iter() .fold(0, |count, enrichment| { if enrichment.pvalue() < 0.0000005 && enrichment.enrichment() > 100.0 { diff --git a/examples/enrichment.rs b/examples/enrichment.rs index 98d38e8..43b806d 100644 --- a/examples/enrichment.rs +++ b/examples/enrichment.rs @@ -37,7 +37,7 @@ fn gene_enrichments(ontology: &Ontology, hposet: &HpoSet, output_len: usize) { /// Prints diseases that are enriched in the hposet fn disease_enrichments(ontology: &Ontology, hposet: &HpoSet, output_len: usize) { - let mut disease_enrichments = hpo::stats::hypergeom::disease_enrichment(ontology, hposet); + let mut disease_enrichments = hpo::stats::hypergeom::omim_disease_enrichment(ontology, hposet); disease_enrichments.sort_by(|a, b| { a.pvalue() diff --git a/src/set.rs b/src/set.rs index 1fc5485..143fec1 100644 --- a/src/set.rs +++ b/src/set.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use crate::annotations::Genes; -use crate::annotations::OmimDiseases; +use crate::annotations::{OmimDiseases, OrphaDiseases}; use crate::similarity::GroupSimilarity; use crate::similarity::Similarity; use crate::similarity::SimilarityCombiner; @@ -516,6 +516,40 @@ impl<'a> HpoSet<'a> { .fold(OmimDiseases::default(), |acc, element| &acc | element) } + /// Returns all [`crate::annotations::OrphaDiseaseId`]s that are associated to the set + /// + /// # Panics + /// + /// When an `HpoTermId` of the set is not part of the Ontology + /// + /// # Examples + /// + /// ``` + /// use hpo::{Ontology, HpoSet}; + /// use hpo::term::HpoGroup; + /// + /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); + /// + /// let mut hpos = HpoGroup::new(); + /// hpos.insert(12639u32); + /// hpos.insert(818u32); + /// let set = HpoSet::new(&ontology, hpos); + /// for d in set.orpha_disease_ids() {println!("{}", d);}; + /// // `Papilloma of choroid plexus (ORPHA:2807)` is linked to `HP:0012639` + /// assert!(set.orpha_disease_ids().contains(&2807u32.into())); + /// ``` + pub fn orpha_disease_ids(&self) -> OrphaDiseases { + self.group + .iter() + .map(|term_id| { + self.ontology + .get(term_id) + .expect("HpoTermId must be in Ontology") + .orpha_diseases() + }) + .fold(OrphaDiseases::default(), |acc, diseases| &acc | diseases) + } + /// Returns the counts of all categories in the set /// /// # Examples diff --git a/src/stats.rs b/src/stats.rs index df0688d..c20da82 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; use std::marker::PhantomData; -use crate::annotations::{AnnotationId, Disease, GeneId, OmimDiseaseId}; +use crate::annotations::{AnnotationId, Disease, GeneId, OmimDiseaseId, OrphaDiseaseId}; use crate::HpoTerm; pub mod hypergeom; @@ -25,7 +25,7 @@ pub use linkage::Linkage; /// The fold enrichment and p-value for an enriched Gene or Disease /// /// [`Enrichment`] is returned from statistics enrichment methods, such as -/// [`hypergeom::gene_enrichment`] and [`hypergeom::disease_enrichment`]. +/// [`hypergeom::gene_enrichment`], [`hypergeom::omim_disease_enrichment`] and [`hypergeom::orpha_disease_enrichment`]. /// #[derive(Debug)] pub struct Enrichment { @@ -143,7 +143,7 @@ impl<'a> SampleSet { impl<'a> SampleSet { /// Constructs a new `SampleSet` with disease counts from an iterator of [`HpoTerm`]s - pub fn disease>>(terms: I) -> Self { + pub fn omim_disease>>(terms: I) -> Self { let term2omimid = |term: HpoTerm<'a>| term.omim_diseases().map(|d| d.id().as_u32()); let (size, counts) = calculate_counts(terms, term2omimid); Self { @@ -154,6 +154,19 @@ impl<'a> SampleSet { } } +impl<'a> SampleSet { + /// Constructs a new `SampleSet` with disease counts from an iterator of [`HpoTerm`]s + pub fn orpha_disease>>(terms: I) -> Self { + let term2omimid = |term: HpoTerm<'a>| term.orpha_diseases().map(|d| d.id().as_u32()); + let (size, counts) = calculate_counts(terms, term2omimid); + Self { + size, + counts, + phantom: PhantomData, + } + } +} + impl SampleSet { /// Returns the total number of [`HpoTerm`]s in the [`SampleSet`] fn len(&self) -> u64 { diff --git a/src/stats/hypergeom.rs b/src/stats/hypergeom.rs index 620cda6..a8ba9c2 100644 --- a/src/stats/hypergeom.rs +++ b/src/stats/hypergeom.rs @@ -65,5 +65,5 @@ mod disease; mod gene; mod statrs; -pub use disease::disease_enrichment; +pub use disease::{omim_disease_enrichment, orpha_disease_enrichment}; pub use gene::gene_enrichment; diff --git a/src/stats/hypergeom/disease.rs b/src/stats/hypergeom/disease.rs index 14e7d2a..dd7dd2a 100644 --- a/src/stats/hypergeom/disease.rs +++ b/src/stats/hypergeom/disease.rs @@ -1,25 +1,25 @@ use tracing::debug; -use crate::annotations::OmimDiseaseId; +use crate::annotations::{AnnotationId, OmimDiseaseId, OrphaDiseaseId}; use crate::stats::hypergeom::statrs::Hypergeometric; use crate::stats::{f64_from_u64, Enrichment, SampleSet}; use crate::HpoTerm; -/// Calculates the hypergeometric enrichment of diseases within the `set` compared to the `background` +/// Calculates the hypergeometric enrichment of OMIM diseases within the `set` compared to the `background` /// /// # Examples /// /// ``` /// use hpo::Ontology; /// use hpo::{HpoSet, term::HpoGroup}; -/// use hpo::stats::hypergeom::disease_enrichment; +/// use hpo::stats::hypergeom::omim_disease_enrichment; /// /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); /// /// let gene = ontology.gene_by_name("KRAS").unwrap(); /// let gene_hpo_set = gene.to_hpo_set(&ontology); /// -/// let mut enrichments = disease_enrichment(&ontology, &gene_hpo_set); +/// let mut enrichments = omim_disease_enrichment(&ontology, &gene_hpo_set); /// /// // the results are not sorted by default /// enrichments.sort_by(|a, b| { @@ -28,60 +28,94 @@ use crate::HpoTerm; /// assert!(enrichments.first().unwrap().pvalue() < enrichments.last().unwrap().pvalue()); /// assert!(enrichments.first().unwrap().enrichment() > enrichments.last().unwrap().enrichment()); /// ``` -pub fn disease_enrichment<'a, T, U>(background: T, set: U) -> Vec> +pub fn omim_disease_enrichment<'a, T, U>(background: T, set: U) -> Vec> where T: IntoIterator>, U: IntoIterator>, { - fn inner_disease_enrichment( - background: &SampleSet, - sample_set: &SampleSet, - ) -> Vec> { - let mut res = Vec::new(); - for (disease, observed_successes) in sample_set { - if observed_successes == 0 { - debug!("Skipping {}", disease); - continue; - } - let successes = background - .get(&disease) - .expect("disease must be present in background set"); - let hyper = Hypergeometric::new( - // Total number of HPOTerms in the Ontology - // ==> population - background.len(), - // Number of terms in the Ontology that are associated to the disease - // ==> successes - *successes, - // Number of terms in the set - // ==> draws - sample_set.len(), - ) - .expect("the set must not be larger than the background"); - // subtracting 1, because we want to test including observed_successes - // e.g. "7 or more", but sf by default calculates "more than 7" - let pvalue = hyper.sf(observed_successes - 1); - let enrichment = (f64_from_u64(observed_successes) / f64_from_u64(sample_set.len())) - / (f64_from_u64(*successes) / f64_from_u64(background.len())); - res.push(Enrichment::annotation( - disease, - pvalue, - observed_successes, - enrichment, - )); - debug!( - "Disease:{}\tPopulation: {}, Successes: {}, Draws: {}, Observed: {}", - disease, - background.len(), - successes, - sample_set.len(), - observed_successes - ); - } - res - } + let background = SampleSet::omim_disease(background); + let sample_set = SampleSet::omim_disease(set); + inner_disease_enrichment(&background, &sample_set) +} - let background = SampleSet::disease(background); - let sample_set = SampleSet::disease(set); +/// Calculates the hypergeometric enrichment of ORPHA diseases within the `set` compared to the `background` +/// +/// # Examples +/// +/// ``` +/// use hpo::Ontology; +/// use hpo::{HpoSet, term::HpoGroup}; +/// use hpo::stats::hypergeom::orpha_disease_enrichment; +/// +/// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); +/// +/// let gene = ontology.gene_by_name("KRAS").unwrap(); +/// let gene_hpo_set = gene.to_hpo_set(&ontology); +/// +/// let mut enrichments = orpha_disease_enrichment(&ontology, &gene_hpo_set); +/// +/// // the results are not sorted by default +/// enrichments.sort_by(|a, b| { +/// a.pvalue().partial_cmp(&b.pvalue()).unwrap() +/// }); +/// assert!(enrichments.first().unwrap().pvalue() < enrichments.last().unwrap().pvalue()); +/// assert!(enrichments.first().unwrap().enrichment() > enrichments.last().unwrap().enrichment()); +/// ``` +pub fn orpha_disease_enrichment<'a, T, U>(background: T, set: U) -> Vec> +where + T: IntoIterator>, + U: IntoIterator>, +{ + let background = SampleSet::orpha_disease(background); + let sample_set = SampleSet::orpha_disease(set); inner_disease_enrichment(&background, &sample_set) } + +#[inline] +fn inner_disease_enrichment( + background: &SampleSet, + sample_set: &SampleSet, +) -> Vec> { + let mut res = Vec::new(); + for (disease, observed_successes) in sample_set { + if observed_successes == 0 { + debug!("Skipping {}", disease); + continue; + } + let successes = background + .get(&disease) + .expect("disease must be present in background set"); + let hyper = Hypergeometric::new( + // Total number of HPOTerms in the Ontology + // ==> population + background.len(), + // Number of terms in the Ontology that are associated to the disease + // ==> successes + *successes, + // Number of terms in the set + // ==> draws + sample_set.len(), + ) + .expect("the set must not be larger than the background"); + // subtracting 1, because we want to test including observed_successes + // e.g. "7 or more", but sf by default calculates "more than 7" + let pvalue = hyper.sf(observed_successes - 1); + let enrichment = (f64_from_u64(observed_successes) / f64_from_u64(sample_set.len())) + / (f64_from_u64(*successes) / f64_from_u64(background.len())); + res.push(Enrichment::annotation( + disease, + pvalue, + observed_successes, + enrichment, + )); + debug!( + "Disease:{}\tPopulation: {}, Successes: {}, Draws: {}, Observed: {}", + disease, + background.len(), + successes, + sample_set.len(), + observed_successes + ); + } + res +}