diff --git a/pyrocs/biosciences/affinity.py b/pyrocs/biosciences/affinity.py
index 344a243..1312e96 100644
--- a/pyrocs/biosciences/affinity.py
+++ b/pyrocs/biosciences/affinity.py
@@ -1,10 +1,9 @@
 from itertools import repeat
-from typing import Union
 
 import numpy as np
 from pandas import DataFrame
 
-def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np.array:
+def affinity(data: np.ndarray, weights=None, to_bool=bool) -> float:
     """
     Returns the affinity between all pairs of columns in binary data.
 
@@ -24,13 +23,12 @@ def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np
     results in a binary implementation of affinity within this software.
 
     Args:
-        data: array or dataframe
-        weights: (optional) float or array
-        to_bool: boolean type
+        data (array)
+        weights (optional array)
+        to_bool: function or type to convert array values to boolean
 
     Returns:
-        affinity between columns in data
-
+        float
     """
 
     num_cols = data.shape[1]
diff --git a/pyrocs/biosciences/functional_redundancy.py b/pyrocs/biosciences/functional_redundancy.py
index 7267097..c6ac433 100644
--- a/pyrocs/biosciences/functional_redundancy.py
+++ b/pyrocs/biosciences/functional_redundancy.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-def functional_redundancy(p: np.array, delta: np.array) -> float:
+def functional_redundancy(p: np.ndarray, delta: np.ndarray) -> float:
     '''
     This metric evaluates how interchangeable groups within a population
     are based on the specific function they perform. As a biological concept,
@@ -18,19 +18,15 @@ def functional_redundancy(p: np.array, delta: np.array) -> float:
        D &= \\sum_i(p_i*(1-p_i))
 
     Args:
-    ----------
-    p : np.array
-        Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1 and where the constraint 0 < p[i]
-        means that all calculations involve only those species that are actually present in
-        the assemblage with nonzero abundances.
-    delta : np.array
-        :math:`δ_{ij}` symmetric array of pairwise functional dissimilarities between species i and j
+        p (array): Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1
+            and where the constraint 0 < p[i]
+            means that all calculations involve only those species that
+            are actually present in the assemblage with nonzero abundances.
+        delta (array): :math:`δ_{ij}` symmetric array of pairwise functional
+            dissimilarities between species i and j
 
     Returns:
-    --------
-    FR : float
-        Functional Redundancy Score
-
+        float
     '''
 
     dim = len(p)
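A minimal usage sketch of the two revised biosciences signatures (the `pyrocs.biosciences` import path is assumed from the file locations above and is not confirmed by this diff; the input arrays are illustrative only):

```python
import numpy as np
from pyrocs.biosciences import affinity, functional_redundancy  # assumed re-exports

# Binary presence/absence data; affinity compares pairs of columns.
data = np.array([
    [1, 0, 1],
    [1, 1, 0],
    [0, 1, 1],
    [1, 0, 1],
])
print(affinity(data))

# Relative abundances (nonzero, summing to 1) and a symmetric matrix of
# pairwise functional dissimilarities with a zero diagonal.
p = np.array([0.5, 0.3, 0.2])
delta = np.array([
    [0.0, 0.4, 0.7],
    [0.4, 0.0, 0.5],
    [0.7, 0.5, 0.0],
])
print(functional_redundancy(p, delta))
```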
diff --git a/pyrocs/biosciences/hill_diversity.py b/pyrocs/biosciences/hill_diversity.py
index f3fc4b7..152705f 100644
--- a/pyrocs/biosciences/hill_diversity.py
+++ b/pyrocs/biosciences/hill_diversity.py
@@ -2,7 +2,7 @@
 import numpy as np
 
 
-def hill_shannon(p: np.array) -> float:
+def hill_shannon(p: np.ndarray) -> float:
     """
     The Hill-Shannon number is a specific instance (i.e. the Perplexity) of
     Hill Diversity, which prioritizes neither common nor rare species.
@@ -20,15 +20,15 @@ def hill_shannon(p: np.array) -> float:
     where :math:`q` approaches :math:`1` and the mean is the geometric mean
 
     Args:
-        p: p[i] is the proportion of all individuals that belong to species i
+        p (array): p[i] is the proportion of all individuals that belong to species i
 
     Returns:
-        A metric for effective count of species (diversity)
+        float
     """
     entropy = -sum(x * np.log(x) for x in p if x > 0)
     return math.exp(entropy)
 
-def hill_simpson(p: np.array) -> float:
+def hill_simpson(p: np.ndarray) -> float:
     """
     The Hill-Simpson number is a specific instance (i.e.
     the Inverse Simpson Index) of Hill Diversity that prioritizes the common species.
@@ -45,14 +45,14 @@ def hill_simpson(p: np.array) -> float:
     where :math:`q=2` and the mean is the usual arithmetic mean
 
     Args:
-        p: p[i] is the proportion of all individuals that belong to species i
+        p (array): p[i] is the proportion of all individuals that belong to species i
 
     Returns:
-        A metric for effective count of species (diversity)
+        float
     """
     return 1.0 / p.dot(p)
 
-def hill_diversity(p: np.array, q: float) -> float:
+def hill_diversity(p: np.ndarray, q: float) -> float:
     """
     The Hill Numbers are a family of diversity metrics describing "effective
     number of species".
@@ -87,11 +87,11 @@ def hill_diversity(p: np.array, q: float) -> float:
     species :math:`i`, :math:`q` is the exponent that determines the rarity scale on which the mean is taken
 
     Args:
-        p: p[i] is the proportion of all individuals that belong to species i,
-        q: The exponent that determines the rarity scale on which the mean is taken.
+        p (array): p[i] is the proportion of all individuals that belong to species i,
+        q (float): The exponent that determines the rarity scale on which the mean is taken.
            Species richness (q=0), Hill-Simpson diversity (q=2), Hill-Shannon diversity (q=1),
 
     Returns:
-        D: a metric for effective count of species (diversity)
+        float
     """
     # Special cases
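The three Hill functions are related through the exponent q, so a quick consistency check is possible (import path assumed as above; per the docstrings, q=0 gives species richness, q=1 the Hill-Shannon number, and q=2 the Hill-Simpson number):

```python
import numpy as np
from pyrocs.biosciences import hill_diversity, hill_shannon, hill_simpson  # assumed re-exports

p = np.array([0.6, 0.3, 0.1])   # relative abundances, summing to 1

print(hill_diversity(p, q=0))   # species richness: 3 species present
print(hill_shannon(p))          # should agree with hill_diversity(p, q=1)
print(hill_simpson(p))          # should agree with hill_diversity(p, q=2)
```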
diff --git a/pyrocs/complex_systems/causal_complexity.py b/pyrocs/complex_systems/causal_complexity.py
index 0d1e46d..2984526 100644
--- a/pyrocs/complex_systems/causal_complexity.py
+++ b/pyrocs/complex_systems/causal_complexity.py
@@ -2,7 +2,7 @@
 import networkx as nx
 
 
-def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
+def cyclomatic_complexity(A : np.ndarray, directed : bool = False) -> float:
     '''
     Cyclomatic complexity reflects the number of linearly independent paths
     within a system of interest
@@ -23,9 +23,9 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
     higher cyclomatic complexity values).
 
     Args:
-        A: array
+        A (array)
 
     Returns:
-        cyclomatic complexity of the graph
+        float
     '''
     if directed:
@@ -43,7 +43,7 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
 
     return E - N + 2.0 * P
 
-def feedback_density(A : np.ndarray, directed : bool = False):
+def feedback_density(A : np.ndarray, directed : bool = False) -> float:
     '''
     Feedback density captures the fraction of edges :math:`(E_{loop})` and
     nodes (:math:`N_{loop}`) that are involved in at least one feedback loop.
@@ -64,9 +64,9 @@ def feedback_density(A : np.ndarray, directed : bool = False):
     edges are included in one or more feedback loops.
 
     Args:
-        A: array
+        A (array)
 
     Returns:
-        feedback density of the graph
+        float
     '''
     if directed:
@@ -95,7 +95,7 @@ def feedback_density(A : np.ndarray, directed : bool = False):
 
     return (Eloop + Nloop) / (Etot + Ntot)
 
-def causal_complexity(A: np.ndarray, directed : bool = False):
+def causal_complexity(A: np.ndarray, directed : bool = False) -> float:
     '''
     Causal complexity measures the underlying causal structure of a
     system by considering both the system’s intricacy as
@@ -124,9 +124,9 @@ def causal_complexity(A: np.ndarray, directed : bool = False):
     of causal complexity than those systems with lower feedback density.
 
     Args:
-        A: array
+        A (array)
 
     Returns:
-        causal complexity of the graph
+        float
     '''
     M = cyclomatic_complexity(A, directed=directed)
     D = feedback_density(A, directed=directed)
diff --git a/pyrocs/complex_systems/fluctuation_complexity.py b/pyrocs/complex_systems/fluctuation_complexity.py
index a877484..3c0cfac 100644
--- a/pyrocs/complex_systems/fluctuation_complexity.py
+++ b/pyrocs/complex_systems/fluctuation_complexity.py
@@ -2,7 +2,7 @@
 from collections import Counter
 from functools import lru_cache
 
 
-def fluctuation_complexity(A : list, L : int = 1):
+def fluctuation_complexity(A, L : int = 1) -> float:
     '''
     Fluctuating complexity extends the characterization of discrete entropy
@@ -24,10 +24,10 @@ def fluctuation_complexity(A : list, L : int = 1):
     respective frequencies of event :math:`i` and :math:`j` within the series.
 
     Args:
-        A: Sequence of symbols
-        L: If > 1, groups symbols into short subsequences of length L.
+        A (array): Sequence of symbols
+        L (int): If > 1, groups symbols into short subsequences of length L.
 
     Returns:
-        The Fluctuation Complexity of the sequence
+        float
     '''
     if L > 1:
         A = [tuple(A[i: i + L]) for i in range(len(A) + 1 - L)]
diff --git a/pyrocs/complex_systems/grc.py b/pyrocs/complex_systems/grc.py
index 37d10ff..3fdda0c 100644
--- a/pyrocs/complex_systems/grc.py
+++ b/pyrocs/complex_systems/grc.py
@@ -2,7 +2,7 @@
 import numpy as np
 
 
-def grc(A : np.ndarray, directed : bool):
+def grc(A : np.ndarray, directed : bool) -> float:
     """
     Global reaching centrality (GRC) measures the level of hierarchy within a network
     based on flow. The equation within the package follows the formulations from
@@ -23,11 +23,11 @@ def grc(A : np.ndarray, directed : bool):
     versa :cite:p:`lakkaraju_complexity_2019`.
 
     Args:
-        A: Square matrix of adjacencies in the network
+        A (array): Square matrix of adjacencies in the network
         directed (bool): If true, assume A represents a directed graph (row -> column).
             If false, assume A represents an undirected graph.
 
     Returns:
-        Global reaching centrality of the graph
+        float
     """
     if directed:
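A usage sketch for the complex-systems functions above, on a small directed graph containing one feedback loop (the `pyrocs.complex_systems` import path is assumed from the file locations; values are illustrative only):

```python
import numpy as np
from pyrocs.complex_systems import (  # assumed re-exports
    causal_complexity, cyclomatic_complexity, feedback_density,
    fluctuation_complexity, grc)

# Adjacency matrix, row -> column: nodes 0 -> 1 -> 2 -> 0 form a feedback
# loop, and node 3 is reachable from the loop but has no outgoing edges.
A = np.array([
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [1, 0, 0, 1],
    [0, 0, 0, 0],
])

print(cyclomatic_complexity(A, directed=True))
print(feedback_density(A, directed=True))
print(causal_complexity(A, directed=True))
print(grc(A, directed=True))

# fluctuation_complexity operates on a sequence of symbols, not a graph.
print(fluctuation_complexity(list("abcabcabb"), L=1))
```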
diff --git a/pyrocs/information_theory/entropy.py b/pyrocs/information_theory/entropy.py
index 2c5b595..21fe3c3 100644
--- a/pyrocs/information_theory/entropy.py
+++ b/pyrocs/information_theory/entropy.py
@@ -1,15 +1,13 @@
 from collections import Counter
-from collections.abc import Sequence
-
 from scipy.stats import entropy
 import numpy as np
 
 
 def discrete_entropy(
-    values: Sequence,
-    counts: Sequence = None,
-    base: int = 2) -> float:
+    values: np.ndarray,
+    counts: np.ndarray = None,
+    base: int = 2) -> float:
     """
     Entropy is often used to measure the state of disorder/randomness in a system.
     The general equation follows the form:
@@ -18,7 +16,7 @@
 
        H = - \\sum_{i=1}^N [p_i * \\log p_i]
 
-    where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i`th category,
-    and :math:`N` is the total number of categories. Low entropy values indicate a higher state of disorder while higher entropy values indicate a well-ordered system.
-    The maximum possible value of the entropy for a given system is :math:`log(N)`, and is thus varies by group size. Please see
+    where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i^{\\mathrm{th}}` category,
+    and :math:`N` is the total number of categories. Low entropy values indicate a well-ordered system while higher entropy values indicate a higher state of disorder.
+    The maximum possible value of the entropy for a given system is :math:`log(N)`, and thus varies by group size. Please see
@@ -33,11 +31,11 @@ def discrete_entropy(
     `scipy documentation `_ as well as the references noted above.
 
     Args:
-        values (Sequence): Sequence of observed values from a random process
-        counts (Sequence[int]): Number of times each value was observed
+        values (array): Sequence of observed values from a random process
+        counts (array[int]): Number of times each value was observed
         base (int): Base of returned entropy (default returns number of bits)
 
     Returns:
-        mutual information between x and y
+        float
     """
     if counts is None:
diff --git a/pyrocs/information_theory/kl_divergence.py b/pyrocs/information_theory/kl_divergence.py
index 1e210f5..e3b7011 100644
--- a/pyrocs/information_theory/kl_divergence.py
+++ b/pyrocs/information_theory/kl_divergence.py
@@ -1,7 +1,6 @@
 import numpy as np
-# from scipy.special import xlogy
 
-def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
+def kl_divergence(p: np.ndarray, q: np.ndarray, base: int = 2) -> float:
     """
     Sometimes called relative entropy, the Kullback-Leibler Divergence (KLD)
     measures the similarity between two distributions
@@ -17,21 +16,18 @@ def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
 
     where :math:`D` is the KLD value, :math:`N` is the total number of categories,
     and :math:`p_i` and :math:`q_i` reflect the discrete probability of the occurrence
-    of an event from the :math:`i`th category of the sample distribution and
+    of an event from the :math:`i^{\\mathrm{th}}` category of the sample distribution and
     reference distribution respectively. The function is able to calculate KLD
     for cases where not all categories from the reference distribution are present
     within the sample distribution.
 
     Args:
-        p,q (numpy.ndarray): arrays, where rows for each constitute the two
-            probability distributions from which to calculate divergence. p
-            contains the distributions holding probabilities in the numerator of the
-            KL divergence summand.
-        base: log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
+        p (array): discrete probability distribution of the sample (numerator in the KL summand)
+        q (array): discrete probability distribution of the reference (denominator in the KL summand)
+        base (int): log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
 
     Returns:
-        numpy.ndarray: KL divergences, where the second array's rows are the
-            distributions in the numerator of the log in KL divergence
+        float
 
     """
     assert p.shape == q.shape, 'p and q shapes must be identical'
@@ -52,7 +48,9 @@
 
     return kl_div
 
-def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
+def novelty_transience_resonance(
+    thetas_arr: np.ndarray,
+    window: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     """
     These three related metrics extend the Kullback-Leibler Divergence formulation to
     consider how a distribution differs from past and future distributions within a sequence. Specifically, novelty
@@ -66,8 +64,8 @@
     .. math::
 
-        N_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i-k))]\\\\
-        T_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i+k))]\\\\
+        N_w(p_i) &= \\frac{1}{w} \\sum_{k=1}^{w} D(p_i || p_{i-k})\\\\
+        T_w(p_i) &= \\frac{1}{w} \\sum_{k=1}^{w} D(p_i || p_{i+k})\\\\
         R_w(p_i) &= N_w(p_i) - T_w(p_i)
 
     where :math:`N` is novelty, :math:`T` is transience, :math:`R` is resonance,
@@ -78,12 +76,10 @@
     equation for the KLD.
 
     Args:
-        thetas_arr (numpy.ndarray): rows are topic mixtures
+        thetas_arr (array): rows are topic mixtures
         window (int): positive integer defining scale or scale size
 
     Returns:
-        novelties
-        transiences
-        resonances
+        tuple(array, array, array): novelties, transiences, resonances
     """
 
     # Find the first and last center speech offset, given window size.
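A sketch of the revised information-theory signatures. The `pyrocs.information_theory` import path is taken from the existing `from pyrocs.information_theory import discrete_entropy` line in mutual_info.py below; re-export of the other names is assumed.

```python
import numpy as np
from pyrocs.information_theory import (  # assumed re-exports
    discrete_entropy, kl_divergence, novelty_transience_resonance)

# Four equally frequent categories should give 2.0 bits with the default base=2.
print(discrete_entropy(np.array([0, 1, 2, 3])))

# KL divergence of a sample distribution p from a uniform reference q.
p = np.array([0.5, 0.25, 0.25])
q = np.array([1 / 3, 1 / 3, 1 / 3])
print(kl_divergence(p, q, base=2))

# Novelty, transience, and resonance over a sequence of topic mixtures (rows).
thetas = np.random.dirichlet(np.ones(5), size=20)
novelties, transiences, resonances = novelty_transience_resonance(thetas, window=3)
```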
diff --git a/pyrocs/information_theory/mutual_info.py b/pyrocs/information_theory/mutual_info.py
index 2595532..21f3808 100644
--- a/pyrocs/information_theory/mutual_info.py
+++ b/pyrocs/information_theory/mutual_info.py
@@ -1,15 +1,11 @@
-
-from collections.abc import Sequence
-import os
-import sys
-
+import numpy as np
 from pyrocs.information_theory import discrete_entropy
 
 
 def mutual_info(
-    x: Sequence,
-    y: Sequence,
-    counts: Sequence = None,
+    x: np.ndarray,
+    y: np.ndarray,
+    counts: np.ndarray = None,
     base: int = 2) -> float:
     """
     Mutual information measures how much knowledge is gained about one random variable when another is observed.
@@ -36,14 +32,16 @@ def mutual_info(
     when the other is observed.
 
     Args:
-        x,y (numpy.ndarray): arrays, discretized observations from random
-            distributions x \in X and y \in Y
-        counts (Sequence[int]): If present, the number of times each (x,y) pair was
+        x (array): discretized observations from random
+            distribution x \in X
+        y (array): discretized observations from random
+            distribution y \in Y
+        counts (array[int]): If present, the number of times each (x,y) pair was
            observed
         base (int): If present the base in which to return the entropy
 
     Returns:
-        mutual information between x and y
+        float
     """
     x_entropy = discrete_entropy(x, counts, base)
     y_entropy = discrete_entropy(y, counts, base)
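A sketch for the revised mutual_info signature, using the same assumed import path; the expected values follow from the definition of mutual information in bits.

```python
import numpy as np
from pyrocs.information_theory import mutual_info  # assumed re-export

x = np.array([0, 0, 0, 0, 1, 1, 1, 1])
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])   # identical to x
z = np.array([0, 1, 0, 1, 0, 1, 0, 1])   # empirically independent of x

print(mutual_info(x, y))   # should be 1.0 bit: y is fully determined by x
print(mutual_info(x, z))   # should be 0.0: the joint counts factorize
```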