
Commit

Merge pull request #19 from kbonney/kld-doccs
Documentation syntax fixes
tgunda authored Oct 21, 2024
2 parents b568c57 + fe6c891 commit d756ca9
Showing 9 changed files with 69 additions and 83 deletions.
12 changes: 5 additions & 7 deletions pyrocs/biosciences/affinity.py
@@ -1,10 +1,9 @@

from itertools import repeat
from typing import Union
import numpy as np
from pandas import DataFrame

def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np.array:
def affinity(data: np.ndarray, weights=None, to_bool=bool) -> float:
"""
Returns the affinity between all pairs of columns in binary data.
@@ -24,13 +23,12 @@ def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np
results in a binary implementation of affinity within this software.
Args:
data: array or dataframe
weights: (optional) float or array
to_bool: boolean type
data (array)
weights (optional array)
to_bool: function or type to convert array values to boolean
Returns:
affinity between columns in data
float
"""

num_cols = data.shape[1]
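
For orientation, a minimal usage sketch of the revised signature (the input array is hypothetical, and it is assumed that affinity is re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import affinity

# Hypothetical binary presence/absence data: rows are observations, columns are traits.
data = np.array([
    [1, 0, 1],
    [1, 1, 0],
    [0, 1, 1],
    [1, 0, 1],
])
score = affinity(data)  # affinity computed over the columns of the binary data
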
20 changes: 8 additions & 12 deletions pyrocs/biosciences/functional_redundancy.py
@@ -1,6 +1,6 @@
import numpy as np

def functional_redundancy(p: np.array, delta: np.array) -> float:
def functional_redundancy(p: np.ndarray, delta: np.ndarray) -> float:
'''
This metric evaluates how interchangeable groups within a population are based
on the specific function they perform. As a biological concept,
@@ -18,19 +18,15 @@ def functional_redundancy(p: np.array, delta: np.array) -> float:
D &= \\sum_i(p_i*(1-p_i))
Args:
----------
p : np.array
Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1 and where the constraint 0 < p[i]
means that all calculations involve only those species that are actually present in
the assemblage with nonzero abundances.
delta : np.array
:math:`δ_{ij}` symmetric array of pairwise functional dissimilarities between species i and j
p (array): Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1
and where the constraint 0 < p[i]
means that all calculations involve only those species that
are actually present in the assemblage with nonzero abundances.
delta (array): :math:`δ_{ij}` symmetric array of pairwise functional
dissimilarities between species i and j
Returns:
--------
FR : float
Functional Redundancy Score
float
'''

dim = len(p)
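
A minimal usage sketch of the documented signature (the abundances and dissimilarity matrix are hypothetical; the import path assumes functional_redundancy is re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import functional_redundancy

p = np.array([0.5, 0.3, 0.2])        # relative abundances, each 0 < p[i] <= 1
delta = np.array([                   # symmetric pairwise dissimilarities delta[i, j]
    [0.0, 0.4, 0.7],
    [0.4, 0.0, 0.5],
    [0.7, 0.5, 0.0],
])
fr = functional_redundancy(p, delta)  # Functional Redundancy score
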
20 changes: 10 additions & 10 deletions pyrocs/biosciences/hill_diversity.py
@@ -2,7 +2,7 @@
import numpy as np


def hill_shannon(p: np.array) -> float:
def hill_shannon(p: np.ndarray) -> float:
"""
The Hill-Shannon number is a specific instance (i.e. the Perplexity) of Hill Diversity,
which prioritizes neither common nor rare species.
@@ -20,15 +20,15 @@ def hill_shannon(p: np.array) -> float:
where :math:`q` approaches :math:`1` and the mean is the geometric mean
Args:
p: p[i] is the proportion of all individuals that belong to species i
p (array): p[i] is the proportion of all individuals that belong to species i
Returns:
A metric for effective count of species (diversity)
float
"""
entropy = -sum(x * np.log(x) for x in p if x > 0)
return math.exp(entropy)


def hill_simpson(p: np.array) -> float:
def hill_simpson(p: np.ndarray) -> float:
"""
The Hill-Simpson number is a specific instance (i.e. the Inverse Simpson Index)
of Hill Diversity that prioritizes the common species.
@@ -45,14 +45,14 @@ def hill_simpson(p: np.array) -> float:
where :math:`q=2` and the mean is the usual arithmetic mean
Args:
p: p[i] is the proportion of all individuals that belong to species i
p (array): p[i] is the proportion of all individuals that belong to species i
Returns:
A metric for effective count of species (diversity)
float
"""
return 1.0 / p.dot(p)


def hill_diversity(p: np.array, q: float) -> float:
def hill_diversity(p: np.ndarray, q: float) -> float:
"""
The Hill Numbers are a family of diversity metrics describing "effective number of species".
@@ -87,11 +87,11 @@ def hill_diversity(p: np.array, q: float) -> float:
species :math:`i`, :math:`q` is the exponent that determines the rarity scale on which the mean is taken
Args:
p: p[i] is the proportion of all individuals that belong to species i,
q: The exponent that determines the rarity scale on which the mean is taken.
p (array): p[i] is the proportion of all individuals that belong to species i,
q (float): The exponent that determines the rarity scale on which the mean is taken.
Species richness (q=0), Hill-Simpson diversity (q=2), Hill-Shannon diversity (q=1),
Returns:
D: a metric for effective count of species (diversity)
float
"""

# Special cases
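
A minimal sketch relating the three functions as described in the docstrings above (the proportions are hypothetical; the imports assume the functions are re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import hill_diversity, hill_shannon, hill_simpson

p = np.array([0.6, 0.25, 0.1, 0.05])   # proportions of individuals per species

print(hill_diversity(p, q=0))                     # species richness: 4 species present
print(hill_diversity(p, q=1), hill_shannon(p))    # Hill-Shannon (perplexity)
print(hill_diversity(p, q=2), hill_simpson(p))    # Hill-Simpson (inverse Simpson)
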
18 changes: 9 additions & 9 deletions pyrocs/complex_systems/causal_complexity.py
@@ -2,7 +2,7 @@
import networkx as nx


def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
def cyclomatic_complexity(A : np.ndarray, directed : bool = False) -> float:
'''
Cyclomatic complexity reflects the number of linearly
independent paths within a system of interest
@@ -23,9 +23,9 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
higher cyclomatic complexity values).
Args:
A: array
A (array)
Returns:
cyclomatic complexity of the graph
float
'''

if directed:
@@ -43,7 +43,7 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):

return E - N + 2.0 * P

def feedback_density(A : np.ndarray, directed : bool = False):
def feedback_density(A : np.ndarray, directed : bool = False) -> float:
'''
Feedback density captures the fraction of edges :math:`(E_{loop})`
and nodes (:math:`N_{loop}`) that are involved in at least one feedback loop.
@@ -64,9 +64,9 @@ def feedback_density(A : np.ndarray, directed : bool = False):
edges are included in one or more feedback loops.
Args:
A: array
A (array)
Returns:
feedback density of the graph
float
'''

if directed:
@@ -95,7 +95,7 @@ def feedback_density(A : np.ndarray, directed : bool = False):

return (Eloop + Nloop) / (Etot + Ntot)

def causal_complexity(A: np.ndarray, directed : bool = False):
def causal_complexity(A: np.ndarray, directed : bool = False) -> float:
'''
Causal complexity measures the underlying causal structure
of a system by considering both the system’s intricacy as
@@ -124,9 +124,9 @@ def causal_complexity(A: np.ndarray, directed : bool = False):
of causal complexity than those systems with lower feedback density.
Args:
A: array
A (array)
Returns:
causal complexity of the graph
float
'''
M = cyclomatic_complexity(A, directed=directed)
D = feedback_density(A, directed=directed)
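
A minimal sketch on a small directed graph (the adjacency matrix is hypothetical; the imports assume the three functions are re-exported from pyrocs.complex_systems):

import numpy as np
from pyrocs.complex_systems import (
    cyclomatic_complexity, feedback_density, causal_complexity)

# Hypothetical directed system with one feedback loop: 0 -> 1 -> 2 -> 0
A = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
])

M = cyclomatic_complexity(A, directed=True)  # E - N + 2P = 3 - 3 + 2*1 = 2.0
D = feedback_density(A, directed=True)       # every node and edge is on the loop -> 1.0
C = causal_complexity(A, directed=True)      # combines M and D as described above
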
8 changes: 4 additions & 4 deletions pyrocs/complex_systems/fluctuation_complexity.py
@@ -2,7 +2,7 @@
from collections import Counter
from functools import lru_cache

def fluctuation_complexity(A : list, L : int = 1):
def fluctuation_complexity(A, L : int = 1) -> float:
'''
Fluctuation complexity extends the characterization of discrete entropy
@@ -24,10 +24,10 @@ def fluctuation_complexity(A : list, L : int = 1):
respective frequencies of event :math:`i` and :math:`j` within the series.
Args:
A: Sequence of symbols
L: If > 1, groups symbols into short subsequences of length L.
A (array): Sequence of symbols
L (int): If > 1, groups symbols into short subsequences of length L.
Returns:
The Fluctuation Complexity of the sequence
float
'''
if L > 1:
A = [tuple(A[i: i + L]) for i in range(len(A) + 1 - L)]
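
A minimal usage sketch on a hypothetical symbol sequence (assuming fluctuation_complexity is re-exported from pyrocs.complex_systems):

from pyrocs.complex_systems import fluctuation_complexity

seq = ['a', 'b', 'a', 'a', 'b', 'a', 'b', 'b', 'a']
fc_symbols = fluctuation_complexity(seq)      # over individual symbols
fc_pairs = fluctuation_complexity(seq, L=2)   # over overlapping length-2 subsequences
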
6 changes: 3 additions & 3 deletions pyrocs/complex_systems/grc.py
@@ -2,7 +2,7 @@
import numpy as np


def grc(A : np.ndarray, directed : bool):
def grc(A : np.ndarray, directed : bool) -> float:
"""
Global reaching centrality (GRC) measures the level of hierarchy within a network based on flow.
The equation within the package follows the formulations from
@@ -23,11 +23,11 @@ def grc(A : np.ndarray, directed : bool):
versa :cite:p:`lakkaraju_complexity_2019`.
Args:
A: Square matrix of adjacencies in the network
A (array): Square matrix of adjacencies in the network
directed (bool): If true, assume A represents a directed graph (row -> column).
If false, assume A represents an undirected graph.
Returns:
Global reaching centrality of the graph
float
"""

if directed:
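
A minimal sketch on a hypothetical star-shaped hierarchy (assuming grc is re-exported from pyrocs.complex_systems); because the hub reaches every other node while the leaves reach none, the result should be near the maximum of 1:

import numpy as np
from pyrocs.complex_systems import grc

# Node 0 points to all other nodes; the leaves reach nothing.
A = np.array([
    [0, 1, 1, 1],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
])
print(grc(A, directed=True))
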
16 changes: 7 additions & 9 deletions pyrocs/information_theory/entropy.py
@@ -1,15 +1,13 @@

from collections import Counter
from collections.abc import Sequence

from scipy.stats import entropy
import numpy as np


def discrete_entropy(
values: Sequence,
counts: Sequence = None,
base: int = 2) -> float:
values: np.ndarray,
counts: np.ndarray = None,
base: int = 2) -> float:
"""
Entropy is often used to measure the state of disorder/randomness in a system.
The general equation follows the form:
@@ -18,7 +16,7 @@ def discrete_entropy(
H = - \\sum_{i=1}^N [p_i * \\log p_i]
where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i`th category,
where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i^{\mathrm{th}}` category,
and :math:`N` is the total number of categories. Low entropy values indicate a well-ordered system
while higher entropy values indicate a higher state of disorder. The maximum possible value of the
entropy for a given system is :math:`log(N)`, and thus varies by group size. Please see
@@ -33,11 +31,11 @@ def discrete_entropy(
`scipy documentation <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html>`_ as well as the references noted above.
Args:
values (Sequence): Sequence of observed values from a random process
counts (Sequence[int]): Number of times each value was observed
values (array): Sequence of observed values from a random process
counts (array[int]): Number of times each value was observed
base (int): Base of returned entropy (default returns number of bits)
Returns:
mutual information between x and y
float
"""

if counts is None:
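
A minimal usage sketch of discrete_entropy with hypothetical coin-flip counts (the import path mirrors the one used in mutual_info.py below):

import numpy as np
from pyrocs.information_theory import discrete_entropy

values = np.array(["heads", "tails"])
counts = np.array([50, 50])
print(discrete_entropy(values, counts))           # 1.0 bit for a fair coin
print(discrete_entropy(values, counts, base=10))  # same entropy in base-10 units
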
30 changes: 13 additions & 17 deletions pyrocs/information_theory/kl_divergence.py
@@ -1,7 +1,6 @@
import numpy as np
# from scipy.special import xlogy

def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
def kl_divergence(p: np.ndarray, q: np.ndarray, base: int = 2) -> float:
"""
Sometimes called relative entropy, the Kullback-Leibler Divergence (KLD)
measures the similarity between two distributions
@@ -17,21 +16,18 @@ def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
where :math:`D` is the KLD value, :math:`N` is the total number of categories,
and :math:`p_i` and :math:`q_i` reflect the discrete probability of the occurrence
of an event from the :math:`i`th category of the sample distribution and
of an event from the :math:`i^{\mathrm{th}}` category of the sample distribution and
reference distribution respectively.
The function is able to calculate KLD for cases where not all categories from the reference distribution are present within the sample distribution.
Args:
p,q (numpy.ndarray): arrays, where rows for each constitute the two
probability distributions from which to calculate divergence. p
contains the distributions holding probabilities in the numerator of the
KL divergence summand.
base: log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
p (array): discrete probability distribution
q (array): discrete probability distribution
base (int): log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
Returns:
numpy.ndarray: KL divergences, where the second array's rows are the
distributions in the numerator of the log in KL divergence
float
"""

assert p.shape == q.shape, 'p and q shapes must be identical'
@@ -52,7 +48,9 @@ def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
return kl_div


def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
def novelty_transience_resonance(
thetas_arr: np.ndarray,
window: int) -> tuple[np.ndarray]:
"""
These three related metrics extend the Kullback-Leibler Divergence formulation to consider how
a distribution differs from past and future distributions within a sequence. Specifically, novelty
@@ -66,8 +64,8 @@ def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
.. math::
N_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i-k))]\\\\
T_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i+k))]\\\\
N_w(p_i) &= (1/w)\sum(1 \\leq k \\leq w)[D(p_i || p_(i-k))]\\\\
T_w(p_i) &= (1/w)\sum(1 \\leq k \\leq w)[D(p_i || p_(i+k))]\\\\
R_w(p_i) &= N_w(p_i) - T_w(p_i)
where :math:`N` is novelty, :math:`T` is transience, :math:`R` is resonance,
@@ -78,12 +76,10 @@ def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
equation for the KLD.
Args:
thetas_arr (numpy.ndarray): rows are topic mixtures
thetas_arr (array): rows are topic mixtures
window (int): positive integer defining scale or scale size
Returns:
novelties
transiences
resonances
tuple(array): novelties, transiences, resonances
"""

# Find the first and last center speech offset, given window size.
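
A minimal sketch of both functions with hypothetical distributions and topic mixtures (assuming both are re-exported from pyrocs.information_theory):

import numpy as np
from pyrocs.information_theory import kl_divergence, novelty_transience_resonance

p = np.array([0.5, 0.3, 0.2])   # sample distribution
q = np.array([0.4, 0.4, 0.2])   # reference distribution
print(kl_divergence(p, q))      # in bits, since base defaults to 2

# Rows are topic mixtures for a sequence of documents or speeches.
thetas = np.array([
    [0.7, 0.3],
    [0.6, 0.4],
    [0.2, 0.8],
    [0.3, 0.7],
    [0.5, 0.5],
])
novelties, transiences, resonances = novelty_transience_resonance(thetas, window=1)
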
22 changes: 10 additions & 12 deletions pyrocs/information_theory/mutual_info.py
@@ -1,15 +1,11 @@

from collections.abc import Sequence
import os
import sys

import numpy as np
from pyrocs.information_theory import discrete_entropy


def mutual_info(
x: Sequence,
y: Sequence,
counts: Sequence = None,
x: np.ndarray,
y: np.ndarray,
counts: np.ndarray = None,
base: int = 2) -> float:
"""
Mutual information measures how much knowledge is gained about one random variable when another is observed.
@@ -36,14 +32,16 @@ def mutual_info(
when the other is observed.
Args:
x,y (numpy.ndarray): arrays, discretized observations from random
distributions x \in X and y \in Y
counts (Sequence[int]): If present, the number of times each (x,y) pair was
x (array): discretized observations from random
distribution x \in X
y (array): discretized observations from random
distribution y \in Y
counts (array[int]): If present, the number of times each (x,y) pair was
observed
base (int): If present the base in which to return the entropy
Returns:
mutual information between x and y
float
"""
x_entropy = discrete_entropy(x, counts, base)
y_entropy = discrete_entropy(y, counts, base)
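
A minimal usage sketch with hypothetical discretized observations (assuming mutual_info is re-exported from pyrocs.information_theory):

import numpy as np
from pyrocs.information_theory import mutual_info

x = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y = np.array([0, 0, 1, 1, 0, 1, 1, 0])  # mostly tracks x, with two mismatches
print(mutual_info(x, y))                # shared information in bits
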
