
Commit

Merge pull request #19 from kbonney/kld-doccs
Documentation syntax fixes
tgunda authored Oct 21, 2024
2 parents b568c57 + fe6c891 commit d756ca9
Showing 9 changed files with 69 additions and 83 deletions.
12 changes: 5 additions & 7 deletions pyrocs/biosciences/affinity.py
@@ -1,10 +1,9 @@

from itertools import repeat
from typing import Union
import numpy as np
from pandas import DataFrame

def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np.array:
def affinity(data: np.ndarray, weights=None, to_bool=bool) -> float:
"""
Returns the affinity between all pairs of columns in binary data.
@@ -24,13 +23,12 @@ def affinity(data: Union[np.array, DataFrame], weights=None, to_bool=bool) -> np
results in a binary implementation of affinity within this software.
Args:
data: array or dataframe
weights: (optional) float or array
to_bool: boolean type
data (array)
weights (optional array)
to_bool: function or type to convert array values to boolean
Returns:
affinity between columns in data
float
"""

num_cols = data.shape[1]
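
For orientation, a minimal usage sketch of the revised signature (the input array is hypothetical, and it is assumed that affinity is re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import affinity

# Hypothetical binary presence/absence data: rows are observations, columns are traits.
data = np.array([
    [1, 0, 1],
    [1, 1, 0],
    [0, 1, 1],
    [1, 0, 1],
])
score = affinity(data)  # affinity computed over the columns of the binary data
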
20 changes: 8 additions & 12 deletions pyrocs/biosciences/functional_redundancy.py
@@ -1,6 +1,6 @@
import numpy as np

def functional_redundancy(p: np.array, delta: np.array) -> float:
def functional_redundancy(p: np.ndarray, delta: np.ndarray) -> float:
'''
This metric evaluates how interchangeable groups within a population are based
on the specific function they perform. As a biological concept,
@@ -18,19 +18,15 @@ def functional_redundancy(p: np.array, delta: np.array) -> float:
D &= \\sum_i(p_i*(1-p_i))
Args:
----------
p : np.array
Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1 and where the constraint 0 < p[i]
means that all calculations involve only those species that are actually present in
the assemblage with nonzero abundances.
delta : np.array
:math:`δ_{ij}` symmetric array of pairwise functional dissimilarities between species i and j
p (array): Relative abundances p[i] (i = 1, 2,…,N) with 0 < p[i] ≤ 1
and where the constraint 0 < p[i]
means that all calculations involve only those species that
are actually present in the assemblage with nonzero abundances.
delta (array): :math:`δ_{ij}` symmetric array of pairwise functional
dissimilarities between species i and j
Returns:
--------
FR : float
Functional Redundancy Score
float
'''

dim = len(p)
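
A minimal usage sketch of the documented signature (the abundances and dissimilarity matrix are hypothetical; the import path assumes functional_redundancy is re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import functional_redundancy

p = np.array([0.5, 0.3, 0.2])        # relative abundances, each 0 < p[i] <= 1
delta = np.array([                   # symmetric pairwise dissimilarities delta[i, j]
    [0.0, 0.4, 0.7],
    [0.4, 0.0, 0.5],
    [0.7, 0.5, 0.0],
])
fr = functional_redundancy(p, delta)  # Functional Redundancy score
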
20 changes: 10 additions & 10 deletions pyrocs/biosciences/hill_diversity.py
@@ -2,7 +2,7 @@
import numpy as np


def hill_shannon(p: np.array) -> float:
def hill_shannon(p: np.ndarray) -> float:
"""
The Hill-Shannon number is a specific instance (i.e. the Perplexity) of Hill Diversity,
which prioritizes neither common nor rare species.
@@ -20,15 +20,15 @@ def hill_shannon(p: np.array) -> float:
where :math:`q` approaches :math:`1` and the mean is the geometric mean
Args:
p: p[i] is the proportion of all individuals that belong to species i
p (array): p[i] is the proportion of all individuals that belong to species i
Returns:
A metric for effective count of species (diversity)
float
"""
entropy = -sum(x * np.log(x) for x in p if x > 0)
return math.exp(entropy)


def hill_simpson(p: np.array) -> float:
def hill_simpson(p: np.ndarray) -> float:
"""
The Hill-Simpson number is a specific instance (i.e. the Inverse Simpson Index)
of Hill Diversity that prioritizes the common species.
@@ -45,14 +45,14 @@ def hill_simpson(p: np.array) -> float:
where :math:`q=2` and the mean is the usual arithmetic mean
Args:
p: p[i] is the proportion of all individuals that belong to species i
p (array): p[i] is the proportion of all individuals that belong to species i
Returns:
A metric for effective count of species (diversity)
float
"""
return 1.0 / p.dot(p)


def hill_diversity(p: np.array, q: float) -> float:
def hill_diversity(p: np.ndarray, q: float) -> float:
"""
The Hill Numbers are a family of diversity metrics describing "effective number of species".
@@ -87,11 +87,11 @@ def hill_diversity(p: np.array, q: float) -> float:
species :math:`i`, :math:`q` is the exponent that determines the rarity scale on which the mean is taken
Args:
p: p[i] is the proportion of all individuals that belong to species i,
q: The exponent that determines the rarity scale on which the mean is taken.
p (array): p[i] is the proportion of all individuals that belong to species i,
q (float): The exponent that determines the rarity scale on which the mean is taken.
Species richness (q=0), Hill-Simpson diversity (q=2), Hill-Shannon diversity (q=1),
Returns:
D: a metric for effective count of species (diversity)
float
"""

# Special cases
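
A minimal sketch relating the three functions as described in the docstrings above (the proportions are hypothetical; the imports assume the functions are re-exported from pyrocs.biosciences):

import numpy as np
from pyrocs.biosciences import hill_diversity, hill_shannon, hill_simpson

p = np.array([0.6, 0.25, 0.1, 0.05])   # proportions of individuals per species

print(hill_diversity(p, q=0))                     # species richness: 4 species present
print(hill_diversity(p, q=1), hill_shannon(p))    # Hill-Shannon (perplexity)
print(hill_diversity(p, q=2), hill_simpson(p))    # Hill-Simpson (inverse Simpson)
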
18 changes: 9 additions & 9 deletions pyrocs/complex_systems/causal_complexity.py
@@ -2,7 +2,7 @@
import networkx as nx


def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
def cyclomatic_complexity(A : np.ndarray, directed : bool = False) -> float:
'''
Cyclomatic complexity reflects the number of linearly
independent paths within a system of interest
@@ -23,9 +23,9 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):
higher cyclomatic complexity values).
Args:
A: array
A (array)
Returns:
cyclomatic complexity of the graph
float
'''

if directed:
@@ -43,7 +43,7 @@ def cyclomatic_complexity(A : np.ndarray, directed : bool = False):

return E - N + 2.0 * P

def feedback_density(A : np.ndarray, directed : bool = False):
def feedback_density(A : np.ndarray, directed : bool = False) -> float:
'''
Feedback density captures the fraction of edges :math:`(E_{loop})`
and nodes (:math:`N_{loop}`) that are involved in at least one feedback loop.
@@ -64,9 +64,9 @@ def feedback_density(A : np.ndarray, directed : bool = False):
edges are included in one or more feedback loops.
Args:
A: array
A (array)
Returns:
feedback density of the graph
float
'''

if directed:
@@ -95,7 +95,7 @@ def feedback_density(A : np.ndarray, directed : bool = False):

return (Eloop + Nloop) / (Etot + Ntot)

def causal_complexity(A: np.ndarray, directed : bool = False):
def causal_complexity(A: np.ndarray, directed : bool = False) -> float:
'''
Causal complexity measures the underlying causal structure
of a system by considering both the system’s intricacy as
@@ -124,9 +124,9 @@ def causal_complexity(A: np.ndarray, directed : bool = False):
of causal complexity than those systems with lower feedback density.
Args:
A: array
A (array)
Returns:
causal complexity of the graph
float
'''
M = cyclomatic_complexity(A, directed=directed)
D = feedback_density(A, directed=directed)
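
A minimal sketch on a small directed graph (the adjacency matrix is hypothetical; the imports assume the three functions are re-exported from pyrocs.complex_systems):

import numpy as np
from pyrocs.complex_systems import (
    cyclomatic_complexity, feedback_density, causal_complexity)

# Hypothetical directed system with one feedback loop: 0 -> 1 -> 2 -> 0
A = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
])

M = cyclomatic_complexity(A, directed=True)  # E - N + 2P = 3 - 3 + 2*1 = 2.0
D = feedback_density(A, directed=True)       # every node and edge is on the loop -> 1.0
C = causal_complexity(A, directed=True)      # combines M and D as described above
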
8 changes: 4 additions & 4 deletions pyrocs/complex_systems/fluctuation_complexity.py
@@ -2,7 +2,7 @@
from collections import Counter
from functools import lru_cache

def fluctuation_complexity(A : list, L : int = 1):
def fluctuation_complexity(A, L : int = 1) -> float:
'''
Fluctuation complexity extends the characterization of discrete entropy
@@ -24,10 +24,10 @@ def fluctuation_complexity(A : list, L : int = 1):
respective frequencies of event :math:`i` and :math:`j` within the series.
Args:
A: Sequence of symbols
L: If > 1, groups symbols into short subsequences of length L.
A (array): Sequence of symbols
L (int): If > 1, groups symbols into short subsequences of length L.
Returns:
The Fluctuation Complexity of the sequence
float
'''
if L > 1:
A = [tuple(A[i: i + L]) for i in range(len(A) + 1 - L)]
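
A minimal usage sketch on a hypothetical symbol sequence (assuming fluctuation_complexity is re-exported from pyrocs.complex_systems):

from pyrocs.complex_systems import fluctuation_complexity

seq = ['a', 'b', 'a', 'a', 'b', 'a', 'b', 'b', 'a']
fc_symbols = fluctuation_complexity(seq)      # over individual symbols
fc_pairs = fluctuation_complexity(seq, L=2)   # over overlapping length-2 subsequences
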
6 changes: 3 additions & 3 deletions pyrocs/complex_systems/grc.py
@@ -2,7 +2,7 @@
import numpy as np


def grc(A : np.ndarray, directed : bool):
def grc(A : np.ndarray, directed : bool) -> float:
"""
Global reaching centrality (GRC) measures the level of hierarchy within a network based on flow.
The equation within the package follows the formulations from
@@ -23,11 +23,11 @@ def grc(A : np.ndarray, directed : bool):
versa :cite:p:`lakkaraju_complexity_2019`.
Args:
A: Square matrix of adjacencies in the network
A (array): Square matrix of adjacencies in the network
directed (bool): If true, assume A represents a directed graph (row -> column).
If false, assume A represents an undirected graph.
Returns:
Global reaching centrality of the graph
float
"""

if directed:
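
A minimal sketch on a hypothetical star-shaped hierarchy (assuming grc is re-exported from pyrocs.complex_systems); because the hub reaches every other node while the leaves reach none, the result should be near the maximum of 1:

import numpy as np
from pyrocs.complex_systems import grc

# Node 0 points to all other nodes; the leaves reach nothing.
A = np.array([
    [0, 1, 1, 1],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
])
print(grc(A, directed=True))
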
16 changes: 7 additions & 9 deletions pyrocs/information_theory/entropy.py
@@ -1,15 +1,13 @@

from collections import Counter
from collections.abc import Sequence

from scipy.stats import entropy
import numpy as np


def discrete_entropy(
values: Sequence,
counts: Sequence = None,
base: int = 2) -> float:
values: np.ndarray,
counts: np.ndarray = None,
base: int = 2) -> float:
"""
Entropy is often used to measure the state of disorder/randomness in a system.
The general equation follows the form:
@@ -18,7 +16,7 @@ def discrete_entropy(
H = - \\sum_{i=1}^N [p_i * \\log p_i]
where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i`th category,
where :math:`H` = entropy, :math:`p` = discrete probability of the occurrence of an event from the :math:`i^{\mathrm{th}}` category,
and :math:`N` is the total number of categories. Low entropy values indicate a well-ordered system
while higher entropy values indicate a higher state of disorder. The maximum possible value of the
entropy for a given system is :math:`log(N)`, and thus varies by group size. Please see
@@ -33,11 +31,11 @@ def discrete_entropy(
`scipy documentation <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html>`_ as well as the references noted above.
Args:
values (Sequence): Sequence of observed values from a random process
counts (Sequence[int]): Number of times each value was observed
values (array): Sequence of observed values from a random process
counts (array[int]): Number of times each value was observed
base (int): Base of returned entropy (default returns number of bits)
Returns:
mutual information between x and y
float
"""

if counts is None:
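
A minimal usage sketch of discrete_entropy with hypothetical coin-flip counts (the import path mirrors the one used in mutual_info.py below):

import numpy as np
from pyrocs.information_theory import discrete_entropy

values = np.array(["heads", "tails"])
counts = np.array([50, 50])
print(discrete_entropy(values, counts))           # 1.0 bit for a fair coin
print(discrete_entropy(values, counts, base=10))  # same entropy in base-10 units
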
30 changes: 13 additions & 17 deletions pyrocs/information_theory/kl_divergence.py
@@ -1,7 +1,6 @@
import numpy as np
# from scipy.special import xlogy

def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
def kl_divergence(p: np.ndarray, q: np.ndarray, base: int = 2) -> float:
"""
Sometimes called relative entropy, the Kullback-Leibler Divergence (KLD)
measures the similarity between two distributions
@@ -17,21 +16,18 @@ def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
where :math:`D` is the KLD value, :math:`N` is the total number of categories,
and :math:`p_i` and :math:`q_i` reflect the discrete probability of the occurrence
of an event from the :math:`i`th category of the sample distribution and
of an event from the :math:`i^{\mathrm{th}}` category of the sample distribution and
reference distribution respectively.
The function is able to calculate KLD for cases where not all categories from the reference distribution are present within the sample distribution.
Args:
p,q (numpy.ndarray): arrays, where rows for each constitute the two
probability distributions from which to calculate divergence. p
contains the distributions holding probabilities in the numerator of the
KL divergence summand.
base: log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
p (array): discrete probability distribution
q (array): discrete probability distribution
base (int): log base to compute from; base 2 (bits), base 10 (decimal/whole numbers), or base e (ecology, earth systems)
Returns:
numpy.ndarray: KL divergences, where the second array's rows are the
distributions in the numerator of the log in KL divergence
float
"""

assert p.shape == q.shape, 'p and q shapes must be identical'
@@ -52,7 +48,9 @@ def kl_divergence(p: np.array, q: np.array, base: int = 2) -> np.array:
return kl_div


def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
def novelty_transience_resonance(
thetas_arr: np.ndarray,
window: int) -> tuple[np.ndarray]:
"""
These three related metrics extend the Kullback-Leibler Divergence formulation to consider how
a distribution differs from past and future distributions within a sequence. Specifically, novelty
@@ -66,8 +64,8 @@ def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
.. math::
N_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i-k))]\\\\
T_w(p_i) &= (1/w)Sum(1 \\leq k \\leq w)[D(p_i || p_(i+k))]\\\\
N_w(p_i) &= (1/w)\sum(1 \\leq k \\leq w)[D(p_i || p_(i-k))]\\\\
T_w(p_i) &= (1/w)\sum(1 \\leq k \\leq w)[D(p_i || p_(i+k))]\\\\
R_w(p_i) &= N_w(p_i) - T_w(p_i)
where :math:`N` is novelty, :math:`T` is transience, :math:`R` is resonance,
@@ -78,12 +76,10 @@ def novelty_transience_resonance(thetas_arr : np.array, window : int) -> list:
equation for the KLD.
Args:
thetas_arr (numpy.ndarray): rows are topic mixtures
thetas_arr (array): rows are topic mixtures
window (int): positive integer defining scale or scale size
Returns:
novelties
transiences
resonances
tuple(array): novelties, transiences, resonances
"""

# Find the first and last center speech offset, given window size.
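
A minimal sketch of both functions with hypothetical distributions and topic mixtures (assuming both are re-exported from pyrocs.information_theory):

import numpy as np
from pyrocs.information_theory import kl_divergence, novelty_transience_resonance

p = np.array([0.5, 0.3, 0.2])   # sample distribution
q = np.array([0.4, 0.4, 0.2])   # reference distribution
print(kl_divergence(p, q))      # in bits, since base defaults to 2

# Rows are topic mixtures for a sequence of documents or speeches.
thetas = np.array([
    [0.7, 0.3],
    [0.6, 0.4],
    [0.2, 0.8],
    [0.3, 0.7],
    [0.5, 0.5],
])
novelties, transiences, resonances = novelty_transience_resonance(thetas, window=1)
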
22 changes: 10 additions & 12 deletions pyrocs/information_theory/mutual_info.py
@@ -1,15 +1,11 @@

from collections.abc import Sequence
import os
import sys

import numpy as np
from pyrocs.information_theory import discrete_entropy


def mutual_info(
x: Sequence,
y: Sequence,
counts: Sequence = None,
x: np.ndarray,
y: np.ndarray,
counts: np.ndarray = None,
base: int = 2) -> float:
"""
Mutual information measures how much knowledge is gained about one random variable when another is observed.
@@ -36,14 +32,16 @@ def mutual_info(
when the other is observed.
Args:
x,y (numpy.ndarray): arrays, discretized observations from random
distributions x \in X and y \in Y
counts (Sequence[int]): If present, the number of times each (x,y) pair was
x (array): discretized observations from random
distribution x \in X
y (array): discretized observations from random
distribution y \in Y
counts (array[int]): If present, the number of times each (x,y) pair was
observed
base (int): If present the base in which to return the entropy
Returns:
mutual information between x and y
float
"""
x_entropy = discrete_entropy(x, counts, base)
y_entropy = discrete_entropy(y, counts, base)
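
A minimal usage sketch with hypothetical discretized observations (assuming mutual_info is re-exported from pyrocs.information_theory):

import numpy as np
from pyrocs.information_theory import mutual_info

x = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y = np.array([0, 0, 1, 1, 0, 1, 1, 0])  # mostly tracks x, with two mismatches
print(mutual_info(x, y))                # shared information in bits
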
