Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Add fisher z test #7

Merged
merged 15 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
sudo apt install libspatialindex-dev xdg-utils
- python/install-packages:
pkg-manager: poetry
args: "-E graph_func -E viz --with docs"
args: "--with docs"
cache-version: "v1" # change to clear cache
- run:
name: Check poetry package versions
Expand Down
6 changes: 5 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@ Conditional Independence Testing
Testing for conditional independence among variables is a core part
of many data analysis procedures.

TBD...
.. currentmodule:: pywhy_stats
.. autosummary::
:toctree: generated/

fisherz
1 change: 1 addition & 0 deletions doc/whats_new/v0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Version 0.1
Changelog
---------

- |Feature| Implement partial correlation test :func:`pywhy_stats.fisherz`, by `Adam Li`_ (:pr:`7`)


Code and Documentation Contributors
Expand Down
40 changes: 20 additions & 20 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pywhy_stats/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from . import fisherz
from ._version import __version__ # noqa: F401
116 changes: 116 additions & 0 deletions pywhy_stats/fisherz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from math import log, sqrt
from typing import Optional

import numpy as np
from numpy.typing import ArrayLike
from scipy.stats import norm

from .p_value_result import PValueResult


def ind(X: ArrayLike, Y: ArrayLike, correlation_matrix: Optional[ArrayLike] = None) -> PValueResult:
"""Perform an independence test using Fisher-Z's test.

Works on Gaussian random variables. This test is also known as the
correlation test.

Parameters
----------
X : ArrayLike of shape (n_samples,)
The first node variable.
Y : ArrayLike of shape (n_samples,)
The second node variable.
correlation_matrix : ArrayLike of shape (2, 2), optional
The precomputed correlation matrix between X and Y., by default None.

Returns
-------
X : float
adam2392 marked this conversation as resolved.
Show resolved Hide resolved
The test statistic.
p : float
The p-value of the test.
"""
return _fisherz(X, Y, condition_on=None, correlation_matrix=correlation_matrix)


def condind(
X: ArrayLike,
Y: ArrayLike,
condition_on: ArrayLike,
correlation_matrix: Optional[ArrayLike] = None,
) -> PValueResult:
"""Perform an independence test using Fisher-Z's test.

Works on Gaussian random variables. This test is also known as the
correlation test.

Parameters
----------
X : ArrayLike of shape (n_samples,)
The first node variable.
Y : ArrayLike of shape (n_samples,)
The second node variable.
condition_on : ArrayLike of shape (n_samples, n_variables)
The conditioning set.
correlation_matrix : ArrayLike of shape (2 + n_variables, 2 + n_variables), optional
The precomputed correlation matrix between X, Y and ``condition_on``, by default None.

Returns
-------
X : float
adam2392 marked this conversation as resolved.
Show resolved Hide resolved
The test statistic.
p : float
The p-value of the test.
"""
return _fisherz(X, Y, condition_on=condition_on, correlation_matrix=correlation_matrix)


def _fisherz(
X: ArrayLike,
Y: ArrayLike,
condition_on: Optional[ArrayLike] = None,
correlation_matrix: Optional[ArrayLike] = None,
) -> PValueResult:
"""Perform an independence test using Fisher-Z's test.
adam2392 marked this conversation as resolved.
Show resolved Hide resolved

Works on Gaussian random variables. This test is also known as the
partial correlation test.

Parameters
----------
X : ArrayLike of shape (n_samples,)
The first node variable.
Y : ArrayLike of shape (n_samples,)
The second node variable.
condition_on : ArrayLike of shape (n_samples, n_variables)
If `None` (default), will run a marginal independence test.
correlation_matrix : np.ndarray of shape (n_variables, n_variables), optional
``None`` means without the parameter of correlation matrix and
the correlation will be computed from the data., by default None.

Returns
-------
X : float
The test statistic.
p : float
The p-value of the test.
"""
if condition_on is None:
condition_on = np.empty((X.shape[0], 0))

# compute the correlation matrix within the specified data
data = np.hstack((X, Y, condition_on))
sample_size = data.shape[0]
if correlation_matrix is None:
correlation_matrix = np.corrcoef(data.T)

inv = np.linalg.pinv(correlation_matrix)
r = -inv[0, 1] / sqrt(inv[0, 0] * inv[1, 1])

# apply the Fisher Z-transformation
Z = 0.5 * log((1 + r) / (1 - r))

# compute the test statistic
statistic = sqrt(sample_size - condition_on.shape[1] - 3) * abs(Z)
p = 2 * (1 - norm.cdf(abs(statistic)))
adam2392 marked this conversation as resolved.
Show resolved Hide resolved
return PValueResult(statistic, p)
27 changes: 22 additions & 5 deletions pywhy_stats/independence.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from enum import Enum
from types import ModuleType
from typing import Optional

from numpy.testing import ArrayLike
from numpy.typing import ArrayLike

from pywhy_stats import fisherz

from .p_value_result import PValueResult

Expand All @@ -10,6 +13,7 @@ class Methods(Enum):
"""Methods for independence testing."""

AUTO = 0
FISHERZ = fisherz


def independence_test(
Expand All @@ -26,11 +30,11 @@ def independence_test(

Parameters
----------
X : numpy.ndarray, shape (n, d)
X : ArrayLike, shape (n_samples, n_features_x)
Data matrix for X.
Y : numpy.ndarray, shape (n, m)
Y : ArrayLike, shape (n_samples, n_features_y)
Data matrix for Y.
condition_on : numpy.ndarray or None, shape (n, k), optional
condition_on : ArrayLike or None, shape (n_samples, n_features_z), optional
Data matrix for the conditioning variables. If None is given, an unconditional test
is performed.
method : Methods, optional
Expand All @@ -44,5 +48,18 @@ def independence_test(
result : PValueResult
An instance of the PValueResult data class, containing the p-value, test statistic,
and any additional information related to the independence test.

See Also
--------
fisherz : Fisher's Z test for independence
"""
pass
method_func: ModuleType
if method == Methods.AUTO:
method_func = Methods.FISHERZ
else:
method_func = method

if condition_on is None:
return method_func.ind(X, Y, method, **kwargs)
adam2392 marked this conversation as resolved.
Show resolved Hide resolved
else:
return method_func.condind(X, Y, condition_on, method, **kwargs)
12 changes: 6 additions & 6 deletions pywhy_stats/p_value_result.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from dataclasses import dataclass
from typing import Optional, Union

from numpy.testing import ArrayLike
from numpy.typing import ArrayLike


@dataclass
Expand All @@ -10,16 +10,16 @@ class PValueResult:

Attributes
----------
p_value: float
pvalue : float
The p-value represents the probability of observing the given test statistic, or more
extreme results, under a certain null hypothesis.
test_statistic: float or numpy.ndarray or None
statistic : float or ArrayLike or None
The test statistic of the hypothesis test, which might not always be available.
additional_information: object or None
additional_information : object or None
Any additional information or metadata relevant to the specific test conducted. These could
also be a state of the method to re-use it.
"""

p_value: float
test_statistic: Optional[Union[float, ArrayLike]] = None
pvalue: float
statistic: Optional[Union[float, ArrayLike]] = None
additional_information: Optional[object] = None
44 changes: 44 additions & 0 deletions tests/test_fisherz_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import flaky
import numpy as np

from pywhy_stats import fisherz


@flaky.flaky(max_runs=3, min_passes=1)
def test_fisherz_marg_ind():
"""Test FisherZ marginal independence test for Gaussian data."""
rng = np.random.default_rng()

# We construct a SCM where X1 -> Y <- X and Y -> Z
# so X1 is independent from X, but conditionally dependent
# given Y or Z
X = rng.standard_normal((300, 1))
X1 = rng.standard_normal((300, 1))
Y = X + X1 + 0.5 * rng.standard_normal((300, 1))
Z = Y + 0.1 * rng.standard_normal((300, 1))

res = fisherz.ind(X, X1)
assert res.pvalue > 0.05
res = fisherz.ind(X, Z)
assert res.pvalue < 0.05
bloebp marked this conversation as resolved.
Show resolved Hide resolved


@flaky.flaky(max_runs=3, min_passes=1)
def test_fisherz_cond_ind():
"""Test FisherZ conditional independence test for Gaussian data."""
rng = np.random.default_rng()

# We construct a SCM where X1 -> Y <- X and Y -> Z
# so X1 is independent from X, but conditionally dependent
# given Y or Z
X = rng.standard_normal((300, 1))
X1 = rng.standard_normal((300, 1))
Y = X + X1 + 0.5 * rng.standard_normal((300, 1))
Z = Y + 0.1 * rng.standard_normal((300, 1))

res = fisherz.condind(X, X1, Z)
assert res.pvalue < 0.05
res = fisherz.condind(X, X1, Y)
assert res.pvalue < 0.05
res = fisherz.condind(X, Z, Y)
assert res.pvalue > 0.05