
Commit

Merge pull request #1 from HarrisonWilde/master
Moving to working from main repo
HarrisonWilde authored Sep 12, 2024
2 parents c5c1b44 + 0a164ad commit f8083e8
Showing 15 changed files with 3,960 additions and 705 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/ruff.yaml
@@ -0,0 +1,12 @@
name: Run `ruff`

on: [push, pull_request]

jobs:
check_format:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
- name: Run ruff
uses: chartboost/ruff-action@v1
45 changes: 45 additions & 0 deletions .gitignore
@@ -0,0 +1,45 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Auxiliary
.DS_Store
site/
.ruff_cache/

# Tests
.coverage
.tox/
.pytest_cache/
coverage.xml
coverage-badge.svg
14 changes: 14 additions & 0 deletions .poetry-sync-db.json
@@ -0,0 +1,14 @@
{
"nbstripout": {
"repo": "https://github.com/kynan/nbstripout",
"rev": "${rev}"
},
"pre-commit-hooks": {
"repo": "https://github.com/pre-commit/pre-commit-hooks",
"rev": "v${rev}"
},
"ruff": {
"repo": "https://github.com/astral-sh/ruff-pre-commit",
"rev": "v${rev}"
}
}
26 changes: 26 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,26 @@
repos:
- repo: https://github.com/floatingpurr/sync_with_poetry
rev: 1.1.0
hooks:
- id: sync_with_poetry
name: sync poetry versions
args: [--db, .poetry-sync-db.json]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-toml
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
types_or: [python, pyi, jupyter]
- id: ruff-format
- repo: https://github.com/kynan/nbstripout
rev: 0.6.2
hooks:
- id: nbstripout
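
As a side note (a usage hint, not part of this diff): the hooks configured above are typically enabled locally by running `pre-commit install` once, and can be applied to the whole repository with `pre-commit run --all-files`, assuming pre-commit itself is already installed in the environment.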
271 changes: 30 additions & 241 deletions DAindex-tutorial.ipynb

Large diffs are not rendered by default.

Empty file added DAindex/__init__.py
Empty file.
296 changes: 296 additions & 0 deletions DAindex/core.py
@@ -0,0 +1,296 @@
import logging
from typing import Any, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity


def vis_plot(
x,
x_d,
logprob,
ylabel="Percentage/PDF",
title="",
vlines=[],
vline_colors=[],
vline_labels=[],
hist=True,
reverse=False,
):
"""
    Plot the probability density function (PDF) and, optionally, a histogram for comparison.
"""
if hist:
pd.DataFrame(x).rename(columns={0: "Histogram"}).plot.hist(
            bins=20, alpha=0.5, color="goldenrod", density=True, figsize=(18, 16)
)
else:
plt.figure(figsize=(8, 2), dpi=100)
plt.fill_between(x_d, np.exp(logprob), alpha=0.5, label="Probability Density Function")
# plt.plot(x, np.full_like(x, -0.005), '|k', markeredgewidth=1)
for vl, c, label in zip(vlines, vline_colors, vline_labels):
plt.axvline(x=vl, color=c, linestyle="--", label=f"{label}:{vl}")

if len(vlines) > 0:
if not reverse:
plt.axvspan(vlines[-1], x_d[-1], facecolor="b", alpha=0.05)
else:
plt.axvspan(x_d[0], vlines[-1], facecolor="b", alpha=0.05)

plt.legend()
plt.ylabel("")
plt.title(title)
plt.yticks([])
# plt.ylim((0, .03))
# plt.xlim((10, 40))


def grid_search_bandwith(x):
"""
    Search for the best bandwidth for KDE.
"""
from sklearn.model_selection import GridSearchCV, KFold

    # the bandwidth must be strictly positive for KernelDensity, so start the grid just above 0
    bandwidths = np.linspace(0.05, 1, 20)
grid = GridSearchCV(KernelDensity(kernel="gaussian"), {"bandwidth": bandwidths}, cv=KFold(5))
grid.fit(x)
return grid.best_params_["bandwidth"]


def get_transformer(X):
"""
    Tried transforming the input data before KDE.
    It did not work, so this is not in use.
"""
from sklearn.preprocessing import PowerTransformer

# bc = PowerTransformer(method="box-cox")
yj = PowerTransformer(method="yeo-johnson")
X_trans_bc = yj.fit(X)
# norm_t = Normalizer()
# norm_t = norm_t.fit(X_trans_bc.transform(X))
return X_trans_bc
# .transform(X)
# logging.info(min(X_trans_bc), max(X_trans_bc), np.std(X_trans_bc))
# return X_trans_bc


def get_threshold_index(threshold, low_bound, is_discrete, prev_val_offset, step, boundary_offset):
"""
    Calculate the bin index corresponding to the threshold value.
"""
threshold_index = int((threshold - low_bound) / step)
if threshold == low_bound + boundary_offset:
logging.info(f"adjust boundary {threshold}")
threshold_index = int((threshold - low_bound - boundary_offset) / step)
elif is_discrete:
        # Discrete values lead to a pulse-shaped PDF, so it is important to start from the valley
        # between the pulse to be included and the previous one.
        # The following finds that valley index.
threshold_index_prev = int((threshold - prev_val_offset - low_bound) / step)
threshold_index = int(threshold_index_prev + (threshold_index - threshold_index_prev) / 2)
return threshold_index


def search_for_zero_mass_index(kde, min_v, n_samples=100):
"""
Search near zero probability mass for boundary adjustment
"""
bins = np.linspace(min_v - 10, min_v, n_samples)
pd_vals = np.exp(kde.score_samples(bins.reshape(-1, 1)))
first_zero_idx = np.max(np.where(pd_vals < 1e-10))
return first_zero_idx, bins[first_zero_idx]


def kde_estimate(X, bandwidth=1, kernel="gaussian", search_bandwidth=True):
"""
    Kernel density estimation to obtain a probability density estimate.
"""
    # KDE estimate (KernelDensity is imported at the module level)

best_bw = bandwidth
if search_bandwidth:
best_bw = grid_search_bandwith(X)
logging.info(f"learned best bandwidth {best_bw}")

kde = KernelDensity(bandwidth=best_bw, kernel=kernel)
kde.fit(X)

return kde, best_bw


def deterioration_index(
X,
low_bound,
up_bound,
threshold,
n_samples=10000,
plot_title="",
is_discrete=False,
prev_discrete_value_offset=1,
weight_sum_steps=10,
reverse=False,
bandwidth=1,
kernel="gaussian",
search_bandwidth=True,
do_plot=True,
):
"""
    Obtain the deterioration index.

    X - the random sample of measurements
    low_bound/up_bound - the boundary values of the measurement
    threshold - the severity threshold on the measurement
    n_samples - number of bins to use for the probability calculation. Default is 10000.
    plot_title - the title of the plot, if a plot is generated. Default is an empty string.
    is_discrete - whether the random sample is discrete. NB: this may be overwritten based on the learned bandwidth,
        since small bandwidths always produce pulse-like PDFs. Default is False.
    prev_discrete_value_offset - the difference between the threshold and the previous legitimate value. Default is 1.
    weight_sum_steps - the number of bins for the weighted sum of k-step cutoffs. Default is 10.
    reverse - for calculating p(X < threshold), i.e. the smaller the measured value, the more severe the patient.
        Default is False.
    bandwidth - the bandwidth to use when not searching for one. Default is 1.
    kernel - the kernel to use for KDE. Default is "gaussian".
    search_bandwidth - whether to use grid search to find the optimal bandwidth for X. Default is True.
    do_plot - whether to generate plots. Default is True.
"""

# estimate density function
    kde, fitted_bandwidth = kde_estimate(X, bandwidth=bandwidth, kernel=kernel, search_bandwidth=search_bandwidth)

    # detect pulse-like PDF
    if fitted_bandwidth < 0.1:
        # force is_discrete to True because such a small fitted bandwidth produces a pulse-like PDF
        logging.info(f"FORCED to be discrete because the bandwidth is {fitted_bandwidth}")
        is_discrete = True
    elif fitted_bandwidth > 0.7:
        # force is_discrete to False because such a large fitted bandwidth smooths out any pulses
        logging.info(f"FORCED to be NOT discrete because the bandwidth is {fitted_bandwidth}")
        is_discrete = False

# automatically adjust on boundaries
_, adjusted_min = search_for_zero_mass_index(kde, low_bound)
logging.info(f"adjusted min val {adjusted_min}")
boundary_offset = low_bound - adjusted_min
# orig_low_bound = low_bound
low_bound -= boundary_offset
up_bound += boundary_offset

# use learned KDE estimator to get probability
bins = np.linspace(low_bound, up_bound, n_samples)
kd_vals = kde.score_samples(bins.reshape(-1, 1)) # Get PDF values for each x
step_width = (up_bound - low_bound) / n_samples # get the step
prob = np.exp(kd_vals) * step_width # get the approximate prob at each point using the integral of the PDF

if do_plot:
tidx = get_threshold_index(
threshold,
low_bound,
is_discrete,
prev_discrete_value_offset,
step_width,
boundary_offset=boundary_offset,
)
vis_plot(
X,
bins,
kd_vals,
title=plot_title,
vlines=[threshold, round(tidx * step_width + low_bound, 2)],
vline_colors=["r", "b"],
vline_labels=["Threshold", "Boudary-adjusted"],
hist=False,
reverse=reverse,
)

# severity quantification

if reverse:
s = low_bound
e = min(threshold, up_bound)
else:
s = max(threshold, low_bound)
e = up_bound
    # 1. binary quantification, e.g. multimorbidity count > 3: yes or no
sq1 = stepped_severity(
prob,
s,
e,
1,
low_bound,
step_width,
is_discrete,
boundary_offset,
prev_val_offset=prev_discrete_value_offset,
reverse=reverse,
)
    # 2. stepped quantification: the higher (or lower, if reversed) the value, the more severe the patient
sqs = stepped_severity(
prob,
s,
e,
weight_sum_steps,
low_bound,
step_width,
is_discrete,
boundary_offset,
prev_val_offset=prev_discrete_value_offset,
reverse=reverse,
)

return {"overall-prob": round(prob.sum(), 4), "one-step": round(sq1, 4), "k-step": round(sqs, 6), "|X|": len(X)}


def stepped_severity(
probs, s, e, steps, low_bound, step_width, is_discrete, boundary_offset, prev_val_offset=1, reverse=False
) -> float:
"""
    Quantify severity by treating higher values as more severe.
    This is done as a weighted sum of probabilities integrated upwards from the threshold s.
"""

bins = np.linspace(s, e, steps)
# bin_probs = [probs[get_threshold_index(t, low_bound, is_discrete, step_width):].sum() for t in bins]
bin_probs = []
for i, t in enumerate(bins):
idx1 = get_threshold_index(t, low_bound, is_discrete, prev_val_offset, step_width, boundary_offset)
if i < len(bins) - 1:
idx2 = get_threshold_index(
bins[i + 1], low_bound, is_discrete, prev_val_offset, step_width, boundary_offset
)
bin_probs.append(probs[idx1:idx2].sum())
        else:
            if not reverse:
                bin_probs.append(probs[idx1:].sum())
            else:
                idx_end = get_threshold_index(
                    e, low_bound, is_discrete, prev_val_offset, step_width, boundary_offset
                )
                bin_probs.append(probs[idx1:idx_end].sum())
s = 0

# weight functions
# weight_function = lambda x: log(x+2, 2)
def weight_function(x: Any) -> Union[int, Any]:
return x + 1

w = 0
for i, p in enumerate(bin_probs):
if not reverse:
s += weight_function(i) * p
else:
s += weight_function(len(bin_probs) - i) * p
w += weight_function(i)
return s / w


def db_ineq(di1, di2) -> float:
"""
    Quantify inequality as the ratio of the two k-step deterioration indices, minus 1.
"""
return di1["k-step"] / di2["k-step"] - 1
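
For orientation, a minimal usage sketch of the new module follows. The data, bounds, and the threshold of 25 are hypothetical illustrations and not part of this diff; only DAindex.core, deterioration_index, and db_ineq come from the code above.

import numpy as np

from DAindex.core import db_ineq, deterioration_index

# Hypothetical severity measurements for two patient groups, reshaped to the
# (n_samples, 1) layout expected by scikit-learn's KernelDensity.
rng = np.random.default_rng(42)
group_a = rng.normal(20, 5, 500).reshape(-1, 1)
group_b = rng.normal(23, 5, 500).reshape(-1, 1)

# Deterioration index for each group over an assumed measurement range [0, 60],
# treating values above an assumed threshold of 25 as severe.
di_a = deterioration_index(group_a, 0, 60, 25, plot_title="Group A", do_plot=False)
di_b = deterioration_index(group_b, 0, 60, 25, plot_title="Group B", do_plot=False)

print(di_a)                 # {"overall-prob": ..., "one-step": ..., "k-step": ..., "|X|": 500}
print(db_ineq(di_b, di_a))  # relative inequality between the two groups' k-step indices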