
Commit

Merge pull request #1 from HarrisonWilde/master
Moving to working from main repo
HarrisonWilde authored Sep 12, 2024
2 parents c5c1b44 + 0a164ad commit f8083e8
Showing 15 changed files with 3,960 additions and 705 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/ruff.yaml
@@ -0,0 +1,12 @@
name: Run `ruff`

on: [push, pull_request]

jobs:
check_format:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
- name: Run ruff
uses: chartboost/ruff-action@v1
45 changes: 45 additions & 0 deletions .gitignore
@@ -0,0 +1,45 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Auxiliary
.DS_Store
site/
.ruff_cache/

# Tests
.coverage
.tox/
.pytest_cache/
coverage.xml
coverage-badge.svg
14 changes: 14 additions & 0 deletions .poetry-sync-db.json
@@ -0,0 +1,14 @@
{
"nbstripout": {
"repo": "https://github.com/kynan/nbstripout",
"rev": "${rev}"
},
"pre-commit-hooks": {
"repo": "https://github.com/pre-commit/pre-commit-hooks",
"rev": "v${rev}"
},
"ruff": {
"repo": "https://github.com/astral-sh/ruff-pre-commit",
"rev": "v${rev}"
}
}
26 changes: 26 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,26 @@
repos:
- repo: https://github.com/floatingpurr/sync_with_poetry
rev: 1.1.0
hooks:
- id: sync_with_poetry
name: sync poetry versions
args: [--db, .poetry-sync-db.json]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-toml
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
types_or: [python, pyi, jupyter]
- id: ruff-format
- repo: https://github.com/kynan/nbstripout
rev: 0.6.2
hooks:
- id: nbstripout
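
As a side note (a usage hint, not part of this diff): the hooks configured above are typically enabled locally by running `pre-commit install` once, and can be applied to the whole repository with `pre-commit run --all-files`, assuming pre-commit itself is already installed in the environment.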
271 changes: 30 additions & 241 deletions DAindex-tutorial.ipynb

Large diffs are not rendered by default.

Empty file added DAindex/__init__.py
Empty file.
296 changes: 296 additions & 0 deletions DAindex/core.py
@@ -0,0 +1,296 @@
import logging
from typing import Any, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity


def vis_plot(
x,
x_d,
logprob,
ylabel="Percentage/PDF",
title="",
vlines=[],
vline_colors=[],
vline_labels=[],
hist=True,
reverse=False,
):
"""
    Plot the probability density function (PDF) and, optionally, a histogram for comparison.
"""
if hist:
pd.DataFrame(x).rename(columns={0: "Histogram"}).plot.hist(
            bins=20, alpha=0.5, color="goldenrod", density=True, figsize=(18, 16)
)
else:
plt.figure(figsize=(8, 2), dpi=100)
plt.fill_between(x_d, np.exp(logprob), alpha=0.5, label="Probability Density Function")
# plt.plot(x, np.full_like(x, -0.005), '|k', markeredgewidth=1)
for vl, c, label in zip(vlines, vline_colors, vline_labels):
plt.axvline(x=vl, color=c, linestyle="--", label=f"{label}:{vl}")

if len(vlines) > 0:
if not reverse:
plt.axvspan(vlines[-1], x_d[-1], facecolor="b", alpha=0.05)
else:
plt.axvspan(x_d[0], vlines[-1], facecolor="b", alpha=0.05)

plt.legend()
plt.ylabel("")
plt.title(title)
plt.yticks([])
# plt.ylim((0, .03))
# plt.xlim((10, 40))


def grid_search_bandwith(x):
"""
    Search for the best bandwidth for KDE.
"""
from sklearn.model_selection import GridSearchCV, KFold

    # the bandwidth must be strictly positive for KernelDensity, so start the grid just above 0
    bandwidths = np.linspace(0.05, 1, 20)
grid = GridSearchCV(KernelDensity(kernel="gaussian"), {"bandwidth": bandwidths}, cv=KFold(5))
grid.fit(x)
return grid.best_params_["bandwidth"]


def get_transformer(X):
"""
    Tried transforming the input data before KDE.
    It did not work, so this is not in use.
"""
from sklearn.preprocessing import PowerTransformer

# bc = PowerTransformer(method="box-cox")
yj = PowerTransformer(method="yeo-johnson")
X_trans_bc = yj.fit(X)
# norm_t = Normalizer()
# norm_t = norm_t.fit(X_trans_bc.transform(X))
return X_trans_bc
# .transform(X)
# logging.info(min(X_trans_bc), max(X_trans_bc), np.std(X_trans_bc))
# return X_trans_bc


def get_threshold_index(threshold, low_bound, is_discrete, prev_val_offset, step, boundary_offset):
"""
    Calculate the bin index corresponding to the threshold value.
"""
threshold_index = int((threshold - low_bound) / step)
if threshold == low_bound + boundary_offset:
logging.info(f"adjust boundary {threshold}")
threshold_index = int((threshold - low_bound - boundary_offset) / step)
elif is_discrete:
        # Discrete values lead to a pulse-shaped PDF, so it is important to start from the valley
        # between the pulse to be included and the previous one.
        # The following finds that valley index.
threshold_index_prev = int((threshold - prev_val_offset - low_bound) / step)
threshold_index = int(threshold_index_prev + (threshold_index - threshold_index_prev) / 2)
return threshold_index


def search_for_zero_mass_index(kde, min_v, n_samples=100):
"""
Search near zero probability mass for boundary adjustment
"""
bins = np.linspace(min_v - 10, min_v, n_samples)
pd_vals = np.exp(kde.score_samples(bins.reshape(-1, 1)))
first_zero_idx = np.max(np.where(pd_vals < 1e-10))
return first_zero_idx, bins[first_zero_idx]


def kde_estimate(X, bandwidth=1, kernel="gaussian", search_bandwidth=True):
"""
    Kernel density estimation to obtain a probability density estimate.
"""
    # KDE estimate (KernelDensity is imported at the module level)

best_bw = bandwidth
if search_bandwidth:
best_bw = grid_search_bandwith(X)
logging.info(f"learned best bandwidth {best_bw}")

kde = KernelDensity(bandwidth=best_bw, kernel=kernel)
kde.fit(X)

return kde, best_bw


def deterioration_index(
X,
low_bound,
up_bound,
threshold,
n_samples=10000,
plot_title="",
is_discrete=False,
prev_discrete_value_offset=1,
weight_sum_steps=10,
reverse=False,
bandwidth=1,
kernel="gaussian",
search_bandwidth=True,
do_plot=True,
):
"""
    Obtain the deterioration index.

    X - the random sample of measurements
    low_bound/up_bound - the boundary values of the measurement
    threshold - the severity threshold on the measurement
    n_samples - number of bins to use for the probability calculation. Default is 10000.
    plot_title - the title of the plot, if a plot is generated. Default is an empty string.
    is_discrete - whether the random sample is discrete. NB: this may be overwritten based on the learned bandwidth,
        since small bandwidths always produce pulse-like PDFs. Default is False.
    prev_discrete_value_offset - the difference between the threshold and the previous legitimate value. Default is 1.
    weight_sum_steps - the number of bins for the weighted sum of k-step cutoffs. Default is 10.
    reverse - for calculating p(X < threshold), i.e. the smaller the measured value, the more severe the patient.
        Default is False.
    bandwidth - the bandwidth to use when not searching for one. Default is 1.
    kernel - the kernel to use for KDE. Default is "gaussian".
    search_bandwidth - whether to use grid search to find the optimal bandwidth for X. Default is True.
    do_plot - whether to generate plots. Default is True.
"""

# estimate density function
    kde, fitted_bandwidth = kde_estimate(X, bandwidth=bandwidth, kernel=kernel, search_bandwidth=search_bandwidth)

    # detect pulse-like PDF
    if fitted_bandwidth < 0.1:
        # force is_discrete to True because such a small fitted bandwidth produces a pulse-like PDF
        logging.info(f"FORCED to be discrete because the bandwidth is {fitted_bandwidth}")
        is_discrete = True
    elif fitted_bandwidth > 0.7:
        # force is_discrete to False because such a large fitted bandwidth smooths out any pulses
        logging.info(f"FORCED to be NOT discrete because the bandwidth is {fitted_bandwidth}")
        is_discrete = False

# automatically adjust on boundaries
_, adjusted_min = search_for_zero_mass_index(kde, low_bound)
logging.info(f"adjusted min val {adjusted_min}")
boundary_offset = low_bound - adjusted_min
# orig_low_bound = low_bound
low_bound -= boundary_offset
up_bound += boundary_offset

# use learned KDE estimator to get probability
bins = np.linspace(low_bound, up_bound, n_samples)
kd_vals = kde.score_samples(bins.reshape(-1, 1)) # Get PDF values for each x
step_width = (up_bound - low_bound) / n_samples # get the step
prob = np.exp(kd_vals) * step_width # get the approximate prob at each point using the integral of the PDF

if do_plot:
tidx = get_threshold_index(
threshold,
low_bound,
is_discrete,
prev_discrete_value_offset,
step_width,
boundary_offset=boundary_offset,
)
vis_plot(
X,
bins,
kd_vals,
title=plot_title,
vlines=[threshold, round(tidx * step_width + low_bound, 2)],
vline_colors=["r", "b"],
vline_labels=["Threshold", "Boudary-adjusted"],
hist=False,
reverse=reverse,
)

# severity quantification

if reverse:
s = low_bound
e = min(threshold, up_bound)
else:
s = max(threshold, low_bound)
e = up_bound
    # 1. binary quantification, e.g. multimorbidity count > 3: yes or no
sq1 = stepped_severity(
prob,
s,
e,
1,
low_bound,
step_width,
is_discrete,
boundary_offset,
prev_val_offset=prev_discrete_value_offset,
reverse=reverse,
)
    # 2. stepped quantification: the higher (or lower, if reversed) the value, the more severe the patient
sqs = stepped_severity(
prob,
s,
e,
weight_sum_steps,
low_bound,
step_width,
is_discrete,
boundary_offset,
prev_val_offset=prev_discrete_value_offset,
reverse=reverse,
)

return {"overall-prob": round(prob.sum(), 4), "one-step": round(sq1, 4), "k-step": round(sqs, 6), "|X|": len(X)}


def stepped_severity(
probs, s, e, steps, low_bound, step_width, is_discrete, boundary_offset, prev_val_offset=1, reverse=False
) -> float:
"""
    Quantify severity by treating higher values as more severe.
    This is done as a weighted sum of probabilities integrated upwards from the threshold s.
"""

bins = np.linspace(s, e, steps)
# bin_probs = [probs[get_threshold_index(t, low_bound, is_discrete, step_width):].sum() for t in bins]
bin_probs = []
for i, t in enumerate(bins):
idx1 = get_threshold_index(t, low_bound, is_discrete, prev_val_offset, step_width, boundary_offset)
if i < len(bins) - 1:
idx2 = get_threshold_index(
bins[i + 1], low_bound, is_discrete, prev_val_offset, step_width, boundary_offset
)
bin_probs.append(probs[idx1:idx2].sum())
        else:
            if not reverse:
                bin_probs.append(probs[idx1:].sum())
            else:
                idx_end = get_threshold_index(
                    e, low_bound, is_discrete, prev_val_offset, step_width, boundary_offset
                )
                bin_probs.append(probs[idx1:idx_end].sum())
s = 0

# weight functions
# weight_function = lambda x: log(x+2, 2)
def weight_function(x: Any) -> Union[int, Any]:
return x + 1

w = 0
for i, p in enumerate(bin_probs):
if not reverse:
s += weight_function(i) * p
else:
s += weight_function(len(bin_probs) - i) * p
w += weight_function(i)
return s / w


def db_ineq(di1, di2) -> float:
"""
    Quantify inequality as the ratio of the two k-step deterioration indices, minus 1.
"""
return di1["k-step"] / di2["k-step"] - 1
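
For orientation, a minimal usage sketch of the new module follows. The data, bounds, and the threshold of 25 are hypothetical illustrations and not part of this diff; only DAindex.core, deterioration_index, and db_ineq come from the code above.

import numpy as np

from DAindex.core import db_ineq, deterioration_index

# Hypothetical severity measurements for two patient groups, reshaped to the
# (n_samples, 1) layout expected by scikit-learn's KernelDensity.
rng = np.random.default_rng(42)
group_a = rng.normal(20, 5, 500).reshape(-1, 1)
group_b = rng.normal(23, 5, 500).reshape(-1, 1)

# Deterioration index for each group over an assumed measurement range [0, 60],
# treating values above an assumed threshold of 25 as severe.
di_a = deterioration_index(group_a, 0, 60, 25, plot_title="Group A", do_plot=False)
di_b = deterioration_index(group_b, 0, 60, 25, plot_title="Group B", do_plot=False)

print(di_a)                 # {"overall-prob": ..., "one-step": ..., "k-step": ..., "|X|": 500}
print(db_ineq(di_b, di_a))  # relative inequality between the two groups' k-step indices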