[WIP] Add support for uncertainty in ratios #61

Open · wants to merge 23 commits into master
52 changes: 44 additions & 8 deletions carl/distributions/histogram.py
@@ -6,9 +6,9 @@

from astropy.stats import bayesian_blocks
from itertools import product
from scipy.interpolate import interp1d
from sklearn.utils import check_random_state
from sklearn.utils import check_array
from scipy.interpolate import interp1d

from .base import DistributionMixin

Expand Down Expand Up @@ -41,7 +41,7 @@ def __init__(self, bins=10, range=None, interpolation=None,
self.interpolation = interpolation
self.variable_width = variable_width

def pdf(self, X, **kwargs):
def pdf(self, X, return_std=False, **kwargs):
X = check_array(X)

if self.ndim_ == 1 and self.interpolation:
@@ -58,10 +58,27 @@ def pdf(self, X, **kwargs):
indices[X[:, j] == self.edges_[j][-2]] -= 1
all_indices.append(indices)

return self.histogram_[all_indices]
p = self.histogram_[all_indices]

if return_std:
return p, self.errors_[all_indices]
else:
return p

def nll(self, X, return_std=False, **kwargs):
if not return_std:
p = self.pdf(X, **kwargs)
else:
p, std = self.pdf(X, return_std=True, **kwargs)

if not return_std:
return -np.log(p)
else:
s = std / p
s[~np.isfinite(s)] = 0

return -np.log(p), s

def nll(self, X, **kwargs):
return -np.log(self.pdf(X, **kwargs))

def rvs(self, n_samples, random_state=None, **kwargs):
rng = check_random_state(random_state)
@@ -88,15 +105,21 @@ def rvs(self, n_samples, random_state=None, **kwargs):
def fit(self, X, sample_weight=None, **kwargs):
# Checks
X = check_array(X)

if sample_weight is not None and len(sample_weight) != len(X):
raise ValueError
if (self.bins == "blocks" or
self.variable_width) and (X.shape[1] != 1):
raise ValueError

# Compute histogram and edges
if self.bins == "blocks":
bins = bayesian_blocks(X.ravel(), fitness="events", p0=0.0001)
range_ = self.range[0] if self.range else None
h, e = np.histogram(X.ravel(), bins=bins, range=range_,
weights=sample_weight, normed=False)
counts = h
volumes = e[1:] - e[:-1]
e = [e]

elif self.variable_width:
@@ -107,19 +130,30 @@ def fit(self, X, sample_weight=None, **kwargs):
h, e = np.histogram(X.ravel(), bins=ticks, range=range_,
normed=False, weights=sample_weight)
h, e = h.astype(float), e.astype(float)
widths = e[1:] - e[:-1]
h = h / widths / h.sum()
counts = h
volumes = e[1:] - e[:-1]
e = [e]

else:
bins = self.bins
h, e = np.histogramdd(X, bins=bins, range=self.range,
weights=sample_weight, normed=True)
weights=sample_weight, normed=False)
counts = h
volumes = np.ones_like(h)
for e_i in np.meshgrid(*[e_i[1:] - e_i[:-1] for e_i in e],
indexing="ij"):
volumes *= e_i

# Histogram and bin uncertainties
h = counts / counts.sum() / volumes
errors = np.sqrt(counts) / counts.sum() / volumes # Poisson errors

# Add empty bins for out of bound samples
for j in range(X.shape[1]):
h = np.insert(h, 0, 0., axis=j)
h = np.insert(h, h.shape[j], 0., axis=j)
errors = np.insert(errors, 0, 0., axis=j)
errors = np.insert(errors, errors.shape[j], 0., axis=j)
e[j] = np.insert(e[j], 0, -np.inf)
e[j] = np.insert(e[j], len(e[j]), np.inf)

@@ -135,6 +169,8 @@ def fit(self, X, sample_weight=None, **kwargs):

self.histogram_ = h
self.edges_ = e
self.counts_ = counts
self.errors_ = errors
self.ndim_ = X.shape[1]
return self
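A standalone sanity check (not part of the diff) of the error bookkeeping added in fit above: the density is counts / counts.sum() / volumes and its Poisson error is sqrt(counts) / counts.sum() / volumes, so the relative uncertainty per bin is 1 / sqrt(counts), which is also what nll(..., return_std=True) returns since sigma(-log p) = sigma(p) / p to first order. The toy data and bin count below are illustrative only.

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=10000)

# Mirror the normalization added in Histogram.fit for a plain 1D histogram
counts, edges = np.histogram(X, bins=20)
volumes = edges[1:] - edges[:-1]
h = counts / counts.sum() / volumes                 # normalized density
errors = np.sqrt(counts) / counts.sum() / volumes   # Poisson errors

# Relative error per bin is 1 / sqrt(counts); this is the quantity that
# nll(..., return_std=True) propagates to -log(p)
rel = np.divide(errors, h, out=np.zeros_like(h), where=counts > 0)
assert np.allclose(rel[counts > 0], 1.0 / np.sqrt(counts[counts > 0]))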

13 changes: 10 additions & 3 deletions carl/learning/base.py
@@ -60,16 +60,23 @@ def predict(self, X):
self.classes_[1],
self.classes_[0])

def predict_proba(self, X):
def predict_proba(self, X, return_std=False):
X = check_array(X)

df = self.regressor_.predict(X)
if not return_std:
df = self.regressor_.predict(X)
else:
df, std = self.regressor_.predict(X, return_std=True)

df = np.clip(df, 0., 1.)
probas = np.zeros((len(X), 2))
probas[:, 0] = 1. - df
probas[:, 1] = df

return probas
if not return_std:
return probas
else:
return probas, std

def score(self, X, y):
return self.regressor_.score(X, y)
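A side note on the return_std passthrough above (not part of the diff): it relies on the wrapped regressor itself accepting predict(X, return_std=True), as e.g. scikit-learn's GaussianProcessRegressor and BayesianRidge do. A minimal illustration with scikit-learn directly; the data and the choice of regressor are illustrative and not taken from carl.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.RandomState(0)
X = rng.uniform(0, 1, size=(50, 1))
y = (X.ravel() > 0.5).astype(float)       # regress on 0/1 labels, roughly what the wrapper does

reg = GaussianProcessRegressor().fit(X, y)
df, std = reg.predict(X, return_std=True)  # per-sample prediction and its standard deviation

probas_signal = np.clip(df, 0.0, 1.0)      # same clipping as in predict_proba above
print(probas_signal[:3], std[:3])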
72 changes: 61 additions & 11 deletions carl/learning/calibration.py
@@ -33,7 +33,7 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
"""

def __init__(self, base_estimator, method="histogram", bins="auto",
interpolation=None, variable_width=False, cv=1):
range=None, interpolation=None, variable_width=False, cv=1):
"""Constructor.

Parameters
@@ -51,6 +51,10 @@ def __init__(self, base_estimator, method="histogram", bins="auto",
* `bins` [int, default="auto"]:
The number of bins, if `method` is `"histogram"`.

* `range` [(lower, upper), optional]:
The lower and upper bounds. If `None`, bounds are automatically
inferred from the data. Used only if `method` is `"histogram"`.

* `interpolation` [string, optional]
Specifies the kind of interpolation between bins as a string
(`"linear"`, `"nearest"`, `"zero"`, `"slinear"`, `"quadratic"`,
@@ -75,6 +79,7 @@ def __init__(self, base_estimator, method="histogram", bins="auto",
self.base_estimator = base_estimator
self.method = method
self.bins = bins
self.range = range
self.interpolation = interpolation
self.variable_width = variable_width
self.cv = cv
@@ -109,7 +114,8 @@ def fit(self, X, y, sample_weight=None):
# Calibrator
if self.method == "histogram":
base_calibrator = HistogramCalibrator(
bins=self.bins, interpolation=self.interpolation,
bins=self.bins, range=self.range,
interpolation=self.interpolation,
variable_width=self.variable_width)
elif self.method == "kde":
base_calibrator = KernelDensityCalibrator()
@@ -205,7 +211,7 @@ def predict(self, X):
self.classes_[1],
self.classes_[0])

def predict_proba(self, X):
def predict_proba(self, X, return_std=False):
"""Predict the posterior probabilities of classification for `X`.

Parameters
@@ -218,15 +224,31 @@ def predict_proba(self, X):
* `probas` [array, shape=(n_samples, n_classes)]:
The predicted probabilities.
"""
if return_std and self.method != "histogram":
raise ValueError

p = np.zeros((len(X), 2))
std = np.zeros(len(X))

for clf, calibrator in zip(self.classifiers_, self.calibrators_):
p[:, 1] += calibrator.predict(clf.predict_proba(X)[:, 1])
if not return_std:
p[:, 1] += calibrator.predict(clf.predict_proba(X)[:, 1])

else:
p_, std_ = calibrator.predict(clf.predict_proba(X)[:, 1],
return_std=True)
p[:, 1] += p_
std += std_ ** 2

p[:, 1] /= len(self.classifiers_)
p[:, 0] = 1. - p[:, 1]
std = (1. / len(self.classifiers_) ** 2 * std) ** 0.5
# assume independence? ok for cv==2

return p
if not return_std:
return p
else:
return p, std
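A small check of the combination rule used above (not part of the diff): the calibrated probability is the mean over the cv folds, so under the independence assumption flagged in the comment the combined standard deviation is sqrt(sum_i std_i**2) / n_folds, which is what the expression computes. The numbers below are made up.

import numpy as np

fold_std = np.array([0.04, 0.05])   # illustrative per-fold uncertainties (cv == 2)
n = len(fold_std)

combined = np.sqrt((fold_std ** 2).sum()) / n
assert np.isclose(combined, (1.0 / n ** 2 * (fold_std ** 2).sum()) ** 0.5)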

def _clone(self):
estimator = clone(self, original=True)
@@ -311,6 +333,7 @@ def fit(self, T, y, sample_weight=None):
t_min = max(0, min(np.min(t0), np.min(t1)) - self.eps)
t_max = min(1, max(np.max(t0), np.max(t1)) + self.eps)
range = [(t_min, t_max)]

# Fit
self.calibrator0 = Histogram(bins=bins, range=range,
interpolation=self.interpolation,
@@ -324,27 +347,54 @@

return self

def predict(self, T):
def predict(self, T, return_std=False):
"""Calibrate data.

Parameters
----------
* `T` [array-like, shape=(n_samples,)]:
Data to calibrate.

* `return_std` [boolean]:
If True, return the error associated with
`p(T|y=1)/(p(T|y=0)+p(T|y=1))`.

Returns
-------
* `Tt` [array, shape=(n_samples,)]:
Calibrated data.
"""
T = column_or_1d(T).reshape(-1, 1)
num = self.calibrator1.pdf(T)
den = self.calibrator0.pdf(T) + self.calibrator1.pdf(T)

p = num / den
p[den == 0] = 0.5
if not return_std:
num = self.calibrator1.pdf(T)
den = self.calibrator0.pdf(T) + self.calibrator1.pdf(T)

return p
p = num / den
p[den == 0] = 0.5

return p

else:
p1, std1 = self.calibrator1.pdf(T, return_std=True)
p0, std0 = self.calibrator0.pdf(T, return_std=True)

num = p1
den = p0 + p1
p = num / den
p[den == 0] = 0.5

std_num = std1
std_den = (std0 ** 2 + std1 ** 2) ** 0.5
std_p = (p ** 2 * ((std_num / num) ** 2 +
(std_den / den) ** 2 -
2 * std_num ** 2 / (num * den))) ** 0.5
# nb: cov(a, a+b) = var(a) when a and b are independent

std_p[den == 0] = 0
std_p[~np.isfinite(std_p)] = 0

return p, std_p
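A standalone check of the propagation above (not part of the diff): num = p1 and den = p0 + p1 are correlated, and with independent p0 and p1 one has cov(num, den) = var(num), which is the -2 * std_num**2 / (num * den) term. Expanding the whole expression gives the direct gradient form sqrt(p0**2 * std1**2 + p1**2 * std0**2) / (p0 + p1)**2. The per-bin values below are made up.

import numpy as np

p1, p0 = 0.30, 0.70          # illustrative densities from calibrator1 and calibrator0
std1, std0 = 0.05, 0.04      # their Poisson errors

num, den = p1, p0 + p1
std_num, std_den = std1, np.hypot(std0, std1)

# expression used in predict(..., return_std=True) above
p = num / den
std_p = np.sqrt(p ** 2 * ((std_num / num) ** 2 +
                          (std_den / den) ** 2 -
                          2 * std_num ** 2 / (num * den)))

# direct first-order propagation of p = p1 / (p0 + p1)
std_direct = np.sqrt(p0 ** 2 * std1 ** 2 + p1 ** 2 * std0 ** 2) / (p0 + p1) ** 2

assert np.isclose(std_p, std_direct)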


class KernelDensityCalibrator(BaseEstimator, RegressorMixin):
15 changes: 11 additions & 4 deletions carl/ratios/base.py
@@ -55,7 +55,7 @@ def fit(self, X=None, y=None, numerator=None,
"""
return self

def predict(self, X, log=False, **kwargs):
def predict(self, X, return_std=False, log=False, **kwargs):
"""Predict the density ratio `r(x_i)` for all `x_i` in `X`.

Parameters
@@ -73,7 +73,7 @@ def predict(self, X, log=False, **kwargs):
"""
raise NotImplementedError

def nllr(self, X, **kwargs):
def nllr(self, X, return_std=False, **kwargs):
"""Negative log-likelihood ratio.

This method is a shortcut for `-ratio.predict(X, log=True).sum()`.
@@ -88,13 +88,20 @@ def nllr(self, X, **kwargs):
* `nllr` [float]:
The negative log-likelihood ratio.
"""
ratios = self.predict(X, log=True, **kwargs)
if not return_std:
ratios = self.predict(X, log=True, **kwargs)
else:
ratios, std = self.predict(X, log=True, return_std=True, **kwargs)

mask = np.isfinite(ratios)

if mask.sum() < len(ratios):
warnings.warn("r(X) contains non-finite values.")

return -np.sum(ratios[mask])
if not return_std:
return -np.sum(ratios[mask])
else:
return -np.sum(ratios[mask]), (std[mask] ** 2).sum() ** 0.5

def score(self, X, y, finite_only=True, **kwargs):
"""Negative MSE between predicted and known ratios.
45 changes: 37 additions & 8 deletions carl/ratios/classifier.py
@@ -132,7 +132,7 @@ def fit(self, X=None, y=None, sample_weight=None,

return self

def predict(self, X, log=False, **kwargs):
def predict(self, X, return_std=False, log=False, **kwargs):
"""Predict the density ratio `r(x_i)` for all `x_i` in `X`.

Parameters
@@ -149,15 +149,44 @@ def predict(self, X, log=False, **kwargs):
The predicted ratio `r(X)`.
"""
if self.identity_:
if log:
return np.zeros(len(X))
if not return_std:
if log:
return np.zeros(len(X))
else:
return np.ones(len(X))
else:
return np.ones(len(X))
if log:
return np.zeros(len(X)), np.zeros(len(X))
else:
return np.ones(len(X)), np.zeros(len(X))

else:
p = self.classifier_.predict_proba(X)
if not return_std:
p = self.classifier_.predict_proba(X)

if log:
return np.log(p[:, 0]) - np.log(p[:, 1])
else:
return np.divide(p[:, 0], p[:, 1])

if log:
return np.log(p[:, 0]) - np.log(p[:, 1])
else:
return np.divide(p[:, 0], p[:, 1])
p, std_p = self.classifier_.predict_proba(X, return_std=True)

r = np.divide(p[:, 0], p[:, 1])
std_r = (r ** 2 * ((std_p / p[:, 0]) ** 2 +
(std_p / p[:, 1]) ** 2 -
2 * (-std_p ** 2) / (p[:, 0] *
p[:, 1]))) ** 0.5
# nb: cov(p, 1-p) = -var(p) = -std^2(p)
Review thread on the line above:

Contributor (PR author): Instead, derive std_r as dr/dp * std_p

Member: I just tried it both ways and got the same answer. Maybe good to double check.

Member: not sure if there are numerical issues. pretty sure this simplifies to
std_r = std_p / ( p[:,1]**2)

#std_r = std_p / (p[:, 1] ** 2)

std_r[~np.isfinite(std_r)] = 0

if not log:
return r, std_r

else:
std_r = std_r / r
std_r[~np.isfinite(std_r)] = 0

return np.log(p[:, 0]) - np.log(p[:, 1]), std_r
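For what it is worth, the simplification suggested in the review thread above does check out algebraically (this note is not part of the diff): with p0 + p1 = 1 and cov(p0, p1) = -var(p), the bracket collapses to std_p**2 * (1 / p0 + 1 / p1)**2, so std_r = r * std_p * (p0 + p1) / (p0 * p1) = std_p / p1**2. A quick numeric confirmation with made-up values:

import numpy as np

p0, p1 = 0.42, 0.58          # illustrative calibrated probabilities (p0 + p1 = 1)
std_p = 0.03                 # illustrative shared uncertainty of both columns

r = p0 / p1
# expression in the patch, with cov(p0, p1) = -var(p)
std_r = np.sqrt(r ** 2 * ((std_p / p0) ** 2 +
                          (std_p / p1) ** 2 -
                          2 * (-std_p ** 2) / (p0 * p1)))

assert np.isclose(std_r, std_p / p1 ** 2)    # the simplification from the review thread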