Skip to content

Commit

Permalink
fix: Reduced warnings in distance calculations
Browse files Browse the repository at this point in the history
- added np errstate context
- added clips for non negative sqrt
- added clips for positive log
  • Loading branch information
sidchaini committed Oct 21, 2024
1 parent 9ca010c commit 5a3fe07
Show file tree
Hide file tree
Showing 4 changed files with 1,579 additions and 35 deletions.
2 changes: 1 addition & 1 deletion distclassipy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
from .classifier import DistanceMetricClassifier # noqa
from .distances import Distance # noqa

__version__ = "0.2.0a0"
__version__ = "0.2.0a1"
98 changes: 64 additions & 34 deletions distclassipy/distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,11 @@ def hellinger(self, u, v):
1(4), 300-307.
"""
u, v = np.asarray(u), np.asarray(v)
return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
# Clip negative values to zero for valid sqrt
with np.errstate(divide="ignore", invalid="ignore"):
u = np.clip(u, a_min=0, a_max=None)
v = np.clip(v, a_min=0, a_max=None)
return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))

def jaccard(self, u, v):
"""Calculate the Jaccard distance between two vectors.
Expand Down Expand Up @@ -448,7 +452,8 @@ def lorentzian(self, u, v):
eschew the log of zero.
"""
u, v = np.asarray(u), np.asarray(v)
return np.sum(np.log(np.abs(u - v) + 1))
with np.errstate(divide="ignore", invalid="ignore"):
return np.sum(np.log(np.abs(u - v) + 1))

def marylandbridge(self, u, v):
"""Calculate the Maryland Bridge distance between two vectors.
Expand Down Expand Up @@ -679,7 +684,8 @@ def acc(self, u, v):
# 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
# """
# u, v = np.asarray(u), np.asarray(v)
# return -np.log(np.sum(np.sqrt(u * v)))
# with np.errstate(divide="ignore", invalid="ignore"):
# return -np.log(np.sum(np.sqrt(u * v)))

def chebyshev_min(self, u, v):
"""Calculate the minimum value distance between two vectors.
Expand Down Expand Up @@ -854,9 +860,12 @@ def jeffreys(self, u, v):
# vectors could be ignored or masked (see below).
# u = ma.masked_where(u == 0, u)
# v = ma.masked_where(v == 0, u)
u = np.where(u == 0, self.epsilon, u)
v = np.where(v == 0, self.epsilon, v)
return np.sum((u - v) * np.log(u / v))
with np.errstate(divide="ignore", invalid="ignore"):
u[u == 0] = self.epsilon
v[v == 0] = self.epsilon
# Clip the ratio to at least epsilon so the log argument stays positive
udivv = np.clip(u / v, a_min=self.epsilon, a_max=None)
return np.sum((u - v) * np.log(udivv))

def jensenshannon_divergence(self, u, v):
"""Calculate the Jensen-Shannon divergence between two vectors.
Expand Down Expand Up @@ -890,11 +899,17 @@ def jensenshannon_divergence(self, u, v):
return np.sum(el1 - el2 * el3)
"""
u, v = np.asarray(u), np.asarray(v)
u = np.where(u == 0, self.epsilon, u)
v = np.where(v == 0, self.epsilon, v)
dl = u * np.log(2 * u / (u + v))
dr = v * np.log(2 * v / (u + v))
return (np.sum(dl) + np.sum(dr)) / 2
with np.errstate(divide="ignore", invalid="ignore"):
# Replace exact zeros with epsilon, then clip the log arguments to epsilon
u[u == 0] = self.epsilon
v[v == 0] = self.epsilon

term1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
term2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)

dl = u * np.log(term1)
dr = v * np.log(term2)
return (np.sum(dl) + np.sum(dr)) / 2

def jensen_difference(self, u, v):
"""Calculate the Jensen difference between two vectors.
Expand Down Expand Up @@ -923,11 +938,14 @@ def jensen_difference(self, u, v):
1(4), 300-307.
"""
u, v = np.asarray(u), np.asarray(v)
u = np.where(u == 0, self.epsilon, u)
v = np.where(v == 0, self.epsilon, v)
el1 = (u * np.log(u) + v * np.log(v)) / 2
el2 = (u + v) / 2
return np.sum(el1 - el2 * np.log(el2))

with np.errstate(divide="ignore", invalid="ignore"):
# Clip negative values to eps for valid log
u = np.clip(u, self.epsilon, None)
v = np.clip(v, self.epsilon, None)
el1 = (u * np.log(u) + v * np.log(v)) / 2
el2 = np.clip((u + v) / 2, a_min=self.epsilon, a_max=None)
return np.sum(el1 - el2 * np.log(el2))

def kumarjohnson(self, u, v):
"""Calculate the Kumar-Johnson distance between two vectors.
Expand Down Expand Up @@ -980,7 +998,8 @@ def matusita(self, u, v):
Equals square root of Squared-chord distance.
"""
u, v = np.asarray(u), np.asarray(v)
return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
with np.errstate(divide="ignore", invalid="ignore"):
return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))

def minkowski(self, u, v, p=2):
"""Calculate the Minkowski distance between two vectors.
Expand Down Expand Up @@ -1027,7 +1046,8 @@ def penroseshape(self, u, v):
u, v = np.asarray(u), np.asarray(v)
umu = np.mean(u)
vmu = np.mean(v)
return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))
with np.errstate(divide="ignore", invalid="ignore"):
return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))

def prob_chisq(self, u, v):
"""Calculate the Probabilistic chi-square distance between two vectors.
Expand Down Expand Up @@ -1139,7 +1159,8 @@ def squaredchord(self, u, v):
Equals to squared Matusita distance.
"""
u, v = np.asarray(u), np.asarray(v)
return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)
with np.errstate(divide="ignore", invalid="ignore"):
return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)

def squared_euclidean(self, u, v):
"""Calculate the Squared Euclidean distance between two vectors.
Expand Down Expand Up @@ -1191,10 +1212,14 @@ def taneja(self, u, v):
1(4), 300-307.
"""
u, v = np.asarray(u), np.asarray(v)
u = np.where(u == 0, self.epsilon, u)
v = np.where(v == 0, self.epsilon, v)
uvsum = u + v
return np.sum((uvsum / 2) * np.log(uvsum / (2 * np.sqrt(u * v))))
with np.errstate(divide="ignore", invalid="ignore"):
u[u == 0] = self.epsilon
v[v == 0] = self.epsilon
uvsum = u + v
logarg = np.clip(
uvsum / (2 * np.sqrt(u * v)), a_min=self.epsilon, a_max=None
)
return np.sum((uvsum / 2) * np.log(logarg))

def tanimoto(self, u, v):
"""Calculate the Tanimoto distance between two vectors.
Expand Down Expand Up @@ -1248,11 +1273,14 @@ def topsoe(self, u, v):
Equals two times Jensen-Shannon divergence.
"""
u, v = np.asarray(u), np.asarray(v)
u = np.where(u == 0, self.epsilon, u)
v = np.where(v == 0, self.epsilon, v)
dl = u * np.log(2 * u / (u + v))
dr = v * np.log(2 * v / (u + v))
return np.sum(dl + dr)
with np.errstate(divide="ignore", invalid="ignore"):
u[u == 0] = self.epsilon
v[v == 0] = self.epsilon
logarg1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
logarg2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
dl = u * np.log(logarg1)
dr = v * np.log(logarg2)
return np.sum(dl + dr)

def vicis_symmetric_chisq(self, u, v):
"""Calculate the Vicis Symmetric chi-square distance.
Expand Down Expand Up @@ -1376,9 +1404,10 @@ def vicis_wave_hedges(self, u, v):
# 1(4), 300-307.
# """
# u, v = np.asarray(u), np.asarray(v)
# u = np.where(u == 0, self.epsilon, u)
# v = np.where(v == 0, self.epsilon, v)
# return np.sum(u * np.log(2 * u / (u + v)))
# u[u == 0] = self.epsilon
# v[v == 0] = self.epsilon
# with np.errstate(divide="ignore", invalid="ignore"):
# return np.sum(u * np.log(2 * u / (u + v)))

# def kl_divergence(self, u, v):
# """Calculate the Kullback-Leibler divergence between two vectors.
Expand All @@ -1404,9 +1433,10 @@ def vicis_wave_hedges(self, u, v):
# 1(4):300-307.
# """
# u, v = np.asarray(u), np.asarray(v)
# u = np.where(u == 0, self.epsilon, u)
# v = np.where(v == 0, self.epsilon, v)
# return np.sum(u * np.log(u / v))
# u[u == 0] = self.epsilon
# v[v == 0] = self.epsilon
# with np.errstate(divide="ignore", invalid="ignore"):
# return np.sum(u * np.log(u / v))

# def max_symmetric_chisq(self, u, v):
# """Calculate the maximum symmetric chi-square distance.
Expand Down
47 changes: 47 additions & 0 deletions distclassipy/metric_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import numpy as np
import pandas as pd
from .distances import Distance


class MetricEvaluator:
    """Score distance metrics on a dataset, optionally per feature quantile.

    The score of a metric is currently a placeholder: the mean pairwise
    distance between the samples (see ``_evaluate_metric``). The metric with
    the largest score is reported as the "best" one.

    Parameters
    ----------
    metrics : sequence of str, optional
        Names of methods on the distance calculator to evaluate. When omitted
        (or empty) a default list of five common metrics is used.
    """

    def __init__(self, metrics=None):
        # ``or`` means both ``None`` and an empty sequence select the defaults.
        self.metrics = metrics or [
            "euclidean",
            "manhattan",
            "canberra",
            "chebyshev",
            "cosine",
        ]
        self.distance_calculator = Distance()

    def evaluate_across_quantiles(self, X, y, quantiles=4, feat_idx=0):
        """Find the best-scoring metric within each quantile of one feature.

        Samples are ordered by a single feature and split into ``quantiles``
        nearly equal groups; every metric is scored on each group.

        Parameters
        ----------
        X : array-like, 1-D or 2-D
            Samples. For 2-D input, rows are samples and columns are features.
        y : array-like
            Values aligned with the first axis of ``X``.
        quantiles : int, default 4
            Number of groups to split the ordered samples into.
        feat_idx : int, default 0
            Column of a 2-D ``X`` used to order the samples. Ignored when
            ``X`` is 1-D.

        Returns
        -------
        dict
            Maps the quantile index (0 = lowest feature values) to the name of
            the best-scoring metric, or ``None`` when no metric produced a
            valid (non-NaN) score for that quantile, e.g. because the group
            holds fewer than two samples.

        Raises
        ------
        ValueError
            If ``X`` is not 1-D or 2-D.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Sorting the whole 2-D matrix along axis 0 would give one ordering
        # per column; a single 1-D key yields one ordering of the samples.
        if X.ndim == 1:
            sort_key = X
        elif X.ndim == 2:
            sort_key = X[:, feat_idx]
        else:
            raise ValueError(
                f"X must be 1-D or 2-D, got an array with {X.ndim} dimensions"
            )

        # A stable sort keeps tied samples in input order, so the split is
        # deterministic.
        order = np.argsort(sort_key, kind="stable")

        best_metrics = {}
        for q, indices in enumerate(np.array_split(order, quantiles)):
            results = self.evaluate(X[indices], y[indices])
            best_metrics[q] = self._best_metric(results)
        return best_metrics

    def evaluate(self, X, y):
        """Score every configured metric on ``(X, y)``.

        Returns
        -------
        dict
            Maps each metric name in ``self.metrics`` to its score.
        """
        return {
            metric: self._evaluate_metric(X, y, metric) for metric in self.metrics
        }

    @staticmethod
    def _best_metric(results):
        """Return the metric with the highest non-NaN score, or ``None``."""
        # NaN compares false against everything, which would make ``max``
        # depend on dictionary order; drop NaN scores before comparing.
        valid = {
            metric: score for metric, score in results.items() if not np.isnan(score)
        }
        if not valid:
            return None
        return max(valid, key=valid.get)

    def _evaluate_metric(self, X, y, metric):
        """Return the mean pairwise distance between the samples in ``X``.

        This is a placeholder score for demonstration purposes; ``y`` is
        accepted for interface symmetry but not used.

        Returns NaN when ``X`` holds fewer than two samples, since there are
        no pairs to average.
        """
        distance = getattr(self.distance_calculator, metric)
        distances = [
            distance(u, v)
            for i, u in enumerate(X[:-1])
            for v in X[i + 1 :]
        ]
        if not distances:
            # Avoid numpy's "Mean of empty slice" RuntimeWarning.
            return float("nan")
        return np.mean(distances)
Loading

0 comments on commit 5a3fe07

Please sign in to comment.