fix: Reduced warnings in distance calculations

- Added `np.errstate` contexts around the distance computations.
- Added clips so that `sqrt` arguments are non-negative.
- Added clips so that `log` arguments are positive.
sidchaini · Oct 21, 2024 · 5a3fe07 · 5a3fe07
1 parent 9ca010c
commit 5a3fe07
Show file tree

Hide file tree

Showing 4 changed files with 1,579 additions and 35 deletions.
diff --git a/distclassipy/__init__.py b/distclassipy/__init__.py
@@ -25,4 +25,4 @@
 from .classifier import DistanceMetricClassifier  # noqa
 from .distances import Distance  # noqa
 
-__version__ = "0.2.0a0"
+__version__ = "0.2.0a1"
diff --git a/distclassipy/distances.py b/distclassipy/distances.py
@@ -398,7 +398,11 @@ def hellinger(self, u, v):
               1(4), 300-307.
         """
         u, v = np.asarray(u), np.asarray(v)
-        return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
+        # Clip negative values to zero for valid sqrt
+        with np.errstate(divide="ignore", invalid="ignore"):
+            u = np.clip(u, a_min=0, a_max=None)
+            v = np.clip(v, a_min=0, a_max=None)
+            return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
 
     def jaccard(self, u, v):
         """Calculate the Jaccard distance between two vectors.
@@ -448,7 +452,8 @@ def lorentzian(self, u, v):
             eschew the log of zero.
         """
         u, v = np.asarray(u), np.asarray(v)
-        return np.sum(np.log(np.abs(u - v) + 1))
+        with np.errstate(divide="ignore", invalid="ignore"):
+            return np.sum(np.log(np.abs(u - v) + 1))
 
     def marylandbridge(self, u, v):
         """Calculate the Maryland Bridge distance between two vectors.
@@ -679,7 +684,8 @@ def acc(self, u, v):
     #         3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
     #     """
     #     u, v = np.asarray(u), np.asarray(v)
-    #     return -np.log(np.sum(np.sqrt(u * v)))
+    #     with np.errstate(divide="ignore", invalid="ignore"):
+    #         return -np.log(np.sum(np.sqrt(u * v)))
 
     def chebyshev_min(self, u, v):
         """Calculate the minimum value distance between two vectors.
@@ -854,9 +860,12 @@ def jeffreys(self, u, v):
         # vectors could be ignored or masked (see below).
         # u = ma.masked_where(u == 0, u)
         # v = ma.masked_where(v == 0, u)
-        u = np.where(u == 0, self.epsilon, u)
-        v = np.where(v == 0, self.epsilon, v)
-        return np.sum((u - v) * np.log(u / v))
+        with np.errstate(divide="ignore", invalid="ignore"):
+            u[u == 0] = self.epsilon
+            v[v == 0] = self.epsilon
+            # Floor u / v at epsilon so the log argument stays positive
+            udivv = np.clip(u / v, a_min=self.epsilon, a_max=None)
+            return np.sum((u - v) * np.log(udivv))
 
     def jensenshannon_divergence(self, u, v):
         """Calculate the Jensen-Shannon divergence between two vectors.
@@ -890,11 +899,17 @@ def jensenshannon_divergence(self, u, v):
             return np.sum(el1 - el2 * el3)
         """
         u, v = np.asarray(u), np.asarray(v)
-        u = np.where(u == 0, self.epsilon, u)
-        v = np.where(v == 0, self.epsilon, v)
-        dl = u * np.log(2 * u / (u + v))
-        dr = v * np.log(2 * v / (u + v))
-        return (np.sum(dl) + np.sum(dr)) / 2
+        with np.errstate(divide="ignore", invalid="ignore"):
+            # Replace zeros with epsilon; log arguments are floored at epsilon below
+            u[u == 0] = self.epsilon
+            v[v == 0] = self.epsilon
+
+            term1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
+            term2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
+
+            dl = u * np.log(term1)
+            dr = v * np.log(term2)
+            return (np.sum(dl) + np.sum(dr)) / 2
 
     def jensen_difference(self, u, v):
         """Calculate the Jensen difference between two vectors.
@@ -923,11 +938,14 @@ def jensen_difference(self, u, v):
                1(4), 300-307.
         """
         u, v = np.asarray(u), np.asarray(v)
-        u = np.where(u == 0, self.epsilon, u)
-        v = np.where(v == 0, self.epsilon, v)
-        el1 = (u * np.log(u) + v * np.log(v)) / 2
-        el2 = (u + v) / 2
-        return np.sum(el1 - el2 * np.log(el2))
+
+        with np.errstate(divide="ignore", invalid="ignore"):
+            # Floor values at eps (zeros and negatives included) for valid log
+            u = np.clip(u, self.epsilon, None)
+            v = np.clip(v, self.epsilon, None)
+            el1 = (u * np.log(u) + v * np.log(v)) / 2
+            el2 = np.clip((u + v) / 2, a_min=self.epsilon, a_max=None)
+            return np.sum(el1 - el2 * np.log(el2))
 
     def kumarjohnson(self, u, v):
         """Calculate the Kumar-Johnson distance between two vectors.
@@ -980,7 +998,8 @@ def matusita(self, u, v):
             Equals square root of Squared-chord distance.
         """
         u, v = np.asarray(u), np.asarray(v)
-        return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
+        with np.errstate(divide="ignore", invalid="ignore"):
+            return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
 
     def minkowski(self, u, v, p=2):
         """Calculate the Minkowski distance between two vectors.
@@ -1027,7 +1046,8 @@ def penroseshape(self, u, v):
         u, v = np.asarray(u), np.asarray(v)
         umu = np.mean(u)
         vmu = np.mean(v)
-        return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))
+        with np.errstate(divide="ignore", invalid="ignore"):
+            return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))
 
     def prob_chisq(self, u, v):
         """Calculate the Probabilistic chi-square distance between two vectors.
@@ -1139,7 +1159,8 @@ def squaredchord(self, u, v):
             Equals to squared Matusita distance.
         """
         u, v = np.asarray(u), np.asarray(v)
-        return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)
+        with np.errstate(divide="ignore", invalid="ignore"):
+            return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)
 
     def squared_euclidean(self, u, v):
         """Calculate the Squared Euclidean distance between two vectors.
@@ -1191,10 +1212,14 @@ def taneja(self, u, v):
                1(4), 300-307.
         """
         u, v = np.asarray(u), np.asarray(v)
-        u = np.where(u == 0, self.epsilon, u)
-        v = np.where(v == 0, self.epsilon, v)
-        uvsum = u + v
-        return np.sum((uvsum / 2) * np.log(uvsum / (2 * np.sqrt(u * v))))
+        with np.errstate(divide="ignore", invalid="ignore"):
+            u[u == 0] = self.epsilon
+            v[v == 0] = self.epsilon
+            uvsum = u + v
+            logarg = np.clip(
+                uvsum / (2 * np.sqrt(u * v)), a_min=self.epsilon, a_max=None
+            )
+            return np.sum((uvsum / 2) * np.log(logarg))
 
     def tanimoto(self, u, v):
         """Calculate the Tanimoto distance between two vectors.
@@ -1248,11 +1273,14 @@ def topsoe(self, u, v):
             Equals two times Jensen-Shannon divergence.
         """
         u, v = np.asarray(u), np.asarray(v)
-        u = np.where(u == 0, self.epsilon, u)
-        v = np.where(v == 0, self.epsilon, v)
-        dl = u * np.log(2 * u / (u + v))
-        dr = v * np.log(2 * v / (u + v))
-        return np.sum(dl + dr)
+        with np.errstate(divide="ignore", invalid="ignore"):
+            u[u == 0] = self.epsilon
+            v[v == 0] = self.epsilon
+            logarg1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
+            logarg2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
+            dl = u * np.log(logarg1)
+            dr = v * np.log(logarg2)
+            return np.sum(dl + dr)
 
     def vicis_symmetric_chisq(self, u, v):
         """Calculate the Vicis Symmetric chi-square distance.
@@ -1376,9 +1404,10 @@ def vicis_wave_hedges(self, u, v):
     #            1(4), 300-307.
     #     """
     #     u, v = np.asarray(u), np.asarray(v)
-    #     u = np.where(u == 0, self.epsilon, u)
-    #     v = np.where(v == 0, self.epsilon, v)
-    #     return np.sum(u * np.log(2 * u / (u + v)))
+    #     u[u == 0] = self.epsilon
+    #     v[v == 0] = self.epsilon
+    #     with np.errstate(divide="ignore", invalid="ignore"):
+    #         return np.sum(u * np.log(2 * u / (u + v)))
 
     # def kl_divergence(self, u, v):
     #     """Calculate the Kullback-Leibler divergence between two vectors.
@@ -1404,9 +1433,10 @@ def vicis_wave_hedges(self, u, v):
     #            1(4):300-307.
     #     """
     #     u, v = np.asarray(u), np.asarray(v)
-    #     u = np.where(u == 0, self.epsilon, u)
-    #     v = np.where(v == 0, self.epsilon, v)
-    #     return np.sum(u * np.log(u / v))
+    #     u[u == 0] = self.epsilon
+    #     v[v == 0] = self.epsilon
+    #     with np.errstate(divide="ignore", invalid="ignore"):
+    #         return np.sum(u * np.log(u / v))
 
     # def max_symmetric_chisq(self, u, v):
     #     """Calculate the maximum symmetric chi-square distance.

diff --git a/distclassipy/metric_evaluation.py b/distclassipy/metric_evaluation.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pandas as pd
+from .distances import Distance
+
+
+class MetricEvaluator:
+    def __init__(self, metrics=None):
+        self.metrics = metrics or [
+            "euclidean",
+            "manhattan",
+            "canberra",
+            "chebyshev",
+            "cosine",
+        ]
+        self.distance_calculator = Distance()
+
+    def evaluate_across_quantiles(self, X, y, quantiles=4):
+        quantile_indices = np.array_split(np.argsort(X, axis=0), quantiles)
+        best_metrics = {}
+
+        for q, indices in enumerate(quantile_indices):
+            X_q, y_q = X[indices], y[indices]
+            results = self.evaluate(X_q, y_q)
+            best_metric = max(results, key=results.get)
+            best_metrics[q] = best_metric
+
+        return best_metrics
+
+    def evaluate(self, X, y):
+        results = {}
+        for metric in self.metrics:
+            # Example: Calculate some performance metric for each distance
+            # This could be accuracy, computation time, etc.
+            performance = self._evaluate_metric(X, y, metric)
+            results[metric] = performance
+        return results
+
+    def _evaluate_metric(self, X, y, metric):
+        # Placeholder scoring for demonstration purposes: collect the distance
+        # between every pair of samples X[i], X[j] (i < j); y is not used here
+        distances = []
+        for i in range(len(X)):
+            for j in range(i + 1, len(X)):
+                dist = getattr(self.distance_calculator, metric)(X[i], X[j])
+                distances.append(dist)
+        # Return some evaluation metric, e.g., mean distance
+        return np.mean(distances)