From 80534f23c1862f72ee5ebea21bec7da64a3ccf91 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 29 Sep 2017 21:17:06 -0400 Subject: [PATCH] Support more metrics in cdist and pdist Adds support for the `mahalanobis`, `seuclidean`, and `wminkowski` metrics in `cdist` and `pdist`. Includes supporting default arguments for these different metrics in `cdist` and `pdist` as needed. Provides tests against the SciPy equivalents for all of these functions and metrics with and without default arguments filled in. --- dask_distance/__init__.py | 28 +++++++++++++++++++++++++-- tests/test_dask_distance.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/dask_distance/__init__.py b/dask_distance/__init__.py index 878e698..d6f4123 100644 --- a/dask_distance/__init__.py +++ b/dask_distance/__init__.py @@ -53,12 +53,15 @@ def cdist(XA, XB, metric="euclidean", **kwargs): "hamming": hamming, "jaccard": jaccard, "kulsinski": kulsinski, + "mahalanobis": mahalanobis, "minkowski": minkowski, "rogerstanimoto": rogerstanimoto, "russellrao": russellrao, "sokalmichener": sokalmichener, "sokalsneath": sokalsneath, + "seuclidean": seuclidean, "sqeuclidean": sqeuclidean, + "wminkowski": wminkowski, "yule": yule, } @@ -93,8 +96,22 @@ def cdist(XA, XB, metric="euclidean", **kwargs): metric = func_mappings[metric] - if metric == minkowski: - kwargs["p"] = kwargs.get("p", 2) + if metric == mahalanobis: + if "VI" not in kwargs: + kwargs["VI"] = ( + dask.array.linalg.inv( + dask.array.cov(dask.array.vstack([XA, XB]).T) + ).T + ) + elif metric == minkowski: + kwargs.setdefault("p", 2) + elif metric == seuclidean: + if "V" not in kwargs: + kwargs["V"] = ( + dask.array.var(dask.array.vstack([XA, XB]), axis=0, ddof=1) + ) + elif metric == wminkowski: + kwargs.setdefault("p", 2) result = metric(XA, XB, **kwargs) @@ -124,6 +141,13 @@ def pdist(X, metric="euclidean", **kwargs): other tradeoffs. """ + if metric == "mahalanobis": + if "VI" not in kwargs: + kwargs["VI"] = dask.array.linalg.inv(dask.array.cov(X.T)).T + elif metric == "seuclidean": + if "V" not in kwargs: + kwargs["V"] = dask.array.var(X, axis=0, ddof=1) + result = cdist(X, X, metric, **kwargs) result = dask.array.triu(result, 1) diff --git a/tests/test_dask_distance.py b/tests/test_dask_distance.py index 7fbea64..bf81449 100644 --- a/tests/test_dask_distance.py +++ b/tests/test_dask_distance.py @@ -106,9 +106,15 @@ def test_1d_dist(funcname, kw, seed, size, chunks): ("correlation", {}), ("cosine", {}), ("euclidean", {}), + ("mahalanobis", {"VI": None}), + ("mahalanobis", {}), ("minkowski", {}), ("minkowski", {"p": 3}), + ("seuclidean", {"V": None}), + ("seuclidean", {}), ("sqeuclidean", {}), + ("wminkowski", {}), + ("wminkowski", {"p": 1.6}), (lambda u, v: (abs(u - v) ** 3).sum() ** (1.0 / 3.0), {}), ] ) @@ -133,6 +139,19 @@ def test_2d_cdist(metric, kw, seed, u_shape, u_chunks, v_shape, v_chunks): d_u = da.from_array(a_u, chunks=u_chunks) d_v = da.from_array(a_v, chunks=v_chunks) + if metric == "mahalanobis": + if "VI" not in kw: + kw["VI"] = 2 * np.random.random(2 * u_shape[-1:]) - 1 + elif kw["VI"] is None: + kw.pop("VI") + elif metric == "seuclidean": + if "V" not in kw: + kw["V"] = 2 * np.random.random(u_shape[-1:]) - 1 + elif kw["V"] is None: + kw.pop("V") + elif metric == "wminkowski": + kw["w"] = np.random.random(u_shape[-1:]) + a_r = spdist.cdist(a_u, a_v, metric, **kw) d_r = dask_distance.cdist(d_u, d_v, metric, **kw) @@ -148,9 +167,15 @@ def test_2d_cdist(metric, kw, seed, u_shape, u_chunks, v_shape, v_chunks): ("correlation", {}), ("cosine", {}), ("euclidean", {}), + ("mahalanobis", {"VI": None}), + ("mahalanobis", {}), ("minkowski", {}), ("minkowski", {"p": 3}), + ("seuclidean", {"V": None}), + ("seuclidean", {}), ("sqeuclidean", {}), + ("wminkowski", {}), + ("wminkowski", {"p": 1.6}), (lambda u, v: (abs(u - v) ** 3).sum() ** (1.0 / 3.0), {}), ] ) @@ -172,6 +197,19 @@ def test_2d_pdist(metric, kw, seed, u_shape, u_chunks): a_u = 2 * np.random.random(u_shape) - 1 d_u = da.from_array(a_u, chunks=u_chunks) + if metric == "mahalanobis": + if "VI" not in kw: + kw["VI"] = 2 * np.random.random(2 * u_shape[-1:]) - 1 + elif kw["VI"] is None: + kw.pop("VI") + elif metric == "seuclidean": + if "V" not in kw: + kw["V"] = 2 * np.random.random(u_shape[-1:]) - 1 + elif kw["V"] is None: + kw.pop("V") + elif metric == "wminkowski": + kw["w"] = np.random.random(u_shape[-1:]) + a_r = spdist.pdist(a_u, metric, **kw) d_r = dask_distance.pdist(d_u, metric, **kw)