From 85028e47e6e3e945049e95fdd1e12b4f2471b27a Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 18 Jun 2023 16:06:02 -0700 Subject: [PATCH 1/2] fix #252 and add test --- src/hclust.jl | 33 ++++++++++++++++++++---------- test/data/hclust_dist_issue252.txt | 30 +++++++++++++++++++++++++++ test/hclust.jl | 12 +++++++++++ 3 files changed, 64 insertions(+), 11 deletions(-) create mode 100644 test/data/hclust_dist_issue252.txt diff --git a/src/hclust.jl b/src/hclust.jl index d72ff28f..f67d3cce 100644 --- a/src/hclust.jl +++ b/src/hclust.jl @@ -818,31 +818,42 @@ function cutree(hclu::Hclust; # use k and h to calculate how many merges to do before cutting if k !== nothing k >= min(1, n) || throw(ArgumentError("`k` should be greater or equal $(min(1,n))")) - cutm = n - k + cutm = max(n - k, 0) else cutm = m end + horder = sortperm(hclu.heights) # indices of nodes by height if h !== nothing # adjust cutm w.r.t h - hix = findlast(hh -> hh ≤ h, hclu.heights) + hix = findlast(i -> hclu.heights[i] ≤ h, horder) if hix !== nothing && hix < cutm cutm = hix - elseif nmerges(hclu) >= 1 && first(hclu.heights) > h + elseif nmerges(hclu) >= 1 && hclu.heights[horder[1]] > h # corner case, the requested h smaller that the smallest nontrivial subtree cutm = 0 end end - clusters = Vector{Int}[] + clusters = Vector{Int}[] # contents of the tree nodes, nodes are indexed by height unmerged = fill(true, n) # if a node is not merged to a cluster noels = Int[] # placeholder for empty deactivated trees - i = 1 - while i ≤ cutm - c1 = hclu.merges[i, 1] - c2 = hclu.merges[i, 2] - (c1 < 0) && (unmerged[-c1] = false) - (c2 < 0) && (unmerged[-c2] = false) + hindex = invperm(horder) # index of each tree node by height, i.e. pos in `clusters` + resize!(horder, cutm) # we only process the first cutm merges + for i in horder # visit nodes by height + m1 = hclu.merges[i, 1] + m2 = hclu.merges[i, 2] + if m1 < 0 + unmerged[-m1] = false + c1 = m1 + else + c1 = hindex[m1] + end + if m2 < 0 + unmerged[-m2] = false + c2 = m2 + else + c2 = hindex[m2] + end merge_clusters!(clusters, c1, c2, noels) - i += 1 end ## build an array of cluster indices (R's order) res = fill(0, n) diff --git a/test/data/hclust_dist_issue252.txt b/test/data/hclust_dist_issue252.txt new file mode 100644 index 00000000..c8180523 --- /dev/null +++ b/test/data/hclust_dist_issue252.txt @@ -0,0 +1,30 @@ +0.0, 14.0327, 59.0164, 12.9196, 12.9393, 15.0348, 24.5178, 13.2429, 50.9107, 17.6459, 69.355, 34.1675, 34.8547, 34.5543, 33.8201, 27.5937, 29.64, 21.8638, 59.9892, 34.7607, 35.4906, 35.7891, 37.9107, 36.4857, 36.4511, 36.2258, 37.8759, 38.8741, 35.7656, 38.2971 +14.0327, 0.0, 61.1595, 6.30511, 12.8899, 12.7985, 33.9231, 8.07949, 50.2559, 20.5129, 74.5649, 45.2048, 45.9518, 45.6064, 44.8762, 37.2706, 39.4445, 31.1391, 63.9621, 45.8435, 32.9283, 33.4716, 35.3048, 38.4813, 35.9144, 34.0031, 37.2324, 40.6031, 35.2004, 37.4559 +59.0164, 61.1595, 0.0, 61.4733, 60.4953, 60.955, 59.8696, 61.2138, 35.0577, 60.5834, 24.4867, 62.4919, 62.9371, 62.5128, 62.5346, 62.4564, 63.0566, 61.0877, 28.2786, 62.741, 48.8001, 48.5941, 45.2675, 40.3098, 45.9336, 48.5559, 48.0879, 36.863, 45.1274, 47.4693 +12.9196, 6.30511, 61.4733, 0.0, 7.17129, 6.79235, 29.065, 2.02677, 50.3155, 14.8143, 73.6088, 41.6313, 42.4676, 42.0772, 41.3241, 36.4111, 38.6118, 30.1195, 65.193, 42.3441, 33.5152, 35.6795, 35.9863, 39.6374, 36.5063, 35.0409, 37.0726, 40.9757, 36.3902, 37.2138 +12.9393, 12.8899, 60.4953, 7.17129, 0.0, 3.38181, 23.2558, 5.60681, 49.1724, 7.73406, 71.3702, 37.696, 38.6409, 38.1869, 37.4212, 36.0701, 38.2433, 29.7466, 65.5283, 38.4972, 32.8945, 36.201, 35.4466, 39.1436, 35.346, 34.6379, 35.2062, 39.6623, 35.764, 35.3437 +15.0348, 12.7985, 60.955, 6.79235, 3.38181, 0.0, 24.1823, 4.84921, 49.4286, 8.84564, 71.7331, 38.3743, 39.3199, 38.8627, 38.1068, 37.3524, 39.4705, 31.2373, 66.2063, 39.1746, 32.9313, 36.8285, 35.414, 39.9843, 36.0543, 34.9663, 35.572, 40.3826, 36.4925, 35.5858 +24.5178, 33.9231, 59.8696, 29.065, 23.2558, 24.1823, 0.0, 27.6238, 52.8461, 18.214, 62.7188, 17.9421, 19.0976, 18.5033, 17.8206, 32.2154, 33.3521, 29.2072, 64.0474, 18.9069, 40.5728, 45.7027, 42.6558, 41.8837, 41.2799, 42.5292, 39.6789, 41.4957, 42.3084, 39.7116 +13.2429, 8.07949, 61.2138, 2.02677, 5.60681, 4.84921, 27.6238, 0.0, 49.972, 13.059, 72.9892, 40.655, 41.5241, 41.1136, 40.3593, 36.7822, 38.9544, 30.5531, 65.448, 41.3934, 33.2165, 35.8833, 35.6824, 39.6879, 36.3018, 34.9147, 36.5244, 40.7368, 36.3057, 36.6204 +50.9107, 50.2559, 35.0577, 50.3155, 49.1724, 49.4286, 52.8461, 49.972, 0.0, 49.0877, 50.2537, 60.5985, 61.285, 60.7887, 60.5459, 61.184, 62.4036, 57.734, 51.2385, 61.1149, 33.4539, 34.5031, 27.9481, 29.4544, 30.6737, 33.5955, 32.5981, 21.5229, 30.3649, 31.4588 +17.6459, 20.5129, 60.5834, 14.8143, 7.73406, 8.84564, 18.214, 13.059, 49.0877, 0.0, 69.7972, 34.7658, 35.8289, 35.3009, 34.5495, 37.947, 39.9788, 31.9987, 67.1828, 35.6611, 33.8895, 38.5825, 36.366, 40.4432, 35.9588, 35.9468, 34.8032, 39.9293, 36.9684, 34.8607 +69.355, 74.5649, 24.4867, 73.6088, 71.3702, 71.7331, 62.7188, 72.9892, 50.2537, 69.7972, 0.0, 59.8114, 59.9828, 59.6617, 59.9994, 66.1775, 65.9726, 67.0255, 33.9888, 59.8101, 61.755, 63.6975, 58.9579, 52.0227, 59.1528, 62.1158, 59.7825, 48.992, 58.9636, 59.1353 +34.1675, 45.2048, 62.4919, 41.6313, 37.696, 38.3743, 17.9421, 40.655, 60.5985, 34.7658, 59.8114, 0.0, 1.21978, 0.59072, 0.531466, 28.1729, 27.8415, 29.6464, 61.293, 0.98288, 50.2948, 54.4321, 51.9987, 46.6301, 50.4169, 51.844, 49.4732, 47.6202, 50.863, 49.5274 +34.8547, 45.9518, 62.9371, 42.4676, 38.6409, 39.3199, 19.0976, 41.5241, 61.285, 35.8289, 59.9828, 1.21978, 0.0, 0.721548, 1.29349, 28.1007, 27.6601, 29.8686, 61.3334, 0.331306, 51.0317, 55.0633, 52.7294, 47.0979, 51.12, 52.5397, 50.2334, 48.2034, 51.5207, 50.2971 +34.5543, 45.6064, 62.5128, 42.0772, 38.1869, 38.8627, 18.5033, 41.1136, 60.7887, 35.3009, 59.6617, 0.59072, 0.721548, 0.0, 0.860275, 28.1988, 27.8146, 29.813, 61.1547, 0.457807, 50.5757, 54.6751, 52.2596, 46.7539, 50.6713, 52.1072, 49.7508, 47.7774, 51.0987, 49.8052 +33.8201, 44.8762, 62.5346, 41.3241, 37.4212, 38.1068, 17.8206, 40.3593, 60.5459, 34.5495, 59.9994, 0.531466, 1.29349, 0.860275, 0.0, 27.8671, 27.5444, 29.3226, 61.2772, 1.15562, 50.1121, 54.2075, 51.8404, 46.4638, 50.2493, 51.6484, 49.3308, 47.5068, 50.6786, 49.3934 +27.5937, 37.2706, 62.4564, 36.4111, 36.0701, 37.3524, 32.2154, 36.7822, 61.184, 37.947, 66.1775, 28.1729, 28.1007, 28.1988, 27.8671, 0.0, 2.63232, 7.52505, 54.5206, 28.13, 49.1585, 49.4252, 51.3311, 42.0002, 48.1103, 49.1794, 50.9314, 46.4442, 48.1012, 51.52 +29.64, 39.4445, 63.0566, 38.6118, 38.2433, 39.4705, 33.3521, 38.9544, 62.4036, 39.9788, 65.9726, 27.8415, 27.6601, 27.8146, 27.5444, 2.63232, 0.0, 10.1261, 54.4047, 27.7083, 50.7126, 50.9708, 52.8134, 43.0546, 49.6321, 50.7315, 52.3909, 47.5628, 49.5588, 52.9636 +21.8638, 31.1391, 61.0877, 30.1195, 29.7466, 31.2373, 29.2072, 30.5531, 57.734, 31.9987, 67.0255, 29.6464, 29.8686, 29.813, 29.3226, 7.52505, 10.1261, 0.0, 55.5336, 29.8498, 44.7055, 45.0597, 47.0916, 39.2788, 43.7759, 44.7524, 46.6849, 43.3984, 43.9685, 47.3156 +59.9892, 63.9621, 28.2786, 65.193, 65.5283, 66.2063, 64.0474, 65.448, 51.2385, 67.1828, 33.9888, 61.293, 61.3334, 61.1547, 61.2772, 54.5206, 54.4047, 55.5336, 0.0, 61.2173, 58.136, 55.947, 55.9753, 45.0717, 55.1481, 57.2634, 58.9341, 46.0706, 53.7966, 58.7892 +34.7607, 45.8435, 62.741, 42.3441, 38.4972, 39.1746, 18.9069, 41.3934, 61.1149, 35.6611, 59.8101, 0.98288, 0.331306, 0.457807, 1.15562, 28.13, 27.7083, 29.8498, 61.2173, 0.0, 50.8985, 54.9515, 52.5866, 46.9935, 50.9871, 52.416, 50.0921, 48.0676, 51.3958, 50.1514 +35.4906, 32.9283, 48.8001, 33.5152, 32.8945, 32.9313, 40.5728, 33.2165, 33.4539, 33.8895, 61.755, 50.2948, 51.0317, 50.5757, 50.1121, 49.1585, 50.7126, 44.7055, 58.136, 50.8985, 0.0, 10.885, 6.06269, 18.3396, 8.17191, 4.12425, 7.03208, 18.8825, 9.18919, 7.23492 +35.7891, 33.4716, 48.5941, 35.6795, 36.201, 36.8285, 45.7027, 35.8833, 34.5031, 38.5825, 63.6975, 54.4321, 55.0633, 54.6751, 54.2075, 49.4252, 50.9708, 45.0597, 55.947, 54.9515, 10.885, 0.0, 11.9635, 16.2107, 9.97399, 7.72602, 14.2576, 19.2875, 6.84512, 15.0716 +37.9107, 35.3048, 45.2675, 35.9863, 35.4466, 35.414, 42.6558, 35.6824, 27.9481, 36.366, 58.9579, 51.9987, 52.7294, 52.2596, 51.8404, 51.3311, 52.8134, 47.0916, 55.9753, 52.5866, 6.06269, 11.9635, 0.0, 17.0465, 8.89131, 7.38189, 8.74113, 15.4745, 8.97042, 7.92026 +36.4857, 38.4813, 40.3098, 39.6374, 39.1436, 39.9843, 41.8837, 39.6879, 29.4544, 40.4432, 52.0227, 46.6301, 47.0979, 46.7539, 46.4638, 42.0002, 43.0546, 39.2788, 45.0717, 46.9935, 18.3396, 16.2107, 17.0465, 0.0, 13.7033, 16.5447, 18.8165, 8.94293, 12.2873, 19.1652 +36.4511, 35.9144, 45.9336, 36.5063, 35.346, 36.0543, 41.2799, 36.3018, 30.6737, 35.9588, 59.1528, 50.4169, 51.12, 50.6713, 50.2493, 48.1103, 49.6321, 43.7759, 55.1481, 50.9871, 8.17191, 9.97399, 8.89131, 13.7033, 0.0, 5.92747, 7.55535, 14.097, 4.73174, 8.52622 +36.2258, 34.0031, 48.5559, 35.0409, 34.6379, 34.9663, 42.5292, 34.9147, 33.5955, 35.9468, 62.1158, 51.844, 52.5397, 52.1072, 51.6484, 49.1794, 50.7315, 44.7524, 57.2634, 52.416, 4.12425, 7.72602, 7.38189, 16.5447, 5.92747, 0.0, 8.07625, 18.0647, 6.55522, 8.88438 +37.8759, 37.2324, 48.0879, 37.0726, 35.2062, 35.572, 39.6789, 36.5244, 32.5981, 34.8032, 59.7825, 49.4732, 50.2334, 49.7508, 49.3308, 50.9314, 52.3909, 46.6849, 58.9341, 50.0921, 7.03208, 14.2576, 8.74113, 18.8165, 7.55535, 8.07625, 0.0, 17.6948, 9.82219, 1.90955 +38.8741, 40.6031, 36.863, 40.9757, 39.6623, 40.3826, 41.4957, 40.7368, 21.5229, 39.9293, 48.992, 47.6202, 48.2034, 47.7774, 47.5068, 46.4442, 47.5628, 43.3984, 46.0706, 48.0676, 18.8825, 19.2875, 15.4745, 8.94293, 14.097, 18.0647, 17.6948, 0.0, 13.5721, 17.3675 +35.7656, 35.2004, 45.1274, 36.3902, 35.764, 36.4925, 42.3084, 36.3057, 30.3649, 36.9684, 58.9636, 50.863, 51.5207, 51.0987, 50.6786, 48.1012, 49.5588, 43.9685, 53.7966, 51.3958, 9.18919, 6.84512, 8.97042, 12.2873, 4.73174, 6.55522, 9.82219, 13.5721, 0.0, 10.5332 +38.2971, 37.4559, 47.4693, 37.2138, 35.3437, 35.5858, 39.7116, 36.6204, 31.4588, 34.8607, 59.1353, 49.5274, 50.2971, 49.8052, 49.3934, 51.52, 52.9636, 47.3156, 58.7892, 50.1514, 7.23492, 15.0716, 7.92026, 19.1652, 8.52622, 8.88438, 1.90955, 17.3675, 10.5332, 0.0 diff --git a/test/hclust.jl b/test/hclust.jl index b8ae530d..76c9e3df 100644 --- a/test/hclust.jl +++ b/test/hclust.jl @@ -221,4 +221,16 @@ end hclu2_ward = hclust(dist2_mtx, linkage=:ward) end +@testset "cuttree() with merges not sorted by height (#252)" begin + dist_mtx = readdlm(joinpath(@__DIR__, "data", "hclust_dist_issue252.txt"), ',', Float64) + + hclu_opt = hclust(dist_mtx, linkage=:complete, branchorder=:optimal) + clu_opt = cutree(hclu_opt, h=20) + + hclu_r = hclust(dist_mtx, linkage=:complete) + clu_r = cutree(hclu_r, h=20) + + @test clu_opt == clu_r +end + end # testset "hclust()" From b3908ebebeb3a8e3fe60b04aba886b0305da7ae5 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 18 Jun 2023 15:24:43 -0700 Subject: [PATCH 2/2] bump patch version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index ad4b6671..9ef8448b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Clustering" uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -version = "0.15.2" +version = "0.15.3" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"