Skip to content

Commit

Permalink
Merge pull request #254 from JuliaStats/fix_cuttree_172
Browse files Browse the repository at this point in the history
Fix cutree() for trees with unsorted merges
  • Loading branch information
alyst authored Jun 18, 2023
2 parents 6021a28 + b3908eb commit 45d4ccc
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Clustering"
uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
version = "0.15.2"
version = "0.15.3"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand Down
33 changes: 22 additions & 11 deletions src/hclust.jl
Original file line number Diff line number Diff line change
Expand Up @@ -818,31 +818,42 @@ function cutree(hclu::Hclust;
# use k and h to calculate how many merges to do before cutting
if k !== nothing
k >= min(1, n) || throw(ArgumentError("`k` should be greater or equal $(min(1,n))"))
cutm = n - k
cutm = max(n - k, 0)
else
cutm = m
end
horder = sortperm(hclu.heights) # indices of nodes by height
if h !== nothing
# adjust cutm w.r.t h
hix = findlast(hh -> hh h, hclu.heights)
hix = findlast(i -> hclu.heights[i] h, horder)
if hix !== nothing && hix < cutm
cutm = hix
elseif nmerges(hclu) >= 1 && first(hclu.heights) > h
elseif nmerges(hclu) >= 1 && hclu.heights[horder[1]] > h
# corner case, the requested h smaller that the smallest nontrivial subtree
cutm = 0
end
end
clusters = Vector{Int}[]
clusters = Vector{Int}[] # contents of the tree nodes, nodes are indexed by height
unmerged = fill(true, n) # if a node is not merged to a cluster
noels = Int[] # placeholder for empty deactivated trees
i = 1
while i cutm
c1 = hclu.merges[i, 1]
c2 = hclu.merges[i, 2]
(c1 < 0) && (unmerged[-c1] = false)
(c2 < 0) && (unmerged[-c2] = false)
hindex = invperm(horder) # index of each tree node by height, i.e. pos in `clusters`
resize!(horder, cutm) # we only process the first cutm merges
for i in horder # visit nodes by height
m1 = hclu.merges[i, 1]
m2 = hclu.merges[i, 2]
if m1 < 0
unmerged[-m1] = false
c1 = m1
else
c1 = hindex[m1]
end
if m2 < 0
unmerged[-m2] = false
c2 = m2
else
c2 = hindex[m2]
end
merge_clusters!(clusters, c1, c2, noels)
i += 1
end
## build an array of cluster indices (R's order)
res = fill(0, n)
Expand Down
30 changes: 30 additions & 0 deletions test/data/hclust_dist_issue252.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
0.0, 14.0327, 59.0164, 12.9196, 12.9393, 15.0348, 24.5178, 13.2429, 50.9107, 17.6459, 69.355, 34.1675, 34.8547, 34.5543, 33.8201, 27.5937, 29.64, 21.8638, 59.9892, 34.7607, 35.4906, 35.7891, 37.9107, 36.4857, 36.4511, 36.2258, 37.8759, 38.8741, 35.7656, 38.2971
14.0327, 0.0, 61.1595, 6.30511, 12.8899, 12.7985, 33.9231, 8.07949, 50.2559, 20.5129, 74.5649, 45.2048, 45.9518, 45.6064, 44.8762, 37.2706, 39.4445, 31.1391, 63.9621, 45.8435, 32.9283, 33.4716, 35.3048, 38.4813, 35.9144, 34.0031, 37.2324, 40.6031, 35.2004, 37.4559
59.0164, 61.1595, 0.0, 61.4733, 60.4953, 60.955, 59.8696, 61.2138, 35.0577, 60.5834, 24.4867, 62.4919, 62.9371, 62.5128, 62.5346, 62.4564, 63.0566, 61.0877, 28.2786, 62.741, 48.8001, 48.5941, 45.2675, 40.3098, 45.9336, 48.5559, 48.0879, 36.863, 45.1274, 47.4693
12.9196, 6.30511, 61.4733, 0.0, 7.17129, 6.79235, 29.065, 2.02677, 50.3155, 14.8143, 73.6088, 41.6313, 42.4676, 42.0772, 41.3241, 36.4111, 38.6118, 30.1195, 65.193, 42.3441, 33.5152, 35.6795, 35.9863, 39.6374, 36.5063, 35.0409, 37.0726, 40.9757, 36.3902, 37.2138
12.9393, 12.8899, 60.4953, 7.17129, 0.0, 3.38181, 23.2558, 5.60681, 49.1724, 7.73406, 71.3702, 37.696, 38.6409, 38.1869, 37.4212, 36.0701, 38.2433, 29.7466, 65.5283, 38.4972, 32.8945, 36.201, 35.4466, 39.1436, 35.346, 34.6379, 35.2062, 39.6623, 35.764, 35.3437
15.0348, 12.7985, 60.955, 6.79235, 3.38181, 0.0, 24.1823, 4.84921, 49.4286, 8.84564, 71.7331, 38.3743, 39.3199, 38.8627, 38.1068, 37.3524, 39.4705, 31.2373, 66.2063, 39.1746, 32.9313, 36.8285, 35.414, 39.9843, 36.0543, 34.9663, 35.572, 40.3826, 36.4925, 35.5858
24.5178, 33.9231, 59.8696, 29.065, 23.2558, 24.1823, 0.0, 27.6238, 52.8461, 18.214, 62.7188, 17.9421, 19.0976, 18.5033, 17.8206, 32.2154, 33.3521, 29.2072, 64.0474, 18.9069, 40.5728, 45.7027, 42.6558, 41.8837, 41.2799, 42.5292, 39.6789, 41.4957, 42.3084, 39.7116
13.2429, 8.07949, 61.2138, 2.02677, 5.60681, 4.84921, 27.6238, 0.0, 49.972, 13.059, 72.9892, 40.655, 41.5241, 41.1136, 40.3593, 36.7822, 38.9544, 30.5531, 65.448, 41.3934, 33.2165, 35.8833, 35.6824, 39.6879, 36.3018, 34.9147, 36.5244, 40.7368, 36.3057, 36.6204
50.9107, 50.2559, 35.0577, 50.3155, 49.1724, 49.4286, 52.8461, 49.972, 0.0, 49.0877, 50.2537, 60.5985, 61.285, 60.7887, 60.5459, 61.184, 62.4036, 57.734, 51.2385, 61.1149, 33.4539, 34.5031, 27.9481, 29.4544, 30.6737, 33.5955, 32.5981, 21.5229, 30.3649, 31.4588
17.6459, 20.5129, 60.5834, 14.8143, 7.73406, 8.84564, 18.214, 13.059, 49.0877, 0.0, 69.7972, 34.7658, 35.8289, 35.3009, 34.5495, 37.947, 39.9788, 31.9987, 67.1828, 35.6611, 33.8895, 38.5825, 36.366, 40.4432, 35.9588, 35.9468, 34.8032, 39.9293, 36.9684, 34.8607
69.355, 74.5649, 24.4867, 73.6088, 71.3702, 71.7331, 62.7188, 72.9892, 50.2537, 69.7972, 0.0, 59.8114, 59.9828, 59.6617, 59.9994, 66.1775, 65.9726, 67.0255, 33.9888, 59.8101, 61.755, 63.6975, 58.9579, 52.0227, 59.1528, 62.1158, 59.7825, 48.992, 58.9636, 59.1353
34.1675, 45.2048, 62.4919, 41.6313, 37.696, 38.3743, 17.9421, 40.655, 60.5985, 34.7658, 59.8114, 0.0, 1.21978, 0.59072, 0.531466, 28.1729, 27.8415, 29.6464, 61.293, 0.98288, 50.2948, 54.4321, 51.9987, 46.6301, 50.4169, 51.844, 49.4732, 47.6202, 50.863, 49.5274
34.8547, 45.9518, 62.9371, 42.4676, 38.6409, 39.3199, 19.0976, 41.5241, 61.285, 35.8289, 59.9828, 1.21978, 0.0, 0.721548, 1.29349, 28.1007, 27.6601, 29.8686, 61.3334, 0.331306, 51.0317, 55.0633, 52.7294, 47.0979, 51.12, 52.5397, 50.2334, 48.2034, 51.5207, 50.2971
34.5543, 45.6064, 62.5128, 42.0772, 38.1869, 38.8627, 18.5033, 41.1136, 60.7887, 35.3009, 59.6617, 0.59072, 0.721548, 0.0, 0.860275, 28.1988, 27.8146, 29.813, 61.1547, 0.457807, 50.5757, 54.6751, 52.2596, 46.7539, 50.6713, 52.1072, 49.7508, 47.7774, 51.0987, 49.8052
33.8201, 44.8762, 62.5346, 41.3241, 37.4212, 38.1068, 17.8206, 40.3593, 60.5459, 34.5495, 59.9994, 0.531466, 1.29349, 0.860275, 0.0, 27.8671, 27.5444, 29.3226, 61.2772, 1.15562, 50.1121, 54.2075, 51.8404, 46.4638, 50.2493, 51.6484, 49.3308, 47.5068, 50.6786, 49.3934
27.5937, 37.2706, 62.4564, 36.4111, 36.0701, 37.3524, 32.2154, 36.7822, 61.184, 37.947, 66.1775, 28.1729, 28.1007, 28.1988, 27.8671, 0.0, 2.63232, 7.52505, 54.5206, 28.13, 49.1585, 49.4252, 51.3311, 42.0002, 48.1103, 49.1794, 50.9314, 46.4442, 48.1012, 51.52
29.64, 39.4445, 63.0566, 38.6118, 38.2433, 39.4705, 33.3521, 38.9544, 62.4036, 39.9788, 65.9726, 27.8415, 27.6601, 27.8146, 27.5444, 2.63232, 0.0, 10.1261, 54.4047, 27.7083, 50.7126, 50.9708, 52.8134, 43.0546, 49.6321, 50.7315, 52.3909, 47.5628, 49.5588, 52.9636
21.8638, 31.1391, 61.0877, 30.1195, 29.7466, 31.2373, 29.2072, 30.5531, 57.734, 31.9987, 67.0255, 29.6464, 29.8686, 29.813, 29.3226, 7.52505, 10.1261, 0.0, 55.5336, 29.8498, 44.7055, 45.0597, 47.0916, 39.2788, 43.7759, 44.7524, 46.6849, 43.3984, 43.9685, 47.3156
59.9892, 63.9621, 28.2786, 65.193, 65.5283, 66.2063, 64.0474, 65.448, 51.2385, 67.1828, 33.9888, 61.293, 61.3334, 61.1547, 61.2772, 54.5206, 54.4047, 55.5336, 0.0, 61.2173, 58.136, 55.947, 55.9753, 45.0717, 55.1481, 57.2634, 58.9341, 46.0706, 53.7966, 58.7892
34.7607, 45.8435, 62.741, 42.3441, 38.4972, 39.1746, 18.9069, 41.3934, 61.1149, 35.6611, 59.8101, 0.98288, 0.331306, 0.457807, 1.15562, 28.13, 27.7083, 29.8498, 61.2173, 0.0, 50.8985, 54.9515, 52.5866, 46.9935, 50.9871, 52.416, 50.0921, 48.0676, 51.3958, 50.1514
35.4906, 32.9283, 48.8001, 33.5152, 32.8945, 32.9313, 40.5728, 33.2165, 33.4539, 33.8895, 61.755, 50.2948, 51.0317, 50.5757, 50.1121, 49.1585, 50.7126, 44.7055, 58.136, 50.8985, 0.0, 10.885, 6.06269, 18.3396, 8.17191, 4.12425, 7.03208, 18.8825, 9.18919, 7.23492
35.7891, 33.4716, 48.5941, 35.6795, 36.201, 36.8285, 45.7027, 35.8833, 34.5031, 38.5825, 63.6975, 54.4321, 55.0633, 54.6751, 54.2075, 49.4252, 50.9708, 45.0597, 55.947, 54.9515, 10.885, 0.0, 11.9635, 16.2107, 9.97399, 7.72602, 14.2576, 19.2875, 6.84512, 15.0716
37.9107, 35.3048, 45.2675, 35.9863, 35.4466, 35.414, 42.6558, 35.6824, 27.9481, 36.366, 58.9579, 51.9987, 52.7294, 52.2596, 51.8404, 51.3311, 52.8134, 47.0916, 55.9753, 52.5866, 6.06269, 11.9635, 0.0, 17.0465, 8.89131, 7.38189, 8.74113, 15.4745, 8.97042, 7.92026
36.4857, 38.4813, 40.3098, 39.6374, 39.1436, 39.9843, 41.8837, 39.6879, 29.4544, 40.4432, 52.0227, 46.6301, 47.0979, 46.7539, 46.4638, 42.0002, 43.0546, 39.2788, 45.0717, 46.9935, 18.3396, 16.2107, 17.0465, 0.0, 13.7033, 16.5447, 18.8165, 8.94293, 12.2873, 19.1652
36.4511, 35.9144, 45.9336, 36.5063, 35.346, 36.0543, 41.2799, 36.3018, 30.6737, 35.9588, 59.1528, 50.4169, 51.12, 50.6713, 50.2493, 48.1103, 49.6321, 43.7759, 55.1481, 50.9871, 8.17191, 9.97399, 8.89131, 13.7033, 0.0, 5.92747, 7.55535, 14.097, 4.73174, 8.52622
36.2258, 34.0031, 48.5559, 35.0409, 34.6379, 34.9663, 42.5292, 34.9147, 33.5955, 35.9468, 62.1158, 51.844, 52.5397, 52.1072, 51.6484, 49.1794, 50.7315, 44.7524, 57.2634, 52.416, 4.12425, 7.72602, 7.38189, 16.5447, 5.92747, 0.0, 8.07625, 18.0647, 6.55522, 8.88438
37.8759, 37.2324, 48.0879, 37.0726, 35.2062, 35.572, 39.6789, 36.5244, 32.5981, 34.8032, 59.7825, 49.4732, 50.2334, 49.7508, 49.3308, 50.9314, 52.3909, 46.6849, 58.9341, 50.0921, 7.03208, 14.2576, 8.74113, 18.8165, 7.55535, 8.07625, 0.0, 17.6948, 9.82219, 1.90955
38.8741, 40.6031, 36.863, 40.9757, 39.6623, 40.3826, 41.4957, 40.7368, 21.5229, 39.9293, 48.992, 47.6202, 48.2034, 47.7774, 47.5068, 46.4442, 47.5628, 43.3984, 46.0706, 48.0676, 18.8825, 19.2875, 15.4745, 8.94293, 14.097, 18.0647, 17.6948, 0.0, 13.5721, 17.3675
35.7656, 35.2004, 45.1274, 36.3902, 35.764, 36.4925, 42.3084, 36.3057, 30.3649, 36.9684, 58.9636, 50.863, 51.5207, 51.0987, 50.6786, 48.1012, 49.5588, 43.9685, 53.7966, 51.3958, 9.18919, 6.84512, 8.97042, 12.2873, 4.73174, 6.55522, 9.82219, 13.5721, 0.0, 10.5332
38.2971, 37.4559, 47.4693, 37.2138, 35.3437, 35.5858, 39.7116, 36.6204, 31.4588, 34.8607, 59.1353, 49.5274, 50.2971, 49.8052, 49.3934, 51.52, 52.9636, 47.3156, 58.7892, 50.1514, 7.23492, 15.0716, 7.92026, 19.1652, 8.52622, 8.88438, 1.90955, 17.3675, 10.5332, 0.0
12 changes: 12 additions & 0 deletions test/hclust.jl
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,16 @@ end
hclu2_ward = hclust(dist2_mtx, linkage=:ward)
end

@testset "cuttree() with merges not sorted by height (#252)" begin
dist_mtx = readdlm(joinpath(@__DIR__, "data", "hclust_dist_issue252.txt"), ',', Float64)

hclu_opt = hclust(dist_mtx, linkage=:complete, branchorder=:optimal)
clu_opt = cutree(hclu_opt, h=20)

hclu_r = hclust(dist_mtx, linkage=:complete)
clu_r = cutree(hclu_r, h=20)

@test clu_opt == clu_r
end

end # testset "hclust()"

2 comments on commit 45d4ccc

@alyst
Copy link
Member Author

@alyst alyst commented on 45d4ccc Jun 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/85823

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.15.3 -m "<description of version>" 45d4ccce981bc28dd88a409dd9a6c16b0408c28f
git push origin v0.15.3

Please sign in to comment.