JuliaStats · oxinabox · Mar 26, 2018 · Mar 27, 2018
diff --git a/REQUIRE b/REQUIRE
@@ -3,3 +3,4 @@ Compat 0.17
 Distances 0.3.1
 NearestNeighbors 0.0.3
 StatsBase 0.9.0
+DataStructures # needs a newer, as-yet-unreleased version
diff --git a/src/Clustering.jl b/src/Clustering.jl
@@ -5,6 +5,7 @@ module Clustering
     using Distances
     using NearestNeighbors
     using StatsBase
+    using DataStructures
 
     import Base: show
     import StatsBase: IntegerVector, RealVector, RealMatrix, counts
@@ -54,7 +55,11 @@ module Clustering
     Hclust, hclust, cutree,
 
     # MCL
-    mcl, MCLResult
+    mcl, MCLResult,
+
+    # chinese_whispers
+    chinese_whispers, ChineseWhispersResult
+
 
     ## source files
 
@@ -66,6 +71,8 @@ module Clustering
     include("affprop.jl")
     include("dbscan.jl")
     include("mcl.jl")
+    include("chinesewhispers.jl")
+
     include("fuzzycmeans.jl")
 
     include("silhouette.jl")

diff --git a/src/chinesewhispers.jl b/src/chinesewhispers.jl
@@ -0,0 +1,74 @@
+# Index helpers; the abstractions were proposed in https://github.com/JuliaLang/julia/issues/26613
+# `colinds(A)`: the index range of the columns of `A`.
+colinds(A::AbstractMatrix) = indices(A, 2)
+# `rowinds(A, col)`: the row indices to visit for column `col` -- every row for a generic
+# matrix, only the stored rows of that column for a `SparseMatrixCSC` (as a view, no copy).
+rowinds(A::AbstractMatrix, col::Integer) = indices(A, 1)
+rowinds(A::SparseMatrixCSC, col::Integer) = view(rowvals(A), nzrange(A, col))
+# Result returned by `chinese_whispers`.
+mutable struct ChineseWhispersResult <: ClusteringResult
+    assignments::Vector{Int}   # cluster index assigned to each node (n)
+    counts::Vector{Int}        # number of nodes assigned to each cluster (k)
+    iterations::Int            # number of elapsed iterations
+    converged::Bool            # whether the procedure converged
+end
+# Build a result from the raw node => label map, renumbering labels to 1:k by first appearance.
+function ChineseWhispersResult(raw_assignments::Associative, iterations, converged)
+    raw_labels = [raw_assignments[node] for node in 1:length(raw_assignments)]  # keys taken as 1:n
+    label_ids = Dict{eltype(raw_labels), Int}()  # raw label => normalised cluster id
+    counts = Int[]
+    assignments = Vector{Int}(length(raw_labels))
+    for (node, raw_lbl) in enumerate(raw_labels)
+        if !haskey(label_ids, raw_lbl)
+            push!(counts, 0)  # first sighting of this label: open a new, empty cluster
+            label_ids[raw_lbl] = length(counts)  # its id is the next unused integer
+        end
+        id = label_ids[raw_lbl]
+        counts[id] += 1
+        assignments[node] = id
+    end
+    return ChineseWhispersResult(assignments, counts, iterations, converged)
+end
+# Cluster the nodes of the graph with weighted adjacency (similarity) matrix `sim` by label
+# propagation: up to `max_iter` shuffled sweeps, stopping early once a sweep changes no label.
+function chinese_whispers(sim::AbstractMatrix, max_iter=100; verbose=false)
+    node_labels = DefaultDict{Int,Int}(identity; passkey=true)
+    # Initially all nodes are labelled with their own ID. (nclusters==nnodes)
+
+    for ii in 1:max_iter
+        changed = false
+        for node in shuffle(colinds(sim))
+            old_lbl = node_labels[node]
+            node_labels[node] = update_node_label(node, sim, node_labels)
+            changed |= node_labels[node] != old_lbl  # true as soon as any node switches label
+        end
+
+        verbose && println("Iteration: $ii, lbls: $(node_labels)")
+        if !changed
+            # a whole sweep moved no label: the labelling is stable
+            return ChineseWhispersResult(node_labels, ii, true)
+        end
+    end
+    # all `max_iter` sweeps were used while labels were still changing
+    return ChineseWhispersResult(node_labels, max_iter, false)
+end
+# Return the label `node` should take: the one carrying the largest total edge weight among
+# its neighbours, or its current label when the best total weight is zero.
+function update_node_label(node::N, adj::AbstractMatrix{W}, node_labels::Associative{N, L}) where {N<:Integer, W<:Real, L}
+    # Bool edge weights are accumulated as Int totals
+    label_weights = Accumulator(L, W==Bool ? Int : W)
+
+    # `rowinds(adj, node)` enumerates the rows of *column* `node`, so the weight has to be
+    # read from that column too; for a symmetric `adj` this equals `adj[node, neighbour]`
+    for neighbour in rowinds(adj, node)
+        lbl = node_labels[neighbour]
+        label_weights[lbl] += adj[neighbour, node]
+    end
+
+    old_lbl = node_labels[node]
+    label_weights[old_lbl] += zero(W) # Make sure at least one entry in the weights
+    new_lbl, weight = first(most_common(label_weights, 1))
+    # a best weight of zero means `node` has no connection: keep its label
+    return weight == 0 ? old_lbl : new_lbl
+end
+
diff --git a/test/chinesewhispers.jl b/test/chinesewhispers.jl
@@ -0,0 +1,66 @@
+using Base.Test
+using Distances
+using Clustering
+# Node 3 has no edges, so it must end up in a cluster of its own, whatever the
+# storage type of the adjacency matrix.
+@testset "basic seperated graph" begin
+    eg1 = [
+        0 1 0 1;
+        1 0 0 0;
+        0 0 0 0;
+        1 0 0 0;
+    ]
+    @testset "$(vv[1])" for vv in [("dense", eg1), ("sparse", sparse(eg1))]
+        res = chinese_whispers(vv[2])
+        lbls = assignments(res)
+        for connected in (1, 2, 4)  # every node that has at least one edge
+            @test lbls[3] != lbls[connected]
+        end
+
+        # the isolated node forces at least two clusters; all four nodes are counted
+        @test nclusters(res) >= 2
+        @test sum(counts(res)) == 4
+    end
+end
+# Two groups of ten points, the second shifted by (5, 5); similarity is inverse distance.
+@testset "planar based" begin
+    srand(1) # make deterministic
+    blob_a = randn(10, 2)
+    blob_b = randn(10, 2) .+ [5 5]
+    coords = vcat(blob_a, blob_b)'  # one column per point
+
+    adj = 1 ./ pairwise(Euclidean(), coords)
+    adj[isinf.(adj)] = 0 # no selfsim
+    adj[rand(size(adj)) .< 0.6] = 0 # remove some connections
+    # NOTE(review): the mask is drawn independently per entry, so `adj` is no longer symmetric
+    res = chinese_whispers(adj)
+    lbls = assignments(res)
+    # each group of ten must share a single label
+    @test all(lbl -> lbl == lbls[1], lbls[1:10])
+    @test all(lbl -> lbl == lbls[20], lbls[11:20])
+
+    @test nclusters(res) == 2
+    @test counts(res) == [10, 10]
+end
+# The same symmetric matrix stored densely, sparsely or behind a `Symmetric` wrapper must
+# produce identical assignments when the RNG is reseeded before each run.
+@testset "acts the same for all types" begin
+    examples = [
+        sprand(500,500,0.3),
+        sprand(1500,1500,0.1).>0.5, #Boolean elements
+        rand(200, 200)
+    ]
+
+    # Cluster `x` after reseeding the global RNG, so that every call draws the same
+    # sequence of shuffles inside `chinese_whispers`.
+    seeded_assignments(x) = (srand(1); assignments(chinese_whispers(x)))
+
+    for example in examples
+        sym = collect(Symmetric(example))  # dense, symmetric copy of the example
+        from_dense = seeded_assignments(sym)
+        from_sparse = seeded_assignments(sparse(sym))
+        from_wrapper = seeded_assignments(Symmetric(sym))
+
+        @test from_dense == from_sparse == from_wrapper
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,5 +1,6 @@
 include("../src/Clustering.jl")
 using Compat
+using Base.Test
 
 tests = ["seeding",
          "kmeans",
@@ -11,11 +12,14 @@ tests = ["seeding",
          "varinfo",
          "randindex",
          "hclust",
-         "mcl"]
+         "mcl",
+         "chinesewhispers"
+        ]
 
 println("Runing tests:")
 for t in tests
     fp = "$(t).jl"
-    println("* $fp ...")
-    include(fp)
+    @testset "$t" begin  # one named testset per test file
+        include(fp)
+    end
 end