diff --git a/REQUIRE b/REQUIRE
index 64ce6eae..c62f524c 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -3,3 +3,4 @@ Compat 0.17
 Distances 0.3.1
 NearestNeighbors 0.0.3
 StatsBase 0.9.0
+DataStructures # need a newer as yet unreleased version
diff --git a/src/Clustering.jl b/src/Clustering.jl
index 5a32eb41..7ecc3e57 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -5,6 +5,7 @@ module Clustering
     using Distances
     using NearestNeighbors
     using StatsBase
+    using DataStructures
 
     import Base: show
     import StatsBase: IntegerVector, RealVector, RealMatrix, counts
@@ -54,7 +55,11 @@ module Clustering
     Hclust, hclust, cutree,
 
     # MCL
-    mcl, MCLResult
+    mcl, MCLResult,
+
+    # chinese_whispers
+    chinese_whispers, ChineseWhispersResult
+
 
     ## source files
 
@@ -66,6 +71,8 @@ module Clustering
     include("affprop.jl")
     include("dbscan.jl")
     include("mcl.jl")
+    include("chinesewhispers.jl")
+
     include("fuzzycmeans.jl")
 
     include("silhouette.jl")
diff --git a/src/chinesewhispers.jl b/src/chinesewhispers.jl
new file mode 100644
index 00000000..51f66efc
--- /dev/null
+++ b/src/chinesewhispers.jl
@@ -0,0 +1,101 @@
+
+
+# Abstractions proposed in https://github.com/JuliaLang/julia/issues/26613
+# `colinds(A)` gives the column indices of `A`; `rowinds(A, col)` gives the row indices
+# to visit within column `col` (only the stored entries when `A` is sparse).
+colinds(A::AbstractMatrix) = indices(A,2)
+
+rowinds(A::AbstractMatrix, col::Integer) = indices(A,1)
+rowinds(A::SparseMatrixCSC, col::Integer) = rowvals(A)[nzrange(A, col)]
+
+"""
+Result of `chinese_whispers`; cluster labels are renumbered to consecutive integers.
+"""
+type ChineseWhispersResult <: ClusteringResult
+    assignments::Vector{Int}   # assignments (n)
+    counts::Vector{Int}        # number of samples assigned to each cluster (k)
+    iterations::Int            # number of elapsed iterations
+    converged::Bool            # whether the procedure converged
+end
+
+"""
+    ChineseWhispersResult(raw_assignments::Associative, iterations, converged)
+
+Build a result from a `node => label` mapping keyed by `1:length(raw_assignments)`.
+Raw labels are renumbered to consecutive integers in order of first appearance and
+the number of nodes carrying each label is tallied.
+"""
+function ChineseWhispersResult(raw_assignments::Associative, iterations, converged)
+    raw_labels = getindex.(raw_assignments, 1:length(raw_assignments))
+    normalised_names = Dict{eltype(raw_labels), Int}()
+    counts = Int[]
+    assignments = Vector{Int}(length(raw_labels))
+    for (node, raw_lbl) in enumerate(raw_labels)
+        name = get!(normalised_names, raw_lbl) do
+            push!(counts, 0)
+            length(counts) # Normalised name is the next unused integer
+        end
+
+        counts[name]+=1
+        assignments[node]=name
+    end
+    ChineseWhispersResult(assignments, counts, iterations, converged)
+end
+
+
+"""
+    chinese_whispers(sim::AbstractMatrix, max_iter=100; verbose=false)
+
+Cluster the nodes of the graph whose weighted adjacency (similarity) matrix is `sim`.
+`sim` is expected to be symmetric; a zero weight is treated as no connection.
+
+Every node starts with its own ID as label. On each iteration the nodes are visited in
+a shuffled order and each takes the label with the largest summed edge weight among
+its neighbours. The procedure returns, flagged as converged, after the first iteration
+in which no label changes, or flagged as not converged after `max_iter` iterations.
+"""
+function chinese_whispers(sim::AbstractMatrix, max_iter=100; verbose=false)
+    node_labels = DefaultDict{Int,Int}(identity; passkey=true)
+    # Initially all nodes are labelled with their own ID. (nclusters==nnodes)
+
+    for ii in 1:max_iter
+        changed = false
+        for node in shuffle(colinds(sim))
+            old_lbl = node_labels[node]
+            node_labels[node] = update_node_label(node, sim, node_labels)
+            # The pass only counts as changed when some node got a different label.
+            changed |= node_labels[node]!=old_lbl
+        end
+
+        verbose && println("Iteration: $ii, lbls: $(node_labels)")
+
+        if !changed
+            return ChineseWhispersResult(node_labels, ii, true)
+        end
+    end
+
+    ChineseWhispersResult(node_labels, max_iter, false)
+end
+
+# Return the label with the greatest summed edge weight among the neighbours of `node`,
+# or the node's current label when that greatest weight is zero.
+function update_node_label(node::N, adj::AbstractMatrix{W}, node_labels::Associative{N, L}) where {N<:Integer, W<:Real, L}
+    label_weights = Accumulator(L, W==Bool ? Int : W)
+
+    neighbours = rowinds(adj, node)
+    for neighbour in neighbours
+        lbl = node_labels[neighbour]
+        # `rowinds` lists rows of column `node`, so read the weight from that column.
+        label_weights[lbl] += adj[neighbour, node]
+    end
+
+    old_lbl = node_labels[node]
+    label_weights[old_lbl]+=zero(W) # Make sure at least one entry in the weights
+    new_lbl, weight = first(most_common(label_weights, 1))
+    if weight==0 # No connection
+        return old_lbl
+    else
+        return new_lbl
+    end
+end
+
diff --git a/test/chinesewhispers.jl b/test/chinesewhispers.jl
new file mode 100644
index 00000000..1be373ec
--- /dev/null
+++ b/test/chinesewhispers.jl
@@ -0,0 +1,66 @@
+using Base.Test
+using Distances
+using Clustering
+
+
+@testset "basic seperated graph" begin
+    eg1 = [
+        0 1 0 1;
+        1 0 0 0;
+        0 0 0 0;
+        1 0 0 0;
+    ]
+    @testset "$(first(vv))" for vv in [("dense", eg1), ("sparse", sparse(eg1))]
+        eg = last(vv)
+        res = chinese_whispers(eg)
+        lbls = assignments(res)
+        @test lbls[3] != lbls[1]
+        @test lbls[3] != lbls[2]
+        @test lbls[3] != lbls[4]
+
+        @test nclusters(res) >= 2
+        @test sum(counts(res)) == 4
+    end
+end
+
+@testset "planar based" begin
+    srand(1) # make deterministic
+    coordersA = randn(10, 2)
+    coordersB = randn(10, 2) .+ [5 5]
+
+    coords = [coordersA; coordersB]';
+
+    adj = 1./pairwise(Euclidean(), coords)
+    adj[isinf.(adj)]=0 # no selfsim
+    adj[rand(size(adj)).<0.6]=0 # remove some connections
+
+    res = chinese_whispers(adj)
+    lbls = assignments(res)
+    @test all(lbls[1].==(lbls[1:10]))
+    @test all(lbls[20].==(lbls[11:20]))
+
+    @test nclusters(res) == 2
+    @test counts(res) == [10, 10]
+end
+
+
+@testset "acts the same for all types" begin
+    examples = [
+        sprand(500,500,0.3),
+        sprand(1500,1500,0.1).>0.5, # Boolean elements
+        rand(200, 200)
+    ]
+    function test_assignments(x)
+        srand(1)
+        assignments(chinese_whispers(x))
+    end
+
+    for eg in (examples)
+        eg = collect(Symmetric(eg))
+        dense_res = test_assignments(eg)
+        sparse_res = test_assignments(sparse(eg))
+        symetric_res = test_assignments(Symmetric(eg))
+
+        @test dense_res == sparse_res == symetric_res
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index b04295d9..bb9e0604 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,6 @@
 include("../src/Clustering.jl")
 using Compat
+using Base.Test
 
 tests = ["seeding",
          "kmeans",
@@ -11,11 +12,14 @@ tests = ["seeding",
          "varinfo",
          "randindex",
          "hclust",
-         "mcl"]
+         "mcl",
+         "chinesewhispers"
+         ]
 
 println("Runing tests:")
 for t in tests
     fp = "$(t).jl"
-    println("* $fp ...")
-    include(fp)
+    @testset "$t" begin
+        include(fp)
+    end
 end