JuliaStats · matbesancon · Nov 4, 2018 · Jul 9, 2017 · Jul 9, 2017 · Jul 11, 2017
diff --git a/docs/src/univariate.md b/docs/src/univariate.md
@@ -144,6 +144,7 @@ BetaBinomial
 Binomial
 Categorical
 DiscreteUniform
+DiscreteNonParametric
 Geometric
 Hypergeometric
 NegativeBinomial

diff --git a/src/Distributions.jl b/src/Distributions.jl
@@ -86,6 +86,7 @@ export
     FullNormal,
     FullNormalCanon,
     Gamma,
+    DiscreteNonParametric,
     GeneralizedPareto,
     GeneralizedExtremeValue,
     Geometric,

diff --git a/src/samplers.jl b/src/samplers.jl
@@ -1,14 +1,16 @@
 # delegation of samplers
 
-for fname in ["categorical.jl",
+for fname in ["aliastable.jl",
               "binomial.jl",
               "poissonbinomial.jl",
               "poisson.jl",
               "exponential.jl",
               "gamma.jl",
               "multinomial.jl",
               "vonmises.jl",
-              "vonmisesfisher.jl"]
+              "vonmisesfisher.jl",
+              "discretenonparametric.jl",
+              "categorical.jl"]
 
     include(joinpath("samplers", fname))
 end
diff --git a/src/samplers/aliastable.jl b/src/samplers/aliastable.jl
@@ -0,0 +1,25 @@
+struct AliasTable{S} <: Sampleable{Univariate,Discrete}
+    accept::Vector{Float64}  # accept[i]: threshold a uniform draw must fall below to keep slot i
+    alias::Vector{Int}  # alias[i]: index returned instead when slot i is not kept
+    isampler::S  # sampler used to draw the slot index i (built over 1:n by the constructor)
+end
+ncategories(s::AliasTable) = length(s.accept)
+
+function AliasTable(probs::AbstractVector{T}) where T<:Real
+    len = length(probs)  # number of categories
+    len > 0 || throw(ArgumentError("The input probability vector is empty."))
+    acceptance = Vector{Float64}(undef, len)  # filled in place by make_alias_table!
+    aliases = Vector{Int}(undef, len)  # filled in place by make_alias_table!
+    StatsBase.make_alias_table!(probs, 1.0, acceptance, aliases)
+    return AliasTable(acceptance, aliases, Random.RangeGenerator(1:len))
+end
+
+function rand(rng::AbstractRNG, s::AliasTable)
+    i = rand(rng, s.isampler) % Int  # draw the slot from the caller's rng, not the global one
+    u = rand(rng)  # uniform draw from the same rng, so a seeded rng gives reproducible samples
+    @inbounds r = u < s.accept[i] ? i : s.alias[i]  # keep slot i, or fall through to its alias
+    return r
+end
+rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)
+
+show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))
diff --git a/src/samplers/categorical.jl b/src/samplers/categorical.jl
@@ -4,7 +4,7 @@ struct CategoricalDirectSampler <: Sampleable{Univariate,Discrete}
     prob::Vector{Float64}
 
     function CategoricalDirectSampler(p::Vector{Float64})
-        isempty(p) && error("p is empty.")
+        isempty(p) && throw(ArgumentError("p is empty."))
         new(p)
     end
 end
@@ -22,32 +22,3 @@ function rand(s::CategoricalDirectSampler)
     return i
 end
 
-
-##### Alias Table #####
-
-struct AliasTable{S} <: Sampleable{Univariate,Discrete}
-    accept::Vector{Float64}
-    alias::Vector{Int}
-    isampler::S
-end
-ncategories(s::AliasTable) = length(s.accept)
-
-function AliasTable(probs::AbstractVector{T}) where T<:Real
-    n = length(probs)
-    n > 0 || error("The input probability vector is empty.")
-    accp = Vector{Float64}(undef, n)
-    alias = Vector{Int}(undef, n)
-    StatsBase.make_alias_table!(probs, 1.0, accp, alias)
-    AliasTable(accp, alias, Random.RangeGenerator(1:n))
-end
-
-function rand(rng::AbstractRNG, s::AliasTable)
-    i = rand(GLOBAL_RNG, s.isampler) % Int
-    u = rand()
-    @inbounds r = u < s.accept[i] ? i : s.alias[i]
-    r
-end
-rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)
-
-show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))
-
diff --git a/src/samplers/discretenonparametric.jl b/src/samplers/discretenonparametric.jl
@@ -0,0 +1,19 @@
+"""
+    DiscreteNonParametricSampler(xs, ps)
+
+Data structure for efficiently sampling from an arbitrary probability mass
+function defined by support `xs` and probabilities `ps`.
+"""
+struct DiscreteNonParametricSampler{T<:Real, S<:AbstractVector{T}, A<:AliasTable} <: Sampleable{Univariate,Discrete}
+    support::S  # values returned by sampling; indexed by the alias table's draws
+    aliastable::A  # parametrized so the field has a concrete type (AliasTable alone is a UnionAll)
+
+    DiscreteNonParametricSampler{T,S}(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
+        (table = AliasTable(probs); new{T,S,typeof(table)}(support, table))  # A is inferred from the built table
+end
+
+DiscreteNonParametricSampler(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
+    DiscreteNonParametricSampler{T,S}(support, probs)
+
+rand(rng::AbstractRNG, s::DiscreteNonParametricSampler) = (@inbounds v = s.support[rand(rng, s.aliastable)]; v)  # index drawn with rng, mapped to its support value
+rand(s::DiscreteNonParametricSampler) = rand(Random.GLOBAL_RNG, s)  # no rng given: use the global RNG, as before
diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl
@@ -13,60 +13,50 @@ probs(d)         # Get the probability vector, i.e. p
 ncategories(d)   # Get the number of categories, i.e. K
 ```
 Here, `p` must be a real vector, of which all components are nonnegative and sum to one.
+
 **Note:** The input vector `p` is directly used as a field of the constructed distribution, without being copied.
+
+`Categorical` is simply a type alias describing a special case of a
+`DiscreteNonParametric` distribution, so non-specialized methods defined for
+`DiscreteNonParametric` apply to `Categorical` as well.
+
 External links:
+
 * [Categorical distribution on Wikipedia](http://en.wikipedia.org/wiki/Categorical_distribution)
 """
-struct Categorical{T<:Real} <: DiscreteUnivariateDistribution
-    K::Int
-    p::Vector{T}
+Categorical{T} = DiscreteNonParametric{Int,T,UnitRange{Int}}
 
-    Categorical{T}(p::Vector{T}, ::NoArgCheck) where {T} = new{T}(length(p), p)
+Categorical{P}(p::Vector{P}, ::NoArgCheck) where P =  # unchecked: support is 1:length(p), p is passed through as given
+    Categorical{P}(1:length(p), p, NoArgCheck())
+Categorical(p::Vector{P}, ::NoArgCheck) where P = Categorical{P}(p, NoArgCheck())  # takes P from the element type of p
 
-    function Categorical{T}(p::Vector{T}) where T
-        @check_args(Categorical, isprobvec(p))
-        new{T}(length(p), p)
-    end
-
-    function Categorical{T}(k::Integer) where T
-        @check_args(Categorical, k >= 1)
-        new{T}(k, fill(1/k, k))
-    end
+function Categorical{P}(p::Vector{P}) where P
+    @check_args(DiscreteNonParametric, isprobvec(p))
+    Categorical{P}(1:length(p), p, NoArgCheck())
 end
+Categorical(p::Vector{P}) where P = Categorical{P}(p)
 
-Categorical(p::Vector{T}, ::NoArgCheck) where {T<:Real} = Categorical{T}(p, NoArgCheck())
-Categorical(p::Vector{T}) where {T<:Real} = Categorical{T}(p)
-Categorical(k::Integer) = Categorical{Float64}(k)
+function Categorical(k::Integer)  # k categories 1:k, each with probability 1/k
+    @check_args(DiscreteNonParametric, k >= 1)
+    Categorical{Float64}(1:k, fill(1/k, k), NoArgCheck())  # NOTE(review): 1:k and 1/k take their types from k — confirm a non-Int k (e.g. Int32, BigInt) is accepted here
+end
 
-@distr_support Categorical 1 d.K
+@distr_support Categorical 1 last(support(d))  # upper bound is the last element of the support range
 
 ### Conversions
 
 convert(::Type{Categorical{T}}, p::Vector{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(p))
-convert(::Type{Categorical{T}}, d::Categorical{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(d.p))
+convert(::Type{Categorical{T}}, d::Categorical{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(probs(d)))
 
 ### Parameters
 
-ncategories(d::Categorical) = d.K
-probs(d::Categorical) = d.p
-params(d::Categorical) = (d.p,)
+ncategories(d::Categorical) = last(support(d))  # support is 1:K, so its last element is K
+params(d::Categorical) = (probs(d),)
 @inline partype(d::Categorical{T}) where {T<:Real} = T
 
-
 ### Statistics
 
-function categorical_mean(p::AbstractArray{T}) where T<:Real
-    k = length(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += p[i] * i
-    end
-    s
-end
-
-mean(d::Categorical) = categorical_mean(d.p)
-
-function median(d::Categorical{T}) where T<:Real
+function median(d::Categorical{T}) where {T<:Real}
     k = ncategories(d)
     p = probs(d)
     cp = zero(T)
@@ -78,78 +68,6 @@ function median(d::Categorical{T}) where T<:Real
     i
 end
 
-function var(d::Categorical{T}) where T<:Real
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += abs2(i - m) * p[i]
-    end
-    s
-end
-
-function skewness(d::Categorical{T}) where T<:Real
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += (i - m)^3 * p[i]
-    end
-    v = var(d)
-    s / (v * sqrt(v))
-end
-
-function kurtosis(d::Categorical{T}) where T<:Real
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += (i - m)^4 * p[i]
-    end
-    s / abs2(var(d)) - 3
-end
-
-entropy(d::Categorical) = entropy(d.p)
-
-function mgf(d::Categorical{T}, t::Real) where T<:Real
-    k = ncategories(d)
-    p = probs(d)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += p[i] * exp(t)
-    end
-    s
-end
-
-function cf(d::Categorical{T}, t::Real) where T<:Real
-    k = ncategories(d)
-    p = probs(d)
-    s = zero(T) + zero(T)*im
-    for i = 1:k
-        @inbounds s += p[i] * cis(t)
-    end
-    s
-end
-
-mode(d::Categorical) = argmax(probs(d))
-
-function modes(d::Categorical)
-    K = ncategories(d)
-    p = probs(d)
-    maxp = maximum(p)
-    r = Vector{Int}()
-    for k = 1:K
-        @inbounds if p[k] == maxp
-            push!(r, k)
-        end
-    end
-    r
-end
-
-
 ### Evaluation
 
 function cdf(d::Categorical{T}, x::Int) where T<:Real
@@ -164,21 +82,31 @@ function cdf(d::Categorical{T}, x::Int) where T<:Real
     return c
 end
 
-pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? d.p[x] : zero(T)
+pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? probs(d)[x] : zero(T)
 
-logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(d.p[x]) : -Inf
+logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(probs(d)[x]) : -Inf
 
-function quantile(d::Categorical, p::Float64)
-    0 <= p <= 1 || throw(DomainError())
-    k = ncategories(d)
-    pv = probs(d)
-    i = 1
-    v = pv[1]
-    while v < p && i < k
-        i += 1
-        @inbounds v += pv[i]
+function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Real}
+    vfirst = round(Int, first(rgn))
+    vlast = round(Int, last(rgn))
+    vl = max(vfirst, 1)
+    vr = min(vlast, ncategories(d))
+    p = probs(d)
+    if vl > vfirst
+        for i = 1:(vl - vfirst)
+            r[i] = zero(T)
+        end
     end
-    i
+    fm1 = vfirst - 1
+    for v = vl:vr
+        r[v - fm1] = p[v]
+    end
+    if vr < vlast
+        for i = (vr - vfirst + 2):length(rgn)
+            r[i] = zero(T)
+        end
+    end
+    return r
 end
 
 
@@ -250,4 +178,4 @@ fit_mle(::Type{Categorical}, x::AbstractArray{T}, w::AbstractArray{Float64}) whe
 fit(::Type{Categorical}, data::CategoricalData) = fit_mle(Categorical, data)
 fit(::Type{Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = fit_mle(Categorical, data, w)
 
-==(c1::Categorical,c2::Categorical) = (c1.K == c2.K) && all(c1.p .== c2.p)
+==(c1::Categorical,c2::Categorical) = (support(c1) == support(c2)) && all(probs(c1) .== probs(c2))