Convert Categorical to a type alias of Generic

JuliaStats · Jul 13, 2017 · 11a576e · 11a576e
1 parent 78195a8
commit 11a576e
Show file tree

Hide file tree

Showing 7 changed files with 82 additions and 137 deletions.
diff --git a/src/samplers.jl b/src/samplers.jl
@@ -8,7 +8,8 @@ for fname in ["categorical.jl",
               "gamma.jl",
               "multinomial.jl",
               "vonmises.jl",
-              "vonmisesfisher.jl"]
+              "vonmisesfisher.jl",
+              "generic.jl"]
 
     include(joinpath("samplers", fname))
 end
diff --git a/src/samplers/generic.jl b/src/samplers/generic.jl
@@ -1,16 +1,16 @@
-struct GenericSampler{T<:Real} <: Sampleable{Univariate,Discrete}
-    vals::Vector{T}
+struct GenericSampler{T<:Real, S<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
+    support::S
     aliastable::AliasTable
 
-    GenericSampler{T}(vals::Vector{T}, probs::Vector{<:Real}) where T<:Real =
-        new(vals, AliasTable(probs))
+    GenericSampler{T,S}(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
+        new(support, AliasTable(probs))
 end
 
-GenericSampler(vals::Vector{T}, probs::Vector{<:Real}) where T<:Real =
-    GenericSampler{T}(vals, probs)
+GenericSampler(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
+    GenericSampler{T,S}(support, probs)
 
 sampler(d::Generic) =
-    GenericSampler(d.vals, d.probs)
+    GenericSampler(d.support, d.p)
 
 rand(s::GenericSampler) =
-    (@inbounds v = s.vals[rand(s.aliastable)]; v)
+    (@inbounds v = s.support[rand(s.aliastable)]; v)
diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl
@@ -14,55 +14,36 @@ External links:
 * [Categorical distribution on Wikipedia](http://en.wikipedia.org/wiki/Categorical_distribution)
 """
 
-immutable Categorical{T<:Real} <: DiscreteUnivariateDistribution
-    K::Int
-    p::Vector{T}
+Categorical{T} = Generic{Int,T,UnitRange{Int}}
 
-    (::Type{Categorical{T}}){T}(p::Vector{T}, ::NoArgCheck) = new{T}(length(p), p)
+(::Type{Categorical})(p::Vector{P}, ::NoArgCheck) where P =
+    Generic{Int,P,UnitRange{Int}}(1:length(p), p, NoArgCheck())
 
-    function (::Type{Categorical{T}}){T}(p::Vector{T})
-        @check_args(Categorical, isprobvec(p))
-        new{T}(length(p), p)
-    end
+function (::Type{Categorical})(p::Vector{P}) where P
+    @check_args(Generic, isprobvec(p))
+    Generic{Int,P,UnitRange{Int}}(1:length(p), p, NoArgCheck())
+end
 
-    function (::Type{Categorical{T}}){T}(k::Integer)
-        @check_args(Categorical, k >= 1)
-        new{T}(k, fill(1/k, k))
-    end
+function (::Type{Categorical})(k::Integer)
+    @check_args(Generic, k >= 1)
+    Generic{Int,Float64,UnitRange{Int}}(1:k, fill(1/k, k), NoArgCheck())
 end
 
-Categorical{T<:Real}(p::Vector{T}, ::NoArgCheck) = Categorical{T}(p, NoArgCheck())
-Categorical{T<:Real}(p::Vector{T}) = Categorical{T}(p)
-Categorical(k::Integer) = Categorical{Float64}(k)
 
-@distr_support Categorical 1 d.K
+@distr_support Categorical 1 support(d).stop
 
 ### Conversions
 
 convert{T<:Real, S<:Real}(::Type{Categorical{T}}, p::Vector{S}) = Categorical(Vector{T}(p))
-convert{T<:Real, S<:Real}(::Type{Categorical{T}}, d::Categorical{S}) = Categorical(Vector{T}(d.p))
 
 ### Parameters
 
-ncategories(d::Categorical) = d.K
-probs(d::Categorical) = d.p
-params(d::Categorical) = (d.p,)
+ncategories(d::Categorical) = support(d).stop
+params(d::Categorical) = (probs(d),)
 @inline partype{T<:Real}(d::Categorical{T}) = T
 
-
 ### Statistics
 
-function categorical_mean{T<:Real}(p::AbstractArray{T})
-    k = length(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += p[i] * i
-    end
-    s
-end
-
-mean(d::Categorical) = categorical_mean(d.p)
-
 function median{T<:Real}(d::Categorical{T})
     k = ncategories(d)
     p = probs(d)
@@ -75,42 +56,6 @@ function median{T<:Real}(d::Categorical{T})
     i
 end
 
-function var{T<:Real}(d::Categorical{T})
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += abs2(i - m) * p[i]
-    end
-    s
-end
-
-function skewness{T<:Real}(d::Categorical{T})
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += (i - m)^3 * p[i]
-    end
-    v = var(d)
-    s / (v * sqrt(v))
-end
-
-function kurtosis{T<:Real}(d::Categorical{T})
-    k = ncategories(d)
-    p = probs(d)
-    m = categorical_mean(p)
-    s = zero(T)
-    for i = 1:k
-        @inbounds s += (i - m)^4 * p[i]
-    end
-    s / abs2(var(d)) - 3
-end
-
-entropy(d::Categorical) = entropy(d.p)
-
 function mgf{T<:Real}(d::Categorical{T}, t::Real)
     k = ncategories(d)
     p = probs(d)
@@ -131,22 +76,6 @@ function cf{T<:Real}(d::Categorical{T}, t::Real)
     s
 end
 
-mode(d::Categorical) = indmax(probs(d))
-
-function modes(d::Categorical)
-    K = ncategories(d)
-    p = probs(d)
-    maxp = maximum(p)
-    r = Vector{Int}(0)
-    for k = 1:K
-        @inbounds if p[k] == maxp
-            push!(r, k)
-        end
-    end
-    r
-end
-
-
 ### Evaluation
 
 function cdf{T<:Real}(d::Categorical{T}, x::Int)
@@ -161,17 +90,15 @@ function cdf{T<:Real}(d::Categorical{T}, x::Int)
     return c
 end
 
-pdf{T<:Real}(d::Categorical{T}, x::Int) = insupport(d, x) ? d.p[x] : zero(T)
-
-logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(d.p[x]) : -Inf
+pdf{T<:Real}(d::Categorical{T}, x::Int) = insupport(d, x) ? probs(d)[x] : zero(T)
 
-pdf(d::Categorical) = copy(d.p)
+logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(probs(d)[x]) : -Inf
 
 function _pdf!{T<:Real}(r::AbstractArray, d::Categorical{T}, rgn::UnitRange)
     vfirst = round(Int, first(rgn))
     vlast = round(Int, last(rgn))
     vl = max(vfirst, 1)
-    vr = min(vlast, d.K)
+    vr = min(vlast, ncategories(d))
     p = probs(d)
     if vl > vfirst
         for i = 1:(vl - vfirst)
@@ -191,20 +118,6 @@ function _pdf!{T<:Real}(r::AbstractArray, d::Categorical{T}, rgn::UnitRange)
 end
 
 
-function quantile(d::Categorical, p::Float64)
-    0 <= p <= 1 || throw(DomainError())
-    k = ncategories(d)
-    pv = probs(d)
-    i = 1
-    v = pv[1]
-    while v < p && i < k
-        i += 1
-        @inbounds v += pv[i]
-    end
-    i
-end
-
-
 # sampling
 
 sampler(d::Categorical) = AliasTable(d.p)

diff --git a/src/univariate/discrete/generic.jl b/src/univariate/discrete/generic.jl
@@ -1,6 +1,6 @@
 struct Generic{T<:Real,P<:Real,S<:AbstractVector{T}} <: DiscreteUnivariateDistribution
-    vals::S
-    probs::Vector{P}
+    support::S
+    p::Vector{P}
 
     Generic{T,P,S}(vs::S, ps::Vector{P}, ::NoArgCheck) where {T<:Real,P<:Real,S<:AbstractVector{T}} =
         new(vs, ps)
@@ -16,19 +16,27 @@ end
 Generic(vs::S, ps::Vector{P}) where {T<:Real,P<:Real,S<:AbstractVector{T}} =
     Generic{T,P,S}(vs, ps)
 
-support(d::Generic) = d.vals
-probs(d::Generic)  = d.probs
-params(d::Generic) = (d.vals, d.probs)
+# Conversion
+convert(::Type{Generic{T,P,S}}, d::Generic) where {T,P,S} =
+    Generic{T,P,S}(S(support(d)), Vector{P}(probs(d)), NoArgCheck())
+
+params(d::Generic) = (d.support, d.p)
+support(d::Generic) = d.support
+probs(d::Generic)  = d.p
 
 rand(d::Generic{T,P}) where {T,P} =
-    d.vals[searchsortedfirst(cumsum(d.probs), rand(P))]
+    support(d)[searchsortedfirst(cumsum(probs(d)), rand(P))] #TODO: Switch to single pass
+
+# Evaluation
+
+pdf(d::Generic) = copy(probs(d))
 
 # Helper functions for pdf and cdf required to fix ambiguous method
 # error involving [pc]df(::DisceteUnivariateDistribution, ::Int)
 function _pdf(d::Generic{T,P}, x::T) where {T,P}
-    idx_range = searchsorted(d.vals, x)
+    idx_range = searchsorted(support(d), x)
     if length(idx_range) > 0
-        return d.probs[first(idx_range)]
+        return probs(d)[first(idx_range)]
     else
         return zero(P)
     end
@@ -37,23 +45,36 @@ pdf(d::Generic{T}, x::Int) where T  = _pdf(d, convert(T, x))
 pdf(d::Generic{T}, x::Real) where T = _pdf(d, convert(T, x))
 
 _cdf(d::Generic{T}, x::T) where T =
-    sum(d.probs[1:searchsortedlast(d.vals, x)])
+    sum(probs(d)[1:searchsortedlast(support(d), x)]) #TODO: Switch to single-pass
 cdf(d::Generic{T}, x::Int) where T = _cdf(d, convert(T, x))
 cdf(d::Generic{T}, x::Real) where T = _cdf(d, convert(T, x))
 
-quantile(d::Generic, q::Real) =
-    d.vals[searchsortedfirst(cumsum(d.probs), q)]
+function quantile(d::Generic, q::Real)
+    0 <= q <= 1 || throw(DomainError())
+    x = support(d)
+    p = probs(d)
+    k = length(x)
+    i = 1
+    cp = p[1]
+    while cp < q && i < k #Note: is i < k necessary?
+        i += 1
+        @inbounds cp += p[i]
+    end
+    x[i]
+end
+
 
-minimum(d::Generic) = d.vals[1]
-maximum(d::Generic) = d.vals[end]
+minimum(d::Generic) = support(d)[1]
+maximum(d::Generic) = support(d)[end]
 insupport(d::Generic, x::Real) =
-    length(searchsorted(d.vals, x)) > 0
+    length(searchsorted(support(d), x)) > 0
 
-mean(d::Generic) = dot(d.probs, d.vals)
+mean(d::Generic) = dot(probs(d), support(d))
 
 function var(d::Generic{T}) where T
     m = mean(d)
-    x, p = params(d)
+    x = support(d)
+    p = probs(d)
     k = length(x)
     σ² = zero(T)
     for i in 1:k
@@ -64,7 +85,8 @@ end
 
 function skewness(d::Generic{T}) where T
     m = mean(d)
-    x, p = params(d)
+    x = support(d)
+    p = probs(d)
     k = length(x)
     μ₃ = zero(T)
     σ² = zero(T)
@@ -79,7 +101,8 @@ end
 
 function kurtosis(d::Generic{T}) where T
     m = mean(d)
-    x, p = params(d)
+    x = support(d)
+    p = probs(d)
     k = length(x)
     μ₄ = zero(T)
     σ² = zero(T)
@@ -92,12 +115,13 @@ function kurtosis(d::Generic{T}) where T
     μ₄ / abs2(σ²) - 3
 end
 
-entropy(d::Generic) = entropy(d.probs)
-entropy(d::Generic, b::Real) = entropy(d.probs, b)
+entropy(d::Generic) = entropy(probs(d))
+entropy(d::Generic, b::Real) = entropy(probs(d), b)
 
-mode(d::Generic) = d.vals[indmax(d.probs)]
+mode(d::Generic) = support(d)[indmax(probs(d))]
 function modes(d::Generic{T,P}) where {T,P}
-    x, p = params(d)
+    x = support(d)
+    p = probs(d)
     k = length(x)
     mds = T[]
     max_p = zero(P)
@@ -113,3 +137,10 @@ function modes(d::Generic{T,P}) where {T,P}
     end
     mds
 end
+
+mgf(d::Generic) = nothing # TODO
+cf(d::Generic) = nothing # TODO
+
+# TODO: Sufficient statistics
+
+# TODO: MLE
diff --git a/src/univariates.jl b/src/univariates.jl
@@ -329,9 +329,9 @@ const discrete_distributions = [
     "bernoulli",
     "betabinomial",
     "binomial",
-    "categorical",
     "discreteuniform",
     "generic",
+    "categorical",
     "geometric",
     "hypergeometric",
     "negativebinomial",

diff --git a/test/fit.jl b/test/fit.jl
@@ -106,7 +106,7 @@ h = Float64[countnz(x .== i) for i = 1 : 3]
 
 d = fit(Categorical, (3, x))
 @test isa(d, Categorical)
-@test d.K == 3
+@test ncategories(d) == 3
 @test probs(d) ≈ h / sum(h)
 
 d2 = fit(Categorical, x)

diff --git a/test/generic.jl b/test/generic.jl
@@ -38,7 +38,7 @@ d = Generic([40., 80., 120., -60.],
 xs = support(d)
 ws = ProbabilityWeights(probs(d))
 @test mean(d) ≈ mean(xs, ws)
-@test var(d) ≈ var(xs, ws)
+@test var(d) ≈ var(xs, ws, corrected=false)
 @test skewness(d) ≈ skewness(xs, ws)
 @test kurtosis(d) ≈ kurtosis(xs, ws)
 @test entropy(d) ≈ 1.2798542258336676