Skip to content

Commit

Permalink
Convert Categorical to a type alias of Generic
Browse files Browse the repository at this point in the history
  • Loading branch information
Gord Stephen committed Jul 13, 2017
1 parent 78195a8 commit 11a576e
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 137 deletions.
3 changes: 2 additions & 1 deletion src/samplers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ for fname in ["categorical.jl",
"gamma.jl",
"multinomial.jl",
"vonmises.jl",
"vonmisesfisher.jl"]
"vonmisesfisher.jl",
"generic.jl"]

include(joinpath("samplers", fname))
end
16 changes: 8 additions & 8 deletions src/samplers/generic.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
struct GenericSampler{T<:Real} <: Sampleable{Univariate,Discrete}
vals::Vector{T}
struct GenericSampler{T<:Real, S<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
support::S
aliastable::AliasTable

GenericSampler{T}(vals::Vector{T}, probs::Vector{<:Real}) where T<:Real =
new(vals, AliasTable(probs))
GenericSampler{T,S}(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
new(support, AliasTable(probs))
end

GenericSampler(vals::Vector{T}, probs::Vector{<:Real}) where T<:Real =
GenericSampler{T}(vals, probs)
GenericSampler(support::S, probs::Vector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
GenericSampler{T,S}(support, probs)

sampler(d::Generic) =
GenericSampler(d.vals, d.probs)
GenericSampler(d.support, d.p)

rand(s::GenericSampler) =
(@inbounds v = s.vals[rand(s.aliastable)]; v)
(@inbounds v = s.support[rand(s.aliastable)]; v)
119 changes: 16 additions & 103 deletions src/univariate/discrete/categorical.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,55 +14,36 @@ External links:
* [Categorical distribution on Wikipedia](http://en.wikipedia.org/wiki/Categorical_distribution)
"""

immutable Categorical{T<:Real} <: DiscreteUnivariateDistribution
K::Int
p::Vector{T}
Categorical{T} = Generic{Int,T,UnitRange{Int}}

(::Type{Categorical{T}}){T}(p::Vector{T}, ::NoArgCheck) = new{T}(length(p), p)
(::Type{Categorical})(p::Vector{P}, ::NoArgCheck) where P =
Generic{Int,P,UnitRange{Int}}(1:length(p), p, NoArgCheck())

function (::Type{Categorical{T}}){T}(p::Vector{T})
@check_args(Categorical, isprobvec(p))
new{T}(length(p), p)
end
function (::Type{Categorical})(p::Vector{P}) where P
@check_args(Generic, isprobvec(p))
Generic{Int,P,UnitRange{Int}}(1:length(p), p, NoArgCheck())
end

function (::Type{Categorical{T}}){T}(k::Integer)
@check_args(Categorical, k >= 1)
new{T}(k, fill(1/k, k))
end
function (::Type{Categorical})(k::Integer)
@check_args(Generic, k >= 1)
Generic{Int,Float64,UnitRange{Int}}(1:k, fill(1/k, k), NoArgCheck())
end

Categorical{T<:Real}(p::Vector{T}, ::NoArgCheck) = Categorical{T}(p, NoArgCheck())
Categorical{T<:Real}(p::Vector{T}) = Categorical{T}(p)
Categorical(k::Integer) = Categorical{Float64}(k)

@distr_support Categorical 1 d.K
@distr_support Categorical 1 support(d).stop

### Conversions

convert{T<:Real, S<:Real}(::Type{Categorical{T}}, p::Vector{S}) = Categorical(Vector{T}(p))
convert{T<:Real, S<:Real}(::Type{Categorical{T}}, d::Categorical{S}) = Categorical(Vector{T}(d.p))

### Parameters

ncategories(d::Categorical) = d.K
probs(d::Categorical) = d.p
params(d::Categorical) = (d.p,)
ncategories(d::Categorical) = support(d).stop
params(d::Categorical) = (probs(d),)
@inline partype{T<:Real}(d::Categorical{T}) = T


### Statistics

function categorical_mean{T<:Real}(p::AbstractArray{T})
k = length(p)
s = zero(T)
for i = 1:k
@inbounds s += p[i] * i
end
s
end

mean(d::Categorical) = categorical_mean(d.p)

function median{T<:Real}(d::Categorical{T})
k = ncategories(d)
p = probs(d)
Expand All @@ -75,42 +56,6 @@ function median{T<:Real}(d::Categorical{T})
i
end

function var{T<:Real}(d::Categorical{T})
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += abs2(i - m) * p[i]
end
s
end

function skewness{T<:Real}(d::Categorical{T})
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^3 * p[i]
end
v = var(d)
s / (v * sqrt(v))
end

function kurtosis{T<:Real}(d::Categorical{T})
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^4 * p[i]
end
s / abs2(var(d)) - 3
end

entropy(d::Categorical) = entropy(d.p)

function mgf{T<:Real}(d::Categorical{T}, t::Real)
k = ncategories(d)
p = probs(d)
Expand All @@ -131,22 +76,6 @@ function cf{T<:Real}(d::Categorical{T}, t::Real)
s
end

mode(d::Categorical) = indmax(probs(d))

function modes(d::Categorical)
K = ncategories(d)
p = probs(d)
maxp = maximum(p)
r = Vector{Int}(0)
for k = 1:K
@inbounds if p[k] == maxp
push!(r, k)
end
end
r
end


### Evaluation

function cdf{T<:Real}(d::Categorical{T}, x::Int)
Expand All @@ -161,17 +90,15 @@ function cdf{T<:Real}(d::Categorical{T}, x::Int)
return c
end

pdf{T<:Real}(d::Categorical{T}, x::Int) = insupport(d, x) ? d.p[x] : zero(T)

logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(d.p[x]) : -Inf
pdf{T<:Real}(d::Categorical{T}, x::Int) = insupport(d, x) ? probs(d)[x] : zero(T)

pdf(d::Categorical) = copy(d.p)
logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(probs(d)[x]) : -Inf

function _pdf!{T<:Real}(r::AbstractArray, d::Categorical{T}, rgn::UnitRange)
vfirst = round(Int, first(rgn))
vlast = round(Int, last(rgn))
vl = max(vfirst, 1)
vr = min(vlast, d.K)
vr = min(vlast, ncategories(d))
p = probs(d)
if vl > vfirst
for i = 1:(vl - vfirst)
Expand All @@ -191,20 +118,6 @@ function _pdf!{T<:Real}(r::AbstractArray, d::Categorical{T}, rgn::UnitRange)
end


function quantile(d::Categorical, p::Float64)
0 <= p <= 1 || throw(DomainError())
k = ncategories(d)
pv = probs(d)
i = 1
v = pv[1]
while v < p && i < k
i += 1
@inbounds v += pv[i]
end
i
end


# sampling

sampler(d::Categorical) = AliasTable(d.p)
Expand Down
75 changes: 53 additions & 22 deletions src/univariate/discrete/generic.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
struct Generic{T<:Real,P<:Real,S<:AbstractVector{T}} <: DiscreteUnivariateDistribution
vals::S
probs::Vector{P}
support::S
p::Vector{P}

Generic{T,P,S}(vs::S, ps::Vector{P}, ::NoArgCheck) where {T<:Real,P<:Real,S<:AbstractVector{T}} =
new(vs, ps)
Expand All @@ -16,19 +16,27 @@ end
Generic(vs::S, ps::Vector{P}) where {T<:Real,P<:Real,S<:AbstractVector{T}} =
Generic{T,P,S}(vs, ps)

support(d::Generic) = d.vals
probs(d::Generic) = d.probs
params(d::Generic) = (d.vals, d.probs)
# Conversion
convert(::Type{Generic{T,P,S}}, d::Generic) where {T,P,S} =
Generic{T,P,S}(S(support(d)), Vector{P}(probs(d)), NoArgCheck())

params(d::Generic) = (d.support, d.p)
support(d::Generic) = d.support
probs(d::Generic) = d.p

rand(d::Generic{T,P}) where {T,P} =
d.vals[searchsortedfirst(cumsum(d.probs), rand(P))]
support(d)[searchsortedfirst(cumsum(probs(d)), rand(P))] #TODO: Switch to single pass

# Evaluation

pdf(d::Generic) = copy(probs(d))

# Helper functions for pdf and cdf required to fix ambiguous method
# error involving [pc]df(::DisceteUnivariateDistribution, ::Int)
function _pdf(d::Generic{T,P}, x::T) where {T,P}
idx_range = searchsorted(d.vals, x)
idx_range = searchsorted(support(d), x)
if length(idx_range) > 0
return d.probs[first(idx_range)]
return probs(d)[first(idx_range)]
else
return zero(P)
end
Expand All @@ -37,23 +45,36 @@ pdf(d::Generic{T}, x::Int) where T = _pdf(d, convert(T, x))
pdf(d::Generic{T}, x::Real) where T = _pdf(d, convert(T, x))

_cdf(d::Generic{T}, x::T) where T =
sum(d.probs[1:searchsortedlast(d.vals, x)])
sum(probs(d)[1:searchsortedlast(support(d), x)]) #TODO: Switch to single-pass
cdf(d::Generic{T}, x::Int) where T = _cdf(d, convert(T, x))
cdf(d::Generic{T}, x::Real) where T = _cdf(d, convert(T, x))

quantile(d::Generic, q::Real) =
d.vals[searchsortedfirst(cumsum(d.probs), q)]
function quantile(d::Generic, q::Real)
0 <= q <= 1 || throw(DomainError())
x = support(d)
p = probs(d)
k = length(x)
i = 1
cp = p[1]
while cp < q && i < k #Note: is i < k necessary?
i += 1
@inbounds cp += p[i]
end
x[i]
end


minimum(d::Generic) = d.vals[1]
maximum(d::Generic) = d.vals[end]
minimum(d::Generic) = support(d)[1]
maximum(d::Generic) = support(d)[end]
insupport(d::Generic, x::Real) =
length(searchsorted(d.vals, x)) > 0
length(searchsorted(support(d), x)) > 0

mean(d::Generic) = dot(d.probs, d.vals)
mean(d::Generic) = dot(probs(d), support(d))

function var(d::Generic{T}) where T
m = mean(d)
x, p = params(d)
x = support(d)
p = probs(d)
k = length(x)
σ² = zero(T)
for i in 1:k
Expand All @@ -64,7 +85,8 @@ end

function skewness(d::Generic{T}) where T
m = mean(d)
x, p = params(d)
x = support(d)
p = probs(d)
k = length(x)
μ₃ = zero(T)
σ² = zero(T)
Expand All @@ -79,7 +101,8 @@ end

function kurtosis(d::Generic{T}) where T
m = mean(d)
x, p = params(d)
x = support(d)
p = probs(d)
k = length(x)
μ₄ = zero(T)
σ² = zero(T)
Expand All @@ -92,12 +115,13 @@ function kurtosis(d::Generic{T}) where T
μ₄ / abs2(σ²) - 3
end

entropy(d::Generic) = entropy(d.probs)
entropy(d::Generic, b::Real) = entropy(d.probs, b)
entropy(d::Generic) = entropy(probs(d))
entropy(d::Generic, b::Real) = entropy(probs(d), b)

mode(d::Generic) = d.vals[indmax(d.probs)]
mode(d::Generic) = support(d)[indmax(probs(d))]
function modes(d::Generic{T,P}) where {T,P}
x, p = params(d)
x = support(d)
p = probs(d)
k = length(x)
mds = T[]
max_p = zero(P)
Expand All @@ -113,3 +137,10 @@ function modes(d::Generic{T,P}) where {T,P}
end
mds
end

mgf(d::Generic) = nothing # TODO
cf(d::Generic) = nothing # TODO

# TODO: Sufficient statistics

# TODO: MLE
2 changes: 1 addition & 1 deletion src/univariates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,9 +329,9 @@ const discrete_distributions = [
"bernoulli",
"betabinomial",
"binomial",
"categorical",
"discreteuniform",
"generic",
"categorical",
"geometric",
"hypergeometric",
"negativebinomial",
Expand Down
2 changes: 1 addition & 1 deletion test/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ h = Float64[countnz(x .== i) for i = 1 : 3]

d = fit(Categorical, (3, x))
@test isa(d, Categorical)
@test d.K == 3
@test ncategories(d) == 3
@test probs(d) h / sum(h)

d2 = fit(Categorical, x)
Expand Down
2 changes: 1 addition & 1 deletion test/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ d = Generic([40., 80., 120., -60.],
xs = support(d)
ws = ProbabilityWeights(probs(d))
@test mean(d) mean(xs, ws)
@test var(d) var(xs, ws)
@test var(d) var(xs, ws, corrected=false)
@test skewness(d) skewness(xs, ws)
@test kurtosis(d) kurtosis(xs, ws)
@test entropy(d) 1.2798542258336676
Expand Down

0 comments on commit 11a576e

Please sign in to comment.