diff --git a/src/fit/analytical.jl b/src/fit/analytical.jl index fc356f9..97dfff6 100644 --- a/src/fit/analytical.jl +++ b/src/fit/analytical.jl @@ -15,13 +15,13 @@ Assuming `n` dominates `p`, * iterative (conjugate gradient): O(κnp) - with κ the number of CG steps (κ ≤ p). """ -function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) +function _fit(::Type{T}, glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) where {T<:Real} # full solve if !solver.iterative λ = get_penalty_scale(glr, length(y)) if iszero(λ) # standard LS solution - return augment_X(X, glr.fit_intercept) \ y + return augment_X(T, X, glr.fit_intercept) \ y else # Ridge case -- form the Hat Matrix then solve H = form_XtX(X, glr.fit_intercept, λ, glr.penalize_intercept) @@ -43,3 +43,7 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) glr.fit_intercept && (b = vcat(b, sum(y))) return cg(Hm, b; maxiter=max_cg_steps) end + +function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) + return _fit(eltype(X), glr, solver, X, y, scratch) +end \ No newline at end of file diff --git a/src/fit/default.jl b/src/fit/default.jl index 459853c..6096de4 100644 --- a/src/fit/default.jl +++ b/src/fit/default.jl @@ -33,8 +33,8 @@ $SIGNATURES Fit a generalised linear regression model using an appropriate solver based on the loss and penalty of the model. A method can, in some cases, be specified. """ -function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing, - solver::Solver=_solver(glr, size(X))) +function fit(::Type{T}, glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing, + solver::Solver=_solver(glr, size(X))) where {T<:Real} if hasproperty(solver, :gram) && solver.gram # interpret X,y as X'X, X'y data = verify_or_construct_gramian(glr, X, y, data) @@ -44,9 +44,14 @@ function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing, check_nrows(X, y) n, p = size(X) c = getc(glr, y) - return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) + return _fit(T, glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) end end + +function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; kwargs...) + return fit(eltype(X), glr, X, y; kwargs...) +end + fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...) diff --git a/src/fit/iwls.jl b/src/fit/iwls.jl index 2a41372..84a23b8 100644 --- a/src/fit/iwls.jl +++ b/src/fit/iwls.jl @@ -1,5 +1,5 @@ -function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch - ) where {ρ} +function _fit(::Type{T}, glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch + ) where {ρ, T<:Real} n,p,_ = npc(scratch) _Mv! = Mv!(glr, X, y, scratch; threshold=solver.threshold) κ = solver.damping # between 0 and 1, 1 = fully take the new iteration @@ -34,3 +34,8 @@ function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch @warn "IWLS did not converge in $(solver.max_iter) iterations." return θ end + +function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, + solver::IWLSCG, X, y, scratch) where {ρ} + return _fit(eltype(X), glr, solver, X, y, scratch) +end diff --git a/src/fit/newton.jl b/src/fit/newton.jl index a3d4cd7..8d478bf 100644 --- a/src/fit/newton.jl +++ b/src/fit/newton.jl @@ -12,17 +12,22 @@ Fit a GLR using Newton's method. Assuming `n` dominates `p`, O(κnp²), dominated by the construction of the Hessian at each step with κ the number of Newton steps. 
""" -function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, - solver::Newton, X, y, scratch) +function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, + solver::Newton, X, y, scratch) where {T<:Real} _,p,_ = npc(scratch) - θ₀ = zeros(p) - _fgh! = fgh!(glr, X, y, scratch) + θ₀ = zeros(T, p) + _fgh! = fgh!(T, glr, X, y, scratch) opt = Optim.only_fgh!(_fgh!) res = Optim.optimize(opt, θ₀, Optim.Newton(; solver.newton_options...), solver.optim_options) return Optim.minimizer(res) end +function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, + solver::Newton, X, y, scratch) + return _fit(eltype(X), glr, solver, X, y, scratch) +end + """ $SIGNATURES @@ -35,13 +40,13 @@ Assuming `n` dominates `p`, O(κ₁κ₂np), dominated by the application of the Hessian at each step where κ₁ is the number of Newton steps and κ₂ is the average number of CG steps per Newton step (which is at most p). """ -function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, - solver::NewtonCG, X, y, scratch) +function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, + solver::NewtonCG, X, y, scratch) where {T<:Real} _,p,_ = npc(scratch) - θ₀ = zeros(p) + θ₀ = zeros(T, p) _f = objective(glr, X, y) - _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738 - _Hv! = Hv!(glr, X, y, scratch) + _fg! = (g, θ) -> fgh!(T, glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738 + _Hv! = Hv!(T, glr, X, y, scratch) opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀) res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion(; solver.newtoncg_options...), solver.optim_options) @@ -58,11 +63,11 @@ Fit a GLR using LBFGS. Assuming `n` dominates `p`, O(κnp), dominated by the computation of the gradient at each step with κ the number of LBFGS steps. """ -function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, - solver::LBFGS, X, y, scratch) +function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, + solver::LBFGS, X, y, scratch) where {T<:Real} _,p,_ = npc(scratch) - θ₀ = zeros(p) - _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ) + θ₀ = zeros(T, p) + _fg! = (f, g, θ) -> fgh!(T, glr, X, y, scratch)(f, g, nothing, θ) opt = Optim.only_fg!(_fg!) res = Optim.optimize(opt, θ₀, Optim.LBFGS(; solver.lbfgs_options...), solver.optim_options) @@ -85,19 +90,23 @@ computations are dominated by the application of the Hessian at each step with κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton step. """ -function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG, - X, y, scratch) +function _fit(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG, + X, y, scratch) where {T<:Real} _,p,c = npc(scratch) - θ₀ = zeros(p * c) + θ₀ = zeros(T, p * c) _f = objective(glr, X, y; c=c) - _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738 - _Hv! = Hv!(glr, X, y, scratch) + _fg! = (g, θ) -> fg!(T, glr, X, y, scratch)(T(0.0), g, θ) # XXX: Optim.jl/738 + _Hv! 
= Hv!(T, glr, X, y, scratch) opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀) res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion(; solver.newtoncg_options...), solver.optim_options) return Optim.minimizer(res) end +function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG, X, y, scratch) + return _fit(eltype(X), glr, solver, X, y, scratch) +end + """ $SIGNATURES @@ -109,13 +118,17 @@ Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated by the computation of the gradient at each step with κ the number of LBFGS steps. """ -function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS, - X, y, scratch) +function _fit(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS, + X, y, scratch) where {T<:Real} _,p,c = npc(scratch) - θ₀ = zeros(p * c) - _fg! = fg!(glr, X, y, scratch) + θ₀ = zeros(T, p * c) + _fg! = fg!(T, glr, X, y, scratch) opt = Optim.only_fg!(_fg!) res = Optim.optimize(opt, θ₀, Optim.LBFGS(; solver.lbfgs_options...), solver.optim_options) return Optim.minimizer(res) end + +function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS, X, y, scratch) + return _fit(eltype(X), glr, solver, X, y, scratch) +end diff --git a/src/fit/proxgrad.jl b/src/fit/proxgrad.jl index 1a04fe1..a84fd89 100644 --- a/src/fit/proxgrad.jl +++ b/src/fit/proxgrad.jl @@ -2,27 +2,27 @@ # Assumption: loss has gradient; penalty has prox e.g.: Lasso # J(θ) = f(θ) + r(θ) where f is smooth -function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) +function _fit(::Type{T}, glr::GLR, solver::ProxGrad, X, y, scratch) where {T<:Real} n,p,c = npc(scratch) c > 0 && (p *= c) # vector caches + eval cache - θ = zeros(p) # θ_k - Δθ = zeros(p) # (θ_k - θ_{k-1}) - θ̄ = zeros(p) # θ_k + ρ Δθ // extrapolation - ∇fθ̄ = zeros(p) - fθ̄ = 0.0 # useful for backtracking function - θ̂ = zeros(p) # candidate before becoming θ_k + θ = zeros(T, p) # θ_k + Δθ = zeros(T, p) # (θ_k - θ_{k-1}) + θ̄ = zeros(T, p) # θ_k + ρ Δθ // extrapolation + ∇fθ̄ = zeros(T, p) + fθ̄ = T(0.0) # useful for backtracking function + θ̂ = zeros(T, p) # candidate before becoming θ_k # cache for extrapolation constant and stepsizes - ω = 0.0 # ω_k - ω_ = 0.0 # ω_{k-1} - ω__ = 0.0 # ω_{k-2} - η = 1.0 # stepsize (1/L) - acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA) + ω = T(0.0) # ω_k + ω_ = T(0.0) # ω_{k-1} + ω__ = T(0.0) # ω_{k-2} + η = T(1.0) # stepsize (1/L) + acc = ifelse(solver.accel, T(1.0), T(0.0)) # if 0, no extrapolation (ISTA) # functions _f = if solver.gram - smooth_gram_objective(glr, X, y, n) + smooth_gram_objective(T, glr, X, y, n) else - smooth_objective(glr, X, y; c=c) + smooth_objective(T, glr, X, y; c=c) end _fg! = if solver.gram @@ -32,7 +32,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) end _prox! = prox!(glr, n) bt_cond = θ̂ -> - _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η) + _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (T(2)*η) # loop-related k, tol = 1, Inf while k ≤ solver.max_iter && tol > solver.tol @@ -43,7 +43,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) # for Linear Inverse Problems" (page 193) # -------------------------------------------------- # 1. linear extrapolation of past iterates - ω = (1.0 + sqrt(1.0 + 4.0 * ω_^2)) / 2.0 + ω = (T(1.0) + sqrt(T(1.0) + T(4.0) * ω_^T(2))) / T(2.0) ρ = acc * ω__ / ω # ω_{k-2}/ω; note that ρ != 0 only as k > 2 θ̄ .= θ + ρ * Δθ # 2. 
attempt a prox step, modify the step until verifies condition @@ -74,3 +74,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) "$(solver.max_iter) iterations." return θ end + +function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) + return _fit(eltype(X), glr, solver, X, y, scratch) +end \ No newline at end of file diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl index aaeccb2..76d3881 100644 --- a/src/fit/solvers.jl +++ b/src/fit/solvers.jl @@ -127,12 +127,12 @@ Proximal Gradient solver for non-smooth objective functions. backtracking step * `beta`: rate of shrinkage in the backtracking step (between 0 and 1) """ -@with_kw struct ProxGrad <: Solver +@with_kw struct ProxGrad{T<:Real} <: Solver accel::Bool = false # use Nesterov style acceleration (see also FISTA) max_iter::Int = 1000 # max number of overall iterations - tol::Float64 = 1e-4 # tol relative change of θ i.e. norm(θ-θ_)/norm(θ) + tol::T = 1e-4 # tol relative change of θ i.e. norm(θ-θ_)/norm(θ) max_inner::Int = 100 # β^max_inner should be > 1e-10 - beta::Float64 = 0.8 # in (0, 1); shrinkage in the backtracking step + beta::T = 0.8 # in (0, 1); shrinkage in the backtracking step gram::Bool = false # use precomputed Gramian for lsq where possible end @@ -156,12 +156,12 @@ computations. * `damping` (Float64): how much to trust iterates (1=full trust) * `threshold` (Float64): threshold for the residuals """ -@with_kw struct IWLSCG <: Solver +@with_kw struct IWLSCG{T<:Real} <: Solver max_iter::Int = 100 max_inner::Int = 200 - tol::Float64 = 1e-4 - damping::Float64 = 1.0 # should be between 0 and 1, 1 = trust iterates - threshold::Float64 = 1e-6 # thresh for residuals; used eg in quantile reg + tol::T = 1e-4 + damping::T = 1.0 # should be between 0 and 1, 1 = trust iterates + threshold::T = 1e-6 # thresh for residuals; used eg in quantile reg end # ===================== admm.jl diff --git a/src/glr/d_l2loss.jl b/src/glr/d_l2loss.jl index 1184c9f..51f31f9 100644 --- a/src/glr/d_l2loss.jl +++ b/src/glr/d_l2loss.jl @@ -12,21 +12,21 @@ # * Hv! 
used in iterative solution # --------------------------------------------------------- -function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch) +function Hv!(::Type{T}, glr::GLR{L2Loss,<:L2R}, X, y, scratch) where {T<:Real} n, p = size(X) λ = get_penalty_scale(glr, n) if glr.fit_intercept # H = [X 1]'[X 1] + λ I # rows a 1:p = [X'X + λI | X'1] # row e end = [1'X | n+λι] where ι is 1 if glr.penalize_intercept - ι = float(glr.penalize_intercept) + ι = T(glr.penalize_intercept) (Hv, v) -> begin # view on the first p rows a = 1:p Hvₐ = view(Hv, a) vₐ = view(v, a) Xt1 = view(scratch.p, a) - Xt1 .*= 0 + Xt1 .*= T(0) @inbounds for i in a, j in 1:n Xt1[i] += X[j, i] # -- X'1 end @@ -49,6 +49,10 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch) end end +function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch) + return Hv!(eltype(X), glr, X, y, scratch) +end + # ----------------------------- # # -- Lasso/Elnet Regression -- # # ----------------------------- # diff --git a/src/glr/d_logistic.jl b/src/glr/d_logistic.jl index 6ae90a1..d329236 100644 --- a/src/glr/d_logistic.jl +++ b/src/glr/d_logistic.jl @@ -11,7 +11,7 @@ # NOTE: https://github.com/JuliaAI/MLJLinearModels.jl/issues/104 # --------------------------------------------------------- -function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) +function fgh!(::Type{T}, glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) where {T<:Real} n, p = size(X) J = objective(glr, n) # GLR objective (loss+penalty) λ = get_penalty_scale(glr, n) @@ -34,12 +34,12 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) # ΛX but computing the full hessian allocates a lot anyway so # probably not really worth it t = scratch.n3 - t .= w .- w.^2 # σ(yXθ)(1-σ(yXθ)) + t .= w .- w.^T(2) # σ(yXθ)(1-σ(yXθ)) a = 1:p ΛX = t .* X # !! 
big allocs mul!(view(H, a, a), X', ΛX) # -- H[1:p,1:p] = X'ΛX ΛXt1 = view(scratch.p, a) - ΛXt1 .*= 0 + ΛXt1 .*= T(0) @inbounds for i in a, j in 1:n ΛXt1[i] += ΛX[j, i] # -- (ΛX)'1 end @@ -67,7 +67,7 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) end H === nothing || begin t = scratch.n3 - t .= w .- w.^2 + t .= w .- w.^T(2) mul!(H, X', t .* X) add_λI!(H, λ) end @@ -76,7 +76,11 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) end end -function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) +function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) + return fgh!(eltype(X), glr, X, y, scratch) +end + +function Hv!(::Type{T}, glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) where{T<:Real} n, p = size(X) λ = get_penalty_scale(glr, n) if glr.fit_intercept @@ -88,7 +92,7 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) apply_X!(Xθ, X, θ) # -- Xθ = apply_X(X, θ) w = scratch.n2 w .= σ.(Xθ .* y) # -- w = σ.(Xθ .* y) - w .-= w.^2 # -- w = w(1-w) + w .-= w.^T(2) # -- w = w(1-w) # view on the first p rows a = 1:p Hvₐ = view(Hv, a) @@ -112,7 +116,7 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) apply_X!(Xθ, X, θ) w = scratch.n2 w .= σ.(Xθ .* y) # -- σ(yXθ) - w .-= w.^2 + w .-= w.^T(2) Xv = scratch.n mul!(Xv, X, v) Xv .*= scratch.n2 # -- ΛXv @@ -122,6 +126,10 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) end end +function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch) + return Hv!(eltype(X), glr, X, y, scratch) +end + # ----------------------------------- # # -- L1/Elnet Logistic Regression -- # # ----------------------------------- # @@ -133,9 +141,13 @@ end # -> prox_r = soft-thresh # --------------------------------------------------------- -function smooth_fg!(glr::GLR{LogisticLoss,<:ENR}, X, y, scratch) +function smooth_fg!(::Type{T}, glr::GLR{LogisticLoss,<:ENR}, X, y, scratch) where {T<:Real} smooth = get_smooth(glr) - (g, θ) -> fgh!(smooth, X, y, scratch)(0.0, g, nothing, θ) + (g, θ) -> fgh!(T, smooth, X, y, scratch)(T(0.0), g, nothing, θ) +end + +function smooth_fg!(glr::GLR{LogisticLoss,<:ENR}, X, y, scratch) + return smooth_fg!(eltype(X), glr, X, y, scratch) end # ---------------------------------- # @@ -152,7 +164,7 @@ end # * yᵢ ∈ {1, 2, ..., c} # --------------------------------------------------------- -function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) +function fg!(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) where {T<:Real} n, p = size(X) c = getc(glr, y) λ = get_penalty_scale(glr, n) @@ -166,7 +178,7 @@ function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) ΛM .= M ./ sum(M, dims=2) # O(nc) store n * c Q = scratch.nc4 @inbounds for i = 1:n, j=1:c - Q[i, j] = ifelse(y[i] == j, 1.0, 0.0) + Q[i, j] = ifelse(y[i] == j, T(1.0), T(0.0)) end ∑ΛM = sum(ΛM, dims=1) ∑Q = sum(Q, dims=1) @@ -198,7 +210,7 @@ function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) ΛM = scratch.nc2 # note that _NC is already linked to P ΛM .= M ./ ems ss = sum(ΛM, dims=2) - t = 0.0 + t = T(0.0) @inbounds for i in eachindex(y) t += log(ss[i]) + ms[i] - P[i, y[i]] end @@ -207,7 +219,12 @@ function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) end end -function Hv!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) +function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) + return fg!(eltype(X), glr, X, y, scratch) +end + + +function Hv!(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch) where {T<:Real} n, p = size(X) λ = get_penalty_scale(glr, n) c = getc(glr, y) @@ -239,6 +256,10 @@ 
function Hv!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
     end
 end
 
+function Hv!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
+    return Hv!(eltype(X), glr, X, y, scratch)
+end
+
 # -------------------------------------- #
 #  -- L1/Elnet Multinomial Regression -- #
 # -------------------------------------- #
@@ -250,7 +271,11 @@ end
 #   -> prox_r = soft-thresh
 # ---------------------------------------------------------
 
-function smooth_fg!(glr::GLR{<:MultinomialLoss,<:ENR}, X, y, scratch)
+function smooth_fg!(::Type{T}, glr::GLR{<:MultinomialLoss,<:ENR}, X, y, scratch) where {T<:Real}
     smooth = get_smooth(glr)
-    (g, θ) -> fg!(smooth, X, y, scratch)(0.0, g, θ)
+    (g, θ) -> fg!(T, smooth, X, y, scratch)(T(0.0), g, θ)
+end
+
+function smooth_fg!(glr::GLR{<:MultinomialLoss,<:ENR}, X, y, scratch)
+    return smooth_fg!(eltype(X), glr, X, y, scratch)
 end
diff --git a/src/glr/d_robust.jl b/src/glr/d_robust.jl
index b9bc4d1..92a8ed0 100644
--- a/src/glr/d_robust.jl
+++ b/src/glr/d_robust.jl
@@ -11,8 +11,8 @@
 #  -> ∇²f(θ) = X'Λ(r)X + λI
 # ---------------------------------------------------------
 
-function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
-              ) where ρ <: RobustRho1P{δ} where δ
+function fgh!(::Type{T}, glr::GLR{RobustLoss{ρ},<:L2R},
+              X, y, scratch) where {T<:Real, ρ<:RobustRho1P{δ}} where δ
     n, p = size(X)
     λ  = get_penalty_scale(glr, n)
     ψ_ = ψ(ρ)
@@ -53,7 +53,7 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
             r = scratch.n
             get_residuals!(r, X, θ, y)
             w = scratch.n2
-            w .= convert.(Float64, abs.(r) .<= δ)
+            w .= convert.(T, abs.(r) .<= δ)
             # gradient via ψ function
             g === nothing || begin
                 ψr = scratch.n3
@@ -68,9 +68,13 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
     end
 end
 
+function fgh!(glr::GLR{RobustLoss{ρ},<:L2R},
+              X, y, scratch) where {ρ<:RobustRho1P{δ}} where δ
+    return fgh!(eltype(X), glr, X, y, scratch)
+end
 
-function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
-             ) where ρ <: RobustRho1P{δ} where δ
+function Hv!(::Type{T}, glr::GLR{RobustLoss{ρ},<:L2R},
+             X, y, scratch) where {T<:Real, ρ<:RobustRho1P{δ}} where δ
     n, p = size(X)
     λ  = get_penalty_scale(glr, n)
     ϕ_ = ϕ(ρ)
@@ -80,7 +84,7 @@ function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
             r = scratch.n
             get_residuals!(r, X, θ, y)
             w = scratch.n2
-            w .= convert.(Float64, abs.(r) .<= δ)
+            w .= convert.(T, abs.(r) .<= δ)
             w .= ϕ_.(r, w)
             # views on first p rows (intercept row treated after)
             a = 1:p
@@ -103,7 +107,7 @@ function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
             r = scratch.n
             get_residuals!(r, X, θ, y)
             w = scratch.n2
-            w .= convert.(Float64, abs.(r) .<= δ)
+            w .= convert.(T, abs.(r) .<= δ)
             w .= ϕ_.(r, w)
             t = scratch.n3
             apply_X!(t, X, v)
@@ -114,10 +118,14 @@ function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch
     end
 end
 
+function Hv!(glr::GLR{RobustLoss{ρ},<:L2R},
+             X, y, scratch) where {ρ<:RobustRho1P{δ}} where δ
+    return Hv!(eltype(X), glr, X, y, scratch)
+end
 
 # For IWLS
-function Mv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch;
-             threshold=1e-6) where ρ <: RobustRho1P{δ} where δ
+function Mv!(::Type{T}, glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch;
+             threshold=T(1e-6)) where {T<:Real, ρ<:RobustRho1P{δ}} where δ
     n, p = size(X)
     λ  = get_penalty_scale(glr, n)
     ω_ = ω(ρ, threshold)
@@ -128,7 +136,7 @@ function Mv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch;
         r = scratch.n
         get_residuals!(r, X, θ, y)
         w = scratch.n2
-        w .= convert.(Float64, abs.(r) .<= δ)
+        w .= convert.(T, abs.(r) .<= δ)
         # ω = ψ(r)/r ; weighing factor for IWLS
         ωr .= ω_.(r, w)
         # function defining the application of (X'ΛX + λI)
@@ -162,11 +170,17 @@ end
 end
 
+function Mv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y, scratch;
+             threshold=1e-6) where {ρ<:RobustRho1P{δ}} where δ
+    T = eltype(X)
+    return Mv!(T, glr, X, y, scratch; threshold=T(threshold))
+end
+
 # this is a bit of an abuse in that in some cases the ρ is not everywhere
 # differentiable
-function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR}, X, y, scratch
-                    ) where ρ <: RobustRho1P{δ} where δ
+function smooth_fg!(::Type{T}, glr::GLR{RobustLoss{ρ},<:ENR},
+                    X, y, scratch) where {T<:Real, ρ<:RobustRho1P{δ}} where δ
     n, p = size(X)
     λ  = get_penalty_scale_l2(glr, n)
     ψ_ = ψ(ρ)
@@ -174,7 +188,7 @@ function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR}, X, y, scratch
         r = scratch.n
         get_residuals!(r, X, θ, y)
         w = scratch.n2
-        w .= convert.(Float64, abs.(r) .<= δ)
+        w .= convert.(T, abs.(r) .<= δ)
         ψr = scratch.n3
         ψr .= ψ_.(r, w)
         apply_Xt!(g, X, ψr)
@@ -183,3 +197,8 @@ function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR}, X, y, scratch
         return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
     end
 end
+
+function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR},
+                    X, y, scratch) where {ρ<:RobustRho1P{δ}} where δ
+    return smooth_fg!(eltype(X), glr, X, y, scratch)
+end
diff --git a/src/glr/utils.jl b/src/glr/utils.jl
index 102da1d..a8e4022 100644
--- a/src/glr/utils.jl
+++ b/src/glr/utils.jl
@@ -27,18 +27,34 @@ $SIGNATURES
 
 Return a function computing the smooth part of the objective at a given
 evaluation point `θ`.
 """
-smooth_objective(glr::GLR, X, y; c::Int=0) =
-    θ -> smooth_objective(glr, size(X, 1))(y, apply_X(X, θ, c), view_θ(glr, θ))
+function smooth_objective(::Type{T}, glr::GLR, X, y; c::Int=0) where {T<:Real}
+    θ -> smooth_objective(T, glr, size(X, 1))(y, apply_X(X, θ, c), view_θ(glr, θ))
+end
+
+function smooth_objective(glr::GLR, X, y; c::Int=0)
+    return smooth_objective(eltype(X), glr, X, y; c=c)
+end
 
 """
 $SIGNATURES
 
 Return the smooth part of the objective function of a GLR.
 """
-smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n) = glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.)
+function smooth_objective(::Type{T}, glr::GLR{<:SmoothLoss,<:ENR}, n) where {T<:Real}
+    return glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, T(1.))
+end
 
-smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) =
-    θ -> (θ'*XX*θ)/2 - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.))(θ)
+function smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n)
+    return smooth_objective(eltype(getscale(glr.penalty)), glr, n)
+end
+
+function smooth_gram_objective(::Type{T}, glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) where {T<:Real}
+    return θ -> (θ'*XX*θ)/T(2) - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, T(1.0)))(θ)
+end
+
+function smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n)
+    return smooth_gram_objective(eltype(XX), glr, XX, Xy, n)
+end
 
 smooth_objective(::GLR) = @error "Case not implemented yet."
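Note: the same two-method pattern recurs throughout this diff — a primary method that takes the working element type `T` explicitly, plus a thin fallback that recovers `T` from the data via `eltype` so existing call sites keep working. A minimal, self-contained sketch of that pattern (the `_solve` name is purely illustrative, not part of the package):

    # primary method: the caller pins the working precision T explicitly
    _solve(::Type{T}, X::AbstractMatrix, y::AbstractVector) where {T<:Real} =
        ones(T, size(X, 2))                         # placeholder result in precision T

    # fallback: keep the old signature and derive T from the data
    _solve(X::AbstractMatrix, y::AbstractVector) = _solve(eltype(X), X, y)

    _solve(Float32, rand(4, 2), rand(4))            # Vector{Float32}, even though X is Float64
    _solve(rand(Float32, 4, 2), rand(Float32, 4))   # Vector{Float32}, inferred from eltype(X)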
diff --git a/src/loss-penalty/generic.jl b/src/loss-penalty/generic.jl
index 4e33640..df3a21f 100644
--- a/src/loss-penalty/generic.jl
+++ b/src/loss-penalty/generic.jl
@@ -18,7 +18,7 @@ abstract type AtomicLoss <: Loss end
 
 mutable struct ScaledLoss{AL} <: Loss where AL <: AtomicLoss
     loss::AL
-    scale::Float64
+    scale::Real
 end
 
 mutable struct CompositeLoss <: Loss
@@ -53,7 +53,7 @@ abstract type AtomicPenalty <: Penalty end
 
 mutable struct ScaledPenalty{AP} <: Penalty where AP <: AtomicPenalty
     penalty::AP
-    scale::Float64
+    scale::Real
 end
 
 mutable struct CompositePenalty <: Penalty
@@ -109,13 +109,13 @@ scale1(a::AP) = ScaledPenalty(a, 1.0)
 +(a::AL, b::AL)           = scale1(a) + scale1(b)
 +(a::AL, b::Union{SL,CL}) = scale1(a) + b
 +(b::Union{SL,CL}, a::AL) = a + b
-*(a::AL, c::Real) = ScaledLoss(a, float(c))
+*(a::AL, c::Real) = ScaledLoss(a, c)
 
 # Combinations with AtomicPenalty (AP)
 +(a::AP, b::AP)           = scale1(a) + scale1(b)
 +(a::AP, b::Union{SP,CP}) = scale1(a) + b
 +(b::Union{SP,CP}, a::AP) = a + b
-*(a::AP, c::Real) = ScaledPenalty(a, float(c))
+*(a::AP, c::Real) = ScaledPenalty(a, c)
 
 # Combinations with Scaled Losses and Combined Losses
 +(a::SL{T}, b::SL{T}) where {T} = ScaledLoss(a.loss, a.scale + b.scale)
diff --git a/src/loss-penalty/robust.jl b/src/loss-penalty/robust.jl
index 65c76c6..19cb7d6 100644
--- a/src/loss-penalty/robust.jl
+++ b/src/loss-penalty/robust.jl
@@ -49,13 +49,20 @@ struct HuberRho{δ} <: RobustRho1P{δ} end
 Huber(δ::Real=1.0; delta::Real=δ) = HuberRho(delta)
 
-(::HuberRho{δ})(r::AVR) where δ = begin
+function (::HuberRho{δ})(::Type{T}, r::AVR) where δ where {T<:Real}
     ar = abs.(r)
     w  = ar .<= δ
-    return sum( @. ifelse(w, r^2/2, δ * (ar - δ/2) ) )
+    return sum( @. ifelse(w, r^T(2)/T(2), δ * (ar - δ/T(2)) ) )
 end
 
-ψ(::Type{HuberRho{δ}} ) where δ = (r, w) -> r * w + δ * sign(r) * (1.0 - w)
+(hr::HuberRho{δ})(r::AVR) where δ = hr(eltype(r), r)
+
+function ψ(::Type{T}, ::Type{HuberRho{δ}}) where δ where {T<:Real}
+    return (r, w) -> r * w + δ * sign(r) * (T(1.0) - w)
+end
+
+ψ(::Type{HuberRho{δ}}) where δ = ψ(typeof(δ), HuberRho{δ})
+
 ω(::Type{HuberRho{δ}}, _) where δ = (r, w) -> w + (δ / abs(r)) * (1.0 - w)
 ϕ(::Type{HuberRho{δ}} ) where δ = (r, w) -> w
 
diff --git a/src/loss-penalty/utils.jl b/src/loss-penalty/utils.jl
index abcdb2b..516f472 100644
--- a/src/loss-penalty/utils.jl
+++ b/src/loss-penalty/utils.jl
@@ -19,6 +19,26 @@ getscale_l2(p2::L2R) = p2 |> getscale
 getscale_l2(cp::CompositePenalty) = is_elnet(cp) ? cp |> get_l2 |> getscale : @error "Case not implemented."
-get_penalty_scale(glr, n) = getscale(glr.penalty) * ifelse(glr.scale_penalty_with_samples, float(n), 1.0)
-get_penalty_scale_l2(glr, n) = getscale_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, float(n), 1.0)
-get_penalty_scale_l1(glr, n) = getscale_l1(glr.penalty) * ifelse(glr.scale_penalty_with_samples, float(n), 1.0)
+function get_penalty_scale(::Type{T}, glr, n) where {T<:Real}
+    return getscale(glr.penalty) * ifelse(glr.scale_penalty_with_samples, T(n), T(1.0))
+end
+
+function get_penalty_scale(glr, n)
+    return get_penalty_scale(eltype(getscale(glr.penalty)), glr, n)
+end
+
+function get_penalty_scale_l2(::Type{T}, glr, n) where {T<:Real}
+    return T(getscale_l2(glr.penalty)) * ifelse(glr.scale_penalty_with_samples, T(n), T(1.0))
+end
+
+function get_penalty_scale_l2(glr, n)
+    return get_penalty_scale_l2(eltype(getscale(glr.penalty)), glr, n)
+end
+
+function get_penalty_scale_l1(::Type{T}, glr, n) where {T<:Real}
+    return getscale_l1(glr.penalty) * ifelse(glr.scale_penalty_with_samples, T(n), T(1.0))
+end
+
+function get_penalty_scale_l1(glr, n)
+    return get_penalty_scale_l1(eltype(getscale(glr.penalty)), glr, n)
+end
diff --git a/src/utils.jl b/src/utils.jl
index 36b93ba..b701753 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -44,9 +44,13 @@ $SIGNATURES
 
 Given a matrix `X`, append a column of ones if `fit_intercept` is true.
 """
-function augment_X(X::AbstractMatrix{<:Real}, fit_intercept::Bool)
+function augment_X(::Type{T}, X::AbstractMatrix{<:Real}, fit_intercept::Bool) where {T<:Real}
     fit_intercept || return X
-    return hcat(X, ones(eltype(X), size(X, 1)))
+    return hcat(X, ones(T, size(X, 1)))
+end
+
+function augment_X(X::AbstractMatrix{<:Real}, fit_intercept::Bool)
+    return augment_X(eltype(X), X, fit_intercept)
 end
 
 """
@@ -124,10 +128,10 @@ $SIGNATURES
 
 Form (X'X) while being memory aware (assuming p ≪ n).
 """
-function form_XtX(X, fit_intercept, lambda = 0, penalize_intercept = true)
+function form_XtX(::Type{T}, X, fit_intercept, lambda = 0, penalize_intercept = true) where {T<:Real}
     if fit_intercept
         n, p = size(X)
-        XtX  = zeros(p+1, p+1)
+        XtX  = zeros(T, p+1, p+1)
         Xt1  = sum(X, dims=1)
         mul!(view(XtX, 1:p, 1:p), X', X)  # O(np²)
         @inbounds for i in 1:p
@@ -138,7 +142,7 @@ function form_XtX(X, fit_intercept, lambda = 0, penalize_intercept = true)
         XtX = X'*X  # O(np²)
     end
     if !iszero(lambda)
-        λ = convert(eltype(XtX), lambda)
+        λ = convert(T, lambda)
         @inbounds for i in 1:size(XtX, 1) + fit_intercept * (penalize_intercept - 1)
             XtX[i,i] += λ
         end
@@ -146,6 +150,10 @@ function form_XtX(X, fit_intercept, lambda = 0, penalize_intercept = true)
     return Hermitian(XtX)
 end
 
+function form_XtX(X, fit_intercept, lambda = 0, penalize_intercept = true)
+    return form_XtX(eltype(X), X, fit_intercept, lambda, penalize_intercept)
+end
+
 # Sigmoid and log-sigmoid
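For context, a minimal usage sketch of the new entry point, assuming the package's exported `LinearRegression` constructor (illustrative only; any GLR constructor is expected to work the same way):

    using MLJLinearModels

    X = randn(Float32, 100, 3)
    y = randn(Float32, 100)

    glr = LinearRegression()        # assumed GLR constructor from the package

    # new: pin the working element type explicitly
    θ32 = fit(Float32, glr, X, y)   # coefficients expected to come back as Float32

    # unchanged: the old two-argument form still works, now inferring T = eltype(X)
    θ = fit(glr, X, y)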