JuliaAI · MartinuzziFrancesco · Mar 7, 2024
diff --git a/src/fit/analytical.jl b/src/fit/analytical.jl
@@ -15,13 +15,13 @@ Assuming `n` dominates `p`,
 * iterative (conjugate gradient): O(κnp) - with κ the number of CG steps
                                   (κ ≤ p).
 """
-function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch)
+function _fit(::Type{T}, glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) where {T<:Real}
     # full solve
     if !solver.iterative
         λ  = get_penalty_scale(glr, length(y))
         if iszero(λ)
             # standard LS solution
-            return augment_X(X, glr.fit_intercept) \ y
+            return augment_X(T, X, glr.fit_intercept) \ y
         else
             # Ridge case -- form the Hat Matrix then solve
             H = form_XtX(X, glr.fit_intercept, λ, glr.penalize_intercept)
@@ -43,3 +43,7 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch)
     glr.fit_intercept && (b = vcat(b, sum(y)))
     return cg(Hm, b; maxiter=max_cg_steps)
 end
+
+# Compat method without an explicit type: work in the float type matching `X`.
+_fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch) =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch) # Int X -> Float64
diff --git a/src/fit/default.jl b/src/fit/default.jl
@@ -33,8 +33,8 @@ $SIGNATURES
 Fit a generalised linear regression model using an appropriate solver based on
 the loss and penalty of the model. A method can, in some cases, be specified.
 """
-function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing,
-             solver::Solver=_solver(glr, size(X)))
+function fit(::Type{T}, glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing,
+             solver::Solver=_solver(glr, size(X))) where {T<:Real}
     if hasproperty(solver, :gram) && solver.gram
         # interpret X,y as X'X, X'y
         data = verify_or_construct_gramian(glr, X, y, data)
@@ -44,9 +44,14 @@ function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing,
         check_nrows(X, y)
         n, p = size(X)
         c = getc(glr, y)
-        return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
+        return _fit(T, glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
     end
 end
+
+# Untyped entry point; `float` keeps integer-valued `X` working (Int -> Float64).
+fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; kwargs...) =
+    fit(float(eltype(X)), glr, X, y; kwargs...)
+
 fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...)
 
 

diff --git a/src/fit/iwls.jl b/src/fit/iwls.jl
@@ -1,5 +1,5 @@
-function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
-              ) where {ρ}
+function _fit(::Type{T}, glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
+              ) where {ρ, T<:Real}
     n,p,_ = npc(scratch)
     _Mv! = Mv!(glr, X, y, scratch; threshold=solver.threshold)
     κ    = solver.damping # between 0 and 1, 1 = fully take the new iteration
@@ -34,3 +34,8 @@ function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
         @warn "IWLS did not converge in $(solver.max_iter) iterations."
     return θ
 end
+
+# Compat method without an explicit type: the working type is the float type
+# matching `eltype(X)` (e.g. Int -> Float64, Float32 -> Float32).
+_fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch) where {ρ} =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch)
diff --git a/src/fit/newton.jl b/src/fit/newton.jl
@@ -12,17 +12,22 @@ Fit a GLR using Newton's method.
 Assuming `n` dominates `p`, O(κnp²), dominated by the construction of the
 Hessian at each step with κ the number of Newton steps.
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::Newton, X, y, scratch)
+function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+    solver::Newton, X, y, scratch) where {T<:Real}
     _,p,_ = npc(scratch)
-    θ₀    = zeros(p)
-    _fgh! = fgh!(glr, X, y, scratch)
+    θ₀    = zeros(T, p)
+    _fgh! = fgh!(T, glr, X, y, scratch)
     opt   = Optim.only_fgh!(_fgh!)
     res   = Optim.optimize(opt, θ₀, Optim.Newton(; solver.newton_options...),
                            solver.optim_options)
     return Optim.minimizer(res)
 end
 
+# Compat method without an explicit type: the working type is the float type
+# matching `eltype(X)`, so `θ₀ = zeros(T, p)` is never an integer vector.
+_fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, solver::Newton, X, y, scratch) =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch)
+
 """
 $SIGNATURES
 
@@ -35,13 +40,13 @@ Assuming `n` dominates `p`, O(κ₁κ₂np), dominated by the application of the
 Hessian at each step where κ₁ is the number of Newton steps and κ₂ is the
 average number of CG steps per Newton step (which is at most p).
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::NewtonCG, X, y, scratch)
+function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+    solver::NewtonCG, X, y, scratch) where {T<:Real}
     _,p,_ = npc(scratch)
-    θ₀    = zeros(p)
+    θ₀    = zeros(T, p)
     _f    = objective(glr, X, y)
-    _fg!  = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738
-    _Hv!  = Hv!(glr, X, y, scratch)
+    _fg!  = (g, θ) -> fgh!(T, glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738
+    _Hv!  = Hv!(T, glr, X, y, scratch)
     opt   = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
     res   = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion(; solver.newtoncg_options...),
                            solver.optim_options)
@@ -58,11 +63,11 @@ Fit a GLR using LBFGS.
 Assuming `n` dominates `p`, O(κnp), dominated by the computation of the
 gradient at each step with κ the number of LBFGS steps.
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::LBFGS, X, y, scratch)
+function _fit(::Type{T}, glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+    solver::LBFGS, X, y, scratch) where {T<:Real}
     _,p,_ = npc(scratch)
-    θ₀    = zeros(p)
-    _fg!  = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
+    θ₀    = zeros(T, p)
+    _fg!  = (f, g, θ) -> fgh!(T, glr, X, y, scratch)(f, g, nothing, θ)
     opt   = Optim.only_fg!(_fg!)
     res   = Optim.optimize(opt, θ₀, Optim.LBFGS(; solver.lbfgs_options...),
                            solver.optim_options)
@@ -85,19 +90,23 @@ computations are dominated by the application of the Hessian at each step with
 κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton
 step.
 """
-function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG,
-              X, y, scratch)
+function _fit(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG,
+    X, y, scratch) where {T<:Real}
     _,p,c = npc(scratch)
-    θ₀    = zeros(p * c)
+    θ₀    = zeros(T, p * c)
     _f    = objective(glr, X, y; c=c)
-    _fg!  = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
-    _Hv!  = Hv!(glr, X, y, scratch)
+    _fg!  = (g, θ) -> fg!(T, glr, X, y, scratch)(T(0.0), g, θ) # XXX: Optim.jl/738
+    _Hv!  = Hv!(T, glr, X, y, scratch)
     opt   = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
     res   = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion(; solver.newtoncg_options...),
                            solver.optim_options)
     return Optim.minimizer(res)
 end
 
+# Compat method without an explicit type: work in the float type matching `X`.
+_fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG, X, y, scratch) =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch) # Int X -> Float64
+
 """
 $SIGNATURES
 
@@ -109,13 +118,17 @@ Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated
 by the computation of the gradient at each step with κ the number of LBFGS
 steps.
 """
-function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS,
-              X, y, scratch)
+function _fit(::Type{T}, glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS,
+    X, y, scratch) where {T<:Real}
     _,p,c = npc(scratch)
-    θ₀    = zeros(p * c)
-    _fg!  = fg!(glr, X, y, scratch)
+    θ₀    = zeros(T, p * c)
+    _fg!  = fg!(T, glr, X, y, scratch)
     opt   = Optim.only_fg!(_fg!)
     res   = Optim.optimize(opt, θ₀, Optim.LBFGS(; solver.lbfgs_options...),
                            solver.optim_options)
     return Optim.minimizer(res)
 end
+
+# Compat method without an explicit type: work in the float type matching `X`.
+_fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS, X, y, scratch) =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch) # Int X -> Float64
diff --git a/src/fit/proxgrad.jl b/src/fit/proxgrad.jl
@@ -2,27 +2,27 @@
 
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
-function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
+function _fit(::Type{T}, glr::GLR, solver::ProxGrad, X, y, scratch) where {T<:Real}
     n,p,c = npc(scratch)
     c > 0 && (p *= c)
     # vector caches + eval cache
-    θ   = zeros(p)   # θ_k
-    Δθ  = zeros(p)   # (θ_k - θ_{k-1})
-    θ̄   = zeros(p)   # θ_k + ρ Δθ // extrapolation
-    ∇fθ̄ = zeros(p)
-    fθ̄  = 0.0        # useful for backtracking function
-    θ̂   = zeros(p)   # candidate before becoming θ_k
+    θ   = zeros(T, p)   # θ_k
+    Δθ  = zeros(T, p)   # (θ_k - θ_{k-1})
+    θ̄   = zeros(T, p)   # θ_k + ρ Δθ // extrapolation
+    ∇fθ̄ = zeros(T, p)
+    fθ̄  = T(0.0)        # useful for backtracking function
+    θ̂   = zeros(T, p)   # candidate before becoming θ_k
     # cache for extrapolation constant and stepsizes
-    ω   = 0.0   # ω_k
-    ω_  = 0.0   # ω_{k-1}
-    ω__ = 0.0   # ω_{k-2}
-    η   = 1.0   # stepsize (1/L)
-    acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA)
+    ω   = T(0.0)   # ω_k
+    ω_  = T(0.0)   # ω_{k-1}
+    ω__ = T(0.0)   # ω_{k-2}
+    η   = T(1.0)   # stepsize (1/L)
+    acc = ifelse(solver.accel, T(1.0), T(0.0)) # if 0, no extrapolation (ISTA)
     # functions
     _f = if solver.gram
-        smooth_gram_objective(glr, X, y, n)
+        smooth_gram_objective(T, glr, X, y, n)
     else
-        smooth_objective(glr, X, y; c=c)
+        smooth_objective(T, glr, X, y; c=c)
     end
 
     _fg! = if solver.gram
@@ -32,7 +32,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
     end
     _prox!  = prox!(glr, n)
     bt_cond = θ̂ ->
-                _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η)
+                _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (T(2)*η)
     # loop-related
     k, tol = 1, Inf
     while k ≤ solver.max_iter && tol > solver.tol
@@ -43,7 +43,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
         # for Linear Inverse Problems" (page 193)
         # --------------------------------------------------
         # 1. linear extrapolation of past iterates
-        ω   = (1.0 + sqrt(1.0 + 4.0 * ω_^2)) / 2.0
+        ω   = (T(1.0) + sqrt(T(1.0) + T(4.0) * ω_^T(2))) / T(2.0)
         ρ   = acc * ω__ / ω  # ω_{k-2}/ω; note that ρ != 0 only as k > 2
         θ̄  .= θ + ρ * Δθ
         # 2. attempt a prox step, modify the step until verifies condition
@@ -74,3 +74,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
                               "$(solver.max_iter) iterations."
     return θ
 end
+
+# Compat method without an explicit type: work in the float type matching `X`.
+_fit(glr::GLR, solver::ProxGrad, X, y, scratch) =
+    _fit(float(eltype(X)), glr, solver, X, y, scratch) # Int X -> Float64
diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl
@@ -127,12 +127,12 @@ Proximal Gradient solver for non-smooth objective functions.
                backtracking step
 * `beta`: rate of shrinkage in the backtracking step (between 0 and 1)
 """
-@with_kw struct ProxGrad <: Solver
+@with_kw struct ProxGrad{T<:Real} <: Solver # tol, beta share T (TODO: check mixed kwargs)
     accel::Bool    = false # use Nesterov style acceleration (see also FISTA)
     max_iter::Int  = 1000  # max number of overall iterations
-    tol::Float64   = 1e-4  # tol relative change of θ i.e. norm(θ-θ_)/norm(θ)
+    tol::T   = 1e-4  # tol relative change of θ i.e. norm(θ-θ_)/norm(θ)
     max_inner::Int = 100   # β^max_inner should be > 1e-10
-    beta::Float64  = 0.8   # in (0, 1); shrinkage in the backtracking step
+    beta::T  = 0.8   # in (0, 1); shrinkage in the backtracking step
     gram::Bool = false     # use precomputed Gramian for lsq where possible
 end
 
@@ -156,12 +156,12 @@ computations.
 * `damping` (Float64): how much to trust iterates (1=full trust)
 * `threshold` (Float64): threshold for the residuals
 """
-@with_kw struct IWLSCG <: Solver
+@with_kw struct IWLSCG{T<:Real} <: Solver # T: type of tol/damping/threshold
     max_iter::Int      = 100
     max_inner::Int     = 200
-    tol::Float64       = 1e-4
-    damping::Float64   = 1.0   # should be between 0 and 1, 1 = trust iterates
-    threshold::Float64 = 1e-6  # thresh for residuals; used eg in quantile reg
+    tol::T       = 1e-4  # TODO(review): confirm mixed-type kwargs still construct
+    damping::T   = 1.0   # should be between 0 and 1, 1 = trust iterates
+    threshold::T = 1e-6  # thresh for residuals; used eg in quantile reg
 end
 
 # ===================== admm.jl

diff --git a/src/glr/d_l2loss.jl b/src/glr/d_l2loss.jl
@@ -12,21 +12,21 @@
 # * Hv! used in iterative solution
 # ---------------------------------------------------------
 
-function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch)
+function Hv!(::Type{T}, glr::GLR{L2Loss,<:L2R}, X, y, scratch) where {T<:Real}
     n, p = size(X)
     λ    = get_penalty_scale(glr, n)
     if glr.fit_intercept
         # H = [X 1]'[X 1] + λ I
         # rows a 1:p = [X'X + λI | X'1]
         # row  e end = [1'X      | n+λι] where ι is 1 if glr.penalize_intercept
-        ι = float(glr.penalize_intercept)
+        ι = T(glr.penalize_intercept)
         (Hv, v) -> begin
             # view on the first p rows
             a     = 1:p
             Hvₐ   = view(Hv, a)
             vₐ    = view(v,  a)
             Xt1   = view(scratch.p, a)
-            Xt1 .*= 0
+            Xt1 .*= T(0)
             @inbounds for i in a, j in 1:n
                 Xt1[i] += X[j, i]           # -- X'1
             end
@@ -49,6 +49,10 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch)
     end
 end
 
+# Compat method without an explicit type: work in the float type matching `X`.
+Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch) =
+    Hv!(float(eltype(X)), glr, X, y, scratch) # Int X -> Float64, as `float(...)` before
+
 # ----------------------------- #
 #  -- Lasso/Elnet Regression -- #
 # ----------------------------- #