From 29f9b7d69a27a156c76d369f16617223b161b6f7 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Fri, 1 Nov 2024 01:18:37 -0500 Subject: [PATCH 1/4] Add an implementation of BLOCK-MINRES --- docs/src/api.md | 10 +- docs/src/block_krylov.md | 16 +- docs/src/block_processes.md | 2 + docs/src/gpu.md | 2 +- docs/src/preconditioners.md | 2 +- src/Krylov.jl | 1 + src/block_gmres.jl | 2 +- src/block_krylov_processes.jl | 2 +- src/block_krylov_solvers.jl | 75 +++++++- src/block_minres.jl | 326 ++++++++++++++++++++++++++++++++++ src/krylov_processes.jl | 2 +- src/krylov_solve.jl | 66 +++++-- src/usymlq.jl | 2 +- src/usymqr.jl | 2 +- test/gpu/amd.jl | 2 +- test/gpu/gpu.jl | 22 +-- test/gpu/intel.jl | 2 +- test/gpu/metal.jl | 2 +- test/gpu/nvidia.jl | 2 +- 19 files changed, 491 insertions(+), 49 deletions(-) create mode 100644 src/block_minres.jl diff --git a/docs/src/api.md b/docs/src/api.md index b3050c16e..01453099d 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -12,11 +12,10 @@ Krylov.LSLQStats Krylov.LsmrStats ``` -## Solver Types +## Workspace of Krylov methods ```@docs KrylovSolver -BlockKrylovSolver MinresSolver MinaresSolver CgSolver @@ -53,6 +52,13 @@ CraigSolver CraigmrSolver GpmrSolver FgmresSolver +``` + +## Workspace of block-Krylov methods + +```@docs +BlockKrylovSolver +BlockMinresSolver BlockGmresSolver ``` diff --git a/docs/src/block_krylov.md b/docs/src/block_krylov.md index 428b1ca35..160b3fddb 100644 --- a/docs/src/block_krylov.md +++ b/docs/src/block_krylov.md @@ -1,10 +1,7 @@ -## Block-GMRES - !!! note - `block_gmres` works on GPUs - with Julia 1.11. + `block_minres` and `block_gmres` work on GPUs with Julia 1.11. -If you want to use `block_gmres` on previous Julia versions, you can overload the function `Krylov.copy_triangle` with the following code: +If you want to use `block_minres` and `block_gmres` on previous Julia versions, you can overload the function `Krylov.copy_triangle` with the following code: ```julia using KernelAbstractions, Krylov @@ -23,6 +20,15 @@ function Krylov.copy_triangle(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, k::I end ``` +## Block-MINRES + +```@docs +block_minres +block_minres! +``` + +## Block-GMRES + ```@docs block_gmres block_gmres! diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md index e9fc17811..3c1d114dc 100644 --- a/docs/src/block_processes.md +++ b/docs/src/block_processes.md @@ -27,6 +27,8 @@ T_{k+1,k} = The function [`hermitian_lanczos`](@ref hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$ and $T_{k+1,k}$. +Related method: [`BLOCK-MINRES`](@ref block_minres). + ```@docs hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` diff --git a/docs/src/gpu.md b/docs/src/gpu.md index 2db45df1b..f783a3995 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -107,7 +107,7 @@ if CUDA.functional() symmetric = hermitian = true opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(P, x, y, z)) - # Solve an Hermitian positive definite system with an IC(0) preconditioner on GPU + # Solve a Hermitian positive definite system with an IC(0) preconditioner on GPU x, stats = cg(A_gpu, b_gpu, M=opM) end ``` diff --git a/docs/src/preconditioners.md b/docs/src/preconditioners.md index 983bc51c7..c05225a9f 100644 --- a/docs/src/preconditioners.md +++ b/docs/src/preconditioners.md @@ -46,7 +46,7 @@ A Krylov method dedicated to non-Hermitian linear systems allows the three varia ### Hermitian linear systems -Methods concerned: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CG-LANCZOS`](@ref cg_lanczos), [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift), [`CR`](@ref cr), [`CAR`](@ref car), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp) and [`MINARES`](@ref minares). +Methods concerned: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CG-LANCZOS`](@ref cg_lanczos), [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift), [`CR`](@ref cr), [`CAR`](@ref car), [`MINRES`](@ref minres), [`BLOCK-MINRES`](@ref block_minres), [`MINRES-QLP`](@ref minres_qlp) and [`MINARES`](@ref minares). When $A$ is Hermitian, we can only use centered preconditioning $L^{-1}AL^{-H}y = L^{-1}b$ with $x = L^{-H}y$. Centered preconditioning is a special case of two-sided preconditioning with $P_{\ell} = L = P_r^H$ that maintains hermicity. diff --git a/src/Krylov.jl b/src/Krylov.jl index 2578d308b..ac44f8052 100644 --- a/src/Krylov.jl +++ b/src/Krylov.jl @@ -12,6 +12,7 @@ include("block_krylov_utils.jl") include("block_krylov_processes.jl") include("block_krylov_solvers.jl") +include("block_minres.jl") include("block_gmres.jl") include("cg.jl") diff --git a/src/block_gmres.jl b/src/block_gmres.jl index 376193114..d326ab025 100644 --- a/src/block_gmres.jl +++ b/src/block_gmres.jl @@ -1,6 +1,6 @@ # An implementation of block-GMRES for the solution of the square linear system AX = B. # -# Alexis Montoison, +# Alexis Montoison, -- # Argonne National Laboratory -- Chicago, October 2023. export block_gmres, block_gmres! diff --git a/src/block_krylov_processes.jl b/src/block_krylov_processes.jl index 3eb03d0b4..04d1061ae 100644 --- a/src/block_krylov_processes.jl +++ b/src/block_krylov_processes.jl @@ -3,7 +3,7 @@ #### Input arguments -* `A`: a linear operator that models an Hermitian matrix of dimension `n`; +* `A`: a linear operator that models a Hermitian matrix of dimension `n`; * `B`: a matrix of size `n × p`; * `k`: the number of iterations of the block Hermitian Lanczos process. diff --git a/src/block_krylov_solvers.jl b/src/block_krylov_solvers.jl index a1e8db13a..69692fe21 100644 --- a/src/block_krylov_solvers.jl +++ b/src/block_krylov_solvers.jl @@ -1,12 +1,80 @@ export BlockKrylovSolver -export BlockGmresSolver +export BlockMinresSolver, BlockGmresSolver -const BLOCK_KRYLOV_SOLVERS = Dict(:block_gmres => :BlockGmresSolver) +const BLOCK_KRYLOV_SOLVERS = Dict(:block_minres => :BlockMinresSolver, + :block_gmres => :BlockGmresSolver ) "Abstract type for using block Krylov solvers in-place" abstract type BlockKrylovSolver{T,FC,SV,SM} end +""" +Type for storing the vectors required by the in-place version of BLOCK-MINRES. + +The outer constructors + + solver = BlockMinresSolver(m, n, p, SV, SM) + solver = BlockMinresSolver(A, B) + +may be used in order to create these vectors. +`memory` is set to `div(n,p)` if the value given is larger than `div(n,p)`. +""" +mutable struct BlockMinresSolver{T,FC,SV,SM} <: BlockKrylovSolver{T,FC,SV,SM} + m :: Int + n :: Int + p :: Int + ΔX :: SM + X :: SM + P :: SM + Q :: SM + C :: SM + D :: SM + Φ :: SM + tmp :: SM + Vₖ₋₁ :: SM + Vₖ :: SM + wₖ₋₂ :: SM + wₖ₋₁ :: SM + Hₖ₋₂ :: SM + Hₖ₋₁ :: SM + τₖ₋₂ :: SV + τₖ₋₁ :: SV + warm_start :: Bool + stats :: SimpleStats{T} +end + +function BlockMinresSolver(m, n, p, SV, SM) + FC = eltype(SV) + T = real(FC) + ΔX = SM(undef, 0, 0) + X = SM(undef, n, p) + P = SM(undef, 0, 0) + Q = SM(undef, n, p) + C = SM(undef, p, p) + D = SM(undef, 2p, p) + Φ = SM(undef, p, p) + tmp = C isa Matrix ? SM(undef, 0, 0) : SM(undef, p, p) + Vₖ₋₁ = SM(undef, n, p) + Vₖ = SM(undef, n, p) + wₖ₋₂ = SM(undef, n, p) + wₖ₋₁ = SM(undef, n, p) + Hₖ₋₂ = SM(undef, 2p, p) + Hₖ₋₁ = SM(undef, 2p, p) + τₖ₋₂ = SV(undef, p) + τₖ₋₁ = SV(undef, p) + stats = SimpleStats(0, false, false, T[], T[], T[], 0.0, "unknown") + solver = BlockMinresSolver{T,FC,SV,SM}(m, n, p, ΔX, X, P, Q, C, D, Φ, tmp, Vₖ₋₁, Vₖ, wₖ₋₂, wₖ₋₁, Hₖ₋₂, Hₖ₋₁, τₖ₋₂, τₖ₋₁, false, stats) + return solver +end + +function BlockMinresSolver(A, B) + m, n = size(A) + s, p = size(B) + SM = typeof(B) + SV = matrix_to_vector(SM) + BlockMinresSolver(m, n, p, SV, SM) +end + """ Type for storing the vectors required by the in-place version of BLOCK-GMRES. @@ -71,7 +139,8 @@ function BlockGmresSolver(A, B, memory = 5) end for (KS, fun, nsol, nA, nAt, warm_start) in [ - (:BlockGmresSolver, :block_gmres!, 1, 1, 0, true) + (:BlockMinresSolver, :block_minres!, 1, 1, 0, true) + (:BlockGmresSolver , :block_gmres! , 1, 1, 0, true) ] @eval begin size(solver :: $KS) = solver.m, solver.n diff --git a/src/block_minres.jl b/src/block_minres.jl new file mode 100644 index 000000000..46c9e9d7f --- /dev/null +++ b/src/block_minres.jl @@ -0,0 +1,326 @@ +# An implementation of block-MINRES for the solution of the square linear system AX = B. +# +# Alexis Montoison, -- +# Argonne National Laboratory -- Chicago, October 2024. + +export block_minres, block_minres! + +""" + (X, stats) = block_minres(A, b::AbstractMatrix{FC}; + M=I, ldiv::Bool=false, + atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0, + timemax::Float64=Inf, verbose::Int=0, history::Bool=false, + callback=solver->false, iostream::IO=kstdout) + +`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`. +`FC` is `T` or `Complex{T}`. + + (X, stats) = block_minres(A, B, X0::AbstractMatrix; kwargs...) + +Block-MINRES can be warm-started from an initial guess `X0` where `kwargs` are the same keyword arguments as above. + +Solve the Hermitian linear system AX = B of size n with p right-hand sides using block-MINRES. + +#### Input arguments + +* `A`: a linear operator that models a Hermitian matrix of dimension `n`; +* `B`: a matrix of size `n × p`. + +#### Optional argument + +* `X0`: a matrix of size `n × p` that represents an initial guess of the solution `X`. + +#### Keyword arguments + +* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning; +* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`; +* `atol`: absolute stopping tolerance based on the residual norm; +* `rtol`: relative stopping tolerance based on the residual norm; +* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2 * div(n,p)`; +* `timemax`: the time limit in seconds; +* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations; +* `history`: collect additional statistics on the run such as residual norms; +* `callback`: function or functor called as `callback(solver)` that returns `true` if the block-Krylov method should terminate, and `false` otherwise; +* `iostream`: stream to which output is logged. + +#### Output arguments + +* `X`: a dense matrix of size `n × p`; +* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure. +""" +function block_minres end + +""" + solver = block_minres!(solver::BlockMinresSolver, B; kwargs...) + solver = block_minres!(solver::BlockMinresSolver, B, X0; kwargs...) + +where `kwargs` are keyword arguments of [`block_minres`](@ref). + +See [`BlockMinresSolver`](@ref) for more details about the `solver`. +""" +function block_minres! end + +def_args_block_minres = (:(A ), + :(B::AbstractMatrix{FC})) + +def_optargs_block_minres = (:(X0::AbstractMatrix),) + +def_kwargs_block_minres = (:(; M = I ), + :(; ldiv::Bool = false ), + :(; atol::T = √eps(T) ), + :(; rtol::T = √eps(T) ), + :(; itmax::Int = 0 ), + :(; timemax::Float64 = Inf ), + :(; verbose::Int = 0 ), + :(; history::Bool = false ), + :(; callback = solver -> false ), + :(; iostream::IO = kstdout )) + +def_kwargs_block_minres = mapreduce(extract_parameters, vcat, def_kwargs_block_minres) + +args_block_minres = (:A, :B) +optargs_block_minres = (:X0,) +kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream) + +@eval begin + function block_minres!(solver :: BlockMinresSolver{T,FC,SV,SM}, $(def_args_block_minres...); $(def_kwargs_block_minres...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} + + # Timer + start_time = time_ns() + timemax_ns = 1e9 * timemax + + n, m = size(A) + s, p = size(B) + m == n || error("System must be square") + n == s || error("Inconsistent problem size") + (verbose > 0) && @printf(iostream, "BLOCK-MINRES: system of size %d with %d right-hand sides\n", n, p) + + # Check M = Iₙ + MisI = (M === I) + MisI || error("Block-MINRES doesn't support preconditioning yet.") + + # Check type consistency + eltype(A) == FC || @warn "eltype(A) ≠ $FC. This could lead to errors or additional allocations in operator-matrix products." + ktypeof(B) <: SM || error("ktypeof(B) is not a subtype of $SM") + + # Set up workspace. + Vₖ₋₁, Vₖ = solver.Vₖ₋₁, solver.Vₖ + ΔX, X, Q, C = solver.ΔX, solver.X, solver.Q, solver.C + D, Φ, stats = solver.D, solver.Φ, solver.stats + wₖ₋₂, wₖ₋₁ = solver.wₖ₋₂, solver.wₖ₋₁ + Hₖ₋₂, Hₖ₋₁ = solver.Hₖ₋₂, solver.Hₖ₋₁ + τₖ₋₂, τₖ₋₁ = solver.τₖ₋₂, solver.τₖ₋₁ + warm_start = solver.warm_start + RNorms = stats.residuals + reset!(stats) + R₀ = warm_start ? W : B + + # Temporary buffers -- should be stored in the solver + Ψₖ₋₁ = zeros(p, p) + Ψₖ = zeros(p, p) + Ωₖ = zeros(p, p) + Ψₖ₊₁ = zeros(p, p) + Πₖ₋₂ = zeros(p, p) + Γbarₖ₋₁ = zeros(p, p) + Γₖ₋₁ = zeros(p, p) + Λbarₖ = zeros(p, p) + Λₖ = zeros(p, p) + + # Define the blocks D1 and D2 + D1 = view(D, 1:p, :) + D2 = view(D, p+1:2p, :) + trans = FC <: AbstractFloat ? 'T' : 'C' + Φbarₖ = Φₖ = Φbarₖ₊₁ = Φ + + # Coefficients for mul! + α = -one(FC) + β = one(FC) + γ = one(FC) + + # Initial solution X₀. + fill!(X, zero(FC)) + + # Initial residual R₀. + if warm_start + mul!(Q, A, ΔX) + Q .= B .- Q + end + RNorm = norm(R₀) # ‖R₀‖_F + history && push!(RNorms, RNorm) + + iter = 0 + itmax == 0 && (itmax = 2*div(n,p)) + + ε = atol + rtol * RNorm + (verbose > 0) && @printf(iostream, "%5s %7s %5s\n", "k", "‖Rₖ‖", "timer") + kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %.2fs\n", iter, RNorm, start_time |> ktimer) + + # Stopping criterion + status = "unknown" + solved = RNorm ≤ ε + tired = iter ≥ itmax + user_requested_exit = false + overtimed = false + + # Initial Ψ₁ and V₁ + τ = τₖ₋₂ + copyto!(Vₖ, R₀) + if C isa Matrix + householder!(Vₖ, Φbarₖ, τ) + else + householder!(Vₖ, Φbarₖ, τ, solver.tmp) + end + + while !(solved || tired || user_requested_exit || overtimed) + # Update iteration index. + iter = iter + 1 + + # Continue the block-Lanczos process. + mul!(Q, A, Vₖ) # Q ← AVₖ + mul!(Ωₖ, Vₖ', Q) # Ωₖ = Vₖᴴ * Q + (iter ≥ 2) && mul!(Q, Vₖ₋₁, Ψₖ', α, β) # Q ← Q - Vₖ₋₁ * Ψₖᴴ + mul!(Q, Vₖ, Ωₖ, α, β) # Q = Q - Vₖ * Ωₖ + + # Update the QR factorization of Tₖ₊₁.ₖ = Qₖ [ Rₖ ]. + # [ Oᵀ ] + # + # [ Ω₁ Ψ₂ᴴ 0 • • • 0 ] [ Λ₁ Γ₁ Π₁ 0 • • 0 ] + # [ Ψ₂ Ω₂ • • • ] [ 0 Λ₂ Γ₂ • • • ] + # [ 0 • • • • • ] [ • • Λ₃ • • • • ] + # [ • • • • • • • ] = Qₖ [ • • • • • 0 ] + # [ • • • • • 0 ] [ • • • • Πₖ₋₂] + # [ • • • • Ψₖᴴ ] [ • • • Γₖ₋₁] + # [ • • Ψₖ Ωₖ ] [ 0 • • • • 0 Λₖ ] + # [ 0 • • • • 0 Ψₖ₊₁] [ 0 • • • • • 0 ] + # + # If k = 1, we don't have any previous reflection. + # If k = 2, we apply the last reflection. + # If k ≥ 3, we only apply the two previous reflections. + + # Apply previous Householder reflections Θₖ₋₂. + if iter ≥ 3 + D1 .= zero(T) + D2 .= Ψₖ₋₁' + kormqr!('L', trans, Hₖ₋₂, τₖ₋₂, D) + Πₖ₋₂ .= D1 + Γbarₖ₋₁ .= D2 + end + + # Apply previous Householder reflections Θₖ₋₁. + if iter ≥ 2 + (iter == 2) && (Γbarₖ₋₁ .= Ψₖ₋₁') + D1 .= Γbarₖ₋₁ + D2 .= Ωₖ + kormqr!('L', trans, Hₖ₋₁, τₖ₋₁, D) + Γₖ₋₁ .= D1 + Λbarₖ .= D2 + end + + # Vₖ₊₁ and Ψₖ₊₁ are stored in Vₖ₋₁ and C. + τ = τₖ₋₂ + copyto!(Vₖ₋₁, Q) + if C isa Matrix + householder!(Vₖ₋₁, C, τ) + else + householder!(Vₖ₋₁, C, τ, solver.tmp) + end + + # Compute and apply current Householder reflection θₖ. + Ψₖ₊₁ = C + Hₖ = Hₖ₋₂ + τₖ = τₖ₋₂ + (iter == 1) && (Λbarₖ .= Ωₖ) + Hₖ[1:p,:] .= Λbarₖ + Hₖ[p+1:2p,:] .= Ψₖ₊₁ + if C isa Matrix + householder!(Hₖ, Ψₖ₊₁, τₖ, compact=true) + else + householder!(Hₖ, Ψₖ₊₁, τₖ, solver.tmp, compact=true) + end + Λₖ .= view(Hₖ, 1:p, 1:p) + + # Update Zₖ = (Qₖ)ᴴΨ₁E₁ = (Φ₁, ..., Φₖ, Φbarₖ₊₁) + D1 .= Φbarₖ + D2 .= zero(FC) + kormqr!('L', trans, Hₖ, τₖ, D) + Φₖ .= D1 + + # Compute the directions Wₖ, the last columns of Wₖ = Vₖ(Rₖ)⁻¹ ⟷ (Rₖ)ᵀ(Wₖ)ᵀ = (Vₖ)ᵀ + # (Λ₁)ᵀw₁ = v₁ + if iter == 1 + wₖ = wₖ₋₁ + wₖ .+= Vₖ + ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + end + # (Λ₂)ᵀw₂ = v₂ - (Γ₁)ᵀw₁ + if iter == 2 + wₖ = wₖ₋₂ + transpose(wₖ) .-= transpose(Γₖ₋₁) * transpose(wₖ₋₁) + wₖ .+= Vₖ + ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + end + # (Λₖ)ᵀwₖ = vₖ - (Γₖ₋₁)ᵀwₖ₋₁ - (Πₖ₋₂)ᵀwₖ₋₂ + if iter ≥ 3 + wₖ₋₂ .= (wₖ₋₂ * Πₖ₋₂) + # lmul!(transpose(Πₖ₋₂), transpose(wₖ₋₂)) + wₖ = wₖ₋₂ + transpose(wₖ) .-= transpose(Γₖ₋₁) * transpose(wₖ₋₁) + wₖ .+= Vₖ + ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + end + + # Update Xₖ = VₖYₖ = WₖZₖ + # Xₖ = Xₖ₋₁ + wₖ * Φₖ + mul!(X, wₖ, Φₖ, γ, β) + + # Update residual norm estimate. + # ‖ M(B - AXₖ) ‖_F = ‖Φbarₖ₊₁‖_F + Φbarₖ₊₁ .= D2 + RNorm = norm(Φbarₖ₊₁) + history && push!(RNorms, RNorm) + + # Compute vₖ and vₖ₊₁ + copyto!(Vₖ₋₁, Vₖ) # vₖ₋₁ ← vₖ + copyto!(Vₖ, Q) # vₖ ← vₖ₊₁ + + # Update directions for X + if iter ≥ 2 + @kswap!(wₖ₋₂, wₖ₋₁) + end + + # Update other variables... + if iter ≥ 2 + @kswap!(Hₖ₋₂, Hₖ₋₁) + @kswap!(τₖ₋₂, τₖ₋₁) + copyto!(Ψₖ₋₁, Ψₖ) + end + copyto!(Ψₖ, Ψₖ₊₁) + + # Update stopping criterion. + user_requested_exit = callback(solver) :: Bool + solved = RNorm ≤ ε + tired = iter ≥ itmax + timer = time_ns() - start_time + overtimed = timer > timemax_ns + kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %.2fs\n", iter, RNorm, start_time |> ktimer) + end + (verbose > 0) && @printf(iostream, "\n") + + # Termination status + tired && (status = "maximum number of iterations exceeded") + solved && (status = "solution good enough given atol and rtol") + overtimed && (status = "time limit exceeded") + user_requested_exit && (status = "user-requested exit") + + # Update Xₖ + warm_start && (X .+= ΔX) + solver.warm_start = false + + # Update stats + stats.niter = iter + stats.solved = solved + stats.timer = start_time |> ktimer + stats.status = status + return solver + end +end diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl index b3fcc6efe..c5605a229 100644 --- a/src/krylov_processes.jl +++ b/src/krylov_processes.jl @@ -5,7 +5,7 @@ export hermitian_lanczos, nonhermitian_lanczos, arnoldi, golub_kahan, saunders_s #### Input arguments -* `A`: a linear operator that models an Hermitian matrix of dimension `n`; +* `A`: a linear operator that models a Hermitian matrix of dimension `n`; * `b`: a vector of length `n`; * `k`: the number of iterations of the Hermitian Lanczos process. diff --git a/src/krylov_solve.jl b/src/krylov_solve.jl index 400bb6ebf..e8133be85 100644 --- a/src/krylov_solve.jl +++ b/src/krylov_solve.jl @@ -176,37 +176,67 @@ end # Block-Krylov methods for (workspace, krylov, args, def_args, optargs, def_optargs, kwargs, def_kwargs) in [ - (:BlockGmresSolver, :block_gmres, args_block_gmres, def_args_block_gmres, optargs_block_gmres, def_optargs_block_gmres, kwargs_block_gmres, def_kwargs_block_gmres), + (:BlockMinresSolver, :block_minres, args_block_minres, def_args_block_minres, optargs_block_minres, def_optargs_block_minres, kwargs_block_minres, def_kwargs_block_minres) + (:BlockGmresSolver , :block_gmres , args_block_gmres , def_args_block_gmres , optargs_block_gmres , def_optargs_block_gmres , kwargs_block_gmres , def_kwargs_block_gmres ) ] # Create the symbol for the in-place method krylov! = Symbol(krylov, :!) - @eval begin - ## Out-of-place - function $(krylov)($(def_args...); memory :: Int=20, $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} - start_time = time_ns() - solver = $workspace(A, B, memory) - elapsed_time = start_time |> ktimer - timemax -= elapsed_time - $(krylov!)(solver, $(args...); $(kwargs...)) - solver.stats.timer += elapsed_time - return results(solver) - end - - if !isempty($optargs) - function $(krylov)($(def_args...), $(def_optargs...); memory :: Int=20, $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + ## Out-of-place + if krylov == :block_gmres + @eval begin + function $(krylov)($(def_args...); memory :: Int=20, $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} start_time = time_ns() solver = $workspace(A, B, memory) - warm_start!(solver, $(optargs...)) - elapsed_time = start_time |> ktimer + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + $(krylov!)(solver, $(args...); $(kwargs...)) + solver.stats.timer += elapsed_time + return results(solver) + end + + if !isempty($optargs) + function $(krylov)($(def_args...), $(def_optargs...); memory :: Int=20, $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + start_time = time_ns() + solver = $workspace(A, B, memory) + warm_start!(solver, $(optargs...)) + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + $(krylov!)(solver, $(args...); $(kwargs...)) + solver.stats.timer += elapsed_time + return results(solver) + end + end + end + else + @eval begin + function $(krylov)($(def_args...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + start_time = time_ns() + solver = $workspace(A, B) + elapsed_time = ktimer(start_time) timemax -= elapsed_time $(krylov!)(solver, $(args...); $(kwargs...)) solver.stats.timer += elapsed_time return results(solver) end + + if !isempty($optargs) + function $(krylov)($(def_args...), $(def_optargs...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + start_time = time_ns() + solver = $workspace(A, B) + warm_start!(solver, $(optargs...)) + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + $(krylov!)(solver, $(args...); $(kwargs...)) + solver.stats.timer += elapsed_time + return results(solver) + end + end end + end - ## In-place + ## In-place + @eval begin solve!(solver :: $workspace{T,FC,SV,SM}, $(def_args...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} = $(krylov!)(solver, $(args...); $(kwargs...)) if !isempty($optargs) diff --git a/src/usymlq.jl b/src/usymlq.jl index 7a8172500..a92913f99 100644 --- a/src/usymlq.jl +++ b/src/usymlq.jl @@ -281,7 +281,7 @@ kwargs_usymlq = (:transfer_to_usymcg, :atol, :rtol, :itmax, :timemax, :verbose, kaxpby!(n, -cₖ, uₖ, conj(sₖ), d̅) end - # Compute uₖ₊₁ and uₖ₊₁. + # Compute vₖ₊₁ and uₖ₊₁. kcopy!(m, vₖ₋₁, vₖ) # vₖ₋₁ ← vₖ kcopy!(n, uₖ₋₁, uₖ) # uₖ₋₁ ← uₖ diff --git a/src/usymqr.jl b/src/usymqr.jl index fa86d4524..98e9e87d0 100644 --- a/src/usymqr.jl +++ b/src/usymqr.jl @@ -293,7 +293,7 @@ kwargs_usymqr = (:atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, AᴴrNorm = abs(ζbarₖ) * √(abs2(δbarₖ) + abs2(cₖ₋₁ * γₖ₊₁)) history && push!(AᴴrNorms, AᴴrNorm) - # Compute uₖ₊₁ and uₖ₊₁. + # Compute vₖ₊₁ and uₖ₊₁. kcopy!(m, vₖ₋₁, vₖ) # vₖ₋₁ ← vₖ kcopy!(n, uₖ₋₁, uₖ) # uₖ₋₁ ← uₖ diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index 3b595f289..2690f20dc 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -79,7 +79,7 @@ include("gpu.jl") Krylov.kcopy!(n, y, x) end - @testset "kswap -- $FC" begin + @testset "kswap! -- $FC" begin Krylov.@kswap!(x, y) end diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index dcd7a007a..dd7c7140e 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -1,18 +1,20 @@ using SparseArrays, Random, Test using LinearAlgebra, Krylov, KernelAbstractions -@kernel function copy_triangle_kernel!(dest, src) - i, j = @index(Global, NTuple) - if j >= i - @inbounds dest[i, j] = src[i, j] +if VERSION < v"1.11" + @kernel function copy_triangle_kernel!(dest, src) + i, j = @index(Global, NTuple) + if j >= i + @inbounds dest[i, j] = src[i, j] + end end -end -function Krylov.copy_triangle(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, k::Int) where FC <: Krylov.FloatOrComplex - backend = get_backend(Q) - ndrange = (k, k) - copy_triangle_kernel!(backend)(R, Q; ndrange=ndrange) - KernelAbstractions.synchronize(backend) + function Krylov.copy_triangle(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, k::Int) where FC <: Krylov.FloatOrComplex + backend = get_backend(Q) + ndrange = (k, k) + copy_triangle_kernel!(backend)(R, Q; ndrange=ndrange) + KernelAbstractions.synchronize(backend) + end end Random.seed!(666) diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index c3ee22e4f..0498f2f01 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -62,7 +62,7 @@ include("gpu.jl") Krylov.kcopy!(n, y, x) end - @testset "kswap -- $FC" begin + @testset "kswap! -- $FC" begin Krylov.@kswap!(x, y) end diff --git a/test/gpu/metal.jl b/test/gpu/metal.jl index 5c90845c4..290b0e4d2 100644 --- a/test/gpu/metal.jl +++ b/test/gpu/metal.jl @@ -62,7 +62,7 @@ include("gpu.jl") Krylov.kcopy!(n, y, x) end - @testset "kswap -- $FC" begin + @testset "kswap! -- $FC" begin Krylov.@kswap!(x, y) end diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index 470ca3758..96825df1f 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -162,7 +162,7 @@ include("gpu.jl") Krylov.kcopy!(n, y, x) end - @testset "kswap -- $FC" begin + @testset "kswap! -- $FC" begin Krylov.@kswap!(x, y) end From 60ffac69c0f6dd13e1123a8168d9164d171f8dd1 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 4 Nov 2024 01:22:37 -0600 Subject: [PATCH 2/4] Working version of block_minres --- src/block_minres.jl | 54 +++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/block_minres.jl b/src/block_minres.jl index 46c9e9d7f..c9d8b5106 100644 --- a/src/block_minres.jl +++ b/src/block_minres.jl @@ -113,10 +113,9 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his warm_start = solver.warm_start RNorms = stats.residuals reset!(stats) - R₀ = warm_start ? W : B + R₀ = warm_start ? Q : B # Temporary buffers -- should be stored in the solver - Ψₖ₋₁ = zeros(p, p) Ψₖ = zeros(p, p) Ωₖ = zeros(p, p) Ψₖ₊₁ = zeros(p, p) @@ -200,7 +199,7 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his # Apply previous Householder reflections Θₖ₋₂. if iter ≥ 3 D1 .= zero(T) - D2 .= Ψₖ₋₁' + D2 .= Ψₖ' kormqr!('L', trans, Hₖ₋₂, τₖ₋₂, D) Πₖ₋₂ .= D1 Γbarₖ₋₁ .= D2 @@ -208,7 +207,7 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his # Apply previous Householder reflections Θₖ₋₁. if iter ≥ 2 - (iter == 2) && (Γbarₖ₋₁ .= Ψₖ₋₁') + (iter == 2) && (Γbarₖ₋₁ .= Ψₖ') D1 .= Γbarₖ₋₁ D2 .= Ωₖ kormqr!('L', trans, Hₖ₋₁, τₖ₋₁, D) @@ -216,28 +215,25 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his Λbarₖ .= D2 end - # Vₖ₊₁ and Ψₖ₊₁ are stored in Vₖ₋₁ and C. + # Vₖ₊₁ and Ψₖ₊₁ are stored in Q and Ψₖ₊₁. τ = τₖ₋₂ - copyto!(Vₖ₋₁, Q) if C isa Matrix - householder!(Vₖ₋₁, C, τ) + householder!(Q, Ψₖ₊₁, τ) else - householder!(Vₖ₋₁, C, τ, solver.tmp) + householder!(Q, Ψₖ₊₁, τ, solver.tmp) end # Compute and apply current Householder reflection θₖ. - Ψₖ₊₁ = C Hₖ = Hₖ₋₂ τₖ = τₖ₋₂ (iter == 1) && (Λbarₖ .= Ωₖ) Hₖ[1:p,:] .= Λbarₖ Hₖ[p+1:2p,:] .= Ψₖ₊₁ if C isa Matrix - householder!(Hₖ, Ψₖ₊₁, τₖ, compact=true) + householder!(Hₖ, Λₖ, τₖ, compact=true) else - householder!(Hₖ, Ψₖ₊₁, τₖ, solver.tmp, compact=true) + householder!(Hₖ, Λₖ, τₖ, solver.tmp, compact=true) end - Λₖ .= view(Hₖ, 1:p, 1:p) # Update Zₖ = (Qₖ)ᴴΨ₁E₁ = (Φ₁, ..., Φₖ, Φbarₖ₊₁) D1 .= Φbarₖ @@ -246,31 +242,31 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his Φₖ .= D1 # Compute the directions Wₖ, the last columns of Wₖ = Vₖ(Rₖ)⁻¹ ⟷ (Rₖ)ᵀ(Wₖ)ᵀ = (Vₖ)ᵀ - # (Λ₁)ᵀw₁ = v₁ + # w₁Λ₁ = v₁ if iter == 1 wₖ = wₖ₋₁ - wₖ .+= Vₖ - ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + wₖ .= Vₖ + rdiv!(wₖ, UpperTriangular(Λₖ)) end - # (Λ₂)ᵀw₂ = v₂ - (Γ₁)ᵀw₁ + # w₂Λ₂ = v₂ - w₁Γ₁ if iter == 2 wₖ = wₖ₋₂ - transpose(wₖ) .-= transpose(Γₖ₋₁) * transpose(wₖ₋₁) + wₖ .= (-wₖ₋₁ * Γₖ₋₁) wₖ .+= Vₖ - ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + rdiv!(wₖ, UpperTriangular(Λₖ)) end - # (Λₖ)ᵀwₖ = vₖ - (Γₖ₋₁)ᵀwₖ₋₁ - (Πₖ₋₂)ᵀwₖ₋₂ + # wₖΛₖ = vₖ - wₖ₋₁Γₖ₋₁ - wₖ₋₂Πₖ₋₂ if iter ≥ 3 - wₖ₋₂ .= (wₖ₋₂ * Πₖ₋₂) - # lmul!(transpose(Πₖ₋₂), transpose(wₖ₋₂)) wₖ = wₖ₋₂ - transpose(wₖ) .-= transpose(Γₖ₋₁) * transpose(wₖ₋₁) + wₖ .= (-wₖ₋₂ * Πₖ₋₂) + wₖ .= (wₖ - wₖ₋₁ * Γₖ₋₁) wₖ .+= Vₖ - ldiv!(LowerTriangular(Λₖ |> transpose), transpose(wₖ)) + rdiv!(wₖ, UpperTriangular(Λₖ)) end # Update Xₖ = VₖYₖ = WₖZₖ # Xₖ = Xₖ₋₁ + wₖ * Φₖ + R = B - A * X mul!(X, wₖ, Φₖ, γ, β) # Update residual norm estimate. @@ -283,16 +279,16 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his copyto!(Vₖ₋₁, Vₖ) # vₖ₋₁ ← vₖ copyto!(Vₖ, Q) # vₖ ← vₖ₊₁ - # Update directions for X + # Update directions for X and other variables... if iter ≥ 2 @kswap!(wₖ₋₂, wₖ₋₁) - end - - # Update other variables... - if iter ≥ 2 @kswap!(Hₖ₋₂, Hₖ₋₁) @kswap!(τₖ₋₂, τₖ₋₁) - copyto!(Ψₖ₋₁, Ψₖ) + end + + if iter == 1 + copyto!(Hₖ₋₁, Hₖ) + copyto!(τₖ₋₁, τₖ) end copyto!(Ψₖ, Ψₖ₊₁) From 1ab71053eac84e30aa4d405533c653dd099f65a3 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 4 Nov 2024 01:31:49 -0600 Subject: [PATCH 3/4] Test block_minres on GPUs --- test/gpu/amd.jl | 9 +++++++++ test/gpu/intel.jl | 11 ++++++++++- test/gpu/nvidia.jl | 9 +++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index 2690f20dc..a336dc3a4 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -103,6 +103,15 @@ include("gpu.jl") @test norm(b - A * x) ≤ atol + rtol * norm(b) end + @testset "block-MINRES -- $FC" begin + A, b = symmetric_indefinite(FC=FC) + B = hcat(b, -b) + A = M(A) + B = M(B) + X, stats = block_minres(A, B) + @test norm(B - A * X) ≤ atol + rtol * norm(B) + end + @testset "block-GMRES -- $FC" begin A, b = nonsymmetric_indefinite(FC=FC) B = hcat(b, -b) diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 0498f2f01..aa50e4e86 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -86,8 +86,17 @@ include("gpu.jl") @test norm(b - A * x) ≤ atol + rtol * norm(b) end + @testset "block-MINRES -- $FC" begin + A, b = symmetric_indefinite(FC=FC) + B = hcat(b, -b) + A = M(A) + B = M(B) + X, stats = block_minres(A, B) + @test norm(B - A * X) ≤ atol + rtol * norm(B) + end + @testset "block-GMRES -- $FC" begin - A, b = symmetric_definite(FC=FC) + A, b = nonsymmetric_definite(FC=FC) B = hcat(b, -b) A = M(A) B = M(B) diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index 96825df1f..b6020b641 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -186,6 +186,15 @@ include("gpu.jl") @test norm(b - A * x) ≤ atol + rtol * norm(b) end + @testset "block-MINRES -- $FC" begin + A, b = symmetric_indefinite(FC=FC) + B = hcat(b, -b) + A = M(A) + B = M(B) + X, stats = block_minres(A, B) + @test norm(B - A * X) ≤ atol + rtol * norm(B) + end + @testset "block-GMRES -- $FC" begin A, b = nonsymmetric_indefinite(FC=FC) B = hcat(b, -b) From fb781d2230bfafb478aefd91bc1c5692270c7996 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 4 Nov 2024 01:39:29 -0600 Subject: [PATCH 4/4] Update the type of the temporary buffers --- src/block_minres.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/block_minres.jl b/src/block_minres.jl index c9d8b5106..a35613b83 100644 --- a/src/block_minres.jl +++ b/src/block_minres.jl @@ -116,14 +116,14 @@ kwargs_block_minres = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :his R₀ = warm_start ? Q : B # Temporary buffers -- should be stored in the solver - Ψₖ = zeros(p, p) - Ωₖ = zeros(p, p) - Ψₖ₊₁ = zeros(p, p) - Πₖ₋₂ = zeros(p, p) - Γbarₖ₋₁ = zeros(p, p) - Γₖ₋₁ = zeros(p, p) - Λbarₖ = zeros(p, p) - Λₖ = zeros(p, p) + Ψₖ = similar(B, p, p) + Ωₖ = similar(B, p, p) + Ψₖ₊₁ = similar(B, p, p) + Πₖ₋₂ = similar(B, p, p) + Γbarₖ₋₁ = similar(B, p, p) + Γₖ₋₁ = similar(B, p, p) + Λbarₖ = similar(B, p, p) + Λₖ = similar(B, p, p) # Define the blocks D1 and D2 D1 = view(D, 1:p, :)