diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 000000000..d2cbb0258
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,67 @@
+steps:
+ - label: "Nvidia GPUs -- CUDA.jl"
+ plugins:
+ - JuliaCI/julia#v1:
+ version: 1.8
+ agents:
+ queue: "juliagpu"
+ cuda: "*"
+ command: |
+ julia --color=yes --project -e '
+ using Pkg
+ Pkg.add("CUDA")
+ Pkg.add("LinearOperators")
+ Pkg.instantiate()
+ include("test/gpu/nvidia.jl")'
+ timeout_in_minutes: 30
+
+ - label: "AMD GPUs -- AMDGPU.jl"
+ plugins:
+ - JuliaCI/julia#v1:
+ version: 1.9-nightly
+ agents:
+ queue: "juliagpu"
+ rocm: "*"
+ rocmgpu: "*"
+ env:
+ JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+ JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+ command: |
+ julia --color=yes --project -e '
+ using Pkg
+ # Pkg.add("AMDGPU")
+ Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
+ Pkg.instantiate()
+ include("test/gpu/amd.jl")'
+ timeout_in_minutes: 30
+
+ - label: "Intel GPUs -- oneAPI.jl"
+ plugins:
+ - JuliaCI/julia#v1:
+ version: 1.8
+ agents:
+ queue: "juliagpu"
+ intel: "*"
+ command: |
+ julia --color=yes --project -e '
+ using Pkg
+ Pkg.add("oneAPI")
+ Pkg.instantiate()
+ include("test/gpu/intel.jl")'
+ timeout_in_minutes: 30
+
+ - label: "Apple M1 GPUs -- Metal.jl"
+ plugins:
+ - JuliaCI/julia#v1:
+ version: 1.8
+ agents:
+ queue: "juliaecosystem"
+ os: "macos"
+ arch: "aarch64"
+ command: |
+ julia --color=yes --project -e '
+ using Pkg
+ Pkg.add("Metal")
+ Pkg.instantiate()
+ include("test/gpu/metal.jl")'
+ timeout_in_minutes: 30
diff --git a/.cirrus.yml b/.cirrus.yml
index d559cf609..f51d815a3 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -1,15 +1,41 @@
-freebsd_instance:
- image: freebsd-13-0-release-amd64
task:
- name: FreeBSD
- env:
- matrix:
- - JULIA_VERSION: 1.6
- - JULIA_VERSION: 1
- - JULIA_VERSION: nightly
- allow_failures: $JULIA_VERSION == 'nightly'
- install_script:
- - sh -c "$(fetch https://raw.githubusercontent.com/ararslan/CirrusCI.jl/master/bin/install.sh -o -)"
+ matrix:
+ - name: FreeBSD
+ freebsd_instance:
+ image_family: freebsd-13-1
+ env:
+ matrix:
+ - JULIA_VERSION: 1.6
+ - JULIA_VERSION: 1
+ - name: Linux ARMv8
+ arm_container:
+ image: ubuntu:latest
+ env:
+ - JULIA_VERSION: 1
+ - name: musl Linux
+ container:
+ image: alpine:3.14
+ env:
+ - JULIA_VERSION: 1
+ - name: MacOS M1
+ macos_instance:
+ image: ghcr.io/cirruslabs/macos-monterey-base:latest
+ env:
+ - JULIA_VERSION: 1
+ install_script: |
+ URL="https://raw.githubusercontent.com/ararslan/CirrusCI.jl/master/bin/install.sh"
+ set -x
+ if [ "$(uname -s)" = "Linux" ] && command -v apt; then
+ apt update
+ apt install -y curl
+ fi
+ if command -v curl; then
+ sh -c "$(curl ${URL})"
+ elif command -v wget; then
+ sh -c "$(wget ${URL} -q -O-)"
+ elif command -v fetch; then
+ sh -c "$(fetch ${URL} -o -)"
+ fi
build_script:
- cirrusjl build
test_script:
diff --git a/.github/workflows/Aqua.yml b/.github/workflows/Aqua.yml
new file mode 100644
index 000000000..da872e225
--- /dev/null
+++ b/.github/workflows/Aqua.yml
@@ -0,0 +1,17 @@
+name: Aqua
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ types: [opened, synchronize, reopened]
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: julia-actions/setup-julia@latest
+ with:
+ version: '1'
+ - name: Aqua.jl
+ run: julia --color=yes -e 'using Pkg; Pkg.add("Aqua"); Pkg.develop(path="."); using Aqua, Krylov; Aqua.test_all(Krylov)'
diff --git a/.github/workflows/Breakage.yml b/.github/workflows/Breakage.yml
index 266eed3cc..8fd92afdd 100644
--- a/.github/workflows/Breakage.yml
+++ b/.github/workflows/Breakage.yml
@@ -24,14 +24,14 @@ jobs:
pkgversion: [latest, stable]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
# Install Julia
- uses: julia-actions/setup-julia@v1
with:
version: '1'
arch: x64
- - uses: actions/cache@v1
+ - uses: actions/cache@v3
env:
cache-name: cache-artifacts
with:
@@ -85,7 +85,7 @@ jobs:
end;
end'
- - uses: actions/upload-artifact@v2
+ - uses: actions/upload-artifact@v3
with:
name: pr
path: pr/
@@ -94,9 +94,9 @@ jobs:
needs: break
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- - uses: actions/download-artifact@v2
+ - uses: actions/download-artifact@v3
with:
name: pr
path: pr/
@@ -127,7 +127,7 @@ jobs:
fi
done >> MSG
- - uses: actions/upload-artifact@v2
+ - uses: actions/upload-artifact@v3
with:
name: pr
path: pr/
diff --git a/.github/workflows/CI_M1.yml b/.github/workflows/CI_M1.yml
deleted file mode 100644
index 6f9aa720b..000000000
--- a/.github/workflows/CI_M1.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: CI_M1
-on:
- push:
- branches:
- - main
- pull_request:
- types: [opened, synchronize, reopened]
-jobs:
- test:
- name: Julia ${{ matrix.version }} - macOS - ${{ matrix.arch }} - ${{ github.event_name }}
- runs-on: self-hosted
- strategy:
- fail-fast: false
- matrix:
- version:
- - '1'
- arch:
- - aarch64
- steps:
- - uses: actions/checkout@v3
- - uses: julia-actions/setup-julia@v1
- with:
- version: ${{ matrix.version }}
- arch: ${{ matrix.arch }}
- - name: Version Info
- shell: julia --color=yes {0}
- run: |
- using InteractiveUtils
- versioninfo()
- - uses: julia-actions/julia-buildpkg@v1
- - uses: julia-actions/julia-runtest@v1
diff --git a/.github/workflows/CommentPR.yml b/.github/workflows/CommentPR.yml
index 14f6dcd47..043113f74 100644
--- a/.github/workflows/CommentPR.yml
+++ b/.github/workflows/CommentPR.yml
@@ -39,16 +39,36 @@ jobs:
- run: unzip pr.zip
- name: 'Comment on PR'
- uses: actions/github-script@v3
+ uses: actions/github-script@v6
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
- var fs = require('fs');
- var issue_number = Number(fs.readFileSync('./NR'));
- var msg = fs.readFileSync('./MSG', 'utf8');
- await github.issues.createComment({
+ var fs = require('fs')
+ var issue_number = Number(fs.readFileSync('./NR'))
+ var msg = fs.readFileSync('./MSG', 'utf8')
+
+ // Get the existing comments.
+ const {data: comments} = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
- issue_number: issue_number,
- body: msg
- });
+ issue_number: issue_number
+ })
+
+ // Find any comment already made by the bot.
+ const botComment = comments.find(comment => comment.user.id === 41898282)
+
+ if (botComment) {
+ await github.rest.issues.updateComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ comment_id: botComment.id,
+ body: msg
+ })
+ } else {
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: issue_number,
+ body: msg
+ })
+ }
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index b546a8082..7a9c79fd4 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -1,19 +1,44 @@
name: CompatHelper
-
on:
schedule:
- - cron: '00 00 * * *'
-
+ - cron: 0 0 * * *
+ workflow_dispatch:
+permissions:
+ contents: write
+ pull-requests: write
jobs:
CompatHelper:
runs-on: ubuntu-latest
steps:
- - uses: julia-actions/setup-julia@latest
+ - name: Check if Julia is already available in the PATH
+ id: julia_in_path
+ run: which julia
+ continue-on-error: true
+ - name: Install Julia, but only if it is not already available in the PATH
+ uses: julia-actions/setup-julia@v1
with:
version: '1'
- - name: CompatHelper
- run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- - name: CompatHelper.main()
+ arch: ${{ runner.arch }}
+ if: steps.julia_in_path.outcome != 'success'
+ - name: "Add the General registry via Git"
+ run: |
+ import Pkg
+ ENV["JULIA_PKG_SERVER"] = ""
+ Pkg.Registry.add("General")
+ shell: julia --color=yes {0}
+ - name: "Install CompatHelper"
+ run: |
+ import Pkg
+ name = "CompatHelper"
+ uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
+ version = "3"
+ Pkg.add(; name, uuid, version)
+ shell: julia --color=yes {0}
+ - name: "Run CompatHelper"
+ run: |
+ import CompatHelper
+ CompatHelper.main()
+ shell: julia --color=yes {0}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: julia -e 'using CompatHelper; CompatHelper.main()'
+ COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml
index be0b86584..406f15e0d 100644
--- a/.github/workflows/Documentation.yml
+++ b/.github/workflows/Documentation.yml
@@ -10,12 +10,12 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: julia-actions/setup-julia@latest
with:
version: '1'
- name: Install dependencies
- run: julia --project=docs -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
+ run: julia --project=docs --color=yes -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/Invalidations.yml b/.github/workflows/Invalidations.yml
new file mode 100644
index 000000000..b0c37e05f
--- /dev/null
+++ b/.github/workflows/Invalidations.yml
@@ -0,0 +1,43 @@
+name: Invalidations
+# Uses SnoopCompile to evaluate number of invalidations caused by `using` the package
+# using https://github.com/julia-actions/julia-invalidations
+# Based on https://github.com/julia-actions/julia-invalidations
+
+on:
+ pull_request:
+
+concurrency:
+ # Skip intermediate builds: always.
+ # Cancel intermediate builds: always.
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ evaluate:
+ # Only run on PRs to the default branch.
+ # In the PR trigger above branches can be specified only explicitly whereas this check should work for master, main, or any other default branch
+ if: github.base_ref == github.event.repository.default_branch
+ runs-on: ubuntu-latest
+ steps:
+ - uses: julia-actions/setup-julia@v1
+ with:
+ version: '1'
+ - uses: actions/checkout@v3
+ - uses: julia-actions/julia-buildpkg@v1
+ - uses: julia-actions/julia-invalidations@v1
+ id: invs_pr
+
+ - uses: actions/checkout@v3
+ with:
+ ref: ${{ github.event.repository.default_branch }}
+ - uses: julia-actions/julia-buildpkg@v1
+ - uses: julia-actions/julia-invalidations@v1
+ id: invs_default
+
+ - name: Report invalidation counts
+ run: |
+ echo "Invalidations on default branch: ${{ steps.invs_default.outputs.total }} (${{ steps.invs_default.outputs.deps }} via deps)" >> $GITHUB_STEP_SUMMARY
+ echo "This branch: ${{ steps.invs_pr.outputs.total }} (${{ steps.invs_pr.outputs.deps }} via deps)" >> $GITHUB_STEP_SUMMARY
+ - name: Check if the PR does increase number of invalidations
+ if: steps.invs_pr.outputs.total > steps.invs_default.outputs.total
+ run: exit 1
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 409e0d146..9e1791f48 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,12 +31,12 @@ jobs:
arch: x64
allow_failure: true
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- - uses: actions/cache@v1
+ - uses: actions/cache@v3
env:
cache-name: cache-artifacts
with:
@@ -49,6 +49,6 @@ jobs:
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- - uses: codecov/codecov-action@v1
+ - uses: codecov/codecov-action@v3
with:
- file: lcov.info
+ files: lcov.info
diff --git a/Project.toml b/Project.toml
index a91e07b8a..6249e13f4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
name = "Krylov"
uuid = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
-version = "0.8.3"
+version = "0.9.0"
[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/README.md b/README.md
index a4664e187..55476e684 100644
--- a/README.md
+++ b/README.md
@@ -71,22 +71,22 @@ Overdetermined systems are less common but also occur.
4. Adjoint systems
- Ax = b and Aᵀy = c
+ Ax = b and Aᴴy = c
where **_A_** can have any shape.
-5. Saddle-point and symmetric quasi-definite (SQD) systems
+5. Saddle-point and Hermitian quasi-definite systems
[M A] [x] = [b]
- [Aᵀ -N] [y] [c]
+ [Aᴴ -N] [y] [c]
where **_A_** can have any shape.
-6. Generalized saddle-point and unsymmetric partitioned systems
+6. Generalized saddle-point and non-Hermitian partitioned systems
[M A] [x] = [b]
@@ -94,7 +94,7 @@ where **_A_** can have any shape.
[B N] [y] [c]
-where **_A_** can have any shape and **_B_** has the shape of **_Aᵀ_**.
+where **_A_** can have any shape and **_B_** has the shape of **_Aᴴ_**.
**_A_**, **_B_**, **_b_** and **_c_** must be all nonzero.
Krylov solvers are particularly appropriate in situations where such problems must be solved but a factorization is not possible, either because:
@@ -121,3 +121,10 @@ julia> ]
pkg> add Krylov
pkg> test Krylov
```
+
+## Bug reports and discussions
+
+If you think you found a bug, feel free to open an [issue](https://github.com/JuliaSmoothOptimizers/Krylov.jl/issues).
+Focused suggestions and requests can also be opened as issues. Please start an issue or a discussion on the topic before opening a pull request.
+
+If you want to ask a question not suited for a bug report, feel free to start a discussion [here](https://github.com/JuliaSmoothOptimizers/Organization/discussions). This forum is for general discussion about this repository and the [JuliaSmoothOptimizers](https://github.com/JuliaSmoothOptimizers) organization, so questions about any of our packages are welcome.
diff --git a/docs/make.jl b/docs/make.jl
index 57ad87cd2..441ddb3ee 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -6,23 +6,26 @@ makedocs(
linkcheck = true,
strict = true,
format = Documenter.HTML(assets = ["assets/style.css"],
- ansicolor=true,
+ ansicolor = true,
prettyurls = get(ENV, "CI", nothing) == "true",
collapselevel = 1),
sitename = "Krylov.jl",
pages = ["Home" => "index.md",
"API" => "api.md",
- "Krylov methods" => ["Symmetric positive definite linear systems" => "solvers/spd.md",
- "Symmetric indefinite linear systems" => "solvers/sid.md",
- "Unsymmetric linear systems" => "solvers/unsymmetric.md",
+ "Krylov processes" => "processes.md",
+ "Krylov methods" => ["Hermitian positive definite linear systems" => "solvers/spd.md",
+ "Hermitian indefinite linear systems" => "solvers/sid.md",
+ "Non-Hermitian square linear systems" => "solvers/unsymmetric.md",
"Least-norm problems" => "solvers/ln.md",
"Least-squares problems" => "solvers/ls.md",
"Adjoint systems" => "solvers/as.md",
- "Saddle-point and symmetric quasi-definite systems" => "solvers/sp_sqd.md",
- "Generalized saddle-point and unsymmetric partitioned systems" => "solvers/gsp.md"],
+ "Saddle-point and Hermitian quasi-definite systems" => "solvers/sp_sqd.md",
+ "Generalized saddle-point and non-Hermitian partitioned systems" => "solvers/gsp.md"],
"In-place methods" => "inplace.md",
+ "Preconditioners" => "preconditioners.md",
+ "Storage requirements" => "storage.md",
"GPU support" => "gpu.md",
- "Warm start" => "warm_start.md",
+ "Warm-start" => "warm-start.md",
"Factorization-free operators" => "factorization-free.md",
"Callbacks" => "callbacks.md",
"Performance tips" => "tips.md",
diff --git a/docs/src/api.md b/docs/src/api.md
index 7f2f4dff7..238c86f1a 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -48,6 +48,7 @@ LnlqSolver
CraigSolver
CraigmrSolver
GpmrSolver
+FgmresSolver
```
## Utilities
@@ -60,4 +61,6 @@ Krylov.vec2str
Krylov.ktypeof
Krylov.kzeros
Krylov.kones
+Krylov.vector_to_matrix
+Krylov.matrix_to_vector
```
diff --git a/docs/src/callbacks.md b/docs/src/callbacks.md
index f44018687..91e0b521c 100644
--- a/docs/src/callbacks.md
+++ b/docs/src/callbacks.md
@@ -1,43 +1,80 @@
-## Callbacks
+# [Callbacks](@id callbacks)
-Each Krylov method is able to call a callback function as `callback(solver)` at each iteration. The callback should return `true` if the main loop should terminate, and `false` otherwise.
+Each Krylov method is able to call a callback function as `callback(solver)` at each iteration.
+The callback should return `true` if the main loop should terminate, and `false` otherwise.
If the method terminated because of the callback, the output status will be `"user-requested exit"`.
-For example, if the user defines `my_callback(solver::MinresSolver)`, it can be passed to the solver using
+For example, if the user defines `minres_callback(solver::MinresSolver)`, it can be passed to the solver using
```julia
-(x, stats) = minres(A, b, callback = my_callback)
+(x, stats) = minres(A, b, callback = minres_callback)
```
-If you need to write a callback that uses variables that are not in the `MinresSolver`, use a closure:
+If you need to write a callback that uses variables that are not in a `KrylovSolver`, use a closure:
```julia
-function my_callback2(solver::MinresSolver, A, b, storage_vec, tol::Float64)
- mul!(storage_vec, A, solver.x)
- storage_vec .-= b
- return norm(storage_vec) ≤ tol # tolerance based on the 2-norm of the residual
+function custom_stopping_condition(solver::KrylovSolver, A, b, r, tol)
+ mul!(r, A, solver.x)
+ r .-= b # r := b - Ax
+ bool = norm(r) ≤ tol # tolerance based on the 2-norm of the residual
+ return bool
end
-storage_vec = similar(b)
-(x, stats) = minres(A, b, callback = solver -> my_callback2(solver, A, b, storage_vec, 0.1))
+r = similar(b)  # residual vector
+tol = 1e-6
+cg_callback(solver) = custom_stopping_condition(solver, A, b, r, tol)
+(x, stats) = cg(A, b, callback = cg_callback)
```
Alternatively, use a structure and make it callable:
```julia
-mutable struct MyCallback3{S, M}
- A::M
- b::S
- storage_vec::S
- tol::Float64
+mutable struct CallbackWorkspace{T}
+ A::Matrix{T}
+ b::Vector{T}
+ r::Vector{T}
+ tol::T
end
-MyCallback3(A, b; tol = 0.1) = MyCallback3(A, b, similar(b), tol)
-function (my_cb::MyCallback3)(solver)
- mul!(my_cb.storage_vec, my_cb.A, solver.x)
- my_cb.storage_vec .-= my_cb.b
- return norm(my_cb.storage_vec) ≤ my_cb.tol # tolerance based on the 2-norm of the residual
+function (workspace::CallbackWorkspace)(solver::KrylovSolver)
+ mul!(workspace.r, workspace.A, solver.x)
+ workspace.r .-= workspace.b
+ bool = norm(workspace.r) ≤ workspace.tol
+ return bool
end
-my_cb = MyCallback3(A, b; tol = 0.1)
-(x, stats) = minres(A, b, callback = my_cb)
+bicgstab_callback = CallbackWorkspace(A, b, similar(b), 1e-6)
+(x, stats) = bicgstab(A, b, callback = bicgstab_callback)
+```
+
+Although the main goal of a callback is to add new stopping conditions, it can also retrieve information from the workspace of a Krylov method during the iterations.
+We now illustrate how to store all iterates $x_k$ of the GMRES method.
+
+```julia
+S = Krylov.ktypeof(b)
+global X = S[] # Storage for GMRES iterates
+
+function gmres_callback(solver)
+ z = solver.z
+ k = solver.inner_iter
+ nr = sum(1:k)
+ V = solver.V
+ R = solver.R
+ y = copy(z)
+
+ # Solve Rk * yk = zk
+ for i = k : -1 : 1
+ pos = nr + i - k
+ for j = k : -1 : i+1
+ y[i] = y[i] - R[pos] * y[j]
+ pos = pos - j + 1
+ end
+ y[i] = y[i] / R[pos]
+ end
+
+ # xk = Vk * yk
+ xk = sum(V[i] * y[i] for i = 1:k)
+ push!(X, xk)
+
+ return false # We don't want to add new stopping conditions
+end
+
+(x, stats) = gmres(A, b, callback = gmres_callback)
```
diff --git a/docs/src/examples/tricg.md b/docs/src/examples/tricg.md
index e981c2f7e..61750de5f 100644
--- a/docs/src/examples/tricg.md
+++ b/docs/src/examples/tricg.md
@@ -14,7 +14,7 @@ N = diagm(0 => [5.0 * i for i = 1:n])
c = -b
# [I A] [x] = [b]
-# [Aᵀ -I] [y] [c]
+# [Aᴴ -I] [y] [c]
(x, y, stats) = tricg(A, b, c)
K = [eye(m) A; A' -eye(n)]
B = [b; c]
@@ -23,7 +23,7 @@ resid = norm(r)
@printf("TriCG: Relative residual: %8.1e\n", resid)
# [-I A] [x] = [b]
-# [ Aᵀ I] [y] [c]
+# [ Aᴴ I] [y] [c]
(x, y, stats) = tricg(A, b, c, flip=true)
K = [-eye(m) A; A' eye(n)]
B = [b; c]
@@ -32,7 +32,7 @@ resid = norm(r)
@printf("TriCG: Relative residual: %8.1e\n", resid)
# [I A] [x] = [b]
-# [Aᵀ I] [y] [c]
+# [Aᴴ I] [y] [c]
(x, y, stats) = tricg(A, b, c, spd=true)
K = [eye(m) A; A' eye(n)]
B = [b; c]
@@ -41,7 +41,7 @@ resid = norm(r)
@printf("TriCG: Relative residual: %8.1e\n", resid)
# [-I A] [x] = [b]
-# [ Aᵀ -I] [y] [c]
+# [ Aᴴ -I] [y] [c]
(x, y, stats) = tricg(A, b, c, snd=true)
K = [-eye(m) A; A' -eye(n)]
B = [b; c]
@@ -50,7 +50,7 @@ resid = norm(r)
@printf("TriCG: Relative residual: %8.1e\n", resid)
# [τI A] [x] = [b]
-# [ Aᵀ νI] [y] [c]
+# [ Aᴴ νI] [y] [c]
(τ, ν) = (1e-4, 1e2)
(x, y, stats) = tricg(A, b, c, τ=τ, ν=ν)
K = [τ*eye(m) A; A' ν*eye(n)]
@@ -60,7 +60,7 @@ resid = norm(r)
@printf("TriCG: Relative residual: %8.1e\n", resid)
# [M⁻¹ A ] [x] = [b]
-# [Aᵀ -N⁻¹] [y] [c]
+# [Aᴴ -N⁻¹] [y] [c]
(x, y, stats) = tricg(A, b, c, M=M, N=N, verbose=1)
K = [inv(M) A; A' -inv(N)]
H = BlockDiagonalOperator(M, N)
diff --git a/docs/src/examples/trimr.md b/docs/src/examples/trimr.md
index 2aa48be1e..adc4e82e5 100644
--- a/docs/src/examples/trimr.md
+++ b/docs/src/examples/trimr.md
@@ -14,7 +14,7 @@ m, n = size(A)
c = -b
# [D A] [x] = [b]
-# [Aᵀ 0] [y] [c]
+# [Aᴴ 0] [y] [c]
llt_D = cholesky(D)
opD⁻¹ = LinearOperator(Float64, 5, 5, true, true, (y, v) -> ldiv!(y, llt_D, v))
opH⁻¹ = BlockDiagonalOperator(opD⁻¹, eye(n))
@@ -34,7 +34,7 @@ N = diagm(0 => [5.0 * i for i = 1:n])
c = -b
# [I A] [x] = [b]
-# [Aᵀ -I] [y] [c]
+# [Aᴴ -I] [y] [c]
(x, y, stats) = trimr(A, b, c)
K = [eye(m) A; A' -eye(n)]
B = [b; c]
@@ -43,7 +43,7 @@ resid = norm(r)
@printf("TriMR: Relative residual: %8.1e\n", resid)
# [M A] [x] = [b]
-# [Aᵀ -N] [y] [c]
+# [Aᴴ -N] [y] [c]
ldlt_M = ldl(M)
ldlt_N = ldl(N)
opM⁻¹ = LinearOperator(Float64, size(M,1), size(M,2), true, true, (y, v) -> ldiv!(y, ldlt_M, v))
diff --git a/docs/src/factorization-free.md b/docs/src/factorization-free.md
index aa0f51f07..b97108b99 100644
--- a/docs/src/factorization-free.md
+++ b/docs/src/factorization-free.md
@@ -1,3 +1,32 @@
+```@raw html
+
+```
+
## [Factorization-free operators](@id factorization-free)
All methods are factorization-free, which means that you only need to provide operator-vector products.
@@ -10,8 +39,11 @@ Some methods only require `A * v` products, whereas other ones also require `A'
|:--------------------------------------:|:----------------------------------------:|
| CG, CR | CGLS, CRLS, CGNE, CRMR |
| SYMMLQ, CG-LANCZOS, MINRES, MINRES-QLP | LSLQ, LSQR, LSMR, LNLQ, CRAIG, CRAIGMR |
-| DIOM, FOM, DQGMRES, GMRES | BiLQ, QMR, BiLQR, USYMLQ, USYMQR, TriLQR |
-| CGS, BICGSTAB | TriCG, TriMR, USYMLQR |
+| DIOM, FOM, DQGMRES, GMRES, FGMRES | BiLQ, QMR, BiLQR, USYMLQ, USYMQR, TriLQR |
+| CGS, BICGSTAB | TriCG, TriMR |
+
+!!! info
+ GPMR is the only method that requires `A * v` and `B * w` products.
Preconditioners `M`, `N`, `C`, `D`, `E` or `F` can be also linear operators and must implement `mul!` or `ldiv!`.
diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 4c9887f24..378f4f5d3 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -1,6 +1,15 @@
-## GPU support
+# [GPU support](@id gpu)
-All solvers in Krylov.jl can be used with `CuArrays` and allow computations with Nvidia GPU. Problems stored in CPU format (`Matrix` and `Vector`) must first be converted to GPU format (`CuMatrix` and `CuVector`).
+Krylov methods are well suited for GPU computations because they only require matrix-vector products ($u \leftarrow Av$, $u \leftarrow A^{H}w$) and vector operations ($\|v\|$, $u^H v$, $v \leftarrow \alpha u + \beta v$), which are highly parallelizable.
+
+The implementations in Krylov.jl are generic so as to take advantage of the multiple dispatch and broadcast features of Julia.
+These features allow the implementations to be specialized automatically by the compiler for both CPU and GPU.
+Thus, Krylov.jl works with GPU backends that build on [GPUArrays.jl](https://github.com/JuliaGPU/GPUArrays.jl), such as [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl), [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl), [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) or [Metal.jl](https://github.com/JuliaGPU/Metal.jl).
+
+## Nvidia GPUs
+
+All solvers in Krylov.jl can be used with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) and allow computations on Nvidia GPUs.
+Problems stored in CPU format (`Matrix` and `Vector`) must first be converted to the related GPU format (`CuMatrix` and `CuVector`).
```julia
using CUDA, Krylov
@@ -13,11 +22,11 @@ b_cpu = rand(20)
A_gpu = CuMatrix(A_cpu)
b_gpu = CuVector(b_cpu)
-# Solve a square and dense system on GPU
+# Solve a square and dense system on an Nvidia GPU
x, stats = bilq(A_gpu, b_gpu)
```
-Sparse matrices have a specific storage on GPU (`CuSparseMatrixCSC` or `CuSparseMatrixCSR`):
+Sparse matrices have a specific storage on Nvidia GPUs (`CuSparseMatrixCSC`, `CuSparseMatrixCSR` or `CuSparseMatrixCOO`):
```julia
using CUDA, Krylov
@@ -31,7 +40,7 @@ b_cpu = rand(200)
A_gpu = CuSparseMatrixCSC(A_cpu)
b_gpu = CuVector(b_cpu)
-# Solve a rectangular and sparse system on GPU
+# Solve a rectangular and sparse system on an Nvidia GPU
x, stats = lsmr(A_gpu, b_gpu)
```
@@ -47,14 +56,14 @@ using SparseArrays, Krylov, LinearOperators
using CUDA, CUDA.CUSPARSE
# Transfer the linear system from the CPU to the GPU
-A_gpu = CuSparseMatrixCSC(A_cpu) # A = CuSparseMatrixCSR(A_cpu)
+A_gpu = CuSparseMatrixCSC(A_cpu) # A_gpu = CuSparseMatrixCSR(A_cpu)
b_gpu = CuVector(b_cpu)
-# LLᵀ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
+# LLᴴ ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
P = ic02(A_gpu, 'O')
# Solve Py = x
-function ldiv!(y, P, x)
+function ldiv_ic0!(y, P, x)
copyto!(y, x) # Variant for CuSparseMatrixCSR
sv2!('T', 'U', 'N', 1.0, P, y, 'O') # sv2!('N', 'L', 'N', 1.0, P, y, 'O')
sv2!('N', 'U', 'N', 1.0, P, y, 'O') # sv2!('T', 'L', 'N', 1.0, P, y, 'O')
@@ -65,12 +74,15 @@ end
n = length(b_gpu)
T = eltype(b_gpu)
symmetric = hermitian = true
-opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv!(y, P, x))
+opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ic0!(y, P, x))
# Solve a symmetric positive definite system with an incomplete Cholesky preconditioner on GPU
-(x, stats) = cg(A_gpu, b_gpu, M=opM)
+x, stats = cg(A_gpu, b_gpu, M=opM)
```
+!!! note
+ You need to replace `'T'` by `'C'` in `ldiv_ic0!` if `A_gpu` is a complex matrix.
+
### Example with a general square system
```julia
@@ -84,14 +96,14 @@ A_cpu = A_cpu[p,:]
b_cpu = b_cpu[p]
# Transfer the linear system from the CPU to the GPU
-A_gpu = CuSparseMatrixCSC(A_cpu) # A = CuSparseMatrixCSR(A_cpu)
+A_gpu = CuSparseMatrixCSC(A_cpu) # A_gpu = CuSparseMatrixCSR(A_cpu)
b_gpu = CuVector(b_cpu)
# LU ≈ A for CuSparseMatrixCSC or CuSparseMatrixCSR matrices
P = ilu02(A_gpu, 'O')
# Solve Py = x
-function ldiv!(y, P, x)
+function ldiv_ilu0!(y, P, x)
copyto!(y, x) # Variant for CuSparseMatrixCSR
sv2!('N', 'L', 'N', 1.0, P, y, 'O') # sv2!('N', 'L', 'U', 1.0, P, y, 'O')
sv2!('N', 'U', 'U', 1.0, P, y, 'O') # sv2!('N', 'U', 'N', 1.0, P, y, 'O')
@@ -102,8 +114,85 @@ end
n = length(b_gpu)
T = eltype(b_gpu)
symmetric = hermitian = false
-opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv!(y, P, x))
+opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_ilu0!(y, P, x))
# Solve an unsymmetric system with an incomplete LU preconditioner on GPU
-(x, stats) = bicgstab(A_gpu, b_gpu, M=opM)
+x, stats = bicgstab(A_gpu, b_gpu, M=opM)
+```
+
+## AMD GPUs
+
+All solvers in Krylov.jl can be used with [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) and allow computations on AMD GPUs.
+Problems stored in CPU format (`Matrix` and `Vector`) must first be converted to the related GPU format (`ROCMatrix` and `ROCVector`).
+
+```julia
+using Krylov, AMDGPU
+
+# CPU Arrays
+A_cpu = rand(ComplexF64, 20, 20)
+A_cpu = A_cpu + A_cpu'
+b_cpu = rand(ComplexF64, 20)
+
+A_gpu = ROCMatrix(A_cpu)
+b_gpu = ROCVector(b_cpu)
+
+# Solve a dense Hermitian system on an AMD GPU
+x, stats = minres(A_gpu, b_gpu)
+```
+
+!!! info
+ The library `rocSPARSE` is not yet interfaced in AMDGPU.jl, so only dense linear systems are supported.
+
+## Intel GPUs
+
+All solvers in Krylov.jl can be used with [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) and allow computations on Intel GPUs.
+Problems stored in CPU format (`Matrix` and `Vector`) must first be converted to the related GPU format (`oneMatrix` and `oneVector`).
+
+```julia
+using Krylov, oneAPI
+
+T = Float32 # oneAPI.jl also works with ComplexF32
+m = 20
+n = 10
+
+# CPU Arrays
+A_cpu = rand(T, m, n)
+b_cpu = rand(T, m)
+
+# GPU Arrays
+A_gpu = oneMatrix(A_cpu)
+b_gpu = oneVector(b_cpu)
+
+# Solve a dense least-squares problem on an Intel GPU
+x, stats = lsqr(A_gpu, b_gpu)
```
+
+!!! warning
+ The library `oneMKL` is not yet interfaced in oneAPI.jl, so all BLAS routines (`dot`, `norm`, `mul!`, etc.) dispatch to generic fallbacks.
+
+## Apple M1 GPUs
+
+All solvers in Krylov.jl can be used with [Metal.jl](https://github.com/JuliaGPU/Metal.jl) and allow computations on Apple M1 GPUs.
+Problems stored in CPU format (`Matrix` and `Vector`) must first be converted to the related GPU format (`MtlMatrix` and `MtlVector`).
+
+```julia
+using Krylov, Metal
+
+T = Float32 # Metal.jl also works with ComplexF32
+n = 10
+m = 20
+
+# CPU Arrays
+A_cpu = rand(T, n, m)
+b_cpu = rand(T, n)
+
+# GPU Arrays
+A_gpu = MtlMatrix(A_cpu)
+b_gpu = MtlVector(b_cpu)
+
+# Solve a dense least-norm problem on an Apple M1 GPU
+x, stats = craig(A_gpu, b_gpu)
+```
+
+!!! warning
+ Metal.jl is under heavy development and is considered experimental for now.
diff --git a/docs/src/graphics/arnoldi.png b/docs/src/graphics/arnoldi.png
new file mode 100644
index 000000000..9ef8bd3a3
Binary files /dev/null and b/docs/src/graphics/arnoldi.png differ
diff --git a/docs/src/graphics/golub_kahan.png b/docs/src/graphics/golub_kahan.png
new file mode 100644
index 000000000..32fc3d7b8
Binary files /dev/null and b/docs/src/graphics/golub_kahan.png differ
diff --git a/docs/src/graphics/hermitian_lanczos.png b/docs/src/graphics/hermitian_lanczos.png
new file mode 100644
index 000000000..c70082e72
Binary files /dev/null and b/docs/src/graphics/hermitian_lanczos.png differ
diff --git a/docs/src/graphics/montoison_orban.png b/docs/src/graphics/montoison_orban.png
new file mode 100644
index 000000000..5a14eda04
Binary files /dev/null and b/docs/src/graphics/montoison_orban.png differ
diff --git a/docs/src/graphics/nonhermitian_lanczos.png b/docs/src/graphics/nonhermitian_lanczos.png
new file mode 100644
index 000000000..b8d83961c
Binary files /dev/null and b/docs/src/graphics/nonhermitian_lanczos.png differ
diff --git a/docs/src/graphics/saunders_simon_yip.png b/docs/src/graphics/saunders_simon_yip.png
new file mode 100644
index 000000000..c3acfd181
Binary files /dev/null and b/docs/src/graphics/saunders_simon_yip.png differ
diff --git a/docs/src/index.md b/docs/src/index.md
index ce657436d..1a18e2315 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -46,26 +46,26 @@ Overdetermined systems are less common but also occur.
4 - Adjoint systems
```math
- Ax = b \quad \text{and} \quad A^T y = c
+ Ax = b \quad \text{and} \quad A^H y = c
```
where **_A_** can have any shape.
-5 - Saddle-point and symmetric quasi-definite (SQD) systems
+5 - Saddle-point and Hermitian quasi-definite systems
```math
- \begin{bmatrix} M & \phantom{-}A \\ A^T & -N \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \left(\begin{bmatrix} b \\ 0 \end{bmatrix},\begin{bmatrix} 0 \\ c \end{bmatrix},\begin{bmatrix} b \\ c \end{bmatrix}\right)
+ \begin{bmatrix} M & \phantom{-}A \\ A^H & -N \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \left(\begin{bmatrix} b \\ 0 \end{bmatrix},\begin{bmatrix} 0 \\ c \end{bmatrix},\begin{bmatrix} b \\ c \end{bmatrix}\right)
```
where **_A_** can have any shape.
-6 - Generalized saddle-point and unsymmetric partitioned systems
+6 - Generalized saddle-point and non-Hermitian partitioned systems
```math
\begin{bmatrix} M & A \\ B & N \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} b \\ c \end{bmatrix}
```
-where **_A_** can have any shape and **_B_** has the shape of **_Aᵀ_**.
+where **_A_** can have any shape and **_B_** has the shape of **_Aᴴ_**.
**_A_**, **_B_**, **_b_** and **_c_** must be all nonzero.
Krylov solvers are particularly appropriate in situations where such problems must be solved but a factorization is not possible, either because:
@@ -92,3 +92,10 @@ julia> ]
pkg> add Krylov
pkg> test Krylov
```
+
+# Bug reports and discussions
+
+If you think you found a bug, feel free to open an [issue](https://github.com/JuliaSmoothOptimizers/Krylov.jl/issues).
+Focused suggestions and requests can also be opened as issues. Please start an issue or a discussion on the topic before opening a pull request.
+
+If you want to ask a question not suited for a bug report, feel free to start a discussion [here](https://github.com/JuliaSmoothOptimizers/Organization/discussions). This forum is for general discussion about this repository and the [JuliaSmoothOptimizers](https://github.com/JuliaSmoothOptimizers) organization, so questions about any of our packages are welcome.
diff --git a/docs/src/inplace.md b/docs/src/inplace.md
index 71a4e25de..9950575fe 100644
--- a/docs/src/inplace.md
+++ b/docs/src/inplace.md
@@ -15,7 +15,7 @@ Given an operator `A` and a right-hand side `b`, you can create a `KrylovSolver`
For example, use `S = Vector{Float64}` if you want to solve linear systems in double precision on the CPU and `S = CuVector{Float32}` if you want to solve linear systems in single precision on an Nvidia GPU.
!!! note
- `DiomSolver`, `FomSolver`, `DqgmresSolver`, `GmresSolver`, `GpmrSolver` and `CgLanczosShiftSolver` require an additional argument (`memory` or `nshifts`).
+ `DiomSolver`, `FomSolver`, `DqgmresSolver`, `GmresSolver`, `FgmresSolver`, `GpmrSolver` and `CgLanczosShiftSolver` require an additional argument (`memory` or `nshifts`).
The workspace is always the first argument of the in-place methods:
diff --git a/docs/src/preconditioners.md b/docs/src/preconditioners.md
new file mode 100644
index 000000000..fd203dddb
--- /dev/null
+++ b/docs/src/preconditioners.md
@@ -0,0 +1,237 @@
+# [Preconditioners](@id preconditioners)
+
+The solvers in Krylov.jl support preconditioners, i.e., transformations that modify a linear system $Ax = b$ into an equivalent form that may yield faster convergence in finite-precision arithmetic.
+Preconditioning can be used, for instance, to reduce the condition number of the problem or to cluster its eigenvalues or singular values.
+
+The design of preconditioners is highly dependent on the origin of the problem and most preconditioners need to take application-dependent information and structure into account.
+Specialized preconditioners generally outperform generic preconditioners such as incomplete factorizations.
+
+The construction of a preconditioner necessitates trade-offs because we need to apply it at least once per iteration within a Krylov method.
+Hence, a preconditioner must be constructed such that it is cheap to apply, while also capturing the characteristics of the original system in some sense.
+
+There exist three variants of preconditioning:
+
+| Left preconditioning | Two-sided preconditioning | Right preconditioning |
+|:----------------------------------:|:----------------------------------------------------------------------:|:--------------------------------------------:|
+| $P_{\ell}^{-1}Ax = P_{\ell}^{-1}b$ | $P_{\ell}^{-1}AP_r^{-1}y = P_{\ell}^{-1}b~~\text{with}~~x = P_r^{-1}y$ | $AP_r^{-1}y = b~~\text{with}~~x = P_r^{-1}y$ |
+
+where $P_{\ell}$ and $P_r$ are square and nonsingular.
+
+In Krylov.jl, we call $P_{\ell}^{-1}$ and $P_r^{-1}$ the preconditioners and we assume that we can apply them with the operation $y \leftarrow P^{-1} * x$.
+It is also common to call $P_{\ell}$ and $P_r$ the preconditioners if the equivalent operation $y \leftarrow P~\backslash~x$ is available.
+Krylov.jl supports both approaches thanks to the argument `ldiv` of the Krylov solvers.
+
+## How to use preconditioners in Krylov.jl?
+
+!!! info
+ - A preconditioner only needs to support the operation `mul!(y, P⁻¹, x)` when `ldiv=false`, or `ldiv!(y, P, x)` when `ldiv=true`, to be used in Krylov.jl.
+ - The default value of a preconditioner in Krylov.jl is the identity operator `I`.
+
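+For example, here is a minimal sketch of both conventions; the Jacobi-style diagonal preconditioner and the LU factorization below are illustrative choices only:
+
+```julia
+using LinearAlgebra, SparseArrays, Krylov
+
+A = sprandn(100, 100, 0.05) + 10I
+b = rand(100)
+
+# ldiv=false (default): M represents P⁻¹ and is applied with mul!
+P⁻¹ = Diagonal(1 ./ abs.(Vector(diag(A))))
+x, stats = gmres(A, b, M=P⁻¹)
+
+# ldiv=true: M represents P and is applied with ldiv!
+P = lu(A)  # any factorization that supports ldiv!
+x, stats = gmres(A, b, M=P, ldiv=true)
+```
+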
+### Square non-Hermitian linear systems
+
+Methods concerned: [`CGS`](@ref cgs), [`BiCGSTAB`](@ref bicgstab), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres), [`FGMRES`](@ref fgmres), [`DIOM`](@ref diom) and [`FOM`](@ref fom).
+
+A Krylov method dedicated to non-Hermitian linear systems supports all three variants of preconditioning.
+
+| Preconditioners | $P_{\ell}^{-1}$ | $P_{\ell}$ | $P_r^{-1}$ | $P_r$ |
+|:---------------:|:---------------------:|:--------------------:|:---------------------:|:--------------------:|
+| Arguments | `M` with `ldiv=false` | `M` with `ldiv=true` | `N` with `ldiv=false` | `N` with `ldiv=true` |
+
+### Hermitian linear systems
+
+Methods concerned: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CG-LANCZOS`](@ref cg_lanczos), [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift), [`CR`](@ref cr), [`MINRES`](@ref minres) and [`MINRES-QLP`](@ref minres_qlp).
+
+When $A$ is Hermitian, we can only use centered preconditioning $L^{-1}AL^{-H}y = L^{-1}b$ with $x = L^{-H}y$.
+Centered preconditioning is a special case of two-sided preconditioning with $P_{\ell} = L = P_r^H$ that maintains hermiticity.
+However, there is no need to specify $L$ and one may specify $P_c = LL^H$ or its inverse directly.
+
+| Preconditioners | $P_c^{-1}$ | $P_c$ |
+|:---------------:|:-------------------------:|:--------------------:|
+| Arguments | `M` with `ldiv=false` | `M` with `ldiv=true` |
+
+!!! warning
+ The preconditioner `M` must be Hermitian and positive definite.
+
+### Linear least-squares problems
+
+Methods concerned: [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`LSLQ`](@ref lslq), [`LSQR`](@ref lsqr) and [`LSMR`](@ref lsmr).
+
+| Formulation | Without preconditioning | With preconditioning |
+|:---------------------:|:------------------------------------:|:-------------------------------------------:|
+| least-squares problem | $\min \tfrac{1}{2} \\|b - Ax\\|^2_2$ | $\min \tfrac{1}{2} \\|b - Ax\\|^2_{E^{-1}}$ |
+| Normal equation | $A^HAx = A^Hb$ | $A^HE^{-1}Ax = A^HE^{-1}b$ |
+| Augmented system | $\begin{bmatrix} I & A \\ A^H & 0 \end{bmatrix} \begin{bmatrix} r \\ x \end{bmatrix} = \begin{bmatrix} b \\ 0 \end{bmatrix}$ | $\begin{bmatrix} E & A \\ A^H & 0 \end{bmatrix} \begin{bmatrix} r \\ x \end{bmatrix} = \begin{bmatrix} b \\ 0 \end{bmatrix}$ |
+
+[`LSLQ`](@ref lslq), [`LSQR`](@ref lsqr) and [`LSMR`](@ref lsmr) also handle regularized least-squares problems.
+
+| Formulation | Without preconditioning | With preconditioning |
+|:---------------------:|:-------------------------------------------------------------------------:|:--------------------------------------------------------------------------------:|
+| least-squares problem | $\min \tfrac{1}{2} \\|b - Ax\\|^2_2 + \tfrac{1}{2} \lambda^2 \\|x\\|^2_2$ | $\min \tfrac{1}{2} \\|b - Ax\\|^2_{E^{-1}} + \tfrac{1}{2} \lambda^2 \\|x\\|^2_F$ |
+| Normal equation | $(A^HA + \lambda^2 I)x = A^Hb$ | $(A^HE^{-1}A + \lambda^2 F)x = A^HE^{-1}b$ |
+| Augmented system | $\begin{bmatrix} I & A \\ A^H & -\lambda^2 I \end{bmatrix} \begin{bmatrix} r \\ x \end{bmatrix} = \begin{bmatrix} b \\ 0 \end{bmatrix}$ | $\begin{bmatrix} E & A \\ A^H & -\lambda^2 F \end{bmatrix} \begin{bmatrix} r \\ x \end{bmatrix} = \begin{bmatrix} b \\ 0 \end{bmatrix}$ |
+
+| Preconditioners | $E^{-1}$ | $E$ | $F^{-1}$ | $F$ |
+|:---------------:|:-----------------------:|:--------------------:|:-----------------------:|:--------------------:|
+| Arguments | `M` with `ldiv=false` | `M` with `ldiv=true` | `N` with `ldiv=false` | `N` with `ldiv=true` |
+
+!!! warning
+ The preconditioners `M` and `N` must be Hermitian and positive definite.
+
+### Linear least-norm problems
+
+Methods concerned: [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig) and [`CRAIGMR`](@ref craigmr).
+
+| Formulation | Without preconditioning | With preconditioning |
+|:--------------------:|:----------------------------------------------------:|:----------------------------------------------------:|
+| minimum-norm problem | $\min \tfrac{1}{2} \\|x\\|^2_2~~\text{s.t.}~~Ax = b$ | $\min \tfrac{1}{2} \\|x\\|^2_F~~\text{s.t.}~~Ax = b$ |
+| Normal equation | $AA^Hy = b~~\text{with}~~x = A^Hy$ | $AF^{-1}A^Hy = b~~\text{with}~~x = F^{-1}A^Hy$ |
+| Augmented system | $\begin{bmatrix} -I & A^H \\ \phantom{-}A & 0 \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} 0 \\ b \end{bmatrix}$ | $\begin{bmatrix} -F & A^H \\ \phantom{-}A & 0 \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} 0 \\ b \end{bmatrix}$ |
+
+[`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig) and [`CRAIGMR`](@ref craigmr) also handle penalized minimum-norm problems.
+
+| Formulation | Without preconditioning | With preconditioning |
+|:--------------------:|:---------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:|
+| minimum-norm problem | $\min \tfrac{1}{2} \\|x\\|^2_2 + \tfrac{1}{2} \\|y\\|^2_2~~\text{s.t.}~~Ax + \lambda^2 y = b$ | $\min \tfrac{1}{2} \\|x\\|^2_F + \tfrac{1}{2} \\|y\\|^2_E~~\text{s.t.}~~Ax + \lambda^2 Ey = b$ |
+| Normal equation | $(AA^H + \lambda^2 I)y = b~~\text{with}~~x = A^Hy$ | $(AF^{-1}A^H + \lambda^2 E)y = b~~\text{with}~~x = F^{-1}A^Hy$ |
+| Augmented system | $\begin{bmatrix} -I & A^H \\ \phantom{-}A & \lambda^2 I \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} 0 \\ b \end{bmatrix}$ | $\begin{bmatrix} -F & A^H \\ \phantom{-}A & \lambda^2 E \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} 0 \\ b \end{bmatrix}$ |
+
+| Preconditioners | $E^{-1}$ | $E$ | $F^{-1}$ | $F$ |
+|:---------------:|:-----------------------:|:--------------------:|:-----------------------:|:--------------------:|
+| Arguments | `M` with `ldiv=false` | `M` with `ldiv=true` | `N` with `ldiv=false` | `N` with `ldiv=true` |
+
+!!! warning
+ The preconditioners `M` and `N` must be Hermitian and positive definite.
+
+### Saddle-point and Hermitian quasi-definite systems
+
+[`TriCG`](@ref tricg) and [`TriMR`](@ref trimr) can take advantage of the structure of Hermitian systems $Kz = d$ with the 2x2 block structure
+```math
+ \begin{bmatrix} \tau E & \phantom{-}A \\ A^H & \nu F \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} b \\ c \end{bmatrix}.
+```
+| Preconditioners | $E^{-1}$ | $E$ | $F^{-1}$ | $F$ |
+|:---------------:|:---------------------:|:--------------------:|:---------------------:|:--------------------:|
+| Arguments | `M` with `ldiv=false` | `M` with `ldiv=true` | `N` with `ldiv=false` | `N` with `ldiv=true` |
+
+!!! warning
+ The preconditioners `M` and `N` must be Hermitian and positive definite.
+
+### Generalized saddle-point and non-Hermitian partitioned systems
+
+[`GPMR`](@ref gpmr) can take advantage of the structure of general square systems $Kz = d$ with the 2x2 block structure
+```math
+ \begin{bmatrix} \lambda M & A \\ B & \mu N \end{bmatrix} \begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} b \\ c \end{bmatrix}.
+```
+| Relations | $CE = M^{-1}$ | $EC = M$ | $DF = N^{-1}$ | $FD = N$ |
+|:---------------:|:-----------------------------:|:----------------------------:|:-----------------------------:|:----------------------------:|
+| Arguments | `C` and `E` with `ldiv=false` | `C` and `E` with `ldiv=true` | `D` and `F` with `ldiv=false` | `D` and `F` with `ldiv=true` |
+
+!!! note
+ Our implementations of [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr), [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr) and [`TriLQR`](@ref trilqr) don't support preconditioning.
+
+## Packages that provide preconditioners
+
+- [IncompleteLU.jl](https://github.com/haampie/IncompleteLU.jl) implements the left-looking and Crout versions of ILU decompositions.
+- [ILUZero.jl](https://github.com/mcovalt/ILUZero.jl) is a Julia implementation of incomplete LU factorization with zero level of fill-in.
+- [LimitedLDLFactorizations.jl](https://github.com/JuliaSmoothOptimizers/LimitedLDLFactorizations.jl) provides a limited-memory LDLᵀ factorization of symmetric matrices.
+- [AlgebraicMultigrid.jl](https://github.com/JuliaLinearAlgebra/AlgebraicMultigrid.jl) provides two algebraic multigrid (AMG) preconditioners.
+- [RandomizedPreconditioners.jl](https://github.com/tjdiamandis/RandomizedPreconditioners.jl) uses randomized numerical linear algebra to construct approximate inverses of matrices.
+- [BasicLU.jl](https://github.com/JuliaSmoothOptimizers/BasicLU.jl) uses a sparse LU factorization to compute a maximum volume basis that can be used as a preconditioner for least-norm and least-squares problems.
+
+## Examples
+
+```julia
+using LinearAlgebra, Krylov
+n, m = size(A)
+d = [A[i,i] ≠ 0 ? 1 / abs(A[i,i]) : 1 for i=1:n] # Jacobi preconditioner
+P⁻¹ = diagm(d)
+x, stats = symmlq(A, b, M=P⁻¹)
+```
+
+```julia
+using LinearAlgebra, Krylov
+n, m = size(A)
+d = [1 / norm(A[:,i]) for i=1:m] # diagonal preconditioner
+P⁻¹ = diagm(d)
+x, stats = minres(A, b, M=P⁻¹)
+```
+
+```julia
+using IncompleteLU, Krylov
+Pℓ = ilu(A)
+x, stats = gmres(A, b, M=Pℓ, ldiv=true) # left preconditioning
+```
+
+```julia
+using LimitedLDLFactorizations, Krylov
+P = lldl(A)
+P.D .= abs.(P.D)
+x, stats = cg(A, b, M=P, ldiv=true) # centered preconditioning
+```
+
+```julia
+using ILUZero, Krylov
+Pᵣ = ilu0(A)
+x, stats = bicgstab(A, b, N=Pᵣ, ldiv=true) # right preconditioning
+```
+
+```julia
+using LDLFactorizations, Krylov
+
+M = ldl(E)
+N = ldl(F)
+
+# [E A] [x] = [b]
+# [Aᴴ -F] [y] [c]
+x, y, stats = tricg(A, b, c, M=M, N=N, ldiv=true)
+```
+
+```julia
+using SuiteSparse, Krylov
+import LinearAlgebra.ldiv!
+
+M = cholesky(E)
+
+# ldiv! is not implemented for the sparse Cholesky factorization (SuiteSparse.CHOLMOD)
+ldiv!(y::Vector{T}, F::SuiteSparse.CHOLMOD.Factor{T}, x::Vector{T}) where T = (y .= F \ x)
+
+# [E A] [x] = [b]
+# [Aᴴ 0] [y] [c]
+x, y, stats = trimr(A, b, c, M=M, sp=true, ldiv=true)
+```
+
+```julia
+using LinearAlgebra, Krylov
+
+C = lu(M)
+
+# [M A] [x] = [b]
+# [B 0] [y] [c]
+x, y, stats = gpmr(A, B, b, c, C=C, gsp=true, ldiv=true)
+```
+
+```julia
+import BasicLU
+using SparseArrays, LinearOperators, Krylov
+
+# Least-squares problem
+m, n = size(A)
+Aᴴ = sparse(A')
+basis, B = BasicLU.maxvolbasis(Aᴴ)
+opA = LinearOperator(A)
+B⁻ᴴ = LinearOperator(Float64, n, n, false, false, (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'T')),
+ (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'N')),
+ (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'N')))
+
+d, stats = lsmr(opA * B⁻ᴴ, b) # min ‖AB⁻ᴴd - b‖₂
+x = B⁻ᴴ * d # recover the solution of min ‖Ax - b‖₂
+
+# Least-norm problem
+m, n = size(A)
+basis, B = BasicLU.maxvolbasis(A)
+opA = LinearOperator(A)
+B⁻¹ = LinearOperator(Float64, m, m, false, false, (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'N')),
+ (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'T')),
+ (y, v) -> (y .= v ; BasicLU.solve!(B, y, 'T')))
+
+x, y, stats = craigmr(B⁻¹ * opA, B⁻¹ * b) # min ‖x‖₂ s.t. B⁻¹Ax = B⁻¹b
+```
diff --git a/docs/src/processes.md b/docs/src/processes.md
new file mode 100644
index 000000000..e9d4066d2
--- /dev/null
+++ b/docs/src/processes.md
@@ -0,0 +1,334 @@
+```@raw html
+
+```
+
+# [Krylov processes](@id krylov-processes)
+
+Krylov processes are the foundation of Krylov methods: they generate bases of Krylov subspaces.
+Depending on the Krylov subspaces they generate, Krylov processes are more or less specialized for a given class of linear problems.
+The following table summarizes the most relevant processes for each linear problem.
+
+| Linear problems | Processes |
+|:--------------------------------------------------------------:|:---------------------------------:|
+| Hermitian linear systems | Hermitian Lanczos |
+| Square non-Hermitian linear systems | Non-Hermitian Lanczos -- Arnoldi |
+| Least-squares problems | Golub-Kahan -- Saunders-Simon-Yip |
+| Least-norm problems | Golub-Kahan -- Saunders-Simon-Yip |
+| Saddle-point and Hermitian quasi-definite systems | Golub-Kahan -- Saunders-Simon-Yip |
+| Generalized saddle-point and non-Hermitian partitioned systems | Montoison-Orban |
+
+### Notation
+
+For a matrix $A$, $A^H$ denotes the conjugate transpose of $A$.
+It coincides with $A^T$, the transpose of $A$, for real matrices.
+Define $V_k := \begin{bmatrix} v_1 & \ldots & v_k \end{bmatrix} \enspace$ and $\enspace U_k := \begin{bmatrix} u_1 & \ldots & u_k \end{bmatrix}$.
+
+For a matrix $C \in \mathbb{C}^{n \times n}$ and a vector $t \in \mathbb{C}^{n}$, the $k$-th Krylov subspace generated by $C$ and $t$ is
+```math
+\mathcal{K}_k(C, t) :=
+\left\{\sum_{i=0}^{k-1} \omega_i C^i t \, \middle \vert \, \omega_i \in \mathbb{C},~0 \le i \le k-1 \right\}.
+```
+
+For matrices $C \in \mathbb{C}^{n \times n} \enspace$ and $\enspace T \in \mathbb{C}^{n \times p}$, the $k$-th block Krylov subspace generated by $C$ and $T$ is
+```math
+\mathcal{K}_k^{\square}(C, T) :=
+\left\{\sum_{i=0}^{k-1} C^i T \, \Omega_i \, \middle \vert \, \Omega_i \in \mathbb{C}^{p \times p},~0 \le i \le k-1 \right\}.
+```
+
+## Hermitian Lanczos
+
+![hermitian_lanczos](./graphics/hermitian_lanczos.png)
+
+After $k$ iterations of the Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+ A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
+ V_k^H V_k &= I_k,
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$,
+```math
+T_k =
+\begin{bmatrix}
+ \alpha_1 & \beta_2 & & \\
+ \beta_2 & \alpha_2 & \ddots & \\
+ & \ddots & \ddots & \beta_k \\
+ & & \beta_k & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+ T_{k} \\
+ \beta_{k+1} e_{k}^T
+\end{bmatrix}.
+```
+Note that $T_{k+1,k}$ is a real tridiagonal matrix even if $A$ is a complex matrix.
+
+The function [`hermitian_lanczos`](@ref hermitian_lanczos) returns $V_{k+1}$ and $T_{k+1,k}$.
+
+Related methods: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CR`](@ref cr), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp), [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`CG-LANCZOS`](@ref cg_lanczos) and [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift).
+
+```@docs
+hermitian_lanczos
+```
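+
+For illustration, here is a minimal sketch of a call to the process; we assume the signature `hermitian_lanczos(A, b, k)`, where `k` is the number of iterations:
+
+```julia
+using Krylov, LinearAlgebra
+
+n = 100
+A = rand(ComplexF64, n, n)
+A = A + A'  # Hermitian matrix
+b = rand(ComplexF64, n)
+
+V, T = hermitian_lanczos(A, b, 20)  # V is n × 21, T is 21 × 20 tridiagonal
+```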
+
+## Non-Hermitian Lanczos
+
+![nonhermitian_lanczos](./graphics/nonhermitian_lanczos.png)
+
+After $k$ iterations of the non-Hermitian Lanczos process (also named the Lanczos biorthogonalization process), the situation may be summarized as
+```math
+\begin{align*}
+ A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
+ A^H U_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
+ V_k^H U_k &= U_k^H V_k = I_k,
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A,b)$ and $\mathcal{K}_k (A^H,c)$, respectively,
+```math
+T_k =
+\begin{bmatrix}
+ \alpha_1 & \gamma_2 & & \\
+ \beta_2 & \alpha_2 & \ddots & \\
+ & \ddots & \ddots & \gamma_k \\
+ & & \beta_k & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+ T_{k} \\
+ \beta_{k+1} e_{k}^T
+\end{bmatrix}
+, \qquad
+T_{k,k+1} =
+\begin{bmatrix}
+ T_{k} & \gamma_{k+1} e_k
+\end{bmatrix}.
+```
+
+The function [`nonhermitian_lanczos`](@ref nonhermitian_lanczos) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$.
+
+Related methods: [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr), [`CGS`](@ref cgs) and [`BICGSTAB`](@ref bicgstab).
+
+!!! note
+ The scaling factors used in our implementation are $\beta_k = |u_k^H v_k|^{\tfrac{1}{2}}$ and $\gamma_k = (u_k^H v_k) / \beta_k$.
+ With these scaling factors, the non-Hermitian Lanczos process coincides with the Hermitian Lanczos process when $A = A^H$ and $b = c$.
+
+```@docs
+nonhermitian_lanczos
+```
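+
+A minimal sketch, assuming the signature `nonhermitian_lanczos(A, b, c, k)` with `k` the number of iterations:
+
+```julia
+using Krylov
+
+n = 100
+A = rand(n, n)  # square non-Hermitian matrix
+b = rand(n)
+c = rand(n)
+
+V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, 20)  # V and U are n × 21
+```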
+
+## Arnoldi
+
+![arnoldi](./graphics/arnoldi.png)
+
+After $k$ iterations of the Arnoldi process, the situation may be summarized as
+```math
+\begin{align*}
+ A V_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\
+ V_k^H V_k &= I_k,
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$,
+```math
+H_k =
+\begin{bmatrix}
+ h_{1,1}~ & h_{1,2}~ & \ldots & h_{1,k} \\
+ h_{2,1}~ & \ddots~ & \ddots & \vdots \\
+ & \ddots~ & \ddots & h_{k-1,k} \\
+ & & h_{k,k-1} & h_{k,k}
+\end{bmatrix}
+, \qquad
+H_{k+1,k} =
+\begin{bmatrix}
+ H_{k} \\
+ h_{k+1,k} e_{k}^T
+\end{bmatrix}.
+```
+
+The function [`arnoldi`](@ref arnoldi) returns $V_{k+1}$ and $H_{k+1,k}$.
+
+Related methods: [`DIOM`](@ref diom), [`FOM`](@ref fom), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres) and [`FGMRES`](@ref fgmres).
+
+!!! note
+ The Arnoldi process coincides with the Hermitian Lanczos process when $A$ is Hermitian.
+
+```@docs
+arnoldi
+```
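+
+A minimal sketch, assuming the signature `arnoldi(A, b, k)` with `k` the number of iterations:
+
+```julia
+using Krylov
+
+n = 100
+A = rand(n, n)  # square matrix
+b = rand(n)
+
+V, H = arnoldi(A, b, 20)  # V is n × 21, H is 21 × 20 upper Hessenberg
+```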
+
+## Golub-Kahan
+
+![golub_kahan](./graphics/golub_kahan.png)
+
+After $k$ iterations of the Golub-Kahan bidiagonalization process, the situation may be summarized as
+```math
+\begin{align*}
+ A V_k &= U_{k+1} B_k, \\
+ A^H U_{k+1} &= V_k B_k^H + \alpha_{k+1} v_{k+1} e_{k+1}^T = V_{k+1} L_{k+1}^H, \\
+ V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A^HA,A^Hb)$ and $\mathcal{K}_k (AA^H,b)$, respectively,
+```math
+L_k =
+\begin{bmatrix}
+ \alpha_1 & & & \\
+ \beta_2 & \alpha_2 & & \\
+ & \ddots & \ddots & \\
+ & & \beta_k & \alpha_k
+\end{bmatrix}
+, \qquad
+B_k =
+\begin{bmatrix}
+ \alpha_1 & & & \\
+ \beta_2 & \alpha_2 & & \\
+ & \ddots & \ddots & \\
+ & & \beta_k & \alpha_k \\
+ & & & \beta_{k+1} \\
+\end{bmatrix}
+=
+\begin{bmatrix}
+ L_{k} \\
+ \beta_{k+1} e_{k}^T
+\end{bmatrix}.
+```
+Note that $L_k$ is a real bidiagonal matrix even if $A$ is a complex matrix.
+
+The function [`golub_kahan`](@ref golub_kahan) returns $V_{k+1}$, $U_{k+1}$ and $L_{k+1}$.
+
+Related methods: [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig), [`CRAIGMR`](@ref craigmr), [`LSLQ`](@ref lslq), [`LSQR`](@ref lsqr) and [`LSMR`](@ref lsmr).
+
+!!! note
+ The Golub-Kahan process coincides with the Hermitian Lanczos process applied to the normal equations $A^HA x = A^Hb$ and $AA^H x = b$.
+ It is also related to the Hermitian Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial vector $\begin{bmatrix} b \\ 0 \end{bmatrix}$.
+
+```@docs
+golub_kahan
+```
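+
+A minimal sketch, assuming the signature `golub_kahan(A, b, k)` with `k` the number of iterations:
+
+```julia
+using Krylov
+
+m, n = 150, 100
+A = rand(m, n)  # rectangular matrix
+b = rand(m)
+
+V, U, L = golub_kahan(A, b, 20)  # V is n × 21, U is m × 21, L is 21 × 21 lower bidiagonal
+```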
+
+## Saunders-Simon-Yip
+
+![saunders_simon_yip](./graphics/saunders_simon_yip.png)
+
+After $k$ iterations of the Saunders-Simon-Yip process (also named the orthogonal tridiagonalization process), the situation may be summarized as
+```math
+\begin{align*}
+ A U_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
+ A^H V_k &= U_k T_k^H + \gamma_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
+ V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$,
+```math
+T_k =
+\begin{bmatrix}
+ \alpha_1 & \gamma_2 & & \\
+ \beta_2 & \alpha_2 & \ddots & \\
+ & \ddots & \ddots & \gamma_k \\
+ & & \beta_k & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+ T_{k} \\
+ \beta_{k+1} e_{k}^T
+\end{bmatrix}
+, \qquad
+T_{k,k+1} =
+\begin{bmatrix}
+ T_{k} & \gamma_{k+1} e_{k}
+\end{bmatrix}.
+```
+
+The function [`saunders_simon_yip`](@ref saunders_simon_yip) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$.
+
+Related methods: [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr), [`TriLQR`](@ref trilqr), [`TriCG`](@ref tricg) and [`TriMR`](@ref trimr).
+
+```@docs
+saunders_simon_yip
+```
+
+!!! note
+ The Saunders-Simon-Yip process is equivalent to the block-Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial matrix $\begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}$.
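+
+A minimal sketch, assuming the signature `saunders_simon_yip(A, b, c, k)` with `k` the number of iterations:
+
+```julia
+using Krylov
+
+m, n = 150, 100
+A = rand(m, n)
+b = rand(m)
+c = rand(n)
+
+V, T, U, Tᴴ = saunders_simon_yip(A, b, c, 20)  # V is m × 21, U is n × 21
+```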
+
+## Montoison-Orban
+
+![montoison_orban](./graphics/montoison_orban.png)
+
+After $k$ iterations of the Montoison-Orban process (also named the orthogonal Hessenberg reduction process), the situation may be summarized as
+```math
+\begin{align*}
+ A U_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\
+ B V_k &= U_k F_k + f_{k+1,k} u_{k+1} e_k^T = U_{k+1} F_{k+1,k}, \\
+ V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$,
+```math
+H_k =
+\begin{bmatrix}
+ h_{1,1}~ & h_{1,2}~ & \ldots & h_{1,k} \\
+ h_{2,1}~ & \ddots~ & \ddots & \vdots \\
+ & \ddots~ & \ddots & h_{k-1,k} \\
+ & & h_{k,k-1} & h_{k,k}
+\end{bmatrix}
+, \qquad
+F_k =
+\begin{bmatrix}
+ f_{1,1}~ & f_{1,2}~ & \ldots & f_{1,k} \\
+ f_{2,1}~ & \ddots~ & \ddots & \vdots \\
+ & \ddots~ & \ddots & f_{k-1,k} \\
+ & & f_{k,k-1} & f_{k,k}
+\end{bmatrix},
+```
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+ H_{k} \\
+ h_{k+1,k} e_{k}^T
+\end{bmatrix}
+, \qquad
+F_{k+1,k} =
+\begin{bmatrix}
+ F_{k} \\
+ f_{k+1,k} e_{k}^T
+\end{bmatrix}.
+```
+
+The function [`montoison_orban`](@ref montoison_orban) returns $V_{k+1}$, $H_{k+1,k}$, $U_{k+1}$ and $F_{k+1,k}$.
+
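+A minimal usage sketch (the signature `montoison_orban(A, B, b, c, k)` and the dimensions below are assumptions based on the relations above); setting `B = A'` would reproduce the Saunders-Simon-Yip process, as stated in the note below:
+
+```julia
+using Krylov, LinearAlgebra
+
+m, n, k = 8, 6, 4
+A = rand(ComplexF64, m, n)
+B = rand(ComplexF64, n, m)
+b = rand(ComplexF64, m)
+c = rand(ComplexF64, n)
+
+V, H, U, F = montoison_orban(A, B, b, c, k)
+norm(A * U[:, 1:k] - V * H)  # ≈ 0, first relation above
+norm(B * V[:, 1:k] - U * F)  # ≈ 0, second relation above
+```
+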
+Related methods: [`GPMR`](@ref gpmr).
+
+!!! note
+    The Montoison-Orban process is equivalent to the block-Arnoldi process applied to $\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}$ with initial matrix $\begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}$.
+ It also coincides with the Saunders-Simon-Yip process when $B = A^H$.
+
+```@docs
+montoison_orban
+```
diff --git a/docs/src/reference.md b/docs/src/reference.md
index 0896e1639..f73e10043 100644
--- a/docs/src/reference.md
+++ b/docs/src/reference.md
@@ -6,6 +6,7 @@
```
```@docs
+Krylov.kstdout
Krylov.FloatOrComplex
Krylov.niterations
Krylov.Aprod
diff --git a/docs/src/solvers/gsp.md b/docs/src/solvers/gsp.md
index 10aaccbe0..33c580b8a 100644
--- a/docs/src/solvers/gsp.md
+++ b/docs/src/solvers/gsp.md
@@ -1,5 +1,5 @@
```@meta
-# Generalized saddle-point and unsymmetric partitioned systems
+# Generalized saddle-point and non-Hermitian partitioned systems
```
## GPMR
diff --git a/docs/src/solvers/ln.md b/docs/src/solvers/ln.md
index c5396ffdd..b638b8247 100644
--- a/docs/src/solvers/ln.md
+++ b/docs/src/solvers/ln.md
@@ -36,3 +36,10 @@ craig!
craigmr
craigmr!
```
+
+## USYMLQ
+
+```@docs
+usymlq
+usymlq!
+```
diff --git a/docs/src/solvers/ls.md b/docs/src/solvers/ls.md
index f77057d94..fecfbc417 100644
--- a/docs/src/solvers/ls.md
+++ b/docs/src/solvers/ls.md
@@ -36,3 +36,10 @@ lsqr!
lsmr
lsmr!
```
+
+## USYMQR
+
+```@docs
+usymqr
+usymqr!
+```
diff --git a/docs/src/solvers/sid.md b/docs/src/solvers/sid.md
index 1bd459cd2..e911681be 100644
--- a/docs/src/solvers/sid.md
+++ b/docs/src/solvers/sid.md
@@ -1,5 +1,5 @@
```@meta
-# Symmetric indefinite linear systems
+# Hermitian indefinite linear systems
```
## SYMMLQ
diff --git a/docs/src/solvers/sp_sqd.md b/docs/src/solvers/sp_sqd.md
index 518684b5b..4ee4ab09b 100644
--- a/docs/src/solvers/sp_sqd.md
+++ b/docs/src/solvers/sp_sqd.md
@@ -1,5 +1,5 @@
```@meta
-# Saddle-point and symmetric quasi-definite systems
+# Saddle-point and Hermitian quasi-definite systems
```
## TriCG
diff --git a/docs/src/solvers/spd.md b/docs/src/solvers/spd.md
index 79bb6e9e8..aebda285b 100644
--- a/docs/src/solvers/spd.md
+++ b/docs/src/solvers/spd.md
@@ -1,5 +1,5 @@
```@meta
-# Symmetric positive definite linear systems
+# Hermitian positive definite linear systems
```
## CG
diff --git a/docs/src/solvers/unsymmetric.md b/docs/src/solvers/unsymmetric.md
index 280908ea5..c9e77f787 100644
--- a/docs/src/solvers/unsymmetric.md
+++ b/docs/src/solvers/unsymmetric.md
@@ -1,5 +1,5 @@
```@meta
-# Unsymmetric linear systems
+# Non-Hermitian square linear systems
```
## BiLQ
@@ -16,20 +16,6 @@ qmr
qmr!
```
-## USYMLQ
-
-```@docs
-usymlq
-usymlq!
-```
-
-## USYMQR
-
-```@docs
-usymqr
-usymqr!
-```
-
## CGS
```@docs
@@ -71,3 +57,10 @@ dqgmres!
gmres
gmres!
```
+
+## FGMRES
+
+```@docs
+fgmres
+fgmres!
+```
diff --git a/docs/src/storage.md b/docs/src/storage.md
new file mode 100644
index 000000000..903cc0558
--- /dev/null
+++ b/docs/src/storage.md
@@ -0,0 +1,152 @@
+```@meta
+# Thanks Morten Piibeleht for the hack with the tables!
+```
+
+```@raw html
+
+```
+
+# [Storage requirements](@id storage-requirements)
+
+This section provides the storage requirements of all Krylov methods available in Krylov.jl.
+
+### Notation
+
+We denote by $m$ and $n$ the number of rows and columns of the linear problem.
+The memory parameter of DIOM, FOM, DQGMRES, GMRES, FGMRES and GPMR is $k$.
+The number of shifts of CG-LANCZOS-SHIFT is $p$.
+
+## Theoretical storage requirements
+
+The following tables provide the number of coefficients that must be allocated for each Krylov method.
+The coefficients have the same type as those that compose the linear problem we seek to solve.
+Each table summarizes the storage requirements of the Krylov methods recommended for a specific class of linear problems.
+
+#### Hermitian positive definite linear systems
+
+| Methods | [`CG`](@ref cg) | [`CR`](@ref cr) | [`CG-LANCZOS`](@ref cg_lanczos) | [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift) |
+|:-------:|:---------------:|:---------------:|:-------------------------------:|:-------------------------------------------:|
+| Storage | $4n$ | $5n$ | $5n$ | $3n + 2np + 5p$ |
+
+#### Hermitian indefinite linear systems
+
+| Methods | [`SYMMLQ`](@ref symmlq) | [`MINRES`](@ref minres) | [`MINRES-QLP`](@ref minres_qlp) |
+|:-------:|:-----------------------:|:-----------------------:|:-------------------------------:|
+| Storage | $5n$ | $6n$ | $6n$ |
+
+#### Non-Hermitian square linear systems
+
+| Methods | [`CGS`](@ref cgs) | [`BICGSTAB`](@ref bicgstab) | [`BiLQ`](@ref bilq) | [`QMR`](@ref qmr) |
+|:-------:|:-----------------:|:---------------------------:|:-------------------:|:-----------------:|
+| Storage | $6n$ | $6n$ | $8n$ | $9n$ |
+
+| Methods | [`DIOM`](@ref diom) | [`DQGMRES`](@ref dqgmres) |
+|:-------:|:-------------------:|:-------------------------:|
+| Storage | $n(2k+1) + 2k - 1$ | $n(2k+2) + 3k + 1$ |
+
+| Methods | [`FOM`](@ref fom) | [`GMRES`](@ref gmres) | [`FGMRES`](@ref fgmres) |
+|:-------:|:--------------------------------------------------:|:---------------------------------------:|:----------------------------------------:|
+| Storage$\dfrac{}{}$ | $\!n(2+k) +2k + \dfrac{k(k + 1)}{2}\!$ | $\!n(2+k) + 3k + \dfrac{k(k + 1)}{2}\!$ | $\!n(2+2k) + 3k + \dfrac{k(k + 1)}{2}\!$ |
+
+#### Least-norm problems
+
+| Methods | [`USYMLQ`](@ref usymlq) | [`CGNE`](@ref cgne) | [`CRMR`](@ref crmr) | [`LNLQ`](@ref lnlq) | [`CRAIG`](@ref craig) | [`CRAIGMR`](@ref craigmr) |
+|:-------:|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:---------------------:|:-------------------------:|
+| Storage | $5n + 3m$ | $3n + 2m$ | $3n + 2m$ | $3n + 4m$ | $3n + 4m$ | $4n + 5m$ |
+
+#### Least-squares problems
+
+| Methods | [`USYMQR`](@ref usymqr) | [`CGLS`](@ref cgls) | [`CRLS`](@ref crls) | [`LSLQ`](@ref lslq) | [`LSQR`](@ref lsqr) | [`LSMR`](@ref lsmr) |
+|:-------:|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
+| Storage | $6n + 3m$ | $3n + 2m$ | $4n + 3m$ | $4n + 2m$ | $4n + 2m$ | $5n + 2m$ |
+
+#### Adjoint systems
+
+| Methods | [`BiLQR`](@ref bilqr) | [`TriLQR`](@ref trilqr) |
+|:-------:|:---------------------:|:-----------------------:|
+| Storage | $11n$ | $6m + 5n$ |
+
+#### Saddle-point and Hermitian quasi-definite systems
+
+| Methods | [`TriCG`](@ref tricg) | [`TriMR`](@ref trimr) |
+|:--------:|:---------------------:|:---------------------:|
+| Storage | $6n + 6m$ | $8n + 8m$ |
+
+#### Generalized saddle-point and non-Hermitian partitioned systems
+
+| Method | [`GPMR`](@ref gpmr) |
+|:-------:|:-------------------------:|
+| Storage | $(2+k)(n+m) + 2k^2 + 11k$ |
+
+## Practical storage requirements
+
+Each method has its own `KrylovSolver` that contains all the storage needed by the method.
+In the REPL, the size in bytes of each attribute and the total amount of memory allocated by the solver are displayed when we show a `KrylovSolver`.
+
+```@example storage
+using Krylov
+
+m = 5000
+n = 12000
+A = rand(Float64, m, n)
+b = rand(Float64, m)
+solver = LsmrSolver(A, b)
+show(stdout, solver, show_stats=false)
+```
+
+If we want the total number of bytes used by the solver, we can call `nbytes = sizeof(solver)`.
+
+```@example storage
+nbytes = sizeof(solver)
+```
+
+Thereafter, we can use `Base.format_bytes(nbytes)` to recover what is displayed in the REPL.
+
+```@example storage
+Base.format_bytes(nbytes)
+```
+
+To verify that we match the theoretical results, we just need to multiply the storage requirement of a method by the number of bytes associated with the precision of the linear problem.
+For instance, we need 4 bytes for the precision `Float32`, 8 bytes for precisions `Float64` and `ComplexF32`, and 16 bytes for the precision `ComplexF64`.
+
+```@example storage
+FC = Float64 # precision of the least-squares problem
+ncoefs_lsmr = 5*n + 2*m # number of coefficients
+nbytes_lsmr = sizeof(FC) * ncoefs_lsmr # number of bytes
+```
+
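+If the theoretical count is accurate, it should roughly agree with `sizeof(solver)`; the solver also stores a few scalars, integers and booleans, so a small gap is expected. The comparison below is only a sanity check, not the allocation test performed in the test suite:
+
+```@example storage
+(nbytes - nbytes_lsmr) / nbytes_lsmr  # relative gap between measured and theoretical storage
+```
+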
+Therefore, you can check in advance that you have enough free memory in RAM to allocate a `KrylovSolver`.
+
+```@example storage
+free_nbytes = Sys.free_memory()
+Base.format_bytes(free_nbytes) # Total free memory in RAM in bytes.
+```
+
+!!! note
+  - Beyond speeding up operations, low precisions such as single precision make it possible to store more coefficients in RAM and to solve larger linear problems; see the sketch after this note.
+ - In the file [test_allocations.jl](https://github.com/JuliaSmoothOptimizers/Krylov.jl/blob/main/test/test_allocations.jl), we use the macro `@allocated` to test that we match the expected storage requirement of each method with a tolerance of 2%.
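+
+As a sketch of the first point above, the same solver can be allocated in single precision, assuming the `KrylovSolver` constructors of the form `LsmrSolver(m, n, S)`, where `S` is the storage type; it should require roughly half the memory of its `Float64` counterpart:
+
+```@example storage
+solver_f32 = LsmrSolver(m, n, Vector{Float32})
+Base.format_bytes(sizeof(solver_f32))
+```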
diff --git a/docs/src/tips.md b/docs/src/tips.md
index 604c0633d..ca3d927bd 100644
--- a/docs/src/tips.md
+++ b/docs/src/tips.md
@@ -23,7 +23,7 @@ BLAS.set_num_threads(N) # 1 ≤ N ≤ NMAX
BLAS.get_num_threads()
```
-The recommended number of BLAS threads is the number of physical and not logical cores, which is in general `N = NMAX / 2`.
+The recommended number of BLAS threads is the number of physical cores, not logical cores; it is in general `N = NMAX / 2` if your CPU supports simultaneous multithreading (SMT).
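+
+For reference, a small sketch of this recommendation (it assumes SMT is enabled, so that `Sys.CPU_THREADS` counts logical cores and half of them are physical):
+
+```julia
+using LinearAlgebra
+
+NMAX = Sys.CPU_THREADS          # number of logical cores
+BLAS.set_num_threads(NMAX ÷ 2)  # number of physical cores under SMT
+```
+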
By default Julia ships with OpenBLAS but it's also possible to use Intel MKL BLAS and LAPACK with [MKL.jl](https://github.com/JuliaLinearAlgebra/MKL.jl).
diff --git a/docs/src/warm_start.md b/docs/src/warm-start.md
similarity index 59%
rename from docs/src/warm_start.md
rename to docs/src/warm-start.md
index 030cad6c0..d926db183 100644
--- a/docs/src/warm_start.md
+++ b/docs/src/warm-start.md
@@ -1,9 +1,10 @@
-## Warm Start
+# [Warm-start](@id warm-start)
-Most Krylov methods in this module accept a starting point as argument. The starting point is used as initial approximation to a solution.
+Most Krylov methods in this module accept a starting point as an argument.
+The starting point is used as an initial approximation to a solution.
```julia
-solver = CgSolver(n, n, S)
+solver = CgSolver(A, b)
cg!(solver, A, b, itmax=100)
if !issolved(solver)
cg!(solver, A, b, solver.x, itmax=100) # cg! uses the approximate solution `solver.x` as starting point
@@ -28,7 +29,7 @@ If a Krylov method doesn't have the option to warm start, it can still be done e
We provide an example with `cg_lanczos!`.
```julia
-solver = CgLanczosSolver(n, n, S)
+solver = CgLanczosSolver(A, b)
cg_lanczos!(solver, A, b)
x₀ = solver.x # Ax₀ ≈ b
r = b - A * x₀ # r = b - Ax₀
@@ -41,33 +42,34 @@ Explicit restarts cannot be avoided in certain block methods, such as TriMR, due
```julia
# [E A] [x] = [b]
-# [Aᵀ F] [y] [c]
+# [Aᴴ F] [y] [c]
M = inv(E)
N = inv(F)
x₀, y₀, stats = trimr(A, b, c, M=M, N=N)
# E and F are not available inside TriMR
b₀ = b - E * x₀ - A * y₀
-c₀ = c - Aᵀx₀ - Fy
+c₀ = c - A' * x₀ - F * y₀  # A' = Aᴴ
Δx, Δy, stats = trimr(A, b₀, c₀, M=M, N=N)
x = x₀ + Δx
y = y₀ + Δy
```
-
-## Restarted methods
-
-The storage requierements of Krylov methods based on the Arnoldi process, such as FOM and GMRES, increase as the iteration progresses.
-For very large problems, the storage costs become prohibitive after only few iterations and restarted variants FOM(k) and GMRES(k) are prefered.
-In this section, we show how to use warm starts to implement GMRES(k) and FOM(k).
-
-```julia
-k = 50
-solver = GmresSolver(A, b, k) # FomSolver(A, b, k)
-solver.x .= 0 # solver.x .= x₀
-nrestart = 0
-while !issolved(solver) || nrestart ≤ 10
- solve!(solver, A, b, solver.x, itmax=k)
- nrestart += 1
-end
+```@meta
+# ## Restarted methods
+#
+# The storage requirements of Krylov methods based on the Arnoldi process, such as FOM and GMRES, increase as the iteration progresses.
+# For very large problems, the storage costs become prohibitive after only a few iterations, and the restarted variants FOM(k) and GMRES(k) are preferred.
+# In this section, we show how to use warm starts to implement GMRES(k) and FOM(k).
+#
+# ```julia
+# k = 50
+# solver = GmresSolver(A, b, k) # FomSolver(A, b, k)
+# solver.x .= 0 # solver.x .= x₀
+# nrestart = 0
+# while !issolved(solver) && nrestart ≤ 10
+# solve!(solver, A, b, solver.x, itmax=k)
+# nrestart += 1
+# end
+# ```
```
diff --git a/src/Krylov.jl b/src/Krylov.jl
index b714ccd79..aadde1575 100644
--- a/src/Krylov.jl
+++ b/src/Krylov.jl
@@ -5,6 +5,7 @@ using LinearAlgebra, SparseArrays, Printf
include("krylov_utils.jl")
include("krylov_stats.jl")
include("krylov_solvers.jl")
+include("krylov_processes.jl")
include("cg.jl")
include("cr.jl")
@@ -19,6 +20,7 @@ include("diom.jl")
include("fom.jl")
include("dqgmres.jl")
include("gmres.jl")
+include("fgmres.jl")
include("gpmr.jl")
@@ -49,6 +51,4 @@ include("lnlq.jl")
include("craig.jl")
include("craigmr.jl")
-include("callback_utils.jl")
-
end
diff --git a/src/bicgstab.jl b/src/bicgstab.jl
index c3b914599..c4f16595e 100644
--- a/src/bicgstab.jl
+++ b/src/bicgstab.jl
@@ -16,40 +16,59 @@
export bicgstab, bicgstab!
"""
- (x, stats) = bicgstab(A, b::AbstractVector{FC}; c::AbstractVector{FC}=b,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ (x, stats) = bicgstab(A, b::AbstractVector{FC};
+ c::AbstractVector{FC}=b, M=I, N=I,
+ ldiv::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the square linear system Ax = b using the BICGSTAB method.
+ (x, stats) = bicgstab(A, b, x0::AbstractVector; kwargs...)
+
+BICGSTAB can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the square linear system Ax = b of size n using BICGSTAB.
BICGSTAB requires two initial vectors `b` and `c`.
-The relation `bᵀc ≠ 0` must be satisfied and by default `c = b`.
+The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
The Biconjugate Gradient Stabilized method is a variant of BiCG, like CGS,
-but using different updates for the Aᵀ-sequence in order to obtain smoother
+but using different updates for the Aᴴ-sequence in order to obtain smoother
convergence than CGS.
If BICGSTAB stagnates, we recommend DQGMRES and BiLQ as alternative methods for unsymmetric square systems.
BICGSTAB stops when `itmax` iterations are reached or when `‖rₖ‖ ≤ atol + ‖b‖ * rtol`.
-`atol` is an absolute tolerance and `rtol` is a relative tolerance.
-Additional details can be displayed if verbose mode is enabled (verbose > 0).
-Information will be displayed every `verbose` iterations.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
-This implementation allows a left preconditioner `M` and a right preconditioner `N`.
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-BICGSTAB can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = bicgstab(A, b, x0; kwargs...)
+* `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -86,15 +105,17 @@ function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC},
return solver
end
-function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: AbstractVector{FC}=b,
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC};
+ c :: AbstractVector{FC}=b, M=I, N=I,
+ ldiv :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("BICGSTAB: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "BICGSTAB: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -102,8 +123,8 @@ function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :t , S, n)
@@ -150,14 +171,14 @@ function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC};
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s %8s %8s\n", "k", "‖rₖ‖", "|αₖ|", "|ωₖ|")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %8.1e %8.1e\n", iter, rNorm, abs(α), abs(ω))
+ (verbose > 0) && @printf(iostream, "%5s %7s %8s %8s\n", "k", "‖rₖ‖", "|αₖ|", "|ωₖ|")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %8.1e %8.1e\n", iter, rNorm, abs(α), abs(ω))
next_ρ = @kdot(n, c, r) # ρ₁ = ⟨r̅₀,r₀⟩
if next_ρ == 0
stats.niter = 0
stats.solved, stats.inconsistent = false, false
- stats.status = "Breakdown bᵀc = 0"
+ stats.status = "Breakdown bᴴc = 0"
solver.warm_start = false
return solver
end
@@ -207,9 +228,9 @@ function bicgstab!(solver :: BicgstabSolver{T,FC,S}, A, b :: AbstractVector{FC};
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
breakdown = (α == 0 || isnan(α))
- kdisplay(iter, verbose) && @printf("%5d %7.1e %8.1e %8.1e\n", iter, rNorm, abs(α), abs(ω))
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %8.1e %8.1e\n", iter, rNorm, abs(α), abs(ω))
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "breakdown αₖ == 0")
diff --git a/src/bilq.jl b/src/bilq.jl
index 39725fbfe..12ee40652 100644
--- a/src/bilq.jl
+++ b/src/bilq.jl
@@ -13,35 +13,54 @@
export bilq, bilq!
"""
- (x, stats) = bilq(A, b::AbstractVector{FC}; c::AbstractVector{FC}=b,
- atol::T=√eps(T), rtol::T=√eps(T), transfer_to_bicg::Bool=true,
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ (x, stats) = bilq(A, b::AbstractVector{FC};
+ c::AbstractVector{FC}=b, transfer_to_bicg::Bool=true,
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the square linear system Ax = b using the BiLQ method.
+ (x, stats) = bilq(A, b, x0::AbstractVector; kwargs...)
+BiLQ can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the square linear system Ax = b of size n using BiLQ.
BiLQ is based on the Lanczos biorthogonalization process and requires two initial vectors `b` and `c`.
-The relation `bᵀc ≠ 0` must be satisfied and by default `c = b`.
-When `A` is symmetric and `b = c`, BiLQ is equivalent to SYMMLQ.
+The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
+When `A` is Hermitian and `b = c`, BiLQ is equivalent to SYMMLQ.
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
-An option gives the possibility of transferring to the BiCG point,
-when it exists. The transfer is based on the residual norm.
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-BiLQ can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = bilq(A, b, x0; kwargs...)
+* `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
+* `transfer_to_bicg`: transfer from the BiLQ point to the BiCG point, when it exists. The transfer is based on the residual norm;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
-#### Reference
+#### References
* A. Montoison and D. Orban, [*BiLQ: An Iterative Method for Nonsymmetric Linear Systems with a Quasi-Minimum Error Property*](https://doi.org/10.1137/19M1290991), SIAM Journal on Matrix Analysis and Applications, 41(3), pp. 1145--1166, 2020.
+* R. Fletcher, [*Conjugate gradient methods for indefinite systems*](https://doi.org/10.1007/BFb0080116), Numerical Analysis, Springer, pp. 73--89, 1976.
"""
function bilq end
@@ -73,23 +92,24 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: A
return solver
end
-function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: AbstractVector{FC}=b,
- atol :: T=√eps(T), rtol :: T=√eps(T), transfer_to_bicg :: Bool=true,
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC};
+ c :: AbstractVector{FC}=b, transfer_to_bicg :: Bool=true,
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("BILQ: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "BILQ: system of size %d\n", n)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
uₖ₋₁, uₖ, q, vₖ₋₁, vₖ = solver.uₖ₋₁, solver.uₖ, solver.q, solver.vₖ₋₁, solver.vₖ
@@ -122,29 +142,29 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * bNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, bNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, bNorm)
# Initialize the Lanczos biorthogonalization process.
- cᵗb = @kdot(n, c, r₀) # ⟨c,r₀⟩
- if cᵗb == 0
+ cᴴb = @kdot(n, c, r₀) # ⟨c,r₀⟩
+ if cᴴb == 0
stats.niter = 0
stats.solved = false
stats.inconsistent = false
- stats.status = "Breakdown bᵀc = 0"
+ stats.status = "Breakdown bᴴc = 0"
solver.warm_start = false
return solver
end
- βₖ = √(abs(cᵗb)) # β₁γ₁ = cᵀ(b - Ax₀)
- γₖ = cᵗb / βₖ # β₁γ₁ = cᵀ(b - Ax₀)
+ βₖ = √(abs(cᴴb)) # β₁γ₁ = cᴴ(b - Ax₀)
+ γₖ = cᴴb / βₖ # β₁γ₁ = cᴴ(b - Ax₀)
vₖ₋₁ .= zero(FC) # v₀ = 0
uₖ₋₁ .= zero(FC) # u₀ = 0
vₖ .= r₀ ./ βₖ # v₁ = (b - Ax₀) / β₁
uₖ .= c ./ conj(γₖ) # u₁ = c / γ̄₁
cₖ₋₁ = cₖ = -one(T) # Givens cosines used for the LQ factorization of Tₖ
sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the LQ factorization of Tₖ
- d̅ .= zero(FC) # Last column of D̅ₖ = Vₖ(Qₖ)ᵀ
+ d̅ .= zero(FC) # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
ζₖ₋₁ = ζbarₖ = zero(FC) # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
ζₖ₋₂ = ηₖ = zero(FC) # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
δbarₖ₋₁ = δbarₖ = zero(FC) # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
@@ -164,10 +184,10 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
# Continue the Lanczos biorthogonalization process.
# AVₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀUₖ = Uₖ(Tₖ)ᵀ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , vₖ) # Forms vₖ₊₁ : q ← Avₖ
- mul!(p, Aᵀ, uₖ) # Forms uₖ₊₁ : p ← Aᵀuₖ
+ mul!(p, Aᴴ, uₖ) # Forms uₖ₊₁ : p ← Aᴴuₖ
@kaxpy!(n, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - β̄ₖ * uₖ₋₁
@@ -177,9 +197,9 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
@kaxpy!(n, - αₖ , vₖ, q) # q ← q - αₖ * vₖ
@kaxpy!(n, -conj(αₖ), uₖ, p) # p ← p - ᾱₖ * uₖ
- pᵗq = @kdot(n, p, q) # pᵗq = ⟨p,q⟩
- βₖ₊₁ = √(abs(pᵗq)) # βₖ₊₁ = √(|pᵗq|)
- γₖ₊₁ = pᵗq / βₖ₊₁ # γₖ₊₁ = pᵗq / βₖ₊₁
+ pᴴq = @kdot(n, p, q) # pᴴq = ⟨p,q⟩
+ βₖ₊₁ = √(abs(pᴴq)) # βₖ₊₁ = √(|pᴴq|)
+ γₖ₊₁ = pᴴq / βₖ₊₁ # γₖ₊₁ = pᴴq / βₖ₊₁
# Update the LQ factorization of Tₖ = L̅ₖQₖ.
# [ α₁ γ₂ 0 • • • 0 ] [ δ₁ 0 • • • • 0 ]
@@ -234,7 +254,7 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
ηₖ = -ϵₖ₋₂ * ζₖ₋₂ - λₖ₋₁ * ζₖ₋₁
end
- # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Vₖ(Qₖ)ᵀ.
+ # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Vₖ(Qₖ)ᴴ.
# [d̅ₖ₋₁ vₖ] [cₖ s̄ₖ] = [dₖ₋₁ d̅ₖ] ⟷ dₖ₋₁ = cₖ * d̅ₖ₋₁ + sₖ * vₖ
# [sₖ -cₖ] ⟷ d̅ₖ = s̄ₖ * d̅ₖ₋₁ - cₖ * vₖ
if iter ≥ 2
@@ -257,13 +277,13 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
@. vₖ₋₁ = vₖ # vₖ₋₁ ← vₖ
@. uₖ₋₁ = uₖ # uₖ₋₁ ← uₖ
- if pᵗq ≠ 0
+ if pᴴq ≠ 0
@. vₖ = q / βₖ₊₁ # βₖ₊₁vₖ₊₁ = q
@. uₖ = p / conj(γₖ₊₁) # γ̄ₖ₊₁uₖ₊₁ = p
end
# Compute ⟨vₖ,vₖ₊₁⟩ and ‖vₖ₊₁‖
- vₖᵀvₖ₊₁ = @kdot(n, vₖ₋₁, vₖ)
+ vₖᴴvₖ₊₁ = @kdot(n, vₖ₋₁, vₖ)
norm_vₖ₊₁ = @knrm2(n, vₖ)
# Compute BiLQ residual norm
@@ -273,7 +293,7 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
else
μₖ = βₖ * (sₖ₋₁ * ζₖ₋₂ - cₖ₋₁ * cₖ * ζₖ₋₁) + αₖ * sₖ * ζₖ₋₁
ωₖ = βₖ₊₁ * sₖ * ζₖ₋₁
- θₖ = conj(μₖ) * ωₖ * vₖᵀvₖ₊₁
+ θₖ = conj(μₖ) * ωₖ * vₖᴴvₖ₊₁
rNorm_lq = sqrt(abs2(μₖ) * norm_vₖ^2 + abs2(ωₖ) * norm_vₖ₊₁^2 + 2 * real(θₖ))
end
history && push!(rNorms, rNorm_lq)
@@ -299,10 +319,10 @@ function bilq!(solver :: BilqSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Ab
solved_lq = rNorm_lq ≤ ε
solved_cg = transfer_to_bicg && (abs(δbarₖ) > eps(T)) && (rNorm_cg ≤ ε)
tired = iter ≥ itmax
- breakdown = !solved_lq && !solved_cg && (pᵗq == 0)
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm_lq)
+ breakdown = !solved_lq && !solved_cg && (pᴴq == 0)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm_lq)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute BICG point
# (xᶜ)ₖ ← (xᴸ)ₖ₋₁ + ζbarₖ * d̅ₖ
diff --git a/src/bilqr.jl b/src/bilqr.jl
index 09fef1f6c..5666f0863 100644
--- a/src/bilqr.jl
+++ b/src/bilqr.jl
@@ -1,5 +1,5 @@
# An implementation of BILQR for the solution of square
-# consistent linear adjoint systems Ax = b and Aᵀy = c.
+# consistent linear adjoint systems Ax = b and Aᴴy = c.
#
# This method is described in
#
@@ -14,33 +14,54 @@ export bilqr, bilqr!
"""
(x, y, stats) = bilqr(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- atol::T=√eps(T), rtol::T=√eps(T), transfer_to_bicg::Bool=true,
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ transfer_to_bicg::Bool=true, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
+ (x, y, stats) = bilqr(A, b, c, x0::AbstractVector, y0::AbstractVector; kwargs...)
+
+BiLQR can be warm-started from initial guesses `x0` and `y0` where `kwargs` are the same keyword arguments as above.
+
Combine BiLQ and QMR to solve adjoint systems.
[0 A] [y] = [b]
- [Aᵀ 0] [x] [c]
+ [Aᴴ 0] [x] [c]
+
+The relation `bᴴc ≠ 0` must be satisfied.
+BiLQ is used for solving the primal system `Ax = b` of size n.
+QMR is used for solving the dual system `Aᴴy = c` of size n.
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n;
+* `c`: a vector of length n.
-The relation `bᵀc ≠ 0` must be satisfied.
-BiLQ is used for solving primal system `Ax = b`.
-QMR is used for solving dual system `Aᵀy = c`.
+#### Optional arguments
-An option gives the possibility of transferring from the BiLQ point to the
-BiCG point, when it exists. The transfer is based on the residual norm.
+* `x0`: a vector of length n that represents an initial guess of the solution x;
+* `y0`: a vector of length n that represents an initial guess of the solution y.
-BiLQR can be warm-started from initial guesses `x0` and `y0` with the method
+#### Keyword arguments
- (x, y, stats) = bilqr(A, b, c, x0, y0; kwargs...)
+* `transfer_to_bicg`: transfer from the BiLQ point to the BiCG point, when it exists. The transfer is based on the residual norm;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `y`: a dense vector of length n;
+* `stats`: statistics collected on the run in an [`AdjointStats`](@ref) structure.
#### Reference
@@ -78,23 +99,24 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
end
function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- atol :: T=√eps(T), rtol :: T=√eps(T), transfer_to_bicg :: Bool=true,
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ transfer_to_bicg :: Bool=true, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("Systems must be square")
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("BILQR: systems of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "BILQR: systems of size %d\n", n)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
uₖ₋₁, uₖ, q, vₖ₋₁, vₖ = solver.uₖ₋₁, solver.uₖ, solver.q, solver.vₖ₋₁, solver.vₖ
@@ -109,7 +131,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
if warm_start
mul!(r₀, A, Δx)
@kaxpby!(n, one(FC), b, -one(FC), r₀)
- mul!(s₀, Aᵀ, Δy)
+ mul!(s₀, Aᴴ, Δy)
@kaxpby!(n, one(FC), c, -one(FC), s₀)
end
@@ -117,7 +139,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
x .= zero(FC) # x₀
bNorm = @knrm2(n, r₀) # rNorm = ‖r₀‖
- # Initial solution t₀ and residual norm ‖s₀‖ = ‖c - Aᵀy₀‖.
+ # Initial solution t₀ and residual norm ‖s₀‖ = ‖c - Aᴴy₀‖.
t .= zero(FC) # t₀
cNorm = @knrm2(n, s₀) # sNorm = ‖s₀‖
@@ -128,38 +150,38 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
history && push!(sNorms, cNorm)
εL = atol + rtol * bNorm
εQ = atol + rtol * cNorm
- (verbose > 0) && @printf("%5s %7s %7s\n", "k", "‖rₖ‖", "‖sₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e\n", iter, bNorm, cNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s\n", "k", "‖rₖ‖", "‖sₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e\n", iter, bNorm, cNorm)
# Initialize the Lanczos biorthogonalization process.
- cᵗb = @kdot(n, s₀, r₀) # ⟨s₀,r₀⟩ = ⟨c - Aᵀy₀,b - Ax₀⟩
- if cᵗb == 0
+ cᴴb = @kdot(n, s₀, r₀) # ⟨s₀,r₀⟩ = ⟨c - Aᴴy₀,b - Ax₀⟩
+ if cᴴb == 0
stats.niter = 0
stats.solved_primal = false
stats.solved_dual = false
- stats.status = "Breakdown bᵀc = 0"
+ stats.status = "Breakdown bᴴc = 0"
solver.warm_start = false
return solver
end
# Set up workspace.
- βₖ = √(abs(cᵗb)) # β₁γ₁ = (c - Aᵀy₀)ᵀ(b - Ax₀)
- γₖ = cᵗb / βₖ # β₁γ₁ = (c - Aᵀy₀)ᵀ(b - Ax₀)
+ βₖ = √(abs(cᴴb)) # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
+ γₖ = cᴴb / βₖ # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
vₖ₋₁ .= zero(FC) # v₀ = 0
uₖ₋₁ .= zero(FC) # u₀ = 0
vₖ .= r₀ ./ βₖ # v₁ = (b - Ax₀) / β₁
- uₖ .= s₀ ./ conj(γₖ) # u₁ = (c - Aᵀy₀) / γ̄₁
+ uₖ .= s₀ ./ conj(γₖ) # u₁ = (c - Aᴴy₀) / γ̄₁
cₖ₋₁ = cₖ = -one(T) # Givens cosines used for the LQ factorization of Tₖ
sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the LQ factorization of Tₖ
- d̅ .= zero(FC) # Last column of D̅ₖ = Vₖ(Qₖ)ᵀ
+ d̅ .= zero(FC) # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
ζₖ₋₁ = ζbarₖ = zero(FC) # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
ζₖ₋₂ = ηₖ = zero(FC) # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
δbarₖ₋₁ = δbarₖ = zero(FC) # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
ψbarₖ₋₁ = ψₖ₋₁ = zero(FC) # ψₖ₋₁ and ψbarₖ are the last components of h̅ₖ = Qₖγ̄₁e₁
norm_vₖ = bNorm / βₖ # ‖vₖ‖ is used for residual norm estimates
ϵₖ₋₃ = λₖ₋₂ = zero(FC) # Components of Lₖ₋₁
- wₖ₋₃ .= zero(FC) # Column k-3 of Wₖ = Uₖ(Lₖ)⁻ᵀ
- wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Uₖ(Lₖ)⁻ᵀ
+ wₖ₋₃ .= zero(FC) # Column k-3 of Wₖ = Uₖ(Lₖ)⁻ᴴ
+ wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Uₖ(Lₖ)⁻ᴴ
τₖ = zero(T) # τₖ is used for the dual residual norm estimate
# Stopping criterion.
@@ -180,10 +202,10 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Continue the Lanczos biorthogonalization process.
# AVₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀUₖ = Uₖ(Tₖ)ᵀ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , vₖ) # Forms vₖ₊₁ : q ← Avₖ
- mul!(p, Aᵀ, uₖ) # Forms uₖ₊₁ : p ← Aᵀuₖ
+ mul!(p, Aᴴ, uₖ) # Forms uₖ₊₁ : p ← Aᴴuₖ
@kaxpy!(n, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - β̄ₖ * uₖ₋₁
@@ -193,9 +215,9 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
@kaxpy!(n, - αₖ , vₖ, q) # q ← q - αₖ * vₖ
@kaxpy!(n, -conj(αₖ), uₖ, p) # p ← p - ᾱₖ * uₖ
- pᵗq = @kdot(n, p, q) # pᵗq = ⟨p,q⟩
- βₖ₊₁ = √(abs(pᵗq)) # βₖ₊₁ = √(|pᵗq|)
- γₖ₊₁ = pᵗq / βₖ₊₁ # γₖ₊₁ = pᵗq / βₖ₊₁
+ pᴴq = @kdot(n, p, q) # pᴴq = ⟨p,q⟩
+ βₖ₊₁ = √(abs(pᴴq)) # βₖ₊₁ = √(|pᴴq|)
+ γₖ₊₁ = pᴴq / βₖ₊₁ # γₖ₊₁ = pᴴq / βₖ₊₁
# Update the LQ factorization of Tₖ = L̅ₖQₖ.
# [ α₁ γ₂ 0 • • • 0 ] [ δ₁ 0 • • • • 0 ]
@@ -251,7 +273,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
ηₖ = -ϵₖ₋₂ * ζₖ₋₂ - λₖ₋₁ * ζₖ₋₁
end
- # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Vₖ(Qₖ)ᵀ.
+ # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Vₖ(Qₖ)ᴴ.
# [d̅ₖ₋₁ vₖ] [cₖ s̄ₖ] = [dₖ₋₁ d̅ₖ] ⟷ dₖ₋₁ = cₖ * d̅ₖ₋₁ + sₖ * vₖ
# [sₖ -cₖ] ⟷ d̅ₖ = s̄ₖ * d̅ₖ₋₁ - cₖ * vₖ
if iter ≥ 2
@@ -271,7 +293,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
end
# Compute ⟨vₖ,vₖ₊₁⟩ and ‖vₖ₊₁‖
- vₖᵀvₖ₊₁ = @kdot(n, vₖ, q) / βₖ₊₁
+ vₖᴴvₖ₊₁ = @kdot(n, vₖ, q) / βₖ₊₁
norm_vₖ₊₁ = @knrm2(n, q) / βₖ₊₁
# Compute BiLQ residual norm
@@ -281,7 +303,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
else
μₖ = βₖ * (sₖ₋₁ * ζₖ₋₂ - cₖ₋₁ * cₖ * ζₖ₋₁) + αₖ * sₖ * ζₖ₋₁
ωₖ = βₖ₊₁ * sₖ * ζₖ₋₁
- θₖ = conj(μₖ) * ωₖ * vₖᵀvₖ₊₁
+ θₖ = conj(μₖ) * ωₖ * vₖᴴvₖ₊₁
rNorm_lq = sqrt(abs2(μₖ) * norm_vₖ^2 + abs2(ωₖ) * norm_vₖ₊₁^2 + 2 * real(θₖ))
end
history && push!(rNorms, rNorm_lq)
@@ -318,7 +340,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
ψbarₖ = sₖ * ψbarₖ₋₁
end
- # Compute the direction wₖ₋₁, the last column of Wₖ₋₁ = (Uₖ₋₁)(Lₖ₋₁)⁻ᵀ ⟷ (L̄ₖ₋₁)(Wₖ₋₁)ᵀ = (Uₖ₋₁)ᵀ.
+ # Compute the direction wₖ₋₁, the last column of Wₖ₋₁ = (Uₖ₋₁)(Lₖ₋₁)⁻ᴴ ⟷ (L̄ₖ₋₁)(Wₖ₋₁)ᵀ = (Uₖ₋₁)ᵀ.
# w₁ = u₁ / δ̄₁
if iter == 2
wₖ₋₁ = wₖ₋₂
@@ -372,7 +394,7 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
@. vₖ₋₁ = vₖ # vₖ₋₁ ← vₖ
@. uₖ₋₁ = uₖ # uₖ₋₁ ← uₖ
- if pᵗq ≠ zero(FC)
+ if pᴴq ≠ zero(FC)
@. vₖ = q / βₖ₊₁ # βₖ₊₁vₖ₊₁ = q
@. uₖ = p / conj(γₖ₊₁) # γ̄ₖ₊₁uₖ₊₁ = p
end
@@ -392,13 +414,13 @@ function bilqr!(solver :: BilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
user_requested_exit = callback(solver) :: Bool
tired = iter ≥ itmax
- breakdown = !solved_lq && !solved_cg && (pᵗq == 0)
+ breakdown = !solved_lq && !solved_cg && (pᴴq == 0)
- kdisplay(iter, verbose) && solved_primal && !solved_dual && @printf("%5d %7s %7.1e\n", iter, "", sNorm)
- kdisplay(iter, verbose) && !solved_primal && solved_dual && @printf("%5d %7.1e %7s\n", iter, rNorm_lq, "")
- kdisplay(iter, verbose) && !solved_primal && !solved_dual && @printf("%5d %7.1e %7.1e\n", iter, rNorm_lq, sNorm)
+ kdisplay(iter, verbose) && solved_primal && !solved_dual && @printf(iostream, "%5d %7s %7.1e\n", iter, "", sNorm)
+ kdisplay(iter, verbose) && !solved_primal && solved_dual && @printf(iostream, "%5d %7.1e %7s\n", iter, rNorm_lq, "")
+ kdisplay(iter, verbose) && !solved_primal && !solved_dual && @printf(iostream, "%5d %7.1e %7.1e\n", iter, rNorm_lq, sNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute BICG point
# (xᶜ)ₖ ← (xᴸ)ₖ₋₁ + ζbarₖ * d̅ₖ
diff --git a/src/callback_utils.jl b/src/callback_utils.jl
deleted file mode 100644
index eac362e5d..000000000
--- a/src/callback_utils.jl
+++ /dev/null
@@ -1,50 +0,0 @@
-export StorageGetxRestartedGmres
-
-export get_x_restarted_gmres!
-
-mutable struct StorageGetxRestartedGmres{S}
- x::S
- y::S
- p::S
-end
-StorageGetxRestartedGmres(solver::GmresSolver; N = I) =
- StorageGetxRestartedGmres(similar(solver.x), similar(solver.z), (N === I) ? similar(solver.p) : similar(solver.x))
-
-function get_x_restarted_gmres!(solver::GmresSolver{T,FC,S}, A,
- stor::StorageGetxRestartedGmres{S}, N) where {T,FC,S}
- NisI = (N === I)
- x2, y2, p2 = stor.x, stor.y, stor.p
- n = size(A, 2)
- # Compute yₖ by solving Rₖyₖ = zₖ with backward substitution.
- nr = sum(1:solver.inner_iter)
- y = solver.z # yᵢ = zᵢ
- y2 .= y
- R = solver.R
- V = solver.V
- x2 .= solver.Δx
- for i = solver.inner_iter : -1 : 1
- pos = nr + i - solver.inner_iter # position of rᵢ.ₖ
- for j = solver.inner_iter : -1 : i+1
- y2[i] = y2[i] - R[pos] * y2[j] # yᵢ ← yᵢ - rᵢⱼyⱼ
- pos = pos - j + 1 # position of rᵢ.ⱼ₋₁
- end
- # Rₖ can be singular if the system is inconsistent
- if abs(R[pos]) ≤ eps(T)^(3/4)
- y2[i] = zero(FC)
- inconsistent = true
- else
- y2[i] = y2[i] / R[pos] # yᵢ ← yᵢ / rᵢᵢ
- end
- end
-
- # Form xₖ = N⁻¹Vₖyₖ
- for i = 1 : solver.inner_iter
- @kaxpy!(n, y2[i], V[i], x2)
- end
- if !NisI
- p2 .= solver.p
- p2 .= x2
- mul!(x2, N, p2)
- end
- x2 .+= solver.x
-end
diff --git a/src/cg.jl b/src/cg.jl
index 8a974accc..ed9d88cfa 100644
--- a/src/cg.jl
+++ b/src/cg.jl
@@ -15,36 +15,53 @@
export cg, cg!
-
"""
(x, stats) = cg(A, b::AbstractVector{FC};
- M=I, atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, radius::T=zero(T), linesearch::Bool=false,
+ M=I, ldiv::Bool=false, radius::T=zero(T),
+ linesearch::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-The conjugate gradient method to solve the symmetric linear system Ax=b.
+ (x, stats) = cg(A, b, x0::AbstractVector; kwargs...)
-The method does _not_ abort if A is not definite.
+CG can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be symmetric and positive definite.
+The conjugate gradient method to solve the Hermitian linear system Ax = b of size n.
+
+The method does _not_ abort if A is not definite.
M also indicates the weighted norm in which residuals are measured.
-If `itmax=0`, the default number of iterations is set to `2 * n`,
-with `n = length(b)`.
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian positive definite matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-CG can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = cg(A, b, x0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `linesearch`: if `true`, indicate that the solution is to be used in an inexact Newton method with linesearch. If negative curvature is detected at iteration k > 0, the solution of iteration k-1 is returned. If negative curvature is detected at iteration 0, the right-hand side is returned (i.e., the negative gradient);
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -81,24 +98,25 @@ function cg!(solver :: CgSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: Abstr
end
function cg!(solver :: CgSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, radius :: T=zero(T), linesearch :: Bool=false,
+ M=I, ldiv :: Bool=false, radius :: T=zero(T),
+ linesearch :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
linesearch && (radius > 0) && error("`linesearch` set to `true` but trust-region radius > 0")
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("CG: system of %d equations in %d variables\n", n, n)
+ (verbose > 0) && @printf(iostream, "CG: system of %d equations in %d variables\n", n, n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :z, S, n)
@@ -134,8 +152,8 @@ function cg!(solver :: CgSolver{T,FC,S}, A, b :: AbstractVector{FC};
pAp = zero(T)
pNorm² = γ
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s %8s %8s %8s\n", "k", "‖r‖", "pAp", "α", "σ")
- kdisplay(iter, verbose) && @printf("%5d %7.1e ", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s %8s %8s %8s\n", "k", "‖r‖", "pAp", "α", "σ")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e ", iter, rNorm)
solved = rNorm ≤ ε
tired = iter ≥ itmax
@@ -164,9 +182,9 @@ function cg!(solver :: CgSolver{T,FC,S}, A, b :: AbstractVector{FC};
α = γ / pAp
# Compute step size to boundary if applicable.
- σ = radius > 0 ? maximum(to_boundary(x, p, radius, dNorm2=pNorm²)) : α
+ σ = radius > 0 ? maximum(to_boundary(n, x, p, radius, dNorm2=pNorm²)) : α
- kdisplay(iter, verbose) && @printf("%8.1e %8.1e %8.1e\n", pAp, α, σ)
+ kdisplay(iter, verbose) && @printf(iostream, "%8.1e %8.1e %8.1e\n", pAp, α, σ)
# Move along p from x to the boundary if either
# the next step leads outside the trust region or
@@ -201,9 +219,9 @@ function cg!(solver :: CgSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + 1
tired = iter ≥ itmax
user_requested_exit = callback(solver) :: Bool
- kdisplay(iter, verbose) && @printf("%5d %7.1e ", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e ", iter, rNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
solved && on_boundary && (status = "on trust-region boundary")
solved && linesearch && (pAp ≤ 0) && (status = "nonpositive curvature detected")
diff --git a/src/cg_lanczos.jl b/src/cg_lanczos.jl
index a8e24f02f..f648eb2a8 100644
--- a/src/cg_lanczos.jl
+++ b/src/cg_lanczos.jl
@@ -12,34 +12,52 @@
export cg_lanczos, cg_lanczos!
-
"""
(x, stats) = cg_lanczos(A, b::AbstractVector{FC};
- M=I, atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
- check_curvature::Bool=false, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, ldiv::Bool=false,
+ check_curvature::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-The Lanczos version of the conjugate gradient method to solve the
-symmetric linear system
+ (x, stats) = cg_lanczos(A, b, x0::AbstractVector; kwargs...)
- Ax = b
+CG-LANCZOS can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+The Lanczos version of the conjugate gradient method to solve the
+Hermitian linear system Ax = b of size n.
The method does _not_ abort if A is not definite.
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be hermitian and positive definite.
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-CG-LANCZOS can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = cg_lanczos(A, b, x0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `check_curvature`: if `true`, check that the curvature of the quadratic along the search direction is positive, and abort if not, unless `linesearch` is also `true`;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`LanczosStats`](@ref) structure.
#### References
@@ -77,21 +95,23 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
end
function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
- check_curvature :: Bool=false, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, ldiv :: Bool=false,
+ check_curvature :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("CG Lanczos: system of %d equations in %d variables\n", n, n)
+ (verbose > 0) && @printf(iostream, "CG Lanczos: system of %d equations in %d variables\n", n, n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $T")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :v, S, n)
@@ -111,7 +131,7 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
Mv .= b
end
MisI || mulorldiv!(v, M, Mv, ldiv) # v₁ = M⁻¹r₀
- β = sqrt(@kdotr(n, v, Mv)) # β₁ = v₁ᵀ M v₁
+ β = sqrt(@kdotr(n, v, Mv)) # β₁ = v₁ᴴ M v₁
σ = β
rNorm = σ
history && push!(rNorms, rNorm)
@@ -143,8 +163,8 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
# Define stopping tolerance.
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
indefinite = false
solved = rNorm ≤ ε
@@ -157,10 +177,10 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
# Form next Lanczos vector.
# βₖ₊₁Mvₖ₊₁ = Avₖ - δₖMvₖ - βₖMvₖ₋₁
mul!(Mv_next, A, v) # Mvₖ₊₁ ← Avₖ
- δ = @kdotr(n, v, Mv_next) # δₖ = vₖᵀ A vₖ
+ δ = @kdotr(n, v, Mv_next) # δₖ = vₖᴴ A vₖ
# Check curvature. Exit fast if requested.
- # It is possible to show that σₖ² (δₖ - ωₖ₋₁ / γₖ₋₁) = pₖᵀ A pₖ.
+ # It is possible to show that σₖ² (δₖ - ωₖ₋₁ / γₖ₋₁) = pₖᴴ A pₖ.
γ = one(T) / (δ - ω / γ) # γₖ = 1 / (δₖ - ωₖ₋₁ / γₖ₋₁)
indefinite |= (γ ≤ 0)
(check_curvature & indefinite) && continue
@@ -172,7 +192,7 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
end
@. Mv = Mv_next # Mvₖ ← Mvₖ₊₁
MisI || mulorldiv!(v, M, Mv, ldiv) # vₖ₊₁ = M⁻¹ * Mvₖ₊₁
- β = sqrt(@kdotr(n, v, Mv)) # βₖ₊₁ = vₖ₊₁ᵀ M vₖ₊₁
+ β = sqrt(@kdotr(n, v, Mv)) # βₖ₊₁ = vₖ₊₁ᴴ M vₖ₊₁
@kscal!(n, one(FC) / β, v) # vₖ₊₁ ← vₖ₊₁ / βₖ₊₁
MisI || @kscal!(n, one(FC) / β, Mv) # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
Anorm2 += β_prev^2 + β^2 + δ^2 # Use ‖Tₖ₊₁‖₂ as increasing approximation of ‖A‖₂.
@@ -187,7 +207,7 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
rNorm = abs(σ) # ‖rₖ₊₁‖_M = |σₖ₊₁| because rₖ₊₁ = σₖ₊₁ * vₖ₊₁ and ‖vₖ₊₁‖_M = 1
history && push!(rNorms, rNorm)
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
# Stopping conditions that do not depend on user input.
# This is to guard against tolerances that are unreasonably small.
@@ -198,7 +218,7 @@ function cg_lanczos!(solver :: CgLanczosSolver{T,FC,S}, A, b :: AbstractVector{F
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
(check_curvature & indefinite) && (status = "negative curvature")
diff --git a/src/cg_lanczos_shift.jl b/src/cg_lanczos_shift.jl
index 01f11e41f..bf883649d 100644
--- a/src/cg_lanczos_shift.jl
+++ b/src/cg_lanczos_shift.jl
@@ -13,13 +13,13 @@
export cg_lanczos_shift, cg_lanczos_shift!
-
"""
(x, stats) = cg_lanczos_shift(A, b::AbstractVector{FC}, shifts::AbstractVector{T};
- M=I, atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, check_curvature::Bool=false,
+ M=I, ldiv::Bool=false,
+ check_curvature::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -27,15 +27,38 @@ export cg_lanczos_shift, cg_lanczos_shift!
The Lanczos version of the conjugate gradient method to solve a family
of shifted systems
- (A + αI) x = b (α = α₁, ..., αₙ)
+ (A + αI) x = b (α = α₁, ..., αₚ)
+
+of size n. The method does _not_ abort if A + αI is not definite.
+
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n;
+* `shifts`: a vector of length p.
-The method does _not_ abort if A + αI is not definite.
+#### Keyword arguments
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be hermitian and positive definite.
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `check_curvature`: if `true`, check that the curvature of the quadratic along the search direction is positive, and abort if not;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Output arguments
+
+* `x`: a vector of p dense vectors, each one of length n;
+* `stats`: statistics collected on the run in a [`LanczosShiftStats`](@ref) structure.
+
+#### References
+
+* A. Frommer and P. Maass, [*Fast CG-Based Methods for Tikhonov-Phillips Regularization*](https://doi.org/10.1137/S1064827596313310), SIAM Journal on Scientific Computing, 20(5), pp. 1831--1850, 1999.
+* C. C. Paige and M. A. Saunders, [*Solution of Sparse Indefinite Systems of Linear Equations*](https://doi.org/10.1137/0712047), SIAM Journal on Numerical Analysis, 12(4), pp. 617--629, 1975.
"""
function cg_lanczos_shift end
@@ -56,24 +79,25 @@ See [`CgLanczosShiftSolver`](@ref) for more details about the `solver`.
function cg_lanczos_shift! end
function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: AbstractVector{FC}, shifts :: AbstractVector{T};
- M=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, check_curvature :: Bool=false,
+ M=I, ldiv :: Bool=false,
+ check_curvature :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == n || error("Inconsistent problem size")
nshifts = length(shifts)
- (verbose > 0) && @printf("CG Lanczos: system of %d equations in %d variables with %d shifts\n", n, n, nshifts)
+ (verbose > 0) && @printf(iostream, "CG Lanczos: system of %d equations in %d variables with %d shifts\n", n, n, nshifts)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :v, S, n)
@@ -92,7 +116,7 @@ function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: Abstr
end
Mv .= b # Mv₁ ← b
MisI || mulorldiv!(v, M, Mv, ldiv) # v₁ = M⁻¹ * Mv₁
- β = sqrt(@kdotr(n, v, Mv)) # β₁ = v₁ᵀ M v₁
+ β = sqrt(@kdotr(n, v, Mv)) # β₁ = v₁ᴴ M v₁
rNorms .= β
if history
for i = 1 : nshifts
@@ -140,14 +164,10 @@ function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: Abstr
itmax == 0 && (itmax = 2 * n)
# Build format strings for printing.
- if kdisplay(iter, verbose)
- fmt = "%5d" * repeat(" %8.1e", nshifts) * "\n"
- # precompile printf for our particular format
- local_printf(data...) = Core.eval(Main, :(@printf($fmt, $(data)...)))
- local_printf(iter, rNorms...)
- end
+ (verbose > 0) && (fmt = Printf.Format("%5d" * repeat(" %8.1e", nshifts) * "\n"))
+ kdisplay(iter, verbose) && Printf.format(iostream, fmt, iter, rNorms...)
- solved = sum(not_cv) == 0
+ solved = !reduce(|, not_cv)
tired = iter ≥ itmax
status = "unknown"
user_requested_exit = false
@@ -157,7 +177,7 @@ function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: Abstr
# Form next Lanczos vector.
# βₖ₊₁Mvₖ₊₁ = Avₖ - δₖMvₖ - βₖMvₖ₋₁
mul!(Mv_next, A, v) # Mvₖ₊₁ ← Avₖ
- δ = @kdotr(n, v, Mv_next) # δₖ = vₖᵀ A vₖ
+ δ = @kdotr(n, v, Mv_next) # δₖ = vₖᴴ A vₖ
@kaxpy!(n, -δ, Mv, Mv_next) # Mvₖ₊₁ ← Mvₖ₊₁ - δₖMvₖ
if iter > 0
@kaxpy!(n, -β, Mv_prev, Mv_next) # Mvₖ₊₁ ← Mvₖ₊₁ - βₖMvₖ₋₁
@@ -165,12 +185,12 @@ function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: Abstr
end
@. Mv = Mv_next # Mvₖ ← Mvₖ₊₁
MisI || mulorldiv!(v, M, Mv, ldiv) # vₖ₊₁ = M⁻¹ * Mvₖ₊₁
- β = sqrt(@kdotr(n, v, Mv)) # βₖ₊₁ = vₖ₊₁ᵀ M vₖ₊₁
+ β = sqrt(@kdotr(n, v, Mv)) # βₖ₊₁ = vₖ₊₁ᴴ M vₖ₊₁
@kscal!(n, one(FC) / β, v) # vₖ₊₁ ← vₖ₊₁ / βₖ₊₁
MisI || @kscal!(n, one(FC) / β, Mv) # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
- # Check curvature: vₖᵀ(A + sᵢI)vₖ = vₖᵀAvₖ + sᵢ‖vₖ‖² = δₖ + ρₖ * sᵢ with ρₖ = ‖vₖ‖².
- # It is possible to show that σₖ² (δₖ + ρₖ * sᵢ - ωₖ₋₁ / γₖ₋₁) = pₖᵀ (A + sᵢ I) pₖ.
+ # Check curvature: vₖᴴ(A + sᵢI)vₖ = vₖᴴAvₖ + sᵢ‖vₖ‖² = δₖ + ρₖ * sᵢ with ρₖ = ‖vₖ‖².
+ # It is possible to show that σₖ² (δₖ + ρₖ * sᵢ - ωₖ₋₁ / γₖ₋₁) = pₖᴴ (A + sᵢ I) pₖ.
MisI || (ρ = @kdotr(n, v, v))
for i = 1 : nshifts
δhat[i] = δ + ρ * shifts[i]
@@ -208,13 +228,13 @@ function cg_lanczos_shift!(solver :: CgLanczosShiftSolver{T,FC,S}, A, b :: Abstr
not_cv[i] = check_curvature ? !(converged[i] || indefinite[i]) : !converged[i]
end
iter = iter + 1
- kdisplay(iter, verbose) && local_printf(iter, rNorms...)
+ kdisplay(iter, verbose) && Printf.format(iostream, fmt, iter, rNorms...)
user_requested_exit = callback(solver) :: Bool
- solved = sum(not_cv) == 0
+ solved = !reduce(|, not_cv)
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
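
The shifted variant solves all systems (A + αᵢI)x = b in a single Lanczos pass and now builds its per-shift residual line with `Printf.Format` instead of `Core.eval`. A minimal sketch with made-up data:

```julia
using Krylov, LinearAlgebra

n = 50
A = Matrix(SymTridiagonal(2 * ones(n), -ones(n - 1)))  # Hermitian
b = ones(n)
shifts = [0.0, 0.1, 1.0]          # p = 3 shifts

x, stats = cg_lanczos_shift(A, b, shifts)

# x is a vector of p solution vectors, one per shift.
for (α, xα) in zip(shifts, x)
    println("α = $α  ‖b - (A + αI)xα‖ = $(norm(b - (A + α * I) * xα))")
end
```
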
diff --git a/src/cgls.jl b/src/cgls.jl
index f5529fbfb..55fe6d0ec 100644
--- a/src/cgls.jl
+++ b/src/cgls.jl
@@ -5,7 +5,7 @@
#
# equivalently, of the normal equations
#
-# AᵀAx = Aᵀb.
+# AᴴAx = Aᴴb.
#
# CGLS is formally equivalent to applying the conjugate gradient method
# to the normal equations but should be more stable. It is also formally
@@ -28,12 +28,12 @@
export cgls, cgls!
-
"""
(x, stats) = cgls(A, b::AbstractVector{FC};
- M=I, λ::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
- radius::T=zero(T), itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, ldiv::Bool=false, radius::T=zero(T),
+ λ::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
+ itmax::Int=0, verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -42,19 +42,40 @@ Solve the regularized linear least-squares problem
minimize ‖b - Ax‖₂² + λ‖x‖₂²
-using the Conjugate Gradient (CG) method, where λ ≥ 0 is a regularization
+of size m × n using the Conjugate Gradient (CG) method, where λ ≥ 0 is a regularization
parameter. This method is equivalent to applying CG to the normal equations
- (AᵀA + λI) x = Aᵀb
+ (AᴴA + λI) x = Aᴴb
but is more stable.
-CGLS produces monotonic residuals ‖r‖₂ but not optimality residuals ‖Aᵀr‖₂.
+CGLS produces monotonic residuals ‖r‖₂ but not optimality residuals ‖Aᴴr‖₂.
It is formally equivalent to LSQR, though can be slightly less accurate,
but simpler to implement.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -79,23 +100,24 @@ See [`CglsSolver`](@ref) for more details about the `solver`.
function cgls! end
function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
- radius :: T=zero(T), itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, ldiv :: Bool=false, radius :: T=zero(T),
+ λ :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
+ itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CGLS: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CGLS: system of %d equations in %d variables\n", m, n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :Mr, S, m)
@@ -117,9 +139,9 @@ function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
return solver
end
MisI || mulorldiv!(Mr, M, r, ldiv)
- mul!(s, Aᵀ, Mr)
+ mul!(s, Aᴴ, Mr)
p .= s
- γ = @kdotr(n, s, s) # γ = sᵀs
+ γ = @kdotr(n, s, s) # γ = sᴴs
iter = 0
itmax == 0 && (itmax = m + n)
@@ -128,8 +150,8 @@ function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(rNorms, rNorm)
history && push!(ArNorms, ArNorm)
ε = atol + rtol * ArNorm
- (verbose > 0) && @printf("%5s %8s %8s\n", "k", "‖Aᵀr‖", "‖r‖")
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %8s %8s\n", "k", "‖Aᴴr‖", "‖r‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
status = "unknown"
on_boundary = false
@@ -140,12 +162,12 @@ function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
while ! (solved || tired || user_requested_exit)
mul!(q, A, p)
MisI || mulorldiv!(Mq, M, q, ldiv)
- δ = @kdotr(m, q, Mq) # δ = qᵀMq
- λ > 0 && (δ += λ * @kdotr(n, p, p)) # δ = δ + pᵀp
+ δ = @kdotr(m, q, Mq) # δ = qᴴMq
+ λ > 0 && (δ += λ * @kdotr(n, p, p)) # δ = δ + pᴴp
α = γ / δ
# if a trust-region constraint is given, compute the step to the boundary
- σ = radius > 0 ? maximum(to_boundary(x, p, radius)) : α
+ σ = radius > 0 ? maximum(to_boundary(n, x, p, radius)) : α
if (radius > 0) & (α > σ)
α = σ
on_boundary = true
@@ -154,9 +176,9 @@ function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kaxpy!(n, α, p, x) # Faster than x = x + α * p
@kaxpy!(m, -α, q, r) # Faster than r = r - α * q
MisI || mulorldiv!(Mr, M, r, ldiv)
- mul!(s, Aᵀ, Mr)
+ mul!(s, Aᴴ, Mr)
λ > 0 && @kaxpy!(n, -λ, x, s) # s = A' * r - λ * x
- γ_next = @kdotr(n, s, s) # γ_next = sᵀs
+ γ_next = @kdotr(n, s, s) # γ_next = sᴴs
β = γ_next / γ
@kaxpby!(n, one(FC), s, β, p) # p = s + βp
γ = γ_next
@@ -165,12 +187,12 @@ function cgls!(solver :: CglsSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(rNorms, rNorm)
history && push!(ArNorms, ArNorm)
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
user_requested_exit = callback(solver) :: Bool
solved = (ArNorm ≤ ε) | on_boundary
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
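
A sketch of the regularized least-squares use case documented above; the Vandermonde-style data and the value of λ are illustrative only:

```julia
using Krylov, LinearAlgebra

# Overdetermined problem: m = 10 equations, n = 4 unknowns.
A = [float(i)^j for i in 1:10, j in 0:3]
b = collect(1.0:10.0)
λ = 1.0e-4

x, stats = cgls(A, b, λ=λ)

# Optimality for min ‖b - Ax‖² + λ‖x‖²: Aᴴ(b - Ax) - λx ≈ 0.
println(norm(A' * (b - A * x) - λ * x))
println(stats.status)
```
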
diff --git a/src/cgne.jl b/src/cgne.jl
index 2859414e1..f85af32be 100644
--- a/src/cgne.jl
+++ b/src/cgne.jl
@@ -10,7 +10,7 @@
# and is equivalent to applying the conjugate gradient method
# to the linear system
#
-# AAᵀy = b.
+# AAᴴy = b.
#
# This method is also known as Craig's method, CGME, and other
# names, and is described in
@@ -28,12 +28,13 @@
export cgne, cgne!
-
"""
(x, stats) = cgne(A, b::AbstractVector{FC};
- M=I, λ::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ N=I, ldiv::Bool=false,
+ λ::T=zero(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -42,11 +43,11 @@ Solve the consistent linear system
Ax + √λs = b
-using the Conjugate Gradient (CG) method, where λ ≥ 0 is a regularization
+of size m × n using the Conjugate Gradient (CG) method, where λ ≥ 0 is a regularization
parameter. This method is equivalent to applying CG to the normal equations
of the second kind
- (AAᵀ + λI) y = b
+ (AAᴴ + λI) y = b
but is more stable. When λ = 0, this method solves the minimum-norm problem
@@ -60,10 +61,28 @@ CGNE produces monotonic errors ‖x-x*‖₂ but not residuals ‖r‖₂.
It is formally equivalent to CRAIG, though can be slightly less accurate,
but simpler to implement. Only the x-part of the solution is returned.
-A preconditioner M may be provided in the form of a linear operator.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `m` used for preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -88,35 +107,37 @@ See [`CgneSolver`](@ref) for more details about the `solver`.
function cgne! end
function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ N=I, ldiv :: Bool=false,
+ λ :: T=zero(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CGNE: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CGNE: system of %d equations in %d variables\n", m, n)
- # Tests M = Iₙ
- MisI = (M === I)
+ # Tests N = Iₙ
+ NisI = (N === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
- allocate_if(!MisI, solver, :z, S, m)
+ allocate_if(!NisI, solver, :z, S, m)
allocate_if(λ > 0, solver, :s, S, m)
- x, p, Aᵀz, r, q, s, stats = solver.x, solver.p, solver.Aᵀz, solver.r, solver.q, solver.s, solver.stats
+ x, p, Aᴴz, r, q, s, stats = solver.x, solver.p, solver.Aᴴz, solver.r, solver.q, solver.s, solver.stats
rNorms = stats.residuals
reset!(stats)
- z = MisI ? r : solver.z
+ z = NisI ? r : solver.z
x .= zero(FC)
r .= b
- MisI || mulorldiv!(z, M, r, ldiv)
+ NisI || mulorldiv!(z, N, r, ldiv)
rNorm = @knrm2(m, r) # Marginally faster than norm(r)
history && push!(rNorms, rNorm)
if rNorm == 0
@@ -126,7 +147,7 @@ function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
return solver
end
λ > 0 && (s .= r)
- mul!(p, Aᵀ, z)
+ mul!(p, Aᴴ, z)
# Use ‖p‖ to detect inconsistent system.
# An inconsistent system will necessarily have AA' singular.
@@ -141,8 +162,8 @@ function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
ɛ_c = atol + rtol * rNorm # Stopping tolerance for consistent systems.
ɛ_i = atol + rtol * pNorm # Stopping tolerance for inconsistent systems.
- (verbose > 0) && @printf("%5s %8s\n", "k", "‖r‖")
- kdisplay(iter, verbose) && @printf("%5d %8.2e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %8s\n", "k", "‖r‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e\n", iter, rNorm)
status = "unknown"
solved = rNorm ≤ ɛ_c
@@ -158,11 +179,11 @@ function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
α = γ / δ
@kaxpy!(n, α, p, x) # Faster than x = x + α * p
@kaxpy!(m, -α, q, r) # Faster than r = r - α * q
- MisI || mulorldiv!(z, M, r, ldiv)
+ NisI || mulorldiv!(z, N, r, ldiv)
γ_next = @kdotr(m, r, z) # Faster than γ_next = dot(r, z)
β = γ_next / γ
- mul!(Aᵀz, Aᵀ, z)
- @kaxpby!(n, one(FC), Aᵀz, β, p) # Faster than p = Aᵀz + β * p
+ mul!(Aᴴz, Aᴴ, z)
+ @kaxpby!(n, one(FC), Aᴴz, β, p) # Faster than p = Aᴴz + β * p
pNorm = @knrm2(n, p)
if λ > 0
@kaxpby!(m, one(FC), r, β, s) # s = r + β * s
@@ -171,7 +192,7 @@ function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
rNorm = sqrt(γ_next)
history && push!(rNorms, rNorm)
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %8.2e\n", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e\n", iter, rNorm)
# Stopping conditions that do not depend on user input.
# This is to guard against tolerances that are unreasonably small.
@@ -183,7 +204,7 @@ function cgne!(solver :: CgneSolver{T,FC,S}, A, b :: AbstractVector{FC};
inconsistent = (rNorm > 100 * ɛ_c) && (pNorm ≤ ɛ_i)
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
inconsistent && (status = "system probably inconsistent")
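
For the minimum-norm setting that CGNE targets, a small sketch with random (hence almost surely full-rank and consistent) data:

```julia
using Krylov, LinearAlgebra

# Underdetermined consistent system: m = 4, n = 10.
A = rand(4, 10)
b = A * ones(10)           # consistent by construction

x, stats = cgne(A, b)

println(norm(A * x - b))   # residual ≈ 0 for a consistent system
println(stats.status)
```
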
diff --git a/src/cgs.jl b/src/cgs.jl
index c1eb1056e..cbb3db13b 100644
--- a/src/cgs.jl
+++ b/src/cgs.jl
@@ -11,17 +11,23 @@
export cgs, cgs!
"""
- (x, stats) = cgs(A, b::AbstractVector{FC}; c::AbstractVector{FC}=b,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ (x, stats) = cgs(A, b::AbstractVector{FC};
+ c::AbstractVector{FC}=b, M=I, N=I,
+ ldiv::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the consistent linear system Ax = b using conjugate gradient squared algorithm.
+ (x, stats) = cgs(A, b, x0::AbstractVector; kwargs...)
+
+CGS can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the consistent linear system Ax = b of size n using CGS.
CGS requires two initial vectors `b` and `c`.
-The relation `bᵀc ≠ 0` must be satisfied and by default `c = b`.
+The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
From "Iterative Methods for Sparse Linear Systems (Y. Saad)" :
@@ -38,16 +44,33 @@ to become inaccurate.
TFQMR and BICGSTAB were developed to remedy this difficulty.»
-This implementation allows a left preconditioner M and a right preconditioner N.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-CGS can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = cgs(A, b, x0; kwargs...)
+* `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -83,15 +106,17 @@ function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: Abs
return solver
end
-function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: AbstractVector{FC}=b,
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC};
+ c :: AbstractVector{FC}=b, M=I, N=I,
+ ldiv :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CGS: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "CGS: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -99,8 +124,8 @@ function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :vw, S, n)
@@ -142,7 +167,7 @@ function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
if ρ == 0
stats.niter = 0
stats.solved, stats.inconsistent = false, false
- stats.status = "Breakdown bᵀc = 0"
+ stats.status = "Breakdown bᴴc = 0"
solver.warm_start =false
return solver
end
@@ -151,8 +176,8 @@ function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
u .= r # u₀
p .= r # p₀
@@ -207,9 +232,9 @@ function cgs!(solver :: CgsSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
breakdown = (α == 0 || isnan(α))
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "breakdown αₖ == 0")
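
The warm-start form documented above can be exercised as follows; the random well-conditioned matrix and the perturbed guess are made up for illustration:

```julia
using Krylov, LinearAlgebra

n = 100
A = 2I + randn(n, n) / (2 * sqrt(n))   # nonsymmetric, well conditioned
b = randn(n)

x, stats   = cgs(A, b)                 # cold start
x0 = x .+ 0.01 .* randn(n)             # a nearby initial guess
x1, stats1 = cgs(A, b, x0)             # warm start

println((stats.niter, stats1.niter))   # warm start typically needs fewer iterations
```
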
diff --git a/src/cr.jl b/src/cr.jl
index c678c7d29..26f317385 100644
--- a/src/cr.jl
+++ b/src/cr.jl
@@ -16,32 +16,52 @@ export cr, cr!
"""
(x, stats) = cr(A, b::AbstractVector{FC};
- M=I, atol::T=√eps(T), rtol::T=√eps(T), γ::T=√eps(T), itmax::Int=0,
- radius::T=zero(T), verbose::Int=0, linesearch::Bool=false, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, ldiv::Bool=false, radius::T=zero(T),
+ linesearch::Bool=false, γ::T=√eps(T),
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-A truncated version of Stiefel’s Conjugate Residual method to solve the symmetric linear system Ax = b or the least-squares problem min ‖b - Ax‖.
-The matrix A must be positive semi-definite.
+ (x, stats) = cr(A, b, x0::AbstractVector; kwargs...)
-A preconditioner M may be provided in the form of a linear operator and is assumed to be symmetric and positive definite.
+CR can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+A truncated version of Stiefel’s Conjugate Residual method to solve the Hermitian linear system Ax = b
+of size n or the least-squares problem min ‖b - Ax‖ if A is singular.
+The matrix A must be Hermitian semi-definite.
M also indicates the weighted norm in which residuals are measured.
-In a linesearch context, 'linesearch' must be set to 'true'.
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian positive semi-definite matrix of dimension n;
+* `b`: a vector of length n.
-If `itmax=0`, the default number of iterations is set to `2 * n`,
-with `n = length(b)`.
+#### Optional argument
-CR can be warm-started from an initial guess `x0` with the method
+* `x0`: a vector of length n that represents an initial guess of the solution x.
- (x, stats) = cr(A, b, x0; kwargs...)
+#### Keyword arguments
-where `kwargs` are the same keyword arguments as above.
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `linesearch`: if `true`, indicate that the solution is to be used in an inexact Newton method with linesearch. If negative curvature is detected at iteration k > 0, the solution of iteration k-1 is returned. If negative curvature is detected at iteration 0, the right-hand side is returned (i.e., the negative gradient);
+* `γ`: tolerance to determine that the curvature of the quadratic model is nonpositive;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -80,22 +100,25 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: Abstr
end
function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, atol :: T=√eps(T), rtol :: T=√eps(T), γ :: T=√eps(T), itmax :: Int=0,
- radius :: T=zero(T), verbose :: Int=0, linesearch :: Bool=false, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, ldiv :: Bool=false, radius :: T=zero(T),
+ linesearch :: Bool=false, γ :: T=√eps(T),
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
linesearch && (radius > 0) && error("'linesearch' set to 'true' but radius > 0")
- n, m = size(A)
+
+ m, n = size(A)
m == n || error("System must be square")
length(b) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("CR: system of %d equations in %d variables\n", n, n)
+ (verbose > 0) && @printf(iostream, "CR: system of %d equations in %d variables\n", n, n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace
allocate_if(!MisI, solver, :Mq, S, n)
@@ -146,10 +169,10 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
ArNorm = @knrm2(n, Ar) # ‖Ar‖
history && push!(ArNorms, ArNorm)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %8s %8s %8s\n", "k", "‖x‖", "‖r‖", "quad")
- kdisplay(iter, verbose) && @printf(" %d %8.1e %8.1e %8.1e\n", iter, xNorm, rNorm, m)
+ (verbose > 0) && @printf(iostream, "%5s %8s %8s %8s\n", "k", "‖x‖", "‖r‖", "quad")
+ kdisplay(iter, verbose) && @printf(iostream, " %d %8.1e %8.1e %8.1e\n", iter, xNorm, rNorm, m)
- descent = pr > 0 # pᵀr > 0 means p is a descent direction
+ descent = pr > 0 # pᴴr > 0 means p is a descent direction
solved = rNorm ≤ ε
tired = iter ≥ itmax
on_boundary = false
@@ -161,7 +184,7 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
if linesearch
if (pAp ≤ γ * pNorm²) || (ρ ≤ γ * rNorm²)
npcurv = true
- (verbose > 0) && @printf("nonpositive curvature detected: pᵀAp = %8.1e and rᵀAr = %8.1e\n", pAp, ρ)
+ (verbose > 0) && @printf(iostream, "nonpositive curvature detected: pᴴAp = %8.1e and rᴴAr = %8.1e\n", pAp, ρ)
stats.solved = solved
stats.inconsistent = false
stats.status = "nonpositive curvature"
@@ -173,52 +196,52 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
MisI || mulorldiv!(Mq, M, q, ldiv)
if radius > 0
- (verbose > 0) && @printf("radius = %8.1e > 0 and ‖x‖ = %8.1e\n", radius, xNorm)
+ (verbose > 0) && @printf(iostream, "radius = %8.1e > 0 and ‖x‖ = %8.1e\n", radius, xNorm)
# find t1 > 0 and t2 < 0 such that ‖x + ti * p‖² = radius² (i = 1, 2)
xNorm² = xNorm * xNorm
- t = to_boundary(x, p, radius; flip = false, xNorm2 = xNorm², dNorm2 = pNorm²)
+ t = to_boundary(n, x, p, radius; flip = false, xNorm2 = xNorm², dNorm2 = pNorm²)
t1 = maximum(t) # > 0
t2 = minimum(t) # < 0
- tr = maximum(to_boundary(x, r, radius; flip = false, xNorm2 = xNorm², dNorm2 = rNorm²))
- (verbose > 0) && @printf("t1 = %8.1e, t2 = %8.1e and tr = %8.1e\n", t1, t2, tr)
+ tr = maximum(to_boundary(n, x, r, radius; flip = false, xNorm2 = xNorm², dNorm2 = rNorm²))
+ (verbose > 0) && @printf(iostream, "t1 = %8.1e, t2 = %8.1e and tr = %8.1e\n", t1, t2, tr)
- if abspAp ≤ γ * pNorm * @knrm2(n, q) # pᵀAp ≃ 0
+ if abspAp ≤ γ * pNorm * @knrm2(n, q) # pᴴAp ≃ 0
npcurv = true # nonpositive curvature
- (verbose > 0) && @printf("pᵀAp = %8.1e ≃ 0\n", pAp)
- if abspr ≤ γ * pNorm * rNorm # pᵀr ≃ 0
- (verbose > 0) && @printf("pᵀr = %8.1e ≃ 0, redefining p := r\n", pr)
+ (verbose > 0) && @printf(iostream, "pᴴAp = %8.1e ≃ 0\n", pAp)
+ if abspr ≤ γ * pNorm * rNorm # pᴴr ≃ 0
+ (verbose > 0) && @printf(iostream, "pᴴr = %8.1e ≃ 0, redefining p := r\n", pr)
p = r # - ∇q(x)
q = Ar
- # q(x + αr) = q(x) - α ‖r‖² + ½ α² rᵀAr
- # 1) if rᵀAr > 0, the quadratic decreases from α = 0 to α = ‖r‖² / rᵀAr
- # 2) if rᵀAr ≤ 0, the quadratic decreases to -∞ in the direction r
+ # q(x + αr) = q(x) - α ‖r‖² + ½ α² rᴴAr
+ # 1) if rᴴAr > 0, the quadratic decreases from α = 0 to α = ‖r‖² / rᴴAr
+ # 2) if rᴴAr ≤ 0, the quadratic decreases to -∞ in the direction r
if ρ > 0 # case 1
- (verbose > 0) && @printf("quadratic is convex in direction r, curv = %8.1e\n", ρ)
+ (verbose > 0) && @printf(iostream, "quadratic is convex in direction r, curv = %8.1e\n", ρ)
α = min(tr, rNorm² / ρ)
else # case 2
- (verbose > 0) && @printf("r is a direction of nonpositive curvature: %8.1e\n", ρ)
+ (verbose > 0) && @printf(iostream, "r is a direction of nonpositive curvature: %8.1e\n", ρ)
α = tr
end
else
- # q_p = q(x + α_p * p) - q(x) = -α_p * rᵀp + ½ (α_p)² * pᵀAp
- # q_r = q(x + α_r * r) - q(x) = -α_r * ‖r‖² + ½ (α_r)² * rᵀAr
+ # q_p = q(x + α_p * p) - q(x) = -α_p * rᴴp + ½ (α_p)² * pᴴAp
+ # q_r = q(x + α_r * r) - q(x) = -α_r * ‖r‖² + ½ (α_r)² * rᴴAr
# Δ = q_p - q_r. If Δ > 0, r is followed, else p is followed
α = descent ? t1 : t2
ρ > 0 && (tr = min(tr, rNorm² / ρ))
- Δ = -α * pr + tr * rNorm² - (tr)^2 * ρ / 2 # as pᵀAp = 0
+ Δ = -α * pr + tr * rNorm² - (tr)^2 * ρ / 2 # as pᴴAp = 0
if Δ > 0 # direction r engenders a better decrease
- (verbose > 0) && @printf("direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
- (verbose > 0) && @printf("redefining p := r\n")
+ (verbose > 0) && @printf(iostream, "direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "redefining p := r\n")
p = r
q = Ar
α = tr
else
- (verbose > 0) && @printf("direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
end
end
elseif pAp > 0 && ρ > 0 # no negative curvature
- (verbose > 0) && @printf("positive curvatures along p and r. pᵀAp = %8.1e and rᵀAr = %8.1e\n", pAp, ρ)
+ (verbose > 0) && @printf(iostream, "positive curvatures along p and r. pᴴAp = %8.1e and rᴴAr = %8.1e\n", pAp, ρ)
α = ρ / @kdotr(n, q, Mq)
if α ≥ t1
α = t1
@@ -227,49 +250,49 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
elseif pAp > 0 && ρ < 0
npcurv = true
- (verbose > 0) && @printf("pᵀAp = %8.1e > 0 and rᵀAr = %8.1e < 0\n", pAp, ρ)
- # q_p is minimal for α_p = rᵀp / pᵀAp
+ (verbose > 0) && @printf(iostream, "pᴴAp = %8.1e > 0 and rᴴAr = %8.1e < 0\n", pAp, ρ)
+ # q_p is minimal for α_p = rᴴp / pᴴAp
α = descent ? min(t1, pr / pAp) : max(t2, pr / pAp)
Δ = -α * pr + tr * rNorm² + (α^2 * pAp - (tr)^2 * ρ) / 2
if Δ > 0
- (verbose > 0) && @printf("direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
- (verbose > 0) && @printf("redefining p := r\n")
+ (verbose > 0) && @printf(iostream, "direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "redefining p := r\n")
p = r
q = Ar
α = tr
else
- (verbose > 0) && @printf("direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
end
elseif pAp < 0 && ρ > 0
npcurv = true
- (verbose > 0) && @printf("pᵀAp = %8.1e < 0 and rᵀAr = %8.1e > 0\n", pAp, ρ)
+ (verbose > 0) && @printf(iostream, "pᴴAp = %8.1e < 0 and rᴴAr = %8.1e > 0\n", pAp, ρ)
α = descent ? t1 : t2
tr = min(tr, rNorm² / ρ)
Δ = -α * pr + tr * rNorm² + (α^2 * pAp - (tr)^2 * ρ) / 2
if Δ > 0
- (verbose > 0) && @printf("direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
- (verbose > 0) && @printf("redefining p := r\n")
+ (verbose > 0) && @printf(iostream, "direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "redefining p := r\n")
p = r
q = Ar
α = tr
else
- (verbose > 0) && @printf("direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
end
elseif pAp < 0 && ρ < 0
npcurv = true
- (verbose > 0) && @printf("negative curvatures along p and r. pᵀAp = %8.1e and rᵀAr = %8.1e\n", pAp, ρ)
+ (verbose > 0) && @printf(iostream, "negative curvatures along p and r. pᴴAp = %8.1e and rᴴAr = %8.1e\n", pAp, ρ)
α = descent ? t1 : t2
Δ = -α * pr + tr * rNorm² + (α^2 * pAp - (tr)^2 * ρ) / 2
if Δ > 0
- (verbose > 0) && @printf("direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
- (verbose > 0) && @printf("redefining p := r\n")
+ (verbose > 0) && @printf(iostream, "direction r engenders a bigger decrease. q_p - q_r = %8.1e > 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "redefining p := r\n")
p = r
q = Ar
α = tr
else
- (verbose > 0) && @printf("direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
+ (verbose > 0) && @printf(iostream, "direction p engenders an equal or a bigger decrease. q_p - q_r = %8.1e ≤ 0\n", Δ)
end
end
@@ -297,7 +320,7 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + 1
if kdisplay(iter, verbose)
m = m - α * pr + α^2 * pAp / 2
- @printf(" %d %8.1e %8.1e %8.1e\n", iter, xNorm, rNorm, m)
+ @printf(iostream, " %d %8.1e %8.1e %8.1e\n", iter, xNorm, rNorm, m)
end
# Stopping conditions that do not depend on user input.
@@ -330,14 +353,14 @@ function cr!(solver :: CrSolver{T,FC,S}, A, b :: AbstractVector{FC};
solver.warm_start = false
return solver
end
- pr = rNorm² + β * pr - β * α * pAp # pᵀr
+ pr = rNorm² + β * pr - β * α * pAp # pᴴr
abspr = abs(pr)
- pAp = ρ + β^2 * pAp # pᵀq
+ pAp = ρ + β^2 * pAp # pᴴq
abspAp = abs(pAp)
descent = pr > 0
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
on_boundary && (status = "on trust-region boundary")
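
The `radius` keyword turns CR into a trust-region subproblem solver, as the boundary logic above shows. A minimal sketch, assuming a definite model Hessian and an illustrative radius Δ:

```julia
using Krylov, LinearAlgebra

n = 100
A = Matrix(SymTridiagonal(2 * ones(n), -ones(n - 1)))  # Hermitian positive definite
b = ones(n)

Δ = 0.5
x, stats = cr(A, b, radius=Δ)

println(norm(x))        # ≤ Δ, with equality when the constraint is active
println(stats.status)   # "on trust-region boundary" when the step hits ‖x‖ = Δ
```
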
diff --git a/src/craig.jl b/src/craig.jl
index 20597ea02..76afe9d51 100644
--- a/src/craig.jl
+++ b/src/craig.jl
@@ -11,7 +11,7 @@
# and is equivalent to applying the conjugate gradient method
# to the linear system
#
-# AAᵀy = b.
+# AAᴴy = b.
#
# This method, sometimes known under the name CRAIG, is the
# Golub-Kahan implementation of CGNE, and is described in
@@ -32,13 +32,15 @@
export craig, craig!
-
"""
(x, y, stats) = craig(A, b::AbstractVector{FC};
- M=I, N=I, sqd::Bool=false, λ::T=zero(T), atol::T=√eps(T),
- btol::T=√eps(T), rtol::T=√eps(T), conlim::T=1/√eps(T), itmax::Int=0,
- verbose::Int=0, transfer_to_lsqr::Bool=false, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, N=I, ldiv::Bool=false,
+ transfer_to_lsqr::Bool=false, sqd::Bool=false,
+ λ::T=zero(T), btol::T=√eps(T),
+ conlim::T=1/√eps(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -47,19 +49,19 @@ Find the least-norm solution of the consistent linear system
Ax + λ²y = b
-using the Golub-Kahan implementation of Craig's method, where λ ≥ 0 is a
+of size m × n using the Golub-Kahan implementation of Craig's method, where λ ≥ 0 is a
regularization parameter. This method is equivalent to CGNE but is more
stable.
For a system in the form Ax = b, Craig's method is equivalent to applying
-CG to AAᵀy = b and recovering x = Aᵀy. Note that y are the Lagrange
+CG to AAᴴy = b and recovering x = Aᴴy. Note that y are the Lagrange
multipliers of the least-norm problem
minimize ‖x‖ s.t. Ax = b.
If `λ > 0`, CRAIG solves the symmetric and quasi-definite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A λ²E ] [ y ] = [ b ],
where E and F are symmetric and positive definite.
@@ -70,12 +72,12 @@ The system above represents the optimality conditions of
min ‖x‖²_F + λ²‖y‖²_E s.t. Ax + λ²Ey = b.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-CRAIG is then equivalent to applying CG to `(AF⁻¹Aᵀ + λ²E)y = b` with `Fx = Aᵀy`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+CRAIG is then equivalent to applying CG to `(AF⁻¹Aᴴ + λ²E)y = b` with `Fx = Aᴴy`.
If `λ = 0`, CRAIG solves the symmetric and indefinite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A 0 ] [ y ] = [ b ].
The system above represents the optimality conditions of
@@ -86,8 +88,34 @@ In this case, `M` can still be specified and indicates the weighted norm in whic
In this implementation, both the x and y-parts of the solution are returned.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `transfer_to_lsqr`: transfer from the Craig point to the LSQR point, when it exists. The transfer is based on the residual norm;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `btol`: stopping tolerance used to detect zero-residual problems;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `y`: a dense vector of length m;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -112,14 +140,17 @@ See [`CraigSolver`](@ref) for more details about the `solver`.
function craig! end
function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T), atol :: T=√eps(T),
- btol :: T=√eps(T), rtol :: T=√eps(T), conlim :: T=1/√eps(T), itmax :: Int=0,
- verbose :: Int=0, transfer_to_lsqr :: Bool=false, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, N=I, ldiv :: Bool=false,
+ transfer_to_lsqr :: Bool=false, sqd :: Bool=false,
+ λ :: T=zero(T), btol :: T=√eps(T),
+ conlim :: T=1/√eps(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CRAIG: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CRAIG: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -131,16 +162,16 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u , S, m)
allocate_if(!NisI, solver, :v , S, n)
allocate_if(λ > 0, solver, :w2, S, n)
- x, Nv, Aᵀu, y, w = solver.x, solver.Nv, solver.Aᵀu, solver.y, solver.w
+ x, Nv, Aᴴu, y, w = solver.x, solver.Nv, solver.Aᴴu, solver.y, solver.w
Mu, Av, w2, stats = solver.Mu, solver.Av, solver.w2, solver.stats
rNorms = stats.residuals
reset!(stats)
@@ -180,7 +211,7 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
Anorm² = zero(T) # Estimate of ‖A‖²_F.
Anorm = zero(T)
- Dnorm² = zero(T) # Estimate of ‖(AᵀA)⁻¹‖².
+ Dnorm² = zero(T) # Estimate of ‖(AᴴA)⁻¹‖².
Acond = zero(T) # Estimate of cond(A).
xNorm² = zero(T) # Estimate of ‖x‖².
xNorm = zero(T)
@@ -191,8 +222,8 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
ɛ_c = atol + rtol * rNorm # Stopping tolerance for consistent systems.
ɛ_i = atol # Stopping tolerance for inconsistent systems.
ctol = conlim > 0 ? 1/conlim : zero(T) # Stopping tolerance for ill-conditioned operators.
- (verbose > 0) && @printf("%5s %8s %8s %8s %8s %8s %7s\n", "k", "‖r‖", "‖x‖", "‖A‖", "κ(A)", "α", "β")
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e %8.2e %8.2e\n", iter, rNorm, xNorm, Anorm, Acond)
+ (verbose > 0) && @printf(iostream, "%5s %8s %8s %8s %8s %8s %7s\n", "k", "‖r‖", "‖x‖", "‖A‖", "κ(A)", "α", "β")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e %8.2e %8.2e\n", iter, rNorm, xNorm, Anorm, Acond)
bkwerr = one(T) # initial value of the backward error ‖r‖ / √(‖b‖² + ‖A‖² ‖x‖²)
@@ -212,9 +243,9 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
while ! (solved || inconsistent || ill_cond || tired || user_requested_exit)
# Generate the next Golub-Kahan vectors
- # 1. αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -β, Nv)
+ # 1. αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -β, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
if α == 0
@@ -296,7 +327,7 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
ρ_prev = ρ # Only differs from α if λ > 0.
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e %8.2e %8.2e %8.1e %7.1e\n", iter, rNorm, xNorm, Anorm, Acond, α, β)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e %8.2e %8.2e %8.1e %7.1e\n", iter, rNorm, xNorm, Anorm, Acond, α, β)
solved_lim = bkwerr ≤ btol
solved_mach = one(T) + bkwerr ≤ one(T)
@@ -312,7 +343,7 @@ function craig!(solver :: CraigSolver{T,FC,S}, A, b :: AbstractVector{FC};
inconsistent = false
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# transfer to LSQR point if requested
if λ > 0 && transfer_to_lsqr
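
Since CRAIG returns both the x- and y-parts, the relation x = Aᴴy from the docstring can be checked directly; the data below is random and illustrative:

```julia
using Krylov, LinearAlgebra

# minimize ‖x‖ s.t. Ax = b, with m = 4, n = 10.
A = rand(4, 10)
b = A * ones(10)

x, y, stats = craig(A, b)

println(norm(A * x - b))    # primal feasibility ≈ 0
println(norm(x - A' * y))   # x ≈ Aᴴy (exact in exact arithmetic when λ = 0)
println(stats.status)
```
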
diff --git a/src/craigmr.jl b/src/craigmr.jl
index e08bb9c36..3b64829d6 100644
--- a/src/craigmr.jl
+++ b/src/craigmr.jl
@@ -10,7 +10,7 @@
# and is equivalent to applying the conjugate residual method
# to the linear system
#
-# AAᵀy = b.
+# AAᴴy = b.
#
# This method is equivalent to CRMR, and is described in
#
@@ -26,12 +26,13 @@
export craigmr, craigmr!
-
"""
(x, y, stats) = craigmr(A, b::AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T), atol :: T=√eps(T),
- rtol::T=√eps(T), itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, N=I, ldiv::Bool=false,
+ sqd::Bool=false, λ::T=zero(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -40,11 +41,11 @@ Solve the consistent linear system
Ax + λ²y = b
-using the CRAIGMR method, where λ ≥ 0 is a regularization parameter.
+of size m × n using the CRAIGMR method, where λ ≥ 0 is a regularization parameter.
This method is equivalent to applying the Conjugate Residuals method
to the normal equations of the second kind
- (AAᵀ + λ²I) y = b
+ (AAᴴ + λ²I) y = b
but is more stable. When λ = 0, this method solves the minimum-norm problem
@@ -52,7 +53,7 @@ but is more stable. When λ = 0, this method solves the minimum-norm problem
If `λ > 0`, CRAIGMR solves the symmetric and quasi-definite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A λ²E ] [ y ] = [ b ],
where E and F are symmetric and positive definite.
@@ -63,12 +64,12 @@ The system above represents the optimality conditions of
min ‖x‖²_F + λ²‖y‖²_E s.t. Ax + λ²Ey = b.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-CRAIGMR is then equivalent to applying MINRES to `(AF⁻¹Aᵀ + λ²E)y = b` with `Fx = Aᵀy`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+CRAIGMR is then equivalent to applying MINRES to `(AF⁻¹Aᴴ + λ²E)y = b` with `Fx = Aᴴy`.
If `λ = 0`, CRAIGMR solves the symmetric and indefinite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A 0 ] [ y ] = [ b ].
The system above represents the optimality conditions of
@@ -82,8 +83,31 @@ It is formally equivalent to CRMR, though can be slightly more accurate,
and intricate to implement. Both the x- and y-parts of the solution are
returned.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `y`: a dense vector of length m;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -108,13 +132,15 @@ See [`CraigmrSolver`](@ref) for more details about the `solver`.
function craigmr! end
function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T), atol :: T=√eps(T),
- rtol :: T=√eps(T), itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, N=I, ldiv :: Bool=false,
+ sqd :: Bool=false, λ :: T=zero(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CRAIGMR: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CRAIGMR: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -126,23 +152,23 @@ function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u, S, m)
allocate_if(!NisI, solver, :v, S, n)
allocate_if(λ > 0, solver, :q, S, n)
- x, Nv, Aᵀu, d, y, Mu = solver.x, solver.Nv, solver.Aᵀu, solver.d, solver.y, solver.Mu
+ x, Nv, Aᴴu, d, y, Mu = solver.x, solver.Nv, solver.Aᴴu, solver.d, solver.y, solver.Mu
w, wbar, Av, q, stats = solver.w, solver.wbar, solver.Av, solver.q, solver.stats
rNorms, ArNorms = stats.residuals, stats.Aresiduals
reset!(stats)
u = MisI ? Mu : solver.u
v = NisI ? Nv : solver.v
- # Compute y such that AAᵀy = b. Then recover x = Aᵀy.
+ # Compute y such that AAᴴy = b. Then recover x = Aᴴy.
x .= zero(FC)
y .= zero(FC)
Mu .= b
@@ -161,9 +187,9 @@ function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# β₁Mu₁ = b.
@kscal!(m, one(FC)/β, u)
MisI || @kscal!(m, one(FC)/β, Mu)
- # α₁Nv₁ = Aᵀu₁.
- mul!(Aᵀu, Aᵀ, u)
- Nv .= Aᵀu
+ # α₁Nv₁ = Aᴴu₁.
+ mul!(Aᴴu, Aᴴ, u)
+ Nv .= Aᴴu
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
Anorm² = α * α
@@ -171,10 +197,10 @@ function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = m + n)
- (verbose > 0) && @printf("%5s %7s %7s %7s %7s %8s %8s %7s\n", "k", "‖r‖", "‖Aᵀr‖", "β", "α", "cos", "sin", "‖A‖²")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, β, α, β, α, 0, 1, Anorm²)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %7s %8s %8s %7s\n", "k", "‖r‖", "‖Aᴴr‖", "β", "α", "cos", "sin", "‖A‖²")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, β, α, β, α, 0, 1, Anorm²)
- # Aᵀb = 0 so x = 0 is a minimum least-squares solution
+ # Aᴴb = 0 so x = 0 is a minimum least-squares solution
if α == 0
stats.niter = 0
stats.solved, stats.inconsistent = true, false
@@ -288,16 +314,16 @@ function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# xₖ = Dₖzₖ
@kaxpy!(n, ζ, d, x)
- # 2. αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -β, Nv)
+ # 2. αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -β, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
Anorm² = Anorm² + α * α # = ‖Lₖ‖
ArNorm = α * β * abs(ζ/ρ)
history && push!(ArNorms, ArNorm)
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm²)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm²)
if λ > 0
(cdₖ, sdₖ, λₖ₊₁) = sym_givens(λ, λₐᵤₓ)
@@ -320,7 +346,7 @@ function craigmr!(solver :: CraigmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
inconsistent = (rNorm > 100 * ɛ_c) & (ArNorm ≤ ɛ_i)
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "found approximate minimum-norm solution")
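A recurring change in this patch is that every `@printf` now writes to a caller-supplied `iostream` (defaulting to `kstdout`). As a minimal sketch of the resulting workflow (the system below is synthetic, for illustration only), a caller can capture a solver's iteration log in an `IOBuffer` instead of sending it to standard output:

```julia
using Krylov

A = rand(40, 60)     # consistent underdetermined system
b = A * ones(60)

io = IOBuffer()
x, stats = craigmr(A, b, verbose=1, iostream=io)  # log every iteration to io
print(String(take!(io)))                          # inspect the captured log
```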
diff --git a/src/crls.jl b/src/crls.jl
index 6410fb836..78615fad6 100644
--- a/src/crls.jl
+++ b/src/crls.jl
@@ -5,7 +5,7 @@
#
# equivalently, of the linear system
#
-# AᵀAx = Aᵀb.
+# AᴴAx = Aᴴb.
#
# This implementation follows the formulation given in
#
@@ -20,12 +20,12 @@
export crls, crls!
-
"""
(x, stats) = crls(A, b::AbstractVector{FC};
- M=I, λ::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
- radius::T=zero(T), itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, ldiv::Bool=false, radius::T=zero(T),
+ λ::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
+ itmax::Int=0, verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -34,19 +34,40 @@ Solve the linear least-squares problem
minimize ‖b - Ax‖₂² + λ‖x‖₂²
-using the Conjugate Residuals (CR) method. This method is equivalent to
-applying MINRES to the normal equations
+of size m × n using the Conjugate Residuals (CR) method.
+This method is equivalent to applying MINRES to the normal equations
- (AᵀA + λI) x = Aᵀb.
+ (AᴴA + λI) x = Aᴴb.
This implementation recurs the residual r := b - Ax.
-CRLS produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᵀr‖₂.
+CRLS produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᴴr‖₂.
It is formally equivalent to LSMR, though can be substantially less accurate,
but simpler to implement.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -70,23 +91,24 @@ See [`CrlsSolver`](@ref) for more details about the `solver`.
function crls! end
function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
- radius :: T=zero(T), itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, ldiv :: Bool=false, radius :: T=zero(T),
+ λ :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
+ itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CRLS: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CRLS: system of %d equations in %d variables\n", m, n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :Ms, S, m)
@@ -112,13 +134,13 @@ function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
end
MisI || mulorldiv!(Mr, M, r, ldiv)
- mul!(Ar, Aᵀ, Mr) # - λ * x0 if x0 ≠ 0.
+ mul!(Ar, Aᴴ, Mr) # - λ * x0 if x0 ≠ 0.
mul!(s, A, Ar)
MisI || mulorldiv!(Ms, M, s, ldiv)
p .= Ar
Ap .= s
- mul!(q, Aᵀ, Ms) # Ap
+ mul!(q, Aᴴ, Ms) # Ap
λ > 0 && @kaxpy!(n, λ, p, q) # q = q + λ * p
γ = @kdotr(m, s, Ms) # Faster than γ = dot(s, Ms)
iter = 0
@@ -128,8 +150,8 @@ function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
λ > 0 && (γ += λ * ArNorm * ArNorm)
history && push!(ArNorms, ArNorm)
ε = atol + rtol * ArNorm
- (verbose > 0) && @printf("%5s %8s %8s\n", "k", "‖Aᵀr‖", "‖r‖")
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %8s %8s\n", "k", "‖Aᴴr‖", "‖r‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
status = "unknown"
on_boundary = false
@@ -147,14 +169,14 @@ function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
if radius > 0
pNorm = @knrm2(n, p)
if @kdotr(m, Ap, Ap) ≤ ε * sqrt(qNorm²) * pNorm # the quadratic is constant in the direction p
- psd = true # det(AᵀA) = 0
- p = Ar # p = Aᵀr
+ psd = true # det(AᴴA) = 0
+ p = Ar # p = Aᴴr
pNorm² = ArNorm * ArNorm
- mul!(q, Aᵀ, s)
- α = min(ArNorm^2 / γ, maximum(to_boundary(x, p, radius, flip = false, dNorm2 = pNorm²))) # the quadratic is minimal in the direction Aᵀr for α = ‖Ar‖²/γ
+ mul!(q, Aᴴ, s)
+ α = min(ArNorm^2 / γ, maximum(to_boundary(n, x, p, radius, flip = false, dNorm2 = pNorm²))) # the quadratic is minimal in the direction Aᴴr for α = ‖Ar‖²/γ
else
pNorm² = pNorm * pNorm
- σ = maximum(to_boundary(x, p, radius, flip = false, dNorm2 = pNorm²))
+ σ = maximum(to_boundary(n, x, p, radius, flip = false, dNorm2 = pNorm²))
if α ≥ σ
α = σ
on_boundary = true
@@ -177,7 +199,7 @@ function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kaxpby!(n, one(FC), Ar, β, p) # Faster than p = Ar + β * p
@kaxpby!(m, one(FC), s, β, Ap) # Faster than Ap = s + β * Ap
MisI || mulorldiv!(MAp, M, Ap, ldiv)
- mul!(q, Aᵀ, MAp)
+ mul!(q, Aᴴ, MAp)
λ > 0 && @kaxpy!(n, λ, p, q) # q = q + λ * p
γ = γ_next
@@ -189,12 +211,12 @@ function crls!(solver :: CrlsSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(rNorms, rNorm)
history && push!(ArNorms, ArNorm)
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
user_requested_exit = callback(solver) :: Bool
solved = (ArNorm ≤ ε) || on_boundary
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
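The `radius` keyword documented above turns CRLS into a solver for the trust-region subproblem. A small sketch of both the regularized and the trust-region use, with hypothetical data:

```julia
using Krylov, LinearAlgebra

A = randn(100, 50)
b = randn(100)

x, stats = crls(A, b, λ=1.0e-4)   # regularized least squares

Δ = 0.1
xtr, _ = crls(A, b, radius=Δ)     # step constrained to ‖x‖ ≤ Δ
@assert norm(xtr) ≤ Δ + √eps()    # the step stays inside the region
```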
diff --git a/src/crmr.jl b/src/crmr.jl
index deb5cf79f..621ba5ef3 100644
--- a/src/crmr.jl
+++ b/src/crmr.jl
@@ -10,9 +10,9 @@
# and is equivalent to applying the conjugate residual method
# to the linear system
#
-# AAᵀy = b.
+# AAᴴy = b.
#
-# This method is equivalent to Craig-MR, described in
+# This method is equivalent to CRAIGMR, described in
#
# D. Orban and M. Arioli. Iterative Solution of Symmetric Quasi-Definite Linear Systems,
# Volume 3 of Spotlights. SIAM, Philadelphia, PA, 2017.
@@ -26,12 +26,13 @@
export crmr, crmr!
-
"""
(x, stats) = crmr(A, b::AbstractVector{FC};
- M=I, λ::T=zero(T), atol::T=√eps(T),
- rtol::T=√eps(T), itmax::Int=0, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ N=I, ldiv::Bool=false,
+ λ::T=zero(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -40,11 +41,11 @@ Solve the consistent linear system
Ax + √λs = b
-using the Conjugate Residual (CR) method, where λ ≥ 0 is a regularization
+of size m × n using the Conjugate Residual (CR) method, where λ ≥ 0 is a regularization
parameter. This method is equivalent to applying CR to the normal equations
of the second kind
- (AAᵀ + λI) y = b
+ (AAᴴ + λI) y = b
but is more stable. When λ = 0, this method solves the minimum-norm problem
@@ -58,10 +59,28 @@ CRMR produces monotonic residuals ‖r‖₂.
It is formally equivalent to CRAIG-MR, though can be slightly less accurate,
but simpler to implement. Only the x-part of the solution is returned.
-A preconditioner M may be provided.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `m` used for preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -86,35 +105,37 @@ See [`CrmrSolver`](@ref) for more details about the `solver`.
function crmr! end
function crmr!(solver :: CrmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), atol :: T=√eps(T),
- rtol :: T=√eps(T), itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ N=I, ldiv :: Bool=false,
+ λ :: T=zero(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("CRMR: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "CRMR: system of %d equations in %d variables\n", m, n)
- # Tests M = Iₙ
- MisI = (M === I)
+ # Tests N = Iₙ
+ NisI = (N === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
- allocate_if(!MisI, solver, :Mq, S, m)
+ allocate_if(!NisI, solver, :Nq, S, m)
allocate_if(λ > 0, solver, :s , S, m)
- x, p, Aᵀr, r = solver.x, solver.p, solver.Aᵀr, solver.r
+ x, p, Aᴴr, r = solver.x, solver.p, solver.Aᴴr, solver.r
q, s, stats = solver.q, solver.s, solver.stats
rNorms, ArNorms = stats.residuals, stats.Aresiduals
reset!(stats)
- Mq = MisI ? q : solver.Mq
+ Nq = NisI ? q : solver.Nq
x .= zero(FC) # initial estimation x = 0
- mulorldiv!(r, M, b, ldiv) # initial residual r = M * (b - Ax) = M * b
+ mulorldiv!(r, N, b, ldiv) # initial residual r = N * (b - Ax) = N * b
bNorm = @knrm2(m, r) # norm(b - A * x0) if x0 ≠ 0.
rNorm = bNorm # + λ * ‖x0‖ if x0 ≠ 0 and λ > 0.
history && push!(rNorms, rNorm)
@@ -126,9 +147,9 @@ function crmr!(solver :: CrmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
return solver
end
λ > 0 && (s .= r)
- mul!(Aᵀr, Aᵀ, r) # - λ * x0 if x0 ≠ 0.
- p .= Aᵀr
- γ = @kdotr(n, Aᵀr, Aᵀr) # Faster than γ = dot(Aᵀr, Aᵀr)
+ mul!(Aᴴr, Aᴴ, r) # - λ * x0 if x0 ≠ 0.
+ p .= Aᴴr
+ γ = @kdotr(n, Aᴴr, Aᴴr) # Faster than γ = dot(Aᴴr, Aᴴr)
λ > 0 && (γ += λ * rNorm * rNorm)
iter = 0
itmax == 0 && (itmax = m + n)
@@ -137,8 +158,8 @@ function crmr!(solver :: CrmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(ArNorms, ArNorm)
ɛ_c = atol + rtol * rNorm # Stopping tolerance for consistent systems.
ɛ_i = atol + rtol * ArNorm # Stopping tolerance for inconsistent systems.
- (verbose > 0) && @printf("%5s %8s %8s\n", "k", "‖Aᵀr‖", "‖r‖")
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %8s %8s\n", "k", "‖Aᴴr‖", "‖r‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
status = "unknown"
solved = rNorm ≤ ɛ_c
@@ -149,17 +170,17 @@ function crmr!(solver :: CrmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
while ! (solved || inconsistent || tired || user_requested_exit)
mul!(q, A, p)
λ > 0 && @kaxpy!(m, λ, s, q) # q = q + λ * s
- MisI || mulorldiv!(Mq, M, q, ldiv)
- α = γ / @kdotr(m, q, Mq) # Compute qᵗ * M * q
+ NisI || mulorldiv!(Nq, N, q, ldiv)
+ α = γ / @kdotr(m, q, Nq) # Compute qᴴ * N * q
@kaxpy!(n, α, p, x) # Faster than x = x + α * p
- @kaxpy!(m, -α, Mq, r) # Faster than r = r - α * Mq
+ @kaxpy!(m, -α, Nq, r) # Faster than r = r - α * Nq
rNorm = @knrm2(m, r) # norm(r)
- mul!(Aᵀr, Aᵀ, r)
- γ_next = @kdotr(n, Aᵀr, Aᵀr) # Faster than γ_next = dot(Aᵀr, Aᵀr)
+ mul!(Aᴴr, Aᴴ, r)
+ γ_next = @kdotr(n, Aᴴr, Aᴴr) # Faster than γ_next = dot(Aᴴr, Aᴴr)
λ > 0 && (γ_next += λ * rNorm * rNorm)
β = γ_next / γ
- @kaxpby!(n, one(FC), Aᵀr, β, p) # Faster than p = Aᵀr + β * p
+ @kaxpby!(n, one(FC), Aᴴr, β, p) # Faster than p = Aᴴr + β * p
if λ > 0
@kaxpby!(m, one(FC), r, β, s) # s = r + β * s
end
@@ -169,13 +190,13 @@ function crmr!(solver :: CrmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(rNorms, rNorm)
history && push!(ArNorms, ArNorm)
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %8.2e %8.2e\n", iter, ArNorm, rNorm)
user_requested_exit = callback(solver) :: Bool
solved = rNorm ≤ ɛ_c
inconsistent = (rNorm > 100 * ɛ_c) && (ArNorm ≤ ɛ_i)
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
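Since CRMR returns only the x-part of the solution, its minimum-norm behavior on a consistent underdetermined system is easy to check; a sketch with synthetic data:

```julia
using Krylov, LinearAlgebra

A = randn(30, 50)   # full row rank with probability one
b = A * ones(50)    # consistent by construction

x, stats = crmr(A, b)
@assert norm(A * x - b) ≤ 1e-6 * norm(b)   # x solves Ax = b with minimum norm
```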
diff --git a/src/diom.jl b/src/diom.jl
index 9c6b9767b..7bf23e355 100644
--- a/src/diom.jl
+++ b/src/diom.jl
@@ -11,40 +11,58 @@
export diom, diom!
"""
- (x, stats) = diom(A, b::AbstractVector{FC}; memory::Int=20,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- reorthogonalization::Bool=false, itmax::Int=0,
+ (x, stats) = diom(A, b::AbstractVector{FC};
+ memory::Int=20, M=I, N=I, ldiv::Bool=false,
+ reorthogonalization::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the consistent linear system Ax = b using direct incomplete orthogonalization method.
+ (x, stats) = diom(A, b, x0::AbstractVector; kwargs...)
+
+DIOM can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the consistent linear system Ax = b of size n using DIOM.
DIOM only orthogonalizes the new vectors of the Krylov basis against the `memory` most recent vectors.
If CG is well defined on `Ax = b` and `memory = 2`, DIOM is theoretically equivalent to CG.
If `k ≤ memory` where `k` is the number of iterations, DIOM is theoretically equivalent to FOM.
Otherwise, DIOM interpolates between CG and FOM and is similar to CG with partial reorthogonalization.
-Partial reorthogonalization is available with the `reorthogonalization` option.
-
-An advantage of DIOM is that nonsymmetric or symmetric indefinite or both nonsymmetric
+An advantage of DIOM is that non-Hermitian, Hermitian indefinite, or both non-Hermitian
and indefinite systems of linear equations can be handled by this single algorithm.
-This implementation allows a left preconditioner M and a right preconditioner N.
-- Left preconditioning : M⁻¹Ax = M⁻¹b
-- Right preconditioning : AN⁻¹u = b with x = N⁻¹u
-- Split preconditioning : M⁻¹AN⁻¹u = M⁻¹b with x = N⁻¹u
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
-DIOM can be warm-started from an initial guess `x0` with the method
+#### Optional argument
- (x, stats) = diom(A, b, x0; kwargs...)
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-where `kwargs` are the same keyword arguments as above.
+#### Keyword arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `memory`: the number of most recent vectors of the Krylov basis against which to orthogonalize a new vector;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against the `memory` most recent vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -84,15 +102,16 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: A
end
function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- reorthogonalization :: Bool=false, itmax :: Int=0,
+ M=I, N=I, ldiv :: Bool=false,
+ reorthogonalization :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("DIOM: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "DIOM: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -100,7 +119,7 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :w, S, n)
@@ -121,7 +140,7 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
else
t .= b
end
- MisI || mulorldiv!(r₀, M, t, ldiv) # M⁻¹(b - Ax₀)
+ MisI || mulorldiv!(r₀, M, t, ldiv) # M(b - Ax₀)
rNorm = @knrm2(n, r₀) # β = ‖r₀‖₂
history && push!(rNorms, rNorm)
if rNorm == 0
@@ -136,23 +155,26 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
- mem = length(L) # Memory
+ mem = length(V) # Memory
for i = 1 : mem
- V[i] .= zero(FC) # Orthogonal basis of Kₖ(M⁻¹AN⁻¹, M⁻¹b).
- P[i] .= zero(FC) # Directions for x : Pₘ = N⁻¹Vₘ(Uₘ)⁻¹.
+ V[i] .= zero(FC) # Orthogonal basis of Kₖ(MAN, Mr₀).
end
- H .= zero(FC) # Last column of the band hessenberg matrix Hₘ = LₘUₘ.
- # Each column has at most mem + 1 nonzero elements. hᵢ.ₘ is stored as H[m-i+2].
- # m-i+2 represents the indice of the diagonal where hᵢ.ₘ is located.
- # In addition of that, the last column of Uₘ is stored in H.
- L .= zero(FC) # Last mem pivots of Lₘ.
+ for i = 1 : mem-1
+ P[i] .= zero(FC) # Directions Pₖ = NVₖ(Uₖ)⁻¹.
+ end
+ H .= zero(FC) # Last column of the band hessenberg matrix Hₖ = LₖUₖ.
+ # Each column has at most mem + 1 nonzero elements.
+ # hᵢ.ₖ is stored as H[k-i+1], i ≤ k. hₖ₊₁.ₖ is not stored in H.
+                        # k-i+1 represents the index of the diagonal where hᵢ.ₖ is located.
+                        # In addition, the last column of Uₖ is stored in H.
+ L .= zero(FC) # Last mem-1 pivots of Lₖ.
# Initial ξ₁ and V₁.
ξ = rNorm
- @. V[1] = r₀ / rNorm
+ V[1] .= r₀ ./ rNorm
# Stopping criterion.
solved = rNorm ≤ ε
@@ -166,83 +188,88 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + 1
# Set position in circulars stacks.
- pos = mod(iter-1, mem) + 1 # Position corresponding to pₘ and vₘ in circular stacks P and V.
- next_pos = mod(iter, mem) + 1 # Position corresponding to vₘ₊₁ in the circular stack V.
+ pos = mod(iter-1, mem) + 1 # Position corresponding to vₖ in the circular stack V.
+ next_pos = mod(iter, mem) + 1 # Position corresponding to vₖ₊₁ in the circular stack V.
# Incomplete Arnoldi procedure.
z = NisI ? V[pos] : solver.z
- NisI || mulorldiv!(z, N, V[pos], ldiv) # N⁻¹vₘ, forms pₘ
- mul!(t, A, z) # AN⁻¹vₘ
- MisI || mulorldiv!(w, M, t, ldiv) # M⁻¹AN⁻¹vₘ, forms vₘ₊₁
+ NisI || mulorldiv!(z, N, V[pos], ldiv) # Nvₖ, forms pₖ
+ mul!(t, A, z) # ANvₖ
+ MisI || mulorldiv!(w, M, t, ldiv) # MANvₖ, forms vₖ₊₁
for i = max(1, iter-mem+1) : iter
- ipos = mod(i-1, mem) + 1 # Position corresponding to vᵢ in the circular stack V.
- diag = iter - i + 2
- H[diag] = @kdot(n, w, V[ipos]) # hᵢ.ₘ = ⟨M⁻¹AN⁻¹vₘ , vᵢ⟩
- @kaxpy!(n, -H[diag], V[ipos], w) # w ← w - hᵢ.ₘ * vᵢ
+ ipos = mod(i-1, mem) + 1 # Position corresponding to vᵢ in the circular stack V.
+ diag = iter - i + 1
+ H[diag] = @kdot(n, w, V[ipos]) # hᵢ.ₖ = ⟨MANvₖ, vᵢ⟩
+ @kaxpy!(n, -H[diag], V[ipos], w) # w ← w - hᵢ.ₖvᵢ
end
# Partial reorthogonalization of the Krylov basis.
if reorthogonalization
for i = max(1, iter-mem+1) : iter
ipos = mod(i-1, mem) + 1
- diag = iter - i + 2
+ diag = iter - i + 1
Htmp = @kdot(n, w, V[ipos])
H[diag] += Htmp
@kaxpy!(n, -Htmp, V[ipos], w)
end
end
- # Compute hₘ₊₁.ₘ and vₘ₊₁.
- H[1] = @knrm2(n, w) # hₘ₊₁.ₘ = ‖vₘ₊₁‖₂
- if H[1] ≠ 0 # hₘ₊₁.ₘ = 0 ⇒ "lucky breakdown"
- @. V[next_pos] = w / H[1] # vₘ₊₁ = w / hₘ₊₁.ₘ
- end
- # It's possible that uₘ₋ₘₑₘ.ₘ ≠ 0 when m ≥ mem + 1
- if iter ≥ mem + 2
- H[mem+2] = zero(FC) # hₘ₋ₘₑₘ.ₘ = 0
+ # Compute hₖ₊₁.ₖ and vₖ₊₁.
+ Haux = @knrm2(n, w) # hₖ₊₁.ₖ = ‖vₖ₊₁‖₂
+ if Haux ≠ 0 # hₖ₊₁.ₖ = 0 ⇒ "lucky breakdown"
+ V[next_pos] .= w ./ Haux # vₖ₊₁ = w / hₖ₊₁.ₖ
end
- # Update the LU factorization with partial pivoting of H.
- # Compute the last column of Uₘ.
+ # Update the LU factorization of Hₖ.
+ # Compute the last column of Uₖ.
if iter ≥ 2
- for i = max(2,iter-mem+1) : iter
- lpos = mod(i-1, mem) + 1 # Position corresponding to lᵢ.ᵢ₋₁ in the circular stack L.
- diag = iter - i + 2
+ # u₁.ₖ ← h₁.ₖ if iter ≤ mem
+ # uₖ₋ₘₑₘ₊₁.ₖ ← hₖ₋ₘₑₘ₊₁.ₖ if iter ≥ mem + 1
+ for i = max(2,iter-mem+2) : iter
+ lpos = mod(i-1, mem-1) + 1 # Position corresponding to lᵢ.ᵢ₋₁ in the circular stack L.
+ diag = iter - i + 1
next_diag = diag + 1
- # uᵢ.ₘ ← hᵢ.ₘ - lᵢ.ᵢ₋₁ * uᵢ₋₁.ₘ
+ # uᵢ.ₖ ← hᵢ.ₖ - lᵢ.ᵢ₋₁ * uᵢ₋₁.ₖ
H[diag] = H[diag] - L[lpos] * H[next_diag]
+ if i == iter
+ # Compute ξₖ the last component of zₖ = β(Lₖ)⁻¹e₁.
+ # ξₖ = -lₖ.ₖ₋₁ * ξₖ₋₁
+ ξ = - L[lpos] * ξ
+ end
end
- # Compute ξₘ the last component of zₘ = β(Lₘ)⁻¹e₁.
- # ξₘ = -lₘ.ₘ₋₁ * ξₘ₋₁
- ξ = - L[pos] * ξ
end
- # Compute next pivot lₘ₊₁.ₘ = hₘ₊₁.ₘ / uₘ.ₘ
- L[next_pos] = H[1] / H[2]
-
- # Compute the direction pₘ, the last column of Pₘ = N⁻¹Vₘ(Uₘ)⁻¹.
- for i = max(1,iter-mem) : iter-1
- ipos = mod(i-1, mem) + 1 # Position corresponding to pᵢ in the circular stack P.
- diag = iter - i + 2
- if ipos == pos
- # pₐᵤₓ ← -hₘ₋ₘₑₘ.ₘ * pₘ₋ₘₑₘ
- @kscal!(n, -H[diag], P[pos])
+ # Compute next pivot lₖ₊₁.ₖ = hₖ₊₁.ₖ / uₖ.ₖ
+ next_lpos = mod(iter, mem-1) + 1
+ L[next_lpos] = Haux / H[1]
+
+ ppos = mod(iter-1, mem-1) + 1 # Position corresponding to pₖ in the circular stack P.
+
+ # Compute the direction pₖ, the last column of Pₖ = NVₖ(Uₖ)⁻¹.
+ # u₁.ₖp₁ + ... + uₖ.ₖpₖ = Nvₖ if k ≤ mem
+ # uₖ₋ₘₑₘ₊₁.ₖpₖ₋ₘₑₘ₊₁ + ... + uₖ.ₖpₖ = Nvₖ if k ≥ mem + 1
+ for i = max(1,iter-mem+1) : iter-1
+ ipos = mod(i-1, mem-1) + 1 # Position corresponding to pᵢ in the circular stack P.
+ diag = iter - i + 1
+ if ipos == ppos
+          # pₐᵤₓ ← -uₖ₋ₘₑₘ₊₁.ₖ * pₖ₋ₘₑₘ₊₁
+ @kscal!(n, -H[diag], P[ppos])
else
- # pₐᵤₓ ← pₐᵤₓ - hᵢ.ₘ * pᵢ
- @kaxpy!(n, -H[diag], P[ipos], P[pos])
+          # pₐᵤₓ ← pₐᵤₓ - uᵢ.ₖ * pᵢ
+ @kaxpy!(n, -H[diag], P[ipos], P[ppos])
end
end
- # pₐᵤₓ ← pₐᵤₓ + N⁻¹vₘ
- @kaxpy!(n, one(FC), z, P[pos])
- # pₘ = pₐᵤₓ / uₘ.ₘ
- @. P[pos] = P[pos] / H[2]
+ # pₐᵤₓ ← pₐᵤₓ + Nvₖ
+ @kaxpy!(n, one(FC), z, P[ppos])
+ # pₖ = pₐᵤₓ / uₖ.ₖ
+ P[ppos] .= P[ppos] ./ H[1]
- # Update solution xₘ.
- # xₘ = xₘ₋₁ + ξₘ * pₘ
- @kaxpy!(n, ξ, P[pos], x)
+ # Update solution xₖ.
+ # xₖ = xₖ₋₁ + ξₖ * pₖ
+ @kaxpy!(n, ξ, P[ppos], x)
# Compute residual norm.
- # ‖ M⁻¹(b - Axₘ) ‖₂ = hₘ₊₁.ₘ * |ξₘ / uₘ.ₘ|
- rNorm = real(H[1]) * abs(ξ / H[2])
+ # ‖ M(b - Axₖ) ‖₂ = hₖ₊₁.ₖ * |ξₖ / uₖ.ₖ|
+ rNorm = Haux * abs(ξ / H[1])
history && push!(rNorms, rNorm)
# Stopping conditions that do not depend on user input.
@@ -254,9 +281,9 @@ function diom!(solver :: DiomSolver{T,FC,S}, A, b :: AbstractVector{FC};
resid_decrease_lim = rNorm ≤ ε
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
user_requested_exit && (status = "user-requested exit")
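With the reworked circular stacks, DIOM now keeps `memory` basis vectors but only `memory - 1` directions and pivots; nothing changes on the caller's side. A usage sketch with synthetic data:

```julia
using Krylov, LinearAlgebra, SparseArrays

n = 200
A = sprandn(n, n, 0.02) + 10I   # nonsymmetric, strongly diagonal
b = randn(n)

# Orthogonalize each new basis vector against the 10 most recent ones,
# and reorthogonalize against the same window for extra robustness.
x, stats = diom(A, b, memory=10, reorthogonalization=true)
```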
diff --git a/src/dqgmres.jl b/src/dqgmres.jl
index ab7c490a6..025016304 100644
--- a/src/dqgmres.jl
+++ b/src/dqgmres.jl
@@ -11,16 +11,21 @@
export dqgmres, dqgmres!
"""
- (x, stats) = dqgmres(A, b::AbstractVector{FC}; memory::Int=20,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- reorthogonalization::Bool=false, itmax::Int=0,
+ (x, stats) = dqgmres(A, b::AbstractVector{FC};
+ memory::Int=20, M=I, N=I, ldiv::Bool=false,
+ reorthogonalization::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the consistent linear system Ax = b using DQGMRES method.
+ (x, stats) = dqgmres(A, b, x0::AbstractVector; kwargs...)
+
+DQGMRES can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the consistent linear system Ax = b of size n using DQGMRES.
DQGMRES algorithm is based on the incomplete Arnoldi orthogonalization process
and computes a sequence of approximate solutions with the quasi-minimal residual property.
@@ -30,21 +35,34 @@ If MINRES is well defined on `Ax = b` and `memory = 2`, DQGMRES is theoretically
If `k ≤ memory` where `k` is the number of iterations, DQGMRES is theoretically equivalent to GMRES.
Otherwise, DQGMRES interpolates between MINRES and GMRES and is similar to MINRES with partial reorthogonalization.
-Partial reorthogonalization is available with the `reorthogonalization` option.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
-This implementation allows a left preconditioner M and a right preconditioner N.
-- Left preconditioning : M⁻¹Ax = M⁻¹b
-- Right preconditioning : AN⁻¹u = b with x = N⁻¹u
-- Split preconditioning : M⁻¹AN⁻¹u = M⁻¹b with x = N⁻¹u
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-DQGMRES can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = dqgmres(A, b, x0; kwargs...)
+* `memory`: the number of most recent vectors of the Krylov basis against which to orthogonalize a new vector;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against the `memory` most recent vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -84,15 +102,16 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC}, x
end
function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- reorthogonalization :: Bool=false, itmax :: Int=0,
+ M=I, N=I, ldiv :: Bool=false,
+ reorthogonalization :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("DQGMRES: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "DQGMRES: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -100,7 +119,7 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :w, S, n)
@@ -121,7 +140,7 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
else
t .= b
end
- MisI || mulorldiv!(r₀, M, t, ldiv) # M⁻¹(b - Ax₀)
+ MisI || mulorldiv!(r₀, M, t, ldiv) # M(b - Ax₀)
rNorm = @knrm2(n, r₀) # β = ‖r₀‖₂
history && push!(rNorms, rNorm)
if rNorm == 0
@@ -136,29 +155,30 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
# Set up workspace.
- mem = length(c) # Memory.
+ mem = length(V) # Memory.
for i = 1 : mem
- V[i] .= zero(FC) # Orthogonal basis of Kₖ(M⁻¹AN⁻¹, M⁻¹b).
- P[i] .= zero(FC) # Directions for x : Pₘ = N⁻¹Vₘ(Rₘ)⁻¹.
+ V[i] .= zero(FC) # Orthogonal basis of Kₖ(MAN, Mr₀).
+ P[i] .= zero(FC) # Directions for x : Pₖ = NVₖ(Rₖ)⁻¹.
end
- c .= zero(T) # Last mem Givens cosines used for the factorization QₘRₘ = Hₘ.
- s .= zero(FC) # Last mem Givens sines used for the factorization QₘRₘ = Hₘ.
- H .= zero(FC) # Last column of the band hessenberg matrix Hₘ.
- # Each column has at most mem + 1 nonzero elements. hᵢ.ₘ is stored as H[m-i+2].
- # m-i+2 represents the indice of the diagonal where hᵢ.ₘ is located.
- # In addition of that, the last column of Rₘ is also stored in H.
+ c .= zero(T) # Last mem Givens cosines used for the factorization QₖRₖ = Hₖ.
+ s .= zero(FC) # Last mem Givens sines used for the factorization QₖRₖ = Hₖ.
+ H .= zero(FC) # Last column of the band hessenberg matrix Hₖ.
+ # Each column has at most mem + 1 nonzero elements.
+ # hᵢ.ₖ is stored as H[k-i+1], i ≤ k. hₖ₊₁.ₖ is not stored in H.
+                   # k-i+1 represents the index of the diagonal where hᵢ.ₖ is located.
+                   # In addition, the last column of Rₖ is also stored in H.
# Initial γ₁ and V₁.
- γₘ = rNorm # γₘ and γₘ₊₁ are the last components of gₘ, right-hand of the least squares problem min ‖ Hₘyₘ - gₘ ‖₂.
- @. V[1] = r₀ / rNorm
+  γₖ = rNorm # γₖ and γₖ₊₁ are the last components of gₖ, the right-hand side of the least-squares problem min ‖ Hₖyₖ - gₖ ‖₂.
+ V[1] .= r₀ ./ rNorm
# The following stopping criterion compensates for the lag in the
# residual, but usually increases the number of iterations.
- # solved = sqrt(max(1, iter-mem+1)) * |γₘ₊₁| ≤ ε
+ # solved = sqrt(max(1, iter-mem+1)) * |γₖ₊₁| ≤ ε
solved = rNorm ≤ ε # less accurate, but acceptable.
tired = iter ≥ itmax
status = "unknown"
@@ -170,88 +190,89 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + 1
# Set position in circulars stacks.
- pos = mod(iter-1, mem) + 1 # Position corresponding to pₘ and vₘ in circular stacks P and V.
- next_pos = mod(iter, mem) + 1 # Position corresponding to vₘ₊₁ in the circular stack V.
+ pos = mod(iter-1, mem) + 1 # Position corresponding to pₖ and vₖ in circular stacks P and V.
+ next_pos = mod(iter, mem) + 1 # Position corresponding to vₖ₊₁ in the circular stack V.
# Incomplete Arnoldi procedure.
z = NisI ? V[pos] : solver.z
- NisI || mulorldiv!(z, N, V[pos], ldiv) # N⁻¹vₘ, forms pₘ
- mul!(t, A, z) # AN⁻¹vₘ
- MisI || mulorldiv!(w, M, t, ldiv) # M⁻¹AN⁻¹vₘ, forms vₘ₊₁
+ NisI || mulorldiv!(z, N, V[pos], ldiv) # Nvₖ, forms pₖ
+ mul!(t, A, z) # ANvₖ
+ MisI || mulorldiv!(w, M, t, ldiv) # MANvₖ, forms vₖ₊₁
for i = max(1, iter-mem+1) : iter
- ipos = mod(i-1, mem) + 1 # Position corresponding to vᵢ in the circular stack V.
- diag = iter - i + 2
- H[diag] = @kdot(n, w, V[ipos]) # hᵢ.ₘ = ⟨M⁻¹AN⁻¹vₘ , vᵢ⟩
- @kaxpy!(n, -H[diag], V[ipos], w) # w ← w - hᵢ.ₘ * vᵢ
+ ipos = mod(i-1, mem) + 1 # Position corresponding to vᵢ in the circular stack V.
+ diag = iter - i + 1
+ H[diag] = @kdot(n, w, V[ipos]) # hᵢ.ₖ = ⟨MANvₖ, vᵢ⟩
+ @kaxpy!(n, -H[diag], V[ipos], w) # w ← w - hᵢ.ₖvᵢ
end
# Partial reorthogonalization of the Krylov basis.
if reorthogonalization
for i = max(1, iter-mem+1) : iter
ipos = mod(i-1, mem) + 1
- diag = iter - i + 2
+ diag = iter - i + 1
Htmp = @kdot(n, w, V[ipos])
H[diag] += Htmp
@kaxpy!(n, -Htmp, V[ipos], w)
end
end
- # Compute hₘ₊₁.ₘ and vₘ₊₁.
- H[1] = @knrm2(n, w) # hₘ₊₁.ₘ = ‖vₘ₊₁‖₂
- if H[1] ≠ 0 # hₘ₊₁.ₘ = 0 ⇒ "lucky breakdown"
- @. V[next_pos] = w / H[1] # vₘ₊₁ = w / hₘ₊₁.ₘ
+ # Compute hₖ₊₁.ₖ and vₖ₊₁.
+ Haux = @knrm2(n, w) # hₖ₊₁.ₖ = ‖vₖ₊₁‖₂
+ if Haux ≠ 0 # hₖ₊₁.ₖ = 0 ⇒ "lucky breakdown"
+ V[next_pos] .= w ./ Haux # vₖ₊₁ = w / hₖ₊₁.ₖ
end
- # rₘ₋ₘₑₘ.ₘ ≠ 0 when m ≥ mem + 1
+ # rₖ₋ₘₑₘ.ₖ ≠ 0 when k ≥ mem + 1
+ # We don't want to use rₖ₋₁₋ₘₑₘ.ₖ₋₁ when we compute rₖ₋ₘₑₘ.ₖ
if iter ≥ mem + 2
- H[mem+2] = zero(FC) # hₘ₋ₘₑₘ.ₘ = 0
+ H[mem+1] = zero(FC) # rₖ₋ₘₑₘ.ₖ = 0
end
- # Update the QR factorization of H.
+ # Update the QR factorization of Hₖ.
# Apply mem previous Givens reflections Ωᵢ.
for i = max(1,iter-mem) : iter-1
- irot_pos = mod(i-1, mem) + 1 # Position corresponding to cᵢ and sᵢ in circular stacks c and s.
- diag = iter - i + 1
+ irot_pos = mod(i-1, mem) + 1 # Position corresponding to cᵢ and sᵢ in circular stacks c and s.
+ diag = iter - i
next_diag = diag + 1
- H_aux = c[irot_pos] * H[next_diag] + s[irot_pos] * H[diag]
+ Htmp = c[irot_pos] * H[next_diag] + s[irot_pos] * H[diag]
H[diag] = conj(s[irot_pos]) * H[next_diag] - c[irot_pos] * H[diag]
- H[next_diag] = H_aux
+ H[next_diag] = Htmp
end
- # Compute and apply current Givens reflection Ωₘ.
- # [cₘ sₘ] [ hₘ.ₘ ] = [ρₘ]
- # [sₘ -cₘ] [hₘ₊₁.ₘ] [0 ]
- (c[pos], s[pos], H[2]) = sym_givens(H[2], H[1])
- γₘ₊₁ = conj(s[pos]) * γₘ
- γₘ = c[pos] * γₘ
+ # Compute and apply current Givens reflection Ωₖ.
+ # [cₖ sₖ] [ hₖ.ₖ ] = [ρₖ]
+ # [sₖ -cₖ] [hₖ₊₁.ₖ] [0 ]
+ (c[pos], s[pos], H[1]) = sym_givens(H[1], Haux)
+ γₖ₊₁ = conj(s[pos]) * γₖ
+ γₖ = c[pos] * γₖ
- # Compute the direction pₘ, the last column of Pₘ = N⁻¹Vₘ(Rₘ)⁻¹.
+ # Compute the direction pₖ, the last column of Pₖ = NVₖ(Rₖ)⁻¹.
for i = max(1,iter-mem) : iter-1
- ipos = mod(i-1, mem) + 1 # Position corresponding to pᵢ in the circular stack P.
- diag = iter - i + 2
+ ipos = mod(i-1, mem) + 1 # Position corresponding to pᵢ in the circular stack P.
+ diag = iter - i + 1
if ipos == pos
- # pₐᵤₓ ← -hₘ₋ₘₑₘ.ₘ * pₘ₋ₘₑₘ
+ # pₐᵤₓ ← -hₖ₋ₘₑₘ.ₖ * pₖ₋ₘₑₘ
@kscal!(n, -H[diag], P[pos])
else
- # pₐᵤₓ ← pₐᵤₓ - hᵢ.ₘ * pᵢ
+ # pₐᵤₓ ← pₐᵤₓ - hᵢ.ₖ * pᵢ
@kaxpy!(n, -H[diag], P[ipos], P[pos])
end
end
- # pₐᵤₓ ← pₐᵤₓ + N⁻¹vₘ
+ # pₐᵤₓ ← pₐᵤₓ + Nvₖ
@kaxpy!(n, one(FC), z, P[pos])
- # pₘ = pₐᵤₓ / hₘ.ₘ
- @. P[pos] = P[pos] / H[2]
+ # pₖ = pₐᵤₓ / hₖ.ₖ
+ P[pos] .= P[pos] ./ H[1]
- # Compute solution xₘ.
- # xₘ ← xₘ₋₁ + γₘ * pₘ
- @kaxpy!(n, γₘ, P[pos], x)
+ # Compute solution xₖ.
+ # xₖ ← xₖ₋₁ + γₖ * pₖ
+ @kaxpy!(n, γₖ, P[pos], x)
# Update residual norm estimate.
- # ‖ M⁻¹(b - Axₘ) ‖₂ ≈ |γₘ₊₁|
- rNorm = abs(γₘ₊₁)
+ # ‖ M(b - Axₖ) ‖₂ ≈ |γₖ₊₁|
+ rNorm = abs(γₖ₊₁)
history && push!(rNorms, rNorm)
- # Update γₘ.
- γₘ = γₘ₊₁
+ # Update γₖ.
+ γₖ = γₖ₊₁
# Stopping conditions that do not depend on user input.
# This is to guard against tolerances that are unreasonably small.
@@ -262,9 +283,9 @@ function dqgmres!(solver :: DqgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
resid_decrease_lim = rNorm ≤ ε
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
solved && (status = "solution good enough given atol and rtol")
tired && (status = "maximum number of iterations exceeded")
user_requested_exit && (status = "user-requested exit")
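The docstring above advertises the warm-start form `dqgmres(A, b, x0; kwargs...)`. A brief sketch (the initial guess is arbitrary, chosen because A is close to 20I):

```julia
using Krylov, LinearAlgebra

n = 100
A = randn(n, n) + 20I
b = randn(n)

x0 = b ./ 20                              # crude initial guess since A ≈ 20I
x, stats = dqgmres(A, b, x0, memory=30)   # warm start from x0
```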
diff --git a/src/fgmres.jl b/src/fgmres.jl
new file mode 100644
index 000000000..fa536af23
--- /dev/null
+++ b/src/fgmres.jl
@@ -0,0 +1,353 @@
+# An implementation of FGMRES for the solution of the square linear system Ax = b.
+#
+# This method is described in
+#
+# Y. Saad, A Flexible Inner-Outer Preconditioned GMRES Algorithm.
+# SIAM Journal on Scientific Computing, Vol. 14(2), pp. 461--469, 1993.
+#
+# Alexis Montoison,
+# Montreal, September 2022.
+
+export fgmres, fgmres!
+
+"""
+ (x, stats) = fgmres(A, b::AbstractVector{FC};
+ memory::Int=20, M=I, N=I, ldiv::Bool=false,
+ restart::Bool=false, reorthogonalization::Bool=false,
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
+
+`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
+`FC` is `T` or `Complex{T}`.
+
+ (x, stats) = fgmres(A, b, x0::AbstractVector; kwargs...)
+
+FGMRES can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the linear system Ax = b of size n using FGMRES.
+
+FGMRES computes a sequence of approximate solutions with minimum residual.
+FGMRES is a variant of GMRES that allows changes in the right preconditioner at each iteration.
+
+This implementation allows a left preconditioner M and a flexible right preconditioner N.
+A situation in which the preconditioner is "not constant" arises when a relaxation-type method,
+a Chebyshev iteration or another Krylov subspace method is used as a preconditioner.
+Compared to GMRES, there is no additional cost incurred in the arithmetic but the memory requirement almost doubles.
+Thus, GMRES is recommended if the right preconditioner N is constant.
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
+
+#### Keyword arguments
+
+* `memory`: if `restart = true`, the restarted version FGMRES(k) is used with `k = memory`. If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations. Additional storage will be allocated if the number of iterations exceeds `memory`;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `restart`: restart the method after `memory` iterations;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
+
+#### Reference
+
+* Y. Saad, [*A Flexible Inner-Outer Preconditioned GMRES Algorithm*](https://doi.org/10.1137/0914028), SIAM Journal on Scientific Computing, Vol. 14(2), pp. 461--469, 1993.
+"""
+function fgmres end
+
+function fgmres(A, b :: AbstractVector{FC}, x0 :: AbstractVector; memory :: Int=20, kwargs...) where FC <: FloatOrComplex
+ solver = FgmresSolver(A, b, memory)
+ fgmres!(solver, A, b, x0; kwargs...)
+ return (solver.x, solver.stats)
+end
+
+function fgmres(A, b :: AbstractVector{FC}; memory :: Int=20, kwargs...) where FC <: FloatOrComplex
+ solver = FgmresSolver(A, b, memory)
+ fgmres!(solver, A, b; kwargs...)
+ return (solver.x, solver.stats)
+end
+
+"""
+ solver = fgmres!(solver::FgmresSolver, A, b; kwargs...)
+ solver = fgmres!(solver::FgmresSolver, A, b, x0; kwargs...)
+
+where `kwargs` are keyword arguments of [`fgmres`](@ref).
+
+Note that the `memory` keyword argument is the only exception.
+It's required to create a `FgmresSolver` and can't be changed later.
+
+See [`FgmresSolver`](@ref) for more details about the `solver`.
+"""
+function fgmres! end
+
+function fgmres!(solver :: FgmresSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: AbstractVector; kwargs...) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ warm_start!(solver, x0)
+ fgmres!(solver, A, b; kwargs...)
+ return solver
+end
+
+function fgmres!(solver :: FgmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
+ M=I, N=I, ldiv :: Bool=false,
+ restart :: Bool=false, reorthogonalization :: Bool=false,
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+
+ m, n = size(A)
+ m == n || error("System must be square")
+ length(b) == m || error("Inconsistent problem size")
+ (verbose > 0) && @printf(iostream, "FGMRES: system of size %d\n", n)
+
+ # Check M = Iₙ
+ MisI = (M === I)
+
+ # Check type consistency
+ eltype(A) == FC || error("eltype(A) ≠ $FC")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+
+ # Set up workspace.
+ allocate_if(!MisI , solver, :q , S, n)
+ allocate_if(restart, solver, :Δx, S, n)
+ Δx, x, w, V, Z = solver.Δx, solver.x, solver.w, solver.V, solver.Z
+ z, c, s, R, stats = solver.z, solver.c, solver.s, solver.R, solver.stats
+ warm_start = solver.warm_start
+ rNorms = stats.residuals
+ reset!(stats)
+ q = MisI ? w : solver.q
+ r₀ = MisI ? w : solver.q
+ xr = restart ? Δx : x
+
+ # Initial solution x₀.
+ x .= zero(FC)
+
+ # Initial residual r₀.
+ if warm_start
+ mul!(w, A, Δx)
+ @kaxpby!(n, one(FC), b, -one(FC), w)
+ restart && @kaxpy!(n, one(FC), Δx, x)
+ else
+ w .= b
+ end
+ MisI || mulorldiv!(r₀, M, w, ldiv) # r₀ = M(b - Ax₀)
+ β = @knrm2(n, r₀) # β = ‖r₀‖₂
+
+ rNorm = β
+ history && push!(rNorms, β)
+ ε = atol + rtol * rNorm
+
+ if β == 0
+ stats.niter = 0
+ stats.solved, stats.inconsistent = true, false
+ stats.status = "x = 0 is a zero-residual solution"
+ solver.warm_start = false
+ return solver
+ end
+
+ mem = length(c) # Memory
+  npass = 0        # Number of passes
+
+ iter = 0 # Cumulative number of iterations
+ inner_iter = 0 # Number of iterations in a pass
+
+ itmax == 0 && (itmax = 2*n)
+ inner_itmax = itmax
+
+ (verbose > 0) && @printf(iostream, "%5s %5s %7s %7s\n", "pass", "k", "‖rₖ‖", "hₖ₊₁.ₖ")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7s\n", npass, iter, rNorm, "✗ ✗ ✗ ✗")
+
+ # Tolerance for breakdown detection.
+ btol = eps(T)^(3/4)
+
+ # Stopping criterion
+ breakdown = false
+ inconsistent = false
+ solved = rNorm ≤ ε
+ tired = iter ≥ itmax
+ inner_tired = inner_iter ≥ inner_itmax
+ status = "unknown"
+ user_requested_exit = false
+
+ while !(solved || tired || breakdown || user_requested_exit)
+
+ # Initialize workspace.
+ nr = 0 # Number of coefficients stored in Rₖ.
+ for i = 1 : mem
+ V[i] .= zero(FC) # Orthogonal basis of {Mr₀, MANₖr₀, ..., (MANₖ)ᵏ⁻¹r₀}.
+ Z[i] .= zero(FC) # Zₖ = [N₁v₁, ..., Nₖvₖ]
+ end
+ s .= zero(FC) # Givens sines used for the factorization QₖRₖ = Hₖ₊₁.ₖ.
+ c .= zero(T) # Givens cosines used for the factorization QₖRₖ = Hₖ₊₁.ₖ.
+ R .= zero(FC) # Upper triangular matrix Rₖ.
+    z .= zero(FC)  # Right-hand side of the least-squares problem min ‖Hₖ₊₁.ₖyₖ - βe₁‖₂.
+
+ if restart
+ xr .= zero(FC) # xr === Δx when restart is set to true
+ if npass ≥ 1
+ mul!(w, A, x)
+ @kaxpby!(n, one(FC), b, -one(FC), w)
+ MisI || mulorldiv!(r₀, M, w, ldiv)
+ end
+ end
+
+ # Initial ζ₁ and V₁
+ β = @knrm2(n, r₀)
+ z[1] = β
+    V[1] .= r₀ ./ rNorm
+
+ npass = npass + 1
+ solver.inner_iter = 0
+ inner_tired = false
+
+ while !(solved || inner_tired || breakdown || user_requested_exit)
+
+ # Update iteration index
+ solver.inner_iter = solver.inner_iter + 1
+ inner_iter = solver.inner_iter
+
+ # Update workspace if more storage is required and restart is set to false
+ if !restart && (inner_iter > mem)
+ for i = 1 : inner_iter
+ push!(R, zero(FC))
+ end
+ push!(s, zero(FC))
+ push!(c, zero(T))
+ push!(Z, S(undef, n))
+ end
+
+ # Continue the process.
+ # MAZₖ = Vₖ₊₁Hₖ₊₁.ₖ
+ mulorldiv!(Z[inner_iter], N, V[inner_iter], ldiv) # zₖ ← Nₖvₖ
+ mul!(w, A, Z[inner_iter]) # w ← Azₖ
+ MisI || mulorldiv!(q, M, w, ldiv) # q ← MAzₖ
+ for i = 1 : inner_iter
+ R[nr+i] = @kdot(n, V[i], q) # hᵢₖ = (vᵢ)ᴴq
+ @kaxpy!(n, -R[nr+i], V[i], q) # q ← q - hᵢₖvᵢ
+ end
+
+ # Reorthogonalization of the basis.
+ if reorthogonalization
+ for i = 1 : inner_iter
+ Htmp = @kdot(n, V[i], q)
+ R[nr+i] += Htmp
+ @kaxpy!(n, -Htmp, V[i], q)
+ end
+ end
+
+ # Compute hₖ₊₁.ₖ
+ Hbis = @knrm2(n, q) # hₖ₊₁.ₖ = ‖vₖ₊₁‖₂
+
+ # Update the QR factorization of Hₖ₊₁.ₖ.
+ # Apply previous Givens reflections Ωᵢ.
+ # [cᵢ sᵢ] [ r̄ᵢ.ₖ ] = [ rᵢ.ₖ ]
+ # [s̄ᵢ -cᵢ] [rᵢ₊₁.ₖ] [r̄ᵢ₊₁.ₖ]
+ for i = 1 : inner_iter-1
+ Rtmp = c[i] * R[nr+i] + s[i] * R[nr+i+1]
+ R[nr+i+1] = conj(s[i]) * R[nr+i] - c[i] * R[nr+i+1]
+ R[nr+i] = Rtmp
+ end
+
+ # Compute and apply current Givens reflection Ωₖ.
+ # [cₖ sₖ] [ r̄ₖ.ₖ ] = [rₖ.ₖ]
+ # [s̄ₖ -cₖ] [hₖ₊₁.ₖ] [ 0 ]
+ (c[inner_iter], s[inner_iter], R[nr+inner_iter]) = sym_givens(R[nr+inner_iter], Hbis)
+
+ # Update zₖ = (Qₖ)ᴴβe₁
+ ζₖ₊₁ = conj(s[inner_iter]) * z[inner_iter]
+ z[inner_iter] = c[inner_iter] * z[inner_iter]
+
+ # Update residual norm estimate.
+      # ‖ M(b - Axₖ) ‖₂ = |ζₖ₊₁|
+ rNorm = abs(ζₖ₊₁)
+ history && push!(rNorms, rNorm)
+
+ # Update the number of coefficients in Rₖ
+ nr = nr + inner_iter
+
+ # Stopping conditions that do not depend on user input.
+ # This is to guard against tolerances that are unreasonably small.
+ resid_decrease_mach = (rNorm + one(T) ≤ one(T))
+
+ # Update stopping criterion.
+ resid_decrease_lim = rNorm ≤ ε
+ breakdown = Hbis ≤ btol
+ solved = resid_decrease_lim || resid_decrease_mach
+ inner_tired = restart ? inner_iter ≥ min(mem, inner_itmax) : inner_iter ≥ inner_itmax
+ solver.inner_iter = inner_iter
+ kdisplay(iter+inner_iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7.1e\n", npass, iter+inner_iter, rNorm, Hbis)
+
+ # Compute vₖ₊₁
+ if !(solved || inner_tired || breakdown)
+ if !restart && (inner_iter ≥ mem)
+ push!(V, S(undef, n))
+ push!(z, zero(FC))
+ end
+        V[inner_iter+1] .= q ./ Hbis # hₖ₊₁.ₖvₖ₊₁ = q
+ z[inner_iter+1] = ζₖ₊₁
+ end
+
+ user_requested_exit = callback(solver) :: Bool
+ end
+
+ # Compute y by solving Ry = z with backward substitution.
+ y = z # yᵢ = ζᵢ
+ for i = inner_iter : -1 : 1
+ pos = nr + i - inner_iter # position of rᵢ.ₖ
+ for j = inner_iter : -1 : i+1
+ y[i] = y[i] - R[pos] * y[j] # yᵢ ← yᵢ - rᵢⱼyⱼ
+ pos = pos - j + 1 # position of rᵢ.ⱼ₋₁
+ end
+ # Rₖ can be singular if the system is inconsistent
+ if abs(R[pos]) ≤ btol
+ y[i] = zero(FC)
+ inconsistent = true
+ else
+ y[i] = y[i] / R[pos] # yᵢ ← yᵢ / rᵢᵢ
+ end
+ end
+
+ # Form xₖ = N₁v₁y₁ + ... + Nₖvₖyₖ = z₁y₁ + ... + zₖyₖ
+ for i = 1 : inner_iter
+ @kaxpy!(n, y[i], Z[i], xr)
+ end
+ restart && @kaxpy!(n, one(FC), xr, x)
+
+ # Update inner_itmax, iter and tired variables.
+ inner_itmax = inner_itmax - inner_iter
+ iter = iter + inner_iter
+ tired = iter ≥ itmax
+ end
+ (verbose > 0) && @printf(iostream, "\n")
+
+ tired && (status = "maximum number of iterations exceeded")
+ solved && (status = "solution good enough given atol and rtol")
+ inconsistent && (status = "found approximate least-squares solution")
+ user_requested_exit && (status = "user-requested exit")
+
+ # Update x
+ warm_start && !restart && @kaxpy!(n, one(FC), Δx, x)
+ solver.warm_start = false
+
+ # Update stats
+ stats.niter = iter
+ stats.solved = solved
+ stats.inconsistent = inconsistent
+ stats.status = status
+ return solver
+end
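Because FGMRES stores the preconditioned basis Zₖ, the right preconditioner `N` may change from one application to the next. A sketch of a genuinely flexible preconditioner, a few inner GMRES iterations wrapped in a custom operator (the `InnerGmres` type is hypothetical, not part of the package; only `mul!` needs a method since `ldiv=false` is the default):

```julia
using Krylov, LinearAlgebra
import LinearAlgebra: mul!

# Inexact inner solve: its effect depends on the input vector,
# so the operator differs at every outer iteration.
struct InnerGmres{M}
  A :: M
end
mul!(y, P::InnerGmres, x) = (y .= gmres(P.A, x, itmax=5)[1]; y)

n = 200
A = randn(n, n) + 20I
b = randn(n)

x, stats = fgmres(A, b, N=InnerGmres(A), memory=30)
```

Note that each application of `InnerGmres` allocates a fresh inner workspace; a production version would reuse a `GmresSolver` through `gmres!`.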
diff --git a/src/fom.jl b/src/fom.jl
index fcae5cf62..6aabb33f5 100644
--- a/src/fom.jl
+++ b/src/fom.jl
@@ -11,38 +11,53 @@
export fom, fom!
"""
- (x, stats) = fom(A, b::AbstractVector{FC}; memory::Int=20,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- reorthogonalization::Bool=false, itmax::Int=0,
- restart::Bool=false, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ (x, stats) = fom(A, b::AbstractVector{FC};
+ memory::Int=20, M=I, N=I, ldiv::Bool=false,
+ restart::Bool=false, reorthogonalization::Bool=false,
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the linear system Ax = b using FOM method.
+ (x, stats) = fom(A, b, x0::AbstractVector; kwargs...)
+
+FOM can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the linear system Ax = b of size n using FOM.
FOM algorithm is based on the Arnoldi process and a Galerkin condition.
-This implementation allows a left preconditioner M and a right preconditioner N.
-- Left preconditioning : M⁻¹Ax = M⁻¹b
-- Right preconditioning : AN⁻¹u = b with x = N⁻¹u
-- Split preconditioning : M⁻¹AN⁻¹u = M⁻¹b with x = N⁻¹u
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
-Full reorthogonalization is available with the `reorthogonalization` option.
+#### Optional argument
-If `restart = true`, the restarted version FOM(k) is used with `k = memory`.
-If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations.
-More storage will be allocated only if the number of iterations exceed `memory`.
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-FOM can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = fom(A, b, x0; kwargs...)
+* `memory`: if `restart = true`, the restarted version FOM(k) is used with `k = memory`. If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations. Additional storage will be allocated if the number of iterations exceeds `memory`;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `restart`: restart the method after `memory` iterations;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -82,15 +97,16 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 :: Abs
end
function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- reorthogonalization :: Bool=false, itmax :: Int=0,
- restart :: Bool=false, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, N=I, ldiv :: Bool=false,
+ restart :: Bool=false, reorthogonalization :: Bool=false,
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("FOM: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "FOM: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -98,7 +114,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI , solver, :q , S, n)
@@ -124,7 +140,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
else
w .= b
end
- MisI || mulorldiv!(r₀, M, w, ldiv) # r₀ = M⁻¹(b - Ax₀)
+ MisI || mulorldiv!(r₀, M, w, ldiv) # r₀ = M(b - Ax₀)
β = @knrm2(n, r₀) # β = ‖r₀‖₂
rNorm = β
@@ -148,8 +164,8 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
itmax == 0 && (itmax = 2*n)
inner_itmax = itmax
- (verbose > 0) && @printf("%5s %5s %7s %7s\n", "pass", "k", "‖rₖ‖", "hₖ₊₁.ₖ")
- kdisplay(iter, verbose) && @printf("%5d %5d %7.1e %7s\n", npass, iter, rNorm, "✗ ✗ ✗ ✗")
+ (verbose > 0) && @printf(iostream, "%5s %5s %7s %7s\n", "pass", "k", "‖rₖ‖", "hₖ₊₁.ₖ")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7s\n", npass, iter, rNorm, "✗ ✗ ✗ ✗")
# Tolerance for breakdown detection.
btol = eps(T)^(3/4)
@@ -167,7 +183,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Initialize workspace.
nr = 0 # Number of coefficients stored in Uₖ.
for i = 1 : mem
- V[i] .= zero(FC) # Orthogonal basis of Kₖ(M⁻¹AN⁻¹, M⁻¹r₀).
+ V[i] .= zero(FC) # Orthogonal basis of Kₖ(MAN, Mr₀).
end
l .= zero(FC) # Lower unit triangular matrix Lₖ.
U .= zero(FC) # Upper triangular matrix Uₖ.
@@ -207,11 +223,11 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Continue the Arnoldi process.
p = NisI ? V[inner_iter] : solver.p
- NisI || mulorldiv!(p, N, V[inner_iter], ldiv) # p ← N⁻¹vₖ
- mul!(w, A, p) # w ← AN⁻¹vₖ
- MisI || mulorldiv!(q, M, w, ldiv) # q ← M⁻¹AN⁻¹vₖ
+ NisI || mulorldiv!(p, N, V[inner_iter], ldiv) # p ← Nvₖ
+ mul!(w, A, p) # w ← ANvₖ
+ MisI || mulorldiv!(q, M, w, ldiv) # q ← MANvₖ
for i = 1 : inner_iter
- U[nr+i] = @kdot(n, V[i], q) # hᵢₖ = qᵀvᵢ
+ U[nr+i] = @kdot(n, V[i], q) # hᵢₖ = (vᵢ)ᴴq
@kaxpy!(n, -U[nr+i], V[i], q) # q ← q - hᵢₖvᵢ
end
@@ -240,7 +256,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
l[inner_iter] = Hbis / U[nr+inner_iter]
# Update residual norm estimate.
- # ‖ M⁻¹(b - Axₖ) ‖₂ = hₖ₊₁.ₖ * |ζₖ / uₖ.ₖ|
+ # ‖ M(b - Axₖ) ‖₂ = hₖ₊₁.ₖ * |ζₖ / uₖ.ₖ|
rNorm = Hbis * abs(z[inner_iter] / U[nr+inner_iter])
history && push!(rNorms, rNorm)
@@ -257,7 +273,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
breakdown = Hbis ≤ btol
solved = resid_decrease_lim || resid_decrease_mach
inner_tired = restart ? inner_iter ≥ min(mem, inner_itmax) : inner_iter ≥ inner_itmax
- kdisplay(iter+inner_iter, verbose) && @printf("%5d %5d %7.1e %7.1e\n", npass, iter+inner_iter, rNorm, Hbis)
+ kdisplay(iter+inner_iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7.1e\n", npass, iter+inner_iter, rNorm, Hbis)
# Compute vₖ₊₁.
if !(solved || inner_tired || breakdown)
@@ -280,7 +296,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
y[i] = y[i] / U[pos] # yᵢ ← yᵢ / rᵢᵢ
end
- # Form xₖ = N⁻¹Vₖyₖ
+ # Form xₖ = NVₖyₖ
for i = 1 : inner_iter
@kaxpy!(n, y[i], V[i], xr)
end
@@ -295,7 +311,7 @@ function fom!(solver :: FomSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + inner_iter
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "inconsistent linear system")
diff --git a/src/gmres.jl b/src/gmres.jl
index 388a4ab96..d475198b5 100644
--- a/src/gmres.jl
+++ b/src/gmres.jl
@@ -11,38 +11,53 @@
export gmres, gmres!
"""
- (x, stats) = gmres(A, b::AbstractVector{FC}; memory::Int=20,
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- reorthogonalization::Bool=false, itmax::Int=0,
- restart::Bool=false, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ (x, stats) = gmres(A, b::AbstractVector{FC};
+ memory::Int=20, M=I, N=I, ldiv::Bool=false,
+ restart::Bool=false, reorthogonalization::Bool=false,
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the linear system Ax = b using GMRES method.
+ (x, stats) = gmres(A, b, x0::AbstractVector; kwargs...)
-GMRES algorithm is based on the Arnoldi process and computes a sequence of approximate solutions with the minimal residual property.
+GMRES can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
-This implementation allows a left preconditioner M and a right preconditioner N.
-- Left preconditioning : M⁻¹Ax = M⁻¹b
-- Right preconditioning : AN⁻¹u = b with x = N⁻¹u
-- Split preconditioning : M⁻¹AN⁻¹u = M⁻¹b with x = N⁻¹u
+Solve the linear system Ax = b of size n using GMRES.
-Full reorthogonalization is available with the `reorthogonalization` option.
+The GMRES algorithm is based on the Arnoldi process and computes a sequence of approximate solutions with the minimum residual property.
-If `restart = true`, the restarted version GMRES(k) is used with `k = memory`.
-If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations.
-More storage will be allocated only if the number of iterations exceed `memory`.
+#### Input arguments
-GMRES can be warm-started from an initial guess `x0` with the method
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
- (x, stats) = gmres(A, b, x0; kwargs...)
+#### Optional argument
-where `kwargs` are the same keyword arguments as above.
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Keyword arguments
+
+* `memory`: if `restart = true`, the restarted version GMRES(k) is used with `k = memory`. If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations. Additional storage will be allocated if the number of iterations exceeds `memory`;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: determine whether the preconditioners are applied with `ldiv!` or `mul!`;
+* `restart`: restart the method after `memory` iterations;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details are displayed if verbose mode is enabled (`verbose > 0`); information is printed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
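+
+#### Example
+
+A minimal usage sketch (the random sparse system below is illustrative only):
+
+    using Krylov, SparseArrays, LinearAlgebra
+
+    n = 100
+    A = sprandn(n, n, 0.05) + 10 * I  # shifted sparse square matrix
+    b = randn(n)
+    x, stats = gmres(A, b, restart=true, memory=30)  # GMRES(30)
+    stats.niter  # number of iterations performed
+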
#### Reference
@@ -82,15 +97,16 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0 ::
end
function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- reorthogonalization :: Bool=false, itmax :: Int=0,
- restart :: Bool=false, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, N=I, ldiv :: Bool=false,
+ restart :: Bool=false, reorthogonalization :: Bool=false,
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("GMRES: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "GMRES: system of size %d\n", n)
# Check M = Iₙ and N = Iₙ
MisI = (M === I)
@@ -98,7 +114,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI , solver, :q , S, n)
@@ -124,7 +140,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
else
w .= b
end
- MisI || mulorldiv!(r₀, M, w, ldiv) # r₀ = M⁻¹(b - Ax₀)
+ MisI || mulorldiv!(r₀, M, w, ldiv) # r₀ = M(b - Ax₀)
β = @knrm2(n, r₀) # β = ‖r₀‖₂
rNorm = β
@@ -148,8 +164,8 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
itmax == 0 && (itmax = 2*n)
inner_itmax = itmax
- (verbose > 0) && @printf("%5s %5s %7s %7s\n", "pass", "k", "‖rₖ‖", "hₖ₊₁.ₖ")
- kdisplay(iter, verbose) && @printf("%5d %5d %7.1e %7s\n", npass, iter, rNorm, "✗ ✗ ✗ ✗")
+ (verbose > 0) && @printf(iostream, "%5s %5s %7s %7s\n", "pass", "k", "‖rₖ‖", "hₖ₊₁.ₖ")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7s\n", npass, iter, rNorm, "✗ ✗ ✗ ✗")
# Tolerance for breakdown detection.
btol = eps(T)^(3/4)
@@ -168,7 +184,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Initialize workspace.
nr = 0 # Number of coefficients stored in Rₖ.
for i = 1 : mem
- V[i] .= zero(FC) # Orthogonal basis of Kₖ(M⁻¹AN⁻¹, M⁻¹r₀).
+ V[i] .= zero(FC) # Orthogonal basis of Kₖ(MAN, Mr₀).
end
s .= zero(FC) # Givens sines used for the factorization QₖRₖ = Hₖ₊₁.ₖ.
c .= zero(T) # Givens cosines used for the factorization QₖRₖ = Hₖ₊₁.ₖ.
@@ -210,11 +226,11 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Continue the Arnoldi process.
p = NisI ? V[inner_iter] : solver.p
- NisI || mulorldiv!(p, N, V[inner_iter], ldiv) # p ← N⁻¹vₖ
- mul!(w, A, p) # w ← AN⁻¹vₖ
- MisI || mulorldiv!(q, M, w, ldiv) # q ← M⁻¹AN⁻¹vₖ
+ NisI || mulorldiv!(p, N, V[inner_iter], ldiv) # p ← Nvₖ
+ mul!(w, A, p) # w ← ANvₖ
+ MisI || mulorldiv!(q, M, w, ldiv) # q ← MANvₖ
for i = 1 : inner_iter
- R[nr+i] = @kdot(n, V[i], q) # hᵢₖ = qᵀvᵢ
+ R[nr+i] = @kdot(n, V[i], q) # hᵢₖ = (vᵢ)ᴴq
@kaxpy!(n, -R[nr+i], V[i], q) # q ← q - hᵢₖvᵢ
end
@@ -245,12 +261,12 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# [s̄ₖ -cₖ] [hₖ₊₁.ₖ] [ 0 ]
(c[inner_iter], s[inner_iter], R[nr+inner_iter]) = sym_givens(R[nr+inner_iter], Hbis)
- # Update zₖ = (Qₖ)ᵀβe₁
+ # Update zₖ = (Qₖ)ᴴβe₁
ζₖ₊₁ = conj(s[inner_iter]) * z[inner_iter]
z[inner_iter] = c[inner_iter] * z[inner_iter]
# Update residual norm estimate.
- # ‖ M⁻¹(b - Axₖ) ‖₂ = |ζₖ₊₁|
+ # ‖ M(b - Axₖ) ‖₂ = |ζₖ₊₁|
rNorm = abs(ζₖ₊₁)
history && push!(rNorms, rNorm)
@@ -267,7 +283,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
solved = resid_decrease_lim || resid_decrease_mach
inner_tired = restart ? inner_iter ≥ min(mem, inner_itmax) : inner_iter ≥ inner_itmax
solver.inner_iter = inner_iter
- kdisplay(iter+inner_iter, verbose) && @printf("%5d %5d %7.1e %7.1e\n", npass, iter+inner_iter, rNorm, Hbis)
+ kdisplay(iter+inner_iter, verbose) && @printf(iostream, "%5d %5d %7.1e %7.1e\n", npass, iter+inner_iter, rNorm, Hbis)
# Compute vₖ₊₁
if !(solved || inner_tired || breakdown)
@@ -299,7 +315,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
end
end
- # Form xₖ = N⁻¹Vₖyₖ
+ # Form xₖ = NVₖyₖ
for i = 1 : inner_iter
@kaxpy!(n, y[i], V[i], xr)
end
@@ -314,7 +330,7 @@ function gmres!(solver :: GmresSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = iter + inner_iter
tired = iter ≥ itmax
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
diff --git a/src/gpmr.jl b/src/gpmr.jl
index b10942995..958d2977c 100644
--- a/src/gpmr.jl
+++ b/src/gpmr.jl
@@ -12,23 +12,30 @@
export gpmr, gpmr!
"""
- (x, y, stats) = gpmr(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}; memory::Int=20,
- C=I, D=I, E=I, F=I, atol::T=√eps(T), rtol::T=√eps(T),
- gsp::Bool=false, reorthogonalization::Bool=false,
- itmax::Int=0, λ::FC=one(FC), μ::FC=one(FC),
+ (x, y, stats) = gpmr(A, B, b::AbstractVector{FC}, c::AbstractVector{FC};
+ memory::Int=20, C=I, D=I, E=I, F=I,
+ ldiv::Bool=false, gsp::Bool=false,
+ λ::FC=one(FC), μ::FC=one(FC),
+ reorthogonalization::Bool=false, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-GPMR solves the unsymmetric partitioned linear system
+ (x, y, stats) = gpmr(A, B, b, c, x0::AbstractVector, y0::AbstractVector; kwargs...)
- [ λI A ] [ x ] = [ b ]
- [ B μI ] [ y ] [ c ],
+GPMR can be warm-started from initial guesses `x0` and `y0` where `kwargs` are the same keyword arguments as above.
-where λ and μ are real or complex numbers.
-`A` can have any shape and `B` has the shape of `Aᵀ`.
+Given matrices `A` of dimension m × n and `B` of dimension n × m,
+GPMR solves the non-Hermitian partitioned linear system
+
+ [ λIₘ A ] [ x ] = [ b ]
+ [ B μIₙ ] [ y ] [ c ],
+
+of size (n+m) × (n+m) where λ and μ are real or complex numbers.
+`A` can have any shape and `B` has the shape of `Aᴴ`.
`A`, `B`, `b` and `c` must all be nonzero.
This implementation allows left and right block diagonal preconditioners
@@ -44,8 +51,6 @@ and can solve
when `CE = M⁻¹` and `DF = N⁻¹`.
By default, GPMR solves unsymmetric linear systems with `λ = 1` and `μ = 1`.
-If `gsp = true`, `λ = 1`, `μ = 0` and the associated generalized saddle point system is solved.
-`λ` and `μ` are also keyword arguments that can be directly modified for more specific problems.
GPMR is based on the orthogonal Hessenberg reduction process and its relations with the block-Arnoldi process.
The residual norm ‖rₖ‖ is monotonically decreasing in GPMR.
@@ -53,19 +58,42 @@ The residual norm ‖rₖ‖ is monotonically decreasing in GPMR.
GPMR stops when `itmax` iterations are reached or when `‖rₖ‖ ≤ atol + ‖r₀‖ * rtol`.
`atol` is an absolute tolerance and `rtol` is a relative tolerance.
-Full reorthogonalization is available with the `reorthogonalization` option.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a linear operator that models a matrix of dimension n × m;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
+
+#### Optional arguments
-Additional details can be displayed if verbose mode is enabled (verbose > 0).
-Information will be displayed every `verbose` iterations.
+* `x0`: a vector of length m that represents an initial guess of the solution x;
+* `y0`: a vector of length n that represents an initial guess of the solution y.
-GPMR can be warm-started from initial guesses `x0` and `y0` with the method
+#### Keyword arguments
- (x, y, stats) = gpmr(A, B, b, c, x0, y0; kwargs...)
+* `memory`: the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations. Additional storage will be allocated if the number of iterations exceeds `memory`;
+* `C`: linear operator that models a nonsingular matrix of size `m`, and represents the first term of the block-diagonal left preconditioner;
+* `D`: linear operator that models a nonsingular matrix of size `n`, and represents the second term of the block-diagonal left preconditioner;
+* `E`: linear operator that models a nonsingular matrix of size `m`, and represents the first term of the block-diagonal right preconditioner;
+* `F`: linear operator that models a nonsingular matrix of size `n`, and represents the second term of the block-diagonal right preconditioner;
+* `ldiv`: determine whether the preconditioners are applied with `ldiv!` or `mul!`;
+* `gsp`: if `true`, set `λ = 1` and `μ = 0` for generalized saddle-point systems;
+* `λ` and `μ`: diagonal scaling factors of the partitioned linear system;
+* `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details are displayed if verbose mode is enabled (`verbose > 0`); information is printed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length m;
+* `y`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
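+
+#### Example
+
+A minimal usage sketch (the random blocks below are illustrative only):
+
+    using Krylov, LinearAlgebra
+
+    m, n = 60, 40
+    A = rand(m, n)
+    B = rand(n, m)
+    b = rand(m)
+    c = rand(n)
+    x, y, stats = gpmr(A, B, b, c)  # solves [Iₘ A; B Iₙ] [x; y] = [b; c]
+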
#### Reference
@@ -106,11 +134,13 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
end
function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- C=I, D=I, E=I, F=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- gsp :: Bool=false, reorthogonalization :: Bool=false,
- itmax :: Int=0, λ :: FC=one(FC), μ :: FC=one(FC),
+ C=I, D=I, E=I, F=I,
+ ldiv :: Bool=false, gsp :: Bool=false,
+ λ :: FC=one(FC), μ :: FC=one(FC),
+ reorthogonalization :: Bool=false, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history::Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
s, t = size(B)
@@ -118,7 +148,7 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
s == n || error("Inconsistent problem size")
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("GPMR: system of %d equations in %d variables\n", m+n, m+n)
+ (verbose > 0) && @printf(iostream, "GPMR: system of %d equations in %d variables\n", m+n, m+n)
# Check C = E = Iₘ and D = F = Iₙ
CisI = (C === I)
@@ -129,8 +159,8 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
eltype(B) == FC || error("eltype(B) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Determine λ and μ associated to generalized saddle point systems.
gsp && (λ = one(FC) ; μ = zero(FC))
@@ -172,7 +202,7 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
gs .= zero(FC) # Givens sines used for the factorization QₖRₖ = Sₖ₊₁.ₖ.
gc .= zero(T) # Givens cosines used for the factorization QₖRₖ = Sₖ₊₁.ₖ.
R .= zero(FC) # Upper triangular matrix Rₖ.
- zt .= zero(FC) # Rₖzₖ = tₖ with (tₖ, τbar₂ₖ₊₁, τbar₂ₖ₊₂) = (Qₖ)ᵀ(βe₁ + γe₂).
+ zt .= zero(FC) # Rₖzₖ = tₖ with (tₖ, τbar₂ₖ₊₁, τbar₂ₖ₊₂) = (Qₖ)ᴴ(βe₁ + γe₂).
# Warm-start
# If λ ≠ 0, Cb₀ = Cb - CAΔy - λΔx because CM = Iₘ and E = Iₘ
@@ -213,8 +243,8 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
zt[1] = β
zt[2] = γ
- (verbose > 0) && @printf("%5s %7s %7s %7s\n", "k", "‖rₖ‖", "hₖ₊₁.ₖ", "fₖ₊₁.ₖ")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7s %7s\n", iter, rNorm, "✗ ✗ ✗ ✗", "✗ ✗ ✗ ✗")
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s\n", "k", "‖rₖ‖", "hₖ₊₁.ₖ", "fₖ₊₁.ₖ")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7s %7s\n", iter, rNorm, "✗ ✗ ✗ ✗", "✗ ✗ ✗ ✗")
# Tolerance for breakdown detection.
btol = eps(T)^(3/4)
@@ -259,8 +289,8 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
DisI || mulorldiv!(p, D, dB, ldiv) # p = DBEvₖ
for i = 1 : iter
- hᵢₖ = @kdot(m, V[i], q) # hᵢ.ₖ = vᵢAuₖ
- fᵢₖ = @kdot(n, U[i], p) # fᵢ.ₖ = uᵢBvₖ
+ hᵢₖ = @kdot(m, V[i], q) # hᵢ.ₖ = (vᵢ)ᴴq
+ fᵢₖ = @kdot(n, U[i], p) # fᵢ.ₖ = (uᵢ)ᴴp
@kaxpy!(m, -hᵢₖ, V[i], q) # q ← q - hᵢ.ₖvᵢ
@kaxpy!(n, -fᵢₖ, U[i], p) # p ← p - fᵢ.ₖuᵢ
R[nr₂ₖ + 2i-1] = hᵢₖ
@@ -270,8 +300,8 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
# Reorthogonalization of the Krylov basis.
if reorthogonalization
for i = 1 : iter
- Htmp = @kdot(m, V[i], q) # hₜₘₚ = qᵀvᵢ
- Ftmp = @kdot(n, U[i], p) # fₜₘₚ = pᵀuᵢ
+ Htmp = @kdot(m, V[i], q) # hₜₘₚ = (vᵢ)ᴴq
+ Ftmp = @kdot(n, U[i], p) # fₜₘₚ = (uᵢ)ᴴp
@kaxpy!(m, -Htmp, V[i], q) # q ← q - hₜₘₚvᵢ
@kaxpy!(n, -Ftmp, U[i], p) # p ← p - fₜₘₚuᵢ
R[nr₂ₖ + 2i-1] += Htmp # hᵢ.ₖ = hᵢ.ₖ + hₜₘₚ
@@ -400,7 +430,7 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
breakdown = Faux ≤ btol && Haux ≤ btol
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e\n", iter, rNorm, Haux, Faux)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e\n", iter, rNorm, Haux, Faux)
# Compute vₖ₊₁ and uₖ₊₁
if !(solved || tired || breakdown || user_requested_exit)
@@ -430,7 +460,7 @@ function gpmr!(solver :: GpmrSolver{T,FC,S}, A, B, b :: AbstractVector{FC}, c ::
zt[2k+2] = τbar₂ₖ₊₂
end
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute zₖ = (ζ₁, ..., ζ₂ₖ) by solving Rₖzₖ = tₖ with backward substitution.
for i = 2iter : -1 : 1
diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl
new file mode 100644
index 000000000..2be66b1c5
--- /dev/null
+++ b/src/krylov_processes.jl
@@ -0,0 +1,439 @@
+export hermitian_lanczos, nonhermitian_lanczos, arnoldi, golub_kahan, saunders_simon_yip, montoison_orban
+
+"""
+ V, T = hermitian_lanczos(A, b, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n;
+* `k`: the number of iterations of the Hermitian Lanczos process.
+
+#### Output arguments
+
+* `V`: a dense n × (k+1) matrix;
+* `T`: a sparse (k+1) × k tridiagonal matrix.
+
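+#### Example
+
+A minimal usage sketch (random Hermitian data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    n, k = 50, 10
+    A = rand(n, n); A = A + A'  # symmetric, hence Hermitian
+    b = rand(n)
+    V, T = hermitian_lanczos(A, b, k)
+    norm(A * V[:,1:k] - V * T)  # ≈ 0: the Lanczos relation AVₖ = Vₖ₊₁Tₖ₊₁.ₖ
+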
+#### Reference
+
+* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 255--282, 1950.
+"""
+function hermitian_lanczos(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ R = real(FC)
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ colptr = zeros(Int, k+1)
+ rowval = zeros(Int, 3k-1)
+ nzval = zeros(R, 3k-1)
+
+ colptr[1] = 1
+ rowval[1] = 1
+ rowval[2] = 2
+ for i = 1:k
+ colptr[i+1] = 3i
+ if i ≥ 2
+ pos = colptr[i]
+ rowval[pos] = i-1
+ rowval[pos+1] = i
+ rowval[pos+2] = i+1
+ end
+ end
+
+ V = M(undef, n, k+1)
+ T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval)
+
+ pαᵢ = 1 # Position of αᵢ in the vector `nzval`
+ for i = 1:k
+ vᵢ = view(V,:,i)
+ vᵢ₊₁ = q = view(V,:,i+1)
+ if i == 1
+ βᵢ = @knrm2(n, b)
+ vᵢ .= b ./ βᵢ
+ end
+ mul!(q, A, vᵢ)
+ αᵢ = @kdotr(n, vᵢ, q)
+ nzval[pαᵢ] = αᵢ # Tᵢ.ᵢ = αᵢ
+ @kaxpy!(n, -αᵢ, vᵢ, q)
+ if i ≥ 2
+ vᵢ₋₁ = view(V,:,i-1)
+ βᵢ = nzval[pαᵢ-2] # βᵢ = Tᵢ.ᵢ₋₁
+ nzval[pαᵢ-1] = βᵢ # Tᵢ₋₁.ᵢ = βᵢ
+ @kaxpy!(n, -βᵢ, vᵢ₋₁, q)
+ end
+ βᵢ₊₁ = @knrm2(n, q)
+ nzval[pαᵢ+1] = βᵢ₊₁ # Tᵢ₊₁.ᵢ = βᵢ₊₁
+ vᵢ₊₁ .= q ./ βᵢ₊₁
+ pαᵢ = pαᵢ + 3
+ end
+ return V, T
+end
+
+"""
+ V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `b`: a vector of length n;
+* `c`: a vector of length n;
+* `k`: the number of iterations of the non-Hermitian Lanczos process.
+
+#### Output arguments
+
+* `V`: a dense n × (k+1) matrix;
+* `T`: a sparse (k+1) × k tridiagonal matrix;
+* `U`: a dense n × (k+1) matrix;
+* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix.
+
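+#### Example
+
+A minimal usage sketch (random data, for illustration only; the process can break down for unlucky `b` and `c`):
+
+    using Krylov, LinearAlgebra
+
+    n, k = 50, 10
+    A = rand(n, n)
+    b = rand(n); c = rand(n)
+    V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k)
+    norm(A * V[:,1:k] - V * T)    # ≈ 0: AVₖ = Vₖ₊₁Tₖ₊₁.ₖ
+    norm(A' * U[:,1:k] - U * Tᴴ)  # ≈ 0: AᴴUₖ = Uₖ₊₁(Tᴴ)ₖ₊₁.ₖ
+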
+#### Reference
+
+* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 255--282, 1950.
+"""
+function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ Aᴴ = A'
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ colptr = zeros(Int, k+1)
+ rowval = zeros(Int, 3k-1)
+ nzval_T = zeros(FC, 3k-1)
+ nzval_Tᴴ = zeros(FC, 3k-1)
+
+ colptr[1] = 1
+ rowval[1] = 1
+ rowval[2] = 2
+ for i = 1:k
+ colptr[i+1] = 3i
+ if i ≥ 2
+ pos = colptr[i]
+ rowval[pos] = i-1
+ rowval[pos+1] = i
+ rowval[pos+2] = i+1
+ end
+ end
+
+ V = M(undef, n, k+1)
+ U = M(undef, n, k+1)
+ T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
+ Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ)
+
+ pαᵢ = 1 # Position of αᵢ and ᾱᵢ in the vectors `nzval_T` and `nzval_Tᴴ`
+ for i = 1:k
+ vᵢ = view(V,:,i)
+ uᵢ = view(U,:,i)
+ vᵢ₊₁ = q = view(V,:,i+1)
+ uᵢ₊₁ = p = view(U,:,i+1)
+ if i == 1
+ cᴴb = @kdot(n, c, b)
+ βᵢ = √(abs(cᴴb))
+ γᵢ = cᴴb / βᵢ
+ vᵢ .= b ./ βᵢ
+ uᵢ .= c ./ conj(γᵢ)
+ end
+ mul!(q, A , vᵢ)
+ mul!(p, Aᴴ, uᵢ)
+ if i ≥ 2
+ vᵢ₋₁ = view(V,:,i-1)
+ uᵢ₋₁ = view(U,:,i-1)
+ βᵢ = nzval_T[pαᵢ-2] # βᵢ = Tᵢ.ᵢ₋₁
+ γᵢ = nzval_T[pαᵢ-1] # γᵢ = Tᵢ₋₁.ᵢ
+ @kaxpy!(n, - γᵢ , vᵢ₋₁, q)
+ @kaxpy!(n, -conj(βᵢ), uᵢ₋₁, p)
+ end
+ αᵢ = @kdot(n, uᵢ, q)
+ nzval_T[pαᵢ] = αᵢ # Tᵢ.ᵢ = αᵢ
+ nzval_Tᴴ[pαᵢ] = conj(αᵢ) # Tᴴᵢ.ᵢ = ᾱᵢ
+ @kaxpy!(m, - αᵢ , vᵢ, q)
+ @kaxpy!(n, -conj(αᵢ), uᵢ, p)
+ pᴴq = @kdot(n, p, q)
+ βᵢ₊₁ = √(abs(pᴴq))
+ γᵢ₊₁ = pᴴq / βᵢ₊₁
+ vᵢ₊₁ .= q ./ βᵢ₊₁
+ uᵢ₊₁ .= p ./ conj(γᵢ₊₁)
+ nzval_T[pαᵢ+1] = βᵢ₊₁ # Tᵢ₊₁.ᵢ = βᵢ₊₁
+ nzval_Tᴴ[pαᵢ+1] = conj(γᵢ₊₁) # Tᴴᵢ₊₁.ᵢ = γ̄ᵢ₊₁
+ if i ≤ k-1
+ nzval_T[pαᵢ+2] = γᵢ₊₁ # Tᵢ.ᵢ₊₁ = γᵢ₊₁
+ nzval_Tᴴ[pαᵢ+2] = conj(βᵢ₊₁) # Tᴴᵢ.ᵢ₊₁ = β̄ᵢ₊₁
+ end
+ pαᵢ = pαᵢ + 3
+ end
+ return V, T, U, Tᴴ
+end
+
+"""
+ V, H = arnoldi(A, b, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `b`: a vector of length n;
+* `k`: the number of iterations of the Arnoldi process.
+
+#### Output arguments
+
+* `V`: a dense n × (k+1) matrix;
+* `H`: a dense (k+1) × k upper Hessenberg matrix.
+
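+#### Example
+
+A minimal usage sketch (random data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    n, k = 50, 10
+    A = rand(n, n)
+    b = rand(n)
+    V, H = arnoldi(A, b, k)
+    norm(A * V[:,1:k] - V * H)  # ≈ 0: the Arnoldi relation AVₖ = Vₖ₊₁Hₖ₊₁.ₖ
+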
+#### Reference
+
+* W. E. Arnoldi, [*The principle of minimized iterations in the solution of the matrix eigenvalue problem*](https://doi.org/10.1090/qam/42792), Quarterly of Applied Mathematics, 9, pp. 17--29, 1951.
+"""
+function arnoldi(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ V = M(undef, n, k+1)
+ H = zeros(FC, k+1, k)
+
+ for i = 1:k
+ vᵢ = view(V,:,i)
+ vᵢ₊₁ = q = view(V,:,i+1)
+ if i == 1
+ β = @knrm2(n, b)
+ vᵢ .= b ./ β
+ end
+ mul!(q, A, vᵢ)
+ for j = 1:i
+ vⱼ = view(V,:,j)
+ H[j,i] = @kdot(n, vⱼ, q)
+ @kaxpy!(n, -H[j,i], vⱼ, q)
+ end
+ H[i+1,i] = @knrm2(n, q)
+ vᵢ₊₁ .= q ./ H[i+1,i]
+ end
+ return V, H
+end
+
+"""
+ V, U, L = golub_kahan(A, b, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `k`: the number of iterations of the Golub-Kahan process.
+
+#### Output arguments
+
+* `V`: a dense n × (k+1) matrix;
+* `U`: a dense m × (k+1) matrix;
+* `L`: a sparse (k+1) × (k+1) lower bidiagonal matrix.
+
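+#### Example
+
+A minimal usage sketch (random rectangular data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    m, n, k = 60, 40, 10
+    A = rand(m, n)
+    b = rand(m)
+    V, U, L = golub_kahan(A, b, k)
+    norm(A * V[:,1:k] - U * L[:,1:k])  # ≈ 0: AVₖ = Uₖ₊₁Lₖ₊₁.ₖ
+    norm(A' * U - V * L')              # ≈ 0: AᴴUₖ₊₁ = Vₖ₊₁Lₖ₊₁ᴴ
+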
+#### References
+
+* G. H. Golub and W. Kahan, [*Calculating the Singular Values and Pseudo-Inverse of a Matrix*](https://doi.org/10.1137/0702016), SIAM Journal on Numerical Analysis, 2(2), pp. 205--224, 1965.
+* C. C. Paige, [*Bidiagonalization of Matrices and Solution of Linear Equations*](https://doi.org/10.1137/0711019), SIAM Journal on Numerical Analysis, 11(1), pp. 197--209, 1974.
+"""
+function golub_kahan(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ R = real(FC)
+ Aᴴ = A'
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ colptr = zeros(Int, k+2)
+ rowval = zeros(Int, 2k+1)
+ nzval = zeros(R, 2k+1)
+
+ colptr[1] = 1
+ for i = 1:k
+ pos = colptr[i]
+ colptr[i+1] = pos+2
+ rowval[pos] = i
+ rowval[pos+1] = i+1
+ end
+ rowval[2k+1] = k+1
+ colptr[k+2] = 2k+2
+
+ V = M(undef, n, k+1)
+ U = M(undef, m, k+1)
+ L = SparseMatrixCSC(k+1, k+1, colptr, rowval, nzval)
+
+ pαᵢ = 1 # Position of αᵢ in the vector `nzval`
+ for i = 1:k
+ uᵢ = view(U,:,i)
+ vᵢ = view(V,:,i)
+ uᵢ₊₁ = q = view(U,:,i+1)
+ vᵢ₊₁ = p = view(V,:,i+1)
+ if i == 1
+ wᵢ = vᵢ
+ βᵢ = @knrm2(m, b)
+ uᵢ .= b ./ βᵢ
+ mul!(wᵢ, Aᴴ, uᵢ)
+ αᵢ = @knrm2(n, wᵢ)
+ nzval[pαᵢ] = αᵢ # Lᵢ.ᵢ = αᵢ
+ vᵢ .= wᵢ ./ αᵢ
+ end
+ mul!(q, A, vᵢ)
+ αᵢ = nzval[pαᵢ] # αᵢ = Lᵢ.ᵢ
+ @kaxpy!(m, -αᵢ, uᵢ, q)
+ βᵢ₊₁ = @knrm2(m, q)
+ uᵢ₊₁ .= q ./ βᵢ₊₁
+ mul!(p, Aᴴ, uᵢ₊₁)
+ @kaxpy!(n, -βᵢ₊₁, vᵢ, p)
+ αᵢ₊₁ = @knrm2(n, p)
+ vᵢ₊₁ .= p ./ αᵢ₊₁
+ nzval[pαᵢ+1] = βᵢ₊₁ # Lᵢ₊₁.ᵢ = βᵢ₊₁
+ nzval[pαᵢ+2] = αᵢ₊₁ # Lᵢ₊₁.ᵢ₊₁ = αᵢ₊₁
+ pαᵢ = pαᵢ + 2
+ end
+ return V, U, L
+end
+
+"""
+ V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n;
+* `k`: the number of iterations of the Saunders-Simon-Yip process.
+
+#### Output arguments
+
+* `V`: a dense m × (k+1) matrix;
+* `T`: a sparse (k+1) × k tridiagonal matrix;
+* `U`: a dense n × (k+1) matrix;
+* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix.
+
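+#### Example
+
+A minimal usage sketch (random rectangular data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    m, n, k = 60, 40, 10
+    A = rand(m, n)
+    b = rand(m); c = rand(n)
+    V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k)
+    norm(A * U[:,1:k] - V * T)    # ≈ 0: AUₖ = Vₖ₊₁Tₖ₊₁.ₖ
+    norm(A' * V[:,1:k] - U * Tᴴ)  # ≈ 0: AᴴVₖ = Uₖ₊₁(Tᴴ)ₖ₊₁.ₖ
+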
+#### Reference
+
+* M. A. Saunders, H. D. Simon, and E. L. Yip, [*Two Conjugate-Gradient-Type Methods for Unsymmetric Linear Equations*](https://doi.org/10.1137/0725052), SIAM Journal on Numerical Analysis, 25(4), pp. 927--940, 1988.
+"""
+function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ Aᴴ = A'
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ colptr = zeros(Int, k+1)
+ rowval = zeros(Int, 3k-1)
+ nzval_T = zeros(FC, 3k-1)
+ nzval_Tᴴ = zeros(FC, 3k-1)
+
+ colptr[1] = 1
+ rowval[1] = 1
+ rowval[2] = 2
+ for i = 1:k
+ colptr[i+1] = 3i
+ if i ≥ 2
+ pos = colptr[i]
+ rowval[pos] = i-1
+ rowval[pos+1] = i
+ rowval[pos+2] = i+1
+ end
+ end
+
+ V = M(undef, m, k+1)
+ U = M(undef, n, k+1)
+ T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
+ Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ)
+
+ pαᵢ = 1 # Position of αᵢ and ᾱᵢ in the vectors `nzval_T` and `nzval_Tᴴ`
+ for i = 1:k
+ vᵢ = view(V,:,i)
+ uᵢ = view(U,:,i)
+ vᵢ₊₁ = q = view(V,:,i+1)
+ uᵢ₊₁ = p = view(U,:,i+1)
+ if i == 1
+ β = @knrm2(m, b)
+ γ = @knrm2(n, c)
+ vᵢ .= b ./ β
+ uᵢ .= c ./ γ
+ end
+ mul!(q, A , uᵢ)
+ mul!(p, Aᴴ, vᵢ)
+ if i ≥ 2
+ vᵢ₋₁ = view(V,:,i-1)
+ uᵢ₋₁ = view(U,:,i-1)
+ βᵢ = nzval_T[pαᵢ-2] # βᵢ = Tᵢ.ᵢ₋₁
+ γᵢ = nzval_T[pαᵢ-1] # γᵢ = Tᵢ₋₁.ᵢ
+ @kaxpy!(m, -γᵢ, vᵢ₋₁, q)
+ @kaxpy!(n, -βᵢ, uᵢ₋₁, p)
+ end
+ αᵢ = @kdot(m, vᵢ, q)
+ nzval_T[pαᵢ] = αᵢ # Tᵢ.ᵢ = αᵢ
+ nzval_Tᴴ[pαᵢ] = conj(αᵢ) # Tᴴᵢ.ᵢ = ᾱᵢ
+ @kaxpy!(m, - αᵢ , vᵢ, q)
+ @kaxpy!(n, -conj(αᵢ), uᵢ, p)
+ βᵢ₊₁ = @knrm2(m, q)
+ γᵢ₊₁ = @knrm2(n, p)
+ vᵢ₊₁ .= q ./ βᵢ₊₁
+ uᵢ₊₁ .= p ./ γᵢ₊₁
+ nzval_T[pαᵢ+1] = βᵢ₊₁ # Tᵢ₊₁.ᵢ = βᵢ₊₁
+ nzval_Tᴴ[pαᵢ+1] = γᵢ₊₁ # Tᴴᵢ₊₁.ᵢ = γᵢ₊₁
+ if i ≤ k-1
+ nzval_T[pαᵢ+2] = γᵢ₊₁ # Tᵢ.ᵢ₊₁ = γᵢ₊₁
+ nzval_Tᴴ[pαᵢ+2] = βᵢ₊₁ # Tᴴᵢ.ᵢ₊₁ = βᵢ₊₁
+ end
+ pαᵢ = pαᵢ + 3
+ end
+ return V, T, U, Tᴴ
+end
+
+"""
+ V, H, U, F = montoison_orban(A, B, b, c, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a linear operator that models a matrix of dimension n × m;
+* `b`: a vector of length m;
+* `c`: a vector of length n;
+* `k`: the number of iterations of the Montoison-Orban process.
+
+#### Output arguments
+
+* `V`: a dense m × (k+1) matrix;
+* `H`: a dense (k+1) × k upper Hessenberg matrix;
+* `U`: a dense n × (k+1) matrix;
+* `F`: a dense (k+1) × k upper Hessenberg matrix.
+
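+#### Example
+
+A minimal usage sketch (random rectangular data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    m, n, k = 60, 40, 10
+    A = rand(m, n); B = rand(n, m)
+    b = rand(m); c = rand(n)
+    V, H, U, F = montoison_orban(A, B, b, c, k)
+    norm(A * U[:,1:k] - V * H)  # ≈ 0: AUₖ = Vₖ₊₁Hₖ₊₁.ₖ
+    norm(B * V[:,1:k] - U * F)  # ≈ 0: BVₖ = Uₖ₊₁Fₖ₊₁.ₖ
+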
+#### Reference
+
+* A. Montoison and D. Orban, [*GPMR: An Iterative Method for Unsymmetric Partitioned Linear Systems*](https://dx.doi.org/10.13140/RG.2.2.24069.68326), Cahier du GERAD G-2021-62, GERAD, Montréal, 2021.
+"""
+function montoison_orban(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+ m, n = size(A)
+ S = ktypeof(b)
+ M = vector_to_matrix(S)
+
+ V = M(undef, m, k+1)
+ U = M(undef, n, k+1)
+ H = zeros(FC, k+1, k)
+ F = zeros(FC, k+1, k)
+
+ for i = 1:k
+ vᵢ = view(V,:,i)
+ uᵢ = view(U,:,i)
+ vᵢ₊₁ = q = view(V,:,i+1)
+ uᵢ₊₁ = p = view(U,:,i+1)
+ if i == 1
+ β = @knrm2(m, b)
+ γ = @knrm2(n, c)
+ vᵢ .= b ./ β
+ uᵢ .= c ./ γ
+ end
+ mul!(q, A, uᵢ)
+ mul!(p, B, vᵢ)
+ for j = 1:i
+ vⱼ = view(V,:,j)
+ uⱼ = view(U,:,j)
+ H[j,i] = @kdot(m, vⱼ, q)
+ @kaxpy!(m, -H[j,i], vⱼ, q)  # vⱼ and q have length m
+ F[j,i] = @kdot(n, uⱼ, p)
+ @kaxpy!(n, -F[j,i], uⱼ, p)  # uⱼ and p have length n
+ end
+ H[i+1,i] = @knrm2(m, q)
+ vᵢ₊₁ .= q ./ H[i+1,i]
+ F[i+1,i] = @knrm2(n, p)
+ uᵢ₊₁ .= p ./ F[i+1,i]
+ end
+ return V, H, U, F
+end
diff --git a/src/krylov_solvers.jl b/src/krylov_solvers.jl
index 8a109a2be..bd2bc8a0e 100644
--- a/src/krylov_solvers.jl
+++ b/src/krylov_solvers.jl
@@ -3,11 +3,13 @@ CgLanczosShiftSolver, MinresQlpSolver, DqgmresSolver, DiomSolver, UsymlqSolver,
UsymqrSolver, TricgSolver, TrimrSolver, TrilqrSolver, CgsSolver, BicgstabSolver,
BilqSolver, QmrSolver, BilqrSolver, CglsSolver, CrlsSolver, CgneSolver, CrmrSolver,
LslqSolver, LsqrSolver, LsmrSolver, LnlqSolver, CraigSolver, CraigmrSolver,
-GmresSolver, FomSolver, GpmrSolver
+GmresSolver, FomSolver, GpmrSolver, FgmresSolver
export solve!, solution, nsolution, statistics, issolved, issolved_primal, issolved_dual,
niterations, Aprod, Atprod, Bprod, warm_start!
+import Base.size, Base.sizeof, Base.format_bytes
+
const KRYLOV_SOLVERS = Dict(
:cg => :CgSolver ,
:cr => :CrSolver ,
@@ -20,6 +22,7 @@ const KRYLOV_SOLVERS = Dict(
:fom => :FomSolver ,
:dqgmres => :DqgmresSolver ,
:gmres => :GmresSolver ,
+ :fgmres => :FgmresSolver ,
:gpmr => :GpmrSolver ,
:usymlq => :UsymlqSolver ,
:usymqr => :UsymqrSolver ,
@@ -51,12 +54,14 @@ Type for storing the vectors required by the in-place version of MINRES.
The outer constructors
- solver = MinresSolver(n, m, S; window :: Int=5)
+ solver = MinresSolver(m, n, S; window :: Int=5)
solver = MinresSolver(A, b; window :: Int=5)
may be used in order to create these vectors.
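+
+For instance, the workspace can be reused across solves with the in-place method
+`minres!` (a usage sketch with random symmetric data, for illustration only):
+
+    using Krylov, LinearAlgebra
+
+    n = 100
+    A = rand(n, n); A = A + A'  # symmetric matrix
+    b = rand(n)
+    solver = MinresSolver(A, b)
+    minres!(solver, A, b)       # reuses the storage held by `solver`
+    solver.stats.solved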
"""
mutable struct MinresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
r1 :: S
@@ -68,29 +73,29 @@ mutable struct MinresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
err_vec :: Vector{T}
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function MinresSolver(n, m, S; window :: Int=5)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- r1 = S(undef, n)
- r2 = S(undef, n)
- w1 = S(undef, n)
- w2 = S(undef, n)
- y = S(undef, n)
- v = S(undef, 0)
- err_vec = zeros(T, window)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, r1, r2, w1, w2, y, v, err_vec, false, stats)
- return solver
- end
+function MinresSolver(m, n, S; window :: Int=5)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ r1 = S(undef, n)
+ r2 = S(undef, n)
+ w1 = S(undef, n)
+ w2 = S(undef, n)
+ y = S(undef, n)
+ v = S(undef, 0)
+ err_vec = zeros(T, window)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = MinresSolver{T,FC,S}(m, n, Δx, x, r1, r2, w1, w2, y, v, err_vec, false, stats)
+ return solver
+end
- function MinresSolver(A, b; window :: Int=5)
- n, m = size(A)
- S = ktypeof(b)
- MinresSolver(n, m, S, window=window)
- end
+function MinresSolver(A, b; window :: Int=5)
+ m, n = size(A)
+ S = ktypeof(b)
+ MinresSolver(m, n, S, window=window)
end
"""
@@ -98,12 +103,14 @@ Type for storing the vectors required by the in-place version of CG.
The outer constructors
- solver = CgSolver(n, m, S)
+ solver = CgSolver(m, n, S)
solver = CgSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CgSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
r :: S
@@ -112,26 +119,26 @@ mutable struct CgSolver{T,FC,S} <: KrylovSolver{T,FC,S}
z :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function CgSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- r = S(undef, n)
- p = S(undef, n)
- Ap = S(undef, n)
- z = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, r, p, Ap, z, false, stats)
- return solver
- end
+function CgSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ r = S(undef, n)
+ p = S(undef, n)
+ Ap = S(undef, n)
+ z = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CgSolver{T,FC,S}(m, n, Δx, x, r, p, Ap, z, false, stats)
+ return solver
+end
- function CgSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CgSolver(n, m, S)
- end
+function CgSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CgSolver(m, n, S)
end
"""
@@ -139,12 +146,14 @@ Type for storing the vectors required by the in-place version of CR.
The outer constructors
- solver = CrSolver(n, m, S)
+ solver = CrSolver(m, n, S)
solver = CrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
r :: S
@@ -154,27 +163,27 @@ mutable struct CrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
Mq :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function CrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- r = S(undef, n)
- p = S(undef, n)
- q = S(undef, n)
- Ar = S(undef, n)
- Mq = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, r, p, q, Ar, Mq, false, stats)
- return solver
- end
+function CrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ r = S(undef, n)
+ p = S(undef, n)
+ q = S(undef, n)
+ Ar = S(undef, n)
+ Mq = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CrSolver{T,FC,S}(m, n, Δx, x, r, p, q, Ar, Mq, false, stats)
+ return solver
+end
- function CrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CrSolver(n, m, S)
- end
+function CrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CrSolver(m, n, S)
end
"""
@@ -182,12 +191,14 @@ Type for storing the vectors required by the in-place version of SYMMLQ.
The outer constructors
- solver = SymmlqSolver(n, m, S)
+ solver = SymmlqSolver(m, n, S)
solver = SymmlqSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct SymmlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
Mvold :: S
@@ -200,30 +211,30 @@ mutable struct SymmlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
sprod :: Vector{T}
warm_start :: Bool
stats :: SymmlqStats{T}
+end
- function SymmlqSolver(n, m, S; window :: Int=5)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- Mvold = S(undef, n)
- Mv = S(undef, n)
- Mv_next = S(undef, n)
- w̅ = S(undef, n)
- v = S(undef, 0)
- clist = zeros(T, window)
- zlist = zeros(T, window)
- sprod = ones(T, window)
- stats = SymmlqStats(0, false, T[], Union{T, Missing}[], T[], Union{T, Missing}[], T(NaN), T(NaN), "unknown")
- solver = new{T,FC,S}(Δx, x, Mvold, Mv, Mv_next, w̅, v, clist, zlist, sprod, false, stats)
- return solver
- end
+function SymmlqSolver(m, n, S; window :: Int=5)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ Mvold = S(undef, n)
+ Mv = S(undef, n)
+ Mv_next = S(undef, n)
+ w̅ = S(undef, n)
+ v = S(undef, 0)
+ clist = zeros(T, window)
+ zlist = zeros(T, window)
+ sprod = ones(T, window)
+ stats = SymmlqStats(0, false, T[], Union{T, Missing}[], T[], Union{T, Missing}[], T(NaN), T(NaN), "unknown")
+ solver = SymmlqSolver{T,FC,S}(m, n, Δx, x, Mvold, Mv, Mv_next, w̅, v, clist, zlist, sprod, false, stats)
+ return solver
+end
- function SymmlqSolver(A, b; window :: Int=5)
- n, m = size(A)
- S = ktypeof(b)
- SymmlqSolver(n, m, S, window=window)
- end
+function SymmlqSolver(A, b; window :: Int=5)
+ m, n = size(A)
+ S = ktypeof(b)
+ SymmlqSolver(m, n, S, window=window)
end
"""
@@ -231,12 +242,14 @@ Type for storing the vectors required by the in-place version of CG-LANCZOS.
The outer constructors
- solver = CgLanczosSolver(n, m, S)
+ solver = CgLanczosSolver(m, n, S)
solver = CgLanczosSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CgLanczosSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
Mv :: S
@@ -246,27 +259,27 @@ mutable struct CgLanczosSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
warm_start :: Bool
stats :: LanczosStats{T}
+end
- function CgLanczosSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- Mv = S(undef, n)
- Mv_prev = S(undef, n)
- p = S(undef, n)
- Mv_next = S(undef, n)
- v = S(undef, 0)
- stats = LanczosStats(0, false, T[], false, T(NaN), T(NaN), "unknown")
- solver = new{T,FC,S}(Δx, x, Mv, Mv_prev, p, Mv_next, v, false, stats)
- return solver
- end
+function CgLanczosSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ Mv = S(undef, n)
+ Mv_prev = S(undef, n)
+ p = S(undef, n)
+ Mv_next = S(undef, n)
+ v = S(undef, 0)
+ stats = LanczosStats(0, false, T[], false, T(NaN), T(NaN), "unknown")
+ solver = CgLanczosSolver{T,FC,S}(m, n, Δx, x, Mv, Mv_prev, p, Mv_next, v, false, stats)
+ return solver
+end
- function CgLanczosSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CgLanczosSolver(n, m, S)
- end
+function CgLanczosSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CgLanczosSolver(m, n, S)
end
"""
@@ -274,12 +287,14 @@ Type for storing the vectors required by the in-place version of CG-LANCZOS-SHIF
The outer constructors
- solver = CgLanczosShiftSolver(n, m, nshifts, S)
+ solver = CgLanczosShiftSolver(m, n, nshifts, S)
solver = CgLanczosShiftSolver(A, b, nshifts)
may be used in order to create these vectors.
"""
mutable struct CgLanczosShiftSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Mv :: S
Mv_prev :: S
Mv_next :: S
@@ -294,34 +309,34 @@ mutable struct CgLanczosShiftSolver{T,FC,S} <: KrylovSolver{T,FC,S}
converged :: BitVector
not_cv :: BitVector
stats :: LanczosShiftStats{T}
+end
- function CgLanczosShiftSolver(n, m, nshifts, S)
- FC = eltype(S)
- T = real(FC)
- Mv = S(undef, n)
- Mv_prev = S(undef, n)
- Mv_next = S(undef, n)
- v = S(undef, 0)
- x = [S(undef, n) for i = 1 : nshifts]
- p = [S(undef, n) for i = 1 : nshifts]
- σ = Vector{T}(undef, nshifts)
- δhat = Vector{T}(undef, nshifts)
- ω = Vector{T}(undef, nshifts)
- γ = Vector{T}(undef, nshifts)
- rNorms = Vector{T}(undef, nshifts)
- indefinite = BitVector(undef, nshifts)
- converged = BitVector(undef, nshifts)
- not_cv = BitVector(undef, nshifts)
- stats = LanczosShiftStats(0, false, [T[] for i = 1 : nshifts], indefinite, T(NaN), T(NaN), "unknown")
- solver = new{T,FC,S}(Mv, Mv_prev, Mv_next, v, x, p, σ, δhat, ω, γ, rNorms, converged, not_cv, stats)
- return solver
- end
+function CgLanczosShiftSolver(m, n, nshifts, S)
+ FC = eltype(S)
+ T = real(FC)
+ Mv = S(undef, n)
+ Mv_prev = S(undef, n)
+ Mv_next = S(undef, n)
+ v = S(undef, 0)
+ x = S[S(undef, n) for i = 1 : nshifts]
+ p = S[S(undef, n) for i = 1 : nshifts]
+ σ = Vector{T}(undef, nshifts)
+ δhat = Vector{T}(undef, nshifts)
+ ω = Vector{T}(undef, nshifts)
+ γ = Vector{T}(undef, nshifts)
+ rNorms = Vector{T}(undef, nshifts)
+ indefinite = BitVector(undef, nshifts)
+ converged = BitVector(undef, nshifts)
+ not_cv = BitVector(undef, nshifts)
+ stats = LanczosShiftStats(0, false, Vector{T}[T[] for i = 1 : nshifts], indefinite, T(NaN), T(NaN), "unknown")
+ solver = CgLanczosShiftSolver{T,FC,S}(m, n, Mv, Mv_prev, Mv_next, v, x, p, σ, δhat, ω, γ, rNorms, converged, not_cv, stats)
+ return solver
+end
- function CgLanczosShiftSolver(A, b, nshifts)
- n, m = size(A)
- S = ktypeof(b)
- CgLanczosShiftSolver(n, m, nshifts, S)
- end
+function CgLanczosShiftSolver(A, b, nshifts)
+ m, n = size(A)
+ S = ktypeof(b)
+ CgLanczosShiftSolver(m, n, nshifts, S)
end
"""
@@ -329,12 +344,14 @@ Type for storing the vectors required by the in-place version of MINRES-QLP.
The outer constructors
- solver = MinresQlpSolver(n, m, S)
+ solver = MinresQlpSolver(m, n, S)
solver = MinresQlpSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct MinresQlpSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
wₖ₋₁ :: S
wₖ :: S
@@ -345,28 +362,28 @@ mutable struct MinresQlpSolver{T,FC,S} <: KrylovSolver{T,FC,S}
vₖ :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function MinresQlpSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- wₖ₋₁ = S(undef, n)
- wₖ = S(undef, n)
- M⁻¹vₖ₋₁ = S(undef, n)
- M⁻¹vₖ = S(undef, n)
- x = S(undef, n)
- p = S(undef, n)
- vₖ = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, wₖ₋₁, wₖ, M⁻¹vₖ₋₁, M⁻¹vₖ, x, p, vₖ, false, stats)
- return solver
- end
+function MinresQlpSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ wₖ₋₁ = S(undef, n)
+ wₖ = S(undef, n)
+ M⁻¹vₖ₋₁ = S(undef, n)
+ M⁻¹vₖ = S(undef, n)
+ x = S(undef, n)
+ p = S(undef, n)
+ vₖ = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = MinresQlpSolver{T,FC,S}(m, n, Δx, wₖ₋₁, wₖ, M⁻¹vₖ₋₁, M⁻¹vₖ, x, p, vₖ, false, stats)
+ return solver
+end
- function MinresQlpSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- MinresQlpSolver(n, m, S)
- end
+function MinresQlpSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ MinresQlpSolver(m, n, S)
end
"""
@@ -374,13 +391,15 @@ Type for storing the vectors required by the in-place version of DQGMRES.
The outer constructors
- solver = DqgmresSolver(n, m, memory, S)
+ solver = DqgmresSolver(m, n, memory, S)
solver = DqgmresSolver(A, b, memory = 20)
may be used in order to create these vectors.
`memory` is set to `n` if the value given is larger than `n`.
"""
mutable struct DqgmresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
t :: S
@@ -393,31 +412,31 @@ mutable struct DqgmresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
H :: Vector{FC}
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function DqgmresSolver(n, m, memory, S)
- memory = min(n, memory)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- t = S(undef, n)
- z = S(undef, 0)
- w = S(undef, 0)
- P = [S(undef, n) for i = 1 : memory]
- V = [S(undef, n) for i = 1 : memory]
- c = Vector{T}(undef, memory)
- s = Vector{FC}(undef, memory)
- H = Vector{FC}(undef, memory+2)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, t, z, w, P, V, c, s, H, false, stats)
- return solver
- end
+function DqgmresSolver(m, n, memory, S)
+ memory = min(m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ t = S(undef, n)
+ z = S(undef, 0)
+ w = S(undef, 0)
+ P = S[S(undef, n) for i = 1 : memory]
+ V = S[S(undef, n) for i = 1 : memory]
+ c = Vector{T}(undef, memory)
+ s = Vector{FC}(undef, memory)
+ H = Vector{FC}(undef, memory+1)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = DqgmresSolver{T,FC,S}(m, n, Δx, x, t, z, w, P, V, c, s, H, false, stats)
+ return solver
+end
- function DqgmresSolver(A, b, memory = 20)
- n, m = size(A)
- S = ktypeof(b)
- DqgmresSolver(n, m, memory, S)
- end
+function DqgmresSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ DqgmresSolver(m, n, memory, S)
end
"""
@@ -425,13 +444,15 @@ Type for storing the vectors required by the in-place version of DIOM.
The outer constructors
- solver = DiomSolver(n, m, memory, S)
+ solver = DiomSolver(m, n, memory, S)
solver = DiomSolver(A, b, memory = 20)
may be used in order to create these vectors.
`memory` is set to `n` if the value given is larger than `n`.
"""
mutable struct DiomSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
t :: S
@@ -443,30 +464,30 @@ mutable struct DiomSolver{T,FC,S} <: KrylovSolver{T,FC,S}
H :: Vector{FC}
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function DiomSolver(n, m, memory, S)
- memory = min(n, memory)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- t = S(undef, n)
- z = S(undef, 0)
- w = S(undef, 0)
- P = [S(undef, n) for i = 1 : memory]
- V = [S(undef, n) for i = 1 : memory]
- L = Vector{FC}(undef, memory)
- H = Vector{FC}(undef, memory+2)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, t, z, w, P, V, L, H, false, stats)
- return solver
- end
+function DiomSolver(m, n, memory, S)
+ memory = min(m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ t = S(undef, n)
+ z = S(undef, 0)
+ w = S(undef, 0)
+ P = S[S(undef, n) for i = 1 : memory-1]
+ V = S[S(undef, n) for i = 1 : memory]
+ L = Vector{FC}(undef, memory-1)
+ H = Vector{FC}(undef, memory)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = DiomSolver{T,FC,S}(m, n, Δx, x, t, z, w, P, V, L, H, false, stats)
+ return solver
+end
- function DiomSolver(A, b, memory = 20)
- n, m = size(A)
- S = ktypeof(b)
- DiomSolver(n, m, memory, S)
- end
+function DiomSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ DiomSolver(m, n, memory, S)
end
"""
@@ -474,12 +495,14 @@ Type for storing the vectors required by the in-place version of USYMLQ.
The outer constructors
- solver = UsymlqSolver(n, m, S)
+ solver = UsymlqSolver(m, n, S)
solver = UsymlqSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct UsymlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
uₖ₋₁ :: S
uₖ :: S
p :: S
@@ -491,29 +514,29 @@ mutable struct UsymlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
q :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function UsymlqSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- uₖ₋₁ = S(undef, m)
- uₖ = S(undef, m)
- p = S(undef, m)
- Δx = S(undef, 0)
- x = S(undef, m)
- d̅ = S(undef, m)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- q = S(undef, n)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(uₖ₋₁, uₖ, p, Δx, x, d̅, vₖ₋₁, vₖ, q, false, stats)
- return solver
- end
+function UsymlqSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ p = S(undef, n)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ d̅ = S(undef, n)
+ vₖ₋₁ = S(undef, m)
+ vₖ = S(undef, m)
+ q = S(undef, m)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = UsymlqSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, p, Δx, x, d̅, vₖ₋₁, vₖ, q, false, stats)
+ return solver
+end
- function UsymlqSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- UsymlqSolver(n, m, S)
- end
+function UsymlqSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ UsymlqSolver(m, n, S)
end
"""
@@ -521,12 +544,14 @@ Type for storing the vectors required by the in-place version of USYMQR.
The outer constructors
- solver = UsymqrSolver(n, m, S)
+ solver = UsymqrSolver(m, n, S)
solver = UsymqrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct UsymqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
vₖ₋₁ :: S
vₖ :: S
q :: S
@@ -539,30 +564,30 @@ mutable struct UsymqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
p :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function UsymqrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- q = S(undef, n)
- Δx = S(undef, 0)
- x = S(undef, m)
- wₖ₋₂ = S(undef, m)
- wₖ₋₁ = S(undef, m)
- uₖ₋₁ = S(undef, m)
- uₖ = S(undef, m)
- p = S(undef, m)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(vₖ₋₁, vₖ, q, Δx, x, wₖ₋₂, wₖ₋₁, uₖ₋₁, uₖ, p, false, stats)
- return solver
- end
+function UsymqrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ vₖ₋₁ = S(undef, m)
+ vₖ = S(undef, m)
+ q = S(undef, m)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ wₖ₋₂ = S(undef, n)
+ wₖ₋₁ = S(undef, n)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ p = S(undef, n)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = UsymqrSolver{T,FC,S}(m, n, vₖ₋₁, vₖ, q, Δx, x, wₖ₋₂, wₖ₋₁, uₖ₋₁, uₖ, p, false, stats)
+ return solver
+end
- function UsymqrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- UsymqrSolver(n, m, S)
- end
+function UsymqrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ UsymqrSolver(m, n, S)
end
"""
@@ -570,12 +595,14 @@ Type for storing the vectors required by the in-place version of TRICG.
The outer constructors
- solver = TricgSolver(n, m, S)
+ solver = TricgSolver(m, n, S)
solver = TricgSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct TricgSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
y :: S
N⁻¹uₖ₋₁ :: S
N⁻¹uₖ :: S
@@ -594,36 +621,36 @@ mutable struct TricgSolver{T,FC,S} <: KrylovSolver{T,FC,S}
vₖ :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function TricgSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- y = S(undef, m)
- N⁻¹uₖ₋₁ = S(undef, m)
- N⁻¹uₖ = S(undef, m)
- p = S(undef, m)
- gy₂ₖ₋₁ = S(undef, m)
- gy₂ₖ = S(undef, m)
- x = S(undef, n)
- M⁻¹vₖ₋₁ = S(undef, n)
- M⁻¹vₖ = S(undef, n)
- q = S(undef, n)
- gx₂ₖ₋₁ = S(undef, n)
- gx₂ₖ = S(undef, n)
- Δx = S(undef, 0)
- Δy = S(undef, 0)
- uₖ = S(undef, 0)
- vₖ = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(y, N⁻¹uₖ₋₁, N⁻¹uₖ, p, gy₂ₖ₋₁, gy₂ₖ, x, M⁻¹vₖ₋₁, M⁻¹vₖ, q, gx₂ₖ₋₁, gx₂ₖ, Δx, Δy, uₖ, vₖ, false, stats)
- return solver
- end
+function TricgSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ y = S(undef, n)
+ N⁻¹uₖ₋₁ = S(undef, n)
+ N⁻¹uₖ = S(undef, n)
+ p = S(undef, n)
+ gy₂ₖ₋₁ = S(undef, n)
+ gy₂ₖ = S(undef, n)
+ x = S(undef, m)
+ M⁻¹vₖ₋₁ = S(undef, m)
+ M⁻¹vₖ = S(undef, m)
+ q = S(undef, m)
+ gx₂ₖ₋₁ = S(undef, m)
+ gx₂ₖ = S(undef, m)
+ Δx = S(undef, 0)
+ Δy = S(undef, 0)
+ uₖ = S(undef, 0)
+ vₖ = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = TricgSolver{T,FC,S}(m, n, y, N⁻¹uₖ₋₁, N⁻¹uₖ, p, gy₂ₖ₋₁, gy₂ₖ, x, M⁻¹vₖ₋₁, M⁻¹vₖ, q, gx₂ₖ₋₁, gx₂ₖ, Δx, Δy, uₖ, vₖ, false, stats)
+ return solver
+end
- function TricgSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- TricgSolver(n, m, S)
- end
+function TricgSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ TricgSolver(m, n, S)
end
"""
@@ -631,12 +658,14 @@ Type for storing the vectors required by the in-place version of TRIMR.
The outer constructors
- solver = TrimrSolver(n, m, S)
+ solver = TrimrSolver(m, n, S)
solver = TrimrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct TrimrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
y :: S
N⁻¹uₖ₋₁ :: S
N⁻¹uₖ :: S
@@ -659,40 +688,40 @@ mutable struct TrimrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
vₖ :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function TrimrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- y = S(undef, m)
- N⁻¹uₖ₋₁ = S(undef, m)
- N⁻¹uₖ = S(undef, m)
- p = S(undef, m)
- gy₂ₖ₋₃ = S(undef, m)
- gy₂ₖ₋₂ = S(undef, m)
- gy₂ₖ₋₁ = S(undef, m)
- gy₂ₖ = S(undef, m)
- x = S(undef, n)
- M⁻¹vₖ₋₁ = S(undef, n)
- M⁻¹vₖ = S(undef, n)
- q = S(undef, n)
- gx₂ₖ₋₃ = S(undef, n)
- gx₂ₖ₋₂ = S(undef, n)
- gx₂ₖ₋₁ = S(undef, n)
- gx₂ₖ = S(undef, n)
- Δx = S(undef, 0)
- Δy = S(undef, 0)
- uₖ = S(undef, 0)
- vₖ = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(y, N⁻¹uₖ₋₁, N⁻¹uₖ, p, gy₂ₖ₋₃, gy₂ₖ₋₂, gy₂ₖ₋₁, gy₂ₖ, x, M⁻¹vₖ₋₁, M⁻¹vₖ, q, gx₂ₖ₋₃, gx₂ₖ₋₂, gx₂ₖ₋₁, gx₂ₖ, Δx, Δy, uₖ, vₖ, false, stats)
- return solver
- end
+function TrimrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ y = S(undef, n)
+ N⁻¹uₖ₋₁ = S(undef, n)
+ N⁻¹uₖ = S(undef, n)
+ p = S(undef, n)
+ gy₂ₖ₋₃ = S(undef, n)
+ gy₂ₖ₋₂ = S(undef, n)
+ gy₂ₖ₋₁ = S(undef, n)
+ gy₂ₖ = S(undef, n)
+ x = S(undef, m)
+ M⁻¹vₖ₋₁ = S(undef, m)
+ M⁻¹vₖ = S(undef, m)
+ q = S(undef, m)
+ gx₂ₖ₋₃ = S(undef, m)
+ gx₂ₖ₋₂ = S(undef, m)
+ gx₂ₖ₋₁ = S(undef, m)
+ gx₂ₖ = S(undef, m)
+ Δx = S(undef, 0)
+ Δy = S(undef, 0)
+ uₖ = S(undef, 0)
+ vₖ = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = TrimrSolver{T,FC,S}(m, n, y, N⁻¹uₖ₋₁, N⁻¹uₖ, p, gy₂ₖ₋₃, gy₂ₖ₋₂, gy₂ₖ₋₁, gy₂ₖ, x, M⁻¹vₖ₋₁, M⁻¹vₖ, q, gx₂ₖ₋₃, gx₂ₖ₋₂, gx₂ₖ₋₁, gx₂ₖ, Δx, Δy, uₖ, vₖ, false, stats)
+ return solver
+end
- function TrimrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- TrimrSolver(n, m, S)
- end
+function TrimrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ TrimrSolver(m, n, S)
end
"""
@@ -700,12 +729,14 @@ Type for storing the vectors required by the in-place version of TRILQR.
The outer constructors
- solver = TrilqrSolver(n, m, S)
+ solver = TrilqrSolver(m, n, S)
solver = TrilqrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct TrilqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
uₖ₋₁ :: S
uₖ :: S
p :: S
@@ -721,33 +752,33 @@ mutable struct TrilqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
wₖ₋₂ :: S
warm_start :: Bool
stats :: AdjointStats{T}
+end
- function TrilqrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- uₖ₋₁ = S(undef, m)
- uₖ = S(undef, m)
- p = S(undef, m)
- d̅ = S(undef, m)
- Δx = S(undef, 0)
- x = S(undef, m)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- q = S(undef, n)
- Δy = S(undef, 0)
- y = S(undef, n)
- wₖ₋₃ = S(undef, n)
- wₖ₋₂ = S(undef, n)
- stats = AdjointStats(0, false, false, T[], T[], "unknown")
- solver = new{T,FC,S}(uₖ₋₁, uₖ, p, d̅, Δx, x, vₖ₋₁, vₖ, q, Δy, y, wₖ₋₃, wₖ₋₂, false, stats)
- return solver
- end
+function TrilqrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ p = S(undef, n)
+ d̅ = S(undef, n)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ vₖ₋₁ = S(undef, m)
+ vₖ = S(undef, m)
+ q = S(undef, m)
+ Δy = S(undef, 0)
+ y = S(undef, m)
+ wₖ₋₃ = S(undef, m)
+ wₖ₋₂ = S(undef, m)
+ stats = AdjointStats(0, false, false, T[], T[], "unknown")
+ solver = TrilqrSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, p, d̅, Δx, x, vₖ₋₁, vₖ, q, Δy, y, wₖ₋₃, wₖ₋₂, false, stats)
+ return solver
+end
- function TrilqrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- TrilqrSolver(n, m, S)
- end
+function TrilqrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ TrilqrSolver(m, n, S)
end
"""
@@ -755,12 +786,14 @@ Type for storing the vectors required by the in-place version of CGS.
The outer constructors
- solver = CgsSolver(n, m, S)
+ solver = CgsSolver(m, n, S)
solver = CgsSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CgsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
r :: S
@@ -772,29 +805,30 @@ mutable struct CgsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
vw :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function CgsSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- r = S(undef, n)
- u = S(undef, n)
- p = S(undef, n)
- q = S(undef, n)
- ts = S(undef, n)
- yz = S(undef, 0)
- vw = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, r, u, p, q, ts, yz, vw, false, stats)
- return solver
- end
+function CgsSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ r = S(undef, n)
+ u = S(undef, n)
+ p = S(undef, n)
+ q = S(undef, n)
+ ts = S(undef, n)
+ yz = S(undef, 0)
+ vw = S(undef, 0)
+
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CgsSolver{T,FC,S}(m, n, Δx, x, r, u, p, q, ts, yz, vw, false, stats)
+ return solver
+end
- function CgsSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CgsSolver(n, m, S)
- end
+function CgsSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CgsSolver(m, n, S)
end
"""
@@ -802,12 +836,14 @@ Type for storing the vectors required by the in-place version of BICGSTAB.
The outer constructors
- solver = BicgstabSolver(n, m, S)
+ solver = BicgstabSolver(m, n, S)
solver = BicgstabSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct BicgstabSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
r :: S
@@ -819,29 +855,29 @@ mutable struct BicgstabSolver{T,FC,S} <: KrylovSolver{T,FC,S}
t :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function BicgstabSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- r = S(undef, n)
- p = S(undef, n)
- v = S(undef, n)
- s = S(undef, n)
- qd = S(undef, n)
- yz = S(undef, 0)
- t = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, r, p, v, s, qd, yz, t, false, stats)
- return solver
- end
+function BicgstabSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ r = S(undef, n)
+ p = S(undef, n)
+ v = S(undef, n)
+ s = S(undef, n)
+ qd = S(undef, n)
+ yz = S(undef, 0)
+ t = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = BicgstabSolver{T,FC,S}(m, n, Δx, x, r, p, v, s, qd, yz, t, false, stats)
+ return solver
+end
- function BicgstabSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- BicgstabSolver(n, m, S)
- end
+function BicgstabSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ BicgstabSolver(m, n, S)
end
"""
@@ -849,12 +885,14 @@ Type for storing the vectors required by the in-place version of BILQ.
The outer constructors
- solver = BilqSolver(n, m, S)
+ solver = BilqSolver(m, n, S)
solver = BilqSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct BilqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
uₖ₋₁ :: S
uₖ :: S
q :: S
@@ -866,29 +904,29 @@ mutable struct BilqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
d̅ :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function BilqSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- uₖ₋₁ = S(undef, n)
- uₖ = S(undef, n)
- q = S(undef, n)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- p = S(undef, n)
- Δx = S(undef, 0)
- x = S(undef, n)
- d̅ = S(undef, n)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, d̅, false, stats)
- return solver
- end
+function BilqSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ q = S(undef, n)
+ vₖ₋₁ = S(undef, n)
+ vₖ = S(undef, n)
+ p = S(undef, n)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ d̅ = S(undef, n)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = BilqSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, d̅, false, stats)
+ return solver
+end
- function BilqSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- BilqSolver(n, m, S)
- end
+function BilqSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ BilqSolver(m, n, S)
end
"""
@@ -896,12 +934,14 @@ Type for storing the vectors required by the in-place version of QMR.
The outer constructors
- solver = QmrSolver(n, m, S)
+ solver = QmrSolver(m, n, S)
solver = QmrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct QmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
uₖ₋₁ :: S
uₖ :: S
q :: S
@@ -914,30 +954,30 @@ mutable struct QmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
wₖ₋₁ :: S
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function QmrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- uₖ₋₁ = S(undef, n)
- uₖ = S(undef, n)
- q = S(undef, n)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- p = S(undef, n)
- Δx = S(undef, 0)
- x = S(undef, n)
- wₖ₋₂ = S(undef, n)
- wₖ₋₁ = S(undef, n)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, wₖ₋₂, wₖ₋₁, false, stats)
- return solver
- end
+function QmrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ q = S(undef, n)
+ vₖ₋₁ = S(undef, n)
+ vₖ = S(undef, n)
+ p = S(undef, n)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ wₖ₋₂ = S(undef, n)
+ wₖ₋₁ = S(undef, n)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = QmrSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, wₖ₋₂, wₖ₋₁, false, stats)
+ return solver
+end
- function QmrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- QmrSolver(n, m, S)
- end
+function QmrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ QmrSolver(m, n, S)
end
"""
@@ -945,12 +985,14 @@ Type for storing the vectors required by the in-place version of BILQR.
The outer constructors
- solver = BilqrSolver(n, m, S)
+ solver = BilqrSolver(m, n, S)
solver = BilqrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct BilqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
uₖ₋₁ :: S
uₖ :: S
q :: S
@@ -966,33 +1008,33 @@ mutable struct BilqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
wₖ₋₂ :: S
warm_start :: Bool
stats :: AdjointStats{T}
+end
- function BilqrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- uₖ₋₁ = S(undef, n)
- uₖ = S(undef, n)
- q = S(undef, n)
- vₖ₋₁ = S(undef, n)
- vₖ = S(undef, n)
- p = S(undef, n)
- Δx = S(undef, 0)
- x = S(undef, n)
- Δy = S(undef, 0)
- y = S(undef, n)
- d̅ = S(undef, n)
- wₖ₋₃ = S(undef, n)
- wₖ₋₂ = S(undef, n)
- stats = AdjointStats(0, false, false, T[], T[], "unknown")
- solver = new{T,FC,S}(uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, Δy, y, d̅, wₖ₋₃, wₖ₋₂, false, stats)
- return solver
- end
+function BilqrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ uₖ₋₁ = S(undef, n)
+ uₖ = S(undef, n)
+ q = S(undef, n)
+ vₖ₋₁ = S(undef, n)
+ vₖ = S(undef, n)
+ p = S(undef, n)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ Δy = S(undef, 0)
+ y = S(undef, n)
+ d̅ = S(undef, n)
+ wₖ₋₃ = S(undef, n)
+ wₖ₋₂ = S(undef, n)
+ stats = AdjointStats(0, false, false, T[], T[], "unknown")
+ solver = BilqrSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, Δy, y, d̅, wₖ₋₃, wₖ₋₂, false, stats)
+ return solver
+end
- function BilqrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- BilqrSolver(n, m, S)
- end
+function BilqrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ BilqrSolver(m, n, S)
end
"""
@@ -1000,12 +1042,14 @@ Type for storing the vectors required by the in-place version of CGLS.
The outer constructors
- solver = CglsSolver(n, m, S)
+ solver = CglsSolver(m, n, S)
solver = CglsSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CglsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
p :: S
s :: S
@@ -1013,26 +1057,26 @@ mutable struct CglsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
q :: S
Mr :: S
stats :: SimpleStats{T}
+end
- function CglsSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- p = S(undef, m)
- s = S(undef, m)
- r = S(undef, n)
- q = S(undef, n)
- Mr = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, p, s, r, q, Mr, stats)
- return solver
- end
+function CglsSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ p = S(undef, n)
+ s = S(undef, n)
+ r = S(undef, m)
+ q = S(undef, m)
+ Mr = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CglsSolver{T,FC,S}(m, n, x, p, s, r, q, Mr, stats)
+ return solver
+end
- function CglsSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CglsSolver(n, m, S)
- end
+function CglsSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CglsSolver(m, n, S)
end
"""
@@ -1040,12 +1084,14 @@ Type for storing the vectors required by the in-place version of CRLS.
The outer constructors
- solver = CrlsSolver(n, m, S)
+ solver = CrlsSolver(m, n, S)
solver = CrlsSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CrlsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
p :: S
Ar :: S
@@ -1055,28 +1101,28 @@ mutable struct CrlsSolver{T,FC,S} <: KrylovSolver{T,FC,S}
s :: S
Ms :: S
stats :: SimpleStats{T}
+end
- function CrlsSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- p = S(undef, m)
- Ar = S(undef, m)
- q = S(undef, m)
- r = S(undef, n)
- Ap = S(undef, n)
- s = S(undef, n)
- Ms = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, p, Ar, q, r, Ap, s, Ms, stats)
- return solver
- end
+function CrlsSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ p = S(undef, n)
+ Ar = S(undef, n)
+ q = S(undef, n)
+ r = S(undef, m)
+ Ap = S(undef, m)
+ s = S(undef, m)
+ Ms = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CrlsSolver{T,FC,S}(m, n, x, p, Ar, q, r, Ap, s, Ms, stats)
+ return solver
+end
- function CrlsSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CrlsSolver(n, m, S)
- end
+function CrlsSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CrlsSolver(m, n, S)
end
"""
@@ -1084,41 +1130,43 @@ Type for storing the vectors required by the in-place version of CGNE.
The outer constructors
- solver = CgneSolver(n, m, S)
+ solver = CgneSolver(m, n, S)
solver = CgneSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CgneSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
p :: S
- Aᵀz :: S
+ Aᴴz :: S
r :: S
q :: S
s :: S
z :: S
stats :: SimpleStats{T}
+end
- function CgneSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- p = S(undef, m)
- Aᵀz = S(undef, m)
- r = S(undef, n)
- q = S(undef, n)
- s = S(undef, 0)
- z = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, p, Aᵀz, r, q, s, z, stats)
- return solver
- end
+function CgneSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ p = S(undef, n)
+ Aᴴz = S(undef, n)
+ r = S(undef, m)
+ q = S(undef, m)
+ s = S(undef, 0)
+ z = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CgneSolver{T,FC,S}(m, n, x, p, Aᴴz, r, q, s, z, stats)
+ return solver
+end
- function CgneSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CgneSolver(n, m, S)
- end
+function CgneSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CgneSolver(m, n, S)
end
"""
@@ -1126,41 +1174,43 @@ Type for storing the vectors required by the in-place version of CRMR.
The outer constructors
- solver = CrmrSolver(n, m, S)
+ solver = CrmrSolver(m, n, S)
solver = CrmrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CrmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
p :: S
- Aᵀr :: S
+ Aᴴr :: S
r :: S
q :: S
- Mq :: S
+ Nq :: S
s :: S
stats :: SimpleStats{T}
+end
- function CrmrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- p = S(undef, m)
- Aᵀr = S(undef, m)
- r = S(undef, n)
- q = S(undef, n)
- Mq = S(undef, 0)
- s = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, p, Aᵀr, r, q, Mq, s, stats)
- return solver
- end
+function CrmrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ p = S(undef, n)
+ Aᴴr = S(undef, n)
+ r = S(undef, m)
+ q = S(undef, m)
+ Nq = S(undef, 0)
+ s = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CrmrSolver{T,FC,S}(m, n, x, p, Aᴴr, r, q, Nq, s, stats)
+ return solver
+end
- function CrmrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CrmrSolver(n, m, S)
- end
+function CrmrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CrmrSolver(m, n, S)
end
"""
@@ -1168,15 +1218,17 @@ Type for storing the vectors required by the in-place version of LSLQ.
The outer constructors
- solver = LslqSolver(n, m, S)
+ solver = LslqSolver(m, n, S)
solver = LslqSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct LslqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
w̄ :: S
Mu :: S
Av :: S
@@ -1184,29 +1236,29 @@ mutable struct LslqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
err_vec :: Vector{T}
stats :: LSLQStats{T}
+end
- function LslqSolver(n, m, S; window :: Int=5)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- w̄ = S(undef, m)
- Mu = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- err_vec = zeros(T, window)
- stats = LSLQStats(0, false, false, T[], T[], T[], false, T[], T[], "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, w̄, Mu, Av, u, v, err_vec, stats)
- return solver
- end
+function LslqSolver(m, n, S; window :: Int=5)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ w̄ = S(undef, n)
+ Mu = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ err_vec = zeros(T, window)
+ stats = LSLQStats(0, false, false, T[], T[], T[], false, T[], T[], "unknown")
+ solver = LslqSolver{T,FC,S}(m, n, x, Nv, Aᴴu, w̄, Mu, Av, u, v, err_vec, stats)
+ return solver
+end
- function LslqSolver(A, b; window :: Int=5)
- n, m = size(A)
- S = ktypeof(b)
- LslqSolver(n, m, S, window=window)
- end
+function LslqSolver(A, b; window :: Int=5)
+ m, n = size(A)
+ S = ktypeof(b)
+ LslqSolver(m, n, S, window=window)
end
"""
@@ -1214,15 +1266,17 @@ Type for storing the vectors required by the in-place version of LSQR.
The outer constructors
- solver = LsqrSolver(n, m, S)
+ solver = LsqrSolver(m, n, S)
solver = LsqrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct LsqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
w :: S
Mu :: S
Av :: S
@@ -1230,29 +1284,29 @@ mutable struct LsqrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
err_vec :: Vector{T}
stats :: SimpleStats{T}
+end
- function LsqrSolver(n, m, S; window :: Int=5)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- w = S(undef, m)
- Mu = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- err_vec = zeros(T, window)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, w, Mu, Av, u, v, err_vec, stats)
- return solver
- end
+function LsqrSolver(m, n, S; window :: Int=5)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ w = S(undef, n)
+ Mu = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ err_vec = zeros(T, window)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = LsqrSolver{T,FC,S}(m, n, x, Nv, Aᴴu, w, Mu, Av, u, v, err_vec, stats)
+ return solver
+end
- function LsqrSolver(A, b; window :: Int=5)
- n, m = size(A)
- S = ktypeof(b)
- LsqrSolver(n, m, S, window=window)
- end
+function LsqrSolver(A, b; window :: Int=5)
+ m, n = size(A)
+ S = ktypeof(b)
+ LsqrSolver(m, n, S, window=window)
end
"""
@@ -1260,15 +1314,17 @@ Type for storing the vectors required by the in-place version of LSMR.
The outer constructors
- solver = LsmrSolver(n, m, S)
+ solver = LsmrSolver(m, n, S)
solver = LsmrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct LsmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
h :: S
hbar :: S
Mu :: S
@@ -1277,30 +1333,30 @@ mutable struct LsmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
err_vec :: Vector{T}
stats :: LsmrStats{T}
+end
- function LsmrSolver(n, m, S; window :: Int=5)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- h = S(undef, m)
- hbar = S(undef, m)
- Mu = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- err_vec = zeros(T, window)
- stats = LsmrStats(0, false, false, T[], T[], zero(T), zero(T), zero(T), zero(T), zero(T), "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, h, hbar, Mu, Av, u, v, err_vec, stats)
- return solver
- end
+function LsmrSolver(m, n, S; window :: Int=5)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ h = S(undef, n)
+ hbar = S(undef, n)
+ Mu = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ err_vec = zeros(T, window)
+ stats = LsmrStats(0, false, false, T[], T[], zero(T), zero(T), zero(T), zero(T), zero(T), "unknown")
+ solver = LsmrSolver{T,FC,S}(m, n, x, Nv, Aᴴu, h, hbar, Mu, Av, u, v, err_vec, stats)
+ return solver
+end
- function LsmrSolver(A, b; window :: Int=5)
- n, m = size(A)
- S = ktypeof(b)
- LsmrSolver(n, m, S, window=window)
- end
+function LsmrSolver(A, b; window :: Int=5)
+ m, n = size(A)
+ S = ktypeof(b)
+ LsmrSolver(m, n, S, window=window)
end
"""
@@ -1308,15 +1364,17 @@ Type for storing the vectors required by the in-place version of LNLQ.
The outer constructors
- solver = LnlqSolver(n, m, S)
+ solver = LnlqSolver(m, n, S)
solver = LnlqSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct LnlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
y :: S
w̄ :: S
Mu :: S
@@ -1325,30 +1383,30 @@ mutable struct LnlqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
q :: S
stats :: LNLQStats{T}
+end
- function LnlqSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- y = S(undef, n)
- w̄ = S(undef, n)
- Mu = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- q = S(undef, 0)
- stats = LNLQStats(0, false, T[], false, T[], T[], "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, y, w̄, Mu, Av, u, v, q, stats)
- return solver
- end
+function LnlqSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ y = S(undef, m)
+ w̄ = S(undef, m)
+ Mu = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ q = S(undef, 0)
+ stats = LNLQStats(0, false, T[], false, T[], T[], "unknown")
+ solver = LnlqSolver{T,FC,S}(m, n, x, Nv, Aᴴu, y, w̄, Mu, Av, u, v, q, stats)
+ return solver
+end
- function LnlqSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- LnlqSolver(n, m, S)
- end
+function LnlqSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ LnlqSolver(m, n, S)
end
"""
@@ -1356,15 +1414,17 @@ Type for storing the vectors required by the in-place version of CRAIG.
The outer constructors
- solver = CraigSolver(n, m, S)
+ solver = CraigSolver(m, n, S)
solver = CraigSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CraigSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
y :: S
w :: S
Mu :: S
@@ -1373,30 +1433,30 @@ mutable struct CraigSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
w2 :: S
stats :: SimpleStats{T}
+end
- function CraigSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- y = S(undef, n)
- w = S(undef, n)
- Mu = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- w2 = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, y, w, Mu, Av, u, v, w2, stats)
- return solver
- end
+function CraigSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ y = S(undef, m)
+ w = S(undef, m)
+ Mu = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ w2 = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CraigSolver{T,FC,S}(m, n, x, Nv, Aᴴu, y, w, Mu, Av, u, v, w2, stats)
+ return solver
+end
- function CraigSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CraigSolver(n, m, S)
- end
+function CraigSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CraigSolver(m, n, S)
end
"""
@@ -1404,15 +1464,17 @@ Type for storing the vectors required by the in-place version of CRAIGMR.
The outer constructors
- solver = CraigmrSolver(n, m, S)
+ solver = CraigmrSolver(m, n, S)
solver = CraigmrSolver(A, b)
may be used in order to create these vectors.
"""
mutable struct CraigmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
x :: S
Nv :: S
- Aᵀu :: S
+ Aᴴu :: S
d :: S
y :: S
Mu :: S
@@ -1423,32 +1485,32 @@ mutable struct CraigmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
v :: S
q :: S
stats :: SimpleStats{T}
+end
- function CraigmrSolver(n, m, S)
- FC = eltype(S)
- T = real(FC)
- x = S(undef, m)
- Nv = S(undef, m)
- Aᵀu = S(undef, m)
- d = S(undef, m)
- y = S(undef, n)
- Mu = S(undef, n)
- w = S(undef, n)
- wbar = S(undef, n)
- Av = S(undef, n)
- u = S(undef, 0)
- v = S(undef, 0)
- q = S(undef, 0)
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(x, Nv, Aᵀu, d, y, Mu, w, wbar, Av, u, v, q, stats)
- return solver
- end
+function CraigmrSolver(m, n, S)
+ FC = eltype(S)
+ T = real(FC)
+ x = S(undef, n)
+ Nv = S(undef, n)
+ Aᴴu = S(undef, n)
+ d = S(undef, n)
+ y = S(undef, m)
+ Mu = S(undef, m)
+ w = S(undef, m)
+ wbar = S(undef, m)
+ Av = S(undef, m)
+ u = S(undef, 0)
+ v = S(undef, 0)
+ q = S(undef, 0)
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = CraigmrSolver{T,FC,S}(m, n, x, Nv, Aᴴu, d, y, Mu, w, wbar, Av, u, v, q, stats)
+ return solver
+end
- function CraigmrSolver(A, b)
- n, m = size(A)
- S = ktypeof(b)
- CraigmrSolver(n, m, S)
- end
+function CraigmrSolver(A, b)
+ m, n = size(A)
+ S = ktypeof(b)
+ CraigmrSolver(m, n, S)
end
"""
@@ -1456,13 +1518,15 @@ Type for storing the vectors required by the in-place version of GMRES.
The outer constructors
- solver = GmresSolver(n, m, memory, S)
+ solver = GmresSolver(m, n, memory, S)
solver = GmresSolver(A, b, memory = 20)
may be used in order to create these vectors.
`memory` is set to `n` if the value given is larger than `n`.
"""
mutable struct GmresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
w :: S
@@ -1476,31 +1540,85 @@ mutable struct GmresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
warm_start :: Bool
inner_iter :: Int
stats :: SimpleStats{T}
+end
- function GmresSolver(n, m, memory, S)
- memory = min(n, memory)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- w = S(undef, n)
- p = S(undef, 0)
- q = S(undef, 0)
- V = [S(undef, n) for i = 1 : memory]
- c = Vector{T}(undef, memory)
- s = Vector{FC}(undef, memory)
- z = Vector{FC}(undef, memory)
- R = Vector{FC}(undef, div(memory * (memory+1), 2))
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, w, p, q, V, c, s, z, R, false, 0, stats)
- return solver
- end
+function GmresSolver(m, n, memory, S)
+ memory = min(m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ w = S(undef, n)
+ p = S(undef, 0)
+ q = S(undef, 0)
+ V = S[S(undef, n) for i = 1 : memory]
+ c = Vector{T}(undef, memory)
+ s = Vector{FC}(undef, memory)
+ z = Vector{FC}(undef, memory)
+ R = Vector{FC}(undef, div(memory * (memory+1), 2))
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = GmresSolver{T,FC,S}(m, n, Δx, x, w, p, q, V, c, s, z, R, false, 0, stats)
+ return solver
+end
- function GmresSolver(A, b, memory = 20)
- n, m = size(A)
- S = ktypeof(b)
- GmresSolver(n, m, memory, S)
- end
+function GmresSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ GmresSolver(m, n, memory, S)
+end
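
A sketch of the constructor with an explicit `memory`, on illustrative data:

    n = 100
    A = rand(n, n); b = rand(n)
    solver = GmresSolver(A, b, 30)  # store at most 30 Arnoldi vectors
    gmres!(solver, A, b)
    solver.stats.niter
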
+
+"""
+Type for storing the vectors required by the in-place version of FGMRES.
+
+The outer constructors
+
+ solver = FgmresSolver(m, n, memory, S)
+ solver = FgmresSolver(A, b, memory = 20)
+
+may be used in order to create these vectors.
+`memory` is set to `n` if the value given is larger than `n`.
+"""
+mutable struct FgmresSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
+ Δx :: S
+ x :: S
+ w :: S
+ q :: S
+ V :: Vector{S}
+ Z :: Vector{S}
+ c :: Vector{T}
+ s :: Vector{FC}
+ z :: Vector{FC}
+ R :: Vector{FC}
+ warm_start :: Bool
+ inner_iter :: Int
+ stats :: SimpleStats{T}
+end
+
+function FgmresSolver(m, n, memory, S)
+ memory = min(m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ w = S(undef, n)
+ q = S(undef, 0)
+ V = S[S(undef, n) for i = 1 : memory]
+ Z = S[S(undef, n) for i = 1 : memory]
+ c = Vector{T}(undef, memory)
+ s = Vector{FC}(undef, memory)
+ z = Vector{FC}(undef, memory)
+ R = Vector{FC}(undef, div(memory * (memory+1), 2))
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = FgmresSolver{T,FC,S}(m, n, Δx, x, w, q, V, Z, c, s, z, R, false, 0, stats)
+ return solver
+end
+
+function FgmresSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ FgmresSolver(m, n, memory, S)
end
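
FGMRES is introduced by this patch; a hedged sketch of its intended use, where the right preconditioner `N` (a placeholder here) may change between inner iterations, which is what distinguishes FGMRES from GMRES:

    solver = FgmresSolver(A, b, 20)
    fgmres!(solver, A, b, N=N)  # N: hypothetical, possibly iteration-dependent preconditioner
    solution(solver)
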
"""
@@ -1508,13 +1626,15 @@ Type for storing the vectors required by the in-place version of FOM.
The outer constructors
- solver = FomSolver(n, m, memory, S)
+ solver = FomSolver(m, n, memory, S)
solver = FomSolver(A, b, memory = 20)
may be used in order to create these vectors.
`memory` is set to `n` if the value given is larger than `n`.
"""
mutable struct FomSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
Δx :: S
x :: S
w :: S
@@ -1526,30 +1646,30 @@ mutable struct FomSolver{T,FC,S} <: KrylovSolver{T,FC,S}
U :: Vector{FC}
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function FomSolver(n, m, memory, S)
- memory = min(n, memory)
- FC = eltype(S)
- T = real(FC)
- Δx = S(undef, 0)
- x = S(undef, n)
- w = S(undef, n)
- p = S(undef, 0)
- q = S(undef, 0)
- V = [S(undef, n) for i = 1 : memory]
- l = Vector{FC}(undef, memory)
- z = Vector{FC}(undef, memory)
- U = Vector{FC}(undef, div(memory * (memory+1), 2))
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(Δx, x, w, p, q, V, l, z, U, false, stats)
- return solver
- end
+function FomSolver(m, n, memory, S)
+ memory = min(m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ Δx = S(undef, 0)
+ x = S(undef, n)
+ w = S(undef, n)
+ p = S(undef, 0)
+ q = S(undef, 0)
+ V = S[S(undef, n) for i = 1 : memory]
+ l = Vector{FC}(undef, memory)
+ z = Vector{FC}(undef, memory)
+ U = Vector{FC}(undef, div(memory * (memory+1), 2))
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = FomSolver{T,FC,S}(m, n, Δx, x, w, p, q, V, l, z, U, false, stats)
+ return solver
+end
- function FomSolver(A, b, memory = 20)
- n, m = size(A)
- S = ktypeof(b)
- FomSolver(n, m, memory, S)
- end
+function FomSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ FomSolver(m, n, memory, S)
end
"""
@@ -1557,13 +1677,15 @@ Type for storing the vectors required by the in-place version of GPMR.
The outer constructors
- solver = GpmrSolver(n, m, memory, S)
+ solver = GpmrSolver(m, n, memory, S)
solver = GpmrSolver(A, b, memory = 20)
may be used in order to create these vectors.
`memory` is set to `n + m` if the value given is larger than `n + m`.
"""
mutable struct GpmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
+ m :: Int
+ n :: Int
wA :: S
wB :: S
dA :: S
@@ -1582,37 +1704,37 @@ mutable struct GpmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
R :: Vector{FC}
warm_start :: Bool
stats :: SimpleStats{T}
+end
- function GpmrSolver(n, m, memory, S)
- memory = min(n + m, memory)
- FC = eltype(S)
- T = real(FC)
- wA = S(undef, 0)
- wB = S(undef, 0)
- dA = S(undef, n)
- dB = S(undef, m)
- Δx = S(undef, 0)
- Δy = S(undef, 0)
- x = S(undef, n)
- y = S(undef, m)
- q = S(undef, 0)
- p = S(undef, 0)
- V = [S(undef, n) for i = 1 : memory]
- U = [S(undef, m) for i = 1 : memory]
- gs = Vector{FC}(undef, 4 * memory)
- gc = Vector{T}(undef, 4 * memory)
- zt = Vector{FC}(undef, 2 * memory)
- R = Vector{FC}(undef, memory * (2memory + 1))
- stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
- solver = new{T,FC,S}(wA, wB, dA, dB, Δx, Δy, x, y, q, p, V, U, gs, gc, zt, R, false, stats)
- return solver
- end
+function GpmrSolver(m, n, memory, S)
+ memory = min(n + m, memory)
+ FC = eltype(S)
+ T = real(FC)
+ wA = S(undef, 0)
+ wB = S(undef, 0)
+ dA = S(undef, m)
+ dB = S(undef, n)
+ Δx = S(undef, 0)
+ Δy = S(undef, 0)
+ x = S(undef, m)
+ y = S(undef, n)
+ q = S(undef, 0)
+ p = S(undef, 0)
+ V = S[S(undef, m) for i = 1 : memory]
+ U = S[S(undef, n) for i = 1 : memory]
+ gs = Vector{FC}(undef, 4 * memory)
+ gc = Vector{T}(undef, 4 * memory)
+ zt = Vector{FC}(undef, 2 * memory)
+ R = Vector{FC}(undef, memory * (2 * memory + 1))
+ stats = SimpleStats(0, false, false, T[], T[], T[], "unknown")
+ solver = GpmrSolver{T,FC,S}(m, n, wA, wB, dA, dB, Δx, Δy, x, y, q, p, V, U, gs, gc, zt, R, false, stats)
+ return solver
+end
- function GpmrSolver(A, b, memory = 20)
- n, m = size(A)
- S = ktypeof(b)
- GpmrSolver(n, m, memory, S)
- end
+function GpmrSolver(A, b, memory = 20)
+ m, n = size(A)
+ S = ktypeof(b)
+ GpmrSolver(m, n, memory, S)
end
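
A sketch of the two-output workspace in use; `A`, `B`, `b` and `c` stand for the two operators and right-hand sides of a partitioned system:

    solver = GpmrSolver(A, b, 20)
    gpmr!(solver, A, B, b, c)
    x, y = solution(solver)  # GPMR returns two solution blocks
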
"""
@@ -1704,29 +1826,35 @@ for (KS, fun, nsol, nA, nAt, warm_start) in [
(MinresQlpSolver , :minres_qlp! , 1, 1, 0, true )
(QmrSolver , :qmr! , 1, 1, 1, true )
(GmresSolver , :gmres! , 1, 1, 0, true )
+ (FgmresSolver , :fgmres! , 1, 1, 0, true )
(FomSolver , :fom! , 1, 1, 0, true )
(GpmrSolver , :gpmr! , 2, 1, 0, true )
]
@eval begin
- @inline solve!(solver :: $KS, args...; kwargs...) = $(fun)(solver, args...; kwargs...)
- @inline statistics(solver :: $KS) = solver.stats
- @inline niterations(solver :: $KS) = solver.stats.niter
- @inline Aprod(solver :: $KS) = $nA * solver.stats.niter
- @inline Atprod(solver :: $KS) = $nAt * solver.stats.niter
+ size(solver :: $KS) = solver.m, solver.n
+ solve!(solver :: $KS, args...; kwargs...) = $(fun)(solver, args...; kwargs...)
+ statistics(solver :: $KS) = solver.stats
+ niterations(solver :: $KS) = solver.stats.niter
+ Aprod(solver :: $KS) = $nA * solver.stats.niter
+ Atprod(solver :: $KS) = $nAt * solver.stats.niter
if $KS == GpmrSolver
- @inline Bprod(solver :: $KS) = solver.stats.niter
+ Bprod(solver :: $KS) = solver.stats.niter
+ end
+ nsolution(solver :: $KS) = $nsol
+ if $nsol == 1
+ solution(solver :: $KS) = solver.x
+ solution(solver :: $KS, p :: Integer) = (p == 1) ? solution(solver) : error("solution(solver) has only one output.")
+ end
+ if $nsol == 2
+ solution(solver :: $KS) = solver.x, solver.y
+ solution(solver :: $KS, p :: Integer) = (1 ≤ p ≤ 2) ? solution(solver)[p] : error("solution(solver) has only two outputs.")
end
- @inline nsolution(solver :: $KS) = $nsol
- ($nsol == 1) && @inline solution(solver :: $KS) = solver.x
- ($nsol == 2) && @inline solution(solver :: $KS) = solver.x, solver.y
- ($nsol == 1) && @inline solution(solver :: $KS, p :: Integer) = (p == 1) ? solution(solver) : error("solution(solver) has only one output.")
- ($nsol == 2) && @inline solution(solver :: $KS, p :: Integer) = (1 ≤ p ≤ 2) ? solution(solver)[p] : error("solution(solver) has only two outputs.")
if $KS ∈ (BilqrSolver, TrilqrSolver)
- @inline issolved_primal(solver :: $KS) = solver.stats.solved_primal
- @inline issolved_dual(solver :: $KS) = solver.stats.solved_dual
- @inline issolved(solver :: $KS) = issolved_primal(solver) && issolved_dual(solver)
+ issolved_primal(solver :: $KS) = solver.stats.solved_primal
+ issolved_dual(solver :: $KS) = solver.stats.solved_dual
+ issolved(solver :: $KS) = issolved_primal(solver) && issolved_dual(solver)
else
- @inline issolved(solver :: $KS) = solver.stats.solved
+ issolved(solver :: $KS) = solver.stats.solved
end
if $warm_start
if $KS in (BilqrSolver, TrilqrSolver, TricgSolver, TrimrSolver, GpmrSolver)
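
Taken together, the methods generated by this loop give every workspace a uniform query API; a sketch on illustrative data:

    solver = QmrSolver(A, b)
    qmr!(solver, A, b)
    size(solver)         # (solver.m, solver.n), from the new fields
    niterations(solver)  # solver.stats.niter
    Aprod(solver)        # operator-vector products with A so far
    Atprod(solver)       # products with Aᴴ
    issolved(solver)
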
@@ -1758,6 +1886,29 @@ for (KS, fun, nsol, nA, nAt, warm_start) in [
end
end
+function ksizeof(attribute)
+ if isa(attribute, Vector{<:AbstractVector}) && !isempty(attribute)
+ # A vector of vectors is a vector of pointers in Julia.
+ # All vectors inside a vector have the same size in Krylov.jl
+ size_attribute = sizeof(attribute) + length(attribute) * ksizeof(attribute[1])
+ else
+ size_attribute = sizeof(attribute)
+ end
+ return size_attribute
+end
+
+function sizeof(stats_solver :: Union{KrylovStats, KrylovSolver})
+ type = typeof(stats_solver)
+ nfields = fieldcount(type)
+ storage = 0
+ for i = 1:nfields
+ field_i = getfield(stats_solver, i)
+ size_i = ksizeof(field_i)
+ storage += size_i
+ end
+ return storage
+end
+
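These two functions let the memory footprint of a workspace be audited; a sketch, assuming the new `sizeof` method is reachable through the module (how it interacts with `Base.sizeof` is not shown in this hunk):

    solver = GmresSolver(A, b, 20)
    Krylov.ksizeof(solver.V)  # bytes of the Krylov basis, vector of pointers included
    Krylov.sizeof(solver)     # total bytes over all fields
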
"""
show(io, solver; show_stats=true)
@@ -1765,38 +1916,40 @@ Statistics of `solver` are displayed if `show_stats` is set to true.
"""
function show(io :: IO, solver :: KrylovSolver{T,FC,S}; show_stats :: Bool=true) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
workspace = typeof(solver)
- name_solver = workspace.name.wrapper
- l1 = max(length(string(name_solver)), 10) # length("warm_start") = 10
- l2 = length(string(S)) + 8 # length("Vector{}") = 8
+ name_solver = string(workspace.name.name)
+ name_stats = string(typeof(solver.stats).name.name)
+ nbytes = sizeof(solver)
+ storage = format_bytes(nbytes)
architecture = S <: Vector ? "CPU" : "GPU"
- format = Printf.Format("│%$(l1)s│%$(l2)s│%18s│\n")
- format2 = Printf.Format("│%$(l1+1)s│%$(l2)s│%18s│\n")
- @printf(io, "┌%s┬%s┬%s┐\n", "─"^l1, "─"^l2, "─"^18)
- Printf.format(io, format, name_solver, "Precision: $FC", "Architecture: $architecture")
- @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^18)
+ l1 = max(length(name_solver), length(string(FC)) + 11) # length("Precision: ") = 11
+ nchar = workspace <: Union{CgLanczosShiftSolver, FomSolver, DiomSolver, DqgmresSolver, GmresSolver, FgmresSolver, GpmrSolver} ? 8 : 0 # length("Vector{}") = 8
+ l2 = max(ndigits(solver.m) + 7, length(architecture) + 14, length(string(S)) + nchar) # length("nrows: ") = 7 and length("Architecture: ") = 14
+ l2 = max(l2, length(name_stats) + 2 + length(string(T))) # length("{}") = 2
+ l3 = max(ndigits(solver.n) + 7, length(storage) + 9) # length("Storage: ") = 9 and length("cols: ") = 7
+ format = Printf.Format("│%$(l1)s│%$(l2)s│%$(l3)s│\n")
+ format2 = Printf.Format("│%$(l1+1)s│%$(l2)s│%$(l3)s│\n")
+ @printf(io, "┌%s┬%s┬%s┐\n", "─"^l1, "─"^l2, "─"^l3)
+ Printf.format(io, format, "$(name_solver)", "nrows: $(solver.m)", "ncols: $(solver.n)")
+ @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3)
+ Printf.format(io, format, "Precision: $FC", "Architecture: $architecture","Storage: $storage")
+ @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3)
Printf.format(io, format, "Attribute", "Type", "Size")
- @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^18)
- for i=1:fieldcount(workspace)-1 # show stats seperately
- type_i = fieldtype(workspace, i)
+ @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3)
+ for i=1:fieldcount(workspace)
name_i = fieldname(workspace, i)
- len = if type_i <: AbstractVector
- field_i = getfield(solver, name_i)
- ni = length(field_i)
- if eltype(type_i) <: AbstractVector
- "$(ni) x $(length(field_i[1]))"
- else
- length(field_i)
- end
- else
- 0
- end
- if (name_i in [:w̅, :w̄, :d̅]) && (VERSION < v"1.8.0-DEV")
- Printf.format(io, format2, string(name_i), type_i, len)
+ type_i = fieldtype(workspace, i)
+ field_i = getfield(solver, name_i)
+ size_i = ksizeof(field_i)
+ if (name_i::Symbol in [:w̅, :w̄, :d̅]) && (VERSION < v"1.8.0-DEV")
+ (size_i ≠ 0) && Printf.format(io, format2, string(name_i), type_i, format_bytes(size_i))
else
- Printf.format(io, format, string(name_i), type_i, len)
+ (size_i ≠ 0) && Printf.format(io, format, string(name_i), type_i, format_bytes(size_i))
end
end
- @printf(io, "└%s┴%s┴%s┘\n","─"^l1,"─"^l2,"─"^18)
- show_stats && show(io, solver.stats)
+ @printf(io, "└%s┴%s┴%s┘\n","─"^l1,"─"^l2,"─"^l3)
+ if show_stats
+ @printf(io, "\n")
+ show(io, solver.stats)
+ end
return nothing
end
diff --git a/src/krylov_stats.jl b/src/krylov_stats.jl
index a662fa0a0..392912895 100644
--- a/src/krylov_stats.jl
+++ b/src/krylov_stats.jl
@@ -1,3 +1,6 @@
+export KrylovStats, SimpleStats, LsmrStats, LanczosStats, LanczosShiftStats,
+SymmlqStats, AdjointStats, LNLQStats, LSLQStats
+
"Abstract type for statistics returned by a solver"
abstract type KrylovStats{T} end
@@ -21,6 +24,12 @@ mutable struct SimpleStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: SimpleStats)
+ empty!(stats.residuals)
+ empty!(stats.Aresiduals)
+ empty!(stats.Acond)
+end
+
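These explicit `reset!` methods replace the metaprogrammed versions removed near the bottom of this file; a sketch of the contract:

    stats = SimpleStats(0, false, false, [1.0], Float64[], Float64[], "unknown")
    reset!(stats)             # empties residuals, Aresiduals and Acond in place
    isempty(stats.residuals)  # true
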
"""
Type for statistics returned by LSMR. The attributes are:
- niter
@@ -47,6 +56,11 @@ mutable struct LsmrStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: LsmrStats)
+ empty!(stats.residuals)
+ empty!(stats.Aresiduals)
+end
+
"""
Type for statistics returned by CG-LANCZOS, the attributes are:
- niter
@@ -67,6 +81,10 @@ mutable struct LanczosStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: LanczosStats)
+ empty!(stats.residuals)
+end
+
"""
Type for statistics returned by CG-LANCZOS with shifts, the attributes are:
- niter
@@ -117,6 +135,13 @@ mutable struct SymmlqStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: SymmlqStats)
+ empty!(stats.residuals)
+ empty!(stats.residualscg)
+ empty!(stats.errors)
+ empty!(stats.errorscg)
+end
+
"""
Type for statistics returned by adjoint systems solvers BiLQR and TriLQR, the attributes are:
- niter
@@ -135,6 +160,11 @@ mutable struct AdjointStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: AdjointStats)
+ empty!(stats.residuals_primal)
+ empty!(stats.residuals_dual)
+end
+
"""
Type for statistics returned by the LNLQ method, the attributes are:
- niter
@@ -155,6 +185,12 @@ mutable struct LNLQStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: LNLQStats)
+ empty!(stats.residuals)
+ empty!(stats.error_bnd_x)
+ empty!(stats.error_bnd_y)
+end
+
"""
Type for statistics returned by the LSLQ method, the attributes are:
- niter
@@ -181,6 +217,14 @@ mutable struct LSLQStats{T} <: KrylovStats{T}
status :: String
end
+function reset!(stats :: LSLQStats)
+ empty!(stats.residuals)
+ empty!(stats.Aresiduals)
+ empty!(stats.err_lbnds)
+ empty!(stats.err_ubnds_lq)
+ empty!(stats.err_ubnds_cg)
+end
+
import Base.show
special_fields = Dict(
@@ -192,45 +236,24 @@ special_fields = Dict(
:err_ubnds_cg => "error bound CG",
)
-for f in ["Simple", "Lsmr", "Adjoint", "LNLQ", "LSLQ", "Lanczos", "Symmlq"]
- T = Meta.parse("Krylov." * f * "Stats{S}")
-
- @eval function empty_field!(stats :: $T, i, ::Type{Vector{Si}}) where {S, Si}
- statfield = getfield(stats, i)
- empty!(statfield)
- end
- @eval empty_field!(stats :: $T, i, type) where S = stats
-
- @eval function reset!(stats :: $T) where S
- nfield = length($T.types)
- for i = 1 : nfield
- type = fieldtype($T, i)
- empty_field!(stats, i, type)
+function show(io :: IO, stats :: KrylovStats)
+ kst = typeof(stats)
+ s = string(kst.name.name) * "\n"
+ nfield = fieldcount(kst)
+ for i = 1 : nfield
+ field = fieldname(kst, i)
+ field_name = if field ∈ keys(special_fields)
+ special_fields[field]
+ else
+ replace(string(field), "_" => " ")
end
- end
-end
-
-for f in ["Simple", "Lsmr", "Lanczos", "LanczosShift", "Symmlq", "Adjoint", "LNLQ", "LSLQ"]
- T = Meta.parse("Krylov." * f * "Stats{S}")
-
- @eval function show(io :: IO, stats :: $T) where S
- s = $f * " stats\n"
- nfield = length($T.types)
- for i = 1 : nfield
- field = fieldname($T, i)
- field_name = if field ∈ keys(special_fields)
- special_fields[field]
- else
- replace(string(field), "_" => " ")
- end
- s *= " " * field_name * ":"
- statfield = getfield(stats, field)
- if isa(statfield, AbstractVector) && eltype(statfield) <: Union{Missing, AbstractFloat}
- s *= @sprintf " %s\n" vec2str(statfield)
- else
- s *= @sprintf " %s\n" statfield
- end
+ s *= " " * field_name * ":"
+ statfield = getfield(stats, field)
+ if isa(statfield, AbstractVector) && eltype(statfield) <: Union{Missing, AbstractFloat}
+ s *= @sprintf " %s\n" vec2str(statfield)
+ else
+ s *= @sprintf " %s\n" statfield
end
- print(io, s)
end
+ print(io, s)
end
diff --git a/src/krylov_utils.jl b/src/krylov_utils.jl
index 6f0c1c382..6049f9c28 100644
--- a/src/krylov_utils.jl
+++ b/src/krylov_utils.jl
@@ -1,3 +1,8 @@
+export kstdout
+
+"Default I/O stream for all Krylov methods."
+const kstdout = Core.stdout
+
"""
FloatOrComplex{T}
Union type of `T` and `Complex{T}` where T is an `AbstractFloat`.
@@ -92,8 +97,8 @@ function sym_givens(a :: Complex{T}, b :: Complex{T}) where T <: AbstractFloat
return (c, s, ρ)
end
-@inline sym_givens(a :: Complex{T}, b :: T) where T <: AbstractFloat = sym_givens(a, Complex{T}(b))
-@inline sym_givens(a :: T, b :: Complex{T}) where T <: AbstractFloat = sym_givens(Complex{T}(a), b)
+sym_givens(a :: Complex{T}, b :: T) where T <: AbstractFloat = sym_givens(a, Complex{T}(b))
+sym_givens(a :: T, b :: Complex{T}) where T <: AbstractFloat = sym_givens(Complex{T}(a), b)
"""
roots = roots_quadratic(q₂, q₁, q₀; nitref)
@@ -111,68 +116,86 @@ function roots_quadratic(q₂ :: T, q₁ :: T, q₀ :: T;
# Case where q(x) is linear.
if q₂ == zero(T)
if q₁ == zero(T)
- root = [zero(T)]
- q₀ == zero(T) || (root = T[])
+ q₀ == zero(T) || error("The quadratic `q` doesn't have real roots.")
+ root = zero(T)
else
- root = [-q₀ / q₁]
+ root = -q₀ / q₁
end
- return root
+ return (root, root)
end
# Case where q(x) is indeed quadratic.
rhs = √eps(T) * q₁ * q₁
if abs(q₀ * q₂) > rhs
ρ = q₁ * q₁ - 4 * q₂ * q₀
- ρ < 0 && return T[]
+ ρ < 0 && return error("The quadratic `q` doesn't have real roots.")
d = -(q₁ + copysign(sqrt(ρ), q₁)) / 2
- roots = [d / q₂, q₀ / d]
+ root1 = d / q₂
+ root2 = q₀ / d
else
# Ill-conditioned quadratic.
- roots = [-q₁ / q₂, zero(T)]
+ root1 = -q₁ / q₂
+ root2 = zero(T)
end
# Perform a few Newton iterations to improve accuracy.
- for k = 1 : 2
- root = roots[k]
- for it = 1 : nitref
- q = (q₂ * root + q₁) * root + q₀
- dq = 2 * q₂ * root + q₁
- dq == zero(T) && continue
- root = root - q / dq
- end
- roots[k] = root
+ for it = 1 : nitref
+ q = (q₂ * root1 + q₁) * root1 + q₀
+ dq = 2 * q₂ * root1 + q₁
+ dq == zero(T) && continue
+ root1 = root1 - q / dq
end
- return roots
-end
+ for it = 1 : nitref
+ q = (q₂ * root2 + q₁) * root2 + q₀
+ dq = 2 * q₂ * root2 + q₁
+ dq == zero(T) && continue
+ root2 = root2 - q / dq
+ end
+ return (root1, root2)
+end
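
The rewrite returns a 2-tuple instead of a vector and raises an error when no real root exists; a quick check of the new contract on x² - 3x + 2 and x² + 1:

    Krylov.roots_quadratic(1.0, -3.0, 2.0)  # (2.0, 1.0)
    Krylov.roots_quadratic(1.0, 0.0, 1.0)   # throws: no real roots
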
"""
- roots = to_boundary(x, d, radius; flip, xNorm2, dNorm2)
-
-Given a trust-region radius `radius`, a vector `x` lying inside the
-trust-region and a direction `d`, return `σ1` and `σ2` such that
-
- ‖x + σi d‖ = radius, i = 1, 2
+ s = vec2str(x; ndisp)
-in the Euclidean norm. If known, ‖x‖² may be supplied in `xNorm2`.
+Display an array in the form
-If `flip` is set to `true`, `σ1` and `σ2` are computed such that
+ [ -3.0e-01 -5.1e-01 1.9e-01 ... -2.3e-01 -4.4e-01 2.4e-01 ]
- ‖x - σi d‖ = radius, i = 1, 2.
+with (ndisp - 1)/2 elements on each side.
"""
-function to_boundary(x :: Vector{T}, d :: Vector{T},
- radius :: T; flip :: Bool=false, xNorm2 :: T=zero(T), dNorm2 :: T=zero(T)) where T <: Number
- radius > 0 || error("radius must be positive")
-
- # ‖d‖² σ² + 2 xᵀd σ + (‖x‖² - radius²).
- xd = dot(x, d)
- flip && (xd = -xd)
- dNorm2 == zero(T) && (dNorm2 = dot(d, d))
- dNorm2 == zero(T) && error("zero direction")
- xNorm2 == zero(T) && (xNorm2 = dot(x, x))
- (xNorm2 ≤ radius * radius) || error(@sprintf("outside of the trust region: ‖x‖²=%7.1e, Δ²=%7.1e", xNorm2, radius * radius))
- roots = roots_quadratic(dNorm2, 2 * xd, xNorm2 - radius * radius)
- return roots # `σ1` and `σ2`
+function vec2str(x :: AbstractVector{T}; ndisp :: Int=7) where T <: Union{AbstractFloat, Missing}
+ n = length(x)
+ if n ≤ ndisp
+ ndisp = n
+ nside = n
+ else
+ nside = max(1, div(ndisp - 1, 2))
+ end
+ s = "["
+ i = 1
+ while i ≤ nside
+ if x[i] !== missing
+ s *= @sprintf("%8.1e ", x[i])
+ else
+ s *= " ✗✗✗✗ "
+ end
+ i += 1
+ end
+ if i ≤ div(n, 2)
+ s *= "... "
+ end
+ i = max(i, n - nside + 1)
+ while i ≤ n
+ if x[i] !== missing
+ s *= @sprintf("%8.1e ", x[i])
+ else
+ s *= " ✗✗✗✗ "
+ end
+ i += 1
+ end
+ s *= "]"
+ return s
end
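
A sketch of the truncated display this helper produces (spacing approximate):

    Krylov.vec2str(collect(1.0:10.0))
    # "[ 1.0e+00  2.0e+00  3.0e+00 ...  8.0e+00  9.0e+00  1.0e+01 ]"
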
"""
@@ -201,84 +224,125 @@ function ktypeof(v::S) where S <: AbstractVector
end
function ktypeof(v::S) where S <: SubArray
- return ktypeof(v.parent)
+ vp = v.parent
+ if isa(vp, DenseMatrix)
+ M = typeof(vp)
+ return matrix_to_vector(M) # view of a row or a column of a matrix
+ else
+ return ktypeof(vp) # view of a vector
+ end
+end
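
With this branch, a view of a matrix row or column now maps to the matching vector storage type instead of recursing on the matrix type; a sketch:

    A = rand(4, 3); b = rand(6)
    Krylov.ktypeof(view(A, :, 2))  # Vector{Float64}, via matrix_to_vector
    Krylov.ktypeof(view(b, 1:3))   # Vector{Float64}, via ktypeof(v.parent)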
+
+"""
+ M = vector_to_matrix(S)
+
+Return the dense matrix storage type `M` related to the dense vector storage type `S`.
+"""
+function vector_to_matrix(::Type{S}) where S <: DenseVector
+ T = hasproperty(S, :body) ? S.body : S
+ par = T.parameters
+ npar = length(par)
+ (2 ≤ npar ≤ 3) || error("Type $S is not supported.")
+ if npar == 2
+ M = T.name.wrapper{par[1], 2}
+ else
+ M = T.name.wrapper{par[1], 2, par[3]}
+ end
+ return M
+end
+
+"""
+ S = matrix_to_vector(M)
+
+Return the dense vector storage type `S` related to the dense matrix storage type `M`.
+"""
+function matrix_to_vector(::Type{M}) where M <: DenseMatrix
+ T = hasproperty(M, :body) ? M.body : M
+ par = T.parameters
+ npar = length(par)
+ (2 ≤ npar ≤ 3) || error("Type $M is not supported.")
+ if npar == 2
+ S = T.name.wrapper{par[1], 1}
+ else
+ S = T.name.wrapper{par[1], 1, par[3]}
+ end
+ return S
end
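
The two conversions are inverses on the dense storage types they accept; the 3-parameter branch targets types such as GPU arrays whose third parameter is a buffer type. A sketch:

    vector_to_matrix(Vector{Float64})  # Matrix{Float64}
    matrix_to_vector(Matrix{Float32})  # Vector{Float32}
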
"""
v = kzeros(S, n)
-Create an AbstractVector of storage type `S` of length `n` only composed of zero.
+Create a vector of storage type `S` of length `n` only composed of zero.
"""
-@inline kzeros(S, n) = fill!(S(undef, n), zero(eltype(S)))
+kzeros(S, n) = fill!(S(undef, n), zero(eltype(S)))
"""
v = kones(S, n)
-Create an AbstractVector of storage type `S` of length `n` only composed of one.
+Create a vector of storage type `S` of length `n` only composed of one.
"""
-@inline kones(S, n) = fill!(S(undef, n), one(eltype(S)))
+kones(S, n) = fill!(S(undef, n), one(eltype(S)))
-@inline allocate_if(bool, solver, v, S, n) = bool && isempty(solver.:($v)) && (solver.:($v) = S(undef, n))
+allocate_if(bool, solver, v, S, n) = bool && isempty(solver.:($v)::S) && (solver.:($v)::S = S(undef, n))
-@inline kdisplay(iter, verbose) = (verbose > 0) && (mod(iter, verbose) == 0)
+kdisplay(iter, verbose) = (verbose > 0) && (mod(iter, verbose) == 0)
-@inline mulorldiv!(y, P, x, ldiv::Bool) = ldiv ? ldiv!(y, P, x) : mul!(y, P, x)
+mulorldiv!(y, P, x, ldiv::Bool) = ldiv ? ldiv!(y, P, x) : mul!(y, P, x)
-@inline krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)
-@inline krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasComplex = BLAS.dotc(n, x, dx, y, dy)
-@inline krylov_dot(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: Number = dot(x, y)
+kdot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)
+kdot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasComplex = BLAS.dotc(n, x, dx, y, dy)
+kdot(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = dot(x, y)
-@inline krylov_dotr(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: AbstractFloat = krylov_dot(n, x, dx, y, dy)
-@inline krylov_dotr(n :: Integer, x :: AbstractVector{Complex{T}}, dx :: Integer, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = real(krylov_dot(n, x, dx, y, dy))
+kdotr(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: AbstractFloat = kdot(n, x, dx, y, dy)
+kdotr(n :: Integer, x :: AbstractVector{Complex{T}}, dx :: Integer, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = real(kdot(n, x, dx, y, dy))
-@inline krylov_norm2(n :: Integer, x :: Vector{T}, dx :: Integer) where T <: BLAS.BlasFloat = BLAS.nrm2(n, x, dx)
-@inline krylov_norm2(n :: Integer, x :: AbstractVector{T}, dx :: Integer) where T <: Number = norm(x)
+knrm2(n :: Integer, x :: Vector{T}, dx :: Integer) where T <: BLAS.BlasFloat = BLAS.nrm2(n, x, dx)
+knrm2(n :: Integer, x :: AbstractVector{T}, dx :: Integer) where T <: FloatOrComplex = norm(x)
-@inline krylov_scal!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer) where T <: BLAS.BlasFloat = BLAS.scal!(n, s, x, dx)
-@inline krylov_scal!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer) where T <: Number = (x .*= s)
-@inline krylov_scal!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer) where T <: AbstractFloat = krylov_scal!(n, Complex{T}(s), x, dx)
+kscal!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer) where T <: BLAS.BlasFloat = BLAS.scal!(n, s, x, dx)
+kscal!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer) where T <: FloatOrComplex = (x .*= s)
+kscal!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer) where T <: AbstractFloat = kscal!(n, Complex{T}(s), x, dx)
-@inline krylov_axpy!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.axpy!(n, s, x, dx, y, dy)
-@inline krylov_axpy!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: Number = axpy!(s, x, y)
-@inline krylov_axpy!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = krylov_axpy!(n, Complex{T}(s), x, dx, y, dy)
+kaxpy!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.axpy!(n, s, x, dx, y, dy)
+kaxpy!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = axpy!(s, x, y)
+kaxpy!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = kaxpy!(n, Complex{T}(s), x, dx, y, dy)
-@inline krylov_axpby!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer, t :: T, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.axpby!(n, s, x, dx, t, y, dy)
-@inline krylov_axpby!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer, t :: T, y :: AbstractVector{T}, dy :: Integer) where T <: Number = axpby!(s, x, t, y)
-@inline krylov_axpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: Complex{T}, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = krylov_axpby!(n, Complex{T}(s), x, dx, t, y, dy)
-@inline krylov_axpby!(n :: Integer, s :: Complex{T}, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: T, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = krylov_axpby!(n, s, x, dx, Complex{T}(t), y, dy)
-@inline krylov_axpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: T, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = krylov_axpby!(n, Complex{T}(s), x, dx, Complex{T}(t), y, dy)
+kaxpby!(n :: Integer, s :: T, x :: Vector{T}, dx :: Integer, t :: T, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.axpby!(n, s, x, dx, t, y, dy)
+kaxpby!(n :: Integer, s :: T, x :: AbstractVector{T}, dx :: Integer, t :: T, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = axpby!(s, x, t, y)
+kaxpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: Complex{T}, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = kaxpby!(n, Complex{T}(s), x, dx, t, y, dy)
+kaxpby!(n :: Integer, s :: Complex{T}, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: T, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = kaxpby!(n, s, x, dx, Complex{T}(t), y, dy)
+kaxpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t :: T, y :: AbstractVector{Complex{T}}, dy :: Integer) where T <: AbstractFloat = kaxpby!(n, Complex{T}(s), x, dx, Complex{T}(t), y, dy)
-@inline krylov_copy!(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.blascopy!(n, x, dx, y, dy)
-@inline krylov_copy!(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: Number = copyto!(y, x)
+kcopy!(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.blascopy!(n, x, dx, y, dy)
+kcopy!(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = copyto!(y, x)
# the macros are just for readability, so we don't have to write the increments (always equal to 1)
-
macro kdot(n, x, y)
- return esc(:(krylov_dot($n, $x, 1, $y, 1)))
+ return esc(:(Krylov.kdot($n, $x, 1, $y, 1)))
end
macro kdotr(n, x, y)
- return esc(:(krylov_dotr($n, $x, 1, $y, 1)))
+ return esc(:(Krylov.kdotr($n, $x, 1, $y, 1)))
end
macro knrm2(n, x)
- return esc(:(krylov_norm2($n, $x, 1)))
+ return esc(:(Krylov.knrm2($n, $x, 1)))
end
macro kscal!(n, s, x)
- return esc(:(krylov_scal!($n, $s, $x, 1)))
+ return esc(:(Krylov.kscal!($n, $s, $x, 1)))
end
macro kaxpy!(n, s, x, y)
- return esc(:(krylov_axpy!($n, $s, $x, 1, $y, 1)))
+ return esc(:(Krylov.kaxpy!($n, $s, $x, 1, $y, 1)))
end
macro kaxpby!(n, s, x, t, y)
- return esc(:(krylov_axpby!($n, $s, $x, 1, $t, $y, 1)))
+ return esc(:(Krylov.kaxpby!($n, $s, $x, 1, $t, $y, 1)))
end
macro kcopy!(n, x, y)
- return esc(:(krylov_copy!($n, $x, 1, $y, 1)))
+ return esc(:(Krylov.kcopy!($n, $x, 1, $y, 1)))
end
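
A sketch of a call site after the renaming; the macros only hide the unit strides:

    x = rand(5); y = rand(5)
    Krylov.@kaxpy!(5, 2.0, x, y)  # Krylov.kaxpy!(5, 2.0, x, 1, y, 1), i.e. y ← 2x + y
    Krylov.@kdot(5, x, y)         # Krylov.kdot(5, x, 1, y, 1)
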
macro kswap(x, y)
@@ -294,44 +358,35 @@ macro kref!(n, x, y, c, s)
end
"""
- s = vec2str(x; ndisp)
+ roots = to_boundary(n, x, d, radius; flip, xNorm2, dNorm2)
-Display an array in the form
+Given a trust-region radius `radius`, a vector `x` lying inside the
+trust-region and a direction `d`, return `σ1` and `σ2` such that
- [ -3.0e-01 -5.1e-01 1.9e-01 ... -2.3e-01 -4.4e-01 2.4e-01 ]
+ ‖x + σi d‖ = radius, i = 1, 2
-with (ndisp - 1)/2 elements on each side.
+in the Euclidean norm.
+`n` is the length of vectors `x` and `d`.
+If known, ‖x‖² and ‖d‖² may be supplied with `xNorm2` and `dNorm2`.
+
+If `flip` is set to `true`, `σ1` and `σ2` are computed such that
+
+ ‖x - σi d‖ = radius, i = 1, 2.
"""
-function vec2str(x :: AbstractVector{T}; ndisp :: Int=7) where T <: Union{AbstractFloat, Missing}
- n = length(x)
- if n ≤ ndisp
- ndisp = n
- nside = n
- else
- nside = max(1, div(ndisp - 1, 2))
- end
- s = "["
- i = 1
- while i ≤ nside
- if x[i] !== missing
- s *= @sprintf("%8.1e ", x[i])
- else
- s *= " ✗✗✗✗ "
- end
- i += 1
- end
- if i ≤ div(n, 2)
- s *= "... "
- end
- i = max(i, n - nside + 1)
- while i ≤ n
- if x[i] !== missing
- s *= @sprintf("%8.1e ", x[i])
- else
- s *= " ✗✗✗✗ "
- end
- i += 1
- end
- s *= "]"
- return s
+function to_boundary(n :: Int, x :: AbstractVector{FC}, d :: AbstractVector{FC}, radius :: T; flip :: Bool=false, xNorm2 :: T=zero(T), dNorm2 :: T=zero(T)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}}
+ radius > 0 || error("radius must be positive")
+
+ # ‖d‖² σ² + (xᴴd + dᴴx) σ + (‖x‖² - Δ²).
+ rxd = @kdotr(n, x, d)
+ flip && (rxd = -rxd)
+ dNorm2 == zero(T) && (dNorm2 = @kdotr(n, d, d))
+ dNorm2 == zero(T) && error("zero direction")
+ xNorm2 == zero(T) && (xNorm2 = @kdotr(n, x, x))
+ radius2 = radius * radius
+ (xNorm2 ≤ radius2) || error(@sprintf("outside of the trust region: ‖x‖²=%7.1e, Δ²=%7.1e", xNorm2, radius2))
+
+ # q₂ = ‖d‖², q₁ = xᴴd + dᴴx, q₀ = ‖x‖² - Δ²
+ # ‖x‖² ≤ Δ² ⟹ (q₁)² - 4 * q₂ * q₀ ≥ 0
+ roots = roots_quadratic(dNorm2, 2 * rxd, xNorm2 - radius2)
+ return roots # `σ1` and `σ2`
end
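A quick way to exercise the new `to_boundary` signature (a minimal sketch; the vectors and radius are illustrative, and the ordering of the two roots depends on `roots_quadratic`):

    using Krylov
    n = 2
    x = [0.5, 0.0]      # strictly inside the trust region of radius 1
    d = [1.0, 0.0]
    σ1, σ2 = Krylov.to_boundary(n, x, d, 1.0)
    # roots of σ² + σ - 0.75 = 0, i.e. 0.5 and -1.5:
    # ‖x + 0.5d‖ = ‖x - 1.5d‖ = 1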
diff --git a/src/lnlq.jl b/src/lnlq.jl
index a1f890de2..deda7336f 100644
--- a/src/lnlq.jl
+++ b/src/lnlq.jl
@@ -9,9 +9,9 @@
# and is equivalent to applying the SYMMLQ method
# to the linear system
#
-# AAᵀy = b with x = Aᵀy and can be reformulated as
+# AAᴴy = b with x = Aᴴy and can be reformulated as
#
-# [ -I Aᵀ ][ x ] = [ 0 ]
+# [ -I Aᴴ ][ x ] = [ 0 ]
# [ A ][ y ] [ b ].
#
# This method is based on the Golub-Kahan bidiagonalization process and is described in
@@ -26,10 +26,14 @@ export lnlq, lnlq!
"""
(x, y, stats) = lnlq(A, b::AbstractVector{FC};
- M=I, N=I, sqd::Bool=false, λ::T=zero(T), σ::T=zero(T),
- atol::T=√eps(T), rtol::T=√eps(T), etolx::T=√eps(T), etoly::T=√eps(T), itmax::Int=0,
- transfer_to_craig::Bool=true, verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, N=I, ldiv::Bool=false,
+ transfer_to_craig::Bool=true,
+ sqd::Bool=false, λ::T=zero(T),
+ σ::T=zero(T), utolx::T=√eps(T),
+ utoly::T=√eps(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -38,17 +42,17 @@ Find the least-norm solution of the consistent linear system
Ax + λ²y = b
-using the LNLQ method, where λ ≥ 0 is a regularization parameter.
+of size m × n using the LNLQ method, where λ ≥ 0 is a regularization parameter.
For a system in the form Ax = b, the LNLQ method is equivalent to applying
-SYMMLQ to AAᵀy = b and recovering x = Aᵀy but is more stable.
+SYMMLQ to AAᴴy = b and recovering x = Aᴴy but is more stable.
Note that y are the Lagrange multipliers of the least-norm problem
minimize ‖x‖ s.t. Ax = b.
If `λ > 0`, LNLQ solves the symmetric and quasi-definite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A λ²E ] [ y ] = [ b ],
where E and F are symmetric and positive definite.
@@ -59,12 +63,12 @@ The system above represents the optimality conditions of
min ‖x‖²_F + λ²‖y‖²_E s.t. Ax + λ²Ey = b.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-LNLQ is then equivalent to applying SYMMLQ to `(AF⁻¹Aᵀ + λ²E)y = b` with `Fx = Aᵀy`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+LNLQ is then equivalent to applying SYMMLQ to `(AF⁻¹Aᴴ + λ²E)y = b` with `Fx = Aᴴy`.
If `λ = 0`, LNLQ solves the symmetric and indefinite system
- [ -F Aᵀ ] [ x ] [ 0 ]
+ [ -F Aᴴ ] [ x ] [ 0 ]
[ A 0 ] [ y ] = [ b ].
The system above represents the optimality conditions of
@@ -75,12 +79,39 @@ In this case, `M` can still be specified and indicates the weighted norm in whic
In this implementation, both the x and y-parts of the solution are returned.
-`etolx` and `etoly` are tolerances on the upper bound of the distance to the solution ‖x-xₛ‖ and ‖y-yₛ‖, respectively.
+`utolx` and `utoly` are tolerances on the upper bound of the distance to the solution ‖x-x*‖ and ‖y-y*‖, respectively.
The bound is valid if λ>0 or σ>0, where σ should be strictly smaller than the smallest positive singular value.
For instance, σ := (1-1e-7)σₘᵢₙ.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `transfer_to_craig`: transfer from the LNLQ point to the CRAIG point, when it exists. The transfer is based on the residual norm;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `σ`: strict lower bound on the smallest positive singular value `σₘᵢₙ` such as `σ = (1-10⁻⁷)σₘᵢₙ`;
+* `utolx`: tolerance on the upper bound on the distance to the solution `‖x-x*‖`;
+* `utoly`: tolerance on the upper bound on the distance to the solution `‖y-y*‖`;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `y`: a dense vector of length m;
+* `stats`: statistics collected on the run in a [`LNLQStats`](@ref) structure.
#### Reference
@@ -104,14 +135,18 @@ See [`LnlqSolver`](@ref) for more details about the `solver`.
function lnlq! end
function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T), σ :: T=zero(T),
- atol :: T=√eps(T), rtol :: T=√eps(T), etolx :: T=√eps(T), etoly :: T=√eps(T), itmax :: Int=0,
- transfer_to_craig :: Bool=true, verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ M=I, N=I, ldiv :: Bool=false,
+ transfer_to_craig :: Bool=true,
+ sqd :: Bool=false, λ :: T=zero(T),
+ σ :: T=zero(T), utolx :: T=√eps(T),
+ utoly :: T=√eps(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("LNLQ: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "LNLQ: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -123,16 +158,16 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u, S, m)
allocate_if(!NisI, solver, :v, S, n)
allocate_if(λ > 0, solver, :q, S, n)
- x, Nv, Aᵀu, y, w̄ = solver.x, solver.Nv, solver.Aᵀu, solver.y, solver.w̄
+ x, Nv, Aᴴu, y, w̄ = solver.x, solver.Nv, solver.Aᴴu, solver.y, solver.w̄
Mu, Av, q, stats = solver.Mu, solver.Av, solver.q, solver.stats
rNorms, xNorms, yNorms = stats.residuals, stats.error_bnd_x, stats.error_bnd_y
reset!(stats)
@@ -163,8 +198,8 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = m + n)
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, bNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, bNorm)
# Update iteration index
iter = iter + 1
@@ -179,9 +214,9 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
MisI || @kscal!(m, one(FC) / βₖ, Mu)
end
- # α₁Nv₁ = Aᵀu₁.
- mul!(Aᵀu, Aᵀ, u)
- Nv .= Aᵀu
+ # α₁Nv₁ = Aᴴu₁.
+ mul!(Aᴴu, Aᴴ, u)
+ Nv .= Aᴴu
NisI || mulorldiv!(v, N, Nv, ldiv) # v₁ = N⁻¹ * Nv₁
αₖ = sqrt(@kdotr(n, v, Nv)) # α₁ = ‖v₁‖_N
if αₖ ≠ 0
@@ -190,8 +225,8 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
end
w̄ .= u # Direction w̄₁
- cₖ = zero(T) # Givens cosines used for the LQ factorization of (Lₖ)ᵀ
- sₖ = zero(FC) # Givens sines used for the LQ factorization of (Lₖ)ᵀ
+ cₖ = zero(T) # Givens cosines used for the LQ factorization of (Lₖ)ᴴ
+ sₖ = zero(FC) # Givens sines used for the LQ factorization of (Lₖ)ᴴ
ζₖ₋₁ = zero(FC) # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ
ηₖ = zero(FC) # Coefficient of M̅ₖ
@@ -214,7 +249,7 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
αhatₖ = αₖ
end
- # Begin the LQ factorization of (Lₖ)ᵀ = M̅ₖQₖ.
+ # Begin the LQ factorization of (Lₖ)ᴴ = M̅ₖQₖ.
# [ α₁ β₂ 0 • • • 0 ] [ ϵ₁ 0 • • • • 0 ]
# [ 0 α₂ • • • ] [ η₂ ϵ₂ • • ]
# [ • • • • • • ] [ 0 • • • • ]
@@ -225,7 +260,7 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
ϵbarₖ = αhatₖ # ϵbar₁ = αhat₁
- # Hₖ = Bₖ(Lₖ)ᵀ = [ Lₖ(Lₖ)ᵀ ] ⟹ (Hₖ₋₁)ᵀ = [Lₖ₋₁Mₖ₋₁ 0] Qₖ
+ # Hₖ = Bₖ(Lₖ)ᴴ = [ Lₖ(Lₖ)ᴴ ] ⟹ (Hₖ₋₁)ᴴ = [Lₖ₋₁Mₖ₋₁ 0] Qₖ
# [ αₖβₖ₊₁(eₖ)ᵀ ]
#
# Solve Lₖtₖ = β₁e₁ and M̅ₖz̅ₖ = tₖ
@@ -247,7 +282,7 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
err_x = τtildeₖ
err_y = ζtildeₖ
- solved_lq = err_x ≤ etolx || err_y ≤ etoly
+ solved_lq = err_x ≤ utolx || err_y ≤ utoly
history && push!(xNorms, err_x)
history && push!(yNorms, err_y)
@@ -273,7 +308,7 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Continue the generalized Golub-Kahan bidiagonalization.
# AVₖ = MUₖ₊₁Bₖ
- # AᵀUₖ₊₁ = NVₖ(Bₖ)ᵀ + αₖ₊₁Nvₖ₊₁(eₖ₊₁)ᵀ = NVₖ₊₁(Lₖ₊₁)ᵀ
+ # AᴴUₖ₊₁ = NVₖ(Bₖ)ᴴ + αₖ₊₁Nvₖ₊₁(eₖ₊₁)ᴴ = NVₖ₊₁(Lₖ₊₁)ᴴ
#
# [ α₁ 0 • • • • 0 ]
# [ β₂ α₂ • • ]
@@ -296,9 +331,9 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
MisI || @kscal!(m, one(FC) / βₖ₊₁, Mu)
end
- # αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -βₖ₊₁, Nv)
+ # αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -βₖ₊₁, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv) # vₖ₊₁ = N⁻¹ * Nvₖ₊₁
αₖ₊₁ = sqrt(@kdotr(n, v, Nv)) # αₖ₊₁ = ‖vₖ₊₁‖_N
if αₖ₊₁ ≠ 0
@@ -353,7 +388,7 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
ρbar = ssig * μbar + csig * σₑₛₜ
end
- # Continue the LQ factorization of (Lₖ₊₁)ᵀ.
+ # Continue the LQ factorization of (Lₖ₊₁)ᴴ.
# [ηₖ ϵbarₖ βₖ₊₁] [1 0 0 ] = [ηₖ ϵₖ 0 ]
# [0 0 αₖ₊₁] [0 cₖ₊₁ sₖ₊₁] [0 ηₖ₊₁ ϵbarₖ₊₁]
# [0 sₖ₊₁ -cₖ₊₁]
@@ -438,18 +473,15 @@ function lnlq!(solver :: LnlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
solved_lq = rNorm_lq ≤ ε
solved_cg = transfer_to_craig && rNorm_cg ≤ ε
if σₑₛₜ > 0
- if transfer_to_craig
- solved_cg = solved_cg || err_x ≤ etolx || err_y ≤ etoly
- else
- solved_lq = solved_lq || err_x ≤ etolx || err_y ≤ etoly
- end
+ solved_lq = solved_lq || err_x ≤ utolx || err_y ≤ utoly
+ solved_cg = transfer_to_craig && (solved_cg || err_x ≤ utolx || err_y ≤ utoly)
end
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm_lq)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm_lq)
# Update iteration index.
iter = iter + 1
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
if solved_cg
if λ > 0
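A minimal call against the revised `lnlq` keyword list (a sketch; the matrix and right-hand side are made up):

    using Krylov, LinearAlgebra
    A = [1.0 2.0 0.0; 0.0 1.0 1.0]   # 2 × 3 with full row rank, so Ax = b is consistent
    b = [3.0, 2.0]
    x, y, stats = lnlq(A, b; utolx=1e-10, utoly=1e-10, verbose=1, iostream=stdout)
    norm(A * x - b)                   # ≈ 0: x is the least-norm solution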
diff --git a/src/lslq.jl b/src/lslq.jl
index 908de19c5..4e26fb67a 100644
--- a/src/lslq.jl
+++ b/src/lslq.jl
@@ -5,7 +5,7 @@
#
# equivalently, of the normal equations
#
-# AᵀAx = Aᵀb.
+# AᴴAx = Aᴴb.
#
# LSLQ is formally equivalent to applying SYMMLQ to the normal equations
# but should be more stable.
@@ -21,15 +21,17 @@
export lslq, lslq!
-
"""
(x, stats) = lslq(A, b::AbstractVector{FC};
- M=I, N=I, sqd::Bool=false, λ::T=zero(T),
- atol::T=√eps(T), btol::T=√eps(T), etol::T=√eps(T),
- window::Int=5, utol::T=√eps(T), itmax::Int=0,
- σ::T=zero(T), transfer_to_lsqr::Bool=false,
- conlim::T=1/√eps(T), verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ M=I, N=I, ldiv::Bool=false,
+ window::Int=5, transfer_to_lsqr::Bool=false,
+ sqd::Bool=false, λ::T=zero(T),
+ σ::T=zero(T), etol::T=√eps(T),
+ utol::T=√eps(T), btol::T=√eps(T),
+ conlim::T=1/√eps(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -38,31 +40,17 @@ Solve the regularized linear least-squares problem
minimize ‖b - Ax‖₂² + λ²‖x‖₂²
-using the LSLQ method, where λ ≥ 0 is a regularization parameter.
+of size m × n using the LSLQ method, where λ ≥ 0 is a regularization parameter.
LSLQ is formally equivalent to applying SYMMLQ to the normal equations
- (AᵀA + λ²I) x = Aᵀb
+ (AᴴA + λ²I) x = Aᴴb
but is more stable.
-#### Main features
-
-* the solution estimate is updated along orthogonal directions
-* the norm of the solution estimate ‖xᴸₖ‖₂ is increasing
-* the error ‖eₖ‖₂ := ‖xᴸₖ - x*‖₂ is decreasing
-* it is possible to transition cheaply from the LSLQ iterate to the LSQR iterate if there is an advantage (there always is in terms of error)
-* if `A` is rank deficient, identify the minimum least-squares solution
-
-#### Optional arguments
-
-* `M`: a symmetric and positive definite dual preconditioner
-* `N`: a symmetric and positive definite primal preconditioner
-* `sqd` indicates that we are solving a symmetric and quasi-definite system with `λ=1`
-
If `λ > 0`, we solve the symmetric and quasi-definite system
[ E A ] [ r ] [ b ]
- [ Aᵀ -λ²F ] [ x ] = [ 0 ],
+ [ Aᴴ -λ²F ] [ x ] = [ 0 ],
where E and F are symmetric and positive definite.
Preconditioners M = E⁻¹ ≻ 0 and N = F⁻¹ ≻ 0 may be provided in the form of linear operators.
@@ -72,39 +60,60 @@ The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹ + λ²‖x‖²_F.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-LSLQ is then equivalent to applying SYMMLQ to `(AᵀE⁻¹A + λ²F)x = AᵀE⁻¹b` with `r = E⁻¹(b - Ax)`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+LSLQ is then equivalent to applying SYMMLQ to `(AᴴE⁻¹A + λ²F)x = AᴴE⁻¹b` with `r = E⁻¹(b - Ax)`.
If `λ = 0`, we solve the symmetric and indefinite system
[ E A ] [ r ] [ b ]
- [ Aᵀ 0 ] [ x ] = [ 0 ].
+ [ Aᴴ 0 ] [ x ] = [ 0 ].
The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹.
-In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᵀr` should be measured.
+In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᴴr` should be measured.
`r` can be recovered by computing `E⁻¹(b - Ax)`.
-* `λ` is a regularization parameter (see the problem statement above)
-* `σ` is an underestimate of the smallest nonzero singular value of `A`---setting `σ` too large will result in an error in the course of the iterations
-* `atol` is a stopping tolerance based on the residual
-* `btol` is a stopping tolerance used to detect zero-residual problems
-* `etol` is a stopping tolerance based on the lower bound on the error
-* `window` is the number of iterations used to accumulate a lower bound on the error
-* `utol` is a stopping tolerance based on the upper bound on the error
-* `transfer_to_lsqr` return the CG solution estimate (i.e., the LSQR point) instead of the LQ estimate
-* `itmax` is the maximum number of iterations (0 means no imposed limit)
-* `conlim` is the limit on the estimated condition number of `A` beyond which the solution will be abandoned
-* `verbose` determines verbosity.
-
-#### Return values
+#### Main features
-`lslq` returns the tuple `(x, stats)` where
+* the solution estimate is updated along orthogonal directions
+* the norm of the solution estimate ‖xᴸₖ‖₂ is increasing
+* the error ‖eₖ‖₂ := ‖xᴸₖ - x*‖₂ is decreasing
+* it is possible to transition cheaply from the LSLQ iterate to the LSQR iterate if there is an advantage (there always is in terms of error)
+* if `A` is rank deficient, identify the minimum least-squares solution
-* `x` is the LQ solution estimate
-* `stats` collects other statistics on the run in a LSLQStats
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `window`: number of iterations used to accumulate a lower bound on the error;
+* `transfer_to_lsqr`: transfer from the LSLQ point to the LSQR point, when it exists. The transfer is based on the residual norm;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `σ`: strict lower bound on the smallest positive singular value `σₘᵢₙ` such as `σ = (1-10⁻⁷)σₘᵢₙ`;
+* `etol`: stopping tolerance based on the lower bound on the error;
+* `utol`: stopping tolerance based on the upper bound on the error;
+* `btol`: stopping tolerance used to detect zero-residual problems;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`LSLQStats`](@ref) structure.
* `stats.err_lbnds` is a vector of lower bounds on the LQ error---the vector is empty if `window` is set to zero
* `stats.err_ubnds_lq` is a vector of upper bounds on the LQ error---the vector is empty if `σ` is left at zero
@@ -116,8 +125,8 @@ In this case, `N` can still be specified and indicates the weighted norm in whic
The iterations stop as soon as one of the following conditions holds true:
* the optimality residual is sufficiently small (`stats.status = "found approximate minimum least-squares solution"`) in the sense that either
- * ‖Aᵀr‖ / (‖A‖ ‖r‖) ≤ atol, or
- * 1 + ‖Aᵀr‖ / (‖A‖ ‖r‖) ≤ 1
+ * ‖Aᴴr‖ / (‖A‖ ‖r‖) ≤ atol, or
+ * 1 + ‖Aᴴr‖ / (‖A‖ ‖r‖) ≤ 1
* an approximate zero-residual solution has been found (`stats.status = "found approximate zero-residual solution"`) in the sense that either
* ‖r‖ / ‖b‖ ≤ btol + atol ‖A‖ * ‖xᴸ‖ / ‖b‖, or
* 1 + ‖r‖ / ‖b‖ ≤ 1
@@ -127,9 +136,6 @@ The iterations stop as soon as one of the following conditions holds true:
* the lower bound on the LQ forward error is less than etol * ‖xᴸ‖
* the upper bound on the CG forward error is less than utol * ‖xᶜ‖
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
-
#### References
* R. Estrin, D. Orban and M. A. Saunders, [*Euclidean-norm error bounds for SYMMLQ and CG*](https://doi.org/10.1137/16M1094816), SIAM Journal on Matrix Analysis and Applications, 40(1), pp. 235--253, 2019.
@@ -153,16 +159,19 @@ See [`LslqSolver`](@ref) for more details about the `solver`.
function lslq! end
function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T),
- atol :: T=√eps(T), btol :: T=√eps(T), etol :: T=√eps(T),
- utol :: T=√eps(T), itmax :: Int=0, σ :: T=zero(T),
- transfer_to_lsqr :: Bool=false, conlim :: T=1/√eps(T),
+ M=I, N=I, ldiv :: Bool=false,
+ transfer_to_lsqr :: Bool=false,
+ sqd :: Bool=false, λ :: T=zero(T),
+ σ :: T=zero(T), etol :: T=√eps(T),
+ utol :: T=√eps(T), btol :: T=√eps(T),
+ conlim :: T=1/√eps(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback=solver->false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("LSLQ: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "LSLQ: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -174,15 +183,15 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u, S, m)
allocate_if(!NisI, solver, :v, S, n)
- x, Nv, Aᵀu, w̄ = solver.x, solver.Nv, solver.Aᵀu, solver.w̄
+ x, Nv, Aᴴu, w̄ = solver.x, solver.Nv, solver.Aᴴu, solver.w̄
Mu, Av, err_vec, stats = solver.Mu, solver.Av, solver.err_vec, solver.stats
rNorms, ArNorms, err_lbnds = stats.residuals, stats.Aresiduals, stats.err_lbnds
err_ubnds_lq, err_ubnds_cg = stats.err_ubnds_lq, stats.err_ubnds_cg
@@ -213,12 +222,12 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kscal!(m, one(FC)/β₁, u)
MisI || @kscal!(m, one(FC)/β₁, Mu)
- mul!(Aᵀu, Aᵀ, u)
- Nv .= Aᵀu
+ mul!(Aᴴu, Aᴴ, u)
+ Nv .= Aᴴu
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv)) # = α₁
- # Aᵀb = 0 so x = 0 is a minimum least-squares solution
+ # Aᴴb = 0 so x = 0 is a minimum least-squares solution
if α == 0
stats.niter = 0
stats.solved, stats.inconsistent = true, false
@@ -274,11 +283,12 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = m + n)
- (verbose > 0) && @printf("%5s %7s %7s %7s %7s %8s %8s %7s %7s %7s\n", "k", "‖r‖", "‖Aᵀr‖", "β", "α", "cos", "sin", "‖A‖²", "κ(A)", "‖xL‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm², Acond, xlqNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %7s %8s %8s %7s %7s %7s\n", "k", "‖r‖", "‖Aᴴr‖", "β", "α", "cos", "sin", "‖A‖²", "κ(A)", "‖xL‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm², Acond, xlqNorm)
status = "unknown"
- solved = solved_mach = solved_lim = (rNorm ≤ atol)
+ ε = atol + rtol * β₁
+ solved = solved_mach = solved_lim = (rNorm ≤ ε)
tired = iter ≥ itmax
ill_cond = ill_cond_mach = ill_cond_lim = false
zero_resid = zero_resid_mach = zero_resid_lim = false
@@ -298,9 +308,9 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kscal!(m, one(FC)/β, u)
MisI || @kscal!(m, one(FC)/β, Mu)
- # 2. αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -β, Nv)
+ # 2. αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -β, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
if α ≠ 0
@@ -388,11 +398,11 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
end
end
- test1 = rNorm / β₁
+ test1 = rNorm
test2 = ArNorm / (Anorm * rNorm)
test3 = 1 / Acond
- t1 = test1 / (one(T) + Anorm * xlqNorm / β₁)
- rtol = btol + atol * Anorm * xlqNorm / β₁
+ t1 = test1 / (one(T) + Anorm * xlqNorm)
+ tol = btol + atol * Anorm * xlqNorm / β₁
# update LSLQ point for next iteration
@kaxpy!(n, c * ζ, w̄, x)
@@ -407,7 +417,7 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
# check stopping condition based on forward error lower bound
err_vec[mod(iter, window) + 1] = ζ
if iter ≥ window
- err_lbnd = norm(err_vec)
+ err_lbnd = @knrm2(window, err_vec)
history && push!(err_lbnds, err_lbnd)
fwd_err_lbnd = err_lbnd ≤ etol * xlqNorm
end
@@ -432,16 +442,16 @@ function lslq!(solver :: LslqSolver{T,FC,S}, A, b :: AbstractVector{FC};
tired = iter ≥ itmax
ill_cond_lim = (test3 ≤ ctol)
solved_lim = (test2 ≤ atol)
- zero_resid_lim = (test1 ≤ rtol)
+ zero_resid_lim = (test1 ≤ ε)
ill_cond = ill_cond_mach || ill_cond_lim
zero_resid = zero_resid_mach || zero_resid_lim
solved = solved_mach || solved_lim || zero_resid || fwd_err_lbnd || fwd_err_ubnd
iter = iter + 1
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm, Acond, xlqNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm, Acond, xlqNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
if transfer_to_lsqr # compute LSQR point
@kaxpy!(n, ζ̄ , w̄, x)
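A sketch of the reworked `lslq` interface on a made-up overdetermined system, including the new `rtol` keyword:

    using Krylov
    A = [1.0 0.0; 0.0 2.0; 1.0 1.0]
    b = [1.0, 1.0, 1.0]
    x, stats = lslq(A, b; transfer_to_lsqr=true, window=5, rtol=1e-10)
    stats.err_lbnds                   # lower bounds accumulated over `window` iterations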
diff --git a/src/lsmr.jl b/src/lsmr.jl
index f4d8349d1..781d9448a 100644
--- a/src/lsmr.jl
+++ b/src/lsmr.jl
@@ -5,7 +5,7 @@
#
# equivalently, of the normal equations
#
-# AᵀAx = Aᵀb.
+# AᴴAx = Aᴴb.
#
# LSMR is formally equivalent to applying MINRES to the normal equations
# but should be more stable. It is also formally equivalent to CRLS though
@@ -24,17 +24,16 @@
export lsmr, lsmr!
-
"""
(x, stats) = lsmr(A, b::AbstractVector{FC};
- M=I, N=I, sqd::Bool=false, λ::T=zero(T),
+ M=I, N=I, ldiv::Bool=false,
+ window::Int=5, sqd::Bool=false, λ::T=zero(T),
+ radius::T=zero(T), etol::T=√eps(T),
axtol::T=√eps(T), btol::T=√eps(T),
- atol::T=zero(T), rtol::T=zero(T),
- etol::T=√eps(T), window::Int=5,
- itmax::Int=0, conlim::T=1/√eps(T),
- radius::T=zero(T), verbose::Int=0,
- history::Bool=false, ldiv::Bool=false,
- callback=solver->false)
+ conlim::T=1/√eps(T), atol::T=zero(T),
+ rtol::T=zero(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -43,24 +42,24 @@ Solve the regularized linear least-squares problem
minimize ‖b - Ax‖₂² + λ²‖x‖₂²
-using the LSMR method, where λ ≥ 0 is a regularization parameter.
+of size m × n using the LSMR method, where λ ≥ 0 is a regularization parameter.
LSMR is formally equivalent to applying MINRES to the normal equations
- (AᵀA + λ²I) x = Aᵀb
+ (AᴴA + λ²I) x = Aᴴb
(and therefore to CRLS) but is more stable.
-LSMR produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᵀr‖₂.
+LSMR produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᴴr‖₂.
It is formally equivalent to CRLS, though can be substantially more accurate.
LSMR can be also used to find a null vector of a singular matrix A
-by solving the problem `min ‖Aᵀx - b‖` with any nonzero vector `b`.
-At a minimizer, the residual vector `r = b - Aᵀx` will satisfy `Ar = 0`.
+by solving the problem `min ‖Aᴴx - b‖` with any nonzero vector `b`.
+At a minimizer, the residual vector `r = b - Aᴴx` will satisfy `Ar = 0`.
If `λ > 0`, we solve the symmetric and quasi-definite system
[ E A ] [ r ] [ b ]
- [ Aᵀ -λ²F ] [ x ] = [ 0 ],
+ [ Aᴴ -λ²F ] [ x ] = [ 0 ],
where E and F are symmetric and positive definite.
Preconditioners M = E⁻¹ ≻ 0 and N = F⁻¹ ≻ 0 may be provided in the form of linear operators.
@@ -70,23 +69,51 @@ The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹ + λ²‖x‖²_F.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-LSMR is then equivalent to applying MINRES to `(AᵀE⁻¹A + λ²F)x = AᵀE⁻¹b` with `r = E⁻¹(b - Ax)`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+LSMR is then equivalent to applying MINRES to `(AᴴE⁻¹A + λ²F)x = AᴴE⁻¹b` with `r = E⁻¹(b - Ax)`.
If `λ = 0`, we solve the symmetric and indefinite system
[ E A ] [ r ] [ b ]
- [ Aᵀ 0 ] [ x ] = [ 0 ].
+ [ Aᴴ 0 ] [ x ] = [ 0 ].
The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹.
-In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᵀr` should be measured.
+In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᴴr` should be measured.
`r` can be recovered by computing `E⁻¹(b - Ax)`.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `window`: number of iterations used to accumulate a lower bound on the error;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `etol`: stopping tolerance based on the lower bound on the error;
+* `axtol`: tolerance on the backward error;
+* `btol`: stopping tolerance used to detect zero-residual problems;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`LsmrStats`](@ref) structure.
#### Reference
@@ -110,16 +137,18 @@ See [`LsmrSolver`](@ref) for more details about the `solver`.
function lsmr! end
function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T),
+ M=I, N=I, ldiv :: Bool=false,
+ sqd :: Bool=false, λ :: T=zero(T),
+ radius :: T=zero(T), etol :: T=√eps(T),
axtol :: T=√eps(T), btol :: T=√eps(T),
- atol :: T=zero(T), rtol :: T=zero(T),
- etol :: T=√eps(T), itmax :: Int=0, conlim :: T=1/√eps(T),
- radius :: T=zero(T), verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ conlim :: T=1/√eps(T), atol :: T=zero(T),
+ rtol :: T=zero(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("LSMR: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "LSMR: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -131,15 +160,15 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u, S, m)
allocate_if(!NisI, solver, :v, S, n)
- x, Nv, Aᵀu, h, hbar = solver.x, solver.Nv, solver.Aᵀu, solver.h, solver.hbar
+ x, Nv, Aᴴu, h, hbar = solver.x, solver.Nv, solver.Aᴴu, solver.h, solver.hbar
Mu, Av, err_vec, stats = solver.Mu, solver.Av, solver.err_vec, solver.stats
rNorms, ArNorms = stats.residuals, stats.Aresiduals
reset!(stats)
@@ -166,8 +195,8 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kscal!(m, one(FC)/β₁, u)
MisI || @kscal!(m, one(FC)/β₁, Mu)
- mul!(Aᵀu, Aᵀ, u)
- Nv .= Aᵀu
+ mul!(Aᴴu, Aᴴ, u)
+ Nv .= Aᴴu
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
@@ -210,10 +239,10 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = m + n)
- (verbose > 0) && @printf("%5s %7s %7s %7s %7s %8s %8s %7s\n", "k", "‖r‖", "‖Aᵀr‖", "β", "α", "cos", "sin", "‖A‖²")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, β₁, α, β₁, α, 0, 1, Anorm²)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %7s %8s %8s %7s\n", "k", "‖r‖", "‖Aᴴr‖", "β", "α", "cos", "sin", "‖A‖²")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, β₁, α, β₁, α, 0, 1, Anorm²)
- # Aᵀb = 0 so x = 0 is a minimum least-squares solution
+ # Aᴴb = 0 so x = 0 is a minimum least-squares solution
if α == 0
stats.niter = 0
stats.solved, stats.inconsistent = true, false
@@ -248,9 +277,9 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kscal!(m, one(FC)/β, u)
MisI || @kscal!(m, one(FC)/β, Mu)
- # 2. αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -β, Nv)
+ # 2. αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -β, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
if α ≠ 0
@@ -287,7 +316,7 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# the step ϕ/ρ is not necessarily positive
σ = ζ / (ρ * ρbar)
if radius > 0
- t1, t2 = to_boundary(x, hbar, radius)
+ t1, t2 = to_boundary(n, x, hbar, radius)
tmax, tmin = max(t1, t2), min(t1, t2)
on_boundary = σ > tmax || σ < tmin
σ = σ > 0 ? min(σ, tmax) : max(σ, tmin)
@@ -336,7 +365,7 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
t1 = test1 / (one(T) + Anorm * xNorm / β₁)
rNormtol = btol + axtol * Anorm * xNorm / β₁
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm²)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e\n", iter, rNorm, ArNorm, β, α, c, s, Anorm²)
# Stopping conditions that do not depend on user input.
# This is to guard against tolerances that are unreasonably small.
@@ -357,7 +386,7 @@ function lsmr!(solver :: LsmrSolver{T,FC,S}, A, b :: AbstractVector{FC};
zero_resid = zero_resid_mach | zero_resid_lim
solved = solved_mach | solved_lim | solved_opt | zero_resid | fwd_err | on_boundary
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
ill_cond_mach && (status = "condition number seems too large for this machine")
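The trust-region branch above now calls `to_boundary(n, x, hbar, radius)`; a hedged sketch with random data exercises it:

    using Krylov, LinearAlgebra
    A = rand(10, 4); b = rand(10)
    x, stats = lsmr(A, b; radius=0.1)  # constrain the step to ‖x‖ ≤ 0.1
    norm(x)                            # stays within the radius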
diff --git a/src/lsqr.jl b/src/lsqr.jl
index dd3779dce..0351b75e1 100644
--- a/src/lsqr.jl
+++ b/src/lsqr.jl
@@ -5,7 +5,7 @@
#
# equivalently, of the normal equations
#
-# AᵀAx = Aᵀb.
+# AᴴAx = Aᴴb.
#
# LSQR is formally equivalent to applying the conjugate gradient method
# to the normal equations but should be more stable. It is also formally
@@ -24,16 +24,16 @@
export lsqr, lsqr!
-
"""
(x, stats) = lsqr(A, b::AbstractVector{FC};
- M=I, N=I, sqd::Bool=false, λ::T=zero(T),
+ M=I, N=I, ldiv::Bool=false,
+ window::Int=5, sqd::Bool=false, λ::T=zero(T),
+ radius::T=zero(T), etol::T=√eps(T),
axtol::T=√eps(T), btol::T=√eps(T),
- atol::T=zero(T), rtol::T=zero(T),
- etol::T=√eps(T), window::Int=5,
- itmax::Int=0, conlim::T=1/√eps(T),
- radius::T=zero(T), verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ conlim::T=1/√eps(T), atol::T=zero(T),
+ rtol::T=zero(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
@@ -42,20 +42,20 @@ Solve the regularized linear least-squares problem
minimize ‖b - Ax‖₂² + λ²‖x‖₂²
-using the LSQR method, where λ ≥ 0 is a regularization parameter.
+of size m × n using the LSQR method, where λ ≥ 0 is a regularization parameter.
LSQR is formally equivalent to applying CG to the normal equations
- (AᵀA + λ²I) x = Aᵀb
+ (AᴴA + λ²I) x = Aᴴb
(and therefore to CGLS) but is more stable.
-LSQR produces monotonic residuals ‖r‖₂ but not optimality residuals ‖Aᵀr‖₂.
+LSQR produces monotonic residuals ‖r‖₂ but not optimality residuals ‖Aᴴr‖₂.
It is formally equivalent to CGLS, though can be slightly more accurate.
If `λ > 0`, LSQR solves the symmetric and quasi-definite system
[ E A ] [ r ] [ b ]
- [ Aᵀ -λ²F ] [ x ] = [ 0 ],
+ [ Aᴴ -λ²F ] [ x ] = [ 0 ],
where E and F are symmetric and positive definite.
Preconditioners M = E⁻¹ ≻ 0 and N = F⁻¹ ≻ 0 may be provided in the form of linear operators.
@@ -65,23 +65,51 @@ The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹ + λ²‖x‖²_F.
-For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᵀKx`.
-LSQR is then equivalent to applying CG to `(AᵀE⁻¹A + λ²F)x = AᵀE⁻¹b` with `r = E⁻¹(b - Ax)`.
+For a symmetric and positive definite matrix `K`, the K-norm of a vector `x` is `‖x‖²_K = xᴴKx`.
+LSQR is then equivalent to applying CG to `(AᴴE⁻¹A + λ²F)x = AᴴE⁻¹b` with `r = E⁻¹(b - Ax)`.
If `λ = 0`, we solve the symmetric and indefinite system
[ E A ] [ r ] [ b ]
- [ Aᵀ 0 ] [ x ] = [ 0 ].
+ [ Aᴴ 0 ] [ x ] = [ 0 ].
The system above represents the optimality conditions of
minimize ‖b - Ax‖²_E⁻¹.
-In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᵀr` should be measured.
+In this case, `N` can still be specified and indicates the weighted norm in which `x` and `Aᴴr` should be measured.
`r` can be recovered by computing `E⁻¹(b - Ax)`.
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m.
+
+#### Keyword arguments
+
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the augmented system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the augmented system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `window`: number of iterations used to accumulate a lower bound on the error;
+* `sqd`: if `true`, set `λ=1` for Hermitian quasi-definite systems;
+* `λ`: regularization parameter;
+* `radius`: add the trust-region constraint ‖x‖ ≤ `radius` if `radius > 0`. Useful to compute a step in a trust-region method for optimization;
+* `etol`: stopping tolerance based on the lower bound on the error;
+* `axtol`: tolerance on the backward error;
+* `btol`: stopping tolerance used to detect zero-residual problems;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -105,16 +133,18 @@ See [`LsqrSolver`](@ref) for more details about the `solver`.
function lsqr! end
function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, N=I, sqd :: Bool=false, λ :: T=zero(T),
+ M=I, N=I, ldiv :: Bool=false,
+ sqd :: Bool=false, λ :: T=zero(T),
+ radius :: T=zero(T), etol :: T=√eps(T),
axtol :: T=√eps(T), btol :: T=√eps(T),
- atol :: T=zero(T), rtol :: T=zero(T),
- etol :: T=√eps(T), itmax :: Int=0, conlim :: T=1/√eps(T),
- radius :: T=zero(T), verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ conlim :: T=1/√eps(T), atol :: T=zero(T),
+ rtol :: T=zero(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("LSQR: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "LSQR: system of %d equations in %d variables\n", m, n)
# Check sqd and λ parameters
sqd && (λ ≠ 0) && error("sqd cannot be set to true if λ ≠ 0 !")
@@ -126,15 +156,15 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :u, S, m)
allocate_if(!NisI, solver, :v, S, n)
- x, Nv, Aᵀu, w = solver.x, solver.Nv, solver.Aᵀu, solver.w
+ x, Nv, Aᴴu, w = solver.x, solver.Nv, solver.Aᴴu, solver.w
Mu, Av, err_vec, stats = solver.Mu, solver.Av, solver.err_vec, solver.stats
rNorms, ArNorms = stats.residuals, stats.Aresiduals
reset!(stats)
@@ -162,8 +192,8 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
@kscal!(m, one(FC)/β₁, u)
MisI || @kscal!(m, one(FC)/β₁, Mu)
- mul!(Aᵀu, Aᵀ, u)
- Nv .= Aᵀu
+ mul!(Aᴴu, Aᴴ, u)
+ Nv .= Aᴴu
NisI || mulorldiv!(v, N, Nv, ldiv)
Anorm² = @kdotr(n, v, Nv)
Anorm = sqrt(Anorm²)
@@ -184,8 +214,8 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = m + n)
- (verbose > 0) && @printf("%5s %7s %7s %7s %7s %7s %7s %7s %7s\n", "k", "α", "β", "‖r‖", "‖Aᵀr‖", "compat", "backwrd", "‖A‖", "κ(A)")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e\n", iter, β₁, α, β₁, α, 0, 1, Anorm, Acond)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %7s %7s %7s %7s %7s\n", "k", "α", "β", "‖r‖", "‖Aᴴr‖", "compat", "backwrd", "‖A‖", "κ(A)")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e\n", iter, β₁, α, β₁, α, 0, 1, Anorm, Acond)
rNorm = β₁
r1Norm = rNorm
@@ -194,7 +224,7 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
history && push!(rNorms, r2Norm)
ArNorm = ArNorm0 = α * β
history && push!(ArNorms, ArNorm)
- # Aᵀb = 0 so x = 0 is a minimum least-squares solution
+ # Aᴴb = 0 so x = 0 is a minimum least-squares solution
if α == 0
stats.niter = 0
stats.solved, stats.inconsistent = true, false
@@ -237,9 +267,9 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
Anorm² = Anorm² + α * α + β * β # = ‖B_{k-1}‖²
λ > 0 && (Anorm² += λ²)
- # 2. αₖ₊₁Nvₖ₊₁ = Aᵀuₖ₊₁ - βₖ₊₁Nvₖ
- mul!(Aᵀu, Aᵀ, u)
- @kaxpby!(n, one(FC), Aᵀu, -β, Nv)
+ # 2. αₖ₊₁Nvₖ₊₁ = Aᴴuₖ₊₁ - βₖ₊₁Nvₖ
+ mul!(Aᴴu, Aᴴ, u)
+ @kaxpby!(n, one(FC), Aᴴu, -β, Nv)
NisI || mulorldiv!(v, N, Nv, ldiv)
α = sqrt(@kdotr(n, v, Nv))
if α ≠ 0
@@ -272,7 +302,7 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
xENorm² = xENorm² + ϕ * ϕ
err_vec[mod(iter, window) + 1] = ϕ
- iter ≥ window && (err_lbnd = norm(err_vec))
+ iter ≥ window && (err_lbnd = @knrm2(window, err_vec))
τ = s * ϕ
θ = s * α
@@ -283,7 +313,7 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
# the step ϕ/ρ is not necessarily positive
σ = ϕ / ρ
if radius > 0
- t1, t2 = to_boundary(x, w, radius)
+ t1, t2 = to_boundary(n, x, w, radius)
tmax, tmin = max(t1, t2), min(t1, t2)
on_boundary = σ > tmax || σ < tmin
σ = σ > 0 ? min(σ, tmax) : max(σ, tmin)
@@ -325,7 +355,7 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
t1 = test1 / (one(T) + Anorm * xNorm / β₁)
rNormtol = btol + axtol * Anorm * xNorm / β₁
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e\n", iter, α, β, rNorm, ArNorm, test1, test2, Anorm, Acond)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e %7.1e\n", iter, α, β, rNorm, ArNorm, test1, test2, Anorm, Acond)
# Stopping conditions that do not depend on user input.
# This is to guard against tolerances that are unreasonably small.
@@ -346,7 +376,7 @@ function lsqr!(solver :: LsqrSolver{T,FC,S}, A, b :: AbstractVector{FC};
zero_resid = zero_resid_mach | zero_resid_lim
solved = solved_mach | solved_lim | solved_opt | zero_resid | fwd_err | on_boundary
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
ill_cond_mach && (status = "condition number seems too large for this machine")
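For completeness, a sketch of the regularized `lsqr` variants (the data is made up):

    using Krylov
    A = rand(8, 8); b = rand(8)
    x, stats = lsqr(A, b; λ=1.0e-2)    # explicit regularization parameter
    x, stats = lsqr(A, b; sqd=true)    # Hermitian quasi-definite case, sets λ = 1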
diff --git a/src/minres.jl b/src/minres.jl
index cbaefee9f..f82bbc350 100644
--- a/src/minres.jl
+++ b/src/minres.jl
@@ -3,7 +3,7 @@
#
# minimize ‖Ax - b‖₂
#
-# where A is square and symmetric.
+# where A is Hermitian.
#
# MINRES is formally equivalent to applying the conjugate residuals method
# to Ax = b when A is positive definite, but is more general and also applies
@@ -21,20 +21,22 @@
export minres, minres!
-
"""
(x, stats) = minres(A, b::AbstractVector{FC};
- M=I, λ::T=zero(T), atol::T=√eps(T)/100,
- rtol::T=√eps(T)/100, ratol :: T=zero(T),
- rrtol :: T=zero(T), etol::T=√eps(T),
- window::Int=5, itmax::Int=0,
- conlim::T=1/√eps(T), verbose::Int=0,
- history::Bool=false, ldiv::Bool=false,
- callback=solver->false)
+ M=I, ldiv::Bool=false, window::Int=5,
+ λ::T=zero(T), atol::T=√eps(T),
+ rtol::T=√eps(T), etol::T=√eps(T),
+ conlim::T=1/√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
+ (x, stats) = minres(A, b, x0::AbstractVector; kwargs...)
+
+MINRES can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
Solve the shifted linear least-squares problem
minimize ‖b - (A + λI)x‖₂²
@@ -43,26 +45,44 @@ or the shifted linear system
(A + λI) x = b
-using the MINRES method, where λ ≥ 0 is a shift parameter,
-where A is square and symmetric.
+of size n using the MINRES method, where λ ≥ 0 is a shift parameter,
+where A is Hermitian.
MINRES is formally equivalent to applying CR to Ax=b when A is positive
definite, but is typically more stable and also applies to the case where
A is indefinite.
-MINRES produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᵀr‖₂.
+MINRES produces monotonic residuals ‖r‖₂ and optimality residuals ‖Aᴴr‖₂.
+
+#### Input arguments
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be symmetric and positive definite.
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n.
-MINRES can be warm-started from an initial guess `x0` with the method
+#### Optional argument
- (x, stats) = minres(A, b, x0; kwargs...)
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-where `kwargs` are the same keyword arguments as above.
+#### Keyword arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `window`: number of iterations used to accumulate a lower bound on the error;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `etol`: stopping tolerance based on the lower bound on the error;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms, or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
+
+#### Output arguments
+
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -99,22 +119,24 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0
end
function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), atol :: T=√eps(T)/100, rtol :: T=√eps(T)/100,
- ratol :: T=zero(T), rrtol :: T=zero(T), etol :: T=√eps(T),
- itmax :: Int=0, conlim :: T=1/√eps(T), verbose :: Int=0,
- history :: Bool=false, ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
-
- n, m = size(A)
+ M=I, ldiv :: Bool=false,
+ λ :: T=zero(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), etol :: T=√eps(T),
+ conlim :: T=1/√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+
+ m, n = size(A)
m == n || error("System must be square")
length(b) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("MINRES: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "MINRES: system of size %d\n", n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :v, S, n)
@@ -189,16 +211,15 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = 2*n)
- (verbose > 0) && @printf("%5s %7s %7s %7s %8s %8s %7s %7s %7s %7s\n", "k", "‖r‖", "‖Aᵀr‖", "β", "cos", "sin", "‖A‖", "κ(A)", "test1", "test2")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, cs, sn, ANorm, Acond)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %8s %8s %7s %7s %7s %7s\n", "k", "‖r‖", "‖Aᴴr‖", "β", "cos", "sin", "‖A‖", "κ(A)", "test1", "test2")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, cs, sn, ANorm, Acond)
- tol = atol + rtol * β₁
- rNormtol = ratol + rrtol * β₁
+ ε = atol + rtol * β₁
stats.status = "unknown"
solved = solved_mach = solved_lim = (rNorm ≤ rtol)
tired = iter ≥ itmax
ill_cond = ill_cond_mach = ill_cond_lim = false
- zero_resid = zero_resid_mach = zero_resid_lim = (rNorm ≤ tol)
+ zero_resid = zero_resid_mach = zero_resid_lim = (rNorm ≤ ε)
fwd_err = false
user_requested_exit = false
@@ -241,7 +262,7 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
ϵ = sn * β
δbar = -cs * β
root = sqrt(γbar * γbar + δbar * δbar)
- ArNorm = ϕbar * root # = ‖Aᵀrₖ₋₁‖
+ ArNorm = ϕbar * root # = ‖Aᴴrₖ₋₁‖
history && push!(ArNorms, ArNorm)
# Compute the next plane rotation.
@@ -266,7 +287,7 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Compute lower bound on forward error.
err_vec[mod(iter, window) + 1] = ϕ
- iter ≥ window && (err_lbnd = norm(err_vec))
+ iter ≥ window && (err_lbnd = @knrm2(window, err_vec))
γmax = max(γmax, γ)
γmin = min(γmin, γ)
@@ -292,11 +313,11 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
Acond = γmax / γmin
history && push!(Aconds, Acond)
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, cs, sn, ANorm, Acond, test1, test2)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e %7.1e\n", iter, rNorm, ArNorm, β, cs, sn, ANorm, Acond, test1, test2)
if iter == 1 && β / β₁ ≤ 10 * ϵM
- # Aᵀb = 0 so x = 0 is a minimum least-squares solution
- stats.niter = 0
+ # Aᴴb = 0 so x = 0 is a minimum least-squares solution
+ stats.niter = 1
stats.solved, stats.inconsistent = true, true
stats.status = "x is a minimum least-squares solution"
solver.warm_start = false
@@ -314,18 +335,18 @@ function minres!(solver :: MinresSolver{T,FC,S}, A, b :: AbstractVector{FC};
# Stopping conditions based on user-provided tolerances.
tired = iter ≥ itmax
ill_cond_lim = (one(T) / Acond ≤ ctol)
- solved_lim = (test2 ≤ tol)
- zero_resid_lim = (test1 ≤ tol)
- resid_decrease_lim = (rNorm ≤ rNormtol)
+ solved_lim = (test2 ≤ ε)
+ zero_resid_lim = MisI && (test1 ≤ eps(T))
+ resid_decrease_lim = (rNorm ≤ ε)
iter ≥ window && (fwd_err = err_lbnd ≤ etol * sqrt(xENorm²))
user_requested_exit = callback(solver) :: Bool
- zero_resid = zero_resid_mach | zero_resid_lim
- resid_decrease = resid_decrease_mach | resid_decrease_lim
- ill_cond = ill_cond_mach | ill_cond_lim
- solved = solved_mach | solved_lim | zero_resid | fwd_err | resid_decrease
+ zero_resid = zero_resid_mach || zero_resid_lim
+ resid_decrease = resid_decrease_mach || resid_decrease_lim
+ ill_cond = ill_cond_mach || ill_cond_lim
+ solved = solved_mach || solved_lim || zero_resid || fwd_err || resid_decrease
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
ill_cond_mach && (status = "condition number seems too large for this machine")
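A sketch of the documented warm start for `minres` on a small Hermitian system (matrix and guess are illustrative):

    using Krylov, LinearAlgebra
    n = 50
    A = SymTridiagonal(2 * ones(n), -ones(n - 1))   # Hermitian positive definite
    b = ones(n)
    x0 = ones(n)                                     # initial guess
    x, stats = minres(A, b, x0; rtol=1e-8, itmax=2n)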
diff --git a/src/minres_qlp.jl b/src/minres_qlp.jl
index bbfbf856b..72662f97e 100644
--- a/src/minres_qlp.jl
+++ b/src/minres_qlp.jl
@@ -18,30 +18,52 @@ export minres_qlp, minres_qlp!
"""
(x, stats) = minres_qlp(A, b::AbstractVector{FC};
- M=I, atol::T=√eps(T), rtol::T=√eps(T),
- ctol::T=√eps(T), λ::T=zero(T), itmax::Int=0,
+ M=I, ldiv::Bool=false, Artol::T=√eps(T),
+ λ::T=zero(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
+ (x, stats) = minres_qlp(A, b, x0::AbstractVector; kwargs...)
+
+MINRES-QLP can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
MINRES-QLP is the only method based on the Lanczos process that returns the minimum-norm
-solution on singular inconsistent systems (A + λI)x = b, where λ is a shift parameter.
+solution on singular inconsistent systems (A + λI)x = b of size n, where λ is a shift parameter.
It is significantly more complex but can be more reliable than MINRES when A is ill-conditioned.
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be symmetric and positive definite.
M also indicates the weighted norm in which residuals are measured.
-MINRES-QLP can be warm-started from an initial guess `x0` with the method
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
+
+#### Keyword arguments
- (x, stats) = minres_qlp(A, b, x0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `Artol`: relative stopping tolerance based on the Aᴴ-residual norm;
+* `λ`: regularization parameter;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -80,22 +102,23 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
end
function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- ctol :: T=√eps(T), λ ::T=zero(T), itmax :: Int=0,
+ M=I, ldiv :: Bool=false, Artol :: T=√eps(T),
+ λ ::T=zero(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("MINRES-QLP: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "MINRES-QLP: system of size %d\n", n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :vₖ, S, n)
@@ -147,8 +170,8 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
ε = atol + rtol * rNorm
κ = zero(T)
- (verbose > 0) && @printf("%5s %7s %7s %7s %7s %8s %7s %8s %7s\n", "k", "‖rₖ‖", "‖Arₖ₋₁‖", "βₖ₊₁", "Rₖ.ₖ", "Lₖ.ₖ", "‖A‖", "κ(A)", "backward")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7s %7.1e %7s %8s %7.1e %7.1e %8s\n", iter, rNorm, "✗ ✗ ✗ ✗", βₖ, "✗ ✗ ✗ ✗", " ✗ ✗ ✗ ✗", ANorm, Acond, " ✗ ✗ ✗ ✗")
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s %7s %8s %7s %8s %7s\n", "k", "‖rₖ‖", "‖Arₖ₋₁‖", "βₖ₊₁", "Rₖ.ₖ", "Lₖ.ₖ", "‖A‖", "κ(A)", "backward")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7s %7.1e %7s %8s %7.1e %7.1e %8s\n", iter, rNorm, "✗ ✗ ✗ ✗", βₖ, "✗ ✗ ✗ ✗", " ✗ ✗ ✗ ✗", ANorm, Acond, " ✗ ✗ ✗ ✗")
# Set up workspace.
M⁻¹vₖ₋₁ .= zero(FC)
@@ -246,7 +269,7 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
# [sₖ -cₖ] [βₖ₊₁ ] [0 ]
(cₖ, sₖ, λₖ) = sym_givens(λbarₖ, βₖ₊₁)
- # Compute [ zₖ ] = (Qₖ)ᵀβ₁e₁
+ # Compute [ zₖ ] = (Qₖ)ᴴβ₁e₁
# [ζbarₖ₊₁]
#
# [cₖ sₖ] [ζbarₖ] = [ ζₖ ]
@@ -312,7 +335,7 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
τₖ = (ξₖ - ψbarₖ₋₁ * τₖ₋₁) / μbarₖ
end
- # Compute directions wₖ₋₂, ẘₖ₋₁ and w̄ₖ, last columns of Wₖ = Vₖ(Pₖ)ᵀ
+ # Compute directions wₖ₋₂, ẘₖ₋₁ and w̄ₖ, last columns of Wₖ = Vₖ(Pₖ)ᴴ
if iter == 1
# w̅₁ = v₁
@. wₖ = vₖ
@@ -352,7 +375,7 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
# Update ‖Arₖ₋₁‖ estimate
# ‖ Arₖ₋₁ ‖ = |ζbarₖ| * √(|λbarₖ|² + |γbarₖ|²)
ArNorm = abs(ζbarₖ) * √(abs2(λbarₖ) + abs2(cₖ₋₁ * βₖ₊₁))
- iter == 1 && (κ = atol + ctol * ArNorm)
+ iter == 1 && (κ = atol + Artol * ArNorm)
history && push!(ArNorms, ArNorm)
ANorm = sqrt(ANorm²)
@@ -383,14 +406,14 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
# Stopping conditions based on user-provided tolerances.
tired = iter ≥ itmax
resid_decrease_lim = (rNorm ≤ ε)
- zero_resid_lim = (backward ≤ ε)
+ zero_resid_lim = MisI && (backward ≤ eps(T))
breakdown = βₖ₊₁ ≤ btol
user_requested_exit = callback(solver) :: Bool
zero_resid = zero_resid_mach | zero_resid_lim
resid_decrease = resid_decrease_mach | resid_decrease_lim
solved = resid_decrease | zero_resid
- inconsistent = (ArNorm ≤ κ && abs(μbarₖ) ≤ ctol) || (breakdown && !solved)
+ inconsistent = (ArNorm ≤ κ && abs(μbarₖ) ≤ Artol) || (breakdown && !solved)
# Update variables
if iter ≥ 2
@@ -405,9 +428,9 @@ function minres_qlp!(solver :: MinresQlpSolver{T,FC,S}, A, b :: AbstractVector{F
μbarₖ₋₁ = μbarₖ
ζbarₖ = ζbarₖ₊₁
βₖ = βₖ₊₁
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e %7.1e %8.1e %7.1e %7.1e %8.1e\n", iter, rNorm, ArNorm, βₖ₊₁, λₖ, μbarₖ, ANorm, Acond, backward)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e %7.1e %8.1e %7.1e %7.1e %8.1e\n", iter, rNorm, ArNorm, βₖ₊₁, λₖ, μbarₖ, ANorm, Acond, backward)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Finalize the update of x
if iter ≥ 2
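
A usage sketch for the keywords documented above: a singular, inconsistent Hermitian system on which MINRES-QLP should return the minimum-norm least-squares solution (the 3×3 matrix and the `Artol` value are illustrative):

    using Krylov, LinearAlgebra

    A = diagm(0 => [1.0, 2.0, 0.0])      # singular Hermitian matrix
    b = [1.0, 1.0, 1.0]                  # b has a component outside range(A)

    x, stats = minres_qlp(A, b, Artol=1e-10)
    # Expected: x ≈ [1.0, 0.5, 0.0], the minimum-norm solution of min ‖Ax - b‖,
    # with stats.inconsistent set to true.
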
diff --git a/src/qmr.jl b/src/qmr.jl
index eb4a4eb46..e24fba79a 100644
--- a/src/qmr.jl
+++ b/src/qmr.jl
@@ -21,28 +21,48 @@
export qmr, qmr!
"""
- (x, stats) = qmr(A, b::AbstractVector{FC}; c::AbstractVector{FC}=b,
- atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ (x, stats) = qmr(A, b::AbstractVector{FC};
+ c::AbstractVector{FC}=b, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0, verbose::Int=0,
+ history::Bool=false, callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the square linear system Ax = b using the QMR method.
+ (x, stats) = qmr(A, b, x0::AbstractVector; kwargs...)
+
+QMR can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+Solve the square linear system Ax = b of size n using QMR.
QMR is based on the Lanczos biorthogonalization process and requires two initial vectors `b` and `c`.
-The relation `bᵀc ≠ 0` must be satisfied and by default `c = b`.
-When `A` is symmetric and `b = c`, QMR is equivalent to MINRES.
+The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
+When `A` is Hermitian and `b = c`, QMR is equivalent to MINRES.
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension n;
+* `b`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-QMR can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = qmr(A, b, x0; kwargs...)
+* `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -83,20 +103,20 @@ end
function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: AbstractVector{FC}=b,
atol :: T=√eps(T), rtol :: T=√eps(T),
itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
- n, m = size(A)
+ m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("QMR: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "QMR: system of size %d\n", n)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p = solver.uₖ₋₁, solver.uₖ, solver.q, solver.vₖ₋₁, solver.vₖ, solver.p
@@ -129,22 +149,22 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
itmax == 0 && (itmax = 2*n)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
# Initialize the Lanczos biorthogonalization process.
- cᵗb = @kdot(n, c, r₀) # ⟨c,r₀⟩
- if cᵗb == 0
+ cᴴb = @kdot(n, c, r₀) # ⟨c,r₀⟩
+ if cᴴb == 0
stats.niter = 0
stats.solved = false
stats.inconsistent = false
- stats.status = "Breakdown bᵀc = 0"
+ stats.status = "Breakdown bᴴc = 0"
solver.warm_start = false
return solver
end
- βₖ = √(abs(cᵗb)) # β₁γ₁ = cᵀ(b - Ax₀)
- γₖ = cᵗb / βₖ # β₁γ₁ = cᵀ(b - Ax₀)
+ βₖ = √(abs(cᴴb)) # β₁γ₁ = cᴴ(b - Ax₀)
+ γₖ = cᴴb / βₖ # β₁γ₁ = cᴴ(b - Ax₀)
vₖ₋₁ .= zero(FC) # v₀ = 0
uₖ₋₁ .= zero(FC) # u₀ = 0
vₖ .= r₀ ./ βₖ # v₁ = (b - Ax₀) / β₁
@@ -153,7 +173,7 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
sₖ₋₂ = sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the QR factorization of Tₖ₊₁.ₖ
wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Vₖ(Rₖ)⁻¹
wₖ₋₁ .= zero(FC) # Column k-1 of Wₖ = Vₖ(Rₖ)⁻¹
- ζbarₖ = βₖ # ζbarₖ is the last component of z̅ₖ = (Qₖ)ᵀβ₁e₁
+ ζbarₖ = βₖ # ζbarₖ is the last component of z̅ₖ = (Qₖ)ᴴβ₁e₁
τₖ = @kdotr(n, vₖ, vₖ) # τₖ is used for the residual norm estimate
# Stopping criterion.
@@ -169,10 +189,10 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
# Continue the Lanczos biorthogonalization process.
# AVₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀUₖ = Uₖ(Tₖ)ᵀ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , vₖ) # Forms vₖ₊₁ : q ← Avₖ
- mul!(p, Aᵀ, uₖ) # Forms uₖ₊₁ : p ← Aᵀuₖ
+ mul!(p, Aᴴ, uₖ) # Forms uₖ₊₁ : p ← Aᴴuₖ
@kaxpy!(n, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - β̄ₖ * uₖ₋₁
@@ -182,9 +202,9 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
@kaxpy!(n, - αₖ , vₖ, q) # q ← q - αₖ * vₖ
@kaxpy!(n, -conj(αₖ), uₖ, p) # p ← p - ᾱₖ * uₖ
- pᵗq = @kdot(n, p, q) # pᵗq = ⟨p,q⟩
- βₖ₊₁ = √(abs(pᵗq)) # βₖ₊₁ = √(|pᵗq|)
- γₖ₊₁ = pᵗq / βₖ₊₁ # γₖ₊₁ = pᵗq / βₖ₊₁
+ pᴴq = @kdot(n, p, q) # pᴴq = ⟨p,q⟩
+ βₖ₊₁ = √(abs(pᴴq)) # βₖ₊₁ = √(|pᴴq|)
+ γₖ₊₁ = pᴴq / βₖ₊₁ # γₖ₊₁ = pᴴq / βₖ₊₁
# Update the QR factorization of Tₖ₊₁.ₖ = Qₖ [ Rₖ ].
# [ Oᵀ ]
@@ -271,7 +291,7 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
@. vₖ₋₁ = vₖ # vₖ₋₁ ← vₖ
@. uₖ₋₁ = uₖ # uₖ₋₁ ← uₖ
- if pᵗq ≠ zero(FC)
+ if pᴴq ≠ zero(FC)
@. vₖ = q / βₖ₊₁ # βₖ₊₁vₖ₊₁ = q
@. uₖ = p / conj(γₖ₊₁) # γ̄ₖ₊₁uₖ₊₁ = p
end
@@ -303,10 +323,10 @@ function qmr!(solver :: QmrSolver{T,FC,S}, A, b :: AbstractVector{FC}; c :: Abst
resid_decrease_lim = rNorm ≤ ε
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- breakdown = !solved && (pᵗq == 0)
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm)
+ breakdown = !solved && (pᴴq == 0)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "Breakdown ⟨uₖ₊₁,vₖ₊₁⟩ = 0")
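
A sketch of the two call forms documented above, the default `c = b` and a warm start (the sparse test matrix is illustrative; the `10I` shift keeps it well conditioned):

    using Krylov, LinearAlgebra, SparseArrays

    n = 50
    A = sprandn(n, n, 0.2) + 10 * I      # square, nonsymmetric
    b = randn(n)

    x, stats = qmr(A, b)                 # default c = b, so bᴴc = ‖b‖² ≠ 0
    x, stats = qmr(A, b, x)              # warm start from the previous solution
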
diff --git a/src/symmlq.jl b/src/symmlq.jl
index 7b889c715..81477fc66 100644
--- a/src/symmlq.jl
+++ b/src/symmlq.jl
@@ -1,5 +1,5 @@
# An implementation of SYMMLQ for the solution of the
-# linear system Ax = b, where A is square and symmetric.
+# linear system Ax = b, where A is Hermitian.
#
# This implementation follows the original implementation by
# Michael Saunders described in
@@ -11,38 +11,62 @@
export symmlq, symmlq!
-
"""
- (x, stats) = symmlq(A, b::AbstractVector{FC}; window::Int=0,
- M=I, λ::T=zero(T), transfer_to_cg::Bool=true,
- λest::T=zero(T), atol::T=√eps(T), rtol::T=√eps(T),
- etol::T=√eps(T), itmax::Int=0, conlim::T=1/√eps(T),
+ (x, stats) = symmlq(A, b::AbstractVector{FC};
+ M=I, ldiv::Bool=false, window::Int=5,
+ transfer_to_cg::Bool=true, λ::T=zero(T),
+ λest::T=zero(T), etol::T=√eps(T),
+ conlim::T=1/√eps(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
+ (x, stats) = symmlq(A, b, x0::AbstractVector; kwargs...)
+
+SYMMLQ can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
Solve the shifted linear system
(A + λI) x = b
-using the SYMMLQ method, where λ is a shift parameter,
-and A is square and symmetric.
+of size n using the SYMMLQ method, where λ is a shift parameter, and A is Hermitian.
+
+SYMMLQ produces monotonic errors ‖x* - x‖₂.
+
+#### Input arguments
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n;
+* `b`: a vector of length n.
-SYMMLQ produces monotonic errors ‖x*-x‖₂.
+#### Optional argument
-A preconditioner M may be provided in the form of a linear operator and is
-assumed to be symmetric and positive definite.
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-SYMMLQ can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = symmlq(A, b, x0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning;
+* `ldiv`: define whether the preconditioner uses `ldiv!` or `mul!`;
+* `window`: number of iterations used to accumulate a lower bound on the error;
+* `transfer_to_cg`: transfer from the SYMMLQ point to the CG point, when it exists. The transfer is based on the residual norm;
+* `λ`: regularization parameter;
+* `λest`: positive strict lower bound on the smallest eigenvalue `λₘᵢₙ` when solving a positive-definite system, such as `λest = (1-10⁻⁷)λₘᵢₙ`;
+* `etol`: stopping tolerance based on the lower bound on the error;
+* `conlim`: limit on the estimated condition number of `A` beyond which the solution will be abandoned;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SymmlqStats`](@ref) structure.
#### Reference
@@ -79,23 +103,25 @@ function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, x0
end
function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
- M=I, λ :: T=zero(T), transfer_to_cg :: Bool=true,
- λest :: T=zero(T), atol :: T=√eps(T), rtol :: T=√eps(T),
- etol :: T=√eps(T), itmax :: Int=0, conlim :: T=1/√eps(T),
+ M=I, ldiv :: Bool=false,
+ transfer_to_cg :: Bool=true, λ :: T=zero(T),
+ λest :: T=zero(T), etol :: T=√eps(T),
+ conlim :: T=1/√eps(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
m == n || error("System must be square")
length(b) == m || error("Inconsistent problem size")
- (verbose > 0) && @printf("SYMMLQ: system of size %d\n", n)
+ (verbose > 0) && @printf(iostream, "SYMMLQ: system of size %d\n", n)
# Tests M = Iₙ
MisI = (M === I)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
# Set up workspace.
allocate_if(!MisI, solver, :v, S, n)
@@ -213,8 +239,8 @@ function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
iter = 0
itmax == 0 && (itmax = 2 * n)
- (verbose > 0) && @printf("%5s %7s %7s %8s %8s %7s %7s %7s\n", "k", "‖r‖", "β", "cos", "sin", "‖A‖", "κ(A)", "test1")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e\n", iter, rNorm, β, cold, sold, ANorm, Acond)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %8s %8s %7s %7s %7s\n", "k", "‖r‖", "β", "cos", "sin", "‖A‖", "κ(A)", "test1")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e\n", iter, rNorm, β, cold, sold, ANorm, Acond)
tol = atol + rtol * β₁
status = "unknown"
@@ -301,8 +327,11 @@ function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
zetabark = zlist[jx] / clist[jx]
if γbar ≠ 0
- theta = abs(sum(clist[i] * sprod[i] * zlist[i] for i = 1 : window))
- theta = zetabark * theta + abs(zetabark * ζbar * sprod[ix] * s) - zetabark^2
+ theta = zero(T)
+ for i = 1 : window
+ theta += clist[i] * sprod[i] * zlist[i]
+ end
+ theta = zetabark * abs(theta) + abs(zetabark * ζbar * sprod[ix] * s) - zetabark^2
history && (errorscg[iter-window+1] = sqrt(abs(errorscg[iter-window+1]^2 - 2*theta)))
else
history && (errorscg[iter-window+1] = missing)
@@ -345,7 +374,7 @@ function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
ANorm = sqrt(ANorm²)
test1 = rNorm / (ANorm * xNorm)
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, β, c, s, ANorm, Acond, test1)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %8.1e %8.1e %7.1e %7.1e %7.1e\n", iter, rNorm, β, c, s, ANorm, Acond, test1)
# Reset variables
ϵold = ϵ
@@ -372,7 +401,7 @@ function symmlq!(solver :: SymmlqSolver{T,FC,S}, A, b :: AbstractVector{FC};
ill_cond = ill_cond_mach || ill_cond_lim
solved = solved_mach || zero_resid || zero_resid_mach || zero_resid_lim || fwd_err || resid_decrease_mach
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute CG point
# (xᶜ)ₖ ← (xᴸ)ₖ₋₁ + ζbarₖ * w̅ₖ
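
A sketch of the `λest` and `window` keywords documented above on a positive-definite system (matrix and values illustrative):

    using Krylov, LinearAlgebra

    A = diagm(0 => 1.0:10.0)             # Hermitian positive definite, λₘᵢₙ = 1
    b = ones(10)

    # λest must be a strict lower bound on λₘᵢₙ, e.g. (1 - 10⁻⁷)λₘᵢₙ.
    x, stats = symmlq(A, b, λest=(1 - 1e-7), window=5, history=true)
    # stats is a SymmlqStats; with history=true it also records error estimates.
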
diff --git a/src/tricg.jl b/src/tricg.jl
index 5acff2d52..4096a9ffe 100644
--- a/src/tricg.jl
+++ b/src/tricg.jl
@@ -13,30 +13,32 @@ export tricg, tricg!
"""
(x, y, stats) = tricg(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- spd::Bool=false, snd::Bool=false, flip::Bool=false,
- τ::T=one(T), ν::T=-one(T), itmax::Int=0,
+ M=I, N=I, ldiv::Bool=false,
+ spd::Bool=false, snd::Bool=false,
+ flip::Bool=false, τ::T=one(T),
+ ν::T=-one(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-TriCG solves the symmetric linear system
+ (x, y, stats) = tricg(A, b, c, x0::AbstractVector, y0::AbstractVector; kwargs...)
+
+TriCG can be warm-started from initial guesses `x0` and `y0` where `kwargs` are the same keyword arguments as above.
+
+Given a matrix `A` of dimension m × n, TriCG solves the Hermitian linear system
[ τE A ] [ x ] = [ b ]
- [ Aᵀ νF ] [ y ] [ c ],
+ [ Aᴴ νF ] [ y ] [ c ],
-where τ and ν are real numbers, E = M⁻¹ ≻ 0 and F = N⁻¹ ≻ 0.
+of size (n+m) × (n+m) where τ and ν are real numbers, E = M⁻¹ ≻ 0 and F = N⁻¹ ≻ 0.
`b` and `c` must both be nonzero.
TriCG could break down if `τ = 0` or `ν = 0`.
It's recommended to use TriMR in these cases.
-By default, TriCG solves symmetric and quasi-definite linear systems with τ = 1 and ν = -1.
-If `flip = true`, TriCG solves another known variant of SQD systems where τ = -1 and ν = 1.
-If `spd = true`, τ = ν = 1 and the associated symmetric and positive definite linear system is solved.
-If `snd = true`, τ = ν = -1 and the associated symmetric and negative definite linear system is solved.
-`τ` and `ν` are also keyword arguments that can be directly modified for more specific problems.
+By default, TriCG solves Hermitian and quasi-definite linear systems with τ = 1 and ν = -1.
TriCG is based on the preconditioned orthogonal tridiagonalization process
and its relation with the preconditioned block-Lanczos process.
@@ -50,17 +52,39 @@ It's the Euclidean norm when `M` and `N` are identity operators.
TriCG stops when `itmax` iterations are reached or when `‖rₖ‖ ≤ atol + ‖r₀‖ * rtol`.
`atol` is an absolute tolerance and `rtol` is a relative tolerance.
-Additional details can be displayed if verbose mode is enabled (verbose > 0).
-Information will be displayed every `verbose` iterations.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
+
+#### Optional arguments
+
+* `x0`: a vector of length m that represents an initial guess of the solution x;
+* `y0`: a vector of length n that represents an initial guess of the solution y.
-TriCG can be warm-started from initial guesses `x0` and `y0` with the method
+#### Keyword arguments
- (x, y, stats) = tricg(A, b, c, x0, y0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the partitioned system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the partitioned system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `spd`: if `true`, set `τ = 1` and `ν = 1` for Hermitian and positive-definite linear systems;
+* `snd`: if `true`, set `τ = -1` and `ν = -1` for Hermitian and negative-definite linear systems;
+* `flip`: if `true`, set `τ = -1` and `ν = 1` for another known variant of Hermitian quasi-definite systems;
+* `τ` and `ν`: diagonal scaling factors of the partitioned Hermitian linear system;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length m;
+* `y`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -98,16 +122,18 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
end
function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- spd :: Bool=false, snd :: Bool=false, flip :: Bool=false,
- τ :: T=one(T), ν :: T=-one(T), itmax :: Int=0,
+ M=I, N=I, ldiv :: Bool=false,
+ spd :: Bool=false, snd :: Bool=false,
+ flip :: Bool=false, τ :: T=one(T),
+ ν :: T=-one(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("TriCG: system of %d equations in %d variables\n", m+n, m+n)
+ (verbose > 0) && @printf(iostream, "TriCG: system of %d equations in %d variables\n", m+n, m+n)
# Check flip, spd and snd parameters
spd && flip && error("The matrix cannot be SPD and SQD")
@@ -120,8 +146,8 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Determine τ and ν associated to SQD, SPD or SND systems.
flip && (τ = -one(T) ; ν = one(T))
@@ -133,7 +159,7 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
warm_start && (ν ≠ 0) && !NisI && error("Warm-start with preconditioners is not supported.")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :vₖ, S, m)
@@ -164,12 +190,12 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
N⁻¹uₖ₋₁ .= zero(FC) # u₀ = 0
# [ τI A ] [ xₖ ] = [ b - τΔx - AΔy ] = [ b₀ ]
- # [ Aᵀ νI ] [ yₖ ] [ c - AᵀΔx - νΔy ] [ c₀ ]
+ # [ Aᴴ νI ] [ yₖ ] [ c - AᴴΔx - νΔy ] [ c₀ ]
if warm_start
mul!(b₀, A, Δy)
(τ ≠ 0) && @kaxpy!(m, τ, Δx, b₀)
@kaxpby!(m, one(FC), b, -one(FC), b₀)
- mul!(c₀, Aᵀ, Δx)
+ mul!(c₀, Aᴴ, Δx)
(ν ≠ 0) && @kaxpy!(n, ν, Δy, c₀)
@kaxpby!(n, one(FC), c, -one(FC), c₀)
end
@@ -196,7 +222,7 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
error("c must be nonzero")
end
- # Initialize directions Gₖ such that Lₖ(Gₖ)ᵀ = (Wₖ)ᵀ
+ # Initialize directions Gₖ such that L̄ₖ(Gₖ)ᵀ = (Wₖ)ᵀ
gx₂ₖ₋₁ .= zero(FC)
gy₂ₖ₋₁ .= zero(FC)
gx₂ₖ .= zero(FC)
@@ -207,8 +233,8 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
history && push!(rNorms, rNorm)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s %7s %7s\n", "k", "‖rₖ‖", "βₖ₊₁", "γₖ₊₁")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ, γₖ)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s\n", "k", "‖rₖ‖", "βₖ₊₁", "γₖ₊₁")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ, γₖ)
# Set up workspace.
d₂ₖ₋₃ = d₂ₖ₋₂ = zero(T)
@@ -231,10 +257,10 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Continue the orthogonal tridiagonalization process.
# AUₖ = EVₖTₖ + βₖ₊₁Evₖ₊₁(eₖ)ᵀ = EVₖ₊₁Tₖ₊₁.ₖ
- # AᵀVₖ = FUₖ(Tₖ)ᵀ + γₖ₊₁Fuₖ₊₁(eₖ)ᵀ = FUₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴVₖ = FUₖ(Tₖ)ᴴ + γₖ₊₁Fuₖ₊₁(eₖ)ᵀ = FUₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , uₖ) # Forms Evₖ₊₁ : q ← Auₖ
- mul!(p, Aᵀ, vₖ) # Forms Fuₖ₊₁ : p ← Aᵀvₖ
+ mul!(p, Aᴴ, vₖ) # Forms Fuₖ₊₁ : p ← Aᴴvₖ
if iter ≥ 2
@kaxpy!(m, -γₖ, M⁻¹vₖ₋₁, q) # q ← q - γₖ * M⁻¹vₖ₋₁
@@ -254,14 +280,14 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# [0 u₁ ••• 0 uₖ]
#
# rₖ = [ b ] - [ τE A ] [ xₖ ] = [ b ] - [ τE A ] Wₖzₖ
- # [ c ] [ Aᵀ νF ] [ yₖ ] [ c ] [ Aᵀ νF ]
+ # [ c ] [ Aᴴ νF ] [ yₖ ] [ c ] [ Aᴴ νF ]
#
# block-Lanczos formulation : [ τE A ] Wₖ = [ E 0 ] Wₖ₊₁Sₖ₊₁.ₖ
- # [ Aᵀ νF ] [ 0 F ]
+ # [ Aᴴ νF ] [ 0 F ]
#
- # TriCG subproblem : (Wₖ)ᵀ * rₖ = 0 ↔ Sₖ.ₖzₖ = β₁e₁ + γ₁e₂
+ # TriCG subproblem : (Wₖ)ᴴ * rₖ = 0 ↔ Sₖ.ₖzₖ = β₁e₁ + γ₁e₂
#
- # Update the LDLᵀ factorization of Sₖ.ₖ.
+ # Update the LDLᴴ factorization of Sₖ.ₖ.
#
# [ τ α₁ γ₂ 0 • • • • 0 ]
# [ ᾱ₁ ν β₂ • • ]
@@ -306,7 +332,7 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
π₂ₖ = -(δₖ * d₂ₖ₋₁ * π₂ₖ₋₁ + λₖ * d₂ₖ₋₂ * π₂ₖ₋₂ + ηₖ * d₂ₖ₋₃ * π₂ₖ₋₃) / d₂ₖ
end
- # Solve Gₖ = Wₖ(Lₖ)⁻ᵀ ⟷ L̄ₖ(Gₖ)ᵀ = (Wₖ)ᵀ.
+ # Solve Gₖ = Wₖ(Lₖ)⁻ᴴ ⟷ L̄ₖ(Gₖ)ᵀ = (Wₖ)ᵀ.
if iter == 1
# [ 1 0 ] [ gx₁ gy₁ ] = [ v₁ 0 ]
# [ δ̄₁ 1 ] [ gx₂ gy₂ ] [ 0 u₁ ]
@@ -342,7 +368,7 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Compute vₖ₊₁ and uₖ₊₁
MisI || mulorldiv!(vₖ₊₁, M, q, ldiv) # βₖ₊₁vₖ₊₁ = MAuₖ - γₖvₖ₋₁ - αₖvₖ
- NisI || mulorldiv!(uₖ₊₁, N, p, ldiv) # γₖ₊₁uₖ₊₁ = NAᵀvₖ - βₖuₖ₋₁ - ᾱₖuₖ
+ NisI || mulorldiv!(uₖ₊₁, N, p, ldiv) # γₖ₊₁uₖ₊₁ = NAᴴvₖ - βₖuₖ₋₁ - ᾱₖuₖ
βₖ₊₁ = sqrt(@kdotr(m, vₖ₊₁, q)) # βₖ₊₁ = ‖vₖ₊₁‖_E
γₖ₊₁ = sqrt(@kdotr(n, uₖ₊₁, p)) # γₖ₊₁ = ‖uₖ₊₁‖_F
@@ -388,9 +414,9 @@ function tricg!(solver :: TricgSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
breakdown = βₖ₊₁ ≤ btol && γₖ₊₁ ≤ btol
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ₊₁, γₖ₊₁)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ₊₁, γₖ₊₁)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "inconsistent linear system")
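
A sketch of the partitioned-system interface documented above; the scaling of `A` is illustrative and keeps σₘₐₓ(A) < 1, so the `spd = true` variant below is genuinely positive definite:

    using Krylov, LinearAlgebra

    m, n = 6, 4
    A = randn(m, n) / 10
    b = randn(m)
    c = randn(n)

    # Default τ = 1, ν = -1: the Hermitian quasi-definite system [I A; Aᴴ -I].
    x, y, stats = tricg(A, b, c)

    # τ = ν = 1: positive definite here because ‖A‖ < 1.
    x, y, stats = tricg(A, b, c, spd=true)
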
diff --git a/src/trilqr.jl b/src/trilqr.jl
index edcb4c9b9..e11a8a6c6 100644
--- a/src/trilqr.jl
+++ b/src/trilqr.jl
@@ -1,5 +1,5 @@
# An implementation of TRILQR for the solution of square or
-# rectangular consistent linear adjoint systems Ax = b and Aᵀy = c.
+# rectangular consistent linear adjoint systems Ax = b and Aᴴy = c.
#
# This method is described in
#
@@ -14,32 +14,53 @@ export trilqr, trilqr!
"""
(x, y, stats) = trilqr(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- atol::T=√eps(T), rtol::T=√eps(T), transfer_to_usymcg::Bool=true,
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ transfer_to_usymcg::Bool=true, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
+ (x, y, stats) = trilqr(A, b, c, x0::AbstractVector, y0::AbstractVector; kwargs...)
+
+TriLQR can be warm-started from initial guesses `x0` and `y0` where `kwargs` are the same keyword arguments as above.
+
Combine USYMLQ and USYMQR to solve adjoint systems.
[0 A] [y] = [b]
- [Aᵀ 0] [x] [c]
+ [Aᴴ 0] [x] [c]
+
+USYMLQ is used for solving the primal system `Ax = b` of size m × n.
+USYMQR is used for solving the dual system `Aᴴy = c` of size n × m.
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
-USYMLQ is used for solving primal system `Ax = b`.
-USYMQR is used for solving dual system `Aᵀy = c`.
+#### Optional arguments
-An option gives the possibility of transferring from the USYMLQ point to the
-USYMCG point, when it exists. The transfer is based on the residual norm.
+* `x0`: a vector of length n that represents an initial guess of the solution x;
+* `y0`: a vector of length m that represents an initial guess of the solution y.
-TriLQR can be warm-started from initial guesses `x0` and `y0` with the method
+#### Keyword arguments
- (x, y, stats) = trilqr(A, b, c, x0, y0; kwargs...)
+* `transfer_to_usymcg`: transfer from the USYMLQ point to the USYMCG point, when it exists. The transfer is based on the residual norm;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `y`: a dense vector of length m;
+* `stats`: statistics collected on the run in an [`AdjointStats`](@ref) structure.
#### Reference
@@ -77,23 +98,24 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
end
function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- atol :: T=√eps(T), rtol :: T=√eps(T), transfer_to_usymcg :: Bool=true,
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ transfer_to_usymcg :: Bool=true, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("TRILQR: primal system of %d equations in %d variables\n", m, n)
- (verbose > 0) && @printf("TRILQR: dual system of %d equations in %d variables\n", n, m)
+ (verbose > 0) && @printf(iostream, "TRILQR: primal system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "TRILQR: dual system of %d equations in %d variables\n", n, m)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
uₖ₋₁, uₖ, p, d̅, x, stats = solver.uₖ₋₁, solver.uₖ, solver.p, solver.d̅, solver.x, solver.stats
@@ -107,7 +129,7 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
if warm_start
mul!(r₀, A, Δx)
@kaxpby!(n, one(FC), b, -one(FC), r₀)
- mul!(s₀, Aᵀ, Δy)
+ mul!(s₀, Aᴴ, Δy)
@kaxpby!(n, one(FC), c, -one(FC), s₀)
end
@@ -115,7 +137,7 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
x .= zero(FC) # x₀
bNorm = @knrm2(m, r₀) # rNorm = ‖r₀‖
- # Initial solution y₀ and residual s₀ = c - Aᵀy₀.
+ # Initial solution y₀ and residual s₀ = c - Aᴴy₀.
t .= zero(FC) # t₀
cNorm = @knrm2(n, s₀) # sNorm = ‖s₀‖
@@ -127,8 +149,8 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
εL = atol + rtol * bNorm
εQ = atol + rtol * cNorm
ξ = zero(T)
- (verbose > 0) && @printf("%5s %7s %7s\n", "k", "‖rₖ‖", "‖sₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e\n", iter, bNorm, cNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s\n", "k", "‖rₖ‖", "‖sₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e\n", iter, bNorm, cNorm)
# Set up workspace.
βₖ = @knrm2(m, r₀) # β₁ = ‖r₀‖ = ‖v₁‖
@@ -136,17 +158,17 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
vₖ₋₁ .= zero(FC) # v₀ = 0
uₖ₋₁ .= zero(FC) # u₀ = 0
vₖ .= r₀ ./ βₖ # v₁ = (b - Ax₀) / β₁
- uₖ .= s₀ ./ γₖ # u₁ = (c - Aᵀy₀) / γ₁
+ uₖ .= s₀ ./ γₖ # u₁ = (c - Aᴴy₀) / γ₁
cₖ₋₁ = cₖ = -one(T) # Givens cosines used for the LQ factorization of Tₖ
sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the LQ factorization of Tₖ
- d̅ .= zero(FC) # Last column of D̅ₖ = Uₖ(Qₖ)ᵀ
+ d̅ .= zero(FC) # Last column of D̅ₖ = Uₖ(Qₖ)ᴴ
ζₖ₋₁ = ζbarₖ = zero(FC) # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
ζₖ₋₂ = ηₖ = zero(FC) # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
δbarₖ₋₁ = δbarₖ = zero(FC) # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
ψbarₖ₋₁ = ψₖ₋₁ = zero(FC) # ψₖ₋₁ and ψbarₖ are the last components of h̅ₖ = Qₖγ₁e₁
ϵₖ₋₃ = λₖ₋₂ = zero(FC) # Components of Lₖ₋₁
- wₖ₋₃ .= zero(FC) # Column k-3 of Wₖ = Vₖ(Lₖ)⁻ᵀ
- wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Vₖ(Lₖ)⁻ᵀ
+ wₖ₋₃ .= zero(FC) # Column k-3 of Wₖ = Vₖ(Lₖ)⁻ᴴ
+ wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Vₖ(Lₖ)⁻ᴴ
# Stopping criterion.
inconsistent = false
@@ -166,10 +188,10 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
# Continue the SSY tridiagonalization process.
# AUₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀVₖ = Uₖ(Tₖ)ᵀ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴVₖ = Uₖ(Tₖ)ᴴ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , uₖ) # Forms vₖ₊₁ : q ← Auₖ
- mul!(p, Aᵀ, vₖ) # Forms uₖ₊₁ : p ← Aᵀvₖ
+ mul!(p, Aᴴ, vₖ) # Forms uₖ₊₁ : p ← Aᴴvₖ
@kaxpy!(m, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - βₖ * uₖ₋₁
@@ -236,7 +258,7 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
ηₖ = -ϵₖ₋₂ * ζₖ₋₂ - λₖ₋₁ * ζₖ₋₁
end
- # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Uₖ(Qₖ)ᵀ.
+ # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Uₖ(Qₖ)ᴴ.
# [d̅ₖ₋₁ uₖ] [cₖ s̄ₖ] = [dₖ₋₁ d̅ₖ] ⟷ dₖ₋₁ = cₖ * d̅ₖ₋₁ + sₖ * uₖ
# [sₖ -cₖ] ⟷ d̅ₖ = s̄ₖ * d̅ₖ₋₁ - cₖ * uₖ
if iter ≥ 2
@@ -295,7 +317,7 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
ψbarₖ = sₖ * ψbarₖ₋₁
end
- # Compute the direction wₖ₋₁, the last column of Wₖ₋₁ = (Vₖ₋₁)(Lₖ₋₁)⁻ᵀ ⟷ (L̄ₖ₋₁)(Wₖ₋₁)ᵀ = (Vₖ₋₁)ᵀ.
+ # Compute the direction wₖ₋₁, the last column of Wₖ₋₁ = (Vₖ₋₁)(Lₖ₋₁)⁻ᴴ ⟷ (L̄ₖ₋₁)(Wₖ₋₁)ᵀ = (Vₖ₋₁)ᵀ.
# w₁ = v₁ / δ̄₁
if iter == 2
wₖ₋₁ = wₖ₋₂
@@ -374,11 +396,11 @@ function trilqr!(solver :: TrilqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
user_requested_exit = callback(solver) :: Bool
tired = iter ≥ itmax
- kdisplay(iter, verbose) && solved_primal && !solved_dual && @printf("%5d %7s %7.1e\n", iter, "", sNorm)
- kdisplay(iter, verbose) && !solved_primal && solved_dual && @printf("%5d %7.1e %7s\n", iter, rNorm_lq, "")
- kdisplay(iter, verbose) && !solved_primal && !solved_dual && @printf("%5d %7.1e %7.1e\n", iter, rNorm_lq, sNorm)
+ kdisplay(iter, verbose) && solved_primal && !solved_dual && @printf(iostream, "%5d %7s %7.1e\n", iter, "", sNorm)
+ kdisplay(iter, verbose) && !solved_primal && solved_dual && @printf(iostream, "%5d %7.1e %7s\n", iter, rNorm_lq, "")
+ kdisplay(iter, verbose) && !solved_primal && !solved_dual && @printf(iostream, "%5d %7.1e %7.1e\n", iter, rNorm_lq, sNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute USYMCG point
# (xᶜ)ₖ ← (xᴸ)ₖ₋₁ + ζbarₖ * d̅ₖ
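
A sketch of the adjoint-systems interface documented above; the square matrix is illustrative and guarantees that both systems are consistent:

    using Krylov, LinearAlgebra

    n = 10
    A = randn(n, n)                      # nonsingular almost surely
    b = randn(n)                         # primal right-hand side: Ax = b
    c = randn(n)                         # dual right-hand side: Aᴴy = c

    x, y, stats = trilqr(A, b, c)
    # stats is an AdjointStats; stats.solved_primal and stats.solved_dual
    # report the convergence of each system separately.
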
diff --git a/src/trimr.jl b/src/trimr.jl
index bc53633c2..9da4dfa92 100644
--- a/src/trimr.jl
+++ b/src/trimr.jl
@@ -13,30 +13,31 @@ export trimr, trimr!
"""
(x, y, stats) = trimr(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- M=I, N=I, atol::T=√eps(T), rtol::T=√eps(T),
- spd::Bool=false, snd::Bool=false, flip::Bool=false, sp::Bool=false,
- τ::T=one(T), ν::T=-one(T), itmax::Int=0,
+ M=I, N=I, ldiv::Bool=false,
+ spd::Bool=false, snd::Bool=false,
+ flip::Bool=false, sp::Bool=false,
+ τ::T=one(T), ν::T=-one(T), atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
verbose::Int=0, history::Bool=false,
- ldiv::Bool=false, callback=solver->false)
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-TriMR solves the symmetric linear system
+ (x, y, stats) = trimr(A, b, c, x0::AbstractVector, y0::AbstractVector; kwargs...)
+
+TriMR can be warm-started from initial guesses `x0` and `y0` where `kwargs` are the same keyword arguments as above.
+
+Given a matrix `A` of dimension m × n, TriMR solves the Hermitian linear system
[ τE A ] [ x ] = [ b ]
- [ Aᵀ νF ] [ y ] [ c ],
+ [ Aᴴ νF ] [ y ] [ c ],
-where τ and ν are real numbers, E = M⁻¹ ≻ 0, F = N⁻¹ ≻ 0.
+of size (n+m) × (n+m) where τ and ν are real numbers, E = M⁻¹ ≻ 0, F = N⁻¹ ≻ 0.
`b` and `c` must both be nonzero.
TriMR handles saddle-point systems (`τ = 0` or `ν = 0`) and adjoint systems (`τ = 0` and `ν = 0`) without any risk of breakdown.
By default, TriMR solves Hermitian and quasi-definite linear systems with τ = 1 and ν = -1.
-If `flip = true`, TriMR solves another known variant of SQD systems where τ = -1 and ν = 1.
-If `spd = true`, τ = ν = 1 and the associated symmetric and positive definite linear system is solved.
-If `snd = true`, τ = ν = -1 and the associated symmetric and negative definite linear system is solved.
-If `sp = true`, τ = 1, ν = 0 and the associated saddle-point linear system is solved.
-`τ` and `ν` are also keyword arguments that can be directly modified for more specific problems.
TriMR is based on the preconditioned orthogonal tridiagonalization process
and its relation with the preconditioned block-Lanczos process.
@@ -50,17 +51,40 @@ It's the Euclidean norm when `M` and `N` are identity operators.
TriMR stops when `itmax` iterations are reached or when `‖rₖ‖ ≤ atol + ‖r₀‖ * rtol`.
`atol` is an absolute tolerance and `rtol` is a relative tolerance.
-Additional details can be displayed if verbose mode is enabled (verbose > 0).
-Information will be displayed every `verbose` iterations.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
+
+#### Optional arguments
+
+* `x0`: a vector of length m that represents an initial guess of the solution x;
+* `y0`: a vector of length n that represents an initial guess of the solution y.
-TriMR can be warm-started from initial guesses `x0` and `y0` with the method
+#### Keyword arguments
- (x, y, stats) = trimr(A, b, c, x0, y0; kwargs...)
+* `M`: linear operator that models a Hermitian positive-definite matrix of size `m` used for centered preconditioning of the partitioned system;
+* `N`: linear operator that models a Hermitian positive-definite matrix of size `n` used for centered preconditioning of the partitioned system;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
+* `spd`: if `true`, set `τ = 1` and `ν = 1` for Hermitian and positive-definite linear systems;
+* `snd`: if `true`, set `τ = -1` and `ν = -1` for Hermitian and negative-definite linear systems;
+* `flip`: if `true`, set `τ = -1` and `ν = 1` for another known variant of Hermitian quasi-definite systems;
+* `sp`: if `true`, set `τ = 1` and `ν = 0` for saddle-point systems;
+* `τ` and `ν`: diagonal scaling factors of the partitioned Hermitian linear system;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length m;
+* `y`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### Reference
@@ -98,16 +122,18 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
end
function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- M=I, N=I, atol :: T=√eps(T), rtol :: T=√eps(T),
- spd :: Bool=false, snd :: Bool=false, flip :: Bool=false, sp :: Bool=false,
- τ :: T=one(T), ν :: T=-one(T), itmax :: Int=0,
+ M=I, N=I, ldiv :: Bool=false,
+ spd :: Bool=false, snd :: Bool=false,
+ flip :: Bool=false, sp :: Bool=false,
+ τ :: T=one(T), ν :: T=-one(T), atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
verbose :: Int=0, history :: Bool=false,
- ldiv :: Bool=false, callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("TriMR: system of %d equations in %d variables\n", m+n, m+n)
+ (verbose > 0) && @printf(iostream, "TriMR: system of %d equations in %d variables\n", m+n, m+n)
# Check flip, sp, spd and snd parameters
spd && flip && error("The matrix cannot be symmetric positive definite and symmetric quasi-definite !")
@@ -123,8 +149,8 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Determine τ and ν associated to SQD, SPD or SND systems.
flip && (τ = -one(T) ; ν = one(T))
@@ -137,7 +163,7 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
warm_start && (ν ≠ 0) && !NisI && error("Warm-start with preconditioners is not supported.")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
allocate_if(!MisI, solver, :vₖ, S, m)
@@ -169,12 +195,12 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
N⁻¹uₖ₋₁ .= zero(FC) # u₀ = 0
# [ τI A ] [ xₖ ] = [ b - τΔx - AΔy ] = [ b₀ ]
- # [ Aᵀ νI ] [ yₖ ] [ c - AᵀΔx - νΔy ] [ c₀ ]
+ # [ Aᴴ νI ] [ yₖ ] [ c - AᴴΔx - νΔy ] [ c₀ ]
if warm_start
mul!(b₀, A, Δy)
(τ ≠ 0) && @kaxpy!(m, τ, Δx, b₀)
@kaxpby!(m, one(FC), b, -one(FC), b₀)
- mul!(c₀, Aᵀ, Δx)
+ mul!(c₀, Aᴴ, Δx)
(ν ≠ 0) && @kaxpy!(n, ν, Δy, c₀)
@kaxpby!(n, one(FC), c, -one(FC), c₀)
end
@@ -216,8 +242,8 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
history && push!(rNorms, rNorm)
ε = atol + rtol * rNorm
- (verbose > 0) && @printf("%5s %7s %7s %7s\n", "k", "‖rₖ‖", "βₖ₊₁", "γₖ₊₁")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ, γₖ)
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s %7s\n", "k", "‖rₖ‖", "βₖ₊₁", "γₖ₊₁")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ, γₖ)
# Set up workspace.
old_c₁ₖ = old_c₂ₖ = old_c₃ₖ = old_c₄ₖ = zero(T)
@@ -244,10 +270,10 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Continue the orthogonal tridiagonalization process.
# AUₖ = EVₖTₖ + βₖ₊₁Evₖ₊₁(eₖ)ᵀ = EVₖ₊₁Tₖ₊₁.ₖ
- # AᵀVₖ = FUₖ(Tₖ)ᵀ + γₖ₊₁Fuₖ₊₁(eₖ)ᵀ = FUₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴVₖ = FUₖ(Tₖ)ᴴ + γₖ₊₁Fuₖ₊₁(eₖ)ᵀ = FUₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , uₖ) # Forms Evₖ₊₁ : q ← Auₖ
- mul!(p, Aᵀ, vₖ) # Forms Fuₖ₊₁ : p ← Aᵀvₖ
+ mul!(p, Aᴴ, vₖ) # Forms Fuₖ₊₁ : p ← Aᴴvₖ
if iter ≥ 2
@kaxpy!(m, -γₖ, M⁻¹vₖ₋₁, q) # q ← q - γₖ * M⁻¹vₖ₋₁
@@ -261,7 +287,7 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# Compute vₖ₊₁ and uₖ₊₁
MisI || mulorldiv!(vₖ₊₁, M, q, ldiv) # βₖ₊₁vₖ₊₁ = MAuₖ - γₖvₖ₋₁ - αₖvₖ
- NisI || mulorldiv!(uₖ₊₁, N, p, ldiv) # γₖ₊₁uₖ₊₁ = NAᵀvₖ - βₖuₖ₋₁ - ᾱₖuₖ
+ NisI || mulorldiv!(uₖ₊₁, N, p, ldiv) # γₖ₊₁uₖ₊₁ = NAᴴvₖ - βₖuₖ₋₁ - ᾱₖuₖ
βₖ₊₁ = sqrt(@kdotr(m, vₖ₊₁, q)) # βₖ₊₁ = ‖vₖ₊₁‖_E
γₖ₊₁ = sqrt(@kdotr(n, uₖ₊₁, p)) # γₖ₊₁ = ‖uₖ₊₁‖_F
@@ -282,10 +308,10 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
# [0 u₁ ••• 0 uₖ]
#
# rₖ = [ b ] - [ τE A ] [ xₖ ] = [ b ] - [ τE A ] Wₖzₖ
- # [ c ] [ Aᵀ νF ] [ yₖ ] [ c ] [ Aᵀ νF ]
+ # [ c ] [ Aᴴ νF ] [ yₖ ] [ c ] [ Aᴴ νF ]
#
# block-Lanczos formulation : [ τE A ] Wₖ = [ E 0 ] Wₖ₊₁Sₖ₊₁.ₖ
- # [ Aᵀ νF ] [ 0 F ]
+ # [ Aᴴ νF ] [ 0 F ]
#
# TriMR subproblem : min ‖ rₖ ‖ ↔ min ‖ Sₖ₊₁.ₖzₖ - β₁e₁ - γ₁e₂ ‖
#
@@ -419,7 +445,7 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
@kswap(gy₂ₖ₋₂, gy₂ₖ)
end
- # Update p̅ₖ = (Qₖ)ᵀ * (β₁e₁ + γ₁e₂)
+ # Update p̅ₖ = (Qₖ)ᴴ * (β₁e₁ + γ₁e₂)
πbis₂ₖ = c₁ₖ * πbar₂ₖ
πbis₂ₖ₊₂ = conj(s₁ₖ) * πbar₂ₖ
#
@@ -490,9 +516,9 @@ function trimr!(solver :: TrimrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c ::
breakdown = βₖ₊₁ ≤ btol && γₖ₊₁ ≤ btol
solved = resid_decrease_lim || resid_decrease_mach
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ₊₁, γₖ₊₁)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e %7.1e\n", iter, rNorm, βₖ₊₁, γₖ₊₁)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
breakdown && (status = "inconsistent linear system")
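
A sketch of the `sp = true` saddle-point mode documented above (matrix and sizes illustrative; `A` has full column rank almost surely, so the system is nonsingular):

    using Krylov, LinearAlgebra

    m, n = 6, 4
    A = randn(m, n)
    b = randn(m)
    c = randn(n)

    # τ = 1, ν = 0: the saddle-point system [I A; Aᴴ 0] [x; y] = [b; c].
    x, y, stats = trimr(A, b, c, sp=true)
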
diff --git a/src/usymlq.jl b/src/usymlq.jl
index 71670c80f..53aef51a3 100644
--- a/src/usymlq.jl
+++ b/src/usymlq.jl
@@ -21,34 +21,53 @@ export usymlq, usymlq!
"""
(x, stats) = usymlq(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- atol::T=√eps(T), rtol::T=√eps(T), transfer_to_usymcg::Bool=true,
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ transfer_to_usymcg::Bool=true, atol::T=√eps(T),
+ rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the linear system Ax = b using the USYMLQ method.
+ (x, stats) = usymlq(A, b, c, x0::AbstractVector; kwargs...)
+
+USYMLQ can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+USYMLQ determines the least-norm solution of the consistent linear system Ax = b of size m × n.
USYMLQ is based on the orthogonal tridiagonalization process and requires two initial nonzero vectors `b` and `c`.
-The vector `c` is only used to initialize the process and a default value can be `b` or `Aᵀb` depending on the shape of `A`.
+The vector `c` is only used to initialize the process and a default value can be `b` or `Aᴴb` depending on the shape of `A`.
The error norm ‖x - x*‖ decreases monotonically in USYMLQ.
It's considered a generalization of SYMMLQ.
It can also be applied to under-determined and over-determined problems.
In all cases, problems must be consistent.
-An option gives the possibility of transferring to the USYMCG point,
-when it exists. The transfer is based on the residual norm.
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
-USYMLQ can be warm-started from an initial guess `x0` with the method
+#### Keyword arguments
- (x, stats) = usymlq(A, b, c, x0; kwargs...)
+* `transfer_to_usymcg`: transfer from the USYMLQ point to the USYMCG point, when it exists. The transfer is based on the residual norm;
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -88,22 +107,23 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
end
function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- atol :: T=√eps(T), rtol :: T=√eps(T), transfer_to_usymcg :: Bool=true,
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ transfer_to_usymcg :: Bool=true, atol :: T=√eps(T),
+ rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("USYMLQ: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "USYMLQ: system of %d equations in %d variables\n", m, n)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
uₖ₋₁, uₖ, p, Δx, x = solver.uₖ₋₁, solver.uₖ, solver.p, solver.Δx, solver.x
@@ -135,8 +155,8 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
itmax == 0 && (itmax = m+n)
ε = atol + rtol * bNorm
- (verbose > 0) && @printf("%5s %7s\n", "k", "‖rₖ‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, bNorm)
+ (verbose > 0) && @printf(iostream, "%5s %7s\n", "k", "‖rₖ‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, bNorm)
βₖ = @knrm2(m, r₀) # β₁ = ‖v₁‖ = ‖r₀‖
γₖ = @knrm2(n, c) # γ₁ = ‖u₁‖ = ‖c‖
@@ -146,7 +166,7 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
uₖ .= c ./ γₖ # u₁ = c / γ₁
cₖ₋₁ = cₖ = -one(T) # Givens cosines used for the LQ factorization of Tₖ
sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the LQ factorization of Tₖ
- d̅ .= zero(FC) # Last column of D̅ₖ = Uₖ(Qₖ)ᵀ
+ d̅ .= zero(FC) # Last column of D̅ₖ = Uₖ(Qₖ)ᴴ
ζₖ₋₁ = ζbarₖ = zero(FC) # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
ζₖ₋₂ = ηₖ = zero(FC) # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
δbarₖ₋₁ = δbarₖ = zero(FC) # Coefficients of Lₖ₋₁ and Lₖ modified over the course of two iterations
@@ -164,10 +184,10 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
# Continue the SSY tridiagonalization process.
# AUₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀVₖ = Uₖ(Tₖ)ᵀ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴVₖ = Uₖ(Tₖ)ᴴ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , uₖ) # Forms vₖ₊₁ : q ← Auₖ
- mul!(p, Aᵀ, vₖ) # Forms uₖ₊₁ : p ← Aᵀvₖ
+ mul!(p, Aᴴ, vₖ) # Forms uₖ₊₁ : p ← Aᴴvₖ
@kaxpy!(m, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - βₖ * uₖ₋₁
@@ -233,7 +253,7 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
ηₖ = -ϵₖ₋₂ * ζₖ₋₂ - λₖ₋₁ * ζₖ₋₁
end
- # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Uₖ(Qₖ)ᵀ.
+ # Relations for the directions dₖ₋₁ and d̅ₖ, the last two columns of D̅ₖ = Uₖ(Qₖ)ᴴ.
# [d̅ₖ₋₁ uₖ] [cₖ s̄ₖ] = [dₖ₋₁ d̅ₖ] ⟷ dₖ₋₁ = cₖ * d̅ₖ₋₁ + sₖ * uₖ
# [sₖ -cₖ] ⟷ d̅ₖ = s̄ₖ * d̅ₖ₋₁ - cₖ * uₖ
if iter ≥ 2
@@ -294,9 +314,9 @@ function usymlq!(solver :: UsymlqSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
solved_lq = rNorm_lq ≤ ε
solved_cg = transfer_to_usymcg && (abs(δbarₖ) > eps(T)) && (rNorm_cg ≤ ε)
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e\n", iter, rNorm_lq)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e\n", iter, rNorm_lq)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
# Compute USYMCG point
# (xᶜ)ₖ ← (xᴸ)ₖ₋₁ + ζbarₖ * d̅ₖ
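
A minimal usage sketch of the warm-started API documented above; the operator `A`, the vectors `b` and `c`, and the guess `x0` are illustrative placeholders:

    using Krylov, LinearAlgebra
    n = 10
    A = randn(n, n) + 5I           # generically nonsingular, so Ax = b is consistent
    b = randn(n)
    c = A' * b                     # a common default for the second initial vector
    x0 = zeros(n)                  # warm-start guess
    x, stats = usymlq(A, b, c, x0; transfer_to_usymcg=true)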
diff --git a/src/usymqr.jl b/src/usymqr.jl
index 863390c3f..3876499b5 100644
--- a/src/usymqr.jl
+++ b/src/usymqr.jl
@@ -21,31 +21,52 @@ export usymqr, usymqr!
"""
(x, stats) = usymqr(A, b::AbstractVector{FC}, c::AbstractVector{FC};
- atol::T=√eps(T), rtol::T=√eps(T),
- itmax::Int=0, verbose::Int=0, history::Bool=false,
- callback=solver->false)
+ atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
+ verbose::Int=0, history::Bool=false,
+ callback=solver->false, iostream::IO=kstdout)
`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
`FC` is `T` or `Complex{T}`.
-Solve the linear system Ax = b using the USYMQR method.
+ (x, stats) = usymqr(A, b, c, x0::AbstractVector; kwargs...)
+
+USYMQR can be warm-started from an initial guess `x0` where `kwargs` are the same keyword arguments as above.
+
+USYMQR solves the linear least-squares problem min ‖b - Ax‖² of size m × n.
+USYMQR solves Ax = b if it is consistent.
USYMQR is based on the orthogonal tridiagonalization process and requires two initial nonzero vectors `b` and `c`.
-The vector `c` is only used to initialize the process and a default value can be `b` or `Aᵀb` depending on the shape of `A`.
+The vector `c` is only used to initialize the process and a default value can be `b` or `Aᴴb` depending on the shape of `A`.
The residual norm ‖b - Ax‖ decreases monotonically in USYMQR.
It can be seen as a generalization of MINRES.
It can also be applied to under-determined and over-determined problems.
USYMQR finds the minimum-norm solution when the problem is inconsistent.
-USYMQR can be warm-started from an initial guess `x0` with the method
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `b`: a vector of length m;
+* `c`: a vector of length n.
+
+#### Optional argument
+
+* `x0`: a vector of length n that represents an initial guess of the solution x.
+
+#### Keyword arguments
- (x, stats) = usymqr(A, b, c, x0; kwargs...)
+* `atol`: absolute stopping tolerance based on the residual norm;
+* `rtol`: relative stopping tolerance based on the residual norm;
+* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `m+n`;
+* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations;
+* `history`: collect additional statistics on the run such as residual norms or Aᴴ-residual norms;
+* `callback`: function or functor called as `callback(solver)` that returns `true` if the Krylov method should terminate, and `false` otherwise;
+* `iostream`: stream to which output is logged.
-where `kwargs` are the same keyword arguments as above.
+
+#### Output arguments
-The callback is called as `callback(solver)` and should return `true` if the main loop should terminate,
-and `false` otherwise.
+* `x`: a dense vector of length n;
+* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure.
#### References
@@ -85,28 +106,28 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
end
function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :: AbstractVector{FC};
- atol :: T=√eps(T), rtol :: T=√eps(T),
- itmax :: Int=0, verbose :: Int=0, history :: Bool=false,
- callback = solver -> false) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
+ atol :: T=√eps(T), rtol :: T=√eps(T), itmax :: Int=0,
+ verbose :: Int=0, history :: Bool=false,
+ callback = solver -> false, iostream :: IO=kstdout) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: DenseVector{FC}}
m, n = size(A)
length(b) == m || error("Inconsistent problem size")
length(c) == n || error("Inconsistent problem size")
- (verbose > 0) && @printf("USYMQR: system of %d equations in %d variables\n", m, n)
+ (verbose > 0) && @printf(iostream, "USYMQR: system of %d equations in %d variables\n", m, n)
# Check type consistency
eltype(A) == FC || error("eltype(A) ≠ $FC")
- ktypeof(b) == S || error("ktypeof(b) ≠ $S")
- ktypeof(c) == S || error("ktypeof(c) ≠ $S")
+ ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
+ ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
# Compute the adjoint of A
- Aᵀ = A'
+ Aᴴ = A'
# Set up workspace.
vₖ₋₁, vₖ, q, Δx, x, p = solver.vₖ₋₁, solver.vₖ, solver.q, solver.Δx, solver.x, solver.p
wₖ₋₂, wₖ₋₁, uₖ₋₁, uₖ, stats = solver.wₖ₋₂, solver.wₖ₋₁, solver.uₖ₋₁, solver.uₖ, solver.stats
warm_start = solver.warm_start
- rNorms, AᵀrNorms = stats.residuals, stats.Aresiduals
+ rNorms, AᴴrNorms = stats.residuals, stats.Aresiduals
reset!(stats)
r₀ = warm_start ? q : b
@@ -133,8 +154,8 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
ε = atol + rtol * rNorm
κ = zero(T)
- (verbose > 0) && @printf("%5s %7s %7s\n", "k", "‖rₖ‖", "‖Aᵀrₖ₋₁‖")
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7s\n", iter, rNorm, "✗ ✗ ✗ ✗")
+ (verbose > 0) && @printf(iostream, "%5s %7s %7s\n", "k", "‖rₖ‖", "‖Aᴴrₖ₋₁‖")
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7s\n", iter, rNorm, "✗ ✗ ✗ ✗")
βₖ = @knrm2(m, r₀) # β₁ = ‖v₁‖ = ‖r₀‖
γₖ = @knrm2(n, c) # γ₁ = ‖u₁‖ = ‖c‖
@@ -146,7 +167,7 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
sₖ₋₂ = sₖ₋₁ = sₖ = zero(FC) # Givens sines used for the QR factorization of Tₖ₊₁.ₖ
wₖ₋₂ .= zero(FC) # Column k-2 of Wₖ = Uₖ(Rₖ)⁻¹
wₖ₋₁ .= zero(FC) # Column k-1 of Wₖ = Uₖ(Rₖ)⁻¹
- ζbarₖ = βₖ # ζbarₖ is the last component of z̅ₖ = (Qₖ)ᵀβ₁e₁
+ ζbarₖ = βₖ # ζbarₖ is the last component of z̅ₖ = (Qₖ)ᴴβ₁e₁
# Stopping criterion.
solved = rNorm ≤ ε
@@ -161,10 +182,10 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
# Continue the SSY tridiagonalization process.
# AUₖ = VₖTₖ + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
- # AᵀVₖ = Uₖ(Tₖ)ᵀ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᵀ
+ # AᴴVₖ = Uₖ(Tₖ)ᴴ + γₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
mul!(q, A , uₖ) # Forms vₖ₊₁ : q ← Auₖ
- mul!(p, Aᵀ, vₖ) # Forms uₖ₊₁ : p ← Aᵀvₖ
+ mul!(p, Aᴴ, vₖ) # Forms uₖ₊₁ : p ← Aᴴvₖ
@kaxpy!(m, -γₖ, vₖ₋₁, q) # q ← q - γₖ * vₖ₋₁
@kaxpy!(n, -βₖ, uₖ₋₁, p) # p ← p - βₖ * uₖ₋₁
@@ -254,9 +275,9 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
rNorm = abs(ζbarₖ₊₁)
history && push!(rNorms, rNorm)
- # Compute ‖Aᵀrₖ₋₁‖ = |ζbarₖ| * √(|δbarₖ|² + |λbarₖ|²).
- AᵀrNorm = abs(ζbarₖ) * √(abs2(δbarₖ) + abs2(cₖ₋₁ * γₖ₊₁))
- history && push!(AᵀrNorms, AᵀrNorm)
+ # Compute ‖Aᴴrₖ₋₁‖ = |ζbarₖ| * √(|δbarₖ|² + |λbarₖ|²).
+ AᴴrNorm = abs(ζbarₖ) * √(abs2(δbarₖ) + abs2(cₖ₋₁ * γₖ₊₁))
+ history && push!(AᴴrNorms, AᴴrNorm)
# Compute vₖ₊₁ and uₖ₊₁.
@. vₖ₋₁ = vₖ # vₖ₋₁ ← vₖ
@@ -286,14 +307,14 @@ function usymqr!(solver :: UsymqrSolver{T,FC,S}, A, b :: AbstractVector{FC}, c :
βₖ = βₖ₊₁
# Update stopping criterion.
- iter == 1 && (κ = atol + rtol * AᵀrNorm)
+ iter == 1 && (κ = atol + rtol * AᴴrNorm)
user_requested_exit = callback(solver) :: Bool
solved = rNorm ≤ ε
- inconsistent = !solved && AᵀrNorm ≤ κ
+ inconsistent = !solved && AᴴrNorm ≤ κ
tired = iter ≥ itmax
- kdisplay(iter, verbose) && @printf("%5d %7.1e %7.1e\n", iter, rNorm, AᵀrNorm)
+ kdisplay(iter, verbose) && @printf(iostream, "%5d %7.1e %7.1e\n", iter, rNorm, AᴴrNorm)
end
- (verbose > 0) && @printf("\n")
+ (verbose > 0) && @printf(iostream, "\n")
tired && (status = "maximum number of iterations exceeded")
solved && (status = "solution good enough given atol and rtol")
user_requested_exit && (status = "user-requested exit")
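
USYMQR takes the same triplet `(A, b, c)`. A sketch on an over-determined system with illustrative data, where the minimum-residual property described above applies:

    using Krylov
    m, n = 20, 10
    A = randn(m, n)
    b = randn(m)       # generically inconsistent when m > n
    c = A' * b         # only used to initialize the orthogonal tridiagonalization
    x, stats = usymqr(A, b, c)
    # norm(b - A * x) is the least-squares residual; stats.status gives the stopping reason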
diff --git a/test/callback_utils.jl b/test/callback_utils.jl
new file mode 100644
index 000000000..f88f01848
--- /dev/null
+++ b/test/callback_utils.jl
@@ -0,0 +1,152 @@
+mutable struct StorageGetxRestartedGmres{S}
+ x::S
+ y::S
+ p::S
+end
+StorageGetxRestartedGmres(solver::GmresSolver; N = I) =
+ StorageGetxRestartedGmres(similar(solver.x), similar(solver.z), (N === I) ? similar(solver.p) : similar(solver.x))
+
+function get_x_restarted_gmres!(solver::GmresSolver{T,FC,S}, A,
+ stor::StorageGetxRestartedGmres{S}, N) where {T,FC,S}
+ NisI = (N === I)
+ x2, y2, p2 = stor.x, stor.y, stor.p
+ n = size(A, 2)
+ # Compute yₖ by solving Rₖyₖ = zₖ with backward substitution.
+ nr = sum(1:solver.inner_iter)
+ y = solver.z # yᵢ = zᵢ
+ y2 .= y
+ R = solver.R
+ V = solver.V
+ x2 .= solver.Δx
+ for i = solver.inner_iter : -1 : 1
+ pos = nr + i - solver.inner_iter # position of rᵢ.ₖ
+ for j = solver.inner_iter : -1 : i+1
+ y2[i] = y2[i] - R[pos] * y2[j] # yᵢ ← yᵢ - rᵢⱼyⱼ
+ pos = pos - j + 1 # position of rᵢ.ⱼ₋₁
+ end
+ # Rₖ can be singular if the system is inconsistent
+ if abs(R[pos]) ≤ eps(T)^(3/4)
+ y2[i] = zero(FC)
+ inconsistent = true
+ else
+ y2[i] = y2[i] / R[pos] # yᵢ ← yᵢ / rᵢᵢ
+ end
+ end
+
+ # Form xₖ = N⁻¹Vₖyₖ
+ for i = 1 : solver.inner_iter
+ Krylov.@kaxpy!(n, y2[i], V[i], x2)
+ end
+ if !NisI
+ p2 .= solver.p
+ p2 .= x2
+ mul!(x2, N, p2)
+ end
+ x2 .+= solver.x
+end
+
+mutable struct TestCallbackN2{T, S, M}
+ A::M
+ b::S
+ storage_vec::S
+ tol::T
+end
+TestCallbackN2(A, b; tol = 0.1) = TestCallbackN2(A, b, similar(b), tol)
+
+function (cb_n2::TestCallbackN2)(solver)
+ mul!(cb_n2.storage_vec, cb_n2.A, solver.x)
+ cb_n2.storage_vec .-= cb_n2.b
+ return norm(cb_n2.storage_vec) ≤ cb_n2.tol
+end
+
+mutable struct TestCallbackN2Adjoint{T, S, M}
+ A::M
+ b::S
+ c::S
+ storage_vec1::S
+ storage_vec2::S
+ tol::T
+end
+TestCallbackN2Adjoint(A, b, c; tol = 0.1) = TestCallbackN2Adjoint(A, b, c, similar(b), similar(c), tol)
+
+function (cb_n2::TestCallbackN2Adjoint)(solver)
+ mul!(cb_n2.storage_vec1, cb_n2.A, solver.x)
+ cb_n2.storage_vec1 .-= cb_n2.b
+ mul!(cb_n2.storage_vec2, cb_n2.A', solver.y)
+ cb_n2.storage_vec2 .-= cb_n2.c
+ return (norm(cb_n2.storage_vec1) ≤ cb_n2.tol && norm(cb_n2.storage_vec2) ≤ cb_n2.tol)
+end
+
+mutable struct TestCallbackN2Shifts{T, S, M}
+ A::M
+ b::S
+ shifts::Vector{T}
+ tol::T
+end
+TestCallbackN2Shifts(A, b, shifts; tol = 0.1) = TestCallbackN2Shifts(A, b, shifts, tol)
+
+function (cb_n2::TestCallbackN2Shifts)(solver)
+ r = residuals(cb_n2.A, cb_n2.b, cb_n2.shifts, solver.x)
+ return all(map(norm, r) .≤ cb_n2.tol)
+end
+
+mutable struct TestCallbackN2LS{T, S, M}
+ A::M
+ b::S
+ λ::T
+ storage_vec1::S
+ storage_vec2::S
+ tol::T
+end
+TestCallbackN2LS(A, b, λ; tol = 0.1) = TestCallbackN2LS(A, b, λ, similar(b), similar(b, size(A, 2)), tol)
+
+function (cb_n2::TestCallbackN2LS)(solver)
+ mul!(cb_n2.storage_vec1, cb_n2.A, solver.x)
+ cb_n2.storage_vec1 .-= cb_n2.b
+ mul!(cb_n2.storage_vec2, cb_n2.A', cb_n2.storage_vec1)
+ cb_n2.storage_vec2 .+= cb_n2.λ .* solver.x
+ return norm(cb_n2.storage_vec2) ≤ cb_n2.tol
+end
+
+mutable struct TestCallbackN2LN{T, S, M}
+ A::M
+ b::S
+ λ::T
+ storage_vec::S
+ tol::T
+end
+TestCallbackN2LN(A, b, λ; tol = 0.1) = TestCallbackN2LN(A, b, λ, similar(b), tol)
+
+function (cb_n2::TestCallbackN2LN)(solver)
+ mul!(cb_n2.storage_vec, cb_n2.A, solver.x)
+ cb_n2.storage_vec .-= cb_n2.b
+ cb_n2.λ != 0 && (cb_n2.storage_vec .+= cb_n2.λ .* solver.x)
+ return norm(cb_n2.storage_vec) ≤ cb_n2.tol
+end
+
+mutable struct TestCallbackN2SaddlePts{T, S, M}
+ A::M
+ b::S
+ c::S
+ storage_vec1::S
+ storage_vec2::S
+ tol::T
+end
+TestCallbackN2SaddlePts(A, b, c; tol = 0.1) =
+ TestCallbackN2SaddlePts(A, b, c, similar(b), similar(c), tol)
+
+function (cb_n2::TestCallbackN2SaddlePts)(solver)
+ mul!(cb_n2.storage_vec1, cb_n2.A, solver.y)
+ cb_n2.storage_vec1 .+= solver.x .- cb_n2.b
+ mul!(cb_n2.storage_vec2, cb_n2.A', solver.x)
+ cb_n2.storage_vec2 .-= solver.y .+ cb_n2.c
+ return (norm(cb_n2.storage_vec1) ≤ cb_n2.tol && norm(cb_n2.storage_vec2) ≤ cb_n2.tol)
+end
+
+function restarted_gmres_callback_n2(solver::GmresSolver, A, b, stor, N, storage_vec, tol)
+ get_x_restarted_gmres!(solver, A, stor, N)
+ x = stor.x
+ mul!(storage_vec, A, x)
+ storage_vec .-= b
+ return (norm(storage_vec) ≤ tol)
+end
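
These functors implement the `callback(solver)` protocol described in the docstrings: the solve stops as soon as the callback returns `true`. A sketch of wiring `TestCallbackN2` into a solve, with illustrative data:

    using Krylov, LinearAlgebra
    B = randn(10, 10)
    A = B' * B + 10I                          # symmetric positive definite
    b = randn(10)
    cb = TestCallbackN2(A, b, tol = 1.0e-8)   # returns true once ‖b - Ax‖ ≤ tol
    x, stats = cg(A, b, callback = cb)
    # stats.status == "user-requested exit" if the callback triggered the stop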
diff --git a/test/get_div_grad.jl b/test/get_div_grad.jl
index 6d6bf012e..ae27e5061 100644
--- a/test/get_div_grad.jl
+++ b/test/get_div_grad.jl
@@ -1,8 +1,8 @@
# Identity matrix.
eye(n::Int; FC=Float64) = sparse(one(FC) * I, n, n)
-# Compute the energy norm ‖r‖ₚ = √(rᵀPr) where P is a symmetric and positive definite matrix.
-metric(r, P) = sqrt(dot(r, P * r))
+# Compute the energy norm ‖r‖ₚ = √(rᴴPr) where P is a symmetric and positive definite matrix.
+metric(r, P) = sqrt(real(dot(r, P * r)))
# Based on Lars Ruthotto's initial implementation.
function get_div_grad(n1 :: Int, n2 :: Int, n3 :: Int)
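
The `real` added to `metric` matters in complex arithmetic: `dot(r, P * r)` is mathematically real for a Hermitian positive-definite `P`, but round-off can leave a tiny imaginary part, and `sqrt` would then return a complex number instead of a norm. A small check with illustrative data:

    using LinearAlgebra
    r = rand(ComplexF64, 5)
    P = Matrix(4.0I, 5, 5)          # Hermitian positive definite
    metric(r, P) ≈ 2 * norm(r)      # ‖r‖ₚ = √(rᴴPr) = 2‖r‖ here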
diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl
new file mode 100644
index 000000000..9fb6cdffd
--- /dev/null
+++ b/test/gpu/amd.jl
@@ -0,0 +1,111 @@
+using AMDGPU
+
+include("gpu.jl")
+
+@testset "AMD -- AMDGPU.jl" begin
+
+ @test AMDGPU.functional()
+ AMDGPU.allowscalar(false)
+
+ @testset "documentation" begin
+ A_cpu = rand(ComplexF64, 20, 20)
+ A_cpu = A_cpu + A_cpu'
+ b_cpu = rand(ComplexF64, 20)
+ A_gpu = ROCMatrix(A_cpu)
+ b_gpu = ROCVector(b_cpu)
+ x, stats = minres(A_gpu, b_gpu)
+ end
+
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
+ S = ROCVector{FC}
+ M = ROCMatrix{FC}
+ T = real(FC)
+ n = 10
+ x = rand(FC, n)
+ x = S(x)
+ y = rand(FC, n)
+ y = S(y)
+ a = rand(FC)
+ b = rand(FC)
+ s = rand(FC)
+ a2 = rand(T)
+ b2 = rand(T)
+ c = rand(T)
+
+ @testset "kdot -- $FC" begin
+ Krylov.@kdot(n, x, y)
+ end
+
+ @testset "kdotr -- $FC" begin
+ Krylov.@kdotr(n, x, y)
+ end
+
+ @testset "knrm2 -- $FC" begin
+ Krylov.@knrm2(n, x)
+ end
+
+ @testset "kaxpy! -- $FC" begin
+ Krylov.@kaxpy!(n, a, x, y)
+ Krylov.@kaxpy!(n, a2, x, y)
+ end
+
+ @testset "kaxpby! -- $FC" begin
+ Krylov.@kaxpby!(n, a, x, b, y)
+ Krylov.@kaxpby!(n, a2, x, b, y)
+ Krylov.@kaxpby!(n, a, x, b2, y)
+ Krylov.@kaxpby!(n, a2, x, b2, y)
+ end
+
+ @testset "kcopy! -- $FC" begin
+ Krylov.@kcopy!(n, x, y)
+ end
+
+ @testset "kswap -- $FC" begin
+ Krylov.@kswap(x, y)
+ end
+
+ @testset "kref! -- $FC" begin
+ Krylov.@kref!(n, x, y, c, s)
+ end
+
+ @testset "conversion -- $FC" begin
+ test_conversion(S, M)
+ end
+
+ ε = eps(T)
+ atol = √ε
+ rtol = √ε
+
+ @testset "GMRES -- $FC" begin
+ A, b = nonsymmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = gmres(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "CG -- $FC" begin
+ A, b = symmetric_definite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = cg(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "MINRES-QLP -- $FC" begin
+ A, b = symmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = minres_qlp(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ # @testset "processes -- $FC" begin
+ # test_processes(S, M)
+ # end
+
+ @testset "solver -- $FC" begin
+ test_solver(S, M)
+ end
+ end
+end
diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl
new file mode 100644
index 000000000..09036ecac
--- /dev/null
+++ b/test/gpu/gpu.jl
@@ -0,0 +1,52 @@
+using LinearAlgebra, SparseArrays, Test
+using Krylov
+
+include("../test_utils.jl")
+
+function test_processes(S, M)
+ m = 250
+ n = 500
+ k = 20
+ FC = eltype(S)
+
+ cpu_A, cpu_b = symmetric_indefinite(n, FC=FC)
+ gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+ V, T = hermitian_lanczos(gpu_A, gpu_b, k)
+
+ cpu_A, cpu_b = nonsymmetric_definite(n, FC=FC)
+ cpu_c = -cpu_b
+ gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c)
+ V, T, U, Tᴴ = nonhermitian_lanczos(gpu_A, gpu_b, gpu_c, k)
+
+ cpu_A, cpu_b = nonsymmetric_indefinite(n, FC=FC)
+ gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+ V, H = arnoldi(gpu_A, gpu_b, k)
+
+ cpu_A, cpu_b = under_consistent(m, n, FC=FC)
+ gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+ V, U, L = golub_kahan(gpu_A, gpu_b, k)
+
+ cpu_A, cpu_b = under_consistent(m, n, FC=FC)
+ _, cpu_c = over_consistent(n, m, FC=FC)
+ gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c)
+ V, T, U, Tᴴ = saunders_simon_yip(gpu_A, gpu_b, gpu_c, k)
+
+ cpu_A, cpu_b = under_consistent(m, n, FC=FC)
+ cpu_B, cpu_c = over_consistent(n, m, FC=FC)
+ gpu_A, gpu_B, gpu_b, gpu_c = M(cpu_A), M(cpu_B), S(cpu_b), S(cpu_c)
+ V, H, U, F = montoison_orban(gpu_A, gpu_B, gpu_b, gpu_c, k)
+end
+
+function test_solver(S, M)
+ n = 10
+ memory = 5
+ A = M(undef, n, n)
+ b = S(undef, n)
+ solver = GmresSolver(n, n, memory, S)
+ solve!(solver, A, b) # Test that we don't have errors
+end
+
+function test_conversion(S, M)
+ @test Krylov.vector_to_matrix(S) == M
+ @test Krylov.matrix_to_vector(M) == S
+end
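
`test_solver` exercises the in-place API: a `GmresSolver` is allocated once for a given storage type `S` and reused across solves. The same pattern on the CPU, with illustrative sizes:

    using Krylov, LinearAlgebra
    n, memory = 100, 20
    A = rand(n, n) + n * I       # diagonally dominant, hence nonsingular
    b = rand(n)
    solver = GmresSolver(n, n, memory, Vector{Float64})
    solve!(solver, A, b)         # later calls with this solver reuse its workspace
    x = solver.x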
diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl
new file mode 100644
index 000000000..f03176199
--- /dev/null
+++ b/test/gpu/intel.jl
@@ -0,0 +1,113 @@
+using oneAPI
+
+include("gpu.jl")
+
+@testset "Intel -- oneAPI.jl" begin
+
+ @test oneAPI.functional()
+ oneAPI.allowscalar(false)
+
+ @testset "documentation" begin
+ T = Float32
+ m = 20
+ n = 10
+ A_cpu = rand(T, m, n)
+ b_cpu = rand(T, m)
+ A_gpu = oneMatrix(A_cpu)
+ b_gpu = oneVector(b_cpu)
+ x, stats = lsqr(A_gpu, b_gpu)
+ end
+
+ for FC ∈ (Float32, ComplexF32)
+ S = oneVector{FC}
+ M = oneMatrix{FC}
+ T = real(FC)
+ n = 10
+ x = rand(FC, n)
+ x = S(x)
+ y = rand(FC, n)
+ y = S(y)
+ a = rand(FC)
+ b = rand(FC)
+ s = rand(FC)
+ a2 = rand(T)
+ b2 = rand(T)
+ c = rand(T)
+
+ @testset "kdot -- $FC" begin
+ Krylov.@kdot(n, x, y)
+ end
+
+ @testset "kdotr -- $FC" begin
+ Krylov.@kdotr(n, x, y)
+ end
+
+ @testset "knrm2 -- $FC" begin
+ Krylov.@knrm2(n, x)
+ end
+
+ @testset "kaxpy! -- $FC" begin
+ Krylov.@kaxpy!(n, a, x, y)
+ Krylov.@kaxpy!(n, a2, x, y)
+ end
+
+ @testset "kaxpby! -- $FC" begin
+ Krylov.@kaxpby!(n, a, x, b, y)
+ Krylov.@kaxpby!(n, a2, x, b, y)
+ Krylov.@kaxpby!(n, a, x, b2, y)
+ Krylov.@kaxpby!(n, a2, x, b2, y)
+ end
+
+ @testset "kcopy! -- $FC" begin
+ Krylov.@kcopy!(n, x, y)
+ end
+
+ @testset "kswap -- $FC" begin
+ Krylov.@kswap(x, y)
+ end
+
+ @testset "kref! -- $FC" begin
+ Krylov.@kref!(n, x, y, c, s)
+ end
+
+ @testset "conversion -- $FC" begin
+ test_conversion(S, M)
+ end
+
+ ε = eps(T)
+ atol = √ε
+ rtol = √ε
+
+ @testset "GMRES -- $FC" begin
+ A, b = nonsymmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = gmres(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "CG -- $FC" begin
+ A, b = symmetric_definite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = cg(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "MINRES-QLP -- $FC" begin
+ A, b = symmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = minres_qlp(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ # @testset "processes -- $FC" begin
+ # test_processes(S, M)
+ # end
+
+ @testset "solver -- $FC" begin
+ test_solver(S, M)
+ end
+ end
+end
diff --git a/test/gpu/metal.jl b/test/gpu/metal.jl
new file mode 100644
index 000000000..2e684e21f
--- /dev/null
+++ b/test/gpu/metal.jl
@@ -0,0 +1,113 @@
+using Metal
+
+include("gpu.jl")
+
+@testset "Apple M1 GPUs -- Metal.jl" begin
+
+ # @test Metal.functional()
+ Metal.allowscalar(false)
+
+ @testset "documentation" begin
+ T = Float32
+ n = 10
+ m = 20
+ A_cpu = rand(T, n, m)
+ b_cpu = rand(T, n)
+ A_gpu = MtlMatrix(A_cpu)
+ b_gpu = MtlVector(b_cpu)
+ x, stats = craig(A_gpu, b_gpu)
+ end
+
+ for FC in (Float32, ComplexF32)
+ S = MtlVector{FC}
+ M = MtlMatrix{FC}
+ T = real(FC)
+ n = 10
+ x = rand(FC, n)
+ x = S(x)
+ y = rand(FC, n)
+ y = S(y)
+ a = rand(FC)
+ b = rand(FC)
+ s = rand(FC)
+ a2 = rand(T)
+ b2 = rand(T)
+ c = rand(T)
+
+ @testset "kdot -- $FC" begin
+ Krylov.@kdot(n, x, y)
+ end
+
+ @testset "kdotr -- $FC" begin
+ Krylov.@kdotr(n, x, y)
+ end
+
+ @testset "knrm2 -- $FC" begin
+ Krylov.@knrm2(n, x)
+ end
+
+ @testset "kaxpy! -- $FC" begin
+ Krylov.@kaxpy!(n, a, x, y)
+ Krylov.@kaxpy!(n, a2, x, y)
+ end
+
+ @testset "kaxpby! -- $FC" begin
+ Krylov.@kaxpby!(n, a, x, b, y)
+ Krylov.@kaxpby!(n, a2, x, b, y)
+ Krylov.@kaxpby!(n, a, x, b2, y)
+ Krylov.@kaxpby!(n, a2, x, b2, y)
+ end
+
+ @testset "kcopy! -- $FC" begin
+ Krylov.@kcopy!(n, x, y)
+ end
+
+ @testset "kswap -- $FC" begin
+ Krylov.@kswap(x, y)
+ end
+
+ @testset "kref! -- $FC" begin
+ Krylov.@kref!(n, x, y, c, s)
+ end
+
+ @testset "conversion -- $FC" begin
+ test_conversion(S, M)
+ end
+
+ ε = eps(T)
+ atol = √ε
+ rtol = √ε
+
+ @testset "GMRES -- $FC" begin
+ A, b = nonsymmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = gmres(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "CG -- $FC" begin
+ A, b = symmetric_definite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = cg(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "MINRES-QLP -- $FC" begin
+ A, b = symmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = minres_qlp(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ # @testset "processes -- $FC" begin
+ # test_processes(S, M)
+ # end
+
+ @testset "solver -- $FC" begin
+ test_solver(S, M)
+ end
+ end
+end
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
new file mode 100644
index 000000000..908a2819c
--- /dev/null
+++ b/test/gpu/nvidia.jl
@@ -0,0 +1,204 @@
+using LinearOperators, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER
+
+include("gpu.jl")
+
+@testset "Nvidia -- CUDA.jl" begin
+
+ @test CUDA.functional()
+ CUDA.allowscalar(false)
+
+ @testset "documentation" begin
+ A_cpu = rand(20, 20)
+ b_cpu = rand(20)
+ A_gpu = CuMatrix(A_cpu)
+ b_gpu = CuVector(b_cpu)
+ x, stats = bilq(A_gpu, b_gpu)
+
+ A_cpu = sprand(200, 100, 0.3)
+ b_cpu = rand(200)
+ A_gpu = CuSparseMatrixCSC(A_cpu)
+ b_gpu = CuVector(b_cpu)
+ x, stats = lsmr(A_gpu, b_gpu)
+
+ @testset "ic0" begin
+ A_cpu, b_cpu = sparse_laplacian()
+
+ b_gpu = CuVector(b_cpu)
+ n = length(b_gpu)
+ T = eltype(b_gpu)
+ symmetric = hermitian = true
+
+ A_gpu = CuSparseMatrixCSC(A_cpu)
+ P = ic02(A_gpu, 'O')
+ function ldiv_csc_ic0!(y, P, x)
+ copyto!(y, x)
+ sv2!('T', 'U', 'N', 1.0, P, y, 'O')
+ sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+ return y
+ end
+ opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csc_ic0!(y, P, x))
+ x, stats = cg(A_gpu, b_gpu, M=opM)
+ @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+
+ A_gpu = CuSparseMatrixCSR(A_cpu)
+ P = ic02(A_gpu, 'O')
+ function ldiv_csr_ic0!(y, P, x)
+ copyto!(y, x)
+ sv2!('N', 'L', 'N', 1.0, P, y, 'O')
+ sv2!('T', 'L', 'N', 1.0, P, y, 'O')
+ return y
+ end
+ opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csr_ic0!(y, P, x))
+ x, stats = cg(A_gpu, b_gpu, M=opM)
+ @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+ end
+
+ @testset "ilu0" begin
+ A_cpu, b_cpu = polar_poisson()
+
+ p = zfd(A_cpu, 'O')
+ p .+= 1
+ A_cpu = A_cpu[p,:]
+ b_cpu = b_cpu[p]
+
+ b_gpu = CuVector(b_cpu)
+ n = length(b_gpu)
+ T = eltype(b_gpu)
+ symmetric = hermitian = false
+
+ A_gpu = CuSparseMatrixCSC(A_cpu)
+ P = ilu02(A_gpu, 'O')
+ function ldiv_csc_ilu0!(y, P, x)
+ copyto!(y, x)
+ sv2!('N', 'L', 'N', 1.0, P, y, 'O')
+ sv2!('N', 'U', 'U', 1.0, P, y, 'O')
+ return y
+ end
+ opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csc_ilu0!(y, P, x))
+ x, stats = bicgstab(A_gpu, b_gpu, M=opM)
+ @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+
+ A_gpu = CuSparseMatrixCSR(A_cpu)
+ P = ilu02(A_gpu, 'O')
+ function ldiv_csr_ilu0!(y, P, x)
+ copyto!(y, x)
+ sv2!('N', 'L', 'U', 1.0, P, y, 'O')
+ sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+ return y
+ end
+ opM = LinearOperator(T, n, n, symmetric, hermitian, (y, x) -> ldiv_csr_ilu0!(y, P, x))
+ x, stats = bicgstab(A_gpu, b_gpu, M=opM)
+ @test norm(b_gpu - A_gpu * x) ≤ 1e-6
+ end
+ end
+
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
+ S = CuVector{FC}
+ V = CuSparseVector{FC}
+ M = CuMatrix{FC}
+ T = real(FC)
+ n = 10
+ x = rand(FC, n)
+ x = S(x)
+ y = rand(FC, n)
+ y = S(y)
+ a = rand(FC)
+ b = rand(FC)
+ s = rand(FC)
+ a2 = rand(T)
+ b2 = rand(T)
+ c = rand(T)
+
+ @testset "kdot -- $FC" begin
+ Krylov.@kdot(n, x, y)
+ end
+
+ @testset "kdotr -- $FC" begin
+ Krylov.@kdotr(n, x, y)
+ end
+
+ @testset "knrm2 -- $FC" begin
+ Krylov.@knrm2(n, x)
+ end
+
+ @testset "kaxpy! -- $FC" begin
+ Krylov.@kaxpy!(n, a, x, y)
+ Krylov.@kaxpy!(n, a2, x, y)
+ end
+
+ @testset "kaxpby! -- $FC" begin
+ Krylov.@kaxpby!(n, a, x, b, y)
+ Krylov.@kaxpby!(n, a2, x, b, y)
+ Krylov.@kaxpby!(n, a, x, b2, y)
+ Krylov.@kaxpby!(n, a2, x, b2, y)
+ end
+
+ @testset "kcopy! -- $FC" begin
+ Krylov.@kcopy!(n, x, y)
+ end
+
+ @testset "kswap -- $FC" begin
+ Krylov.@kswap(x, y)
+ end
+
+ @testset "kref! -- $FC" begin
+ Krylov.@kref!(n, x, y, c, s)
+ end
+
+ @testset "conversion -- $FC" begin
+ test_conversion(S, M)
+ end
+
+ ε = eps(T)
+ atol = √ε
+ rtol = √ε
+
+ @testset "GMRES -- $FC" begin
+ A, b = nonsymmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = gmres(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "CG -- $FC" begin
+ A, b = symmetric_definite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = cg(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "MINRES-QLP -- $FC" begin
+ A, b = symmetric_indefinite(FC=FC)
+ A = M(A)
+ b = S(b)
+ x, stats = minres_qlp(A, b)
+ @test norm(b - A * x) ≤ atol + rtol * norm(b)
+ end
+
+ @testset "processes -- $FC" begin
+ test_processes(S, M)
+ end
+
+ @testset "solver -- $FC" begin
+ test_solver(S, M)
+ end
+
+ @testset "ktypeof -- $FC" begin
+ dv = S(rand(FC, 10))
+ b = view(dv, 4:8)
+ @test Krylov.ktypeof(dv) <: S
+ @test Krylov.ktypeof(b) <: S
+
+ dm = M(rand(FC, 10, 10))
+ b = view(dm, :, 3)
+ @test Krylov.ktypeof(b) <: S
+
+ sv = V(sprand(FC, 10, 0.5))
+ b = view(sv, 4:8)
+ @test Krylov.ktypeof(sv) <: S
+ @test Krylov.ktypeof(b) <: S
+ end
+ end
+end
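
The `ktypeof` testset above encodes the rule that workspaces are always dense: views of dense GPU vectors and of matrix columns map back to the dense vector type, and so do sparse vectors. The CPU analogue, also checked in test_aux.jl:

    using SparseArrays
    dv = rand(Float64, 10)
    Krylov.ktypeof(view(dv, 4:8))   # Vector{Float64}
    sv = sprand(Float64, 10, 0.5)
    Krylov.ktypeof(sv)              # Vector{Float64}: workspaces stay dense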
diff --git a/test/runtests.jl b/test/runtests.jl
index 99ab25fda..b69865f61 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,7 +4,9 @@ import Krylov.KRYLOV_SOLVERS
include("test_utils.jl")
include("test_aux.jl")
include("test_stats.jl")
+include("test_processes.jl")
+include("test_fgmres.jl")
include("test_gpmr.jl")
include("test_fom.jl")
include("test_gmres.jl")
diff --git a/test/test_allocations.jl b/test/test_allocations.jl
index 4c6817499..174d0ae55 100644
--- a/test/test_allocations.jl
+++ b/test/test_allocations.jl
@@ -1,26 +1,27 @@
@testset "allocations" begin
- for FC in (Float64, ComplexF64)
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
@testset "Data Type: $FC" begin
- A = FC.(get_div_grad(16, 16, 16)) # Dimension n x n
- n = size(A, 1)
- m = div(n, 2)
- Au = A[1:m,:] # Dimension m x n
- Ao = A[:,1:m] # Dimension n x m
- b = Ao * ones(FC, m) # Dimension n
- c = Au * ones(FC, n) # Dimension m
+ A = FC.(get_div_grad(18, 18, 18)) # Dimension m x n
+ m,n = size(A)
+ k = div(n, 2)
+ Au = A[1:k,:] # Dimension k x n
+ Ao = A[:,1:k] # Dimension m x k
+ b = Ao * ones(FC, k) # Dimension m
+ c = Au * ones(FC, n) # Dimension k
mem = 200
- shifts = [1.0; 2.0; 3.0; 4.0; 5.0]
+ T = real(FC)
+ shifts = T[1; 2; 3; 4; 5]
nshifts = 5
- nbits = sizeof(FC) # 8 bits for Float64 and 16 bits for ComplexF64
+ nbits_FC = sizeof(FC) # 8 bytes for ComplexF32 and 16 bytes for ComplexF64
+ nbits_T = sizeof(T) # 4 bytes for Float32 and 8 bytes for Float64
@testset "SYMMLQ" begin
# SYMMLQ needs:
# 5 n-vectors: x, Mvold, Mv, Mv_next, w̅
- storage_symmlq(n) = 5 * n
- storage_symmlq_bytes(n) = nbits * storage_symmlq(n)
+ storage_symmlq_bytes(n) = nbits_FC * 5 * n
expected_symmlq_bytes = storage_symmlq_bytes(n)
symmlq(A, b) # warmup
@@ -36,8 +37,7 @@
@testset "CG" begin
# CG needs:
# 4 n-vectors: x, r, p, Ap
- storage_cg(n) = 4 * n
- storage_cg_bytes(n) = nbits * storage_cg(n)
+ storage_cg_bytes(n) = nbits_FC * 4 * n
expected_cg_bytes = storage_cg_bytes(n)
cg(A, b) # warmup
@@ -53,8 +53,7 @@
@testset "CG-LANCZOS" begin
# CG-LANCZOS needs:
# 5 n-vectors: x, Mv, Mv_prev, p, Mv_next
- storage_cg_lanczos(n) = 5 * n
- storage_cg_lanczos_bytes(n) = nbits * storage_cg_lanczos(n)
+ storage_cg_lanczos_bytes(n) = nbits_FC * 5 * n
expected_cg_lanczos_bytes = storage_cg_lanczos_bytes(n)
cg_lanczos(A, b) # warmup
@@ -73,9 +72,7 @@
# - 2 (n*nshifts)-matrices: x, p
# - 5 nshifts-vectors: σ, δhat, ω, γ, rNorms
# - 3 nshifts-bitVector: indefinite, converged, not_cv
- storage_cg_lanczos_shift(n, nshifts) = (3 * n) + (2 * n * nshifts) + (5 * nshifts) + (3 * nshifts / 64)
- storage_cg_lanczos_shift_bytes(n, nshifts) = nbits * storage_cg_lanczos_shift(n, nshifts)
-
+ storage_cg_lanczos_shift_bytes(n, nshifts) = nbits_FC * ((3 * n) + (2 * n * nshifts)) + nbits_T * (5 * nshifts) + (3 * nshifts)
expected_cg_lanczos_shift_bytes = storage_cg_lanczos_shift_bytes(n, nshifts)
cg_lanczos_shift(A, b, shifts) # warmup
actual_cg_lanczos_shift_bytes = @allocated cg_lanczos_shift(A, b, shifts)
@@ -90,8 +87,7 @@
@testset "CR" begin
# CR needs:
# 5 n-vectors: x, r, p, q, Ar
- storage_cr(n) = 5 * n
- storage_cr_bytes(n) = nbits * storage_cr(n)
+ storage_cr_bytes(n) = nbits_FC * 5 * n
expected_cr_bytes = storage_cr_bytes(n)
cr(A, b) # warmup
@@ -107,8 +103,7 @@
@testset "MINRES" begin
# MINRES needs:
# 6 n-vectors: x, r1, r2, w1, w2, y
- storage_minres(n) = 6 * n
- storage_minres_bytes(n) = nbits * storage_minres(n)
+ storage_minres_bytes(n) = nbits_FC * 6 * n
expected_minres_bytes = storage_minres_bytes(n)
minres(A, b) # warmup
@@ -124,8 +119,7 @@
@testset "MINRES-QLP" begin
# MINRES-QLP needs:
# - 6 n-vectors: wₖ₋₁, wₖ, vₖ₋₁, vₖ, x, p
- storage_minres_qlp(n) = 6 * n
- storage_minres_qlp_bytes(n) = nbits * storage_minres_qlp(n)
+ storage_minres_qlp_bytes(n) = nbits_FC * 6 * n
expected_minres_qlp_bytes = storage_minres_qlp_bytes(n)
minres_qlp(A, b) # warmup
@@ -141,11 +135,11 @@
@testset "DIOM" begin
# DIOM needs:
# - 2 n-vectors: x, t
- # - 2 (n*mem)-matrices: P, V
- # - 1 mem-vector: L
- # - 1 (mem+2)-vector: H
- storage_diom(mem, n) = (2 * n) + (2 * n * mem) + (mem) + (mem + 2)
- storage_diom_bytes(mem, n) = nbits * storage_diom(mem, n)
+ # - 1 (n*mem)-matrix: V
+ # - 1 n*(mem-1)-matrix: P
+ # - 1 (mem-1)-vector: L
+ # - 1 mem-vector: H
+ storage_diom_bytes(mem, n) = nbits_FC * ((2 * n) + (n * mem) + (n * (mem-1)) + (mem-1) + (mem))
expected_diom_bytes = storage_diom_bytes(mem, n)
diom(A, b, memory=mem) # warmup
@@ -164,8 +158,7 @@
# - 1 (n*mem)-matrix: V
# - 2 mem-vectors: l, z
# - 1 (mem*(mem+1)/2)-vector: U
- storage_fom(mem, n) = (2 * n) + (n * mem) + (2 * mem) + (mem * (mem+1) / 2)
- storage_fom_bytes(mem, n) = nbits * storage_fom(mem, n)
+ storage_fom_bytes(mem, n) = nbits_FC * ((2 * n) + (n * mem) + (2 * mem) + (mem * (mem+1) / 2))
expected_fom_bytes = storage_fom_bytes(mem, n)
fom(A, b, memory=mem) # warmup
@@ -183,9 +176,8 @@
# - 2 n-vectors: x, t
# - 2 (n*mem)-matrices: P, V
# - 2 mem-vectors: c, s
- # - 1 (mem+2)-vector: H
- storage_dqgmres(mem, n) = (2 * n) + (2 * n * mem) + (2 * mem) + (mem + 2)
- storage_dqgmres_bytes(mem, n) = nbits * storage_dqgmres(mem, n)
+ # - 1 (mem+1)-vector: H
+ storage_dqgmres_bytes(mem, n) = nbits_FC * ((2 * n) + (2 * n * mem) + mem + (mem + 1)) + nbits_T * mem
expected_dqgmres_bytes = storage_dqgmres_bytes(mem, n)
dqgmres(A, b, memory=mem) # warmup
@@ -204,8 +196,7 @@
# - 1 n*(mem)-matrix: V
# - 3 mem-vectors: c, s, z
# - 1 (mem*(mem+1)/2)-vector: R
- storage_gmres(mem, n) = (2 * n) + (n * mem) + (3 * mem) + (mem * (mem+1) / 2)
- storage_gmres_bytes(mem, n) = nbits * storage_gmres(mem, n)
+ storage_gmres_bytes(mem, n) = nbits_FC * ((2 * n) + (n * mem) + (2 * mem) + (mem * (mem+1) / 2)) + nbits_T * mem
expected_gmres_bytes = storage_gmres_bytes(mem, n)
gmres(A, b, memory=mem) # warmup
@@ -218,11 +209,29 @@
@test inplace_gmres_bytes == 0
end
+ @testset "FGMRES" begin
+ # FGMRES needs:
+ # - 2 n-vectors: x, w
+ # - 2 n*(mem)-matrix: V, Z
+ # - 3 mem-vectors: c, s, z
+ # - 1 (mem*(mem+1)/2)-vector: R
+ storage_fgmres_bytes(mem, n) = nbits_FC * ((2 * n) + (2 * n * mem) + (2 * mem) + (mem * (mem+1) / 2)) + nbits_T * mem
+
+ expected_fgmres_bytes = storage_fgmres_bytes(mem, n)
+ fgmres(A, b, memory=mem) # warmup
+ actual_fgmres_bytes = @allocated fgmres(A, b, memory=mem)
+ @test expected_fgmres_bytes ≤ actual_fgmres_bytes ≤ 1.02 * expected_fgmres_bytes
+
+ solver = FgmresSolver(A, b, mem)
+ fgmres!(solver, A, b) # warmup
+ inplace_fgmres_bytes = @allocated fgmres!(solver, A, b)
+ @test inplace_fgmres_bytes == 0
+ end
+
@testset "CGS" begin
# CGS needs:
# 6 n-vectors: x, r, u, p, q, ts
- storage_cgs(n) = 6 * n
- storage_cgs_bytes(n) = nbits * storage_cgs(n)
+ storage_cgs_bytes(n) = nbits_FC * 6 * n
expected_cgs_bytes = storage_cgs_bytes(n)
cgs(A, b) # warmup
@@ -238,8 +247,7 @@
@testset "BICGSTAB" begin
# BICGSTAB needs:
# 6 n-vectors: x, r, p, v, s, qd
- storage_bicgstab(n) = 6 * n
- storage_bicgstab_bytes(n) = nbits * storage_bicgstab(n)
+ storage_bicgstab_bytes(n) = nbits_FC * 6 * n
expected_bicgstab_bytes = storage_bicgstab_bytes(n)
bicgstab(A, b) # warmup
@@ -254,12 +262,11 @@
@testset "CGNE" begin
# CGNE needs:
- # - 3 n-vectors: x, p, Aᵀz
+ # - 3 n-vectors: x, p, Aᴴz
# - 2 m-vectors: r, q
- storage_cgne(n, m) = 3 * n + 2 * m
- storage_cgne_bytes(n, m) = nbits * storage_cgne(n, m)
+ storage_cgne_bytes(m, n) = nbits_FC * (3 * n + 2 * m)
- expected_cgne_bytes = storage_cgne_bytes(n, m)
+ expected_cgne_bytes = storage_cgne_bytes(k, n)
(x, stats) = cgne(Au, c) # warmup
actual_cgne_bytes = @allocated cgne(Au, c)
@test expected_cgne_bytes ≤ actual_cgne_bytes ≤ 1.02 * expected_cgne_bytes
@@ -272,12 +279,11 @@
@testset "CRMR" begin
# CRMR needs:
- # - 3 n-vectors: x, p, Aᵀr
+ # - 3 n-vectors: x, p, Aᴴr
# - 2 m-vectors: r, q
- storage_crmr(n, m) = 3 * n + 2 * m
- storage_crmr_bytes(n, m) = nbits * storage_crmr(n, m)
+ storage_crmr_bytes(m, n) = nbits_FC * (3 * n + 2 * m)
- expected_crmr_bytes = storage_crmr_bytes(n, m)
+ expected_crmr_bytes = storage_crmr_bytes(k, n)
(x, stats) = crmr(Au, c) # warmup
actual_crmr_bytes = @allocated crmr(Au, c)
@test expected_crmr_bytes ≤ actual_crmr_bytes ≤ 1.02 * expected_crmr_bytes
@@ -290,12 +296,11 @@
@testset "LNLQ" begin
# LNLQ needs:
- # - 3 n-vectors: x, v, Aᵀu
+ # - 3 n-vectors: x, v, Aᴴu
# - 4 m-vectors: y, w̄, u, Av
- storage_lnlq(n, m) = 3 * n + 4 * m
- storage_lnlq_bytes(n, m) = nbits * storage_lnlq(n, m)
+ storage_lnlq_bytes(m, n) = nbits_FC * (3 * n + 4 * m)
- expected_lnlq_bytes = storage_lnlq_bytes(n, m)
+ expected_lnlq_bytes = storage_lnlq_bytes(k, n)
lnlq(Au, c) # warmup
actual_lnlq_bytes = @allocated lnlq(Au, c)
@test expected_lnlq_bytes ≤ actual_lnlq_bytes ≤ 1.02 * expected_lnlq_bytes
@@ -308,12 +313,11 @@
@testset "CRAIG" begin
# CRAIG needs:
- # - 3 n-vectors: x, v, Aᵀu
+ # - 3 n-vectors: x, v, Aᴴu
# - 4 m-vectors: y, w, u, Av
- storage_craig(n, m) = 3 * n + 4 * m
- storage_craig_bytes(n, m) = nbits * storage_craig(n, m)
+ storage_craig_bytes(m, n) = nbits_FC * (3 * n + 4 * m)
- expected_craig_bytes = storage_craig_bytes(n, m)
+ expected_craig_bytes = storage_craig_bytes(k, n)
craig(Au, c) # warmup
actual_craig_bytes = @allocated craig(Au, c)
@test expected_craig_bytes ≤ actual_craig_bytes ≤ 1.02 * expected_craig_bytes
@@ -326,12 +330,11 @@
@testset "CRAIGMR" begin
# CRAIGMR needs:
- # - 4 n-vectors: x, v, Aᵀu, d
+ # - 4 n-vectors: x, v, Aᴴu, d
# - 5 m-vectors: y, u, w, wbar, Av
- storage_craigmr(n, m) = 4 * n + 5 * m
- storage_craigmr_bytes(n, m) = nbits * storage_craigmr(n, m)
+ storage_craigmr_bytes(m, n) = nbits_FC * (4 * n + 5 * m)
- expected_craigmr_bytes = storage_craigmr_bytes(n, m)
+ expected_craigmr_bytes = storage_craigmr_bytes(k, n)
craigmr(Au, c) # warmup
actual_craigmr_bytes = @allocated craigmr(Au, c)
@test expected_craigmr_bytes ≤ actual_craigmr_bytes ≤ 1.02 * expected_craigmr_bytes
@@ -344,12 +347,11 @@
@testset "CGLS" begin
# CGLS needs:
- # - 3 m-vectors: x, p, s
- # - 2 n-vectors: r, q
- storage_cgls(n, m) = 3 * m + 2 * n
- storage_cgls_bytes(n, m) = nbits * storage_cgls(n, m)
+ # - 3 n-vectors: x, p, s
+ # - 2 m-vectors: r, q
+ storage_cgls_bytes(m, n) = nbits_FC * (3 * n + 2 * m)
- expected_cgls_bytes = storage_cgls_bytes(n, m)
+ expected_cgls_bytes = storage_cgls_bytes(m, k)
(x, stats) = cgls(Ao, b) # warmup
actual_cgls_bytes = @allocated cgls(Ao, b)
@test expected_cgls_bytes ≤ actual_cgls_bytes ≤ 1.02 * expected_cgls_bytes
@@ -362,12 +364,11 @@
@testset "LSLQ" begin
# LSLQ needs:
- # - 4 m-vectors: x_lq, v, Aᵀu, w̄ (= x_cg)
- # - 2 n-vectors: u, Av
- storage_lslq(n, m) = 4 * m + 2 * n
- storage_lslq_bytes(n, m) = nbits * storage_lslq(n, m)
+ # - 4 n-vectors: x_lq, v, Aᴴu, w̄ (= x_cg)
+ # - 2 m-vectors: u, Av
+ storage_lslq_bytes(m, n) = nbits_FC * (4 * n + 2 * m)
- expected_lslq_bytes = storage_lslq_bytes(n, m)
+ expected_lslq_bytes = storage_lslq_bytes(m, k)
(x, stats) = lslq(Ao, b) # warmup
actual_lslq_bytes = @allocated lslq(Ao, b)
@test expected_lslq_bytes ≤ actual_lslq_bytes ≤ 1.02 * expected_lslq_bytes
@@ -380,12 +381,11 @@
@testset "CRLS" begin
# CRLS needs:
- # - 4 m-vectors: x, p, Ar, q
- # - 3 n-vectors: r, Ap, s
- storage_crls(n, m) = 4 * m + 3 * n
- storage_crls_bytes(n, m) = nbits * storage_crls(n, m)
+ # - 4 n-vectors: x, p, Ar, q
+ # - 3 m-vectors: r, Ap, s
+ storage_crls_bytes(m, n) = nbits_FC * (4 * n + 3 * m)
- expected_crls_bytes = storage_crls_bytes(n, m)
+ expected_crls_bytes = storage_crls_bytes(m, k)
(x, stats) = crls(Ao, b) # warmup
actual_crls_bytes = @allocated crls(Ao, b)
@test expected_crls_bytes ≤ actual_crls_bytes ≤ 1.02 * expected_crls_bytes
@@ -398,12 +398,11 @@
@testset "LSQR" begin
# LSQR needs:
- # - 4 m-vectors: x, v, w, Aᵀu
- # - 2 n-vectors: u, Av
- storage_lsqr(n, m) = 4 * m + 2 * n
- storage_lsqr_bytes(n, m) = nbits * storage_lsqr(n, m)
+ # - 4 n-vectors: x, v, w, Aᴴu
+ # - 2 m-vectors: u, Av
+ storage_lsqr_bytes(m, n) = nbits_FC * (4 * n + 2 * m)
- expected_lsqr_bytes = storage_lsqr_bytes(n, m)
+ expected_lsqr_bytes = storage_lsqr_bytes(m, k)
(x, stats) = lsqr(Ao, b) # warmup
actual_lsqr_bytes = @allocated lsqr(Ao, b)
@test expected_lsqr_bytes ≤ actual_lsqr_bytes ≤ 1.02 * expected_lsqr_bytes
@@ -416,12 +415,11 @@
@testset "LSMR" begin
# LSMR needs:
- # - 5 m-vectors: x, v, h, hbar, Aᵀu
- # - 2 n-vectors: u, Av
- storage_lsmr(n, m) = 5 * m + 2 * n
- storage_lsmr_bytes(n, m) = nbits * storage_lsmr(n, m)
+ # - 5 n-vectors: x, v, h, hbar, Aᴴu
+ # - 2 m-vectors: u, Av
+ storage_lsmr_bytes(m, n) = nbits_FC * (5 * n + 2 * m)
- expected_lsmr_bytes = storage_lsmr_bytes(n, m)
+ expected_lsmr_bytes = storage_lsmr_bytes(m, k)
(x, stats) = lsmr(Ao, b) # warmup
actual_lsmr_bytes = @allocated lsmr(Ao, b)
@test expected_lsmr_bytes ≤ actual_lsmr_bytes ≤ 1.02 * expected_lsmr_bytes
@@ -435,8 +433,7 @@
@testset "BiLQ" begin
# BILQ needs:
# - 8 n-vectors: uₖ₋₁, uₖ, vₖ₋₁, vₖ, x, d̅, p, q
- storage_bilq(n) = 8 * n
- storage_bilq_bytes(n) = nbits * storage_bilq(n)
+ storage_bilq_bytes(n) = nbits_FC * 8 * n
expected_bilq_bytes = storage_bilq_bytes(n)
bilq(A, b) # warmup
@@ -452,8 +449,7 @@
@testset "QMR" begin
# QMR needs:
# - 9 n-vectors: uₖ₋₁, uₖ, vₖ₋₁, vₖ, x, wₖ₋₁, wₖ, p, q
- storage_qmr(n) = 9 * n
- storage_qmr_bytes(n) = nbits * storage_qmr(n)
+ storage_qmr_bytes(n) = nbits_FC * 9 * n
expected_qmr_bytes = storage_qmr_bytes(n)
qmr(A, b) # warmup
@@ -469,8 +465,7 @@
@testset "BiLQR" begin
# BILQR needs:
# - 11 n-vectors: uₖ₋₁, uₖ, vₖ₋₁, vₖ, x, t, d̅, wₖ₋₁, wₖ, p, q
- storage_bilqr(n) = 11 * n
- storage_bilqr_bytes(n) = nbits * storage_bilqr(n)
+ storage_bilqr_bytes(n) = nbits_FC * 11 * n
expected_bilqr_bytes = storage_bilqr_bytes(n)
bilqr(A, b, b) # warmup
@@ -487,10 +482,9 @@
# USYMLQ needs:
# - 5 n-vectors: uₖ₋₁, uₖ, x, d̅, p
# - 3 m-vectors: vₖ₋₁, vₖ, q
- storage_usymlq(n, m) = 5 * n + 3 * m
- storage_usymlq_bytes(n, m) = nbits * storage_usymlq(n, m)
+ storage_usymlq_bytes(m, n) = nbits_FC * (5 * n + 3 * m)
- expected_usymlq_bytes = storage_usymlq_bytes(n, m)
+ expected_usymlq_bytes = storage_usymlq_bytes(k, n)
usymlq(Au, c, b) # warmup
actual_usymlq_bytes = @allocated usymlq(Au, c, b)
@test expected_usymlq_bytes ≤ actual_usymlq_bytes ≤ 1.02 * expected_usymlq_bytes
@@ -503,12 +497,11 @@
@testset "USYMQR" begin
# USYMQR needs:
- # - 6 m-vectors: vₖ₋₁, vₖ, x, wₖ₋₁, wₖ, p
- # - 3 n-vectors: uₖ₋₁, uₖ, q
- storage_usymqr(n, m) = 6 * m + 3 * n
- storage_usymqr_bytes(n, m) = nbits * storage_usymqr(n, m)
+ # - 6 n-vectors: vₖ₋₁, vₖ, x, wₖ₋₁, wₖ, p
+ # - 3 m-vectors: uₖ₋₁, uₖ, q
+ storage_usymqr_bytes(m, n) = nbits_FC * (6 * n + 3 * m)
- expected_usymqr_bytes = storage_usymqr_bytes(n, m)
+ expected_usymqr_bytes = storage_usymqr_bytes(m, k)
(x, stats) = usymqr(Ao, b, c) # warmup
actual_usymqr_bytes = @allocated usymqr(Ao, b, c)
@test expected_usymqr_bytes ≤ actual_usymqr_bytes ≤ 1.02 * expected_usymqr_bytes
@@ -523,8 +516,7 @@
# TRILQR needs:
# - 6 m-vectors: vₖ₋₁, vₖ, t, wₖ₋₁, wₖ, q
# - 5 n-vectors: uₖ₋₁, uₖ, x, d̅, p
- storage_trilqr(n, m) = 6 * m + 5 * n
- storage_trilqr_bytes(n, m) = nbits * storage_trilqr(n, m)
+ storage_trilqr_bytes(m, n) = nbits_FC * (6 * m + 5 * n)
expected_trilqr_bytes = storage_trilqr_bytes(n, n)
trilqr(A, b, b) # warmup
@@ -541,10 +533,9 @@
# TriCG needs:
# - 6 n-vectors: yₖ, uₖ₋₁, uₖ, gy₂ₖ₋₁, gy₂ₖ, p
# - 6 m-vectors: xₖ, vₖ₋₁, vₖ, gx₂ₖ₋₁, gx₂ₖ, q
- storage_tricg(n, m) = 6 * n + 6 * m
- storage_tricg_bytes(n, m) = nbits * storage_tricg(n, m)
+ storage_tricg_bytes(m, n) = nbits_FC * (6 * n + 6 * m)
- expected_tricg_bytes = storage_tricg_bytes(n, m)
+ expected_tricg_bytes = storage_tricg_bytes(k, n)
tricg(Au, c, b) # warmup
actual_tricg_bytes = @allocated tricg(Au, c, b)
@test expected_tricg_bytes ≤ actual_tricg_bytes ≤ 1.02 * expected_tricg_bytes
@@ -559,10 +550,9 @@
# TriMR needs:
# - 8 n-vectors: yₖ, uₖ₋₁, uₖ, gy₂ₖ₋₃, gy₂ₖ₋₂, gy₂ₖ₋₁, gy₂ₖ, p
# - 8 m-vectors: xₖ, vₖ₋₁, vₖ, gx₂ₖ₋₃, gx₂ₖ₋₂, gx₂ₖ₋₁, gx₂ₖ, q
- storage_trimr(n, m) = 8 * n + 8 * m
- storage_trimr_bytes(n, m) = nbits * storage_trimr(n, m)
+ storage_trimr_bytes(m, n) = nbits_FC * (8 * n + 8 * m)
- expected_trimr_bytes = storage_trimr_bytes(n, m)
+ expected_trimr_bytes = storage_trimr_bytes(k, n)
trimr(Au, c, b) # warmup
actual_trimr_bytes = @allocated trimr(Au, c, b)
@test expected_trimr_bytes ≤ actual_trimr_bytes ≤ 1.02 * expected_trimr_bytes
@@ -575,17 +565,16 @@
@testset "GPMR" begin
# GPMR needs:
- # - 2 n-vectors: x, q
- # - 2 m-vectors: y, p
- # - 1 (n*mem)-matrix: V
- # - 1 (m*mem)-matrix: U
+ # - 2 m-vectors: x, q
+ # - 2 n-vectors: y, p
+ # - 1 (m*mem)-matrix: V
+ # - 1 (n*mem)-matrix: U
# - 1 (2*mem)-vector: zt
# - 2 (4*mem)-vectors: gc, gs
# - 1 (mem*(2mem+1))-vector: R
- storage_gpmr(mem, n, m) = (mem + 2) * (n + m) + mem * (2 * mem + 11)
- storage_gpmr_bytes(mem, n, m) = nbits * storage_gpmr(mem, n, m)
+ storage_gpmr_bytes(mem, m, n) = nbits_FC * ((mem + 2) * (n + m) + mem * (2 * mem + 7)) + nbits_T * 4 * mem
- expected_gpmr_bytes = storage_gpmr_bytes(mem, n, m)
+ expected_gpmr_bytes = storage_gpmr_bytes(mem, m, k)
gpmr(Ao, Au, b, c, memory=mem, itmax=mem) # warmup
actual_gpmr_bytes = @allocated gpmr(Ao, Au, b, c, memory=mem, itmax=mem)
@test expected_gpmr_bytes ≤ actual_gpmr_bytes ≤ 1.02 * expected_gpmr_bytes
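
The expected-bytes formulas above simply count workspace entries: complex scalars weigh `sizeof(FC)` bytes and real scalars `sizeof(real(FC))`. A worked check of the CG formula with the sizes used in this file:

    FC = ComplexF64
    n = 18^3                      # size of get_div_grad(18, 18, 18)
    nbits_FC = sizeof(FC)         # 16 bytes per ComplexF64 entry
    storage_cg_bytes(n) = nbits_FC * 4 * n
    storage_cg_bytes(n)           # 4 n-vectors (x, r, p, Ap) → 373_248 bytes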
diff --git a/test/test_aux.jl b/test/test_aux.jl
index 11bdb7c2d..6c43142c0 100644
--- a/test/test_aux.jl
+++ b/test/test_aux.jl
@@ -1,119 +1,203 @@
@testset "aux" begin
- # test Givens reflector corner cases
- (c, s, ρ) = Krylov.sym_givens(0.0, 0.0)
- @test (c == 1.0) && (s == 0.0) && (ρ == 0.0)
-
- a = 3.14
- (c, s, ρ) = Krylov.sym_givens(a, 0.0)
- @test (c == 1.0) && (s == 0.0) && (ρ == a)
- (c, s, ρ) = Krylov.sym_givens(-a, 0.0)
- @test (c == -1.0) && (s == 0.0) && (ρ == a)
-
- b = 3.14
- (c, s, ρ) = Krylov.sym_givens(0.0, b)
- @test (c == 0.0) && (s == 1.0) && (ρ == b)
- (c, s, ρ) = Krylov.sym_givens(0.0, -b)
- @test (c == 0.0) && (s == -1.0) && (ρ == b)
-
- (c, s, ρ) = Krylov.sym_givens(Complex(0.0), Complex(0.0))
- @test (c == 1.0) && (s == Complex(0.0)) && (ρ == Complex(0.0))
-
- a = Complex(1.0, 1.0)
- (c, s, ρ) = Krylov.sym_givens(a, Complex(0.0))
- @test (c == 1.0) && (s == Complex(0.0)) && (ρ == a)
- (c, s, ρ) = Krylov.sym_givens(-a, Complex(0.0))
- @test (c == 1.0) && (s == Complex(0.0)) && (ρ == -a)
-
- b = Complex(1.0, 1.0)
- (c, s, ρ) = Krylov.sym_givens(Complex(0.0), b)
- @test (c == 0.0) && (s == Complex(1.0)) && (ρ == b)
- (c, s, ρ) = Krylov.sym_givens(Complex(0.0), -b)
- @test (c == 0.0) && (s == Complex(1.0)) && (ρ == -b)
-
- # test roots of a quadratic
- roots = Krylov.roots_quadratic(0.0, 0.0, 0.0)
- @test length(roots) == 1
- @test roots[1] == 0.0
-
- roots = Krylov.roots_quadratic(0.0, 0.0, 1.0)
- @test length(roots) == 0
-
- roots = Krylov.roots_quadratic(0.0, 3.14, -1.0)
- @test length(roots) == 1
- @test roots[1] == 1.0 / 3.14
-
- roots = Krylov.roots_quadratic(1.0, 0.0, 1.0)
- @test length(roots) == 0
-
- roots = Krylov.roots_quadratic(1.0, 0.0, 0.0)
- @test length(roots) == 2
- @test roots[1] == 0.0
- @test roots[2] == 0.0
-
- roots = Krylov.roots_quadratic(1.0, 3.0, 2.0)
- @test length(roots) == 2
- @test roots[1] ≈ -2.0
- @test roots[2] ≈ -1.0
-
- roots = Krylov.roots_quadratic(1.0e+8, 1.0, 1.0)
- @test length(roots) == 0
-
- # ill-conditioned quadratic
- roots = Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=0)
- @test length(roots) == 2
- @test roots[1] == 1.0e+13
- @test roots[2] == 0.0
-
- # iterative refinement is crucial!
- roots = Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=1)
- @test length(roots) == 2
- @test roots[1] == 1.0e+13
- @test roots[2] == -1.0e-05
-
- # not ill-conditioned quadratic
- roots = Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=0)
- @test length(roots) == 2
- @test isapprox(roots[1], 1.0e+7, rtol=1.0e-6)
- @test isapprox(roots[2], -1.0, rtol=1.0e-6)
-
- roots = Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=1)
- @test length(roots) == 2
- @test isapprox(roots[1], 1.0e+7, rtol=1.0e-6)
- @test isapprox(roots[2], -1.0, rtol=1.0e-6)
-
- # test trust-region boundary
- x = ones(5)
- d = ones(5); d[1:2:5] .= -1
- @test_throws ErrorException Krylov.to_boundary(x, d, -1.0)
- @test_throws ErrorException Krylov.to_boundary(x, d, 0.5)
- @test_throws ErrorException Krylov.to_boundary(x, zeros(5), 1.0)
- @test maximum(Krylov.to_boundary(x, d, 5.0)) ≈ 2.209975124224178
- @test minimum(Krylov.to_boundary(x, d, 5.0)) ≈ -1.8099751242241782
- @test maximum(Krylov.to_boundary(x, d, 5.0, flip=true)) ≈ 1.8099751242241782
- @test minimum(Krylov.to_boundary(x, d, 5.0, flip=true)) ≈ -2.209975124224178
-
- # test kzeros and kones
- @test Krylov.kzeros(Vector{Float64}, 10) == zeros(10)
- @test Krylov.kones(Vector{Float64}, 10) == ones(10)
-
- # test ktypeof
- a = rand(Float32, 10)
- b = view(a, 4:8)
- @test Krylov.ktypeof(a) == Vector{Float32}
- @test Krylov.ktypeof(b) == Vector{Float32}
-
- a = rand(Float64, 10)
- b = view(a, 4:8)
- @test Krylov.ktypeof(a) == Vector{Float64}
- @test Krylov.ktypeof(b) == Vector{Float64}
-
- a = sprand(Float32, 10, 0.5)
- b = view(a, 4:8)
- @test Krylov.ktypeof(a) == Vector{Float32}
- @test Krylov.ktypeof(b) == Vector{Float32}
-
- a = sprand(Float64, 10, 0.5)
- b = view(a, 4:8)
- @test Krylov.ktypeof(a) == Vector{Float64}
- @test Krylov.ktypeof(b) == Vector{Float64}
+
+ @testset "sym_givens" begin
+ # test Givens reflector corner cases
+ (c, s, ρ) = Krylov.sym_givens(0.0, 0.0)
+ @test (c == 1.0) && (s == 0.0) && (ρ == 0.0)
+
+ a = 3.14
+ (c, s, ρ) = Krylov.sym_givens(a, 0.0)
+ @test (c == 1.0) && (s == 0.0) && (ρ == a)
+ (c, s, ρ) = Krylov.sym_givens(-a, 0.0)
+ @test (c == -1.0) && (s == 0.0) && (ρ == a)
+
+ b = 3.14
+ (c, s, ρ) = Krylov.sym_givens(0.0, b)
+ @test (c == 0.0) && (s == 1.0) && (ρ == b)
+ (c, s, ρ) = Krylov.sym_givens(0.0, -b)
+ @test (c == 0.0) && (s == -1.0) && (ρ == b)
+
+ (c, s, ρ) = Krylov.sym_givens(Complex(0.0), Complex(0.0))
+ @test (c == 1.0) && (s == Complex(0.0)) && (ρ == Complex(0.0))
+
+ a = Complex(1.0, 1.0)
+ (c, s, ρ) = Krylov.sym_givens(a, Complex(0.0))
+ @test (c == 1.0) && (s == Complex(0.0)) && (ρ == a)
+ (c, s, ρ) = Krylov.sym_givens(-a, Complex(0.0))
+ @test (c == 1.0) && (s == Complex(0.0)) && (ρ == -a)
+
+ b = Complex(1.0, 1.0)
+ (c, s, ρ) = Krylov.sym_givens(Complex(0.0), b)
+ @test (c == 0.0) && (s == Complex(1.0)) && (ρ == b)
+ (c, s, ρ) = Krylov.sym_givens(Complex(0.0), -b)
+ @test (c == 0.0) && (s == Complex(1.0)) && (ρ == -b)
+ end
+
+ @testset "roots_quadratic" begin
+ # test roots of a quadratic
+ roots = Krylov.roots_quadratic(0.0, 0.0, 0.0)
+ @test roots[1] == 0.0
+ @test roots[2] == 0.0
+
+ @test_throws ErrorException Krylov.roots_quadratic(0.0, 0.0, 1.0)
+
+ roots = Krylov.roots_quadratic(0.0, 3.14, -1.0)
+ @test roots[1] == 1.0 / 3.14
+ @test roots[2] == 1.0 / 3.14
+
+ @test_throws ErrorException Krylov.roots_quadratic(1.0, 0.0, 1.0)
+
+ roots = Krylov.roots_quadratic(1.0, 0.0, 0.0)
+ @test roots[1] == 0.0
+ @test roots[2] == 0.0
+
+ roots = Krylov.roots_quadratic(1.0, 3.0, 2.0)
+ @test roots[1] ≈ -2.0
+ @test roots[2] ≈ -1.0
+
+ @test_throws ErrorException Krylov.roots_quadratic(1.0e+8, 1.0, 1.0)
+
+ # ill-conditioned quadratic
+ roots = Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=0)
+ @test roots[1] == 1.0e+13
+ @test roots[2] == 0.0
+
+ # iterative refinement is crucial!
+ roots = Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=1)
+ @test roots[1] == 1.0e+13
+ @test roots[2] == -1.0e-05
+
+ # not ill-conditioned quadratic
+ roots = Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=0)
+ @test isapprox(roots[1], 1.0e+7, rtol=1.0e-6)
+ @test isapprox(roots[2], -1.0, rtol=1.0e-6)
+
+ roots = Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=1)
+ @test isapprox(roots[1], 1.0e+7, rtol=1.0e-6)
+ @test isapprox(roots[2], -1.0, rtol=1.0e-6)
+
+ allocations = @allocated Krylov.roots_quadratic(0.0, 0.0, 0.0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(0.0, 3.14, -1.0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(1.0, 0.0, 0.0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(1.0, 3.0, 2.0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(-1.0e-8, 1.0e+5, 1.0, nitref=1)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=0)
+ @test allocations == 0
+
+ allocations = @allocated Krylov.roots_quadratic(-1.0e-7, 1.0, 1.0, nitref=1)
+ @test allocations == 0
+ end
+
+ @testset "to_boundary" begin
+ # test trust-region boundary
+ n = 5
+ x = ones(n)
+ d = ones(n); d[1:2:n] .= -1
+ @test_throws ErrorException Krylov.to_boundary(n, x, d, -1.0)
+ @test_throws ErrorException Krylov.to_boundary(n, x, d, 0.5)
+ @test_throws ErrorException Krylov.to_boundary(n, x, zeros(n), 1.0)
+ @test maximum(Krylov.to_boundary(n, x, d, 5.0)) ≈ 2.209975124224178
+ @test minimum(Krylov.to_boundary(n, x, d, 5.0)) ≈ -1.8099751242241782
+ @test maximum(Krylov.to_boundary(n, x, d, 5.0, flip=true)) ≈ 1.8099751242241782
+ @test minimum(Krylov.to_boundary(n, x, d, 5.0, flip=true)) ≈ -2.209975124224178
+ end
+
+ @testset "kzeros" begin
+ # test kzeros
+ @test Krylov.kzeros(Vector{Float64}, 10) == zeros(Float64, 10)
+ @test Krylov.kzeros(Vector{ComplexF32}, 10) == zeros(ComplexF32, 10)
+ end
+
+ @testset "kones" begin
+ # test kones
+ @test Krylov.kones(Vector{Float64}, 10) == ones(Float64, 10)
+ @test Krylov.kones(Vector{ComplexF32}, 10) == ones(ComplexF32, 10)
+ end
+
+ @testset "ktypeof" begin
+ # test ktypeof
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
+ dv = rand(FC, 10)
+ b = view(dv, 4:8)
+ @test Krylov.ktypeof(dv) == Vector{FC}
+ @test Krylov.ktypeof(b) == Vector{FC}
+
+ dm = rand(FC, 10, 10)
+ b = view(dm, :, 3)
+ @test Krylov.ktypeof(b) == Vector{FC}
+
+ sv = sprand(FC, 10, 0.5)
+ b = view(sv, 4:8)
+ @test Krylov.ktypeof(sv) == Vector{FC}
+ @test Krylov.ktypeof(b) == Vector{FC}
+ end
+ end
+
+ @testset "vector_to_matrix" begin
+ # test vector_to_matrix
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
+ S = Vector{FC}
+ M = Krylov.vector_to_matrix(S)
+ @test M == Matrix{FC}
+ end
+ end
+
+ @testset "matrix_to_vector" begin
+ # test matrix_to_vector
+ for FC in (Float32, Float64, ComplexF32, ComplexF64)
+ M = Matrix{FC}
+ S = Krylov.matrix_to_vector(M)
+ @test S == Vector{FC}
+ end
+ end
+
+ @testset "macros" begin
+ # test macros
+ for FC ∈ (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64)
+ n = 10
+ x = rand(FC, n)
+ y = rand(FC, n)
+ a = rand(FC)
+ b = rand(FC)
+ c = rand(FC)
+ s = rand(FC)
+
+ T = real(FC)
+ a2 = rand(T)
+ b2 = rand(T)
+
+ Krylov.@kdot(n, x, y)
+
+ Krylov.@kdotr(n, x, y)
+
+ Krylov.@knrm2(n, x)
+
+ Krylov.@kaxpy!(n, a, x, y)
+ Krylov.@kaxpy!(n, a2, x, y)
+
+ Krylov.@kaxpby!(n, a, x, b, y)
+ Krylov.@kaxpby!(n, a2, x, b, y)
+ Krylov.@kaxpby!(n, a, x, b2, y)
+ Krylov.@kaxpby!(n, a2, x, b2, y)
+
+ Krylov.@kcopy!(n, x, y)
+
+ Krylov.@kswap(x, y)
+
+ Krylov.@kref!(n, x, y, c, s)
+ end
+ end
end
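
`sym_givens(a, b)` returns `(c, s, ρ)` such that the symmetric reflector maps `(a, b)` to `(ρ, 0)`; the corner cases above pin down its sign conventions. A quick generic check with illustrative values:

    c, s, ρ = Krylov.sym_givens(3.0, 4.0)   # (0.6, 0.8, 5.0)
    c * 3.0 + s * 4.0                       # ≈ ρ = 5.0
    abs(s * 3.0 - c * 4.0) ≤ 1.0e-14        # the second component is annihilated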
diff --git a/test/test_bicgstab.jl b/test/test_bicgstab.jl
index ce4e6dcd4..6817acf3d 100644
--- a/test/test_bicgstab.jl
+++ b/test/test_bicgstab.jl
@@ -82,10 +82,10 @@
@test(resid ≤ bicgstab_tol)
@test(stats.solved)
- # Test bᵀc == 0
+ # Test bᴴc == 0
A, b, c = bc_breakdown(FC=FC)
(x, stats) = bicgstab(A, b, c=c)
- @test stats.status == "Breakdown bᵀc = 0"
+ @test stats.status == "Breakdown bᴴc = 0"
# test callback function
solver = BicgstabSolver(A, b)
diff --git a/test/test_bilq.jl b/test/test_bilq.jl
index 900d1f6e5..40b9872db 100644
--- a/test/test_bilq.jl
+++ b/test/test_bilq.jl
@@ -66,10 +66,10 @@
@test(resid ≤ bilq_tol)
@test(stats.solved)
- # Test bᵀc == 0
+ # Test bᴴc == 0
A, b, c = bc_breakdown(FC=FC)
(x, stats) = bilq(A, b, c=c)
- @test stats.status == "Breakdown bᵀc = 0"
+ @test stats.status == "Breakdown bᴴc = 0"
# test callback function
diff --git a/test/test_bilqr.jl b/test/test_bilqr.jl
index 6dab06ec7..fd46aade4 100644
--- a/test/test_bilqr.jl
+++ b/test/test_bilqr.jl
@@ -46,10 +46,10 @@
@test(resid_dual ≤ bilqr_tol)
@test(stats.solved_dual)
- # Test bᵀc == 0
+ # Test bᴴc == 0
A, b, c = bc_breakdown(FC=FC)
(x, t, stats) = bilqr(A, b, c)
- @test stats.status == "Breakdown bᵀc = 0"
+ @test stats.status == "Breakdown bᴴc = 0"
# test callback function
A, b, c = adjoint_pde(FC=FC)
diff --git a/test/test_cgne.jl b/test/test_cgne.jl
index 64cbc0ea7..c1a3e798b 100644
--- a/test/test_cgne.jl
+++ b/test/test_cgne.jl
@@ -1,6 +1,6 @@
-function test_cgne(A, b; λ=0.0, M=I)
+function test_cgne(A, b; λ=0.0, N=I, history=false)
(nrow, ncol) = size(A)
- (x, stats) = cgne(A, b, λ=λ, M=M)
+ (x, stats) = cgne(A, b, λ=λ, N=N, history=history)
r = b - A * x
if λ > 0
s = r / sqrt(λ)
@@ -69,8 +69,8 @@ end
@test stats.status == "x = 0 is a zero-residual solution"
# Test with Jacobi (or diagonal) preconditioner
- A, b, M = square_preconditioned(FC=FC)
- (x, stats, resid) = test_cgne(A, b, M=M)
+ A, b, N = square_preconditioned(FC=FC)
+ (x, stats, resid) = test_cgne(A, b, N=N)
@test(resid ≤ cgne_tol)
@test(stats.solved)
(xI, xmin, xmin_norm) = check_min_norm(A, b, x)
@@ -81,8 +81,8 @@ end
A = 0.5 * [19.0 17.0 15.0 13.0 11.0 9.0 7.0 5.0 3.0 1.0;
2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0]
b = [1.0; 0.0]
- M = Diagonal(1 ./ (A * A'))
- (x, stats, resid) = test_cgne(A, b, M=M)
+ N = Diagonal(1 ./ (A * A'))
+ (x, stats, resid) = test_cgne(A, b, N=N)
@test(resid ≤ cgne_tol)
@test(stats.solved)
(xI, xmin, xmin_norm) = check_min_norm(A, b, x)
@@ -92,7 +92,7 @@ end
for transpose ∈ (false, true)
A, b, c, D = small_sp(transpose, FC=FC)
D⁻¹ = inv(D)
- (x, stats) = cgne(A, b, M=D⁻¹, λ=1.0)
+ (x, stats) = cgne(A, b, N=D⁻¹, λ=1.0)
end
# test callback function
diff --git a/test/test_cgs.jl b/test/test_cgs.jl
index 5c505bb70..832cd76c3 100644
--- a/test/test_cgs.jl
+++ b/test/test_cgs.jl
@@ -74,10 +74,10 @@
@test(resid ≤ cgs_tol)
@test(stats.solved)
- # Test bᵀc == 0
+ # Test bᴴc == 0
A, b, c = bc_breakdown(FC=FC)
(x, stats) = cgs(A, b, c=c)
- @test stats.status == "Breakdown bᵀc = 0"
+ @test stats.status == "Breakdown bᴴc = 0"
# test callback function
A, b = sparse_laplacian(FC=FC)
diff --git a/test/test_crmr.jl b/test/test_crmr.jl
index 6354f329f..d0f902df6 100644
--- a/test/test_crmr.jl
+++ b/test/test_crmr.jl
@@ -1,6 +1,6 @@
-function test_crmr(A, b; λ=0.0, M=I, history=false)
+function test_crmr(A, b; λ=0.0, N=I, history=false)
(nrow, ncol) = size(A)
- (x, stats) = crmr(A, b, λ=λ, M=M, history=history)
+ (x, stats) = crmr(A, b, λ=λ, N=N, history=history)
r = b - A * x
if λ > 0
s = r / sqrt(λ)
@@ -76,8 +76,8 @@ end
A = 0.5 * [19.0 17.0 15.0 13.0 11.0 9.0 7.0 5.0 3.0 1.0;
2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0]
b = [1.0; 0.0]
- M = Diagonal(1 ./ (A * A'))
- (x, stats, resid) = test_crmr(A, b, M=M)
+ N = Diagonal(1 ./ (A * A'))
+ (x, stats, resid) = test_crmr(A, b, N=N)
@test(resid ≤ crmr_tol)
@test(stats.solved)
(xI, xmin, xmin_norm) = check_min_norm(A, b, x)
@@ -87,7 +87,7 @@ end
for transpose ∈ (false, true)
A, b, c, D = small_sp(transpose, FC=FC)
D⁻¹ = inv(D)
- (x, stats) = crmr(A, b, M=D⁻¹, λ=1.0)
+ (x, stats) = crmr(A, b, N=D⁻¹, λ=1.0)
end
# test callback function
diff --git a/test/test_diom.jl b/test/test_diom.jl
index 4f1a8ecea..62a38b198 100644
--- a/test/test_diom.jl
+++ b/test/test_diom.jl
@@ -60,7 +60,7 @@
# Poisson equation in polar coordinates.
A, b = polar_poisson(FC=FC)
- (x, stats) = diom(A, b, memory=200)
+ (x, stats) = diom(A, b, memory=150)
r = b - A * x
resid = norm(r) / norm(b)
@test(resid ≤ diom_tol)
diff --git a/test/test_fgmres.jl b/test/test_fgmres.jl
new file mode 100644
index 000000000..9bb73d3e4
--- /dev/null
+++ b/test/test_fgmres.jl
@@ -0,0 +1,154 @@
+import LinearAlgebra.mul!
+
+mutable struct FlexiblePreconditioner{T,S}
+ D::Diagonal{T, S}
+ ω::T
+end
+
+function mul!(y::Vector, P::FlexiblePreconditioner, x::Vector)
+ P.ω = -P.ω
+ mul!(y, P.D, x)
+ y .*= P.ω
+end
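+
+# This preconditioner deliberately changes at every application: the sign of ω
+# flips each time mul! is called. Standard GMRES assumes a fixed right
+# preconditioner, whereas FGMRES stores the preconditioned directions
+# explicitly and therefore supports such an iteration-dependent operator.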
+
+@testset "fgmres" begin
+ fgmres_tol = 1.0e-6
+
+ for FC in (Float64, ComplexF64)
+ @testset "Data Type: $FC" begin
+
+ # Symmetric and positive definite system.
+ A, b = symmetric_definite(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Symmetric indefinite variant.
+ A, b = symmetric_indefinite(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Nonsymmetric and positive definite systems.
+ A, b = nonsymmetric_definite(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Nonsymmetric indefinite variant.
+ A, b = nonsymmetric_indefinite(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Symmetric indefinite variant, almost singular.
+ A, b = almost_singular(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ 100 * fgmres_tol)
+ @test(stats.solved)
+
+ # Singular system.
+ A, b = square_inconsistent(FC=FC)
+ (x, stats) = fgmres(A, b)
+ r = b - A * x
+ Aresid = norm(A' * r) / norm(A' * b)
+ @test(Aresid ≤ fgmres_tol)
+ @test(stats.inconsistent)
+
+ # Test b == 0
+ A, b = zero_rhs(FC=FC)
+ (x, stats) = fgmres(A, b)
+ @test norm(x) == 0
+ @test stats.status == "x = 0 is a zero-residual solution"
+
+ # Poisson equation in polar coordinates.
+ A, b = polar_poisson(FC=FC)
+ (x, stats) = fgmres(A, b, reorthogonalization=true)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Left preconditioning
+ A, b, M = square_preconditioned(FC=FC)
+ (x, stats) = fgmres(A, b, M=M)
+ r = b - A * x
+ resid = norm(M * r) / norm(M * b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Right preconditioning
+ A, b, N = square_preconditioned(FC=FC)
+ (x, stats) = fgmres(A, b, N=N)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Split preconditioning
+ A, b, M, N = two_preconditioners(FC=FC)
+ (x, stats) = fgmres(A, b, M=M, N=N)
+ r = b - A * x
+ resid = norm(M * r) / norm(M * b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+
+ # Restart
+ for restart ∈ (false, true)
+ memory = 10
+
+ A, b = sparse_laplacian(FC=FC)
+ (x, stats) = fgmres(A, b, restart=restart, memory=memory)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.niter > memory)
+ @test(stats.solved)
+
+ M = Diagonal(1 ./ diag(A))
+ (x, stats) = fgmres(A, b, M=M, restart=restart, memory=memory)
+ r = b - A * x
+ resid = norm(M * r) / norm(M * b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.niter > memory)
+ @test(stats.solved)
+
+ N = Diagonal(1 ./ diag(A))
+ (x, stats) = fgmres(A, b, N=N, restart=restart, memory=memory)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.niter > memory)
+ @test(stats.solved)
+
+      M = Diagonal(1 ./ sqrt.(diag(A)))
+      N = Diagonal(1 ./ sqrt.(diag(A)))
+ (x, stats) = fgmres(A, b, M=M, N=N, restart=restart, memory=memory)
+ r = b - A * x
+ resid = norm(M * r) / norm(M * b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.niter > memory)
+ @test(stats.solved)
+ end
+
+ A, b = polar_poisson(FC=FC)
+ J = inv(Diagonal(A)) # Jacobi preconditioner
+ N = FlexiblePreconditioner(J, 1.0)
+ (x, stats) = fgmres(A, b, N=N)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ fgmres_tol)
+ @test(stats.solved)
+ end
+ end
+end
diff --git a/test/test_lnlq.jl b/test/test_lnlq.jl
index 888119db8..b308609fa 100644
--- a/test/test_lnlq.jl
+++ b/test/test_lnlq.jl
@@ -1,5 +1,5 @@
function test_lnlq(A, b, transfer_to_craig)
- (x, y, stats) = lnlq(A, b, transfer_to_craig=transfer_to_craig, etolx=0.0, etoly=0.0)
+ (x, y, stats) = lnlq(A, b, transfer_to_craig=transfer_to_craig, utolx=0.0, utoly=0.0)
r = b - A * x
resid = norm(r) / norm(b)
return (x, y, stats, resid)
@@ -61,8 +61,8 @@ end
# Test regularization
A, b, λ = regularization(FC=FC)
- (x, y, stats) = lnlq(A, b, λ=λ, transfer_to_craig=transfer_to_craig, etolx=0.0, etoly=0.0)
- (xₛ, yₛ, stats) = lnlq(A, b, transfer_to_craig=transfer_to_craig, atol=0.0, rtol=0.0, etolx=1e-10, etoly=1e-10, λ=λ)
+ (x, y, stats) = lnlq(A, b, λ=λ, transfer_to_craig=transfer_to_craig, utolx=0.0, utoly=0.0)
+ (xₛ, yₛ, stats) = lnlq(A, b, transfer_to_craig=transfer_to_craig, atol=0.0, rtol=0.0, utolx=1e-10, utoly=1e-10, λ=λ)
for (x, y) in ((x, y), (xₛ, yₛ))
s = λ * y
r = b - (A * x + λ * s)
diff --git a/test/test_minres_qlp.jl b/test/test_minres_qlp.jl
index 6e983e49a..0b4d2046d 100644
--- a/test/test_minres_qlp.jl
+++ b/test/test_minres_qlp.jl
@@ -80,7 +80,7 @@
solver = MinresQlpSolver(A, b)
tol = 1.0
cb_n2 = TestCallbackN2(A, b, tol = tol)
- minres_qlp!(solver, A, b, atol = 0.0, rtol = 0.0, ctol = 0.0, callback = cb_n2)
+ minres_qlp!(solver, A, b, atol = 0.0, rtol = 0.0, Artol = 0.0, callback = cb_n2)
@test solver.stats.status == "user-requested exit"
@test cb_n2(solver)
diff --git a/test/test_mp.jl b/test/test_mp.jl
index b7aa43d38..6b6d58450 100644
--- a/test/test_mp.jl
+++ b/test/test_mp.jl
@@ -3,7 +3,7 @@
for fn in (:cg, :cgls, :usymqr, :cgne, :cgs, :crmr, :cg_lanczos, :dqgmres, :diom, :cr, :gpmr,
:lslq, :lsqr, :lsmr, :lnlq, :craig, :bicgstab, :craigmr, :crls, :symmlq, :minres,
:bilq, :minres_qlp, :qmr, :usymlq, :tricg, :trimr, :trilqr, :bilqr, :gmres, :fom,
- :cg_lanczos_shift)
+ :fgmres, :cg_lanczos_shift)
for T in (Float16, Float32, Float64, BigFloat)
for FC in (T, Complex{T})
A = spdiagm(-1 => -ones(FC,n-1), 0 => 3*ones(FC,n), 1 => -ones(FC,n-1))
diff --git a/test/test_processes.jl b/test/test_processes.jl
new file mode 100644
index 000000000..eb3ad19af
--- /dev/null
+++ b/test/test_processes.jl
@@ -0,0 +1,146 @@
+"""
+ P = permutation_paige(k)
+
+Return the sparse (2k) × (2k) matrix
+
+    [e₁ • eₖ      ]
+    [      e₁ • eₖ]
+"""
+function permutation_paige(k)
+ P = spzeros(Float64, 2k, 2k)
+ for i = 1:k
+ P[i,2i-1] = 1.0
+ P[i+k,2i] = 1.0
+ end
+ return P
+end
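+
+# For example, permutation_paige(2) is the 4 × 4 permutation matrix
+#   [1 0 0 0]
+#   [0 0 1 0]
+#   [0 1 0 0]
+#   [0 0 0 1]
+# which interleaves the columns associated with the two k-blocks.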
+
+@testset "processes" begin
+ m = 250
+ n = 500
+ k = 20
+
+ for FC in (Float64, ComplexF64)
+ R = real(FC)
+ nbits_FC = sizeof(FC)
+ nbits_R = sizeof(R)
+ nbits_I = sizeof(Int)
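+    # Note: sizeof returns sizes in bytes, so the storage estimates below are
+    # byte counts despite the nbits_* variable names.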
+
+ @testset "Data Type: $FC" begin
+
+ @testset "Hermitian Lanczos" begin
+ A, b = symmetric_indefinite(n, FC=FC)
+ V, T = hermitian_lanczos(A, b, k)
+
+ @test A * V[:,1:k] ≈ V * T
+
+ storage_hermitian_lanczos_bytes(n, k) = 4k * nbits_I + (3k-1) * nbits_R + n*(k+1) * nbits_FC
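+      # Sketch of this estimate, assuming V is a dense n × (k+1) matrix and T is
+      # a (k+1) × k tridiagonal SparseMatrixCSC with real coefficients: colptr
+      # holds k+1 integers, rowval the 3k-1 row indices, nzval the 3k-1 nonzeros.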
+
+ expected_hermitian_lanczos_bytes = storage_hermitian_lanczos_bytes(n, k)
+ actual_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, b, k)
+ @test expected_hermitian_lanczos_bytes ≤ actual_hermitian_lanczos_bytes ≤ 1.02 * expected_hermitian_lanczos_bytes
+ end
+
+ @testset "Non-Hermitian Lanczos" begin
+ A, b = nonsymmetric_definite(n, FC=FC)
+ c = -b
+ V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k)
+
+ @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]'
+ @test A * V[:,1:k] ≈ V * T
+ @test A' * U[:,1:k] ≈ U * Tᴴ
+
+ storage_nonhermitian_lanczos_bytes(n, k) = 4k * nbits_I + (6k-2) * nbits_FC + 2*n*(k+1) * nbits_FC
+
+ expected_nonhermitian_lanczos_bytes = storage_nonhermitian_lanczos_bytes(n, k)
+ actual_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, b, c, k)
+ @test expected_nonhermitian_lanczos_bytes ≤ actual_nonhermitian_lanczos_bytes ≤ 1.02 * expected_nonhermitian_lanczos_bytes
+ end
+
+ @testset "Arnoldi" begin
+ A, b = nonsymmetric_indefinite(n, FC=FC)
+ V, H = arnoldi(A, b, k)
+
+ @test A * V[:,1:k] ≈ V * H
+
+ function storage_arnoldi_bytes(n, k)
+ return k*(k+1) * nbits_FC + n*(k+1) * nbits_FC
+ end
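+      # Both factors are dense here: H is (k+1) × k and V is n × (k+1), so the
+      # estimate is simply their element counts times the scalar size.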
+
+ expected_arnoldi_bytes = storage_arnoldi_bytes(n, k)
+ actual_arnoldi_bytes = @allocated arnoldi(A, b, k)
+ @test expected_arnoldi_bytes ≤ actual_arnoldi_bytes ≤ 1.02 * expected_arnoldi_bytes
+ end
+
+ @testset "Golub-Kahan" begin
+ A, b = under_consistent(m, n, FC=FC)
+ V, U, L = golub_kahan(A, b, k)
+ B = L[1:k+1,1:k]
+
+ @test A * V[:,1:k] ≈ U * B
+ @test A' * U ≈ V * L'
+ @test A' * A * V[:,1:k] ≈ V * L' * B
+ @test A * A' * U[:,1:k] ≈ U * B * L[1:k,1:k]'
+
+ storage_golub_kahan_bytes(m, n, k) = 3*(k+1) * nbits_I + (2k+1) * nbits_R + (n+m)*(k+1) * nbits_FC
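+      # Here L is a (k+1) × (k+1) lower bidiagonal SparseMatrixCSC with real
+      # coefficients (k+2 colptr entries plus 2k+1 row indices make up the
+      # 3(k+1) integers), while V and U are dense with n and m rows.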
+
+ expected_golub_kahan_bytes = storage_golub_kahan_bytes(m, n, k)
+ actual_golub_kahan_bytes = @allocated golub_kahan(A, b, k)
+ @test expected_golub_kahan_bytes ≤ actual_golub_kahan_bytes ≤ 1.02 * expected_golub_kahan_bytes
+ end
+
+ @testset "Saunders-Simon-Yip" begin
+ A, b = under_consistent(m, n, FC=FC)
+ _, c = over_consistent(n, m, FC=FC)
+ V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k)
+
+ @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]'
+ @test A * U[:,1:k] ≈ V * T
+ @test A' * V[:,1:k] ≈ U * Tᴴ
+ @test A' * A * U[:,1:k-1] ≈ U * Tᴴ * T[1:k,1:k-1]
+ @test A * A' * V[:,1:k-1] ≈ V * T * Tᴴ[1:k,1:k-1]
+
+ K = [zeros(FC,m,m) A; A' zeros(FC,n,n)]
+ Pₖ = permutation_paige(k)
+ Wₖ = [V[:,1:k] zeros(FC,m,k); zeros(FC,n,k) U[:,1:k]] * Pₖ
+ Pₖ₊₁ = permutation_paige(k+1)
+ Wₖ₊₁ = [V zeros(FC,m,k+1); zeros(FC,n,k+1) U] * Pₖ₊₁
+ G = Pₖ₊₁' * [zeros(FC,k+1,k) T; Tᴴ zeros(FC,k+1,k)] * Pₖ
+ @test K * Wₖ ≈ Wₖ₊₁ * G
+
+ storage_saunders_simon_yip_bytes(m, n, k) = 4k * nbits_I + (6k-2) * nbits_FC + (n+m)*(k+1) * nbits_FC
+
+ expected_saunders_simon_yip_bytes = storage_saunders_simon_yip_bytes(m, n, k)
+ actual_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, b, c, k)
+ @test expected_saunders_simon_yip_bytes ≤ actual_saunders_simon_yip_bytes ≤ 1.02 * expected_saunders_simon_yip_bytes
+ end
+
+ @testset "Montoison-Orban" begin
+ A, b = under_consistent(m, n, FC=FC)
+ B, c = over_consistent(n, m, FC=FC)
+ V, H, U, F = montoison_orban(A, B, b, c, k)
+
+ @test A * U[:,1:k] ≈ V * H
+ @test B * V[:,1:k] ≈ U * F
+ @test B * A * U[:,1:k-1] ≈ U * F * H[1:k,1:k-1]
+ @test A * B * V[:,1:k-1] ≈ V * H * F[1:k,1:k-1]
+
+ K = [zeros(FC,m,m) A; B zeros(FC,n,n)]
+ Pₖ = permutation_paige(k)
+ Wₖ = [V[:,1:k] zeros(FC,m,k); zeros(FC,n,k) U[:,1:k]] * Pₖ
+ Pₖ₊₁ = permutation_paige(k+1)
+ Wₖ₊₁ = [V zeros(FC,m,k+1); zeros(FC,n,k+1) U] * Pₖ₊₁
+ G = Pₖ₊₁' * [zeros(FC,k+1,k) H; F zeros(FC,k+1,k)] * Pₖ
+ @test K * Wₖ ≈ Wₖ₊₁ * G
+
+ function storage_montoison_orban_bytes(m, n, k)
+ return 2*k*(k+1) * nbits_FC + (n+m)*(k+1) * nbits_FC
+ end
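+      # H and F are dense (k+1) × k blocks and V, U are dense with m and n rows,
+      # hence a purely dense element count.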
+
+ expected_montoison_orban_bytes = storage_montoison_orban_bytes(m, n, k)
+ actual_montoison_orban_bytes = @allocated montoison_orban(A, B, b, c, k)
+ @test expected_montoison_orban_bytes ≤ actual_montoison_orban_bytes ≤ 1.02 * expected_montoison_orban_bytes
+ end
+ end
+ end
+end
diff --git a/test/test_qmr.jl b/test/test_qmr.jl
index 184b9877d..4a6b8c1c9 100644
--- a/test/test_qmr.jl
+++ b/test/test_qmr.jl
@@ -58,10 +58,10 @@
@test(resid ≤ qmr_tol)
@test(stats.solved)
- # Test bᵀc == 0
+ # Test bᴴc == 0
A, b, c = bc_breakdown(FC=FC)
(x, stats) = qmr(A, b, c=c)
- @test stats.status == "Breakdown bᵀc = 0"
+ @test stats.status == "Breakdown bᴴc = 0"
# test callback function
solver = QmrSolver(A, b)
diff --git a/test/test_solvers.jl b/test/test_solvers.jl
index 468fa5a05..2c98dc795 100644
--- a/test/test_solvers.jl
+++ b/test/test_solvers.jl
@@ -11,1139 +11,142 @@ function test_solvers(FC)
nshifts = 5
T = real(FC)
S = Vector{FC}
+ solvers = Dict{Symbol, KrylovSolver}()
@eval begin
- cg_solver = $(KRYLOV_SOLVERS[:cg])($n, $n, $S)
- symmlq_solver = $(KRYLOV_SOLVERS[:symmlq])($n, $n, $S)
- minres_solver = $(KRYLOV_SOLVERS[:minres])($n, $n, $S)
- cg_lanczos_solver = $(KRYLOV_SOLVERS[:cg_lanczos])($n, $n, $S)
- diom_solver = $(KRYLOV_SOLVERS[:diom])($n, $n, $mem, $S)
- fom_solver = $(KRYLOV_SOLVERS[:fom])($n, $n, $mem, $S)
- dqgmres_solver = $(KRYLOV_SOLVERS[:dqgmres])($n, $n, $mem, $S)
- gmres_solver = $(KRYLOV_SOLVERS[:gmres])($n, $n, $mem, $S)
- cr_solver = $(KRYLOV_SOLVERS[:cr])($n, $n, $S)
- crmr_solver = $(KRYLOV_SOLVERS[:crmr])($m, $n, $S)
- cgs_solver = $(KRYLOV_SOLVERS[:cgs])($n, $n, $S)
- bicgstab_solver = $(KRYLOV_SOLVERS[:bicgstab])($n, $n, $S)
- craigmr_solver = $(KRYLOV_SOLVERS[:craigmr])($m, $n, $S)
- cgne_solver = $(KRYLOV_SOLVERS[:cgne])($m, $n, $S)
- lnlq_solver = $(KRYLOV_SOLVERS[:lnlq])($m, $n, $S)
- craig_solver = $(KRYLOV_SOLVERS[:craig])($m, $n, $S)
- lslq_solver = $(KRYLOV_SOLVERS[:lslq])($n, $m, $S)
- cgls_solver = $(KRYLOV_SOLVERS[:cgls])($n, $m, $S)
- lsqr_solver = $(KRYLOV_SOLVERS[:lsqr])($n, $m, $S)
- crls_solver = $(KRYLOV_SOLVERS[:crls])($n, $m, $S)
- lsmr_solver = $(KRYLOV_SOLVERS[:lsmr])($n, $m, $S)
- usymqr_solver = $(KRYLOV_SOLVERS[:usymqr])($n, $m, $S)
- trilqr_solver = $(KRYLOV_SOLVERS[:trilqr])($n, $n, $S)
- bilq_solver = $(KRYLOV_SOLVERS[:bilq])($n, $n, $S)
- bilqr_solver = $(KRYLOV_SOLVERS[:bilqr])($n, $n, $S)
- minres_qlp_solver = $(KRYLOV_SOLVERS[:minres_qlp])($n, $n, $S)
- qmr_solver = $(KRYLOV_SOLVERS[:qmr])($n, $n, $S)
- usymlq_solver = $(KRYLOV_SOLVERS[:usymlq])($m, $n, $S)
- tricg_solver = $(KRYLOV_SOLVERS[:tricg])($m, $n, $S)
- trimr_solver = $(KRYLOV_SOLVERS[:trimr])($m, $n, $S)
- gpmr_solver = $(KRYLOV_SOLVERS[:gpmr])($n, $m, $mem, $S)
- cg_lanczos_shift_solver = $(KRYLOV_SOLVERS[:cg_lanczos_shift])($n, $m, $nshifts, $S)
+ $solvers[:cg] = $(KRYLOV_SOLVERS[:cg])($n, $n, $S)
+ $solvers[:symmlq] = $(KRYLOV_SOLVERS[:symmlq])($n, $n, $S)
+ $solvers[:minres] = $(KRYLOV_SOLVERS[:minres])($n, $n, $S)
+ $solvers[:cg_lanczos] = $(KRYLOV_SOLVERS[:cg_lanczos])($n, $n, $S)
+ $solvers[:cg_lanczos_shift] = $(KRYLOV_SOLVERS[:cg_lanczos_shift])($n, $n, $nshifts, $S)
+ $solvers[:diom] = $(KRYLOV_SOLVERS[:diom])($n, $n, $mem, $S)
+ $solvers[:fom] = $(KRYLOV_SOLVERS[:fom])($n, $n, $mem, $S)
+ $solvers[:dqgmres] = $(KRYLOV_SOLVERS[:dqgmres])($n, $n, $mem, $S)
+ $solvers[:gmres] = $(KRYLOV_SOLVERS[:gmres])($n, $n, $mem, $S)
+ $solvers[:fgmres] = $(KRYLOV_SOLVERS[:fgmres])($n, $n, $mem, $S)
+ $solvers[:cr] = $(KRYLOV_SOLVERS[:cr])($n, $n, $S)
+ $solvers[:crmr] = $(KRYLOV_SOLVERS[:crmr])($m, $n, $S)
+ $solvers[:cgs] = $(KRYLOV_SOLVERS[:cgs])($n, $n, $S)
+ $solvers[:bicgstab] = $(KRYLOV_SOLVERS[:bicgstab])($n, $n, $S)
+ $solvers[:craigmr] = $(KRYLOV_SOLVERS[:craigmr])($m, $n, $S)
+ $solvers[:cgne] = $(KRYLOV_SOLVERS[:cgne])($m, $n, $S)
+ $solvers[:lnlq] = $(KRYLOV_SOLVERS[:lnlq])($m, $n, $S)
+ $solvers[:craig] = $(KRYLOV_SOLVERS[:craig])($m, $n, $S)
+ $solvers[:lslq] = $(KRYLOV_SOLVERS[:lslq])($n, $m, $S)
+ $solvers[:cgls] = $(KRYLOV_SOLVERS[:cgls])($n, $m, $S)
+ $solvers[:lsqr] = $(KRYLOV_SOLVERS[:lsqr])($n, $m, $S)
+ $solvers[:crls] = $(KRYLOV_SOLVERS[:crls])($n, $m, $S)
+ $solvers[:lsmr] = $(KRYLOV_SOLVERS[:lsmr])($n, $m, $S)
+ $solvers[:usymqr] = $(KRYLOV_SOLVERS[:usymqr])($n, $m, $S)
+ $solvers[:trilqr] = $(KRYLOV_SOLVERS[:trilqr])($n, $n, $S)
+ $solvers[:bilq] = $(KRYLOV_SOLVERS[:bilq])($n, $n, $S)
+ $solvers[:bilqr] = $(KRYLOV_SOLVERS[:bilqr])($n, $n, $S)
+ $solvers[:minres_qlp] = $(KRYLOV_SOLVERS[:minres_qlp])($n, $n, $S)
+ $solvers[:qmr] = $(KRYLOV_SOLVERS[:qmr])($n, $n, $S)
+ $solvers[:usymlq] = $(KRYLOV_SOLVERS[:usymlq])($m, $n, $S)
+ $solvers[:tricg] = $(KRYLOV_SOLVERS[:tricg])($m, $n, $S)
+ $solvers[:trimr] = $(KRYLOV_SOLVERS[:trimr])($m, $n, $S)
+ $solvers[:gpmr] = $(KRYLOV_SOLVERS[:gpmr])($n, $m, $mem, $S)
end
- for i = 1 : 3
- A = i * A
- Au = i * Au
- Ao = i * Ao
- b = 5 * b
- c = 3 * c
-
- solver = solve!(cg_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(symmlq_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(minres_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(cg_lanczos_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(cg_lanczos_shift_solver, A, b, shifts)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(diom_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(fom_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(dqgmres_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(gmres_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(cr_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(crmr_solver, Au, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(cgs_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == 2 * niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(bicgstab_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == 2 * niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(craigmr_solver, Au, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 2
- @test issolved(solver)
-
- solver = solve!(cgne_solver, Au, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(lnlq_solver, Au, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved(solver)
-
- solver = solve!(craig_solver, Au, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved(solver)
-
- solver = solve!(lslq_solver, Ao, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(cgls_solver, Ao, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(lsqr_solver, Ao, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(crls_solver, Ao, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(lsmr_solver, Ao, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(usymqr_solver, Ao, b, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(trilqr_solver, A, b, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved_primal(solver)
- @test issolved_dual(solver)
- @test issolved(solver)
-
- solver = solve!(bilq_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(bilqr_solver, A, b, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved_primal(solver)
- @test issolved_dual(solver)
- @test issolved(solver)
-
- solver = solve!(minres_qlp_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(qmr_solver, A, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(usymlq_solver, Au, c, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test nsolution(solver) == 1
- @test issolved(solver)
-
- solver = solve!(tricg_solver, Au, c, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved(solver)
-
- solver = solve!(trimr_solver, Au, c, b)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved(solver)
-
- solver = solve!(gpmr_solver, Ao, Au, b, c)
- niter = niterations(solver)
- @test niter > 0
- @test Aprod(solver) == niter
- @test Atprod(solver) == 0
- @test Bprod(solver) == niter
- @test statistics(solver) === solver.stats
- @test solution(solver, 1) === solver.x
- @test solution(solver, 2) === solver.y
- @test nsolution(solver) == 2
- @test issolved(solver)
+ for (method, solver) in solvers
+ @testset "$(method)" begin
+ for i = 1 : 3
+ A = i * A
+ Au = i * Au
+ Ao = i * Ao
+ b = 5 * b
+ c = 3 * c
+
+ if method ∈ (:cg, :cr, :symmlq, :minres, :minres_qlp, :cg_lanczos, :diom, :fom,
+ :dqgmres, :gmres, :fgmres, :cgs, :bicgstab, :bilq, :qmr, :cg_lanczos_shift)
+ method == :cg_lanczos_shift ? solve!(solver, A, b, shifts) : solve!(solver, A, b)
+ niter = niterations(solver)
+ @test Aprod(solver) == (method ∈ (:cgs, :bicgstab) ? 2 * niter : niter)
+ @test Atprod(solver) == (method ∈ (:bilq, :qmr) ? niter : 0)
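+            # The two checks above encode that CGS and BICGSTAB apply A twice per
+            # iteration, and that BiLQ and QMR are the only methods in this branch
+            # that also require products with Aᴴ.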
+ @test solution(solver) === solver.x
+ @test nsolution(solver) == 1
+ end
+
+ if method ∈ (:cgne, :crmr, :lnlq, :craig, :craigmr)
+ solve!(solver, Au, c)
+ niter = niterations(solver)
+ @test Aprod(solver) == niter
+ @test Atprod(solver) == niter
+ @test solution(solver, 1) === solver.x
+ @test nsolution(solver) == (method ∈ (:cgne, :crmr) ? 1 : 2)
+            (nsolution(solver) == 2) && (@test solution(solver, 2) === solver.y)
+ end
+
+ if method ∈ (:cgls, :crls, :lslq, :lsqr, :lsmr)
+ solve!(solver, Ao, b)
+ niter = niterations(solver)
+ @test Aprod(solver) == niter
+ @test Atprod(solver) == niter
+ @test solution(solver) === solver.x
+ @test nsolution(solver) == 1
+ end
+
+ if method ∈ (:bilqr, :trilqr)
+ solve!(solver, A, b, b)
+ niter = niterations(solver)
+ @test Aprod(solver) == niter
+ @test Atprod(solver) == niter
+ @test solution(solver, 1) === solver.x
+ @test solution(solver, 2) === solver.y
+ @test nsolution(solver) == 2
+ @test issolved_primal(solver)
+ @test issolved_dual(solver)
+ end
+
+ if method ∈ (:tricg, :trimr, :gpmr)
+ method == :gpmr ? solve!(solver, Ao, Au, b, c) : solve!(solver, Au, c, b)
+ niter = niterations(solver)
+ @test Aprod(solver) == niter
+ method != :gpmr && (@test Atprod(solver) == niter)
+ method == :gpmr && (@test Bprod(solver) == niter)
+ @test solution(solver, 1) === solver.x
+ @test solution(solver, 2) === solver.y
+ @test nsolution(solver) == 2
+ end
+
+ if method ∈ (:usymlq, :usymqr)
+ method == :usymlq ? solve!(solver, Au, c, b) : solve!(solver, Ao, b, c)
+ niter = niterations(solver)
+ @test Aprod(solver) == niter
+ @test Atprod(solver) == niter
+ @test solution(solver) === solver.x
+ @test nsolution(solver) == 1
+ end
+
+ @test niter > 0
+ @test statistics(solver) === solver.stats
+ @test issolved(solver)
+ end
+
+ io = IOBuffer()
+ show(io, solver, show_stats=false)
+ showed = String(take!(io))
+
+ # Test that the lines have the same length
+ str = split(showed, "\n", keepempty=false)
+ len_row = length(str[1])
+ @test mapreduce(x -> length(x) - mapreduce(y -> occursin(y, x), |, ["w̅","w̄","d̅"]) == len_row, &, str)
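+      # The strings w̅, w̄ and d̅ each contain a combining mark, so `length` counts
+      # one extra character on rows where they occur; the subtraction above
+      # compensates for that.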
+
+ # Test that the columns have the same length
+      str2 = split(showed, ['│','┌','┬','┐','├','┼','┤','└','┴','┘','\n'], keepempty=false)
+ len_col1 = length(str2[1])
+ len_col2 = length(str2[2])
+ len_col3 = length(str2[3])
+ @test mapreduce(x -> length(x) - mapreduce(y -> occursin(y, x), |, ["w̅","w̄","d̅"]) == len_col1, &, str2[1:3:end-2])
+ @test mapreduce(x -> length(x) - mapreduce(y -> occursin(y, x), |, ["w̅","w̄","d̅"]) == len_col2, &, str2[2:3:end-1])
+ @test mapreduce(x -> length(x) - mapreduce(y -> occursin(y, x), |, ["w̅","w̄","d̅"]) == len_col3, &, str2[3:3:end])
+
+ # Code coverage
+ show(io, solver, show_stats=true)
+ end
end
-
- io = IOBuffer()
- show(io, cg_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │ CgSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Ap│ Vector{$FC}│ 64│
- │ z│ Vector{$FC}│ 0│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, symmlq_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────┬───────────────┬─────────────────┐
- │SymmlqSolver│Precision: $FC │Architecture: CPU│
- ├────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ Mvold│ Vector{$FC}│ 64│
- │ Mv│ Vector{$FC}│ 64│
- │ Mv_next│ Vector{$FC}│ 64│
- │ w̅│ Vector{$FC}│ 64│
- │ v│ Vector{$FC}│ 0│
- │ clist│ Vector{$T}│ 5│
- │ zlist│ Vector{$T}│ 5│
- │ sprod│ Vector{$T}│ 5│
- │ warm_start│ Bool│ 0│
- └────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, minres_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────┬───────────────┬─────────────────┐
- │MinresSolver│Precision: $FC │Architecture: CPU│
- ├────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ r1│ Vector{$FC}│ 64│
- │ r2│ Vector{$FC}│ 64│
- │ w1│ Vector{$FC}│ 64│
- │ w2│ Vector{$FC}│ 64│
- │ y│ Vector{$FC}│ 64│
- │ v│ Vector{$FC}│ 0│
- │ err_vec│ Vector{$T}│ 5│
- │ warm_start│ Bool│ 0│
- └────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cg_lanczos_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────────┬───────────────┬─────────────────┐
- │CgLanczosSolver│Precision: $FC │Architecture: CPU│
- ├───────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ Mv│ Vector{$FC}│ 64│
- │ Mv_prev│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Mv_next│ Vector{$FC}│ 64│
- │ v│ Vector{$FC}│ 0│
- │ warm_start│ Bool│ 0│
- └───────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cg_lanczos_shift_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────────────┬───────────────────┬─────────────────┐
- │CgLanczosShiftSolver│ Precision: $FC │Architecture: CPU│
- ├────────────────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────────────┼───────────────────┼─────────────────┤
- │ Mv│ Vector{$FC}│ 64│
- │ Mv_prev│ Vector{$FC}│ 64│
- │ Mv_next│ Vector{$FC}│ 64│
- │ v│ Vector{$FC}│ 0│
- │ x│Vector{Vector{$FC}}│ 5 x 64│
- │ p│Vector{Vector{$FC}}│ 5 x 64│
- │ σ│ Vector{$T}│ 5│
- │ δhat│ Vector{$T}│ 5│
- │ ω│ Vector{$T}│ 5│
- │ γ│ Vector{$T}│ 5│
- │ rNorms│ Vector{$T}│ 5│
- │ converged│ BitVector│ 5│
- │ not_cv│ BitVector│ 5│
- └────────────────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, diom_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────────┬─────────────────┐
- │DiomSolver│ Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ t│ Vector{$FC}│ 64│
- │ z│ Vector{$FC}│ 0│
- │ w│ Vector{$FC}│ 0│
- │ P│Vector{Vector{$FC}}│ 10 x 64│
- │ V│Vector{Vector{$FC}}│ 10 x 64│
- │ L│ Vector{$FC}│ 10│
- │ H│ Vector{$FC}│ 12│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, fom_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────────┬─────────────────┐
- │ FomSolver│ Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ w│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 0│
- │ q│ Vector{$FC}│ 0│
- │ V│Vector{Vector{$FC}}│ 10 x 64│
- │ l│ Vector{$FC}│ 10│
- │ z│ Vector{$FC}│ 10│
- │ U│ Vector{$FC}│ 55│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, dqgmres_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌─────────────┬───────────────────┬─────────────────┐
- │DqgmresSolver│ Precision: $FC │Architecture: CPU│
- ├─────────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├─────────────┼───────────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ t│ Vector{$FC}│ 64│
- │ z│ Vector{$FC}│ 0│
- │ w│ Vector{$FC}│ 0│
- │ P│Vector{Vector{$FC}}│ 10 x 64│
- │ V│Vector{Vector{$FC}}│ 10 x 64│
- │ c│ Vector{$T}│ 10│
- │ s│ Vector{$FC}│ 10│
- │ H│ Vector{$FC}│ 12│
- │ warm_start│ Bool│ 0│
- └─────────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, gmres_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────┬───────────────────┬─────────────────┐
- │GmresSolver│ Precision: $FC │Architecture: CPU│
- ├───────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────┼───────────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ w│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 0│
- │ q│ Vector{$FC}│ 0│
- │ V│Vector{Vector{$FC}}│ 10 x 64│
- │ c│ Vector{$T}│ 10│
- │ s│ Vector{$FC}│ 10│
- │ z│ Vector{$FC}│ 10│
- │ R│ Vector{$FC}│ 55│
- │ warm_start│ Bool│ 0│
- │ inner_iter│ Int64│ 0│
- └───────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │ CrSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ Ar│ Vector{$FC}│ 64│
- │ Mq│ Vector{$FC}│ 0│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, crmr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │CrmrSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Aᵀr│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ Mq│ Vector{$FC}│ 0│
- │ s│ Vector{$FC}│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cgs_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │ CgsSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │Attribute │ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 64│
- │ u│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ ts│ Vector{$FC}│ 64│
- │ yz│ Vector{$FC}│ 0│
- │ vw│ Vector{$FC}│ 0│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, bicgstab_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────────┬───────────────┬─────────────────┐
- │BicgstabSolver│Precision: $FC │Architecture: CPU│
- ├──────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ v│ Vector{$FC}│ 64│
- │ s│ Vector{$FC}│ 64│
- │ qd│ Vector{$FC}│ 64│
- │ yz│ Vector{$FC}│ 0│
- │ t│ Vector{$FC}│ 0│
- │ warm_start│ Bool│ 0│
- └──────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, craigmr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌─────────────┬───────────────┬─────────────────┐
- │CraigmrSolver│Precision: $FC │Architecture: CPU│
- ├─────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├─────────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 64│
- │ Nv│ Vector{$FC}│ 64│
- │ Aᵀu│ Vector{$FC}│ 64│
- │ d│ Vector{$FC}│ 64│
- │ y│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 32│
- │ w│ Vector{$FC}│ 32│
- │ wbar│ Vector{$FC}│ 32│
- │ Av│ Vector{$FC}│ 32│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ q│ Vector{$FC}│ 0│
- └─────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cgne_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │CgneSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Aᵀz│ Vector{$FC}│ 64│
- │ r│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ s│ Vector{$FC}│ 0│
- │ z│ Vector{$FC}│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, lnlq_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │LnlqSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 64│
- │ Nv│ Vector{$FC}│ 64│
- │ Aᵀu│ Vector{$FC}│ 64│
- │ y│ Vector{$FC}│ 32│
- │ w̄│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 32│
- │ Av│ Vector{$FC}│ 32│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ q│ Vector{$FC}│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, craig_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────┬───────────────┬─────────────────┐
- │CraigSolver│Precision: $FC │Architecture: CPU│
- ├───────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 64│
- │ Nv│ Vector{$FC}│ 64│
- │ Aᵀu│ Vector{$FC}│ 64│
- │ y│ Vector{$FC}│ 32│
- │ w│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 32│
- │ Av│ Vector{$FC}│ 32│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ w2│ Vector{$FC}│ 0│
- └───────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, lslq_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │LslqSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 32│
- │ Nv│ Vector{$FC}│ 32│
- │ Aᵀu│ Vector{$FC}│ 32│
- │ w̄│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 64│
- │ Av│ Vector{$FC}│ 64│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ err_vec│ Vector{$T}│ 5│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, cgls_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │CglsSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 32│
- │ p│ Vector{$FC}│ 32│
- │ s│ Vector{$FC}│ 32│
- │ r│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ Mr│ Vector{$FC}│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, lsqr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │LsqrSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 32│
- │ Nv│ Vector{$FC}│ 32│
- │ Aᵀu│ Vector{$FC}│ 32│
- │ w│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 64│
- │ Av│ Vector{$FC}│ 64│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ err_vec│ Vector{$T}│ 5│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, crls_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │CrlsSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 32│
- │ p│ Vector{$FC}│ 32│
- │ Ar│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ r│ Vector{$FC}│ 64│
- │ Ap│ Vector{$FC}│ 64│
- │ s│ Vector{$FC}│ 64│
- │ Ms│ Vector{$FC}│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, lsmr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │LsmrSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ x│ Vector{$FC}│ 32│
- │ Nv│ Vector{$FC}│ 32│
- │ Aᵀu│ Vector{$FC}│ 32│
- │ h│ Vector{$FC}│ 32│
- │ hbar│ Vector{$FC}│ 32│
- │ Mu│ Vector{$FC}│ 64│
- │ Av│ Vector{$FC}│ 64│
- │ u│ Vector{$FC}│ 0│
- │ v│ Vector{$FC}│ 0│
- │ err_vec│ Vector{$T}│ 5│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, usymqr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────┬───────────────┬─────────────────┐
- │UsymqrSolver│Precision: $FC │Architecture: CPU│
- ├────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────┼───────────────┼─────────────────┤
- │ vₖ₋₁│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 32│
- │ wₖ₋₂│ Vector{$FC}│ 32│
- │ wₖ₋₁│ Vector{$FC}│ 32│
- │ uₖ₋₁│ Vector{$FC}│ 32│
- │ uₖ│ Vector{$FC}│ 32│
- │ p│ Vector{$FC}│ 32│
- │ warm_start│ Bool│ 0│
- └────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, trilqr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────┬───────────────┬─────────────────┐
- │TrilqrSolver│Precision: $FC │Architecture: CPU│
- ├────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────┼───────────────┼─────────────────┤
- │ uₖ₋₁│ Vector{$FC}│ 64│
- │ uₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ d̅│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ vₖ₋₁│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ Δy│ Vector{$FC}│ 0│
- │ y│ Vector{$FC}│ 64│
- │ wₖ₋₃│ Vector{$FC}│ 64│
- │ wₖ₋₂│ Vector{$FC}│ 64│
- │ warm_start│ Bool│ 0│
- └────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, bilq_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │BilqSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ uₖ₋₁│ Vector{$FC}│ 64│
- │ uₖ│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ vₖ₋₁│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ d̅│ Vector{$FC}│ 64│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, bilqr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────┬───────────────┬─────────────────┐
- │BilqrSolver│Precision: $FC │Architecture: CPU│
- ├───────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────┼───────────────┼─────────────────┤
- │ uₖ₋₁│ Vector{$FC}│ 64│
- │ uₖ│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ vₖ₋₁│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ Δy│ Vector{$FC}│ 0│
- │ y│ Vector{$FC}│ 64│
- │ d̅│ Vector{$FC}│ 64│
- │ wₖ₋₃│ Vector{$FC}│ 64│
- │ wₖ₋₂│ Vector{$FC}│ 64│
- │ warm_start│ Bool│ 0│
- └───────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, minres_qlp_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────────┬───────────────┬─────────────────┐
- │MinresQlpSolver│Precision: $FC │Architecture: CPU│
- ├───────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────────┼───────────────┼─────────────────┤
- │ Δx│ Vector{$FC}│ 0│
- │ wₖ₋₁│ Vector{$FC}│ 64│
- │ wₖ│ Vector{$FC}│ 64│
- │ M⁻¹vₖ₋₁│ Vector{$FC}│ 64│
- │ M⁻¹vₖ│ Vector{$FC}│ 64│
- │ x│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 0│
- │ warm_start│ Bool│ 0│
- └───────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, qmr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────┬─────────────────┐
- │ QmrSolver│Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────┼─────────────────┤
- │ uₖ₋₁│ Vector{$FC}│ 64│
- │ uₖ│ Vector{$FC}│ 64│
- │ q│ Vector{$FC}│ 64│
- │ vₖ₋₁│ Vector{$FC}│ 64│
- │ vₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ wₖ₋₂│ Vector{$FC}│ 64│
- │ wₖ₋₁│ Vector{$FC}│ 64│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, usymlq_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌────────────┬───────────────┬─────────────────┐
- │UsymlqSolver│Precision: $FC │Architecture: CPU│
- ├────────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├────────────┼───────────────┼─────────────────┤
- │ uₖ₋₁│ Vector{$FC}│ 64│
- │ uₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ Δx│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ d̅│ Vector{$FC}│ 64│
- │ vₖ₋₁│ Vector{$FC}│ 32│
- │ vₖ│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ warm_start│ Bool│ 0│
- └────────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, tricg_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────┬───────────────┬─────────────────┐
- │TricgSolver│Precision: $FC │Architecture: CPU│
- ├───────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────┼───────────────┼─────────────────┤
- │ y│ Vector{$FC}│ 64│
- │ N⁻¹uₖ₋₁│ Vector{$FC}│ 64│
- │ N⁻¹uₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ gy₂ₖ₋₁│ Vector{$FC}│ 64│
- │ gy₂ₖ│ Vector{$FC}│ 64│
- │ x│ Vector{$FC}│ 32│
- │ M⁻¹vₖ₋₁│ Vector{$FC}│ 32│
- │ M⁻¹vₖ│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ gx₂ₖ₋₁│ Vector{$FC}│ 32│
- │ gx₂ₖ│ Vector{$FC}│ 32│
- │ Δx│ Vector{$FC}│ 0│
- │ Δy│ Vector{$FC}│ 0│
- │ uₖ│ Vector{$FC}│ 0│
- │ vₖ│ Vector{$FC}│ 0│
- │ warm_start│ Bool│ 0│
- └───────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, trimr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌───────────┬───────────────┬─────────────────┐
- │TrimrSolver│Precision: $FC │Architecture: CPU│
- ├───────────┼───────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├───────────┼───────────────┼─────────────────┤
- │ y│ Vector{$FC}│ 64│
- │ N⁻¹uₖ₋₁│ Vector{$FC}│ 64│
- │ N⁻¹uₖ│ Vector{$FC}│ 64│
- │ p│ Vector{$FC}│ 64│
- │ gy₂ₖ₋₃│ Vector{$FC}│ 64│
- │ gy₂ₖ₋₂│ Vector{$FC}│ 64│
- │ gy₂ₖ₋₁│ Vector{$FC}│ 64│
- │ gy₂ₖ│ Vector{$FC}│ 64│
- │ x│ Vector{$FC}│ 32│
- │ M⁻¹vₖ₋₁│ Vector{$FC}│ 32│
- │ M⁻¹vₖ│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 32│
- │ gx₂ₖ₋₃│ Vector{$FC}│ 32│
- │ gx₂ₖ₋₂│ Vector{$FC}│ 32│
- │ gx₂ₖ₋₁│ Vector{$FC}│ 32│
- │ gx₂ₖ│ Vector{$FC}│ 32│
- │ Δx│ Vector{$FC}│ 0│
- │ Δy│ Vector{$FC}│ 0│
- │ uₖ│ Vector{$FC}│ 0│
- │ vₖ│ Vector{$FC}│ 0│
- │ warm_start│ Bool│ 0│
- └───────────┴───────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
-
- io = IOBuffer()
- show(io, gpmr_solver, show_stats=false)
- showed = String(take!(io))
- expected = """
- ┌──────────┬───────────────────┬─────────────────┐
- │GpmrSolver│ Precision: $FC │Architecture: CPU│
- ├──────────┼───────────────────┼─────────────────┤
- │ Attribute│ Type│ Size│
- ├──────────┼───────────────────┼─────────────────┤
- │ wA│ Vector{$FC}│ 0│
- │ wB│ Vector{$FC}│ 0│
- │ dA│ Vector{$FC}│ 64│
- │ dB│ Vector{$FC}│ 32│
- │ Δx│ Vector{$FC}│ 0│
- │ Δy│ Vector{$FC}│ 0│
- │ x│ Vector{$FC}│ 64│
- │ y│ Vector{$FC}│ 32│
- │ q│ Vector{$FC}│ 0│
- │ p│ Vector{$FC}│ 0│
- │ V│Vector{Vector{$FC}}│ 10 x 64│
- │ U│Vector{Vector{$FC}}│ 10 x 32│
- │ gs│ Vector{$FC}│ 40│
- │ gc│ Vector{$T}│ 40│
- │ zt│ Vector{$FC}│ 20│
- │ R│ Vector{$FC}│ 210│
- │warm_start│ Bool│ 0│
- └──────────┴───────────────────┴─────────────────┘
- """
- @test reduce(replace, [" " => "", "\n" => "", "─" => ""], init=showed) == reduce(replace, [" " => "", "\n" => "", "─" => ""], init=expected)
end
@testset "solvers" begin
diff --git a/test/test_stats.jl b/test/test_stats.jl
index 4289a78a3..186c56c20 100644
--- a/test/test_stats.jl
+++ b/test/test_stats.jl
@@ -4,7 +4,7 @@
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """Simple stats
+ expected = """SimpleStats
niter: 0
solved: true
inconsistent: true
@@ -15,14 +15,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.LsmrStats(0, true, true, Float64[1.0], Float64[2.0], Float64(3.0), Float64(4.0), Float64(5.0), Float64(6.0), Float64(7.0), "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """Lsmr stats
+ expected = """LsmrStats
niter: 0
solved: true
inconsistent: true
@@ -37,14 +38,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.LanczosStats(0, true, Float64[3.0], true, NaN, NaN, "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """Lanczos stats
+ expected = """LanczosStats
niter: 0
solved: true
residuals: [ 3.0e+00 ]
@@ -55,14 +57,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.LanczosShiftStats(0, true, [Float64[0.9, 0.5], Float64[0.6, 0.4, 0.1]], BitVector([false, true]), NaN, NaN, "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """LanczosShift stats
+ expected = """LanczosShiftStats
niter: 0
solved: true
residuals: [[0.9, 0.5], [0.6, 0.4, 0.1]]
@@ -70,16 +73,17 @@
‖A‖F: NaN
κ₂(A): NaN
status: t"""
- @test (VERSION < v"1.5") || strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
+ @test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.SymmlqStats(0, true, Float64[4.0], Union{Float64,Missing}[5.0, missing], Float64[6.0], Union{Float64,Missing}[7.0, missing], NaN, NaN, "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """Symmlq stats
+ expected = """SymmlqStats
niter: 0
solved: true
residuals: [ 4.0e+00 ]
@@ -92,14 +96,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.AdjointStats(0, true, true, Float64[8.0], Float64[9.0], "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """Adjoint stats
+ expected = """AdjointStats
niter: 0
solved primal: true
solved dual: true
@@ -109,14 +114,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.LNLQStats(0, true, Float64[10.0], false, Float64[11.0], Float64[12.0], "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """LNLQ stats
+ expected = """LNLQStats
niter: 0
solved: true
residuals: [ 1.0e+01 ]
@@ -127,14 +133,15 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
stats = Krylov.LSLQStats(0, true, false, Float64[13.0], Float64[14.0], Float64[15.0], false, Float64[16.0], Float64[17.0], "t")
io = IOBuffer()
show(io, stats)
showed = String(take!(io))
storage_type = typeof(stats)
- expected = """LSLQ stats
+ expected = """LSLQStats
niter: 0
solved: true
inconsistent: false
@@ -148,5 +155,6 @@
@test strip.(split(chomp(showed), "\n")) == strip.(split(chomp(expected), "\n"))
Krylov.reset!(stats)
check_reset(stats)
- @test (VERSION < v"1.5") || (@allocated Krylov.reset!(stats)) == 0
+ nbytes_allocated = @allocated Krylov.reset!(stats)
+ @test nbytes_allocated == 0
end
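The recurring change in test_stats.jl drops the `VERSION < v"1.5"` escape hatch, consistent with the CI now targeting Julia 1.6 and later, and splits the allocation check into a capture followed by a `@test`, keeping the `@allocated` measurement out of the test expression itself. A minimal self-contained sketch of the pattern; `reset_example!` is a hypothetical stand-in for `Krylov.reset!`:

using Test

# In-place reset that allocates nothing, mimicking Krylov.reset!.
function reset_example!(v::Vector{Float64})
    fill!(v, 0.0)
    return v
end

v = ones(10)
reset_example!(v)                       # warm-up call so compilation is not measured
nbytes_allocated = @allocated reset_example!(v)
@test nbytes_allocated == 0
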
diff --git a/test/test_trilqr.jl b/test/test_trilqr.jl
index 7d7927372..baf8a597e 100644
--- a/test/test_trilqr.jl
+++ b/test/test_trilqr.jl
@@ -74,7 +74,7 @@
@test(resid_dual ≤ trilqr_tol)
@test(stats.solved_dual)
- # Test consistent Ax = b and inconsistent Aᵀt = c.
+ # Test consistent Ax = b and inconsistent Aᴴt = c.
A, b, c = rectangular_adjoint(FC=FC)
(x, t, stats) = trilqr(A, b, c)
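The comment fix above is more than notation: in Julia, `A'` is `adjoint(A)`, the conjugate transpose, so for complex element types the dual problem solved alongside `Ax = b` is `Aᴴt = c` rather than `Aᵀt = c`. A small illustrative example, not part of the test suite:

A = [1.0 + 2.0im  0.0im; 3.0im  1.0 + 0.0im]
A'            # adjoint(A): the conjugate transpose Aᴴ
transpose(A)  # the plain transpose Aᵀ; differs from A' whenever A is complex
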
diff --git a/test/test_utils.jl b/test/test_utils.jl
index ed72056b6..f1c3ca44e 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -1,50 +1,51 @@
include("get_div_grad.jl")
include("gen_lsq.jl")
include("check_min_norm.jl")
+include("callback_utils.jl")
# Symmetric and positive definite systems.
function symmetric_definite(n :: Int=10; FC=Float64)
- α = FC <: Complex ? im : 1
+ α = FC <: Complex ? FC(im) : one(FC)
A = spdiagm(-1 => α * ones(FC, n-1), 0 => 4 * ones(FC, n), 1 => conj(α) * ones(FC, n-1))
- b = A * [1:n;]
+ b = A * FC[1:n;]
return A, b
end
# Symmetric and indefinite systems.
function symmetric_indefinite(n :: Int=10; FC=Float64)
- α = FC <: Complex ? im : 1
+ α = FC <: Complex ? FC(im) : one(FC)
A = spdiagm(-1 => α * ones(FC, n-1), 0 => ones(FC, n), 1 => conj(α) * ones(FC, n-1))
- b = A * [1:n;]
+ b = A * FC[1:n;]
return A, b
end
# Nonsymmetric and positive definite systems.
function nonsymmetric_definite(n :: Int=10; FC=Float64)
if FC <: Complex
- A = [i == j ? n * one(FC) : im * one(FC) for i=1:n, j=1:n]
+ A = [i == j ? n * one(FC) : FC(im) * one(FC) for i=1:n, j=1:n]
else
A = [i == j ? n * one(FC) : i < j ? one(FC) : -one(FC) for i=1:n, j=1:n]
end
- b = A * [1:n;]
+ b = A * FC[1:n;]
return A, b
end
# Nonsymmetric and indefinite systems.
function nonsymmetric_indefinite(n :: Int=10; FC=Float64)
if FC <: Complex
- A = [i == j ? n * (-one(FC))^(i*j) : im * one(FC) for i=1:n, j=1:n]
+ A = [i == j ? n * (-one(FC))^(i*j) : FC(im) * one(FC) for i=1:n, j=1:n]
else
A = [i == j ? n * (-one(FC))^(i*j) : i < j ? one(FC) : -one(FC) for i=1:n, j=1:n]
end
- b = A * [1:n;]
+ b = A * FC[1:n;]
return A, b
end
# Underdetermined and consistent systems.
function under_consistent(n :: Int=10, m :: Int=25; FC=Float64)
n < m || error("Square or overdetermined system!")
- α = FC <: Complex ? im : 1
- A = [i/j - α * j/i for i=1:n, j=1:m]
+ α = FC <: Complex ? FC(im) : one(FC)
+ A = FC[i/j - α * j/i for i=1:n, j=1:m]
b = A * ones(FC, m)
return A, b
end
@@ -52,7 +53,7 @@ end
# Underdetermined and inconsistent systems.
function under_inconsistent(n :: Int=10, m :: Int=25; FC=Float64)
n < m || error("Square or overdetermined system!")
- α = FC <: Complex ? 1 + im : 1
+ α = FC <: Complex ? FC(1 + im) : one(FC)
A = α * ones(FC, n, m)
b = [i == 1 ? -one(FC) : i * one(FC) for i=1:n]
return A, b
@@ -84,8 +85,8 @@ end
# Overdetermined and consistent systems.
function over_consistent(n :: Int=25, m :: Int=10; FC=Float64)
n > m || error("Underdetermined or square system!")
- α = FC <: Complex ? im : 1
- A = [i/j - α * j/i for i=1:n, j=1:m]
+ α = FC <: Complex ? FC(im) : one(FC)
+ A = FC[i/j - α * j/i for i=1:n, j=1:m]
b = A * ones(FC, m)
return A, b
end
@@ -93,7 +94,7 @@ end
# Overdetermined and inconsistent systems.
function over_inconsistent(n :: Int=25, m :: Int=10; FC=Float64)
n > m || error("Underdetermined or square system!")
- α = FC <: Complex ? 1 + im : 1
+ α = FC <: Complex ? FC(1 + im) : one(FC)
A = α * ones(FC, n, m)
b = [i == 1 ? -one(FC) : i * one(FC) for i=1:n]
return A, b
@@ -162,23 +163,23 @@ end
function underdetermined_adjoint(n :: Int=100, m :: Int=200; FC=Float64)
n < m || error("Square or overdetermined system!")
A = [i == j ? FC(10.0) : i < j ? one(FC) : -one(FC) for i=1:n, j=1:m]
- b = A * [1:m;]
- c = A' * [-n:-1;]
+ b = A * FC[1:m;]
+ c = A' * FC[-n:-1;]
return A, b, c
end
# Square consistent adjoint systems.
function square_adjoint(n :: Int=100; FC=Float64)
A = [i == j ? FC(10.0) : i < j ? one(FC) : -one(FC) for i=1:n, j=1:n]
- b = A * [1:n;]
- c = A' * [-n:-1;]
+ b = A * FC[1:n;]
+ c = A' * FC[-n:-1;]
return A, b, c
end
-# Adjoint systems with Ax = b underdetermined consistent and Aᵀt = c overdetermined insconsistent.
+# Adjoint systems with Ax = b underdetermined consistent and Aᴴt = c overdetermined inconsistent.
function rectangular_adjoint(n :: Int=10, m :: Int=25; FC=Float64)
- Aᵀ, c = over_inconsistent(m, n; FC=FC)
- A = adjoint(Aᵀ)
+ Aᴴ, c = over_inconsistent(m, n; FC=FC)
+ A = adjoint(Aᴴ)
b = A * ones(FC, m)
return A, b, c
end
@@ -187,8 +188,8 @@ end
function overdetermined_adjoint(n :: Int=200, m :: Int=100; FC=Float64)
n > m || error("Underdetermined or square system!")
A = [i == j ? FC(10.0) : i < j ? one(FC) : -one(FC) for i=1:n, j=1:m]
- b = A * [1:m;]
- c = A' * [-n:-1;]
+ b = A * FC[1:m;]
+ c = A' * FC[-n:-1;]
return A, b, c
end
@@ -251,7 +252,7 @@ end
# Square and preconditioned problems.
function square_preconditioned(n :: Int=10; FC=Float64)
A = ones(FC, n, n) + (n-1) * eye(n)
- b = FC(10.0) * [1:n;]
+ b = 10 * FC[1:n;]
M⁻¹ = FC(1/n) * eye(n)
return A, b, M⁻¹
end
@@ -363,110 +364,3 @@ function check_reset(stats :: KS) where KS <: Krylov.KrylovStats
end
end
end
-
-# Test callback
-mutable struct TestCallbackN2{T, S, M}
- A::M
- b::S
- storage_vec::S
- tol::T
-end
-TestCallbackN2(A, b; tol = 0.1) = TestCallbackN2(A, b, similar(b), tol)
-
-function (cb_n2::TestCallbackN2)(solver)
- mul!(cb_n2.storage_vec, cb_n2.A, solver.x)
- cb_n2.storage_vec .-= cb_n2.b
- return norm(cb_n2.storage_vec) ≤ cb_n2.tol
-end
-
-mutable struct TestCallbackN2Adjoint{T, S, M}
- A::M
- b::S
- c::S
- storage_vec1::S
- storage_vec2::S
- tol::T
-end
-TestCallbackN2Adjoint(A, b, c; tol = 0.1) = TestCallbackN2Adjoint(A, b, c, similar(b), similar(c), tol)
-
-function (cb_n2::TestCallbackN2Adjoint)(solver)
- mul!(cb_n2.storage_vec1, cb_n2.A, solver.x)
- cb_n2.storage_vec1 .-= cb_n2.b
- mul!(cb_n2.storage_vec2, cb_n2.A', solver.y)
- cb_n2.storage_vec2 .-= cb_n2.c
- return (norm(cb_n2.storage_vec1) ≤ cb_n2.tol && norm(cb_n2.storage_vec2) ≤ cb_n2.tol)
-end
-
-mutable struct TestCallbackN2Shifts{T, S, M}
- A::M
- b::S
- shifts::Vector{T}
- tol::T
-end
-TestCallbackN2Shifts(A, b, shifts; tol = 0.1) = TestCallbackN2Shifts(A, b, shifts, tol)
-
-function (cb_n2::TestCallbackN2Shifts)(solver)
- r = residuals(cb_n2.A, cb_n2.b, cb_n2.shifts, solver.x)
- return all(map(norm, r) .≤ cb_n2.tol)
-end
-
-mutable struct TestCallbackN2LS{T, S, M}
- A::M
- b::S
- λ::T
- storage_vec1::S
- storage_vec2::S
- tol::T
-end
-TestCallbackN2LS(A, b, λ; tol = 0.1) = TestCallbackN2LS(A, b, λ, similar(b), similar(b, size(A, 2)), tol)
-
-function (cb_n2::TestCallbackN2LS)(solver)
- mul!(cb_n2.storage_vec1, cb_n2.A, solver.x)
- cb_n2.storage_vec1 .-= cb_n2.b
- mul!(cb_n2.storage_vec2, cb_n2.A', cb_n2.storage_vec1)
- cb_n2.storage_vec2 .+= cb_n2.λ .* solver.x
- return norm(cb_n2.storage_vec2) ≤ cb_n2.tol
-end
-
-mutable struct TestCallbackN2LN{T, S, M}
- A::M
- b::S
- λ::T
- storage_vec::S
- tol::T
-end
-TestCallbackN2LN(A, b, λ; tol = 0.1) = TestCallbackN2LN(A, b, λ, similar(b), tol)
-
-function (cb_n2::TestCallbackN2LN)(solver)
- mul!(cb_n2.storage_vec, cb_n2.A, solver.x)
- cb_n2.storage_vec .-= cb_n2.b
- cb_n2.λ != 0 && (cb_n2.storage_vec .+= sqrt(cb_n2.λ) .* solver.s)
- return norm(cb_n2.storage_vec) ≤ cb_n2.tol
-end
-
-mutable struct TestCallbackN2SaddlePts{T, S, M}
- A::M
- b::S
- c::S
- storage_vec1::S
- storage_vec2::S
- tol::T
-end
-TestCallbackN2SaddlePts(A, b, c; tol = 0.1) =
- TestCallbackN2SaddlePts(A, b, c, similar(b), similar(c), tol)
-
-function (cb_n2::TestCallbackN2SaddlePts)(solver)
- mul!(cb_n2.storage_vec1, cb_n2.A, solver.y)
- cb_n2.storage_vec1 .+= solver.x .- cb_n2.b
- mul!(cb_n2.storage_vec2, cb_n2.A', solver.x)
- cb_n2.storage_vec2 .-= solver.y .+ cb_n2.c
- return (norm(cb_n2.storage_vec1) ≤ cb_n2.tol && norm(cb_n2.storage_vec2) ≤ cb_n2.tol)
-end
-
-function restarted_gmres_callback_n2(solver::GmresSolver, A, b, stor, N, storage_vec, tol)
- get_x_restarted_gmres!(solver, A, stor, N)
- x = stor.x
- mul!(storage_vec, A, x)
- storage_vec .-= b
- return (norm(storage_vec) ≤ tol)
-end
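Two themes run through the test_utils.jl changes: the callback helper types deleted at the end were evidently moved into the newly included callback_utils.jl, and the problem generators now pin element types explicitly. Prefixing an array literal or comprehension with a type, as in `FC[1:n;]`, builds a `Vector{FC}` instead of a `Vector{Int}`, and `FC(im)` / `one(FC)` keep the scalar `α` in the same type, so the right-hand sides are built directly in the intended precision. A minimal sketch of the idiom:

FC = ComplexF32
b_int = [1:3;]    # Vector{Int}
b_fc  = FC[1:3;]  # Vector{ComplexF32}: the eltype is forced by the FC prefix
α = FC <: Complex ? FC(im) : one(FC)   # a scalar of type FC, not an untyped im
A = FC[i/j - α * j/i for i=1:3, j=1:3] # typed comprehension, Matrix{ComplexF32}
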
diff --git a/test/test_warm_start.jl b/test/test_warm_start.jl
index 66a1cbea7..232a5a9cf 100644
--- a/test/test_warm_start.jl
+++ b/test/test_warm_start.jl
@@ -70,6 +70,11 @@ function test_warm_start(FC)
resid = norm(r) / norm(b)
@test(resid ≤ tol)
+ x, stats = fgmres(A, b, x0)
+ r = b - A * x
+ resid = norm(r) / norm(b)
+ @test(resid ≤ tol)
+
x, stats = bicgstab(A, b, x0)
r = b - A * x
resid = norm(r) / norm(b)