From 107ed64479a9666cd3c6e4f24a27827de0320d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Sat, 20 Jun 2020 10:37:36 +0200 Subject: [PATCH 01/12] Make SIMDops.@explicit act on the following expr ...rather than the entire scope. This will allow fine-tuning which operations can and cannot be fused. --- Project.toml | 1 + src/SIMDops.jl | 20 +++++++++++----- src/accumulators/compDot.jl | 10 ++++---- src/accumulators/compSum.jl | 8 ++----- src/accumulators/dot.jl | 6 ++--- src/accumulators/sum.jl | 6 ++--- src/errorfree.jl | 47 ++++++++++++++++--------------------- test/runtests.jl | 16 ++++++------- 8 files changed, 53 insertions(+), 61 deletions(-) diff --git a/Project.toml b/Project.toml index 0ff39de..25fe864 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.3.3" [deps] JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" diff --git a/src/SIMDops.jl b/src/SIMDops.jl index 3342050..5670413 100644 --- a/src/SIMDops.jl +++ b/src/SIMDops.jl @@ -1,4 +1,5 @@ module SIMDops +import MacroTools import SIMDPirates using SIMDPirates: Vec, vload, vsum, vabs, vfma, vifelse, vless @@ -16,12 +17,19 @@ emul(x::T, y::T) where {T<:NTuple} = SIMDPirates.evmul(x, y) fma(x...) = Base.fma(x...) 
fma(x::T, y::T, z::T) where {T<:NTuple} = SIMDPirates.vfma(x, y, z) -macro explicit() - quote - $(esc(:+)) = eadd - $(esc(:-)) = esub - $(esc(:*)) = emul - end +macro explicit(expr) + MacroTools.postwalk(expr) do x + x == :+ && return eadd + x == :- && return esub + x == :* && return emul + x == :fma && return fma + + if MacroTools.@capture(x, a_ += b_) + return :($a = $eadd($a, $b)) + end + + return x + end |> esc end vzero(x) = zero(x) diff --git a/src/accumulators/compDot.jl b/src/accumulators/compDot.jl index a13ac68..cdcfc9c 100644 --- a/src/accumulators/compDot.jl +++ b/src/accumulators/compDot.jl @@ -6,18 +6,16 @@ end compDotAcc(T) = CompDotAcc{T}(vzero(T), vzero(T)) @inline function add!(acc::CompDotAcc{T}, x::T, y::T) where {T} - SIMDops.@explicit - p, ep = two_prod(x, y) acc.s, es = two_sum(acc.s, p) - acc.e += ep + es + + SIMDops.@explicit acc.e += ep + es end @inline function add!(acc::A, x::A) where {A<:CompDotAcc} - SIMDops.@explicit - acc.s, e = two_sum(acc.s, x.s) - acc.e += x.e + e + + SIMDops.@explicit acc.e += x.e + e end @inline function Base.sum(acc::CompDotAcc{T}) where {T<:Vec} diff --git a/src/accumulators/compSum.jl b/src/accumulators/compSum.jl index e7213f2..9b06655 100644 --- a/src/accumulators/compSum.jl +++ b/src/accumulators/compSum.jl @@ -7,17 +7,13 @@ end @inline compSumAcc(EFT, T) = CompSumAcc{T, EFT}(vzero(T), vzero(T)) @inline function add!(acc::CompSumAcc{T, EFT}, x::T) where {T, EFT} - SIMDops.@explicit - acc.s, e = EFT(acc.s, x) - acc.e += e + SIMDops.@explicit acc.e += e end @inline function add!(acc::A, x::A) where {A<:CompSumAcc{T, EFT}} where {T, EFT} - SIMDops.@explicit - acc.s, e = EFT(acc.s, x.s) - acc.e += x.e + e + SIMDops.@explicit acc.e += x.e + e end @inline function Base.sum(acc::CompSumAcc{T, EFT}) where {T<:Vec, EFT} diff --git a/src/accumulators/dot.jl b/src/accumulators/dot.jl index 43a85a9..28b00de 100644 --- a/src/accumulators/dot.jl +++ b/src/accumulators/dot.jl @@ -5,13 +5,11 @@ end dotAcc(T) = 
DotAcc{T}(vzero(T)) function add!(acc::DotAcc, x, y) - SIMDops.@explicit - acc.s += x * y + SIMDops.@explicit acc.s += x * y end function add!(acc::DotAcc{T}, x::DotAcc{T}) where {T} - SIMDops.@explicit - acc.s += x.s + SIMDops.@explicit acc.s += x.s end Base.sum(acc::DotAcc{T}) where {T<:Vec} = DotAcc(vsum(acc.s)) diff --git a/src/accumulators/sum.jl b/src/accumulators/sum.jl index 6768ed9..7e9941a 100644 --- a/src/accumulators/sum.jl +++ b/src/accumulators/sum.jl @@ -5,13 +5,11 @@ end sumAcc(T) = SumAcc{T}(vzero(T)) function add!(acc::SumAcc, x) - SIMDops.@explicit - acc.s += x + SIMDops.@explicit acc.s += x end function add!(acc::SumAcc{T}, x::SumAcc{T}) where {T} - SIMDops.@explicit - acc.s += x.s + SIMDops.@explicit acc.s += x.s end Base.sum(acc::SumAcc{T}) where {T<:Vec} = SumAcc(vsum(acc.s)) diff --git a/src/errorfree.jl b/src/errorfree.jl index 655ab70..aa69974 100644 --- a/src/errorfree.jl +++ b/src/errorfree.jl @@ -8,12 +8,12 @@ This algorithm does not use any branch. See also `fast_two_sum` for an alternative algorithm which branches but does fewer arithmetic operations. """ @inline function two_sum(a::T, b::T) where {T} - SIMDops.@explicit - - hi = a + b - v = hi - a - lo = (a - (hi - v)) + (b - v) - return hi, lo + SIMDops.@explicit begin + hi = a + b + v = hi - a + lo = (a - (hi - v)) + (b - v) + return hi, lo + end end """ @@ -43,18 +43,18 @@ algorithm which performs more arithmetic operations, but does not branch. end @inline function fast_two_sum(a::T, b::T) where T <: NTuple - SIMDops.@explicit + SIMDops.@explicit begin + x = a + b - x = a + b - - t = vless(vabs(a), vabs(b)) - a_ = vifelse(t, b, a) - b_ = vifelse(t, a, b) + t = vless(vabs(a), vabs(b)) + a_ = vifelse(t, b, a) + b_ = vifelse(t, a, b) - z = x - a_ - e = b_ - z + z = x - a_ + e = b_ - z - x, e + x, e + end end @@ -205,18 +205,11 @@ end Computes `s = fl(a*b)` and `e = err(a*b)`. 
""" @inline function two_prod(a::T, b::T) where {T} - p = a * b - e = fma(a, b, -p) - p, e -end - -@inline function two_prod(a::T, b::T) where {T<:NTuple} - SIMDops.@explicit - p = a * b - # TODO: add vfma to @explicit so that this method can be merged with the - # generic one - e = vfma(a, b, -p) - p, e + SIMDops.@explicit begin + p = a * b + e = fma(a, b, -p) + p, e + end end diff --git a/test/runtests.jl b/test/runtests.jl index c11b657..e8b7ed6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,8 +101,7 @@ using LinearAlgebra end using BenchmarkTools - -acc = compSumAcc(two_sum) +BLAS.set_num_threads(1) BenchmarkTools.DEFAULT_PARAMETERS.evals = 1000 println("\nsize 10_000") @@ -116,12 +115,13 @@ print(" sum_kbn"); @btime sum_kbn($(rand(1_000_000))) println("\nsize 100_000_000") x = rand(100_000_000) -print(" sum_oro"); @btime sum_oro($x) -print(" sum_kbn"); @btime sum_naive($x) -print(" sum "); @btime sum($x) +print(" sum_kbn "); @btime sum_kbn($x) +print(" sum_oro "); @btime sum_oro($x) +print(" sum_naive"); @btime sum_naive($x) +print(" sum "); @btime sum($x) println() y = rand(100_000_000) -print(" dot_oro"); @btime dot_oro($x, $y) -print(" dot_kbn"); @btime dot_naive($x, $y) -print(" dot "); @btime dot($x, $y) +print(" dot_oro "); @btime dot_oro($x, $y) +print(" dot_naive"); @btime dot_naive($x, $y) +print(" dot "); @btime dot($x, $y) From c7665337c6a20cb7aea653c9a12b63c8193eb994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Sat, 20 Jun 2020 12:05:27 +0200 Subject: [PATCH 02/12] Added an @fusible counterpart to SIMDops.@explicit Hopefully, this will allow fusing a few non-essential SIMD ops. 
--- src/SIMDops.jl | 45 ++++++++++++++++++++++++++++++++----- src/accumulators/compDot.jl | 4 ++-- src/accumulators/compSum.jl | 4 ++-- src/accumulators/dot.jl | 4 ++-- src/accumulators/sum.jl | 4 ++-- src/errorfree.jl | 26 +++++++++------------ 6 files changed, 58 insertions(+), 29 deletions(-) diff --git a/src/SIMDops.jl b/src/SIMDops.jl index 5670413..cace849 100644 --- a/src/SIMDops.jl +++ b/src/SIMDops.jl @@ -3,6 +3,19 @@ import MacroTools import SIMDPirates using SIMDPirates: Vec, vload, vsum, vabs, vfma, vifelse, vless +# * Generic operations on SIMD packs + +fma(x...) = Base.fma(x...) +fma(x::T, y::T, z::T) where {T<:NTuple} = SIMDPirates.vfma(x, y, z) + +vzero(x) = zero(x) +vzero(::Type{Vec{W, T}}) where {W, T} = SIMDPirates.vbroadcast(Vec{W, T}, 0) + +fptype(::Type{Vec{W, T}}) where {W, T} = T + + +# * Explicit/exact operations (non-fusible) + eadd(x...) = Base.:+(x...) eadd(x::T, y::T) where {T<:NTuple} = SIMDPirates.evadd(x, y) @@ -14,9 +27,6 @@ esub(x::T, y::T) where {T<:NTuple} = SIMDPirates.evsub(x, y) emul(x...) = Base.:*(x...) emul(x::T, y::T) where {T<:NTuple} = SIMDPirates.evmul(x, y) -fma(x...) = Base.fma(x...) -fma(x::T, y::T, z::T) where {T<:NTuple} = SIMDPirates.vfma(x, y, z) - macro explicit(expr) MacroTools.postwalk(expr) do x x == :+ && return eadd @@ -32,9 +42,32 @@ macro explicit(expr) end |> esc end -vzero(x) = zero(x) -vzero(::Type{Vec{W, T}}) where {W, T} = SIMDPirates.vbroadcast(Vec{W, T}, 0) -fptype(::Type{Vec{W, T}}) where {W, T} = T +# * Fast/fusible operations + +fadd(x...) = Base.:+(x...) +fadd(x::T, y::T) where {T<:NTuple} = SIMDPirates.vadd(x, y) + +fsub(x...) = Base.:-(x...) +fsub(x::NTuple) = broadcast(esub, x) +fsub(x::T, y::T) where {T<:NTuple} = SIMDPirates.vsub(x, y) + +fmul(x...) = Base.:*(x...) 
+fmul(x::T, y::T) where {T<:NTuple} = SIMDPirates.vmul(x, y) + +macro fusible(expr) + MacroTools.postwalk(expr) do x + x == :+ && return fadd + x == :- && return fsub + x == :* && return fmul + x == :fma && return fma + + if MacroTools.@capture(x, a_ += b_) + return :($a = $fadd($a, $b)) + end + + return x + end |> esc +end end diff --git a/src/accumulators/compDot.jl b/src/accumulators/compDot.jl index cdcfc9c..768d0f9 100644 --- a/src/accumulators/compDot.jl +++ b/src/accumulators/compDot.jl @@ -9,13 +9,13 @@ compDotAcc(T) = CompDotAcc{T}(vzero(T), vzero(T)) p, ep = two_prod(x, y) acc.s, es = two_sum(acc.s, p) - SIMDops.@explicit acc.e += ep + es + SIMDops.@fusible acc.e += ep + es end @inline function add!(acc::A, x::A) where {A<:CompDotAcc} acc.s, e = two_sum(acc.s, x.s) - SIMDops.@explicit acc.e += x.e + e + SIMDops.@fusible acc.e += x.e + e end @inline function Base.sum(acc::CompDotAcc{T}) where {T<:Vec} diff --git a/src/accumulators/compSum.jl b/src/accumulators/compSum.jl index 9b06655..d23c4a8 100644 --- a/src/accumulators/compSum.jl +++ b/src/accumulators/compSum.jl @@ -8,12 +8,12 @@ end @inline function add!(acc::CompSumAcc{T, EFT}, x::T) where {T, EFT} acc.s, e = EFT(acc.s, x) - SIMDops.@explicit acc.e += e + SIMDops.@fusible acc.e += e end @inline function add!(acc::A, x::A) where {A<:CompSumAcc{T, EFT}} where {T, EFT} acc.s, e = EFT(acc.s, x.s) - SIMDops.@explicit acc.e += x.e + e + SIMDops.@fusible acc.e += x.e + e end @inline function Base.sum(acc::CompSumAcc{T, EFT}) where {T<:Vec, EFT} diff --git a/src/accumulators/dot.jl b/src/accumulators/dot.jl index 28b00de..777a8c3 100644 --- a/src/accumulators/dot.jl +++ b/src/accumulators/dot.jl @@ -5,11 +5,11 @@ end dotAcc(T) = DotAcc{T}(vzero(T)) function add!(acc::DotAcc, x, y) - SIMDops.@explicit acc.s += x * y + SIMDops.@fusible acc.s += x * y end function add!(acc::DotAcc{T}, x::DotAcc{T}) where {T} - SIMDops.@explicit acc.s += x.s + SIMDops.@fusible acc.s += x.s end Base.sum(acc::DotAcc{T}) where 
{T<:Vec} = DotAcc(vsum(acc.s)) diff --git a/src/accumulators/sum.jl b/src/accumulators/sum.jl index 7e9941a..6e36e2c 100644 --- a/src/accumulators/sum.jl +++ b/src/accumulators/sum.jl @@ -5,11 +5,11 @@ end sumAcc(T) = SumAcc{T}(vzero(T)) function add!(acc::SumAcc, x) - SIMDops.@explicit acc.s += x + SIMDops.@fusible acc.s += x end function add!(acc::SumAcc{T}, x::SumAcc{T}) where {T} - SIMDops.@explicit acc.s += x.s + SIMDops.@fusible acc.s += x.s end Base.sum(acc::SumAcc{T}) where {T<:Vec} = SumAcc(vsum(acc.s)) diff --git a/src/errorfree.jl b/src/errorfree.jl index aa69974..d90501b 100644 --- a/src/errorfree.jl +++ b/src/errorfree.jl @@ -8,12 +8,10 @@ This algorithm does not use any branch. See also `fast_two_sum` for an alternative algorithm which branches but does fewer arithmetic operations. """ @inline function two_sum(a::T, b::T) where {T} - SIMDops.@explicit begin - hi = a + b - v = hi - a - lo = (a - (hi - v)) + (b - v) - return hi, lo - end + SIMDops.@fusible hi = a + b + SIMDops.@explicit v = hi - a + SIMDops.@explicit lo = (a - (hi - v)) + (b - v) + return hi, lo end """ @@ -43,18 +41,16 @@ algorithm which performs more arithmetic operations, but does not branch. 
end @inline function fast_two_sum(a::T, b::T) where T <: NTuple - SIMDops.@explicit begin - x = a + b + SIMDops.@fusible x = a + b - t = vless(vabs(a), vabs(b)) - a_ = vifelse(t, b, a) - b_ = vifelse(t, a, b) + t = vless(vabs(a), vabs(b)) + a_ = vifelse(t, b, a) + b_ = vifelse(t, a, b) - z = x - a_ - e = b_ - z + SIMDops.@explicit z = x - a_ + SIMDops.@explicit e = b_ - z - x, e - end + x, e end From 87fcacc6182380733770d7b84f78e1de7978fc91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Sat, 20 Jun 2020 13:04:32 +0200 Subject: [PATCH 03/12] Added v1.5 to tested Julia versions --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 857d38d..b7d8420 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ os: julia: - 1.3 - 1.4 + - 1.5 - nightly matrix: fast_finish: true From 1695e3dd917fac34412f78a3ead76a843e85299a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Sat, 20 Jun 2020 20:59:55 +0200 Subject: [PATCH 04/12] Use vzero from SIMDPirates --- src/SIMDops.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/SIMDops.jl b/src/SIMDops.jl index cace849..f06a0ab 100644 --- a/src/SIMDops.jl +++ b/src/SIMDops.jl @@ -1,16 +1,14 @@ module SIMDops import MacroTools import SIMDPirates -using SIMDPirates: Vec, vload, vsum, vabs, vfma, vifelse, vless +using SIMDPirates: Vec, vload, vsum, vabs, vfma, vifelse, vless, vzero + # * Generic operations on SIMD packs fma(x...) = Base.fma(x...) 
fma(x::T, y::T, z::T) where {T<:NTuple} = SIMDPirates.vfma(x, y, z) -vzero(x) = zero(x) -vzero(::Type{Vec{W, T}}) where {W, T} = SIMDPirates.vbroadcast(Vec{W, T}, 0) - fptype(::Type{Vec{W, T}}) where {W, T} = T From 0bcaeece8a856b706255ad7c364c25d392480a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Sat, 20 Jun 2020 22:30:49 +0200 Subject: [PATCH 05/12] Normalized info on performance tests for automated extraction --- test/perftests.jl | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/perftests.jl b/test/perftests.jl index caee6fb..e568f1e 100644 --- a/test/perftests.jl +++ b/test/perftests.jl @@ -1,3 +1,6 @@ +import Pkg +Pkg.activate(joinpath(@__DIR__, "..")) + println("Loading required packages...") using LinearAlgebra, Random, Printf, Statistics using BenchmarkTools, JSON @@ -238,3 +241,31 @@ function run_tests(fast=false) run_performance() println("Normal end of the performance tests") end + +if abspath(PROGRAM_FILE) == @__FILE__ + date = open(readline, `git show --no-patch --pretty="%cd" --date="format:%Y-%m-%d.%H:%M:%S" HEAD`) + sha1 = open(readline, `git show --no-patch --pretty="%h" HEAD`) |> String + cpu = open(read, `lscpu`) |> String + cpu = match(r"Model name:\s*(.*)\s+CPU", cpu)[1] + cpu = replace(cpu, r"\(\S+\)" => "") + cpu = replace(strip(cpu), r"\s+" => ".") + jobname = "$(date)_$(sha1)_$(VERSION)_$(cpu)" + open("jobname", "w") do f + write(f, jobname) + end + open("info.json", "w") do f + JSON.print(f, Dict( + :job => jobname, + :date => date, + :sha1 => sha1, + :julia => string(VERSION), + :cpu => cpu)) + end + + println("\nJob name: $jobname") + println("\nGit commit: $sha1 ($date)"); run(`git show --no-patch --oneline HEAD`) + println("\nJulia version: $VERSION"); println(Base.julia_cmd()) + println("\nCPU: $cpu"); run(`lscpu`) + + run_tests("fast" in ARGS) +end From 10f2a74fded8947b156c36e30930269508afe2a4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Mon, 22 Jun 2020 11:10:30 +0200 Subject: [PATCH 06/12] Added a parameter to tune cache prefetching Not sure what an optimal value would be yet (or even what it would depend on). Performance tests were added, in a similar way to Ushift --- src/Summation.jl | 39 +++++++++++++++++------ test/perftests.jl | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 10 deletions(-) diff --git a/src/Summation.jl b/src/Summation.jl index 05e6feb..d0b6c8b 100644 --- a/src/Summation.jl +++ b/src/Summation.jl @@ -6,6 +6,7 @@ import VectorizationBase import ..SIMDops using ..SIMDops: Vec, vload, vsum, vzero, fptype +using SIMDPirates using ..EFT: two_sum, fast_two_sum, two_prod @@ -19,10 +20,11 @@ include("accumulators/compDot.jl") # SIAM Journal on Scientific Computing, 6(26), 2005. # DOI: 10.1137/030601818 @generated function accumulate(x::NTuple{A, AbstractArray{T}}, - accType::F, - rem_handling = Val(:scalar), - ::Val{Ushift} = Val(2) - ) where {F, A, T <: Union{Float32,Float64}, Ushift} + accType::F, + rem_handling = Val(:scalar), + ::Val{Ushift} = Val(2), + ::Val{Prefetch} = Val(0), + ) where {F, A, T <: Union{Float32,Float64}, Ushift, Prefetch} @assert 0 ≤ Ushift < 6 U = 1 << Ushift @@ -43,6 +45,10 @@ include("accumulators/compDot.jl") Nshift = N >> $(shift + Ushift) offset = 0 for n in 1:Nshift + if $Prefetch > 0 + SIMDPirates.prefetch.(px.+offset.+$(Prefetch*WT), Val(3), Val(0)) + end + Base.Cartesian.@nexprs $U u -> begin xi = vload.($V, px.+offset) add!(acc_u, xi...) 
@@ -85,11 +91,24 @@ include("accumulators/compDot.jl") end end -sum_naive(x) = accumulate((x,), sumAcc, Val(:scalar), Val(3)) -sum_kbn(x) = accumulate((x,), compSumAcc(fast_two_sum), Val(:scalar), Val(2)) -sum_oro(x) = accumulate((x,), compSumAcc(two_sum), Val(:scalar), Val(2)) - -dot_naive(x, y) = accumulate((x,y), dotAcc, Val(:scalar), Val(3)) -dot_oro(x, y) = accumulate((x,y), compDotAcc, Val(:scalar), Val(2)) +# Dispatch +# either default_ushift(x, acc) +# or default_ushift((x,), acc) +default_ushift(x::NTuple, acc) = default_ushift(first(x), acc) +default_ushift(x::AbstractArray, acc) = default_ushift(acc(eltype(x))) +# Default values for Ushift +default_ushift(::SumAcc) = Val(3) +default_ushift(::CompSumAcc) = Val(2) +default_ushift(::DotAcc) = Val(3) +default_ushift(::CompDotAcc) = Val(2) + +_sum(x, acc) = accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), Val(30)) +sum_naive(x) = _sum(x, sumAcc) +sum_kbn(x) = _sum(x, compSumAcc(fast_two_sum)) +sum_oro(x) = _sum(x, compSumAcc(two_sum)) + +_dot(x, y, acc) = accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), Val(30)) +dot_naive(x, y) = _dot(x, y, dotAcc) +dot_oro(x, y) = _dot(x, y, compDotAcc) end diff --git a/test/perftests.jl b/test/perftests.jl index e568f1e..1093ab0 100644 --- a/test/perftests.jl +++ b/test/perftests.jl @@ -8,6 +8,7 @@ using BenchmarkTools, JSON using AccurateArithmetic using AccurateArithmetic.Summation: accumulate, two_sum, fast_two_sum using AccurateArithmetic.Summation: sumAcc, dotAcc, compSumAcc, compDotAcc +using AccurateArithmetic.Summation: default_ushift using AccurateArithmetic.Test output(x) = @printf "%.2e " x @@ -157,6 +158,82 @@ function run_ushift() end + +# * Optimal prefetch + +function run_prefetch(gen, acc, title, filename) + println("-> $title") + + if FAST_TESTS + title *= " [FAST]" + sizes = [2^(3*i) for i in (3,5,7)] + prefetch = [0, 20, 40] + BenchmarkTools.DEFAULT_PARAMETERS.evals = 1 + BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5 + else + 
sizes = [2^(3*i) for i in 2:8] + prefetch = 0:4:60 + BenchmarkTools.DEFAULT_PARAMETERS.evals = 2 + BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5.0 + end + println(" sizes: $sizes") + + data = [[] for _ in 1:(1+length(sizes))] + for pref in prefetch + i = 1 + print(pref, " ") + push!(data[i], pref) + + for n in sizes + i += 1 + x = gen(n) + + ushift = AccurateArithmetic.Summation.default_ushift(x, acc) + + b = @benchmark accumulate($x, $acc, $(Val(:scalar)), $ushift, $(Val(pref))) + t = minimum(b.times) / n + output(t) + push!(data[i], t) + end + println() + end + + open(filename*".json", "w") do f + JSON.print(f, Dict( + :type => :prefetch, + :title => title, + :labels => sizes, + :data => data)) + end +end + + +function run_prefetch() + BenchmarkTools.DEFAULT_PARAMETERS.evals = 2 + println("Finding optimal prefetch...") + + run_prefetch(n->(rand(n),), sumAcc, + "Performance of naive summation", + "sum_naive_prefetch") + + run_prefetch(n->(rand(n),), compSumAcc(two_sum), + "Performance of ORO summation", + "sum_oro_prefetch") + + run_prefetch(n->(rand(n),), compSumAcc(fast_two_sum), + "Performance of KBN summation", + "sum_kbn_prefetch") + + run_prefetch(n->(rand(n), rand(n)), dotAcc, + "Performance of naive dot product", + "dot_naive_prefetch") + + run_prefetch(n->(rand(n), rand(n)), compDotAcc, + "Performance of compensated dot product", + "dot_oro_prefetch") +end + + # * Performance comparisons @@ -238,6 +315,8 @@ function run_tests(fast=false) sleep(5) run_ushift() sleep(5) + run_prefetch() + sleep(5) run_performance() println("Normal end of the performance tests") end From 7d137ab6fb8e404fe7070746262147dc1acf458e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Mon, 22 Jun 2020 15:44:37 +0200 Subject: [PATCH 07/12] Dropped the dependency on MacroTools This also reduces (pre)compilation times --- Project.toml | 3 -- src/SIMDops.jl | 84 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 29 
deletions(-) diff --git a/Project.toml b/Project.toml index 25fe864..dac6b60 100644 --- a/Project.toml +++ b/Project.toml @@ -7,15 +7,12 @@ version = "0.3.3" [deps] JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] -BenchmarkTools = "0" JSON = "0" -Plots = "0" SIMDPirates = "0.2, 0.3, 0.5, 0.6.3, 0.7, 0.8" VectorizationBase = "0" julia = "1" diff --git a/src/SIMDops.jl b/src/SIMDops.jl index f06a0ab..984ebbf 100644 --- a/src/SIMDops.jl +++ b/src/SIMDops.jl @@ -1,5 +1,4 @@ module SIMDops -import MacroTools import SIMDPirates using SIMDPirates: Vec, vload, vsum, vabs, vfma, vifelse, vless, vzero @@ -12,7 +11,40 @@ fma(x::T, y::T, z::T) where {T<:NTuple} = SIMDPirates.vfma(x, y, z) fptype(::Type{Vec{W, T}}) where {W, T} = T -# * Explicit/exact operations (non-fusible) +# * Macros rewriting mathematical operations + +# ** Helper functions + +replace_simd_ops(x, _, _) = x + +function replace_simd_ops(sym::Symbol, ops, updates) + # Replace e.g: + + # => eadd + for (op, replacement) in ops + sym === op && return replacement + end + sym +end + +function replace_simd_ops(expr::Expr, ops, updates) + # Replace e.g: lhs += rhs + # => lhs = eadd(lhs, rhs) + for (op, replacement) in updates + if expr.head === op + lhs = expr.args[1] + rhs = replace_simd_ops(expr.args[2], ops, updates) + return :($lhs = $replacement($lhs, $rhs)) + end + end + + newexpr = Expr(replace_simd_ops(expr.head, ops, updates)) + for arg in expr.args + push!(newexpr.args, replace_simd_ops(arg, ops, updates)) + end + newexpr +end + +# ** Explicit/exact operations (non-fusible) eadd(x...) = Base.:+(x...) 
eadd(x::T, y::T) where {T<:NTuple} = SIMDPirates.evadd(x, y) @@ -25,19 +57,19 @@ esub(x::T, y::T) where {T<:NTuple} = SIMDPirates.evsub(x, y) emul(x...) = Base.:*(x...) emul(x::T, y::T) where {T<:NTuple} = SIMDPirates.evmul(x, y) -macro explicit(expr) - MacroTools.postwalk(expr) do x - x == :+ && return eadd - x == :- && return esub - x == :* && return emul - x == :fma && return fma - - if MacroTools.@capture(x, a_ += b_) - return :($a = $eadd($a, $b)) - end +function explicit(expr::Expr) + replace_simd_ops(expr, + (:+ => eadd, + :- => esub, + :* => emul, + :fma => fma), + (:+= => eadd, + :-= => esub, + :*= => emul)) +end - return x - end |> esc +macro explicit(expr) + explicit(expr) |> esc end @@ -53,19 +85,19 @@ fsub(x::T, y::T) where {T<:NTuple} = SIMDPirates.vsub(x, y) fmul(x...) = Base.:*(x...) fmul(x::T, y::T) where {T<:NTuple} = SIMDPirates.vmul(x, y) -macro fusible(expr) - MacroTools.postwalk(expr) do x - x == :+ && return fadd - x == :- && return fsub - x == :* && return fmul - x == :fma && return fma - - if MacroTools.@capture(x, a_ += b_) - return :($a = $fadd($a, $b)) - end +function fusible(expr::Expr) + replace_simd_ops(expr, + (:+ => fadd, + :- => fsub, + :* => fmul, + :fma => fma), + (:+= => fadd, + :-= => fsub, + :*= => fmul)) +end - return x - end |> esc +macro fusible(expr) + fusible(expr) |> esc end end From c8063499aaf3d519744d8d130e775c20d6999dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Mon, 22 Jun 2020 16:57:29 +0200 Subject: [PATCH 08/12] Default values for cache prefetching Optimal values for this parameter seem to vary a lot across architectures. The default values might not be very well chosen... 
--- src/Summation.jl | 29 +++++++++++++++++++++-------- test/perfplots.py | 32 +++++++++++++++++++++++++++++--- test/perftests.jl | 2 +- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/Summation.jl b/src/Summation.jl index d0b6c8b..e51fd93 100644 --- a/src/Summation.jl +++ b/src/Summation.jl @@ -91,23 +91,36 @@ include("accumulators/compDot.jl") end end -# Dispatch -# either default_ushift(x, acc) -# or default_ushift((x,), acc) -default_ushift(x::NTuple, acc) = default_ushift(first(x), acc) -default_ushift(x::AbstractArray, acc) = default_ushift(acc(eltype(x))) -# Default values for Ushift +# Default values for unrolling default_ushift(::SumAcc) = Val(3) default_ushift(::CompSumAcc) = Val(2) default_ushift(::DotAcc) = Val(3) default_ushift(::CompDotAcc) = Val(2) +# dispatch +# either default_ushift(x, acc) +# or default_ushift((x,), acc) +default_ushift(x::AbstractArray, acc) = default_ushift(acc(eltype(x))) +default_ushift(x::NTuple, acc) = default_ushift(first(x), acc) + + +# Default values for cache prefetching +default_prefetch(::SumAcc) = Val(0) +default_prefetch(::CompSumAcc) = Val(35) +default_prefetch(::DotAcc) = Val(0) +default_prefetch(::CompDotAcc) = Val(20) +# dispatch +# either default_prefetch(x, acc) +# or default_prefetch((x,), acc) +default_prefetch(x::AbstractArray, acc) = length(x)<500 ? 
Val(0) : default_prefetch(acc(eltype(x))) +default_prefetch(x::NTuple, acc) = default_prefetch(first(x), acc) + -_sum(x, acc) = accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), Val(30)) +_sum(x, acc) = accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) sum_naive(x) = _sum(x, sumAcc) sum_kbn(x) = _sum(x, compSumAcc(fast_two_sum)) sum_oro(x) = _sum(x, compSumAcc(two_sum)) -_dot(x, y, acc) = accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), Val(30)) +_dot(x, y, acc) = accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) dot_naive(x, y) = _dot(x, y, dotAcc) dot_oro(x, y) = _dot(x, y, compDotAcc) diff --git a/test/perfplots.py b/test/perfplots.py index 2b49e83..a856986 100644 --- a/test/perfplots.py +++ b/test/perfplots.py @@ -57,13 +57,37 @@ def plot_ushift(filename, results): p.legend("north east") +def plot_prefetch(filename, results): + title = results["title"] + data = results["data"] + labels = results["labels"] + + with Plot(filename) as p: + p.title = title.encode() + + p.x.label = "Cache prefetch" + #p.x.ticks = range(0,4+1) + + p.y.label = "Time [ns/elem]" + p.y.label_rotate = 90 + p.y.min = 0 + p.y.ticks = 0.05 + + for i in xrange(1, len(labels)): + p.plot(zip(data[0], data[i+1]), + title="%d elems" % labels[i]) + + p.legend("south east") + def plot_performance(filename, results): title = results["title"] data = results["data"] labels = results["labels"] - with open("cache.json", "r") as f: - cache = json.load(f) + with open("info.json", "r") as f: + cpu = json.load(f)["cpu"] + with open("../../cache.json", "r") as f: + cache = json.load(f)[cpu] with Plot("%s" % filename) as p: p.title = title.encode() @@ -122,13 +146,15 @@ def plot_results(filename): plot_accuracy(filename, results) elif results["type"] == "ushift": plot_ushift(filename, results) + elif results["type"] == "prefetch": + plot_prefetch(filename, results) elif results["type"] == "performance": 
plot_performance(filename, results) def plot_all(): print("Generating all plots...") for filename in os.listdir(os.getcwd()): - if filename == "cache.json": + if filename == "info.json": continue if filename.endswith(".json"): plot_results(filename.replace(".json", "")) diff --git a/test/perftests.jl b/test/perftests.jl index 1093ab0..8384e0b 100644 --- a/test/perftests.jl +++ b/test/perftests.jl @@ -322,7 +322,7 @@ function run_tests(fast=false) end if abspath(PROGRAM_FILE) == @__FILE__ - date = open(readline, `git show --no-patch --pretty="%cd" --date="format:%Y-%m-%d.%H:%M:%S" HEAD`) + date = open(readline, `git show --no-patch --pretty="%cd" --date="format:%Y-%m-%d.%H%M%S" HEAD`) sha1 = open(readline, `git show --no-patch --pretty="%h" HEAD`) |> String cpu = open(read, `lscpu`) |> String cpu = match(r"Model name:\s*(.*)\s+CPU", cpu)[1] From bd5f4adea25625cd7c2f8eb6dad76f846f815edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Mon, 22 Jun 2020 21:17:14 +0200 Subject: [PATCH 09/12] Fixed performance issues for small vectors Removed two allocations (not sure why small union optimization didn't handle this...) 
--- src/Summation.jl | 41 +++++++++++++++++++++++++++-------------- test/perfplots.py | 18 ++++++++---------- test/runtests.jl | 29 ++++++++++++++++++++++------- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/Summation.jl b/src/Summation.jl index e51fd93..7ca04d0 100644 --- a/src/Summation.jl +++ b/src/Summation.jl @@ -92,35 +92,48 @@ include("accumulators/compDot.jl") end # Default values for unrolling -default_ushift(::SumAcc) = Val(3) -default_ushift(::CompSumAcc) = Val(2) -default_ushift(::DotAcc) = Val(3) -default_ushift(::CompDotAcc) = Val(2) +@inline default_ushift(::SumAcc) = Val(3) +@inline default_ushift(::CompSumAcc) = Val(2) +@inline default_ushift(::DotAcc) = Val(3) +@inline default_ushift(::CompDotAcc) = Val(2) # dispatch # either default_ushift(x, acc) # or default_ushift((x,), acc) -default_ushift(x::AbstractArray, acc) = default_ushift(acc(eltype(x))) -default_ushift(x::NTuple, acc) = default_ushift(first(x), acc) +@inline default_ushift(x::AbstractArray, acc) = default_ushift(acc(eltype(x))) +@inline default_ushift(x::NTuple, acc) = default_ushift(first(x), acc) # Default values for cache prefetching -default_prefetch(::SumAcc) = Val(0) -default_prefetch(::CompSumAcc) = Val(35) -default_prefetch(::DotAcc) = Val(0) -default_prefetch(::CompDotAcc) = Val(20) +@inline default_prefetch(::SumAcc) = Val(0) +@inline default_prefetch(::CompSumAcc) = Val(35) +@inline default_prefetch(::DotAcc) = Val(0) +@inline default_prefetch(::CompDotAcc) = Val(20) # dispatch # either default_prefetch(x, acc) # or default_prefetch((x,), acc) -default_prefetch(x::AbstractArray, acc) = length(x)<500 ? 
Val(0) : default_prefetch(acc(eltype(x))) -default_prefetch(x::NTuple, acc) = default_prefetch(first(x), acc) +@inline default_prefetch(x::AbstractArray, acc) = default_prefetch(acc(eltype(x))) +@inline default_prefetch(x::NTuple, acc) = default_prefetch(first(x), acc) -_sum(x, acc) = accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) +@inline _sum(x, acc) = if length(x) < 500 + # no cache prefetching for small vectors + accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), Val(0)) +else + accumulate((x,), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) +end + sum_naive(x) = _sum(x, sumAcc) sum_kbn(x) = _sum(x, compSumAcc(fast_two_sum)) sum_oro(x) = _sum(x, compSumAcc(two_sum)) -_dot(x, y, acc) = accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) + +@inline _dot(x, y, acc) = if length(x) < 500 + # no cache prefetching for small vectors + accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), Val(0)) +else + accumulate((x,y), acc, Val(:scalar), default_ushift(x, acc), default_prefetch(x, acc)) +end + dot_naive(x, y) = _dot(x, y, dotAcc) dot_oro(x, y) = _dot(x, y, compDotAcc) diff --git a/test/perfplots.py b/test/perfplots.py index a856986..b471839 100644 --- a/test/perfplots.py +++ b/test/perfplots.py @@ -107,13 +107,14 @@ def plot_performance(filename, results): points = zip(data[0], data[i+1]) smoothing = 3 - for j in xrange(len(points)-smoothing): - if points[j][0] < 100: - continue - for k in range(1,smoothing+1): - if points[j][1] > points[j+k][1]: - points[j] = (points[j][0], points[j+k][1]) if smoothing > 0: + for j in xrange(len(points)-smoothing): + points[j] = (points[j][0], min(points[j][1], 1.2*points[-1][1])) + if points[j][0] < 100: + continue + for k in range(1,smoothing+1): + if points[j][1] > points[j+k][1]: + points[j] = (points[j][0], points[j+k][1]) points = points[:-smoothing] p.plot(points, title=labels[i].encode()) @@ -131,10 +132,7 @@ def 
plot_performance(filename, results): p.y.max, lvl.encode()) p.tikz += "\n" - if results["elem_size"] == 16: - p.legend("south east") - else: - p.legend("north east") + p.legend("south east") def plot_results(filename): diff --git a/test/runtests.jl b/test/runtests.jl index e8b7ed6..b69b358 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -102,26 +102,41 @@ end using BenchmarkTools BLAS.set_num_threads(1) - BenchmarkTools.DEFAULT_PARAMETERS.evals = 1000 + +println("\nsize 32") +x = rand(32) +y = rand(32) +print(" sum_kbn"); @btime sum_kbn($x) +print(" sum_oro"); @btime sum_oro($x) +println() +print(" dot_oro"); @btime dot_oro($x, $y) + println("\nsize 10_000") -print(" sum_oro"); @btime sum_oro($(rand(10_000))) -print(" sum_kbn"); @btime sum_kbn($(rand(10_000))) +x = rand(10_000) +y = rand(10_000) +print(" sum_kbn"); @btime sum_kbn($x) +print(" sum_oro"); @btime sum_oro($x) +println() +print(" dot_oro"); @btime dot_oro($x, $y) BenchmarkTools.DEFAULT_PARAMETERS.evals = 10 println("\nsize 1_000_000") -print(" sum_oro"); @btime sum_oro($(rand(1_000_000))) -print(" sum_kbn"); @btime sum_kbn($(rand(1_000_000))) +x = rand(1_000_000) +y = rand(1_000_000) +print(" sum_kbn"); @btime sum_kbn($x) +print(" sum_oro"); @btime sum_oro($x) +println() +print(" dot_oro"); @btime dot_oro($x, $y) println("\nsize 100_000_000") x = rand(100_000_000) +y = rand(100_000_000) print(" sum_kbn "); @btime sum_kbn($x) print(" sum_oro "); @btime sum_oro($x) print(" sum_naive"); @btime sum_naive($x) print(" sum "); @btime sum($x) - println() -y = rand(100_000_000) print(" dot_oro "); @btime dot_oro($x, $y) print(" dot_naive"); @btime dot_naive($x, $y) print(" dot "); @btime dot($x, $y) From 5fb191fe8b9911803f394037b0822c836bff8e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Tue, 23 Jun 2020 10:00:03 +0200 Subject: [PATCH 10/12] Test against MKL (esp. 
for LinearAlgebra.dot) --- test/perftests.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/perftests.jl b/test/perftests.jl index 8384e0b..7971088 100644 --- a/test/perftests.jl +++ b/test/perftests.jl @@ -328,7 +328,8 @@ if abspath(PROGRAM_FILE) == @__FILE__ cpu = match(r"Model name:\s*(.*)\s+CPU", cpu)[1] cpu = replace(cpu, r"\(\S+\)" => "") cpu = replace(strip(cpu), r"\s+" => ".") - jobname = "$(date)_$(sha1)_$(VERSION)_$(cpu)" + blas = BLAS.vendor() |> String + jobname = "$(date)_$(sha1)_$(VERSION)_$(cpu)_$(blas)" open("jobname", "w") do f write(f, jobname) end @@ -338,13 +339,15 @@ if abspath(PROGRAM_FILE) == @__FILE__ :date => date, :sha1 => sha1, :julia => string(VERSION), - :cpu => cpu)) + :cpu => cpu, + :blas => blas)) end println("\nJob name: $jobname") println("\nGit commit: $sha1 ($date)"); run(`git show --no-patch --oneline HEAD`) println("\nJulia version: $VERSION"); println(Base.julia_cmd()) println("\nCPU: $cpu"); run(`lscpu`) + println("\nBLAS vendor: $blas") run_tests("fast" in ARGS) end From 0d37d8c571dc769daf237ed4b13a623377e46d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: Thu, 25 Jun 2020 10:58:24 +0200 Subject: [PATCH 11/12] Updated README.md following improvements with cache prefetching & co --- README.md | 93 +- test/figs/dot_accuracy.svg | 5419 ++++++++++++++++++++++++++ test/figs/dot_performance.svg | 1441 +++++++ test/figs/perf.svg | 261 -- test/figs/qual.svg | 1059 ----- test/figs/sum_accuracy.svg | 6871 +++++++++++++++++++++++++++++++++ test/figs/sum_performance.svg | 1494 +++++++ 7 files changed, 15282 insertions(+), 1356 deletions(-) create mode 100644 test/figs/dot_accuracy.svg create mode 100644 test/figs/dot_performance.svg delete mode 100644 test/figs/perf.svg delete mode 100644 test/figs/qual.svg create mode 100644 test/figs/sum_accuracy.svg create mode 100644 test/figs/sum_performance.svg diff --git a/README.md b/README.md index d0e5457..44b01c1 100644 --- 
a/README.md +++ b/README.md @@ -56,7 +56,7 @@ julia> sum(big.(x)) # But the standard summation algorithms computes this sum very inaccurately # (not even the sign is correct) julia> sum(x) --136.0 +-8.0 # Compensated summation algorithms should compute this more accurately @@ -64,24 +64,26 @@ julia> using AccurateArithmetic # Algorithm by Ogita, Rump and Oishi julia> sum_oro(x) -0.9999999999999716 +1.0000000000000084 # Algorithm by Kahan, Babuska and Neumaier julia> sum_kbn(x) -0.9999999999999716 +1.0000000000000084 ``` -![](test/figs/qual.svg) +![](test/figs/sum_accuracy.svg) +![](test/figs/dot_accuracy.svg) -In the graph above, we see the relative error vary as a function of the +In the graphs above, we see the relative error vary as a function of the condition number, in a log-log scale. Errors lower than ϵ are arbitrarily set to ϵ; conversely, when the relative error is more than 100% (i.e no digit is correctly computed anymore), the error is capped there in order to avoid -affecting the scale of the graph too much. What we see is that the pairwise +affecting the scale of the graph too much. What we see on the left is that the pairwise summation algorithm (as implemented in Base.sum) starts losing accuracy as soon as the condition number increases, computing only noise when the condition -number exceeds 1/ϵ≃10¹⁶. In contrast, both compensated algorithms +number exceeds 1/ϵ≃10¹⁶. The same goes for the naive summation algorithm. +In contrast, both compensated algorithms (Kahan-Babuska-Neumaier and Ogita-Rump-Oishi) still accurately compute the result at this point, and start losing accuracy there, computing meaningless results when the condition nuber reaches 1/ϵ²≃10³². In effect these (simply) @@ -89,6 +91,13 @@ compensated algorithms produce the same results as if a naive summation had been performed with twice the working precision (128 bits in this case), and then rounded to 64-bit floats. 
+The same comments can be made for the dot product implementations shown on the +right. Uncompensated algorithms, as implemented in +`AccurateArithmetic.dot_naive` or `Base.dot` (which internally calls BLAS in +this case) exhibit typical loss of accuracy. In contrast, the implementation of +Ogita, Rump & Oishi's compensated algorithm effectively doubles the working +precision. +
Performancewise, compensated algorithms perform a lot better than alternatives @@ -97,24 +106,31 @@ such as arbitrary precision (`BigFloat`) or rational arithmetic (`Rational`) : ```julia julia> using BenchmarkTools +julia> length(x) +10001 + julia> @btime sum($x) - 1.305 μs (0 allocations: 0 bytes) --136.0 + 1.320 μs (0 allocations: 0 bytes) +-8.0 + +julia> @btime sum_naive($x) + 1.026 μs (0 allocations: 0 bytes) +-1.121325337906356 julia> @btime sum_oro($x) - 3.421 μs (0 allocations: 0 bytes) -0.9999999999999716 + 3.348 μs (0 allocations: 0 bytes) +1.0000000000000084 julia> @btime sum_kbn($x) - 3.792 μs (0 allocations: 0 bytes) -0.9999999999999716 + 3.870 μs (0 allocations: 0 bytes) +1.0000000000000084 -julia> @btime sum(big.($x)) - 874.203 μs (20006 allocations: 1.14 MiB) +julia> @btime sum($(big.(x))) + 437.495 μs (2 allocations: 112 bytes) 1.0 -julia> @btime sum(Rational{BigInt}.(x)) - 22.702 ms (582591 allocations: 10.87 MiB) +julia> @btime sum($(Rational{BigInt}.(x))) + 10.894 ms (259917 allocations: 4.76 MiB) 1//1 ``` @@ -124,32 +140,37 @@ than their naive floating-point counterparts. As such, they usually perform worse. However, leveraging the power of modern architectures via vectorization, the slow down can be kept to a small value. -![](test/figs/perf.svg) - -In the graph above, the time spent in the summation (renormalized per element) -is plotted against the vector size (the units in the y-axis label should be -“ns/elem”). What we see with the standard summation is that, once vectors start -having significant sizes (say, more than 1000 elements), the implementation is -memory bound (as expected of a typical BLAS1 operation). Which is why we see -significant decreases in the performance when the vector can’t fit into the L2 -cache (around 30k elements, or 256kB on my machine) or the L3 cache (around 400k -elements, or 3MB on y machine). 
- -The Ogita-Rump-Oishi algorithm, when implemented with a suitable unrolling level -(ushift=2, i.e 2²=4 unrolled iterations), is CPU-bound when vectors fit inside -the cache. However, when vectors are to large to fit into the L3 cache, the -implementation becomes memory-bound again (on my system), which means we get the -same performance as the standard summation. +![](test/figs/sum_performance.svg) +![](test/figs/dot_performance.svg) + +Benchmarks presented in the above graphs were obtained on an Intel® Xeon® Gold +6128 CPU @ 3.40GHz. The time spent in the summation (renormalized per element) +is plotted against the vector size. What we see with the standard summation is +that, once vectors start having significant sizes (say, more than a few +thousands of elements), the implementation is memory bound (as expected of a +typical BLAS1 operation). Which is why we see significant decreases in the +performance when the vector can’t fit into the L1, L2 or L3 cache. + +On this AVX512-enabled system, the Kahan-Babuska-Neumaier implementation tends +to be a little more efficient than the Ogita-Rump-Oishi algorithm (this would +generally be the opposite for AVX2 systems). When implemented with a suitable +unrolling level and cache prefetching, these implementations are CPU-bound when +vectors fit inside the L1 or L2 cache. However, when vectors are too large to +fit into the L2 cache, the implementation becomes memory-bound again (on this +system), which means we get the same performance as the standard +summation. Again, the same could be said as well for dot product calculations +(graph on the right), where the implementations from `AccurateArithmetic.jl` +compete against MKL's dot product. In other words, the improved accuracy is free for sufficiently large -vectors. For smaller vectors, the accuracy comes with a slow-down that can reach -values slightly above 3 for vectors which fit in the L2 cache. +vectors. 
For smaller vectors, the accuracy comes with a slow-down by a factor of +approximately 3 in the L2 cache. ### Tests The graphs above can be reproduced using the `test/perftests.jl` script in this -repository. Before running them, be aware that it takes around one hour to +repository. Before running them, be aware that it takes around tow hours to generate the performance graph, during which the benchmark machine should be as low-loaded as possible in order to avoid perturbing performance measurements. diff --git a/test/figs/dot_accuracy.svg b/test/figs/dot_accuracy.svg new file mode 100644 index 0000000..21460f6 --- /dev/null +++ b/test/figs/dot_accuracy.svg @@ -0,0 +1,5419 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/figs/dot_performance.svg b/test/figs/dot_performance.svg new file mode 100644 index 0000000..6678749 --- /dev/null +++ b/test/figs/dot_performance.svg @@ -0,0 +1,1441 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/figs/perf.svg b/test/figs/perf.svg deleted file mode 100644 index eabdf54..0000000 --- a/test/figs/perf.svg +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -10 - - -2 - - -10 - - -3 - - -10 - - -4 - - -10 - - -5 - - -10 - - -6 - - -10 - - -7 - - -10 - - -8 - - -0.1 - - -0.2 - - -0.3 - - -0.4 - - -0.5 - - -0.6 - - -0.7 - - -Performance of summation algorithms - - -Vector size - - -Time [µs/elem] - - - - - - - - -sum - - - -oro, ushift=1 - - - -oro, ushift=2 - - diff --git a/test/figs/qual.svg b/test/figs/qual.svg deleted file mode 100644 index 3351556..0000000 --- a/test/figs/qual.svg +++ /dev/null @@ -1,1059 +0,0 @@ - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -10 - - -10 - - -10 - - -20 - - -10 - - -30 - - -10 - - -40 - - -10 - - -- - - -15 - - -10 - - -- - - -10 - - -10 - - -- - - -5 - - -10 - - -0 - - -Error of summation algorithms - - -Condition number - - -Relative error - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -sum - - - - -oro - - - - -kbn - - diff --git a/test/figs/sum_accuracy.svg b/test/figs/sum_accuracy.svg new file mode 100644 index 0000000..011a847 --- /dev/null +++ b/test/figs/sum_accuracy.svg @@ -0,0 +1,6871 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/figs/sum_performance.svg b/test/figs/sum_performance.svg new file mode 100644 index 0000000..b305df8 --- /dev/null +++ b/test/figs/sum_performance.svg @@ -0,0 +1,1494 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From a284b4c681a5a83780109df2a2360c9ff8726c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20F=C3=A9votte?= Date: 
Mon, 29 Jun 2020 09:29:17 +0200 Subject: [PATCH 12/12] Bump version number for release v0.3.4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index dac6b60..691e8e0 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "AccurateArithmetic" uuid = "22286c92-06ac-501d-9306-4abd417d9753" license = "MIT" repo = "https://github.com/JuliaMath/AccurateArithmetic.jl.git" -version = "0.3.3" +version = "0.3.4" [deps] JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"