Merge pull request #61 from SymbolicML:bump-alloc3

Bump allocator version of expression evaluation
SymbolicML · Feb 3, 2024 · 1f1ad6c · 1f1ad6c · MilesCranmer · Feb 3, 2024
2 parents 17f04ad + 15844ad
commit 1f1ad6c
Show file tree

Hide file tree

Showing 18 changed files with 562 additions and 175 deletions.
diff --git a/Project.toml b/Project.toml
@@ -6,7 +6,6 @@ version = "0.15.0"
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
@@ -16,17 +15,22 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 
 [weakdeps]
+Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [extensions]
+DynamicExpressionsBumperExt = "Bumper"
+DynamicExpressionsLoopVectorizationExt = "LoopVectorization"
 DynamicExpressionsOptimExt = "Optim"
 DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils"
 DynamicExpressionsZygoteExt = "Zygote"
 
 [compat]
 Aqua = "0.7"
+Bumper = "0.6"
 Compat = "3.37, 4"
 Enzyme = "^0.11.12"
 LoopVectorization = "0.12"
@@ -41,8 +45,10 @@ julia = "1.6"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
@@ -52,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "SafeTestsets", "Aqua", "Enzyme", "Optim", "ForwardDiff", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"]
+test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"]
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -1,4 +1,7 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -1,6 +1,12 @@
 using DynamicExpressions, BenchmarkTools, Random
 using DynamicExpressions.EquationUtilsModule: is_constant
+
+# Trigger extensions:
+using LoopVectorization
+using Bumper
+using StrideArrays
 using Zygote
+
 if PACKAGE_VERSION < v"0.14.0"
     @eval using DynamicExpressions: Node as GraphNode
 else
@@ -27,27 +33,46 @@ function benchmark_evaluation()
         n = 1_000
 
         #! format: off
-        for turbo in (false, true)
-            if turbo && !(T in (Float32, Float64))
-                continue
+        for turbo in (false, true), bumper in (false, true)
+
+            (turbo || bumper) && !(T in (Float32, Float64)) && continue
+            if bumper
+                try
+                    eval_tree_array(Node{T}(val=1.0), ones(T, 5, n), operators; turbo, bumper)
+                catch e
+                    isa(e, MethodError) || rethrow(e)
+                    @warn "Skipping bumper tests"
+                    continue  # Assume it's not available
+                end
+            end
+
+            extra_key = if turbo && bumper
+                "_turbo_bumper"
+            elseif turbo
+                "_turbo"
+            elseif bumper
+                "_bumper"
+            else
+                ""
             end
-            extra_key = turbo ? "_turbo" : ""
+            extra_kws = bumper ? (; bumper=Val(true)) : ()
             eval_tree_array(
                 gen_random_tree_fixed_size(20, operators, 5, T),
                 randn(MersenneTwister(0), T, 5, n),
                 operators;
-                turbo=turbo
+                turbo,
+                extra_kws...
             )
             suite[T]["evaluation$(extra_key)"] = @benchmarkable(
-                [eval_tree_array(tree, X, $operators; turbo=$turbo) for tree in trees],
+                [eval_tree_array(tree, X, $operators; turbo=$turbo, $extra_kws...) for tree in trees],
                 setup=(
                     X=randn(MersenneTwister(0), $T, 5, $n);
                     treesize=20;
                     ntrees=100;
                     trees=[gen_random_tree_fixed_size(treesize, $operators, 5, $T) for _ in 1:ntrees]
                 )
             )
-            if T <: Real
+            if T <: Real && !bumper
                 eval_grad_tree_array(
                     gen_random_tree_fixed_size(20, operators, 5, T),
                     randn(MersenneTwister(0), T, 5, n),

diff --git a/docs/src/eval.md b/docs/src/eval.md
@@ -13,25 +13,23 @@ Assuming you are only using a single `OperatorEnum`, you can also use
 the following shorthand by using the expression as a function:
 
 ```
-    (tree::Node)(X::AbstractMatrix, operators::GenericOperatorEnum; throw_errors::Bool=true)
+    (tree::AbstractExpressionNode)(X::AbstractMatrix{T}, operators::OperatorEnum; turbo::Union{Bool,Val}=false, bumper::Union{Bool,Val}=Val(false))
+
+Evaluate a binary tree (equation) over a given input data matrix. The
+operators contain all of the operators used. This function fuses doublets
+and triplets of operations for lower memory usage.
 
 # Arguments
-- `X::AbstractArray`: The input data to evaluate the tree on.
-- `operators::GenericOperatorEnum`: The operators used in the tree.
-- `throw_errors::Bool=true`: Whether to throw errors
-    if they occur during evaluation. Otherwise,
-    MethodErrors will be caught before they happen and
-    evaluation will return `nothing`,
-    rather than throwing an error. This is useful in cases
-    where you are unsure if a particular tree is valid or not,
-    and would prefer to work with `nothing` as an output.
+- `tree::AbstractExpressionNode`: The root node of the tree to evaluate.
+- `X::AbstractMatrix{T}`: The input data to evaluate the tree on.
+- `operators::OperatorEnum`: The operators used in the tree.
+- `turbo::Union{Bool,Val}`: Use LoopVectorization.jl for faster evaluation.
+- `bumper::Union{Bool,Val}`: Use Bumper.jl for faster evaluation.
 
 # Returns
-- `output`: the result of the evaluation.
-    If evaluation failed, `nothing` will be returned for the first argument.
-    A `false` complete means an operator was called on input types
-    that it was not defined for. You can change this behavior by
-    setting `throw_errors=false`.
+- `output::AbstractVector{T}`: the result, which is a 1D array.
+    Any NaN, Inf, or other failure during the evaluation will result in the entire
+    output array being set to NaN.
 ```
 
 For example,
@@ -98,7 +96,7 @@ all variables (or, all constants). Both use forward-mode automatic, but use
 
 ```@docs
 eval_diff_tree_array(tree::Node{T}, cX::AbstractMatrix{T}, operators::OperatorEnum, direction::Integer) where {T<:Number}
-eval_grad_tree_array(tree::Node{T}, cX::AbstractMatrix{T}, operators::OperatorEnum; turbo::Bool=false, variable::Bool=false) where {T<:Number}
+eval_grad_tree_array(tree::Node{T}, cX::AbstractMatrix{T}, operators::OperatorEnum) where {T<:Number}
 ```
 
 You can compute gradients with this shorthand notation as well (which by default computes

diff --git a/ext/DynamicExpressionsBumperExt.jl b/ext/DynamicExpressionsBumperExt.jl
@@ -0,0 +1,104 @@
+module DynamicExpressionsBumperExt
+
+using Bumper: @no_escape, @alloc
+using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce
+using DynamicExpressions.UtilsModule: ResultOk, counttuple, is_bad_array
+
+import DynamicExpressions.ExtensionInterfaceModule:
+    bumper_eval_tree_array, bumper_kern1!, bumper_kern2!
+
+function bumper_eval_tree_array(  # evaluate `tree` on every column of `cX`; returns `(result, ok)`
+    tree::AbstractExpressionNode{T},
+    cX::AbstractMatrix{T},
+    operators::OperatorEnum,
+    ::Val{turbo},  # forwarded unchanged to the kernels as `Val(turbo)`
+) where {T,turbo}
+    result = similar(cX, axes(cX, 2))  # output vector indexed like the columns of `cX`
+    n = size(cX, 2)  # number of columns; length of every intermediate buffer
+    all_ok = Ref(false)  # written inside the `@no_escape` block, read after it
+    @no_escape begin  # NOTE(review): per Bumper.jl, `@alloc` buffers are only valid inside this block — confirm
+        _result_ok = tree_mapreduce(
+            # Leaf nodes, we create an allocation and fill
+            # it with the value of the leaf:
+            leaf_node -> begin
+                ar = @alloc(T, n)  # per-leaf scratch buffer of length `n`
+                ok = if leaf_node.constant
+                    v = leaf_node.val::T
+                    ar .= v
+                    isfinite(v)  # a non-finite constant marks this leaf as not ok
+                else
+                    ar .= view(cX, leaf_node.feature, :)  # copy row `leaf_node.feature` of `cX`
+                    true  # feature rows are not checked for finiteness here
+                end
+                ResultOk(ar, ok)
+            end,
+            # Branch nodes, we simply pass them to the evaluation kernel:
+            branch_node -> branch_node,
+            # In the evaluation kernel, we combine the branch nodes
+            # with the arrays created by the leaf nodes:
+            ((args::Vararg{Any,M}) where {M}) ->
+                dispatch_kerns!(operators, args..., Val(turbo)),  # args = (branch_node, child results...)
+            tree;
+            break_sharing=Val(true),
+        )
+        x = _result_ok.x
+        result .= x  # copy into `result`, which was allocated outside the block
+        all_ok[] = _result_ok.ok
+    end
+    return (result, all_ok[])  # `result` is filled unconditionally, even when the flag is false
+end
+
+function dispatch_kerns!(operators, branch_node, cumulator, ::Val{turbo}) where {turbo}  # one child result: unary operator
+    cumulator.ok || return cumulator  # child already failed: propagate it without evaluating
+
+    out = dispatch_kern1!(operators.unaops, branch_node.op, cumulator.x, Val(turbo))  # `branch_node.op` indexes `operators.unaops`
+    return ResultOk(out, !is_bad_array(out))  # ok flag is the negation of `is_bad_array(out)`
+end
+function dispatch_kerns!(  # two child results: binary operator
+    operators, branch_node, cumulator1, cumulator2, ::Val{turbo}
+) where {turbo}
+    cumulator1.ok || return cumulator1  # first failed child wins; nothing is evaluated
+    cumulator2.ok || return cumulator2
+
+    out = dispatch_kern2!(  # `branch_node.op` indexes `operators.binops`
+        operators.binops, branch_node.op, cumulator1.x, cumulator2.x, Val(turbo)
+    )
+    return ResultOk(out, !is_bad_array(out))  # ok flag is the negation of `is_bad_array(out)`
+end
+
+@generated function dispatch_kern1!(unaops, op_idx, cumulator, ::Val{turbo}) where {turbo}  # pick the unary operator by index
+    nuna = counttuple(unaops)  # here `unaops` is the argument's type; presumably yields the tuple length — confirm
+    quote
+        Base.@nif(  # expands to an if/elseif chain over i = 1:nuna
+            $nuna,  # NOTE(review): per Base.Cartesian.@nif the last index is the else-branch, taken without testing the condition
+            i -> i == op_idx,  # take the branch whose index equals the node's operator index
+            i -> let op = unaops[i]  # `i` is a literal in each generated branch
+                return bumper_kern1!(op, cumulator, Val(turbo))
+            end,
+        )
+    end
+end
+@generated function dispatch_kern2!(  # pick the binary operator by index
+    binops, op_idx, cumulator1, cumulator2, ::Val{turbo}
+) where {turbo}
+    nbin = counttuple(binops)  # here `binops` is the argument's type; presumably yields the tuple length — confirm
+    quote
+        Base.@nif(  # expands to an if/elseif chain over i = 1:nbin
+            $nbin,  # NOTE(review): per Base.Cartesian.@nif the last index is the else-branch, taken without testing the condition
+            i -> i == op_idx,  # take the branch whose index equals the node's operator index
+            i -> let op = binops[i]  # `i` is a literal in each generated branch
+                return bumper_kern2!(op, cumulator1, cumulator2, Val(turbo))
+            end,
+        )
+    end
+end
+function bumper_kern1!(op::F, cumulator, ::Val{false}) where {F}  # method for `Val(false)` only; no `Val{true}` method appears in this file
+    @. cumulator = op(cumulator)  # overwrite `cumulator` elementwise with `op` applied to it
+    return cumulator  # the same array that was passed in
+end
+function bumper_kern2!(op::F, cumulator1, cumulator2, ::Val{false}) where {F}  # method for `Val(false)` only; no `Val{true}` method appears in this file
+    @. cumulator1 = op(cumulator1, cumulator2)  # result overwrites the first operand; `cumulator2` is only read
+    return cumulator1  # the same array that was passed in as the first operand
+end
+
+end