Take fewer locks (#705)

JuliaGPU · Dec 3, 2024 · fadefca · fadefca
1 parent ba79add
commit fadefca
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 22 deletions.
diff --git a/src/blas/highlevel.jl b/src/blas/highlevel.jl
@@ -90,6 +90,16 @@ end
 # BLAS 2
 #
 
+const ROCUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{
+    <:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
+const ROCLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{
+    <:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
+
+LinearAlgebra.istriu(::ROCUpperOrUnitUpperTriangular) = true
+LinearAlgebra.istril(::ROCUpperOrUnitUpperTriangular) = false
+LinearAlgebra.istriu(::ROCLowerOrUnitLowerTriangular) = false
+LinearAlgebra.istril(::ROCLowerOrUnitLowerTriangular) = true
+
 # multiplication
 LinearAlgebra.generic_trimatmul!(
     c::StridedROCVector{T}, uploc, isunitc, tfun::Function,

diff --git a/src/dnn/convolution.jl b/src/dnn/convolution.jl
@@ -38,10 +38,10 @@ get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHM
 get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE
 
 function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS
-    perf_results = lock(get_conv_cache_type(conv_type)) do cache
-        get(cache, conv_args, nothing)
-    end
-    isnothing(perf_results) && return nothing
+    cache = get_conv_cache_type(conv_type).payload
+    perf_results = get(cache, conv_args, nothing)
+    perf_results ≡ nothing && return nothing
+
     workspace = ROCArray{UInt8}(undef, perf_results.memory)
     perf_results, workspace
 end

diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl
@@ -35,13 +35,18 @@ end
 const CONTEXTS = AMDGPU.LockedObject(Dict{HIPDevice,HIPContext}())
 
 function HIPContext(device::HIPDevice)
-    lock(CONTEXTS) do contexts
+    contexts = CONTEXTS.payload
+
+    ctx = get(contexts, device, nothing)
+    ctx ≡ nothing || return ctx
+
+    Base.@lock CONTEXTS.lock begin
         get!(contexts, device) do
-            context_ref = Ref{hipContext_t}()
-            hipCtxCreate(context_ref, Cuint(0), device.device)
-            context = HIPContext(context_ref[], true)
+            ctx_ref = Ref{hipContext_t}()
+            hipCtxCreate(ctx_ref, Cuint(0), device.device)
+            ctx = HIPContext(ctx_ref[], true)
             device!(device)
-            return context
+            return ctx
         end
     end
 end

diff --git a/src/memory.jl b/src/memory.jl
@@ -119,8 +119,14 @@ MemoryStats() = MemoryStats(0, 0.0, 0, 0.0, 0.0, 0)
 const MEMORY_STATS = AMDGPU.LockedObject(Dict{Int, MemoryStats}())
 
 function memory_stats(dev::HIPDevice = AMDGPU.device())
-    Base.lock(MEMORY_STATS) do ms
-        get!(() -> MemoryStats(), ms, HIP.device_id(dev))
+    ms = MEMORY_STATS.payload
+    did = HIP.device_id(dev)
+
+    stats = get(ms, did, nothing)
+    stats ≡ nothing || return stats
+
+    Base.@lock MEMORY_STATS.lock begin
+        get!(() -> MemoryStats(), ms, did)
     end
 end
 
@@ -158,7 +164,7 @@ function maybe_collect(; blocking::Bool = false)
 
     # Check if we are under memory pressure.
     pressure = stats.live / stats.size
-    min_pressure = 0.5
+    min_pressure = blocking ? 0.5 : 0.75
     pressure < min_pressure && return
 
     # TODO take allocations into account
@@ -175,22 +181,21 @@ function maybe_collect(; blocking::Bool = false)
     blocking && (max_gc_rate *= 2;)
 
     # And even more if the pressure is high.
-    pressure > 0.6 && (max_gc_rate *= 2;)
-    pressure > 0.8 && (max_gc_rate *= 2;)
-    # Always try to collect if pressure ≥ 0.9.
-    gc_rate > max_gc_rate && pressure < 0.9 && return
+    pressure > 0.9 && (max_gc_rate *= 2;)
+    pressure > 0.95 && (max_gc_rate *= 2;)
+    gc_rate > max_gc_rate && return
 
     Base.@atomic stats.last_time = current_time
 
     # Call the GC.
     pre_gc_live = stats.live
-    gc_time = Base.@elapsed GC.gc(pressure > 0.8 ? true : false)
+    gc_time = Base.@elapsed GC.gc(false)
     post_gc_live = stats.live
 
     # Update stats.
     freed = pre_gc_live - post_gc_live
     Base.@atomic stats.last_freed = freed
-    Base.@atomic stats.last_gc_time = gc_time
+    Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time
     return
 end
 

diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
@@ -1,10 +1,15 @@
 # Device ID => HIPMemoryPool
-const MEMORY_POOLS = AMDGPU.LockedObject(
-    Dict{Int64, HIP.HIPMemoryPool}())
+const MEMORY_POOLS = AMDGPU.LockedObject(Dict{Int64, HIP.HIPMemoryPool}())
 
 function pool_create(dev::HIPDevice)
-    Base.lock(MEMORY_POOLS) do pools
-        get!(pools, HIP.device_id(dev)) do
+    mp = MEMORY_POOLS.payload
+    did = HIP.device_id(dev)
+
+    pool = get(mp, did, nothing)
+    pool ≡ nothing || return pool
+
+    Base.@lock MEMORY_POOLS.lock begin
+        get!(mp, HIP.device_id(dev)) do
             max_size::UInt64 = AMDGPU.hard_memory_limit()
             max_size = max_size != typemax(UInt64) ? max_size : UInt64(0)