diff --git a/src/blas/highlevel.jl b/src/blas/highlevel.jl
index fc74a1a92..17017a475 100644
--- a/src/blas/highlevel.jl
+++ b/src/blas/highlevel.jl
@@ -90,6 +90,16 @@ end
 # BLAS 2
 #
 
+const ROCUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{
+    <:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
+const ROCLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{
+    <:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
+
+LinearAlgebra.istriu(::ROCUpperOrUnitUpperTriangular) = true
+LinearAlgebra.istril(::ROCUpperOrUnitUpperTriangular) = false
+LinearAlgebra.istriu(::ROCLowerOrUnitLowerTriangular) = false
+LinearAlgebra.istril(::ROCLowerOrUnitLowerTriangular) = true
+
 # multiplication
 LinearAlgebra.generic_trimatmul!(
     c::StridedROCVector{T}, uploc, isunitc, tfun::Function,
diff --git a/src/dnn/convolution.jl b/src/dnn/convolution.jl
index 4b174d822..5c1278f3f 100644
--- a/src/dnn/convolution.jl
+++ b/src/dnn/convolution.jl
@@ -38,10 +38,10 @@ get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHM
 get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE
 
 function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS
-    perf_results = lock(get_conv_cache_type(conv_type)) do cache
-        get(cache, conv_args, nothing)
-    end
-    isnothing(perf_results) && return nothing
+    cache = get_conv_cache_type(conv_type).payload
+    perf_results = get(cache, conv_args, nothing)
+    perf_results ≡ nothing && return nothing
+
     workspace = ROCArray{UInt8}(undef, perf_results.memory)
     perf_results, workspace
 end
diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl
index 05fa2c20a..889f2d5bb 100644
--- a/src/hip/HIP.jl
+++ b/src/hip/HIP.jl
@@ -35,13 +35,18 @@ end
 const CONTEXTS = AMDGPU.LockedObject(Dict{HIPDevice,HIPContext}())
 
 function HIPContext(device::HIPDevice)
-    lock(CONTEXTS) do contexts
+    contexts = CONTEXTS.payload
+
+    ctx = get(contexts, device, nothing)
+    ctx ≡ nothing || return ctx
+
+    Base.@lock CONTEXTS.lock begin
         get!(contexts, device) do
-            context_ref = Ref{hipContext_t}()
-            hipCtxCreate(context_ref, Cuint(0), device.device)
-            context = HIPContext(context_ref[], true)
+            ctx_ref = Ref{hipContext_t}()
+            hipCtxCreate(ctx_ref, Cuint(0), device.device)
+            ctx = HIPContext(ctx_ref[], true)
             device!(device)
-            return context
+            return ctx
         end
     end
 end
diff --git a/src/memory.jl b/src/memory.jl
index 2e55ca655..524087f86 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -119,8 +119,14 @@ MemoryStats() = MemoryStats(0, 0.0, 0, 0.0, 0.0, 0)
 const MEMORY_STATS = AMDGPU.LockedObject(Dict{Int, MemoryStats}())
 
 function memory_stats(dev::HIPDevice = AMDGPU.device())
-    Base.lock(MEMORY_STATS) do ms
-        get!(() -> MemoryStats(), ms, HIP.device_id(dev))
+    ms = MEMORY_STATS.payload
+    did = HIP.device_id(dev)
+
+    stats = get(ms, did, nothing)
+    stats ≡ nothing || return stats
+
+    Base.@lock MEMORY_STATS.lock begin
+        get!(() -> MemoryStats(), ms, did)
     end
 end
 
@@ -158,7 +164,7 @@ function maybe_collect(; blocking::Bool = false)
 
     # Check if we are under memory pressure.
     pressure = stats.live / stats.size
-    min_pressure = 0.5
+    min_pressure = blocking ? 0.5 : 0.75
     pressure < min_pressure && return
 
     # TODO take allocations into account
@@ -175,22 +181,21 @@ function maybe_collect(; blocking::Bool = false)
 
     blocking && (max_gc_rate *= 2;)
     # And even more if the pressure is high.
-    pressure > 0.6 && (max_gc_rate *= 2;)
-    pressure > 0.8 && (max_gc_rate *= 2;)
-    # Always try to collect if pressure ≥ 0.9.
-    gc_rate > max_gc_rate && pressure < 0.9 && return
+    pressure > 0.9 && (max_gc_rate *= 2;)
+    pressure > 0.95 && (max_gc_rate *= 2;)
+    gc_rate > max_gc_rate && return
 
     Base.@atomic stats.last_time = current_time
 
     # Call the GC.
     pre_gc_live = stats.live
-    gc_time = Base.@elapsed GC.gc(pressure > 0.8 ? true : false)
+    gc_time = Base.@elapsed GC.gc(false)
    post_gc_live = stats.live
 
     # Update stats.
     freed = pre_gc_live - post_gc_live
     Base.@atomic stats.last_freed = freed
-    Base.@atomic stats.last_gc_time = gc_time
+    Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time
     return
 end
diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
index 7a8f37b65..332416763 100644
--- a/src/runtime/memory/hip.jl
+++ b/src/runtime/memory/hip.jl
@@ -1,10 +1,15 @@
 # Device ID => HIPMemoryPool
-const MEMORY_POOLS = AMDGPU.LockedObject(
-    Dict{Int64, HIP.HIPMemoryPool}())
+const MEMORY_POOLS = AMDGPU.LockedObject(Dict{Int64, HIP.HIPMemoryPool}())
 
 function pool_create(dev::HIPDevice)
-    Base.lock(MEMORY_POOLS) do pools
-        get!(pools, HIP.device_id(dev)) do
+    mp = MEMORY_POOLS.payload
+    did = HIP.device_id(dev)
+
+    pool = get(mp, did, nothing)
+    pool ≡ nothing || return pool
+
+    Base.@lock MEMORY_POOLS.lock begin
+        get!(mp, HIP.device_id(dev)) do
             max_size::UInt64 = AMDGPU.hard_memory_limit()
             max_size = max_size != typemax(UInt64) ? max_size : UInt64(0)
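Note on the locking change: the convolution.jl, HIP.jl, memory.jl and runtime/memory/hip.jl hunks all apply the same pattern of reading the LockedObject payload without taking the lock on the lookup fast path, and acquiring the lock only when the entry has to be created. The sketch below is a minimal, self-contained illustration of that pattern, assuming LockedObject exposes payload and lock fields as the diff suggests; LockedCache and cached_get! are hypothetical names used only for this example and are not AMDGPU.jl API.

# Minimal sketch of the unlocked-fast-path / locked-slow-path lookup used in
# the hunks above (illustrative only, not AMDGPU.jl code).
struct LockedCache{K, V}
    lock::ReentrantLock
    payload::Dict{K, V}
end
LockedCache{K, V}() where {K, V} = LockedCache{K, V}(ReentrantLock(), Dict{K, V}())

function cached_get!(f, c::LockedCache{K, V}, key::K) where {K, V}
    # Fast path: the common case (entry already cached) takes no lock.
    v = get(c.payload, key, nothing)
    v ≡ nothing || return v

    # Slow path: build the entry under the lock, so a key that several tasks
    # miss at the same time is still constructed only once.
    Base.@lock c.lock begin
        get!(f, c.payload, key)
    end
end

# Hypothetical usage: the expensive constructor runs once per key.
cache = LockedCache{Int, String}()
cached_get!(() -> "context for device 1", cache, 1)

The point of the change is that steady-state lookups (contexts, memory stats, memory pools, cached benchmark results) no longer contend on a lock, while insertions still serialize through it.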