Skip to content

Commit

Permalink
Take fewer locks (#705)
Browse files Browse the repository at this point in the history
  • Loading branch information
pxl-th authored Dec 3, 2024
1 parent ba79add commit fadefca
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 22 deletions.
10 changes: 10 additions & 0 deletions src/blas/highlevel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,16 @@ end
# BLAS 2
#

const ROCUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{
<:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
const ROCLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{
<:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}

LinearAlgebra.istriu(::ROCUpperOrUnitUpperTriangular) = true
LinearAlgebra.istril(::ROCUpperOrUnitUpperTriangular) = false
LinearAlgebra.istriu(::ROCLowerOrUnitLowerTriangular) = false
LinearAlgebra.istril(::ROCLowerOrUnitLowerTriangular) = true

# multiplication
LinearAlgebra.generic_trimatmul!(
c::StridedROCVector{T}, uploc, isunitc, tfun::Function,
Expand Down
8 changes: 4 additions & 4 deletions src/dnn/convolution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHM
get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE

function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS
perf_results = lock(get_conv_cache_type(conv_type)) do cache
get(cache, conv_args, nothing)
end
isnothing(perf_results) && return nothing
cache = get_conv_cache_type(conv_type).payload
perf_results = get(cache, conv_args, nothing)
perf_results nothing && return nothing

workspace = ROCArray{UInt8}(undef, perf_results.memory)
perf_results, workspace
end
Expand Down
15 changes: 10 additions & 5 deletions src/hip/HIP.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,18 @@ end
const CONTEXTS = AMDGPU.LockedObject(Dict{HIPDevice,HIPContext}())

function HIPContext(device::HIPDevice)
lock(CONTEXTS) do contexts
contexts = CONTEXTS.payload

ctx = get(contexts, device, nothing)
ctx nothing || return ctx

Base.@lock CONTEXTS.lock begin
get!(contexts, device) do
context_ref = Ref{hipContext_t}()
hipCtxCreate(context_ref, Cuint(0), device.device)
context = HIPContext(context_ref[], true)
ctx_ref = Ref{hipContext_t}()
hipCtxCreate(ctx_ref, Cuint(0), device.device)
ctx = HIPContext(ctx_ref[], true)
device!(device)
return context
return ctx
end
end
end
Expand Down
23 changes: 14 additions & 9 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,14 @@ MemoryStats() = MemoryStats(0, 0.0, 0, 0.0, 0.0, 0)
const MEMORY_STATS = AMDGPU.LockedObject(Dict{Int, MemoryStats}())

function memory_stats(dev::HIPDevice = AMDGPU.device())
Base.lock(MEMORY_STATS) do ms
get!(() -> MemoryStats(), ms, HIP.device_id(dev))
ms = MEMORY_STATS.payload
did = HIP.device_id(dev)

stats = get(ms, did, nothing)
stats nothing || return stats

Base.@lock MEMORY_STATS.lock begin
get!(() -> MemoryStats(), ms, did)
end
end

Expand Down Expand Up @@ -158,7 +164,7 @@ function maybe_collect(; blocking::Bool = false)

# Check if we are under memory pressure.
pressure = stats.live / stats.size
min_pressure = 0.5
min_pressure = blocking ? 0.5 : 0.75
pressure < min_pressure && return

# TODO take allocations into account
Expand All @@ -175,22 +181,21 @@ function maybe_collect(; blocking::Bool = false)
blocking && (max_gc_rate *= 2;)

# And even more if the pressure is high.
pressure > 0.6 && (max_gc_rate *= 2;)
pressure > 0.8 && (max_gc_rate *= 2;)
# Always try to collect if pressure ≥ 0.9.
gc_rate > max_gc_rate && pressure < 0.9 && return
pressure > 0.9 && (max_gc_rate *= 2;)
pressure > 0.95 && (max_gc_rate *= 2;)
gc_rate > max_gc_rate && return

Base.@atomic stats.last_time = current_time

# Call the GC.
pre_gc_live = stats.live
gc_time = Base.@elapsed GC.gc(pressure > 0.8 ? true : false)
gc_time = Base.@elapsed GC.gc(false)
post_gc_live = stats.live

# Update stats.
freed = pre_gc_live - post_gc_live
Base.@atomic stats.last_freed = freed
Base.@atomic stats.last_gc_time = gc_time
Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time
return
end

Expand Down
13 changes: 9 additions & 4 deletions src/runtime/memory/hip.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
# Device ID => HIPMemoryPool
const MEMORY_POOLS = AMDGPU.LockedObject(
Dict{Int64, HIP.HIPMemoryPool}())
const MEMORY_POOLS = AMDGPU.LockedObject(Dict{Int64, HIP.HIPMemoryPool}())

function pool_create(dev::HIPDevice)
Base.lock(MEMORY_POOLS) do pools
get!(pools, HIP.device_id(dev)) do
mp = MEMORY_POOLS.payload
did = HIP.device_id(dev)

pool = get(mp, did, nothing)
pool nothing || return pool

Base.@lock MEMORY_POOLS.lock begin
get!(mp, HIP.device_id(dev)) do
max_size::UInt64 = AMDGPU.hard_memory_limit()
max_size = max_size != typemax(UInt64) ? max_size : UInt64(0)

Expand Down

0 comments on commit fadefca

Please sign in to comment.