-
Notifications
You must be signed in to change notification settings - Fork 234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve NSight Systems activation by inspecting the session list. #2638
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/src/profile.jl b/src/profile.jl
index 15c98ddf1..cbfc38295 100644
--- a/src/profile.jl
+++ b/src/profile.jl
@@ -169,7 +169,7 @@ function find_nsys()
return ENV["_"]
else
# look at a couple of environment variables that may point to NSight
- nsys = nothing
+ nsys = nothing
for var in ("LD_PRELOAD", "CUDA_INJECTION64_PATH", "NVTX_INJECTION64_PATH")
haskey(ENV, var) || continue
for val in split(ENV[var], Sys.iswindows() ? ';' : ':')
@@ -186,56 +186,56 @@ function find_nsys()
error("Running under Nsight Systems, but could not find the `nsys` binary to start the profiler. Please specify using JULIA_CUDA_NSYS=path/to/nsys, and file an issue with the contents of ENV.")
end
-const __nsys = Ref{Union{Nothing,String}}()
-function nsys()
- if !isassigned(__nsys)
+ const __nsys = Ref{Union{Nothing, String}}()
+ function nsys()
+ if !isassigned(__nsys)
# find the active Nsight Systems profiler
if haskey(ENV, "NSYS_PROFILING_SESSION_ID") && ccall(:jl_generating_output, Cint, ()) == 0
- __nsys[] = find_nsys()
+ __nsys[] = find_nsys()
else
- __nsys[] = nothing
+ __nsys[] = nothing
end
end
- __nsys[]
+ return __nsys[]
end
-function nsys_sessions()
- sessions = Dict{Int,Dict{String,String}}()
- open(`$(nsys()) sessions list`, "r") do io
- header = Dict()
- for line in eachline(io)
- # parse the header
- if isempty(header)
- @assert startswith(line, r"\s+ID")
- colnames = split(line)[1:end-1] # ignore the final left-aligned column
- colranges = []
- for column in colnames
- push!(colranges, findfirst(Regex("\\s+\\b$column\\b"), line))
- end
- for (name, range) in zip(colnames, colranges)
- header[name] = range
- end
+ function nsys_sessions()
+ sessions = Dict{Int, Dict{String, String}}()
+ open(`$(nsys()) sessions list`, "r") do io
+ header = Dict()
+ for line in eachline(io)
+ # parse the header
+ if isempty(header)
+ @assert startswith(line, r"\s+ID")
+ colnames = split(line)[1:(end - 1)] # ignore the final left-aligned column
+ colranges = []
+ for column in colnames
+ push!(colranges, findfirst(Regex("\\s+\\b$column\\b"), line))
+ end
+ for (name, range) in zip(colnames, colranges)
+ header[name] = range
+ end
- # parse the data
- else
- session = Dict()
- for (name, range) in header
- session[name] = lstrip(line[range])
- end
+ # parse the data
+ else
+ session = Dict()
+ for (name, range) in header
+ session[name] = lstrip(line[range])
+ end
- id = parse(Int, session["ID"])
- delete!(session, "ID")
- sessions[id] = session
+ id = parse(Int, session["ID"])
+ delete!(session, "ID")
+ sessions[id] = session
+ end
end
end
+ return sessions
end
- return sessions
-end
-nsys_session() = parse(Int, ENV["NSYS_PROFILING_SESSION_ID"])
+ nsys_session() = parse(Int, ENV["NSYS_PROFILING_SESSION_ID"])
-nsys_state() = nsys_sessions()[nsys_session()]["STATE"]
+ nsys_state() = nsys_sessions()[nsys_session()]["STATE"]
@@ -246,50 +246,50 @@ Enables profile collection by the active profiling tool for the current context.
profiling is already enabled, then this call has no effect.
"""
function start()
- if nsys() !== nothing
- # by default, running under NSight Systems does not activate the profiler API-based
- # ranged collection; that's done by calling `nsys start --capture-range=cudaProfilerApi`.
- # however, as of recent we cannot do this anymore when already running under the
- # capturing `nsys profile`, so we need to detect the state and act accordingly.
- try
- state = nsys_state()
-
- # `nsys profile`
- if state == "Collection"
- @warn """The application is already being profiled; starting the profiler is a no-op.
-
- If you meant to profile a specific region, make sure to start NSight Systems in
- delayed mode (`nsys profile --start-later=true --capture-range=cudaProfilerApi`)
- or simply switch to the interactive `nsys launch` command."""
- return
-
- # `nsys profile --start-later=true`
- elseif state == "DelayedCollection"
- @error """The application is running under a delayed profiling session which CUDA.jl cannot activate.
-
- If you want `CUDA.@profile` to enable the profiler, make sure
- to pass `--capture-range=cudaProfilerApi` to `nsys profile`."""
- return
-
- # `nsys profile --start-later=true --capture-range=cudaProfilerApi`
- elseif state == "StartRange"
-
- # `nsys launch`
- elseif state == "Launched"
- run(`$(nsys()) start --capture-range=cudaProfilerApi`)
+ if nsys() !== nothing
+ # by default, running under NSight Systems does not activate the profiler API-based
+ # ranged collection; that's done by calling `nsys start --capture-range=cudaProfilerApi`.
+ # however, as of recent we cannot do this anymore when already running under the
+ # capturing `nsys profile`, so we need to detect the state and act accordingly.
+ try
+ state = nsys_state()
+
+ # `nsys profile`
+ if state == "Collection"
+ @warn """The application is already being profiled; starting the profiler is a no-op.
+
+ If you meant to profile a specific region, make sure to start NSight Systems in
+ delayed mode (`nsys profile --start-later=true --capture-range=cudaProfilerApi`)
+ or simply switch to the interactive `nsys launch` command."""
+ return
+
+ # `nsys profile --start-later=true`
+ elseif state == "DelayedCollection"
+ @error """The application is running under a delayed profiling session which CUDA.jl cannot activate.
+
+ If you want `CUDA.@profile` to enable the profiler, make sure
+ to pass `--capture-range=cudaProfilerApi` to `nsys profile`."""
+ return
+
+ # `nsys profile --start-later=true --capture-range=cudaProfilerApi`
+ elseif state == "StartRange"
+
+ # `nsys launch`
+ elseif state == "Launched"
+ run(`$(nsys()) start --capture-range=cudaProfilerApi`)
- else
- error("Unexpected state: $state")
+ else
+ error("Unexpected state: $state")
+ end
+ catch err
+ @error "Failed to find the active profiling session ($(nsys_session())) in the session list:\n" * read(`$(nsys()) sessions list`, String) * "\n\nPlease file an issue." exception = (err, catch_backtrace())
end
- catch err
- @error "Failed to find the active profiling session ($(nsys_session())) in the session list:\n" * read(`$(nsys()) sessions list`, String) * "\n\nPlease file an issue." exception=(err,catch_backtrace())
- end
- # it takes a while for the profiler to attach to our process
+ # it takes a while for the profiler to attach to our process
sleep(0.01)
end
- # actually start the capture
+ # actually start the capture
CUDA.cuProfilerStart()
end
@@ -300,7 +300,7 @@ Disables profile collection by the active profiling tool for the current context
profiling is already disabled, then this call has no effect.
"""
function stop()
- CUDA.cuProfilerStop()
+ return CUDA.cuProfilerStop()
end
|
Codecov Report
Attention: Patch coverage is
Additional details and impacted files
@@ Coverage Diff @@
## master #2638 +/- ##
==========================================
- Coverage 73.53% 73.36% -0.18%
==========================================
Files 157 157
Lines 15238 15272 +34
==========================================
- Hits 11205 11204 -1
- Misses 4033 4068 +35
☔ View full report in Codecov by Sentry. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA.jl Benchmarks
Benchmark suite | Current: a71b1bf | Previous: 24c236a | Ratio |
---|---|---|---|
latency/precompile |
46377879057 ns |
46329110276 ns |
1.00 |
latency/ttfp |
6994478571 ns |
6970088608 ns |
1.00 |
latency/import |
3638981184 ns |
3628117061 ns |
1.00 |
integration/volumerhs |
9624929 ns |
9622596.5 ns |
1.00 |
integration/byval/slices=1 |
147109 ns |
147068 ns |
1.00 |
integration/byval/slices=3 |
425287 ns |
425521 ns |
1.00 |
integration/byval/reference |
145172 ns |
144904 ns |
1.00 |
integration/byval/slices=2 |
286178 ns |
286164 ns |
1.00 |
integration/cudadevrt |
103567 ns |
103422 ns |
1.00 |
kernel/indexing |
14350 ns |
14042 ns |
1.02 |
kernel/indexing_checked |
14927 ns |
14520 ns |
1.03 |
kernel/occupancy |
633.3550295857988 ns |
633.7076023391813 ns |
1.00 |
kernel/launch |
2188.777777777778 ns |
2101.9 ns |
1.04 |
kernel/rand |
14846 ns |
14379 ns |
1.03 |
array/reverse/1d |
19733 ns |
19719 ns |
1.00 |
array/reverse/2d |
25320 ns |
25121 ns |
1.01 |
array/reverse/1d_inplace |
10574 ns |
11160 ns |
0.95 |
array/reverse/2d_inplace |
12253 ns |
13070 ns |
0.94 |
array/copy |
21532 ns |
21024 ns |
1.02 |
array/iteration/findall/int |
157929.5 ns |
155786 ns |
1.01 |
array/iteration/findall/bool |
137134.5 ns |
134662.5 ns |
1.02 |
array/iteration/findfirst/int |
154486.5 ns |
147148 ns |
1.05 |
array/iteration/findfirst/bool |
153973 ns |
154167.5 ns |
1.00 |
array/iteration/scalar |
64196.5 ns |
61499 ns |
1.04 |
array/iteration/logical |
208380 ns |
203811.5 ns |
1.02 |
array/iteration/findmin/1d |
39462 ns |
39639 ns |
1.00 |
array/iteration/findmin/2d |
94142 ns |
94387 ns |
1.00 |
array/reductions/reduce/1d |
38671.5 ns |
30507 ns |
1.27 |
array/reductions/reduce/2d |
51283.5 ns |
51213.5 ns |
1.00 |
array/reductions/mapreduce/1d |
38292 ns |
30459 ns |
1.26 |
array/reductions/mapreduce/2d |
51530.5 ns |
51487 ns |
1.00 |
array/broadcast |
21140.5 ns |
20729 ns |
1.02 |
array/copyto!/gpu_to_gpu |
12008 ns |
11560 ns |
1.04 |
array/copyto!/cpu_to_gpu |
210181 ns |
208930 ns |
1.01 |
array/copyto!/gpu_to_cpu |
243254.5 ns |
241934 ns |
1.01 |
array/accumulate/1d |
109436 ns |
108332 ns |
1.01 |
array/accumulate/2d |
80310 ns |
80060 ns |
1.00 |
array/construct |
1340.7 ns |
1258.3 ns |
1.07 |
array/random/randn/Float32 |
44338.5 ns |
42993 ns |
1.03 |
array/random/randn!/Float32 |
27063 ns |
26226 ns |
1.03 |
array/random/rand!/Int64 |
27383 ns |
26878 ns |
1.02 |
array/random/rand!/Float32 |
8812.833333333332 ns |
8569 ns |
1.03 |
array/random/rand/Int64 |
38260.5 ns |
29957 ns |
1.28 |
array/random/rand/Float32 |
13367 ns |
12962 ns |
1.03 |
array/permutedims/4d |
61382 ns |
61080 ns |
1.00 |
array/permutedims/2d |
55577 ns |
55180 ns |
1.01 |
array/permutedims/3d |
56451 ns |
55780 ns |
1.01 |
array/sorting/1d |
2766078 ns |
2774685 ns |
1.00 |
array/sorting/by |
3369788 ns |
3367411.5 ns |
1.00 |
array/sorting/2d |
1084807 ns |
1085055 ns |
1.00 |
cuda/synchronization/stream/auto |
1092.6 ns |
1048.8 ns |
1.04 |
cuda/synchronization/stream/nonblocking |
6449.8 ns |
6455.4 ns |
1.00 |
cuda/synchronization/stream/blocking |
830.5952380952381 ns |
846.986301369863 ns |
0.98 |
cuda/synchronization/context/auto |
1225.9 ns |
1230.1 ns |
1.00 |
cuda/synchronization/context/nonblocking |
6732.4 ns |
6670.8 ns |
1.01 |
cuda/synchronization/context/blocking |
973.5833333333334 ns |
928 ns |
1.05 |
This comment was automatically generated by workflow using github-action-benchmark.
Thanks! This now executes as expected when profiling a block of code in non-interactive mode.
|
Fixes #2629