Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve NSight Systems activation by inspecting the session list. #2638

Merged
merged 1 commit into from
Jan 28, 2025

Conversation

maleadt
Copy link
Member

@maleadt maleadt commented Jan 28, 2025

Fixes #2629

Copy link
Contributor

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/profile.jl b/src/profile.jl
index 15c98ddf1..cbfc38295 100644
--- a/src/profile.jl
+++ b/src/profile.jl
@@ -169,7 +169,7 @@ function find_nsys()
         return ENV["_"]
     else
         # look at a couple of environment variables that may point to NSight
-        nsys = nothing
+            nsys = nothing
         for var in ("LD_PRELOAD", "CUDA_INJECTION64_PATH", "NVTX_INJECTION64_PATH")
             haskey(ENV, var) || continue
             for val in split(ENV[var], Sys.iswindows() ? ';' : ':')
@@ -186,56 +186,56 @@ function find_nsys()
     error("Running under Nsight Systems, but could not find the `nsys` binary to start the profiler. Please specify using JULIA_CUDA_NSYS=path/to/nsys, and file an issue with the contents of ENV.")
 end
 
-const __nsys = Ref{Union{Nothing,String}}()
-function nsys()
-    if !isassigned(__nsys)
+    const __nsys = Ref{Union{Nothing, String}}()
+    function nsys()
+        if !isassigned(__nsys)
         # find the active Nsight Systems profiler
         if haskey(ENV, "NSYS_PROFILING_SESSION_ID") && ccall(:jl_generating_output, Cint, ()) == 0
-            __nsys[] = find_nsys()
+                __nsys[] = find_nsys()
         else
-            __nsys[] = nothing
+                __nsys[] = nothing
         end
     end
 
-    __nsys[]
+        return __nsys[]
 end
 
-function nsys_sessions()
-    sessions = Dict{Int,Dict{String,String}}()
-    open(`$(nsys()) sessions list`, "r") do io
-        header = Dict()
-        for line in eachline(io)
-            # parse the header
-            if isempty(header)
-                @assert startswith(line, r"\s+ID")
-                colnames = split(line)[1:end-1] # ignore the final left-aligned column
-                colranges = []
-                for column in colnames
-                    push!(colranges, findfirst(Regex("\\s+\\b$column\\b"), line))
-                end
-                for (name, range) in zip(colnames, colranges)
-                    header[name] = range
-                end
+    function nsys_sessions()
+        sessions = Dict{Int, Dict{String, String}}()
+        open(`$(nsys()) sessions list`, "r") do io
+            header = Dict()
+            for line in eachline(io)
+                # parse the header
+                if isempty(header)
+                    @assert startswith(line, r"\s+ID")
+                    colnames = split(line)[1:(end - 1)] # ignore the final left-aligned column
+                    colranges = []
+                    for column in colnames
+                        push!(colranges, findfirst(Regex("\\s+\\b$column\\b"), line))
+                    end
+                    for (name, range) in zip(colnames, colranges)
+                        header[name] = range
+                    end
 
-            # parse the data
-            else
-                session = Dict()
-                for (name, range) in header
-                    session[name] = lstrip(line[range])
-                end
+                    # parse the data
+                else
+                    session = Dict()
+                    for (name, range) in header
+                        session[name] = lstrip(line[range])
+                    end
 
-                id = parse(Int, session["ID"])
-                delete!(session, "ID")
-                sessions[id] = session
+                    id = parse(Int, session["ID"])
+                    delete!(session, "ID")
+                    sessions[id] = session
+                end
             end
         end
+        return sessions
     end
-    return sessions
-end
 
-nsys_session() = parse(Int, ENV["NSYS_PROFILING_SESSION_ID"])
+    nsys_session() = parse(Int, ENV["NSYS_PROFILING_SESSION_ID"])
 
-nsys_state() = nsys_sessions()[nsys_session()]["STATE"]
+    nsys_state() = nsys_sessions()[nsys_session()]["STATE"]
 
 
 
@@ -246,50 +246,50 @@ Enables profile collection by the active profiling tool for the current context.
 profiling is already enabled, then this call has no effect.
 """
 function start()
-    if nsys() !== nothing
-        # by default, running under NSight Systems does not activate the profiler API-based
-        # ranged collection; that's done by calling `nsys start --capture-range=cudaProfilerApi`.
-        # however, as of recent we cannot do this anymore when already running under the
-        # capturing `nsys profile`, so we need to detect the state and act accordingly.
-        try
-            state = nsys_state()
-
-            # `nsys profile`
-            if state == "Collection"
-                @warn """The application is already being profiled; starting the profiler is a no-op.
-
-                         If you meant to profile a specific region, make sure to start NSight Systems in
-                         delayed mode (`nsys profile --start-later=true --capture-range=cudaProfilerApi`)
-                         or simply switch to the interactive `nsys launch` command."""
-                return
-
-            # `nsys profile --start-later=true`
-            elseif state == "DelayedCollection"
-                @error """The application is running under a delayed profiling session which CUDA.jl cannot activate.
-
-                          If you want `CUDA.@profile` to enable the profiler, make sure
-                          to pass `--capture-range=cudaProfilerApi` to `nsys profile`."""
-                return
-
-            # `nsys profile --start-later=true --capture-range=cudaProfilerApi`
-            elseif state == "StartRange"
-
-            # `nsys launch`
-            elseif state == "Launched"
-                run(`$(nsys()) start --capture-range=cudaProfilerApi`)
+        if nsys() !== nothing
+            # by default, running under NSight Systems does not activate the profiler API-based
+            # ranged collection; that's done by calling `nsys start --capture-range=cudaProfilerApi`.
+            # however, as of recent we cannot do this anymore when already running under the
+            # capturing `nsys profile`, so we need to detect the state and act accordingly.
+            try
+                state = nsys_state()
+
+                # `nsys profile`
+                if state == "Collection"
+                    @warn """The application is already being profiled; starting the profiler is a no-op.
+
+                    If you meant to profile a specific region, make sure to start NSight Systems in
+                    delayed mode (`nsys profile --start-later=true --capture-range=cudaProfilerApi`)
+                    or simply switch to the interactive `nsys launch` command."""
+                    return
+
+                    # `nsys profile --start-later=true`
+                elseif state == "DelayedCollection"
+                    @error """The application is running under a delayed profiling session which CUDA.jl cannot activate.
+
+                    If you want `CUDA.@profile` to enable the profiler, make sure
+                    to pass `--capture-range=cudaProfilerApi` to `nsys profile`."""
+                    return
+
+                    # `nsys profile --start-later=true --capture-range=cudaProfilerApi`
+                elseif state == "StartRange"
+
+                    # `nsys launch`
+                elseif state == "Launched"
+                    run(`$(nsys()) start --capture-range=cudaProfilerApi`)
 
-            else
-                error("Unexpected state: $state")
+                else
+                    error("Unexpected state: $state")
+                end
+            catch err
+                @error "Failed to find the active profiling session ($(nsys_session())) in the session list:\n" * read(`$(nsys()) sessions list`, String) * "\n\nPlease file an issue." exception = (err, catch_backtrace())
             end
-        catch err
-            @error "Failed to find the active profiling session ($(nsys_session())) in the session list:\n" * read(`$(nsys()) sessions list`, String) * "\n\nPlease file an issue." exception=(err,catch_backtrace())
-        end
 
-        # it takes a while for the profiler to attach to our process
+            # it takes a while for the profiler to attach to our process
         sleep(0.01)
     end
 
-    # actually start the capture
+        # actually start the capture
     CUDA.cuProfilerStart()
 end
 
@@ -300,7 +300,7 @@ Disables profile collection by the active profiling tool for the current context
 profiling is already disabled, then this call has no effect.
 """
 function stop()
-    CUDA.cuProfilerStop()
+        return CUDA.cuProfilerStop()
 end
 
 

Copy link

codecov bot commented Jan 28, 2025

Codecov Report

Attention: Patch coverage is 10.86957% with 41 lines in your changes missing coverage. Please review.

Project coverage is 73.36%. Comparing base (24c236a) to head (a71b1bf).
Report is 3 commits behind head on master.

Files with missing lines Patch % Lines
src/profile.jl 10.86% 41 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##           master    #2638      +/-   ##
==========================================
- Coverage   73.53%   73.36%   -0.18%     
==========================================
  Files         157      157              
  Lines       15238    15272      +34     
==========================================
- Hits        11205    11204       -1     
- Misses       4033     4068      +35     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Benchmark suite Current: a71b1bf Previous: 24c236a Ratio
latency/precompile 46377879057 ns 46329110276 ns 1.00
latency/ttfp 6994478571 ns 6970088608 ns 1.00
latency/import 3638981184 ns 3628117061 ns 1.00
integration/volumerhs 9624929 ns 9622596.5 ns 1.00
integration/byval/slices=1 147109 ns 147068 ns 1.00
integration/byval/slices=3 425287 ns 425521 ns 1.00
integration/byval/reference 145172 ns 144904 ns 1.00
integration/byval/slices=2 286178 ns 286164 ns 1.00
integration/cudadevrt 103567 ns 103422 ns 1.00
kernel/indexing 14350 ns 14042 ns 1.02
kernel/indexing_checked 14927 ns 14520 ns 1.03
kernel/occupancy 633.3550295857988 ns 633.7076023391813 ns 1.00
kernel/launch 2188.777777777778 ns 2101.9 ns 1.04
kernel/rand 14846 ns 14379 ns 1.03
array/reverse/1d 19733 ns 19719 ns 1.00
array/reverse/2d 25320 ns 25121 ns 1.01
array/reverse/1d_inplace 10574 ns 11160 ns 0.95
array/reverse/2d_inplace 12253 ns 13070 ns 0.94
array/copy 21532 ns 21024 ns 1.02
array/iteration/findall/int 157929.5 ns 155786 ns 1.01
array/iteration/findall/bool 137134.5 ns 134662.5 ns 1.02
array/iteration/findfirst/int 154486.5 ns 147148 ns 1.05
array/iteration/findfirst/bool 153973 ns 154167.5 ns 1.00
array/iteration/scalar 64196.5 ns 61499 ns 1.04
array/iteration/logical 208380 ns 203811.5 ns 1.02
array/iteration/findmin/1d 39462 ns 39639 ns 1.00
array/iteration/findmin/2d 94142 ns 94387 ns 1.00
array/reductions/reduce/1d 38671.5 ns 30507 ns 1.27
array/reductions/reduce/2d 51283.5 ns 51213.5 ns 1.00
array/reductions/mapreduce/1d 38292 ns 30459 ns 1.26
array/reductions/mapreduce/2d 51530.5 ns 51487 ns 1.00
array/broadcast 21140.5 ns 20729 ns 1.02
array/copyto!/gpu_to_gpu 12008 ns 11560 ns 1.04
array/copyto!/cpu_to_gpu 210181 ns 208930 ns 1.01
array/copyto!/gpu_to_cpu 243254.5 ns 241934 ns 1.01
array/accumulate/1d 109436 ns 108332 ns 1.01
array/accumulate/2d 80310 ns 80060 ns 1.00
array/construct 1340.7 ns 1258.3 ns 1.07
array/random/randn/Float32 44338.5 ns 42993 ns 1.03
array/random/randn!/Float32 27063 ns 26226 ns 1.03
array/random/rand!/Int64 27383 ns 26878 ns 1.02
array/random/rand!/Float32 8812.833333333332 ns 8569 ns 1.03
array/random/rand/Int64 38260.5 ns 29957 ns 1.28
array/random/rand/Float32 13367 ns 12962 ns 1.03
array/permutedims/4d 61382 ns 61080 ns 1.00
array/permutedims/2d 55577 ns 55180 ns 1.01
array/permutedims/3d 56451 ns 55780 ns 1.01
array/sorting/1d 2766078 ns 2774685 ns 1.00
array/sorting/by 3369788 ns 3367411.5 ns 1.00
array/sorting/2d 1084807 ns 1085055 ns 1.00
cuda/synchronization/stream/auto 1092.6 ns 1048.8 ns 1.04
cuda/synchronization/stream/nonblocking 6449.8 ns 6455.4 ns 1.00
cuda/synchronization/stream/blocking 830.5952380952381 ns 846.986301369863 ns 0.98
cuda/synchronization/context/auto 1225.9 ns 1230.1 ns 1.00
cuda/synchronization/context/nonblocking 6732.4 ns 6670.8 ns 1.01
cuda/synchronization/context/blocking 973.5833333333334 ns 928 ns 1.05

This comment was automatically generated by workflow using github-action-benchmark.

@luraess
Copy link
Contributor

luraess commented Jan 28, 2025

Thanks! This now executes as expected when profiling a block of code in non-interactive mode:

nsys profile --start-later=true --capture-range=cudaProfilerApi --capture-range-end=stop --trace=cuda,nvtx julia --project test.jl

@maleadt maleadt merged commit 69f3a76 into master Jan 28, 2025
3 checks passed
@maleadt maleadt deleted the tb/nsys_session branch January 28, 2025 13:37
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Regression with nsys profile and CUDA.@profile
2 participants