diff --git a/examples/compilation/README.md b/examples/compilation/README.md deleted file mode 100644 index 307d57a..0000000 --- a/examples/compilation/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Compilation of CUDA C - -Example of compiling inline CUDA C code and running it using CUDArt and CUDAdrv. diff --git a/examples/compilation/usage.jl b/examples/compilation/usage.jl deleted file mode 100644 index 7129b47..0000000 --- a/examples/compilation/usage.jl +++ /dev/null @@ -1,69 +0,0 @@ -using CUDAdrv, CUDArt -using Base.Test - -include("library.jl") - -dev = CuDevice(0) -ctx = CuContext(dev) - - -## basic compilation & execution - -let - @compile dev kernel """ - __global__ void kernel() - { - } - """ - - cudacall(kernel, 1, 1, ()) -end - -@test_throws CompileError let - @compile dev kernel """ - __global__ void kernel() - { - invalid code - } - """ -end - -@test_throws CompileError let - @compile dev wrongname """ - __global__ void kernel() - { - } - """ -end - - -## argument passing - -dims = (16, 16) -len = prod(dims) - -@compile dev kernel_copy """ -__global__ void kernel_copy(const float *input, float *output) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - output[i] = input[i]; -} -""" - -let - input = round.(rand(Float32, dims) * 100) - - input_dev = CuArray(input) - output_dev = CuArray{Float32}(dims) - - cudacall(kernel_copy, 1, len, - Tuple{Ptr{Float32}, Ptr{Float32}}, - pointer(input_dev), pointer(output_dev)) - output = Array(output_dev) - @test input ≈ output -end - - -clean_cache() # for deterministic testing purposes -destroy!(ctx) diff --git a/examples/vadd.jl b/examples/vadd.jl new file mode 100644 index 0000000..1e80a12 --- /dev/null +++ b/examples/vadd.jl @@ -0,0 +1,30 @@ +using CUDAdrv, CUDArt +using Base.Test + +using Compat + +dev = CuDevice(0) +ctx = CuContext(dev) + +CUDArt.@compile dev kernel_vadd """ + __global__ void kernel_vadd(const float *a, const float *b, float *c) + { + int i = blockIdx.x *blockDim.x + threadIdx.x; + c[i] = a[i] + b[i]; + } +""" + +dims = (3,4) +a = round.(rand(Float32, dims) * 100) +b = round.(rand(Float32, dims) * 100) + +d_a = CuArray(a) +d_b = CuArray(b) +d_c = similar(d_a) + +len = prod(dims) +cudacall(kernel_vadd, len, 1, Tuple{Ptr{Cfloat},Ptr{Cfloat},Ptr{Cfloat}}, d_a, d_b, d_c) +c = Array(d_c) +@test a+b ≈ c + +destroy!(ctx) diff --git a/src/CUDArt.jl b/src/CUDArt.jl index c4a19fa..7a55f4f 100644 --- a/src/CUDArt.jl +++ b/src/CUDArt.jl @@ -46,6 +46,7 @@ include("device.jl") include("stream.jl") include("pointer.jl") include("arrays.jl") +include("compile.jl") include("execute.jl") include("precompile.jl") diff --git a/examples/compilation/library.jl b/src/compile.jl similarity index 59% rename from examples/compilation/library.jl rename to src/compile.jl index 2b4eceb..243dd61 100644 --- a/examples/compilation/library.jl +++ b/src/compile.jl @@ -1,7 +1,4 @@ -# EXCLUDE FROM TESTING - -using CUDArt -using Compat +export CompileError # Generate a temporary file with specific suffix # NOTE: mkstemps is glibc 2.19+, so emulate its behavior @@ -28,9 +25,8 @@ macro compile(dev, kernel, code) :($(esc(kernel)) = _compile($(esc(dev)), $kernel_name, $code, $containing_file))) end -type CompileError <: Base.WrappedException +immutable CompileError <: Exception message::String - error end const builddir = joinpath(@__DIR__, ".cache") @@ -43,7 +39,7 @@ function _compile(dev, kernel, code, containing_file) mkpath(builddir) end - # Check if we need to compile + # check if we need to compile codehash = hex(hash(code)) output = "$builddir/$(kernel)_$(codehash)-$(arch).ptx" if isfile(output) @@ -52,9 +48,9 @@ function _compile(dev, kernel, code, containing_file) need_compile = true end - # Compile the source, if necessary + # compile the source, if necessary if need_compile - # Write the source into a compilable file + # write the source to a compilable file (source, io) = mkstemps(".cu") write(io, """ extern "C" @@ -62,17 +58,20 @@ extern "C" $code } """) - close(io) + Base.close(io) compile_flags = vcat(CUDArt.toolchain_flags, ["--gpu-architecture", arch]) - try - # TODO: capture STDERR - run(pipeline(`$(CUDArt.toolchain_nvcc) $(compile_flags) -ptx -o $output $source`, stderr=DevNull)) - catch ex - isa(ex, ErrorException) || rethrow(ex) - rethrow(CompileError("compilation of kernel $kernel failed (typo in C++ source?)", ex)) - finally - rm(source) + err = Pipe() + cmd = `$(CUDArt.toolchain_nvcc) $(compile_flags) -ptx -o $output $source` + result = success(pipeline(cmd; stdout=DevNull, stderr=err)) + Base.close(err.in) + rm(source) + + errors = readstring(err) + if !result + throw(CompileError("compilation of kernel $kernel failed\n$errors")) + elseif !isempty(errors) + warn("during compilation of kernel $kernel:\n$errors") end if !isfile(output) @@ -80,23 +79,13 @@ $code end end - # Pass the module to the CUDA driver - mod = try - CuModuleFile(output) - catch ex - rethrow(CompileError("loading of kernel $kernel failed (invalid CUDA code?)", ex)) - end - - # Load the function pointer - func = try - CuFunction(mod, kernel) - catch ex - rethrow(CompileError("could not find kernel $kernel in the compiled binary (wrong function name?)", ex)) - end - - return func + mod = CUDAdrv.CuModuleFile(output) + return CUDAdrv.CuFunction(mod, kernel) end function clean_cache() - rm(builddir; recursive=true) + if ispath(builddir) + @assert isdir(builddir) + rm(builddir; recursive=true) + end end diff --git a/test/compile.jl b/test/compile.jl new file mode 100644 index 0000000..6d4e13c --- /dev/null +++ b/test/compile.jl @@ -0,0 +1,69 @@ +using CUDArt +import CUDAdrv +using Base.Test + +dev = CUDAdrv.CuDevice(0) +ctx = CUDAdrv.CuContext(dev) + +CUDArt.clean_cache() # for deterministic testing purposes + + +## basic compilation & execution + +let + CUDArt.@compile dev kernel """ + __global__ void kernel() + { + } + """ + + CUDAdrv.cudacall(kernel, 1, 1, ()) +end + +@test_throws CompileError let + CUDArt.@compile dev kernel """ + __global__ void kernel() + { + invalid code + } + """ +end + +@test_throws CUDAdrv.CuError let + CUDArt.@compile dev wrongname """ + __global__ void kernel() + { + } + """ +end + + +## argument passing + +dims = (16, 16) +len = prod(dims) + +CUDArt.@compile dev kernel_copy """ +__global__ void kernel_copy(const float *input, float *output) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + output[i] = input[i]; +} +""" + +let + input = round.(rand(Cfloat, dims) * 100) + + input_dev = CUDAdrv.CuArray(input) + output_dev = CUDAdrv.CuArray{Cfloat}(dims) + + CUDAdrv.cudacall(kernel_copy, 1, len, + Tuple{Ptr{Cfloat}, Ptr{Cfloat}}, + input_dev, output_dev) + output = Array(output_dev) + @test input ≈ output +end + + +CUDAdrv.destroy!(ctx) diff --git a/test/runtests.jl b/test/runtests.jl index cba300e..4d9ae74 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,4 @@ include("gc.jl") include("test.jl") +include("compile.jl") include("examples.jl")