-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CUDA Support for ALS #37
base: master
Are you sure you want to change the base?
Changes from 22 commits
d48f0f1
4596484
6e2a476
01e2ab9
1e6097e
fd71ca4
e5b30ea
c34cc3c
966e30b
9938c46
e8077a3
fb1d0ce
e368433
614929d
7854ec8
df803e1
2a7f0c2
b72f6e9
12f0004
e9680f6
ebc5455
ab40583
f4696c0
d7f68ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,22 +4,29 @@ authors = ["David Hong <[email protected]> and contributors"] | |
version = "0.1.2" | ||
|
||
[deps] | ||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" | ||
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" | ||
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" | ||
LBFGSB = "5be7bae1-8223-5378-bac3-9e7378a2f6e6" | ||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" | ||
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" | ||
|
||
[weakdeps] | ||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" | ||
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" | ||
|
||
[extensions] | ||
CUDAExt = "CUDA" | ||
LossFunctionsExt = "LossFunctions" | ||
|
||
[compat] | ||
CUDA = ">= 4.4.1" | ||
ForwardDiff = "0.10.36" | ||
IntervalSets = "0.7.7" | ||
LBFGSB = "0.4.1" | ||
LinearAlgebra = "1.6" | ||
LossFunctions = "0.11.1" | ||
julia = "1.6" | ||
|
||
[extras] | ||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
module CUDAExt

using GCPDecompositions, CUDA

"""
    gcp(X::CuArray, r, loss = LeastSquaresLoss();
        constraints = (), algorithm = GCPAlgorithms.ALS())

GPU entry point for `GCPDecompositions.gcp` when the data tensor `X` is a `CuArray`.

The call is forwarded to `_gcp`, which dispatches on the types of `loss`,
`constraints` and `algorithm`. This module defines a single `_gcp` method
(least-squares loss, no constraints, ALS); any other combination raises a
`MethodError`.
"""
GCPDecompositions.gcp(
    X::CuArray,
    r,
    loss = LeastSquaresLoss();
    constraints = (),
    algorithm = GCPAlgorithms.ALS(),
) = _gcp(X, r, loss, constraints, algorithm)

"""
    _gcp(X::CuArray{TX,N}, r, loss::LeastSquaresLoss, constraints::Tuple{},
         algorithm::GCPAlgorithms.ALS)

Fit a rank-`r` CP model to the `N`-way GPU tensor `X` by alternating least squares,
running `algorithm.maxiters` sweeps over the modes.

The factors are initialized randomly on the CPU, rescaled so that the norm of the
initial model matches the norm of `X` (ignoring `NaN` entries), moved to the GPU for
the ALS sweeps, and copied back to the CPU at the end.

# Returns
A `CPD` whose weights and factor matrices are ordinary CPU `Array`s with element type
`promote_type(TX, Float32)`.
"""
function _gcp(
    X::CuArray{TX,N},
    r,
    loss::LeastSquaresLoss,
    constraints::Tuple{},
    algorithm::GCPAlgorithms.ALS,
) where {TX<:Real,N}
    T = promote_type(TX, Float32)

    # Random initialization (on the CPU): unit weights, uniform random factors.
    λ0 = ones(T, r)
    U0 = [rand(T, size(X, k), r) for k in 1:N]

    # Norm of the initial model via the Gram-matrix identity
    #   ‖M‖² = λ' * (⊛ₖ Uₖ'Uₖ) * λ   (⊛ = elementwise product),
    # which only needs r×r products and no scalar indexing of the dense model.
    G0 = reduce(.*, Uk' * Uk for Uk in U0)
    M0norm = sqrt(λ0' * G0 * λ0)

    # Norm of the data with NaN entries treated as zero. Both branches of the
    # closure return the same type, and the accumulator uses `T` rather than a
    # hard-coded Float32, so Float64 data is reduced in Float64.
    Xnorm = sqrt(mapreduce(x -> isnan(x) ? zero(x) : abs2(x), +, X; init = zero(T)))

    # Spread the rescaling evenly over the N factor matrices.
    scale = (Xnorm / M0norm)^(1 / N)
    for Uk in U0
        Uk .*= scale
    end

    # Move λ, U to gpu
    λ = CuArray(λ0)
    U = [CuArray(Uk) for Uk in U0]

    # Inefficient but simple implementation
    for _ in 1:algorithm.maxiters
        for n in 1:N
            V = reduce(.*, U[i]' * U[i] for i in setdiff(1:N, n))
            U[n] = GCPDecompositions.mttkrp(X, U, n) / V

            # Column norms as one on-device reduction. Review of the previous form,
            # `CuArray(CUDA.norm.(eachcol(U[n])))`, found it much slower because it
            # moves data between devices; this form keeps the data on the GPU and
            # was reported to give about a 5x speedup for the GPU version (with
            # similar time for the CPU version).
            λ = vec(sqrt.(sum(abs2, U[n]; dims = 1)))

            # U[n] was freshly allocated by the solve above, so normalize in place.
            U[n] ./= permutedims(λ)
        end
    end

    # Copy the result back to the CPU; `Val(N)` keeps the tuple length inferrable.
    return CPD(Array(λ), ntuple(k -> Array(U[k]), Val(N)))
end

end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added CUDA as an extension, where the extension has a gcp definition for CuArray input. Right now M0 is created and normalized on the CPU, then moved to the GPU for ALS (and moved back to the CPU to be returned at the end). Need to figure out how to rewrite line 24 without scalar indexing so M0 can be created directly as a CuArray.