CUDA Support for ALS #37

Open

wants to merge 24 commits into base: master

Changes from 22 of 24 commits

Commits
d48f0f1
Fixes #36 (with simple MTTKRP implementation)
alexmul1114 Jan 8, 2024
4596484
Change to Arrays AbstractArrays
alexmul1114 Jan 8, 2024
6e2a476
Replace skipmissing
alexmul1114 Jan 10, 2024
01e2ab9
Use ismissing in mapreduce instead of skipmissing
alexmul1114 Jan 10, 2024
1e6097e
Add new _gcp method for CuArray
alexmul1114 Jan 10, 2024
fd71ca4
Add CUDA gcp as extension, use simple MTTKRP implementation and khatr…
alexmul1114 Jan 10, 2024
e5b30ea
Add CUDA to deps for backward compatibility
alexmul1114 Jan 10, 2024
c34cc3c
Move CUDA to extras
alexmul1114 Jan 10, 2024
966e30b
Change CUDA version compatibility
alexmul1114 Jan 10, 2024
9938c46
Modify project.toml
alexmul1114 Jan 10, 2024
e8077a3
Test with new MTTKRP from #35
alexmul1114 Jan 11, 2024
fb1d0ce
Change .= back to =
alexmul1114 Jan 11, 2024
e368433
Fix typos
alexmul1114 Jan 11, 2024
614929d
Use latest MTTKRP from #35
alexmul1114 Jan 11, 2024
7854ec8
collect selectdim for now
alexmul1114 Jan 11, 2024
df803e1
Try reusing inner
alexmul1114 Jan 11, 2024
2a7f0c2
Use copy in MTTKRP middle mode Rn computation for now
alexmul1114 Jan 11, 2024
b72f6e9
Change abstract arrays back to arrays
alexmul1114 Jan 12, 2024
12f0004
Remove typo
alexmul1114 Jan 12, 2024
e9680f6
Switch khatrirao to accept abstractmatrix
alexmul1114 Jan 12, 2024
ebc5455
Fix typo
alexmul1114 Jan 12, 2024
ab40583
Temporarily use Float32 for comparison against GPU
alexmul1114 Jan 15, 2024
f4696c0
Replace norm function
alexmul1114 Jan 18, 2024
d7f68ed
Merge branch 'master' into pr/alexmul1114/37
dahong67 Mar 6, 2024
7 changes: 7 additions & 0 deletions Project.toml
@@ -4,22 +4,29 @@ authors = ["David Hong <[email protected]> and contributors"]
version = "0.1.2"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
LBFGSB = "5be7bae1-8223-5378-bac3-9e7378a2f6e6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"

[extensions]
CUDAExt = "CUDA"
LossFunctionsExt = "LossFunctions"

[compat]
CUDA = ">= 4.4.1"
ForwardDiff = "0.10.36"
IntervalSets = "0.7.7"
LBFGSB = "0.4.1"
LinearAlgebra = "1.6"
LossFunctions = "0.11.1"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
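With CUDA listed under [weakdeps] and [extensions] rather than [deps], the CUDAExt module only gets loaded once the user loads CUDA.jl themselves, so GCPDecompositions keeps no hard CUDA dependency; on Julia versions without package extensions, the conditional include in src/GCPDecompositions.jl below loads ext/CUDAExt.jl directly. A rough sketch of the loading behavior on Julia 1.9+ (illustrative REPL session, not taken from the PR):

julia> using GCPDecompositions        # CUDAExt is not loaded at this point

julia> Base.get_extension(GCPDecompositions, :CUDAExt)   # returns nothing

julia> using CUDA                     # loading the weak dependency triggers CUDAExt

julia> Base.get_extension(GCPDecompositions, :CUDAExt)   # now returns the extension module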
47 changes: 47 additions & 0 deletions ext/CUDAExt.jl
@@ -0,0 +1,47 @@
module CUDAExt

using GCPDecompositions, CUDA

GCPDecompositions.gcp(
X::CuArray,
r,
loss = LeastSquaresLoss();
constraints = (),
algorithm = GCPAlgorithms.ALS(),
) = _gcp(X, r, loss, constraints, algorithm)
function _gcp(
X::CuArray{TX,N},
r,
loss::LeastSquaresLoss,
constraints::Tuple{},
algorithm::GCPAlgorithms.ALS,
) where {TX<:Real,N}
T = promote_type(TX, Float32)

# Random initialization
M0 = CPD(ones(T, r), rand.(T, size(X), r))
M0norm = sqrt(sum(abs2, M0[I] for I in CartesianIndices(size(M0))))
Comment from alexmul1114 (Contributor Author):

Added CUDA as an extension, where the extension has a gcp definition for CuArray input. Right now M0 is created and normalized on the CPU and then moved to the GPU for ALS (and moved back to the CPU to be returned at the end). Need to figure out how to rewrite line 24 without scalar indexing so that M0 can be created directly as a CuArray.
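One standard way to avoid the scalar indexing in the M0norm line mentioned above is the CP norm identity: for M = CPD(λ, U), ||M||^2 = λ' * (U[1]'U[1] .* ... .* U[N]'U[N]) * λ, which needs only matrix products and an elementwise (Hadamard) product and therefore also works on CuArray factors. A minimal sketch under that assumption; the helper name is hypothetical and this is not code from the PR:

# Hypothetical sketch: CPD norm via factor Gram matrices, no scalar indexing
function cpd_norm_sketch(λ, U)
    G = reduce(.*, Uk'Uk for Uk in U)   # Hadamard product of the r × r Gram matrices
    return sqrt(λ' * G * λ)
end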

Xnorm = sqrt(mapreduce(x -> isnan(x) ? 0 : abs2(x), +, X, init=0f0))
for k in Base.OneTo(N)
M0.U[k] .*= (Xnorm / M0norm)^(1 / N)
end
λ, U = M0.λ, collect(M0.U)

# Move λ, U to gpu
λ = CuArray(λ)
U = [CuArray(U[i]) for i in 1:N]

# Inefficient but simple implementation
for _ in 1:algorithm.maxiters
for n in 1:N
V = reduce(.*, U[i]'U[i] for i in setdiff(1:N, n))
U[n] = GCPDecompositions.mttkrp(X, U, n) / V
λ = CuArray(CUDA.norm.(eachcol(U[n])))
Comment from alexmul1114 (Contributor Author), Jan 18, 2024:

Looks like line 39 here is much slower when the data has to be moved between devices:

julia> @btime norm.(eachcol(U_cpu[n]))
  1.408 μs (4 allocations: 192 bytes)
10-element Vector{Float32}:
 1.941741
 2.0177264
 1.904886
 1.636472
 1.4084961
 0.9433015
 1.0931745
 1.6607362
 1.273025
 1.8184978

julia> @btime CUDA.@sync CuArray(norm.(eachcol(U_gpu[n])))
  440.045 μs (173 allocations: 7.38 KiB)
10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.9417399
 2.0177276
 1.9048846
 1.6364713
 1.408497
 0.94329995
 1.0931759
 1.6607368
 1.2730248
 1.8184997

Comment from alexmul1114 (Contributor Author):

Rewriting norm.(eachcol(U[n])) as vec(sqrt.(sum(abs2, U_gpu[n]; dims=1))) prevents the data from being transferred back to the CPU and gives a 5x speedup for the GPU version, with similar time for the CPU:

julia> @btime norm.(eachcol(U_cpu[n]))
  1.364 μs (4 allocations: 192 bytes)
10-element Vector{Float32}:
 1.941741
 2.0177264
 1.904886
 1.636472
 1.4084961
 0.9433015
 1.0931745
 1.6607362
 1.273025
 1.8184978

julia> @btime vec(sqrt.(sum(abs2, U_cpu[n]; dims=1)))
  1.212 μs (6 allocations: 304 bytes)
10-element Vector{Float32}:
 1.9417411
 2.0177264
 1.904886
 1.6364719
 1.408496
 0.9433015
 1.0931743
 1.6607362
 1.273025
 1.8184978

julia> @btime CUDA.@sync CuArray(norm.(eachcol(U_gpu[n])))
  368.776 μs (173 allocations: 7.38 KiB)
10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.9417399
 2.0177276
 1.9048846
 1.6364713
 1.408497
 0.94329995
 1.0931759
 1.6607368
 1.2730248
 1.8184997

julia> @btime CUDA.@sync vec(sqrt.(sum(abs2, U_gpu[n]; dims=1)))
  67.685 μs (112 allocations: 5.52 KiB)
10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.9417399
 2.0177276
 1.9048846
 1.6364713
 1.408497
 0.9432999
 1.093176
 1.6607368
 1.2730248
 1.8184998

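# Sketch (not part of this diff): the rewrite proposed in the comment above would
# keep λ on the GPU by replacing the CuArray(CUDA.norm.(eachcol(U[n]))) line with
#     λ = vec(sqrt.(sum(abs2, U[n]; dims=1)))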
U[n] = U[n] ./ permutedims(λ)
end
end

return CPD(Array(λ), Tuple([Array(U[i]) for i in 1:N]))
end

end
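With the extension in place, gcp can be called directly on a CuArray and, per the return statement above, hands back a CPD whose λ and U have been copied to the CPU. A hypothetical usage sketch (sizes, rank, and the random data are made up; loss and algorithm are the defaults from the method signature):

using GCPDecompositions, CUDA

X = CUDA.rand(Float32, 50, 40, 30)   # data tensor already on the GPU (illustrative)
M = gcp(X, 5)                        # dispatches to the CuArray method from CUDAExt
M.λ, M.U                             # returned factors are plain CPU Arrays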
1 change: 1 addition & 0 deletions src/GCPDecompositions.jl
@@ -39,6 +39,7 @@ include("gcp-opt.jl")

if !isdefined(Base, :get_extension)
include("../ext/LossFunctionsExt.jl")
include("../ext/CUDAExt.jl")
end

end
53 changes: 41 additions & 12 deletions src/gcp-opt.jl
@@ -150,8 +150,8 @@ function _gcp(
constraints::Tuple{},
algorithm::GCPAlgorithms.ALS,
) where {TX<:Real,N}
- T = promote_type(TX, Float64)

#T = promote_type(TX, Float64)
T = promote_type(TX, Float32)
# Random initialization
M0 = CPD(ones(T, r), rand.(T, size(X), r))
M0norm = sqrt(sum(abs2, M0[I] for I in CartesianIndices(size(M0))))
@@ -174,21 +174,50 @@ return CPD(λ, Tuple(U))
return CPD(λ, Tuple(U))
end

- # inefficient but simple
"""
mttkrp(X, U, n) -> Rn

Algorithm for computing one mode of MTTKRP is from "Fast Alternating LS Algorithms
for High Order CANDECOMP/PARAFAC Tensor Factorizations" by Phan et al., specifically
section III-B.
"""
function mttkrp(X, U, n)

# Dimensions
N, I, r = length(U), Tuple(size.(U, 1)), (only∘unique)(size.(U, 2))
(N == ndims(X) && I == size(X)) || throw(DimensionMismatch("`X` and `U` do not have matching dimensions"))

- # Matricized tensor (in mode n)
- Xn = reshape(permutedims(X, [n; setdiff(1:N, n)]), size(X, n), :)

- # Khatri-Rao product (in mode n)
- Zn = similar(U[1], prod(I[setdiff(1:N, n)]), r)
- for j in 1:r
- Zn[:, j] = reduce(kron, [view(U[i], :, j) for i in reverse(setdiff(1:N, n))])
# See section III-B from "Fast Alternating LS Algorithms for High Order CANDECOMP/PARAFAC Tensor Factorizations" by Phan et al.
Rn = similar(U[n])
Jn = prod(size(X)[1:n])
Kn = prod(size(X)[n+1:end])

# Special cases are n = 1 and n = N (n = 1 has no outer tensor-vector products,
# n = N has no inner tensor-vector products)
if n == 1
# Just inner tensor-vector products
kr_inner = khatrirao(U[reverse(2:N)]...)
mul!(Rn, reshape(X, size(X, 1), :), kr_inner)
elseif n == N
# Just outer tensor-vector products
kr_outer = khatrirao(U[reverse(1:N-1)]...)
mul!(Rn, transpose(reshape(X, prod(size(X)[1:N-1]), size(X)[N])), kr_outer)
else
kr_inner = khatrirao(U[reverse(n+1:N)]...)
kr_outer = khatrirao(U[reverse(1:n-1)]...)
inner = reshape(reshape(X, Jn, Kn) * kr_inner, (size(X)[1:n]..., r))
Jn_inner = prod(size(inner)[1:n-1])
Kn_inner = prod(size(inner)[n:end-1])
Rn = reduce(hcat, [copy(transpose(reshape(selectdim(inner, ndims(inner), j), Jn_inner, Kn_inner))) * kr_outer[:, j] for j in 1:r])
end
return Rn
end

- # MTTKRP (in mode n)
- return Xn * Zn
function khatrirao(A::Vararg{T,N}) where {T<:AbstractMatrix,N}
r = size(A[1],2)
R = ntuple(Val(N)) do k
dims = (ntuple(i->1,Val(N-k))..., :, ntuple(i->1,Val(k-1))..., r)
return reshape(A[k],dims)
end
return reshape(broadcast(*, R...),:,r)
end
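The new mttkrp is expected to agree with the naive definition it replaces, i.e. the mode-n matricization of X times the Khatri-Rao product of the remaining factors in reverse order. A small consistency check along those lines, written as a sketch (mttkrp and khatrirao are internal functions, hence the module-qualified names; sizes and rank are made up):

using GCPDecompositions

X = randn(10, 20, 30)
U = [randn(10, 5), randn(20, 5), randn(30, 5)]
n = 2

# Naive reference: mode-n unfolding times the Khatri-Rao product of the other factors
Xn = reshape(permutedims(X, [n; setdiff(1:3, n)]), size(X, n), :)
Zn = GCPDecompositions.khatrirao(U[reverse(setdiff(1:3, n))]...)
Rn_ref = Xn * Zn

Rn = GCPDecompositions.mttkrp(X, U, n)
Rn ≈ Rn_ref   # expected to hold up to floating-point roundoff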