diff --git a/Project.toml b/Project.toml
index 8a69dce..4ad2d0b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["David Hong <hong@udel.edu> and contributors"]
 version = "0.1.2"
 
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
@@ -13,12 +14,15 @@ LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
 
 [extensions]
+CUDAExt = "CUDA"
 LossFunctionsExt = "LossFunctions"
 
 [compat]
+CUDA = ">= 4.4.1"
 Compat = "3.42, 4"
 ForwardDiff = "0.10.36"
 IntervalSets = "0.7.7"
@@ -27,3 +31,6 @@ LinearAlgebra = "1.6"
 LossFunctions = "0.11.1"
 Random = "1.6"
 julia = "1.6"
+
+[extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
\ No newline at end of file
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
new file mode 100644
index 0000000..2e7e99d
--- /dev/null
+++ b/ext/CUDAExt.jl
@@ -0,0 +1,47 @@
+module CUDAExt
+
+using GCPDecompositions, CUDA
+
+GCPDecompositions.gcp(
+    X::CuArray,
+    r,
+    loss = LeastSquaresLoss();
+    constraints = (),
+    algorithm = GCPAlgorithms.ALS(),
+) = _gcp(X, r, loss, constraints, algorithm)
+function _gcp(
+    X::CuArray{TX,N},
+    r,
+    loss::LeastSquaresLoss,
+    constraints::Tuple{},
+    algorithm::GCPAlgorithms.ALS,
+) where {TX<:Real,N}
+    T = promote_type(TX, Float32)
+
+    # Random initialization
+    M0 = CPD(ones(T, r), rand.(T, size(X), r))
+    M0norm = sqrt(sum(abs2, M0[I] for I in CartesianIndices(size(M0))))
+    Xnorm = sqrt(mapreduce(x -> isnan(x) ? 0 : abs2(x), +, X, init=0f0))
+    for k in Base.OneTo(N)
+        M0.U[k] .*= (Xnorm / M0norm)^(1 / N)
+    end
+    λ, U = M0.λ, collect(M0.U)
+
+    # Move λ, U to gpu
+    λ = CuArray(λ)
+    U = [CuArray(U[i]) for i in 1:N]
+
+    # Inefficient but simple implementation
+    for _ in 1:algorithm.maxiters
+        for n in 1:N
+            V = reduce(.*, U[i]'U[i] for i in setdiff(1:N, n))
+            U[n] = GCPDecompositions.mttkrp(X, U, n) / V
+            λ = vec(sqrt.(sum(abs2, U[n]; dims=1)))
+            U[n] = U[n] ./ permutedims(λ)
+        end
+    end
+
+    return CPD(Array(λ), Tuple([Array(U[i]) for i in 1:N]))
+end
+
+end
\ No newline at end of file
diff --git a/src/GCPDecompositions.jl b/src/GCPDecompositions.jl
index 05c06b7..876c957 100644
--- a/src/GCPDecompositions.jl
+++ b/src/GCPDecompositions.jl
@@ -24,6 +24,7 @@ include("gcp-algorithms.jl")
 
 if !isdefined(Base, :get_extension)
     include("../ext/LossFunctionsExt.jl")
+    include("../ext/CUDAExt.jl")
 end
 
 # Main fitting function
@@ -113,7 +114,8 @@ default_init(X, r, loss, constraints, algorithm) =
 function default_init(rng, X, r, loss, constraints, algorithm)
     # Generate CPD with random factors
     T, N = nonmissingtype(eltype(X)), ndims(X)
-    T = promote_type(T, Float64)
+    #T = promote_type(T, Float64)
+    T = promote_type(T, Float32)
     M = CPD(ones(T, r), rand.(rng, T, size(X), r))
 
     # Normalize
diff --git a/src/gcp-algorithms/als.jl b/src/gcp-algorithms/als.jl
index 082e0c1..9b56604 100644
--- a/src/gcp-algorithms/als.jl
+++ b/src/gcp-algorithms/als.jl
@@ -34,7 +34,7 @@ function _gcp(
             V = reduce(.*, M.U[i]'M.U[i] for i in setdiff(1:N, n))
             mttkrp!(M.U[n], X, M.U, n, mttkrp_buffers[n])
             rdiv!(M.U[n], lu!(V))
-            M.λ .= norm.(eachcol(M.U[n]))
+            M.λ .= vec(sqrt.(sum(abs2, U[n]; dims=1)))
             M.U[n] ./= permutedims(M.λ)
         end
     end