diff --git a/btas/btas.h b/btas/btas.h
index 53a9b008..9c4fac5f 100644
--- a/btas/btas.h
+++ b/btas/btas.h
@@ -9,9 +9,11 @@
 #include
 #include
+//#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/btas/generic/cp.h b/btas/generic/cp.h
index 384b6e08..e59be5f8 100644
--- a/btas/generic/cp.h
+++ b/btas/generic/cp.h
@@ -571,7 +571,7 @@ namespace btas {

     /// \param[in] Mat Calculates the 2-norm of the matrix Mat
     /// \return the 2-norm.
-    auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, Mat))); }
+    auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, btas::impl::conj(Mat)))); }

     /// SVD referencing code from
     /// http://www.netlib.org/lapack/explore-html/de/ddd/lapacke_8h_af31b3cb47f7cc3b9f6541303a2968c9f.html
diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 39698e2d..592af8ba 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -339,6 +339,7 @@ namespace btas {
     Tensor &tensor_ref;        // Tensor to be decomposed
     ord_t size;                // Total number of elements
     bool factors_set = false;  // Are the factors preset (not implemented yet).
+    std::vector<std::vector<int64_t>> pivs;  // Per-mode column pivots from pivoted QR

     /// Creates an initial guess by computing the SVD of each mode
     /// If the rank of the mode is smaller than the CP rank requested
@@ -658,7 +659,82 @@ namespace btas {
                 double lambda = 0) {
       Tensor temp(A[n].extent(0), rank);
       Tensor an(A[n].range());
-
+      // Testing the code to see if pivoted QR can help
+      if (false) {
+        // First create a pivot matrix from the flattened tensor_ref
+        auto f = flatten(tensor_ref, n);
+        auto square_dim = f.extent(0), full = f.extent(1);
+        auto scale_factor = double(full) / double(square_dim);
+        auto extended = (scale_factor > 2.0 ? square_dim * (scale_factor - 1.0) : full);
+        std::vector<int64_t> piv;
+        if (pivs.size() < (n + 1)) {
+          piv = std::vector<int64_t>(f.extent(1));
+          std::vector<typename Tensor::value_type> tau(f.extent(1));
+          btas::geqp3_pivot(blas::Layout::RowMajor, f.extent(0), f.extent(1), f.data(), f.extent(1), piv.data(),
+                            tau.data());
+          Tensor R(full, square_dim);
+          R.fill(0.0);
+          f = flatten(tensor_ref, n);
+          pivs.emplace_back(piv);
+        } else {
+          piv = pivs[n];
+        }
+
+        auto K = this->generate_KRP(n, rank, true);
+        // For reference, compute the matricized tensor times Khatri-Rao product
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, f, K, this->zero, temp);
+        detail::set_MtKRP(converge_test, temp);
+
+        Tensor Fp(square_dim, square_dim);
+        {
+          Tensor t(square_dim, full);
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < full; ++j) {
+              int v = pivs[n][j];
+              t(i, j) = f(i, v);
+            }
+          }
+
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < square_dim; ++j) {
+              Fp(i, j) = t(i, j);
+            }
+          }
+        }
+
+        Tensor Kp(square_dim, rank);
+        {
+          Tensor t(full, rank);
+          for (auto j = 0; j < full; ++j) {
+            int v = pivs[n][j];
+            for (auto r = 0; r < rank; ++r) {
+              t(j, r) = K(v, r);
+            }
+          }
+
+          for (auto j = 0; j < square_dim; ++j) {
+            for (auto r = 0; r < rank; ++r) {
+              Kp(j, r) = t(j, r);
+            }
+          }
+        }
+
+        // Contract the product from above with the pseudoinverse
+        // to produce an optimized factor matrix
+        fast_pI = false;
+        auto pInv = pseudoInverse(Kp, fast_pI);
+
+        Tensor t;
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, Fp, pInv, this->zero, temp);
+
+        // Compute the difference between this new factor matrix and the
+        // previous iteration
+        this->normCol(temp);
+
+        // Replace the old factor matrix with the new optimized result
+        A[n] = temp;
+        return;
+      }
 #ifdef BTAS_HAS_INTEL_MKL
       // Computes the Khatri-Rao product intermediate
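Note on the disabled pivoted-QR path above: the idea is to use LAPACK's column-pivoted QR (geqp3) to rank the columns of the flattened tensor and keep only the square_dim most linearly independent ones, shrinking the least-squares update to a square system. Below is a minimal standalone sketch of that column-ranking step, assuming the same lapackpp wrapper the patch calls through lapack::geqp3; the matrix values are invented for illustration:

    #include <lapack.hh>  // lapackpp
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // Column-major 4x3 matrix; the third column is numerically tiny,
      // so column-pivoted QR should order it last.
      int64_t m = 4, n = 3;
      std::vector<double> A = {1, 0, 0, 0,                // column 0
                               0, 1, 0, 0,                // column 1
                               1e-10, 1e-10, 1e-10, 0};   // column 2 (tiny)
      std::vector<int64_t> jpvt(n, 0);  // 0 marks a column as free to pivot
      std::vector<double> tau(std::min(m, n));
      int64_t info = lapack::geqp3(m, n, A.data(), m, jpvt.data(), tau.data());
      // On exit jpvt is 1-based: jpvt[j] is the original index of the column
      // permuted into position j; the tiny column should be reported last.
      if (info == 0)
        for (auto p : jpvt) std::cout << p << " ";
      std::cout << std::endl;
      return 0;
    }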
diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
new file mode 100644
index 00000000..afa69ed0
--- /dev/null
+++ b/btas/generic/cp_bcd.h
@@ -0,0 +1,313 @@
+//
+// Created by Karl Pierce on 4/29/23.
+//
+
+#ifndef BTAS_GENERIC_CP_BCD_H
+#define BTAS_GENERIC_CP_BCD_H
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace btas {
+  /** \brief Computes the Canonical Product (CP) decomposition of an order-N
+      tensor using block coordinate descent (BCD).
+
+      This computes the CP decomposition of btas::Tensor objects with row
+      major storage only, with fixed (compile-time) and variable (run-time)
+      ranks. Also provides Tucker and randomized Tucker-like compressions coupled
+      with CP-BCD decomposition. Does not support strided ranges.
+
+      \warning this code takes a non-const reference \c tensor_ref but does
+      not modify the values. This is a result of the API (reshape needs a non-const tensor).
+
+      Synopsis:
+      \code
+      // Constructors
+      CP_BCD A(tensor)                            // CP_BCD object with empty factor
+                                                  // matrices and no symmetries
+      CP_BCD A(tensor, symms)                     // CP_BCD object with empty factor
+                                                  // matrices and symmetries
+
+      // Operations
+      A.compute_rank(rank, converge_test)         // Computes the CP_BCD of tensor to
+                                                  // rank; rank build and HOSVD options
+
+      A.compute_rank_random(rank, converge_test)  // Computes the CP_BCD of tensor to
+                                                  // rank. Factor matrices built at rank
+                                                  // with random numbers
+
+      A.compute_error(converge_test, omega)       // Computes the CP_BCD of tensor to
+                                                  // 2-norm error < omega
+
+      // See documentation for full range of options
+
+      // Accessing factor matrices
+      A.get_factor_matrices()                     // Returns a vector of factor matrices, if
+                                                  // they have been computed
+
+      A.reconstruct()                             // Returns the tensor computed using the
+                                                  // CP factor matrices
+      \endcode
+  */
+  template <typename Tensor, class ConvClass = NormCheck<Tensor>>
+  class CP_BCD : public CP_ALS<Tensor, ConvClass> {
+   public:
+    using T = typename Tensor::value_type;
+    using RT = real_type_t<T>;
+    using RTensor = rebind_tensor_t<Tensor, RT>;
+
+    using CP<Tensor, ConvClass>::A;
+    using CP<Tensor, ConvClass>::ndim;
+    using CP<Tensor, ConvClass>::symmetries;
+    using typename CP<Tensor, ConvClass>::ind_t;
+    using typename CP<Tensor, ConvClass>::ord_t;
+    using CP_ALS<Tensor, ConvClass>::tensor_ref;
+    using CP_ALS<Tensor, ConvClass>::size;
+
+    /// Create a CP BCD object, child class of the CP object,
+    /// that stores the reference tensor.
+    /// Reference tensor has no symmetries.
+    /// \param[in] tensor the reference tensor to be decomposed.
+    /// \param[in] block_size number of factor-matrix columns optimized in each block.
+    /// \param[in] sweeps number of BCD sweeps over all blocks.
+    /// \param[in] bs optional explicit list of block boundaries.
+    CP_BCD(Tensor &tensor, ind_t block_size = 1, size_t sweeps = 1,
+           std::vector<size_t> bs = std::vector<size_t>())
+        : CP_ALS<Tensor, ConvClass>(tensor), blocksize(block_size), nsweeps(sweeps) {
+      if (!bs.empty()) {
+        block_sizes = bs;
+      }
+    }
+
+    /// Create a CP BCD object, child class of the CP object,
+    /// that stores the reference tensor.
+    /// Reference tensor has symmetries.
+    /// Symmetries should be set such that the higher mode indices
+    /// are set equal to lower mode indices (a 4th order tensor
+    /// where the second & third modes are equal would have
+    /// symmetries of {0,1,1,3}).
+    /// \param[in] tensor the reference tensor to be decomposed.
+    /// \param[in] symms the symmetries of the reference tensor.
+    /// \param[in] block_size number of factor-matrix columns optimized in each block.
+    /// \param[in] sweeps number of BCD sweeps over all blocks.
+    CP_BCD(Tensor &tensor, std::vector<size_t> &symms, ind_t block_size = 1, size_t sweeps = 1)
+        : CP_ALS<Tensor, ConvClass>(tensor, symms), blocksize(block_size), nsweeps(sweeps) {
+    }
+
+    CP_BCD() = default;
+
+    ~CP_BCD() = default;
+
+    void set_block_sizes(size_t rank, std::vector<size_t> bs) {
+      BTAS_ASSERT(bs[bs.size() - 1] == rank);
+      block_sizes = bs;
+      return;
+    }
+
+   protected:
+    Tensor gradient, block_ref;     // Stores the gradient of BCD for fitting
+    ind_t blocksize;                // Size of block gradient
+    std::vector<Tensor> blockfactors;
+    std::vector<size_t> order;      // Convenient to hold the mode order for reconstruct
+    long z = 0;
+    size_t nsweeps;
+    std::vector<size_t> block_sizes;
+
+    /// Computes the CP decomposition using ALS to minimize the loss function for fixed rank \p rank
+    /// \param[in] rank The rank of the CP decomposition.
+    /// \param[in,out] converge_test Test to see if ALS is converged, holds the value of fit.
+    /// \param[in] dir Should the CP decomposition be computed without calculating the
+    /// Khatri-Rao product?
+    /// \param[in] max_als If the CP decomposition is computed to finite
+    /// error, max_als is the highest rank approximation computed before giving up
+    /// on CP-ALS. Default = 1e5.
+    /// \param[in] calculate_epsilon Should the 2-norm
+    /// error be calculated: ||T_exact - T_approx|| = epsilon.
+    /// \param[in,out] epsilon The 2-norm
+    /// error between the exact and approximated reference tensor.
+    /// \param[in,out] fast_pI Whether the pseudoinverse should be computed using a fast Cholesky decomposition;
+    /// on return \c fast_pI will be true if use of Cholesky was successful.
+    virtual void ALS(ind_t rank, ConvClass &converge_test, bool dir, int max_als, bool calculate_epsilon,
+                     double &epsilon, bool &fast_pI) {
+      for (auto i = 0; i < ndim; ++i) {
+        order.emplace_back(i);
+      }
+
+      block_ref = tensor_ref;
+
+      // Number of full blocks with rank blocksize,
+      // plus one more block with rank: rank % blocksize
+      int n_full_blocks = int(rank / blocksize),
+          last_blocksize = rank % blocksize;
+
+      if (block_sizes.empty()) {
+        size_t index = 0;
+        block_sizes.emplace_back(index);
+        for (size_t i = 0; i < n_full_blocks; ++i) {
+          index += blocksize;
+          block_sizes.emplace_back(index);
+        }
+        if (last_blocksize != 0) block_sizes.emplace_back(index + last_blocksize);
+      }
+      // Copy all of A to blockfactors, then take slices of A;
+      // that way we can leverage all of `CP_ALS`'s code without modification.
+      blockfactors = A;
+      gradient = tensor_ref;
+      // This stores the factors from rank 0 to #blocks * blocksize.
+      // Compute the BCD of all the full blocks.
+      auto matlab = true;
+      auto one_over_tref = 1.0 / this->norm(tensor_ref);
+      auto fit = 1.0, change = 0.0;
+      bool compute_full_fit = false;
+      size_t n_blocks = block_sizes.size();
+      for (auto s = 0; s < nsweeps; ++s) {
+        long block_step = 0;
+        this->AtA = std::vector<Tensor>(ndim);
+
+        // E - A1 - A2 - ... = gradient
+        // initially: gradient = E
+        // then: gradient = E - A1
+        for (long b = 1; b < n_blocks; ++b) {
+          this->AtA = std::vector<Tensor>(ndim);
+          BCD(block_sizes[b - 1], block_sizes[b], max_als, fast_pI, matlab, converge_test, s);
+          std::cout << "\tnorm (remainder) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+          // Test the system to see if it converged. Doing it the hard way first:
+          // to compute the accuracy fully, compute the CP approximation using blocks 0 through b.
+          if (compute_full_fit) {
+            compute_full(block_step + blocksize, one_over_tref, change, fit);
+          }
+        }
+//        if (last_blocksize != 0) {
+//          std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+//          this->AtA = std::vector<Tensor>(ndim);
+//          block_step = n_full_blocks * blocksize;
+//          BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s);
+//          if (compute_full_fit) {
+//            compute_full(block_step + last_blocksize, one_over_tref, change, fit);
+//          }
+//        }
+        //std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+        epsilon = (compute_full_fit == false
+                       ? 1.0 - this->norm(gradient) * one_over_tref
+                       : fit);
+        std::cout << epsilon << "\n" << std::endl;
+      }
+      detail::set_norm(converge_test, this->norm(tensor_ref));
+      converge_test.verbose(true);
+      A = blockfactors;
+      this->AtA = std::vector<Tensor>(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = A[i];
+        contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+      }
+      gradient = reconstruct(A, order, A[ndim]);
+      std::cout << norm(tensor_ref - gradient) * one_over_tref << std::endl;
+    }
+
+    void copy_blocks(Tensor &to, const Tensor &from, ind_t block_start, ind_t block_end) {
+      auto tref_dim = to.extent(0);
+      auto to_rank = to.extent(1), from_rank = from.extent(1);
+
+      auto to_ptr = to.data();
+      auto from_ptr = from.data();
+      ind_t from_pos = 0;
+      for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += to_rank) {
+        for (auto b = block_start; b < block_end; ++b, ++from_pos) {
+          *(to_ptr + skip + b) = *(from_ptr + from_pos);
+        }
+      }
+    }
+
+    void compute_full(long block_end, T one_over_tref, T &change, T &fit) {
+      std::vector<Tensor> current_grads(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = blockfactors[i];
+        auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_end};
+        current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
+      }
+
+      auto temp = reconstruct(current_grads, order, blockfactors[ndim]);
+      auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref;
+      change = abs(fit - newfit);
+      fit = newfit;
+      std::cout << block_end << "\t";
+      std::cout << fit << "\t" << change << std::endl;
+    }
+
+    /// Computes optimized factor matrices for one block of columns,
+    /// holding all other blocks constant.
+    /// No Khatri-Rao product computed, immediate contraction.
+    /// \param[in] block_start First column of the block being optimized.
+    /// \param[in] block_end One past the last column of the block being optimized.
+    /// \param[in] max_als Maximum number of ALS iterations for this block.
+    /// \param[in,out] fast_pI Should the pseudoinverse be computed using a fast Cholesky decomposition;
+    /// on return, whether computing the \c fast_pI was successful.
+    /// \param[in,out] matlab If \c fast_pI = true, try to solve VA = B instead of taking the pseudoinverse,
+    /// in the same manner that MATLAB would compute the inverse. If this fails, the variable will be
+    /// set to false and SVD will be used; on return, whether \c matlab was successful.
+    /// \param[in,out] converge_test Test to see if the block ALS is converged; holds the value of fit.
+    /// \param[in] sweep The current BCD sweep number.
+    /// \param[in] lambda Regularization parameter. Default = 0.
+    void BCD(ind_t block_start, ind_t block_end, size_t max_als,
+             bool &fast_pI, bool &matlab, ConvClass &converge_test, size_t sweep, double lambda = 0.0) {
+      // Take the b-th block of the factor matrix and
+      // find the partial Gramian for the blocked factors
+      auto cur_block_size = block_end - block_start;
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = A[i];
+        auto lower = {z, block_start}, upper = {long(a_mat.extent(0)), block_end};
+        a_mat = make_view(blockfactors[i].range().slice(lower, upper), blockfactors[i].storage());
+        contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+      }
+      A[ndim] = Tensor(cur_block_size);
+      A[ndim].fill(0.0);
+      if (sweep != 0) {
+        size_t c = 0;
+        auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data();
+        for (auto b = block_start; b < block_end; ++b, ++c)
+          *(new_lam_ptr + c) = *(lam_full_ptr + b);
+
+        gradient += reconstruct(A, order, A[ndim]);
+      }
+      //std::cout << "\t\tnorm(remainder + hat{T}_block) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+      // Do the ALS loop.
+      //CP_ALS<Tensor, ConvClass>::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI);
+      // First do it manually, so we know it's right:
+      // compute the ALS of the b-th block against the gradient,
+      // computed by subtracting the previous blocks from the reference.
+      size_t count = 0;
+      bool is_converged = false;
+      detail::set_norm(converge_test, this->norm(gradient));
+      converge_test.verbose(false);
+      do {
+        ++count;
+        this->num_ALS++;
+
+        for (size_t i = 0; i < ndim; i++) {
+          this->direct(i, cur_block_size, fast_pI, matlab, converge_test, gradient);
+          auto &ai = A[i];
+          contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+        }
+
+        is_converged = converge_test(A, this->AtA);
+      } while (count < max_als && !is_converged);
+      // Compute the new difference
+      gradient -= reconstruct(A, order, A[ndim]);
+
+      // Copy the block computed in A to the correct portion of blockfactors
+      // (this will replace A at the end)
+      for (size_t i = 0; i < ndim; ++i) {
+        copy_blocks(blockfactors[i], A[i], block_start, block_end);
+      }
+      // Copy the lambda values into the correct location in blockfactors
+      size_t c = 0;
+      auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data();
+      for (auto b = block_start; b < block_end; ++b, ++c) {
+        *(lam_full_ptr + b) = *(new_lam_ptr + c);
+      }
+    }
+  };
+}  // namespace btas
+
+#endif  // BTAS_GENERIC_CP_BCD_H
diff --git a/btas/generic/cp_rals.h b/btas/generic/cp_rals.h
index 937c9061..a3d6ef08 100644
--- a/btas/generic/cp_rals.h
+++ b/btas/generic/cp_rals.h
@@ -174,7 +174,7 @@ namespace btas {
         } else if (dir) {
           this->direct(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]);
         } else {
-          update_w_KRP(i, rank, fast_pI, matlab, converge_test, lambda[i]);
+          update_w_KRP(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]);
         }
         // Compute the value s after normalizing the columns
         auto &ai = A[i];
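For context on the helper added next: geqp3_pivot mirrors the getrf wrapper above it in the same header — a row-major input is transposed into scratch storage, factored in column-major with lapack::geqp3, and transposed back on success. A hypothetical call site (function name and matrix values invented for illustration):

    #include <btas/generic/lapack_extensions.h>

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Column-pivoted QR of a row-major 3x5 matrix; piv receives the
    // 1-based column permutation chosen by geqp3.
    void pivoted_qr_rowmajor() {
      int64_t m = 3, n = 5;
      std::vector<double> A(m * n, 1.0);  // row-major, leading dimension n
      A[2] = 100.0;                       // row 0, column 2: dominant entry
      std::vector<int64_t> piv(n, 0);     // 0 = column is free to pivot
      std::vector<double> tau(std::min(m, n));
      auto info = btas::geqp3_pivot(blas::Layout::RowMajor, m, n, A.data(), n,
                                    piv.data(), tau.data());
      // info == 0 on success; A then holds the QR factors of A*P (row-major).
    }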
diff --git a/btas/generic/lapack_extensions.h b/btas/generic/lapack_extensions.h
index cb1560e6..bc4ecea0 100644
--- a/btas/generic/lapack_extensions.h
+++ b/btas/generic/lapack_extensions.h
@@ -40,6 +40,32 @@ int64_t getrf( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,

 }

+template <typename T, typename Alloc = std::allocator<T>>
+int64_t geqp3_pivot( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,
+                     int64_t* IPIV, T* tau, Alloc alloc = Alloc() ) {
+
+  if( order == blas::Layout::ColMajor ) {
+    return lapack::geqp3( M, N, A, LDA, IPIV, tau );
+  } else {
+
+    // Transpose input
+    auto* A_transpose = alloc.allocate(M*N);
+    transpose( N, M, A, LDA, A_transpose, M );
+
+    // A -> QR with column pivoting
+    auto info = lapack::geqp3( M, N, A_transpose, M, IPIV, tau );
+
+    // Transpose output + cleanup
+    if(!info)
+      transpose( M, N, A_transpose, M, A, LDA );
+    alloc.deallocate( A_transpose, M*N );
+
+    return info;
+  }
+
+}
+
 template <typename T, typename Alloc = std::allocator<T>,
           typename IntAlloc = std::allocator<int64_t> >
 int64_t gesv( blas::Layout order, int64_t N, int64_t NRHS, T* A, int64_t LDA,
diff --git a/external/versions.cmake b/external/versions.cmake
index e738e91b..caa3e428 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -1,4 +1,4 @@
-set(BTAS_TRACKED_VGCMAKEKIT_TAG e0d04e91a84b7e71d9b87682c46c518e9966bd78)
+set(BTAS_TRACKED_VGCMAKEKIT_TAG 9b541a5f3708a58dce59d00f3c47ac030ef4d8b4)

 # likely can use earlier, but
 # - as of oct 2023 tested with 1.71 and up only
diff --git a/unittest/tensor_cp_test.cc b/unittest/tensor_cp_test.cc
index beb097f4..815c521e 100644
--- a/unittest/tensor_cp_test.cc
+++ b/unittest/tensor_cp_test.cc
@@ -20,6 +20,7 @@ TEST_CASE("CP")
   using conv_class = btas::FitCheck<tensor>;
   using conv_class_coupled = btas::CoupledFitCheck<tensor>;
   using btas::CP_ALS;
+  using btas::CP_BCD;
   using btas::CP_RALS;
   using btas::CP_DF_ALS;
   using btas::COUPLED_CP_ALS;
@@ -374,7 +375,28 @@ TEST_CASE("CP")
       double diff = 1.0 - A1.compute_error(conv_coupled, 1e-2, 1, 19);
       CHECK(std::abs(diff - results(38,0)) <= epsilon);
     }
-    */
+    */
+  }
+  // Block coordinate descent tests
+  {
+    SECTION("BCD MODE = 3, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D3, 9);
+      conv.set_norm(norm3);
+      double diff = A1.compute_rank_random(13, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 4, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D4, 37);
+      conv.set_norm(norm4);
+      double diff = A1.compute_rank_random(55, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 5, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D5, 26);
+      conv.set_norm(norm5);
+      double diff = A1.compute_rank_random(60, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+  }
 }
 #endif
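For orientation, here is how the new tests drive CP_BCD end to end. This is a sketch, not documented API: the default-constructed FitCheck tolerance and the third argument of compute_rank_random are mirrored from the test calls above and should be treated as assumptions.

    #include <btas/btas.h>

    // D3 is an order-3 reference tensor and norm3 its 2-norm, both supplied
    // by the caller (the unit tests set these up earlier in the test case).
    void cp_bcd_sketch(btas::Tensor<double> &D3, double norm3) {
      using tensor = btas::Tensor<double>;
      using conv_class = btas::FitCheck<tensor>;

      conv_class conv;       // fit-based convergence test, default tolerance
      conv.set_norm(norm3);  // reference norm used when evaluating the fit

      // Decompose D3 to rank 13, optimizing one 9-column block at a time.
      btas::CP_BCD<tensor, conv_class> A1(D3, /*block_size=*/9);
      double diff = A1.compute_rank_random(13, conv, 100);  // args mirror the tests
      // diff ~ 0 when the blockwise CP approximation reproduces D3.
    }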