diff --git a/btas/btas.h b/btas/btas.h
index 53a9b008..9c4fac5f 100644
--- a/btas/btas.h
+++ b/btas/btas.h
@@ -9,9 +9,11 @@
 #include
 #include
+//#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/btas/generic/cp.h b/btas/generic/cp.h
index 384b6e08..e59be5f8 100644
--- a/btas/generic/cp.h
+++ b/btas/generic/cp.h
@@ -571,7 +571,7 @@ namespace btas {

     /// \param[in] Mat Calculates the 2-norm of the matrix Mat
     /// \return the 2-norm.
-    auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, Mat))); }
+    auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, btas::impl::conj(Mat)))); }

     /// SVD referencing code from
     /// http://www.netlib.org/lapack/explore-html/de/ddd/lapacke_8h_af31b3cb47f7cc3b9f6541303a2968c9f.html
diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 39698e2d..592af8ba 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -339,6 +339,7 @@ namespace btas {
     Tensor &tensor_ref;        // Tensor to be decomposed
     ord_t size;                // Total number of elements
     bool factors_set = false;  // Are the factors preset (not implemented yet).
+    std::vector<std::vector<int64_t>> pivs;  // Per-mode column pivots from pivoted QR

     /// Creates an initial guess by computing the SVD of each mode
     /// If the rank of the mode is smaller than the CP rank requested
@@ -658,7 +659,82 @@ namespace btas {
                 double lambda = 0) {
       Tensor temp(A[n].extent(0), rank);
       Tensor an(A[n].range());
-
+      // Testing the code to see if pivoted QR can help
+      if (false) {
+        // First create a pivot matrix from the flattened tensor_ref
+        auto f = flatten(tensor_ref, n);
+        auto square_dim = f.extent(0), full = f.extent(1);
+        auto scale_factor = double(full) / double(square_dim);
+        auto extended = (scale_factor > 2.0 ? square_dim * (scale_factor - 1.0) : full);
+        std::vector<int64_t> piv;
+        if (pivs.size() < (n + 1)) {
+          piv = std::vector<int64_t>(f.extent(1));
+          std::vector<typename Tensor::value_type> tau(f.extent(1));
+          btas::geqp3_pivot(blas::Layout::RowMajor, f.extent(0), f.extent(1), f.data(), f.extent(1), piv.data(),
+                            tau.data());
+          Tensor R(full, square_dim);
+          R.fill(0.0);
+          f = flatten(tensor_ref, n);
+          pivs.emplace_back(piv);
+        } else {
+          piv = pivs[n];
+        }
+
+        auto K = this->generate_KRP(n, rank, true);
+        // For reference, compute the matricized tensor times Khatri-Rao product
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, f, K, this->zero, temp);
+        detail::set_MtKRP(converge_test, temp);
+
+        Tensor Fp(square_dim, square_dim);
+        {
+          Tensor t(square_dim, full);
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < full; ++j) {
+              int v = pivs[n][j];
+              t(i, j) = f(i, v);
+            }
+          }
+
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < square_dim; ++j) {
+              Fp(i, j) = t(i, j);
+            }
+          }
+        }
+
+        Tensor Kp(square_dim, rank);
+        {
+          Tensor t(full, rank);
+          for (auto j = 0; j < full; ++j) {
+            int v = pivs[n][j];
+            for (auto r = 0; r < rank; ++r) {
+              t(j, r) = K(v, r);
+            }
+          }
+
+          for (auto j = 0; j < square_dim; ++j) {
+            for (auto r = 0; r < rank; ++r) {
+              Kp(j, r) = t(j, r);
+            }
+          }
+        }
+
+        // Contract the product from above with the pseudoinverse
+        // to produce an optimized factor matrix
+        fast_pI = false;
+        auto pInv = pseudoInverse(Kp, fast_pI);
+
+        Tensor t;
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, Fp, pInv, this->zero, temp);
+
+        // Compute the difference between this new factor matrix and the
+        // previous iteration
+        this->normCol(temp);
+
+        // Replace the old factor matrix with the new optimized result
+        A[n] = temp;
+        return;
+      }
 #ifdef BTAS_HAS_INTEL_MKL
       // Computes the Khatri-Rao product intermediate
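Note on the disabled pivoted-QR path above: the idea is to use LAPACK's column-pivoted QR (geqp3) to rank the columns of the flattened tensor and keep only the square_dim most linearly independent ones, shrinking the least-squares update to a square system. Below is a minimal standalone sketch of that column-ranking step, assuming the same lapackpp wrapper the patch calls through lapack::geqp3; the matrix values are invented for illustration:

    #include <lapack.hh>  // lapackpp
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // Column-major 4x3 matrix; the third column is numerically tiny,
      // so column-pivoted QR should order it last.
      int64_t m = 4, n = 3;
      std::vector<double> A = {1, 0, 0, 0,                // column 0
                               0, 1, 0, 0,                // column 1
                               1e-10, 1e-10, 1e-10, 0};   // column 2 (tiny)
      std::vector<int64_t> jpvt(n, 0);  // 0 marks a column as free to pivot
      std::vector<double> tau(std::min(m, n));
      int64_t info = lapack::geqp3(m, n, A.data(), m, jpvt.data(), tau.data());
      // On exit jpvt is 1-based: jpvt[j] is the original index of the column
      // permuted into position j; the tiny column should be reported last.
      if (info == 0)
        for (auto p : jpvt) std::cout << p << " ";
      std::cout << std::endl;
      return 0;
    }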
diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
new file mode 100644
index 00000000..afa69ed0
--- /dev/null
+++ b/btas/generic/cp_bcd.h
@@ -0,0 +1,313 @@
+//
+// Created by Karl Pierce on 4/29/23.
+//
+
+#ifndef BTAS_GENERIC_CP_BCD_H
+#define BTAS_GENERIC_CP_BCD_H
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace btas {
+  /** \brief Computes the Canonical Product (CP) decomposition of an order-N
+      tensor using block coordinate descent (BCD).
+
+      This computes the CP decomposition of btas::Tensor objects with row
+      major storage only, with fixed (compile-time) and variable (run-time)
+      ranks. Also provides Tucker and randomized Tucker-like compressions coupled
+      with CP-BCD decomposition. Does not support strided ranges.
+
+      \warning this code takes a non-const reference \c tensor_ref but does
+      not modify the values. This is a result of the API (reshape needs a non-const tensor).
+
+      Synopsis:
+      \code
+      // Constructors
+      CP_BCD A(tensor)                            // CP_BCD object with empty factor
+                                                  // matrices and no symmetries
+      CP_BCD A(tensor, symms)                     // CP_BCD object with empty factor
+                                                  // matrices and symmetries
+
+      // Operations
+      A.compute_rank(rank, converge_test)         // Computes the CP_BCD of tensor to
+                                                  // rank; rank build and HOSVD options
+
+      A.compute_rank_random(rank, converge_test)  // Computes the CP_BCD of tensor to
+                                                  // rank. Factor matrices built at rank
+                                                  // with random numbers
+
+      A.compute_error(converge_test, omega)       // Computes the CP_BCD of tensor to
+                                                  // 2-norm error < omega
+
+      // See documentation for full range of options
+
+      // Accessing factor matrices
+      A.get_factor_matrices()                     // Returns a vector of factor matrices, if
+                                                  // they have been computed
+
+      A.reconstruct()                             // Returns the tensor computed using the
+                                                  // CP factor matrices
+      \endcode
+  */
+  template <typename Tensor, class ConvClass = NormCheck<Tensor>>
+  class CP_BCD : public CP_ALS<Tensor, ConvClass> {
+   public:
+    using T = typename Tensor::value_type;
+    using RT = real_type_t<T>;
+    using RTensor = rebind_tensor_t<Tensor, RT>;
+
+    using CP<Tensor, ConvClass>::A;
+    using CP<Tensor, ConvClass>::ndim;
+    using CP<Tensor, ConvClass>::symmetries;
+    using typename CP<Tensor, ConvClass>::ind_t;
+    using typename CP<Tensor, ConvClass>::ord_t;
+    using CP_ALS<Tensor, ConvClass>::tensor_ref;
+    using CP_ALS<Tensor, ConvClass>::size;
+
+    /// Create a CP BCD object, child class of the CP object,
+    /// that stores the reference tensor.
+    /// Reference tensor has no symmetries.
+    /// \param[in] tensor the reference tensor to be decomposed.
+    /// \param[in] block_size number of factor-matrix columns optimized in each block.
+    /// \param[in] sweeps number of BCD sweeps over all blocks.
+    /// \param[in] bs optional explicit list of block boundaries.
+    CP_BCD(Tensor &tensor, ind_t block_size = 1, size_t sweeps = 1,
+           std::vector<size_t> bs = std::vector<size_t>())
+        : CP_ALS<Tensor, ConvClass>(tensor), blocksize(block_size), nsweeps(sweeps) {
+      if (!bs.empty()) {
+        block_sizes = bs;
+      }
+    }
+
+    /// Create a CP BCD object, child class of the CP object,
+    /// that stores the reference tensor.
+    /// Reference tensor has symmetries.
+    /// Symmetries should be set such that the higher mode indices
+    /// are set equal to lower mode indices (a 4th order tensor
+    /// where the second & third modes are equal would have
+    /// symmetries of {0,1,1,3}).
+    /// \param[in] tensor the reference tensor to be decomposed.
+    /// \param[in] symms the symmetries of the reference tensor.
+    /// \param[in] block_size number of factor-matrix columns optimized in each block.
+    /// \param[in] sweeps number of BCD sweeps over all blocks.
+    CP_BCD(Tensor &tensor, std::vector<size_t> &symms, ind_t block_size = 1, size_t sweeps = 1)
+        : CP_ALS<Tensor, ConvClass>(tensor, symms), blocksize(block_size), nsweeps(sweeps) {
+    }
+
+    CP_BCD() = default;
+
+    ~CP_BCD() = default;
+
+    void set_block_sizes(size_t rank, std::vector<size_t> bs) {
+      BTAS_ASSERT(bs[bs.size() - 1] == rank);
+      block_sizes = bs;
+      return;
+    }
+
+   protected:
+    Tensor gradient, block_ref;     // Stores the gradient of BCD for fitting
+    ind_t blocksize;                // Size of block gradient
+    std::vector<Tensor> blockfactors;
+    std::vector<size_t> order;      // Convenient to hold the mode order for reconstruct
+    long z = 0;
+    size_t nsweeps;
+    std::vector<size_t> block_sizes;
+
+    /// Computes the CP decomposition using ALS to minimize the loss function for fixed rank \p rank
+    /// \param[in] rank The rank of the CP decomposition.
+    /// \param[in,out] converge_test Test to see if ALS is converged, holds the value of fit.
+    /// \param[in] dir Should the CP decomposition be computed without calculating the
+    /// Khatri-Rao product?
+    /// \param[in] max_als If the CP decomposition is computed to finite
+    /// error, max_als is the highest rank approximation computed before giving up
+    /// on CP-ALS. Default = 1e5.
+    /// \param[in] calculate_epsilon Should the 2-norm
+    /// error be calculated: ||T_exact - T_approx|| = epsilon.
+    /// \param[in,out] epsilon The 2-norm
+    /// error between the exact and approximated reference tensor.
+    /// \param[in,out] fast_pI Whether the pseudoinverse should be computed using a fast Cholesky decomposition;
+    /// on return \c fast_pI will be true if use of Cholesky was successful.
+    virtual void ALS(ind_t rank, ConvClass &converge_test, bool dir, int max_als, bool calculate_epsilon,
+                     double &epsilon, bool &fast_pI) {
+      for (auto i = 0; i < ndim; ++i) {
+        order.emplace_back(i);
+      }
+
+      block_ref = tensor_ref;
+
+      // Number of full blocks with rank blocksize,
+      // plus one more block with rank: rank % blocksize
+      int n_full_blocks = int(rank / blocksize),
+          last_blocksize = rank % blocksize;
+
+      if (block_sizes.empty()) {
+        size_t index = 0;
+        block_sizes.emplace_back(index);
+        for (size_t i = 0; i < n_full_blocks; ++i) {
+          index += blocksize;
+          block_sizes.emplace_back(index);
+        }
+        if (last_blocksize != 0) block_sizes.emplace_back(index + last_blocksize);
+      }
+      // Copy all of A to blockfactors, then take slices of A;
+      // that way we can leverage all of `CP_ALS`'s code without modification.
+      blockfactors = A;
+      gradient = tensor_ref;
+      // This stores the factors from rank 0 to #blocks * blocksize.
+      // Compute the BCD of all the full blocks.
+      auto matlab = true;
+      auto one_over_tref = 1.0 / this->norm(tensor_ref);
+      auto fit = 1.0, change = 0.0;
+      bool compute_full_fit = false;
+      size_t n_blocks = block_sizes.size();
+      for (auto s = 0; s < nsweeps; ++s) {
+        long block_step = 0;
+        this->AtA = std::vector<Tensor>(ndim);
+
+        // E - A1 - A2 - ... = gradient
+        // initially: gradient = E
+        // then: gradient = E - A1
+        for (long b = 1; b < n_blocks; ++b) {
+          this->AtA = std::vector<Tensor>(ndim);
+          BCD(block_sizes[b - 1], block_sizes[b], max_als, fast_pI, matlab, converge_test, s);
+          std::cout << "\tnorm (remainder) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+          // Test the system to see if it converged. Doing it the hard way first:
+          // to compute the accuracy fully, compute the CP approximation using blocks 0 through b.
+          if (compute_full_fit) {
+            compute_full(block_step + blocksize, one_over_tref, change, fit);
+          }
+        }
+//        if (last_blocksize != 0) {
+//          std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+//          this->AtA = std::vector<Tensor>(ndim);
+//          block_step = n_full_blocks * blocksize;
+//          BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s);
+//          if (compute_full_fit) {
+//            compute_full(block_step + last_blocksize, one_over_tref, change, fit);
+//          }
+//        }
+        //std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+        epsilon = (compute_full_fit == false
+                       ? 1.0 - this->norm(gradient) * one_over_tref
+                       : fit);
+        std::cout << epsilon << "\n" << std::endl;
+      }
+      detail::set_norm(converge_test, this->norm(tensor_ref));
+      converge_test.verbose(true);
+      A = blockfactors;
+      this->AtA = std::vector<Tensor>(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = A[i];
+        contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+      }
+      gradient = reconstruct(A, order, A[ndim]);
+      std::cout << norm(tensor_ref - gradient) * one_over_tref << std::endl;
+    }
+
+    void copy_blocks(Tensor &to, const Tensor &from, ind_t block_start, ind_t block_end) {
+      auto tref_dim = to.extent(0);
+      auto to_rank = to.extent(1), from_rank = from.extent(1);
+
+      auto to_ptr = to.data();
+      auto from_ptr = from.data();
+      ind_t from_pos = 0;
+      for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += to_rank) {
+        for (auto b = block_start; b < block_end; ++b, ++from_pos) {
+          *(to_ptr + skip + b) = *(from_ptr + from_pos);
+        }
+      }
+    }
+
+    void compute_full(long block_end, T one_over_tref, T &change, T &fit) {
+      std::vector<Tensor> current_grads(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = blockfactors[i];
+        auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_end};
+        current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
+      }
+
+      auto temp = reconstruct(current_grads, order, blockfactors[ndim]);
+      auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref;
+      change = abs(fit - newfit);
+      fit = newfit;
+      std::cout << block_end << "\t";
+      std::cout << fit << "\t" << change << std::endl;
+    }
+
+    /// Computes optimized factor matrices for one block of columns,
+    /// holding all other blocks constant.
+    /// No Khatri-Rao product computed, immediate contraction.
+    /// \param[in] block_start First column of the block being optimized.
+    /// \param[in] block_end One past the last column of the block being optimized.
+    /// \param[in] max_als Maximum number of ALS iterations for this block.
+    /// \param[in,out] fast_pI Should the pseudoinverse be computed using a fast Cholesky decomposition;
+    /// on return, whether computing the \c fast_pI was successful.
+    /// \param[in,out] matlab If \c fast_pI = true, try to solve VA = B instead of taking the pseudoinverse,
+    /// in the same manner that MATLAB would compute the inverse. If this fails, the variable will be
+    /// set to false and SVD will be used; on return, whether \c matlab was successful.
+    /// \param[in,out] converge_test Test to see if the block ALS is converged; holds the value of fit.
+    /// \param[in] sweep The current BCD sweep number.
+    /// \param[in] lambda Regularization parameter. Default = 0.
+    void BCD(ind_t block_start, ind_t block_end, size_t max_als,
+             bool &fast_pI, bool &matlab, ConvClass &converge_test, size_t sweep, double lambda = 0.0) {
+      // Take the b-th block of the factor matrix and
+      // find the partial Gramian for the blocked factors
+      auto cur_block_size = block_end - block_start;
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = A[i];
+        auto lower = {z, block_start}, upper = {long(a_mat.extent(0)), block_end};
+        a_mat = make_view(blockfactors[i].range().slice(lower, upper), blockfactors[i].storage());
+        contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+      }
+      A[ndim] = Tensor(cur_block_size);
+      A[ndim].fill(0.0);
+      if (sweep != 0) {
+        size_t c = 0;
+        auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data();
+        for (auto b = block_start; b < block_end; ++b, ++c)
+          *(new_lam_ptr + c) = *(lam_full_ptr + b);
+
+        gradient += reconstruct(A, order, A[ndim]);
+      }
+      //std::cout << "\t\tnorm(remainder + hat{T}_block) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+      // Do the ALS loop.
+      //CP_ALS<Tensor, ConvClass>::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI);
+      // First do it manually, so we know it's right:
+      // compute the ALS of the b-th block against the gradient,
+      // computed by subtracting the previous blocks from the reference.
+      size_t count = 0;
+      bool is_converged = false;
+      detail::set_norm(converge_test, this->norm(gradient));
+      converge_test.verbose(false);
+      do {
+        ++count;
+        this->num_ALS++;
+
+        for (size_t i = 0; i < ndim; i++) {
+          this->direct(i, cur_block_size, fast_pI, matlab, converge_test, gradient);
+          auto &ai = A[i];
+          contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+        }
+
+        is_converged = converge_test(A, this->AtA);
+      } while (count < max_als && !is_converged);
+      // Compute the new difference
+      gradient -= reconstruct(A, order, A[ndim]);
+
+      // Copy the block computed in A to the correct portion of blockfactors
+      // (this will replace A at the end)
+      for (size_t i = 0; i < ndim; ++i) {
+        copy_blocks(blockfactors[i], A[i], block_start, block_end);
+      }
+      // Copy the lambda values into the correct location in blockfactors
+      size_t c = 0;
+      auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data();
+      for (auto b = block_start; b < block_end; ++b, ++c) {
+        *(lam_full_ptr + b) = *(new_lam_ptr + c);
+      }
+    }
+  };
+}  // namespace btas
+
+#endif  // BTAS_GENERIC_CP_BCD_H
diff --git a/btas/generic/cp_rals.h b/btas/generic/cp_rals.h
index 937c9061..a3d6ef08 100644
--- a/btas/generic/cp_rals.h
+++ b/btas/generic/cp_rals.h
@@ -174,7 +174,7 @@ namespace btas {
         } else if (dir) {
           this->direct(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]);
         } else {
-          update_w_KRP(i, rank, fast_pI, matlab, converge_test, lambda[i]);
+          update_w_KRP(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]);
         }
         // Compute the value s after normalizing the columns
         auto &ai = A[i];
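For context on the helper added next: geqp3_pivot mirrors the getrf wrapper above it in the same header — a row-major input is transposed into scratch storage, factored in column-major with lapack::geqp3, and transposed back on success. A hypothetical call site (function name and matrix values invented for illustration):

    #include <btas/generic/lapack_extensions.h>

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Column-pivoted QR of a row-major 3x5 matrix; piv receives the
    // 1-based column permutation chosen by geqp3.
    void pivoted_qr_rowmajor() {
      int64_t m = 3, n = 5;
      std::vector<double> A(m * n, 1.0);  // row-major, leading dimension n
      A[2] = 100.0;                       // row 0, column 2: dominant entry
      std::vector<int64_t> piv(n, 0);     // 0 = column is free to pivot
      std::vector<double> tau(std::min(m, n));
      auto info = btas::geqp3_pivot(blas::Layout::RowMajor, m, n, A.data(), n,
                                    piv.data(), tau.data());
      // info == 0 on success; A then holds the QR factors of A*P (row-major).
    }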
diff --git a/btas/generic/lapack_extensions.h b/btas/generic/lapack_extensions.h
index cb1560e6..bc4ecea0 100644
--- a/btas/generic/lapack_extensions.h
+++ b/btas/generic/lapack_extensions.h
@@ -40,6 +40,32 @@ int64_t getrf( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,

 }

+template <typename T, typename Alloc = std::allocator<T>>
+int64_t geqp3_pivot( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,
+                     int64_t* IPIV, T* tau, Alloc alloc = Alloc() ) {
+
+  if( order == blas::Layout::ColMajor ) {
+    return lapack::geqp3( M, N, A, LDA, IPIV, tau );
+  } else {
+
+    // Transpose input
+    auto* A_transpose = alloc.allocate(M*N);
+    transpose( N, M, A, LDA, A_transpose, M );
+
+    // A -> QR with column pivoting
+    auto info = lapack::geqp3( M, N, A_transpose, M, IPIV, tau );
+
+    // Transpose output + cleanup
+    if(!info)
+      transpose( M, N, A_transpose, M, A, LDA );
+    alloc.deallocate( A_transpose, M*N );
+
+    return info;
+  }
+
+}
+
 template <typename T, typename Alloc = std::allocator<T>,
           typename IntAlloc = std::allocator<int64_t> >
 int64_t gesv( blas::Layout order, int64_t N, int64_t NRHS, T* A, int64_t LDA,
diff --git a/external/versions.cmake b/external/versions.cmake
index e738e91b..caa3e428 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -1,4 +1,4 @@
-set(BTAS_TRACKED_VGCMAKEKIT_TAG e0d04e91a84b7e71d9b87682c46c518e9966bd78)
+set(BTAS_TRACKED_VGCMAKEKIT_TAG 9b541a5f3708a58dce59d00f3c47ac030ef4d8b4)

 # likely can use earlier, but
 # - as of oct 2023 tested with 1.71 and up only
diff --git a/unittest/tensor_cp_test.cc b/unittest/tensor_cp_test.cc
index beb097f4..815c521e 100644
--- a/unittest/tensor_cp_test.cc
+++ b/unittest/tensor_cp_test.cc
@@ -20,6 +20,7 @@ TEST_CASE("CP")
   using conv_class = btas::FitCheck<tensor>;
   using conv_class_coupled = btas::CoupledFitCheck<tensor>;
   using btas::CP_ALS;
+  using btas::CP_BCD;
   using btas::CP_RALS;
   using btas::CP_DF_ALS;
   using btas::COUPLED_CP_ALS;
@@ -374,7 +375,28 @@ TEST_CASE("CP")
       double diff = 1.0 - A1.compute_error(conv_coupled, 1e-2, 1, 19);
       CHECK(std::abs(diff - results(38,0)) <= epsilon);
     }
-    */
+    */
+  }
+  // Block coordinate descent tests
+  {
+    SECTION("BCD MODE = 3, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D3, 9);
+      conv.set_norm(norm3);
+      double diff = A1.compute_rank_random(13, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 4, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D4, 37);
+      conv.set_norm(norm4);
+      double diff = A1.compute_rank_random(55, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 5, Finite rank") {
+      CP_BCD<tensor, conv_class> A1(D5, 26);
+      conv.set_norm(norm5);
+      double diff = A1.compute_rank_random(60, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+  }
 }
 #endif
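For orientation, here is how the new tests drive CP_BCD end to end. This is a sketch, not documented API: the default-constructed FitCheck tolerance and the third argument of compute_rank_random are mirrored from the test calls above and should be treated as assumptions.

    #include <btas/btas.h>

    // D3 is an order-3 reference tensor and norm3 its 2-norm, both supplied
    // by the caller (the unit tests set these up earlier in the test case).
    void cp_bcd_sketch(btas::Tensor<double> &D3, double norm3) {
      using tensor = btas::Tensor<double>;
      using conv_class = btas::FitCheck<tensor>;

      conv_class conv;       // fit-based convergence test, default tolerance
      conv.set_norm(norm3);  // reference norm used when evaluating the fit

      // Decompose D3 to rank 13, optimizing one 9-column block at a time.
      btas::CP_BCD<tensor, conv_class> A1(D3, /*block_size=*/9);
      double diff = A1.compute_rank_random(13, conv, 100);  // args mirror the tests
      // diff ~ 0 when the blockwise CP approximation reproduces D3.
    }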