From e054de73c2cb5bc641df1086d5b049befed69c42 Mon Sep 17 00:00:00 2001 From: Heidi Thornquist Date: Wed, 18 Sep 2024 14:05:02 -0600 Subject: [PATCH] Fixes RCG solver to be correct with Kokkos::DualView --- packages/belos/src/BelosRCGSolMgr.hpp | 55 +++-- packages/belos/tpetra/test/RCG/CMakeLists.txt | 10 +- .../tpetra/test/RCG/test_kokkos_rcg_hb.cpp | 203 ++++++++++++++++++ 3 files changed, 247 insertions(+), 21 deletions(-) create mode 100644 packages/belos/tpetra/test/RCG/test_kokkos_rcg_hb.cpp diff --git a/packages/belos/src/BelosRCGSolMgr.hpp b/packages/belos/src/BelosRCGSolMgr.hpp index 1b31ed8a690a..b4ecf23dfbb6 100644 --- a/packages/belos/src/BelosRCGSolMgr.hpp +++ b/packages/belos/src/BelosRCGSolMgr.hpp @@ -194,8 +194,8 @@ namespace Belos { virtual ~RCGSolMgr() {}; //! clone for Inverted Injection (DII) - Teuchos::RCP > clone () const override { - return Teuchos::rcp(new RCGSolMgr); + Teuchos::RCP > clone () const override { + return Teuchos::rcp(new RCGSolMgr); } //@} @@ -817,7 +817,7 @@ void RCGSolMgr::initializeStateStorage() { if (Alpha_ == Teuchos::null) Alpha_ = Teuchos::rcp( new std::vector( numBlocks_, 1 ) ); else { - if ( Alpha_->size() != numBlocks_ ) + if ( (int)Alpha_->size() != numBlocks_ ) Alpha_->resize( numBlocks_, 1 ); } @@ -825,7 +825,7 @@ void RCGSolMgr::initializeStateStorage() { if (Beta_ == Teuchos::null) Beta_ = Teuchos::rcp( new std::vector( numBlocks_ + 1 ) ); else { - if ( (Beta_->size() != (numBlocks_+1)) ) + if ( ((int)Beta_->size() != (numBlocks_+1)) ) Beta_->resize( numBlocks_ + 1 ); } @@ -833,7 +833,7 @@ void RCGSolMgr::initializeStateStorage() { if (D_ == Teuchos::null) D_ = Teuchos::rcp( new std::vector( numBlocks_ ) ); else { - if ( D_->size() != numBlocks_ ) + if ( (int)D_->size() != numBlocks_ ) D_->resize( numBlocks_ ); } @@ -1158,8 +1158,9 @@ ReturnType RCGSolMgr::solve() { Teuchos::RCP Utr = DMT::Create(recycleBlocks_,1); Teuchos::RCP Utmp = MVT::CloneView( *U_, rindex ); MVT::MvTransMv( one, *Utmp, *r_, *Utr ); + + DMT::SyncHostToDevice(*LUUTAU_); DMT::Assign(*LUUTAU_,*UTAU_); - DMT::SyncDeviceToHost( *LUUTAU_ ); DMT::SyncDeviceToHost( *Utr ); int info = 0; @@ -1192,7 +1193,7 @@ ReturnType RCGSolMgr::solve() { Teuchos::RCP AUtmp = MVT::CloneView( *AU_, rindex ); MVT::MvTransMv( one, *AUtmp, *z_, *mu ); - DMT::SyncDeviceToHost( *mu ); + DMT::SyncDeviceToHost( *Delta_ ); char TRANS = 'N'; int info; lapack.GETRS( TRANS, recycleBlocks_, 1, DMT::GetConstRawHostPtr(*LUUTAU_), DMT::GetStride(*LUUTAU_), @@ -1253,7 +1254,7 @@ ReturnType RCGSolMgr::solve() { // //////////////////////////////////////////////////////////////////////////////////// if ( convTest_->getStatus() == Passed ) { - // We have convergence + // We have convergence break; // break from while(1){rcg_iter->iterate()} } //////////////////////////////////////////////////////////////////////////////////// @@ -1280,13 +1281,13 @@ ReturnType RCGSolMgr::solve() { if (!existU_) { if (cycle == 0) { // No U, no U1 - DMT::SyncDeviceToHost( *F_ ); - DMT::SyncDeviceToHost( *G_ ); Teuchos::RCP Ftmp = DMT::Subview( *F_, numBlocks_, numBlocks_ ); Teuchos::RCP Gtmp = DMT::Subview( *G_, numBlocks_, numBlocks_ ); DMT::PutScalar( *Ftmp, zero ); DMT::PutScalar( *Gtmp, zero ); - for (int ii=0;ii 0) { DMT::Value(*Gtmp,ii-1,ii) = -(*D_)[ii]/(*Alpha_)[ii-1]; @@ -1298,6 +1299,7 @@ ReturnType RCGSolMgr::solve() { DMT::SyncHostToDevice( *G_ ); // compute harmonic Ritz vectors + DMT::SyncDeviceToHost( *Y_ ); Teuchos::RCP Ytmp = DMT::Subview( *Y_, numBlocks_, recycleBlocks_ ); getHarmonicVecs(*Ftmp,*Gtmp,*Ytmp); DMT::SyncHostToDevice( *Y_ ); @@ -1308,6 +1310,10 @@ ReturnType RCGSolMgr::solve() { MVT::MvTimesMatAddMv( one, *Ptmp, *Ytmp, zero, *U1tmp ); // Precompute some variables for next cycle + DMT::SyncDeviceToHost(*GY_); + DMT::SyncDeviceToHost(*AU1TAU1_); + DMT::SyncDeviceToHost(*FY_); + DMT::SyncDeviceToHost(*AU1TU1_); // AU1TAU1 = Y'*G*Y; Teuchos::RCP GYtmp = DMT::Subview( *GY_, numBlocks_, recycleBlocks_ ); @@ -1344,11 +1350,12 @@ ReturnType RCGSolMgr::solve() { // Must reinitialize AU1TAP; can become dense later DMT::PutScalar( *AU1TAPtmp, zero ); // AU1TAP(:,1) = Y(end,:)' * (-1/Alpha(end)); - ScalarType alphatmp = -1.0 / (*Alpha_)[numBlocks_-1]; + DMT::SyncDeviceToHost( *AU1TAP_ ); + ScalarType alphatmp = -1.0 / (*Alpha_)[numBlocks_-1]; for (int ii=0; ii::solve() { DMT::SyncHostToDevice(*APTAP_); // F = [AU1TU1 zeros(k,m); zeros(m,k) diag(D)]; - F_->putScalar(zero); + DMT::PutScalar(*F_,zero); Teuchos::RCP F11 = DMT::Subview( *F_, recycleBlocks_, recycleBlocks_ ); Teuchos::RCP F22 = DMT::Subview( *F_, numBlocks_, numBlocks_, recycleBlocks_, recycleBlocks_ ); DMT::Assign(*F11,*AU1TU1_); @@ -1497,8 +1504,10 @@ ReturnType RCGSolMgr::solve() { DMT::SyncHostToDevice(*L2_); // AUTAP = UTAU*Delta*L2; - DMT::SyncDeviceToHost(*DeltaL2_); + DMT::SyncDeviceToHost(*Delta_); + DMT::SyncDeviceToHost(*DeltaL2_); DMT::SyncDeviceToHost(*AUTAP_); + DMT::SyncDeviceToHost(*UTAU_); //DeltaL2_->multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,one,*Delta_,*L2_,zero); blas.GEMM( Teuchos::NO_TRANS, Teuchos::NO_TRANS, recycleBlocks_, numBlocks_, numBlocks_+1, @@ -1625,7 +1634,9 @@ ReturnType RCGSolMgr::solve() { DMT::Value(*L2_,ii,ii) = 1./(*Alpha_)[ii]; DMT::Value(*L2_,ii+1,ii) = -1./(*Alpha_)[ii]; } + DMT::SyncHostToDevice(*L2_); + DMT::SyncDeviceToHost(*Delta_); DMT::SyncDeviceToHost(*DeltaL2_); DMT::SyncDeviceToHost(*AU1TUDeltaL2_); DMT::SyncDeviceToHost(*AU1TAP_); @@ -1642,13 +1653,16 @@ ReturnType RCGSolMgr::solve() { one, DMT::GetConstRawHostPtr(*AU1TU_), DMT::GetStride(*AU1TU_), DMT::GetConstRawHostPtr(*DeltaL2_), DMT::GetStride(*DeltaL2_), zero, DMT::GetRawHostPtr(*AU1TUDeltaL2_), DMT::GetStride(*AU1TUDeltaL2_)); + + DMT::SyncDeviceToHost( *Y_); Teuchos::RCP Y1 = DMT::SubviewConst( *Y_, recycleBlocks_, recycleBlocks_ ); - //AU1TAP_->multiply(Teuchos::TRANS,Teuchos::NO_TRANS,one,*Y1,*AU1TUDeltaL2_,zero); + Teuchos::RCP Y2 = DMT::SubviewConst( *Y_, numBlocks_, recycleBlocks_, recycleBlocks_, 0 ); + + //AU1TAP_->multiply(Teuchos::TRANS,Teuchos::NO_TRANS,one,*Y1,*AU1TUDeltaL2_,zero); blas.GEMM( Teuchos::TRANS, Teuchos::NO_TRANS, recycleBlocks_, numBlocks_, recycleBlocks_, one, DMT::GetConstRawHostPtr(*Y1), DMT::GetStride(*Y1), DMT::GetConstRawHostPtr(*AU1TUDeltaL2_), DMT::GetStride(*AU1TUDeltaL2_), zero, DMT::GetRawHostPtr(*AU1TAP_), DMT::GetStride(*AU1TAP_)); - Teuchos::RCP Y2 = DMT::SubviewConst( *Y_, numBlocks_, recycleBlocks_, recycleBlocks_, 0 ); ScalarType val = dold * (-(*Beta_)[0]/(*Alpha_)[0]); for(int ii=0;ii void RCGSolMgr::getHarmonicVecs(const DM& F, const DM& G, DM& Y ) { + // order of F,G - int n = F.numCols(); + int n = DMT::GetNumCols(F); // The LAPACK interface Teuchos::LAPACK lapack; @@ -1952,8 +1967,8 @@ void RCGSolMgr::getHarmonicVecs(const DM& F, int lwork = -1; int info = 0; // since SYGV destroys workspace, create copies of F,G - Teuchos::RCP F2 = DMT::CreateCopy( *F_ ); - Teuchos::RCP G2 = DMT::CreateCopy( *G_ ); + Teuchos::RCP F2 = DMT::CreateCopy( F ); + Teuchos::RCP G2 = DMT::CreateCopy( G ); DMT::SyncDeviceToHost(*F2); DMT::SyncDeviceToHost(*G2); diff --git a/packages/belos/tpetra/test/RCG/CMakeLists.txt b/packages/belos/tpetra/test/RCG/CMakeLists.txt index 6623e64cbb0b..89f034dbfef4 100644 --- a/packages/belos/tpetra/test/RCG/CMakeLists.txt +++ b/packages/belos/tpetra/test/RCG/CMakeLists.txt @@ -10,11 +10,19 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + Tpetra_DenseKokkos_rcg_hb_test + SOURCES test_kokkos_rcg_hb.cpp + ARGS + "--verbose --tol=1e-6 --filename=bcsstk14.hb --num-rhs=3 --max-subspace=100 --recycle=10 --max-iters=4000" + COMM serial mpi + ) + ASSERT_DEFINED(Anasazi_SOURCE_DIR) TRIBITS_COPY_FILES_TO_BINARY_DIR(Tpetra_CopyTestRCGFiles SOURCE_DIR ${Anasazi_SOURCE_DIR}/testmatrices SOURCE_FILES bcsstk14.hb - EXEDEPS Tpetra_rcg_hb_test + EXEDEPS Tpetra_rcg_hb_test Tpetra_DenseKokkos_rcg_hb_test ) ENDIF() diff --git a/packages/belos/tpetra/test/RCG/test_kokkos_rcg_hb.cpp b/packages/belos/tpetra/test/RCG/test_kokkos_rcg_hb.cpp new file mode 100644 index 000000000000..567209ae5e6e --- /dev/null +++ b/packages/belos/tpetra/test/RCG/test_kokkos_rcg_hb.cpp @@ -0,0 +1,203 @@ +// @HEADER +// ***************************************************************************** +// Belos: Block Linear Solvers Package +// +// Copyright 2004-2016 NTESS and the Belos contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER +// +// This driver reads a problem from a Harwell-Boeing (HB) file. +// The right-hand-side corresponds to a randomly generated solution. +// The initial guesses are all set to zero. +// +// NOTE: No preconditioner is used in this case. +// +#include "BelosConfigDefs.hpp" +#include "BelosLinearProblem.hpp" +#include "BelosTpetraAdapter.hpp" +#include "BelosKokkosDenseAdapter.hpp" +#include "BelosRCGSolMgr.hpp" + +#include +#include +#include +#include +#include + +// I/O for Harwell-Boeing files +#define HIDE_TPETRA_INOUT_IMPLEMENTATIONS +#include + +using namespace Teuchos; +using Tpetra::Operator; +using Tpetra::CrsMatrix; +using Tpetra::MultiVector; +using std::endl; +using std::cout; +using std::vector; +using Teuchos::tuple; + +int main(int argc, char *argv[]) { + + typedef Tpetra::MultiVector<>::scalar_type ST; + typedef Tpetra::MultiVector<>::impl_scalar_type IST; + typedef Kokkos::DualView DM; + typedef ScalarTraits SCT; + typedef SCT::magnitudeType MT; + typedef Tpetra::Operator OP; + typedef Tpetra::MultiVector MV; + typedef Belos::OperatorTraits OPT; + typedef Belos::MultiVecTraits MVT; + + GlobalMPISession mpisess(&argc,&argv,&cout); + + const ST one = SCT::one(); + + RCP > comm = Tpetra::getDefaultComm(); + int MyPID = rank(*comm); + + // + // Get test parameters from command-line processor + // + bool verbose = false, proc_verbose = false, debug = false; + int frequency = -1; // how often residuals are printed by solver + std::string filename("bcsstk14.hb"); + int numBlocks = 30; // maximum number of blocks the solver can use for the Krylov subspace + int recycleBlocks = 3; // maximum number of blocks the solver can use for the recycle space + int numrhs = 1; // number of right-hand sides to solve for + int maxiters = -1; // maximum number of iterations allowed per linear system + MT tol = 1.0e-5; // relative residual tolerance + + CommandLineProcessor cmdp(false,true); + cmdp.setOption("verbose","quiet",&verbose,"Print messages and results."); + cmdp.setOption("debug","nodebug",&debug,"Run debugging checks."); + cmdp.setOption("frequency",&frequency,"Solvers frequency for printing residuals (#iters)."); + cmdp.setOption("tol",&tol,"Relative residual tolerance used by the RCG solver."); + cmdp.setOption("max-subspace",&numBlocks,"Maximum number of vectors in search space (not including recycle space)."); + cmdp.setOption("recycle",&recycleBlocks,"Number of vectors in recycle space."); + cmdp.setOption("num-rhs",&numrhs,"Number of right-hand sides to be solved for."); + cmdp.setOption("max-iters",&maxiters,"Maximum number of iterations per linear system (-1 = adapted to problem/block size)."); + cmdp.setOption("filename",&filename,"Filename for Harwell-Boeing test matrix."); + if (cmdp.parse(argc,argv) != CommandLineProcessor::PARSE_SUCCESSFUL) { + return -1; + } + if (debug) { + verbose = true; + } + if (!verbose) { + frequency = -1; // reset frequency if test is not verbose + } + + proc_verbose = ( verbose && (MyPID==0) ); + + if (proc_verbose) { + std::cout << Belos::Belos_Version() << std::endl << std::endl; + } + + RCP > A; + Tpetra::Utils::readHBMatrix(filename,comm,A); + RCP > map = A->getDomainMap(); + + // Create initial vectors + RCP B, X; + X = rcp( new MV(map,numrhs) ); + MVT::MvRandom( *X ); + B = rcp( new MV(map,numrhs) ); + OPT::Apply( *A, *X, *B ); + MVT::MvInit( *X, 0.0 ); + + // + // ********Other information used by block solver*********** + // *****************(can be user specified)****************** + // + int numIters1; + const int NumGlobalElements = B->getGlobalLength(); + if (maxiters == -1) + maxiters = NumGlobalElements - 1; // maximum number of iterations to run + // + ParameterList belosList; + belosList.set( "Maximum Iterations", maxiters ); // Maximum number of iterations allowed + belosList.set( "Num Blocks", numBlocks); // Maximum number of blocks in Krylov space + belosList.set( "Num Recycled Blocks", recycleBlocks ); // Number of vectors in recycle space + belosList.set( "Convergence Tolerance", tol ); // Relative convergence tolerance requested + if (verbose) { + belosList.set( "Verbosity", Belos::Errors + Belos::Warnings + + Belos::TimingDetails + Belos::FinalSummary + Belos::StatusTestDetails ); + if (frequency > 0) + belosList.set( "Output Frequency", frequency ); + } + else + belosList.set( "Verbosity", Belos::Errors + Belos::Warnings ); + // + // Construct an unpreconditioned linear problem instance. + // + Belos::LinearProblem problem( A, X, B ); + bool set = problem.setProblem(); + if (set == false) { + if (proc_verbose) + std::cout << std::endl << "ERROR: Belos::LinearProblem failed to set up correctly!" << std::endl; + return -1; + } + // + // ******************************************************************* + // *********************Start the RCG iteration*********************** + // ******************************************************************* + // + Belos::RCGSolMgr solver( rcpFromRef(problem), rcpFromRef(belosList) ); + + // + // **********Print out information about problem******************* + // + if (proc_verbose) { + std::cout << std::endl << std::endl; + std::cout << "Dimension of matrix: " << NumGlobalElements << std::endl; + std::cout << "Number of right-hand sides: " << numrhs << std::endl; + std::cout << "Max number of RCG iterations: " << maxiters << std::endl; + std::cout << "Relative residual tolerance: " << tol << std::endl; + std::cout << std::endl; + } + // + // Perform solve + // + Belos::ReturnType ret = solver.solve(); + numIters1=solver.getNumIters(); + // + // Compute actual residuals. + // + bool badRes = false; + std::vector actual_resids( numrhs ); + std::vector rhs_norm( numrhs ); + MV resid(map, numrhs); + OPT::Apply( *A, *X, resid ); + MVT::MvAddMv( -one, resid, one, *B, resid ); + MVT::MvNorm( resid, actual_resids ); + MVT::MvNorm( *B, rhs_norm ); + if (proc_verbose) { + std::cout<< "---------- Actual Residuals (normalized) ----------"< tol) badRes = true; + } + + if (proc_verbose) { std::cout << "Solve took " << numIters1 << " iterations." << std::endl; } + + if ( ret!=Belos::Converged || badRes ) { + if (proc_verbose) { + std::cout << "\nEnd Result: TEST FAILED" << std::endl; + } + return -1; + } + // + // Default return value + // + if (proc_verbose) { + std::cout << "\nEnd Result: TEST PASSED" << std::endl; + } + return 0; + // +} // end test_rcg_hb.cpp