diff --git a/.autotools b/.autotools new file mode 100644 index 000000000..3a9f307fe --- /dev/null +++ b/.autotools @@ -0,0 +1,42 @@ + + + + + diff --git a/.gitignore b/.gitignore index 12b79ab77..dc6c55f66 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ coverage # ignoring uq tests datadriven/tests/uq/ +/Debug/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 56da1c666..000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,152 +0,0 @@ -# For use with docker, build this image and use it -#image: sgpp_img - -stages: - - build - - platforms - - switches - - doc - - -Compile Standard: - stage: build - script: - - scons -j $(nproc) VERBOSE=1 COMPILER=gnu ARCH=avx OPT=0 PYDOC=0 - tags: - - ubuntu - - gcc - - scons - - swig - - avx - only: - - branches - except: - - master - - -#Compile Intel: -# stage: platforms -# script: -# - . /opt/intel/bin/compilervars.sh intel64 -# - scons -j $(nproc) VERBOSE=1 COMPILER=Intel ARCH=avx OPT=1 PYDOC=0 -# tags: -# - ubuntu -# - intel -# - scons -# - swig -# - avx -# except: -# - master - - -#TODO validate scons flags -#Compile OSX: -# stage: platforms -# script: -# - scons -j4 CXX="g++-5" CC="g++-5" COMPILER=GNU ARCH=SSE3 VERBOSE=1 OPT=1 PYDOC=0 NO_UNIT_TESTS=1 SG_JAVA=0 CPPFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" RUN_BOOST_TESTS=0 -# tags: -# - osx -# - gcc -# - scons -# - swig -# - sse3 -# except: -# - master - - -Test PYDOC: - stage: switches - script: - - scons -j $(nproc) VERBOSE=1 COMPILER=gnu ARCH=avx OPT=0 PYDOC=1 COMPILE_BOOST_TESTS=0 - tags: - - ubuntu - - gcc - - scons - - swig - - avx - except: - - master - - -Test SinglePrecision: - stage: switches - script: - - scons -j $(nproc) VERBOSE=1 COMPILER=gnu ARCH=avx OPT=1 PYDOC=0 USE_DOUBLE_PRECISION=0 - tags: - - ubuntu - - gcc - - scons - - swig - - avx - except: - - master - - -Test StaticLib: - stage: switches - script: - - scons -j $(nproc) VERBOSE=1 COMPILER=gnu ARCH=avx OPT=1 PYDOC=0 USE_STATICLIB=1 - tags: - - ubuntu - - gcc - - scons - - swig - - avx - except: - - master - - -############################################## -#TODO all of these -############################################## - -#Compile Win32-msvc: -# stage: platforms -# script: -# - scons COMPILER=vcc ARCH=sse3 PYDOC=0 VERBOSE=1 SG_PARALLEL=0 SG_COMBIGRID=0 MSVC_USE_SCRIPT="C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\vcvars32.bat" -# tags: -# - windows -# - msvc -# - scons -# - swig -# - sse3 -# except: -# - master - -#Test Win32-msvc-StaticLib: -# stage: switches -# script: -# - scons COMPILER=vcc ARCH=sse3 PYDOC=0 VERBOSE=1 USE_STATICLIB=1 SG_PARALLEL=0 SG_COMBIGRID=0 MSVC_USE_SCRIPT="C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\vcvars32.bat" -# tags: -# - windows -# - msvc -# - scons -# - swig -# - sse3 -# except: -# - master - - -Build Documentation: - stage: doc - script: - - scons -h #generating the Doxyfile - - doxygen Doxyfile - - cat doxygen_warnings.log - tags: - - doxygen - - scons - except: - - master - - -#Style Guide: -# stage: doc -# script: -# - exit 1 -# tags: -# - someflagforstylechecking -# except: -# - master - - diff --git a/SConstruct b/SConstruct index 913c4d337..e3df90991 100644 --- a/SConstruct +++ b/SConstruct @@ -113,6 +113,10 @@ vars.Add("OCL_INCLUDE_PATH", "Set path to the OpenCL header files (parent direct vars.Add("OCL_LIBRARY_PATH", "Set path to the OpenCL library") vars.Add("BOOST_INCLUDE_PATH", "Set path to the Boost header files", "/usr/include") vars.Add("BOOST_LIBRARY_PATH", "Set path to the Boost library", 
"/usr/lib/x86_64-linux-gnu") +vars.Add("GLPK_INCLUDE_PATH", 'Specifies the location of the GLPK header files.', '/usr/include') +vars.Add("GLPK_LIBRARY_PATH", 'Specifies the location of the GLPK library.', '/usr/lib64') +vars.Add("GSL_INCLUDE_PATH", 'Specifies the location of the GLPK header files.', '/usr/include') +vars.Add("GSL_LIBRARY_PATH", 'Specifies the location of the GLPK library.', '/usr/lib64') vars.Add(BoolVariable("COMPILE_BOOST_TESTS", "Compile the test cases written using Boost Test", True)) vars.Add(BoolVariable("COMPILE_BOOST_PERFORMANCE_TESTS", @@ -133,6 +137,7 @@ vars.Add(BoolVariable("USE_GMMPP", "Set if Gmm++ should be used " + "(only relevant for sgpp::optimization)", False)) vars.Add(BoolVariable("USE_UMFPACK", "Set if UMFPACK should be used " + "(only relevant for sgpp::optimization)", False)) +vars.Add(BoolVariable('USE_STATICLIB', 'Sets if a static library should be built.', False)) vars.Add(BoolVariable("BUILD_STATICLIB", "Set if static libraries should be built " + "instead of shared libraries", False)) vars.Add(BoolVariable("PRINT_INSTRUCTIONS", "Print instructions for installing SG++", True)) diff --git a/compile.sh b/compile.sh new file mode 100644 index 000000000..9d5829cc4 --- /dev/null +++ b/compile.sh @@ -0,0 +1 @@ +scons -j 4 SG_ALL=0 SG_DISTRIBUTEDCOMBIGRID=1 VERBOSE=1 RUN_BOOST_TESTS=0 RUN_CPPLINT=0 BUILD_STATICLIB=0 CXX=mpic++.mpich OPT=1 diff --git a/distributedcombigrid/SConscript b/distributedcombigrid/SConscript index 12ca21b7c..b26ef825a 100755 --- a/distributedcombigrid/SConscript +++ b/distributedcombigrid/SConscript @@ -17,4 +17,4 @@ module.generatePythonDocstrings() module.buildExamples() module.buildBoostTests() module.runBoostTests() -module.runCpplint() \ No newline at end of file +module.runCpplint() diff --git a/distributedcombigrid/examples/combi_example_soft_faults/HelperFunctions.hpp b/distributedcombigrid/examples/combi_example_soft_faults/HelperFunctions.hpp new file mode 100644 index 000000000..0f121f0e9 --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/HelperFunctions.hpp @@ -0,0 +1,173 @@ +/* + * HelperFunctions.cpp + * + * Created on: Jan 13, 2016 + * Author: sccs + */ +#ifndef HELPERFUNCTIONS_HPP_ +#define HELPERFUNCTIONS_HPP_ + +#include "mpi.h" +#include + +namespace combigrid { + +void createCommunicators( size_t ngroup, size_t nprocs, int grank, int gsize, + int& key, int& managerID, MPI_Comm& gcomm, MPI_Comm& lcomm){ + /* determine global rank of each process + * the manager process always has the highest rank + * all other processes are worker processes */ + + + /* create a local communicator for each process group + * lcomm is the local communicator of its own process group for each worker process + * for manager, lcomm is a group which contains only manager process and can be ignored + */ + int color = grank / nprocs; + key = grank - color*nprocs; + MPI_Comm_split(MPI_COMM_WORLD, color, key, &lcomm); + + // create global communicator containing manager and pgroup roots + MPI_Group worldGroup; + MPI_Comm_group(MPI_COMM_WORLD, &worldGroup); + + std::vector ranks(ngroup+1); + for (size_t i = 0; i < ngroup; ++i) { + ranks[i] = i*nprocs; + } + ranks.back() = managerID; + + MPI_Group rootGroup; + MPI_Group_incl(worldGroup, (int)ranks.size(), &ranks[0], &rootGroup); + + MPI_Comm_create(MPI_COMM_WORLD, rootGroup, &gcomm); +} + +void readParameterFile(const std::string& fileName, size_t &ngroup, size_t &nprocs, DimType &dim, + LevelVector &lmin, LevelVector &lmax, LevelVector &leval, + IndexVector &p, 
double &time_step, double &time_start, + double &time_end, size_t &ncombi, FaultsInfo &faultsInfo, bool &plot ){ + + // parser for the ini parameter file + + boost::property_tree::ptree cfg; + boost::property_tree::ini_parser::read_ini(fileName, cfg); + // there are ngroup*nprocs+1 processes needed + ngroup = cfg.get("manager.ngroup"); + nprocs = cfg.get("manager.nprocs"); + + time_step = cfg.get("simulation.time_step"); + time_start = cfg.get("simulation.time_start"); + time_end = cfg.get("simulation.time_end"); + ncombi = cfg.get("simulation.ncombi"); + plot = cfg.get("simulation.plot"); + + dim = cfg.get("ct.dim"); + + lmin.resize(dim); + lmax.resize(dim); + leval.resize(dim); + p.resize(dim); + + cfg.get("ct.lmin") >> lmin; + cfg.get("ct.lmax") >> lmax; + cfg.get("ct.leval") >> leval; + cfg.get("ct.p") >> p; + + faultsInfo.numFaults_ = cfg.get("faults.num_faults"); + + if ( faultsInfo.numFaults_ == 0 ){ + faultsInfo.iterationFaults_.resize(1); + faultsInfo.taskFaults_.resize(1); + } else { + faultsInfo.iterationFaults_.resize(faultsInfo.numFaults_); + faultsInfo.taskFaults_.resize(faultsInfo.numFaults_); + } + + faultsInfo.sdcIndex_.resize(dim); + + cfg.get("faults.iteration_faults") >> faultsInfo.iterationFaults_; + cfg.get("faults.task_faults") >> faultsInfo.taskFaults_; + cfg.get("faults.sdc_index") >> faultsInfo.sdcIndex_; + + int sdcMag; + sdcMag = cfg.get("faults.sdc_mag"); + switch( sdcMag ){ + case 1 : faultsInfo.sdcMag_ = 1e-300; break; + case 2 : faultsInfo.sdcMag_ = pow(10,-0.5); break; + case 3 : faultsInfo.sdcMag_ = 1e150; break; + } + + faultsInfo.sdcMethod_ = cfg.get("method.sdc_method"); + + // parameter for gnuplot + // TODO: use VTK + std::ofstream paramfile; + paramfile.open("param.plt"); + paramfile << "time_step = " << time_step << std::endl; + paramfile << "time_start = " << time_start << std::endl; + paramfile << "time_end = " << time_end << std::endl; + paramfile << "dim = " << dim << std::endl; + paramfile.close(); +} + +// TODO: redundant definition of exact solution, +// use solution from Problem.hh +double exact(std::vector& x, double t) +{ + const int dim = x.size(); + + double exponent = 0; + for (int d = 0; d < dim; d++) { + x[d] -= 0.5 * t; + exponent -= std::pow(x[d]-0.5,2); + } + + return std::exp(exponent*100.0); +} + +void writeSolutionToFile(std::ofstream& outFile, const FullGrid& fg){ + + DimType dim = fg.getDimension(); + if ( dim == 2 ) { + std::vector coords(dim, 0.0); + + for (int i = 0; i < fg.getNrElements(); ++i) { + if (i % fg.length(0) == 0 && i > 0) { + outFile << std::endl; + } + fg.getCoords(i, coords); + outFile << coords[0] << "\t" + << coords[1] << "\t" + << fg.getElementVector()[i] << std::endl; + } + outFile << std::endl << std::endl; + } +} + +void writeErrorToFile(std::ofstream& outFile, FullGrid& fg, + const double &time_step, const double &step){ + + // calculate error + std::vector coords(fg.getDimension(), 0.0); + for (int i = 0; i < fg.getNrElements(); ++i) { + fg.getCoords(i, coords); + fg.getElementVector()[i] -= exact(coords, time_step*step); + } + + // output for approximation error in gnuplot + outFile << time_step*step << "\t" + << fg.getlpNorm(0) << "\t" + << fg.getlpNorm(2) << std::endl; +} + +void checkProcs(const size_t &nprocs, const IndexVector &p){ + IndexType check = 1; + for (auto k : p) + check *= k; + assert(check == IndexType(nprocs)); +} + +} // namespace combigrid + +#endif /* TASKEXAMPLE_HPP_ */ diff --git a/distributedcombigrid/examples/combi_example_soft_faults/Makefile.sample 
b/distributedcombigrid/examples/combi_example_soft_faults/Makefile.sample new file mode 100644 index 000000000..dfb4c42e2 --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/Makefile.sample @@ -0,0 +1,21 @@ +CC=mpic++.mpich +CFLAGS=-std=c++11 -g -fopenmp -Wno-deprecated-declarations -Wno-unused-local-typedefs -Wno-deprecated -Wno-uninitialized -Wall + +SGPP_DIR=path/to/sgpp + +LD_SGPP=-L$(SGPP_DIR)/lib/sgpp + +INC_SGPP=-I$(SGPP_DIR)/distributedcombigrid/src/ + +LDIR=$(LD_SGPP) +INC=$(INC_SGPP) + +LIBS=-lsgppdistributedcombigrid -lboost_serialization -lglpk -lgsl -lcblas + +all: combi_example_soft_faults + +combi_example_soft_faults: combi_example_soft_faults.cpp TaskExample.hpp + $(CC) $(CFLAGS) $(LDIR) $(INC) -o combi_example_soft_faults combi_example_soft_faults.cpp $(LIBS) + +clean: + rm -f *.o out/* combi_example_soft_faults diff --git a/distributedcombigrid/examples/combi_example_soft_faults/TaskExample.hpp b/distributedcombigrid/examples/combi_example_soft_faults/TaskExample.hpp new file mode 100644 index 000000000..68177f8ce --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/TaskExample.hpp @@ -0,0 +1,297 @@ +/* + * TaskExample.hpp + * + * Created on: Sep 25, 2015 + * Author: heenemo + */ + +#ifndef TASKEXAMPLE_HPP_ +#define TASKEXAMPLE_HPP_ + +#include "sgpp/distributedcombigrid/fullgrid/DistributedFullGrid.hpp" +#include "sgpp/distributedcombigrid/task/Task.hpp" +#include "sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp" + +namespace combigrid { + +class TaskExample: public Task { + + public: + /* if the constructor of the base task class is not sufficient we can provide an + * own implementation. here, we add dt, nsteps, p, and faultsInfo as a new parameters. + */ + TaskExample(DimType dim, LevelVector& l, std::vector& boundary, + real coeff, LoadModel* loadModel, real dt, + size_t nsteps, IndexVector p = IndexVector(0), + FaultsInfo faultsInfo = {0,IndexVector(0),IndexVector(0)}) : + Task(dim, l, boundary, coeff, loadModel), dt_(dt), nsteps_( + nsteps), stepsTotal_(0), p_(p), faultsInfo_(faultsInfo), + initialized_(false), combiStep_(0), dfg_(NULL) + { + } + + void init(CommunicatorType lcomm) { +// assert(!initialized_); +// assert(dfg_ == NULL); + + int lrank; + MPI_Comm_rank(lcomm, &lrank); + + /* create distributed full grid. we try to find a balanced ratio between + * the number of grid points and the number of processes per dimension + * by this very simple algorithm. to keep things simple we require powers + * of two for the number of processes here. 
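+   * (illustration for the case where p is not prescribed: assuming np = 4 and
+   *  l = (3, 3), the loop below starts with p = (1, 1) and repeatedly doubles
+   *  the dimension with the largest ratio 2^l_k / p_k, giving (2, 1) and then
+   *  (2, 2); prod(p) then equals np and the decomposition is fixed.)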
*/ + int np; + MPI_Comm_size(lcomm, &np); + + // check if power of two + if (!((np > 0) && ((np & (~np + 1)) == np))) + assert(false && "number of processes not power of two"); + + DimType dim = this->getDim(); + IndexVector p(dim, 1); + const LevelVector& l = this->getLevelVector(); + + if (p_.size() == 0) { + // compute domain decomposition + IndexType prod_p(1); + + while (prod_p != static_cast(np)) { + DimType dimMaxRatio = 0; + real maxRatio = 0.0; + + for (DimType k = 0; k < dim; ++k) { + real ratio = std::pow(2.0, l[k]) / p[k]; + + if (ratio > maxRatio) { + maxRatio = ratio; + dimMaxRatio = k; + } + } + + p[dimMaxRatio] *= 2; + prod_p = 1; + + for (DimType k = 0; k < dim; ++k) + prod_p *= p[k]; + } + } else { + p = p_; + } + + if (lrank == 0) { + std::cout << "group " << theMPISystem()->getGlobalRank() << " " + << "computing task " << this->getID() << " with l = " + << this->getLevelVector() << " and p = " << p << std::endl; + } + + // create local subgrid on each process + dfg_ = new DistributedFullGrid(dim, l, lcomm, + this->getBoundary(), p); + + /* loop over local subgrid and set initial values */ + std::vector& elements = dfg_->getElementVector(); + + for (size_t i = 0; i < elements.size(); ++i) { + IndexType globalLinearIndex = dfg_->getGlobalLinearIndex(i); + std::vector globalCoords(dim); + dfg_->getCoordsGlobal(globalLinearIndex, globalCoords); + elements[i] = TaskExample::myfunction(globalCoords, 0.0); + } + + initialized_ = true; + } + /* this is were the application code kicks in and all the magic happens. + * do whatever you have to do, but make sure that your application uses + * only lcomm or a subset of it as communicator. + * important: don't forget to set the isFinished flag at the end of the computation. + */ + void run(CommunicatorType lcomm) { + assert(initialized_); + + int lrank, globalRank; + MPI_Comm_rank(lcomm, &lrank); + MPI_Comm_rank(MPI_COMM_WORLD, &globalRank); + + /* pseudo timestepping to demonstrate the behaviour of your typical + * time-dependent simulation problem. */ + std::vector& elements = dfg_->getElementVector(); + + for (size_t step = stepsTotal_; step < stepsTotal_ + nsteps_; ++step) { + real time = (step + 1)* dt_; + + for (size_t i = 0; i < elements.size(); ++i) { + IndexType globalLinearIndex = dfg_->getGlobalLinearIndex(i); + std::vector globalCoords(this->getDim()); + dfg_->getCoordsGlobal(globalLinearIndex, globalCoords); + elements[i] = TaskExample::myfunction(globalCoords, time); + } + } + + + int localRank = theMPISystem()->getLocalRank(); + if ( failNow(localRank) && initialized_ ){ + std::cout<<"Task "<< id_<<" failed at iteration "<getGlobalLinearIndex(faultsInfo_.sdcIndex_); + IndexType sdcLinearIndex = dfg_->getLocalLinearIndex(sdcGLI); + + std::cout<<"Old function value: "<getData()[sdcLinearIndex]<getData()[sdcLinearIndex] *= faultsInfo_.sdcMag_; + std::cout<<"New function value: "<getData()[sdcLinearIndex]< coords; + IndexType globalInd = dfg_->getGlobalLinearIndex(sdcLinearIndex); + dfg_->getGlobalLI(globalInd, sdcSub, inds); + std::cout<<"SDC injected in subspace "<getCoordsLocal( sdcLinearIndex, coords ); + std::cout<<"Coordinate: "; + for (auto coord : coords) + std::cout<setFinished(true); + } + + /* this function evaluates the combination solution on a given full grid. + * here, a full grid representation of your task's solution has to be created + * on the process of lcomm with the rank r. 
+ * typically this would require gathering your (in whatever way) distributed + * solution on one process and then converting it to the full grid representation. + * the DistributedFullGrid class offers a convenient function to do this. + */ + void getFullGrid(FullGrid& fg, RankType r, + CommunicatorType lcomm) { + assert(fg.getLevels() == dfg_->getLevels()); + dfg_->gatherFullGrid(fg, r); + } + + DistributedFullGrid& getDistributedFullGrid() { + return *dfg_; + } + + static real myfunction(std::vector& coords, real t) { + real u = std::cos(M_PI * t); + + for ( size_t d = 0; d < coords.size(); ++d ) + u *= std::cos( 2.0 * M_PI * coords[d] ); + + return u; + + /* + double res = 1.0; + for (size_t i = 0; i < coords.size(); ++i) { + res *= -4.0 * coords[i] * (coords[i] - 1); + } + + + return res; + */ + } + + inline void setStepsTotal( size_t stepsTotal ); + + void setZero(){ + std::vector& data = dfg_->getElementVector(); + + for( size_t i=0; i* dfg_; + + /** + * The serialize function has to be extended by the new member variables. + * However this concerns only member variables that need to be exchanged + * between manager and workers. We do not need to add "local" member variables + * that are only needed on either manager or worker processes. + * For serialization of the parent class members, the class must be + * registered with the BOOST_CLASS_EXPORT macro. + */ + template + void serialize(Archive& ar, const unsigned int version) { + // handles serialization of base class + ar& boost::serialization::base_object(*this); + + // add our new variables + ar& initialized_; + ar& dt_; + ar& nsteps_; + ar& stepsTotal_; + ar& p_; + ar& faultsInfo_; + } + +}; + +inline bool TaskExample::failNow( const int& localRank ){ + if ( faultsInfo_.numFaults_ == 0 ) + return false; + + IndexVector iF = faultsInfo_.iterationFaults_; + IndexVector tF = faultsInfo_.taskFaults_; + IndexVector indF = faultsInfo_.sdcIndex_; + + std::vector::iterator it; + it = std::find(iF.begin(), iF.end(), combiStep_); + IndexType idx = std::distance(iF.begin(),it); + // Check if current iteration is in iterationFaults_ + while (it!=iF.end() ){ + // Check if my task should fail + if( this->getID() == tF[idx] ){ + IndexType GLI = dfg_->getGlobalLinearIndex(indF); + IndexType LLI = dfg_->getLocalLinearIndex(GLI); + if (LLI != -1 ) + return true; + } + it = std::find(++it, iF.end(), combiStep_); + idx = std::distance(iF.begin(),it); + } + return false; +} + +inline void TaskExample::setStepsTotal( size_t stepsTotal ) { + stepsTotal_ = stepsTotal; +} + +} // namespace combigrid + +#endif /* TASKEXAMPLE_HPP_ */ diff --git a/distributedcombigrid/examples/combi_example_soft_faults/combi_example_soft_faults.cpp b/distributedcombigrid/examples/combi_example_soft_faults/combi_example_soft_faults.cpp new file mode 100644 index 000000000..65f6052bb --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/combi_example_soft_faults.cpp @@ -0,0 +1,218 @@ +/* + * combi_example.cpp + * + * Created on: Sep 23, 2015 + * Author: heenemo + */ +#include +#include +#include +#include +#include + +// compulsory includes for basic functionality +#include "sgpp/distributedcombigrid/task/Task.hpp" +#include "sgpp/distributedcombigrid/utils/Types.hpp" +#include "sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.hpp" +#include "sgpp/distributedcombigrid/fullgrid/FullGrid.hpp" +#include "sgpp/distributedcombigrid/loadmodel/LinearLoadModel.hpp" +#include "sgpp/distributedcombigrid/manager/CombiParameters.hpp" +#include 
"sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp" +#include "sgpp/distributedcombigrid/manager/ProcessGroupWorker.hpp" +#include "sgpp/distributedcombigrid/manager/ProcessManager.hpp" + +// include user specific task. this is the interface to your application +#include "TaskExample.hpp" + +#include "HelperFunctions.hpp" + +using namespace combigrid; + +// this is necessary for correct function of task serialization +BOOST_CLASS_EXPORT(TaskExample) + +void solveProblem(ProcessManager& manager, LevelVector& leval, + std::vector& boundary, double time_step, + int numSteps, std::vector time_steps_combi, + bool plot, int sdcMethod ) +{ + std::ofstream valueFile("out/values.dat"); + + WORLD_MANAGER_EXCLUSIVE_SECTION{ theStatsContainer()->setTimerStart("ct"); } + + int combistep = 0; + std::vector reinitFaultsID, recomputeFaultsID, faultsID; + for (int step = 0; step < numSteps; ++step) { + + if( step == 0 ){ + manager.runfirst(); + } else { + manager.runnext(); + } + + if ( step == time_steps_combi[combistep]){ + + bool success = true; + success = manager.searchSDC( sdcMethod ); + combistep++; + if ( !success ) { + std::cout << "SDC detected at combi iteration " << step << std::endl; + manager.getSDCFaultIDs( faultsID ); + /* call optimization code to find new coefficients */ + const std::string prob_name = "interpolation based optimization"; + + manager.recomputeOptimumCoefficients(prob_name, faultsID, + reinitFaultsID, recomputeFaultsID); + + /* communicate new combination scheme*/ + manager.updateCombiParameters(); + + /* if some tasks have to be recomputed, do so*/ + for ( auto id : recomputeFaultsID ) { + TaskExample* tmp = static_cast(manager.getTask(id)); + tmp->setStepsTotal((combistep-1)*numSteps); + } + manager.recompute(recomputeFaultsID); + } + + + /* combine solution */ + std::cout<<"Combining.."< fg( leval.size(), leval, boundary); + std::cout << "eval solution for plotting" << std::endl; + manager.gridEval(fg); + // output for function values in gnuplot (only in 2D) + writeSolutionToFile(valueFile, fg); + } + + if ( !success ) { + manager.reinit(reinitFaultsID); + for ( auto id : reinitFaultsID ) { + TaskExample* tmp = static_cast(manager.getTask(id)); + tmp->setStepsTotal(combistep*numSteps); + } + manager.restoreCombischeme(); + manager.updateCombiParameters(); + } + } + } + + valueFile.close(); + + /* evaluate solution */ + FullGrid fg_eval( leval.size(), leval, boundary); + manager.gridEval(fg_eval); + std::string filename( "solution.fg" ); + fg_eval.save( filename ); + + std::cout << "exiting" << std::endl; + manager.exit(); +} + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + + /* number of process groups and number of processes per group */ + size_t ngroup, nprocs; + + DimType dim; + LevelVector lmin, lmax, leval; + IndexVector p; + size_t ncombi; + int numSteps; + std::vector time_steps_combi; + FaultsInfo faultsInfo; + + double time_step, time_start, time_end; + + bool plot; + + /* read parameter file (ctparam) */ + const std::string fileName = "settings.ini"; + readParameterFile(fileName, ngroup, nprocs, dim, lmin, lmax, + leval, p, time_step, time_start, time_end, ncombi, + faultsInfo, plot); + + numSteps = (time_end-time_start) / time_step; + for (size_t i = 0; i < ncombi; ++i) + time_steps_combi.push_back( numSteps - i*numSteps/ncombi - 1 ); + std::reverse(time_steps_combi.begin(), time_steps_combi.end()); + + // todo: read in boundary vector from ctparam + std::vector boundary(dim, true); + + theMPISystem()->init( ngroup, nprocs ); + + // 
ProcessGroupManager and ProcessManager Code + if (theMPISystem()->getWorldRank() == theMPISystem()->getManagerRankWorld()) { + + /* create an abstraction of the process groups for the manager's view + * a pgroup is identified by the ID in gcomm + */ + ProcessGroupManagerContainer pgroups; + for (size_t i=0; i ( pgroupRootID ) + ); + } + + // ProcessManager Code + // create DuneTasks + LoadModel* loadmodel = new LinearLoadModel(); + + CombiMinMaxScheme combischeme(dim, lmin, lmax); + combischeme.createAdaptiveCombischeme(); + combischeme.makeFaultTolerant(); + std::vector levels = combischeme.getCombiSpaces(); + std::vector coeffs = combischeme.getCoeffs(); + + TaskContainer tasks; + std::vector taskIDs; + for (uint i = 0; i < levels.size(); ++i) { + Task* t = new TaskExample(dim, levels[i], boundary, coeffs[i], + loadmodel, time_step, numSteps, p, faultsInfo); + tasks.push_back(t); + taskIDs.push_back( t->getID() ); + } + + // output of combination setup + std::cout << "lmin = " << lmin << std::endl; + std::cout << "lmax = " << lmax << std::endl; + std::cout << "CombiScheme: " << std::endl; + std::cout << combischeme << std::endl; + + // create combiparamters + CombiParameters params(dim, lmin, lmax, boundary, levels, coeffs, taskIDs); + + // create Manager with process groups + ProcessManager manager(pgroups, tasks, params); + + // combiparameters need to be set before starting the computation + manager.updateCombiParameters(); + + // start calculation + solveProblem(manager, leval, boundary, time_step, numSteps, time_steps_combi, plot, faultsInfo.sdcMethod_ ); + } + + // worker code + else { + // create abstraction of the process group from the worker's view + ProcessGroupWorker pgroup; + + // wait for instructions from manager + SignalType signal = -1; + + while (signal != EXIT) + signal = pgroup.wait(); + } + + MPI_Finalize(); + + return 0; +} diff --git a/distributedcombigrid/examples/combi_example_soft_faults/graph.gnu b/distributedcombigrid/examples/combi_example_soft_faults/graph.gnu new file mode 100644 index 000000000..353951ccf --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/graph.gnu @@ -0,0 +1,18 @@ +#!/bin/gnuplot + +#set terminal pngcairo size 800,600 enhanced font "Verdana,10" +set mouse +set xrange [0:1] +set yrange [0:1] +set zrange [-2:2] +set cbrange [-1:1] +set xlabel "x" +set ylabel "y" +set style line 1 lt -1 lw 0.3 +set pm3d hidden3d 1 +do for [i = 1:100] { + splot "out/values.dat" index (i-1) using 1:2:3 with pm3d + pause 0.02 +} +reread +pause mouse keypress diff --git a/distributedcombigrid/examples/combi_example_soft_faults/run.sh b/distributedcombigrid/examples/combi_example_soft_faults/run.sh new file mode 100755 index 000000000..fee139dc1 --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +read_properties () +{ + file="$1" + while IFS=" = " read -r key value; + do + case "$key" in + "ngroup") + ngroup=$value;; + "nprocs") + nprocs=$value;; + esac + done < "$file" +} + +ngroup=0 +nprocs=0 +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +read_properties "$SCRIPT_DIR/settings.ini" +mpiprocs=$((ngroup*nprocs+1)) + +mpirun.mpich -n "$mpiprocs" ./combi_example_soft_faults diff --git a/distributedcombigrid/examples/combi_example_soft_faults/settings.ini b/distributedcombigrid/examples/combi_example_soft_faults/settings.ini new file mode 100644 index 000000000..e4cdf384d --- /dev/null +++ b/distributedcombigrid/examples/combi_example_soft_faults/settings.ini @@ 
-0,0 +1,27 @@ +[manager] +ngroup = 1 +nprocs = 4 + +[simulation] +time_step = 0.001 +time_start = 0.0 +time_end = 0.05 +ncombi = 50 +plot = 1 + +[ct] +dim = 2 +lmin = 3 3 +lmax = 7 7 +leval = 5 5 +p = 2 2 + +[faults] +num_faults = 1 +iteration_faults = 3 +task_faults = 3 +sdc_index = 32 8 +sdc_mag = 2 + +[method] +sdc_method = 2 diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/combicom/CombiCom.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/combicom/CombiCom.hpp index a92b6e239..29c2f5505 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/combicom/CombiCom.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/combicom/CombiCom.hpp @@ -9,6 +9,7 @@ #define COMBICOM_HPP_ #include "sgpp/distributedcombigrid/mpi/MPISystem.hpp" +#include "sgpp/distributedcombigrid/mpi/MPIUtils.hpp" #include "sgpp/distributedcombigrid/fullgrid/FullGrid.hpp" #include "sgpp/distributedcombigrid/sparsegrid/SGrid.hpp" #include "sgpp/distributedcombigrid/utils/StatsContainer.hpp" @@ -49,7 +50,7 @@ class CombiCom { static void FGAllreduce(FullGrid& fg, MPI_Comm comm); template - static void BetasReduce( std::vector& betas, RankType r, MPI_Comm comm); + static void BetasReduce( std::vector& betas, std::vector& betasReduced, MPI_Comm comm); // multiply dfg with coeff and add to dsg. dfg will not be changed template @@ -279,20 +280,16 @@ inline void CombiCom::FGAllreduce >( MPI_DOUBLE_COMPLEX, MPI_SUM, comm); } template<> -inline void CombiCom::BetasReduce( std::vector& betas, RankType r, MPI_Comm comm ){ +inline void CombiCom::BetasReduce( std::vector& betas, std::vector& betasReduced, + MPI_Comm comm ){ - int myrank; - MPI_Comm_rank(comm, &myrank); + MPI_Op OP_MAX_ABS; - std::vector recvBetas(betas.size()); + MPI_Op_create((MPI_User_function *) MPIUtils::MAX_ABS, 1, &OP_MAX_ABS ); - if ( myrank == r ) { - MPI_Reduce( MPI_IN_PLACE, &betas[0], static_cast(betas.size()), - MPI_DOUBLE, MPI_MAX, r, comm); - } else { - MPI_Reduce(&betas[0], &recvBetas[0], static_cast(betas.size()), - MPI_DOUBLE, MPI_MAX, r, comm); - } + MPI_Allreduce(betas.data(), betasReduced.data(), static_cast(betas.size()), MPI_DOUBLE, OP_MAX_ABS, comm); + + MPI_Op_free( &OP_MAX_ABS ); } template @@ -311,7 +308,8 @@ void CombiCom::FGAllreduce(FullGrid& fg, MPI_Comm comm) { } template -void CombiCom::BetasReduce(std::vector& betas, RankType r, MPI_Comm comm) { +void CombiCom::BetasReduce(std::vector& betas, std::vector& betasReduced, + MPI_Comm comm) { assert(!"this type is not yet implemented"); } diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.cpp index 93c679cf2..427a2929f 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.cpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.cpp @@ -157,6 +157,7 @@ void CombiMinMaxScheme::computeCombiCoeffsClassical(){ LevelType p = n_ - l1; // Classical combination coefficients coefficients_.push_back(std::pow(-1, p)*boost::math::binomial_coefficient(effDim_ - 1, p) ); + } } diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.cpp new file mode 100644 index 000000000..6fac0f00a --- /dev/null +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.cpp @@ -0,0 +1,503 @@ +#include 
"sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp" + +namespace combigrid { +template +T str_to_number(const std::string& no) { + T value; + std::stringstream stream(no); + stream >> value; + + if (stream.fail()) { + std::runtime_error e(no); + std::cout << "Error in the conversion of " << no << "!" << std::endl; + throw e; + } + + return value; +} + +template +void remove(std::vector& vec, size_t pos) { + auto it = vec.begin(); + std::advance(it, pos); + vec.erase(it); +} + +std::string python_code_caller(const std::string& script_name, + const LevelVectorList& levels, const int& dim) { + LevelType levels_no = 0; + LevelType level_size = 0; + LevelType one_level = 0; + std::stringstream caller; + + levels_no = static_cast(levels.size()); + level_size = static_cast(levels[0].size()); + + caller << "python " << script_name << " " << dim << " "; + + for (int i = 0; i < levels_no; ++i) { + for (int j = 0; j < level_size; ++j) { + one_level = levels[i][j]; + caller << one_level << " "; + } + } + + return caller.str(); +} + +CombigridDict get_python_data(const std::string& script_run, const int& dim) { + FILE* stream; + char buffer[256]; + std::string level_x_str; + std::string level_y_str; + std::string coeff_str; + + double coeff = 0.0; + + CombigridDict dict; + + stream = popen(script_run.c_str(), "r"); + + if (stream) { + while (!feof(stream)) { + if (fgets(buffer, sizeof(buffer), stream) != NULL) { + std::string one_level_str; + int one_level = 0; + LevelVector levels; + std::stringstream temp(buffer); + + for (int i = 0; i < dim; ++i) { + temp >> one_level_str; + one_level = str_to_number(one_level_str); + levels.push_back(one_level); + } + + temp >> coeff_str; + coeff = str_to_number(coeff_str); + + dict.insert(std::make_pair(levels, coeff)); + } + } + + pclose(stream); + } else { + throw "Error reading script output!"; + } + + return dict; +} + +matrix create_M_matrix(const CombigridDict& aux_downset, const int& dim) { + int size_downset = static_cast(aux_downset.size()); + int i = 0; + int j = 0; + + matrix M(size_downset, std::vector(size_downset, 0.0)); + + for (auto ii = aux_downset.begin(); ii != aux_downset.end(); ++ii) { + i = static_cast(ii->second); + j = 0; + + LevelVector w; + for (int it = 0; it < dim; ++it) { + w.push_back(ii->first[it]); + } + + for (auto jj = aux_downset.begin(); jj != aux_downset.end(); ++jj) { + LevelVector c; + for (int it = 0; it < dim; ++it) { + c.push_back(jj->first[it]); + } + j = static_cast(jj->second); + + if (test_greater(c, w)) { + M[i][j] = 1.0; + } else { + M[i][j] = 0.0; + } + } + } + + return M; +} + +matrix get_inv_M(const CombigridDict& aux_downset, const int& dim) { + int size_downset = static_cast(aux_downset.size()); + int i = 0; + int j = 0; + + std::valarray c(dim); + std::valarray w(dim); + std::valarray diff(dim); + std::valarray zeros(0, dim); + + matrix M_inv(size_downset, std::vector(size_downset, 0.0)); + + for (auto ii = aux_downset.begin(); ii != aux_downset.end(); ++ii) { + i = static_cast(ii->second); + for (int it = 0; it < dim; ++it) { + w[it] = static_cast(ii->first[it]); + } + + for (auto jj = ii; jj != aux_downset.end(); ++jj) { + j = static_cast(jj->second); + for (int it = 0; it < dim; ++it) { + c[it] = static_cast(jj->first[it]); + } + + diff = c - w; + + if (((diff.sum() > 0) || (diff.sum() <= dim)) + && ((diff.max() <= 1) && (diff >= zeros).min())) { + M_inv[i][j] = pow(-1, diff.sum()); + } + } + } + + return M_inv; +} + +CombigridDict set_entire_downset_dict(const LevelVectorList levels, + const 
CombigridDict& received_dict, const int& dim) { + LevelVector level_min = levels.front(); + LevelVector level_max = levels.back(); + CombigridDict active_downset; + CombigridDict output; + + LevelVector level_active_downset; + + LevelVector level; + LevelVectorList all_levels; + LevelVectorList feasible_levels; + + double key = 0.0; + + all_levels = mindex(dim, level_max); + + for (auto ii = received_dict.begin(); ii != received_dict.end(); ++ii) { + if (ii->second > 0.0) { + active_downset.insert(std::make_pair(ii->first, ii->second)); + } + } + + for (auto ii = active_downset.begin(); ii != active_downset.end(); ++ii) { + level_active_downset = ii->first; + + for (unsigned int i = 0; i < all_levels.size(); ++i) { + if (test_greater(level_active_downset, all_levels[i]) + && test_greater(all_levels[i], level_min)) { + feasible_levels.push_back(all_levels[i]); + } + } + } + + for (unsigned int i = 0; i < feasible_levels.size(); ++i) { + level = feasible_levels[i]; + auto ii = received_dict.find(level); + + if (ii != received_dict.end()) { + key = ii->second; + output.insert(std::make_pair(level, key)); + } else { + key = 0.0; + output.insert(std::make_pair(level, key)); + } + } + + return output; +} + +CombigridDict create_aux_entire_dict(const CombigridDict& entire_downset, + const int& dim) { + real key = 0; + int i = 0; + + CombigridDict aux_dict; + + for (auto ii = entire_downset.begin(); ii != entire_downset.end(); ++ii) { + LevelVector levels; + key = static_cast(i); + + for (int i = 0; i < dim; ++i) { + levels.push_back(ii->first[i]); + } + + ++i; + aux_dict.insert(std::make_pair(levels, key)); + } + + return aux_dict; +} + +LevelVectorList get_downset_indices(const CombigridDict& entire_downset, + const int& dim) { + LevelVectorList indices; + + for (auto ii = entire_downset.begin(); ii != entire_downset.end(); ++ii) { + LevelVector index; + + for (int i = 0; i < dim; ++i) { + index.push_back(ii->first[i]); + } + + indices.push_back(index); + } + + return indices; +} + +LevelVectorList filter_faults(const LevelVectorList& faults_input, const IndexType& l_max, + const CombigridDict& received_dict) { + int no_faults = 0; + int level_fault = 0; + LevelVectorList faults_output; + + no_faults = static_cast(faults_input.size()); + + for (int i = 0; i < no_faults; ++i) { + auto it = received_dict.find(faults_input[i]); + + if (it != received_dict.end()) { + level_fault = std::accumulate(faults_input[i].begin(), + faults_input[i].end(), 0); + if ((level_fault == l_max) || (level_fault == (l_max - 1))) { + faults_output.push_back(faults_input[i]); + } + } + } + + return faults_output; +} + +CombigridDict create_out_dict(const CombigridDict& given_downset, + const std::vector& new_c, const int& dim) { + real key = 0; + int i = 0; + + CombigridDict out_dict; + + for (auto ii = given_downset.begin(); ii != given_downset.end(); ++ii) { + LevelVector levels; + key = new_c[i]; + + for (int i = 0; i < dim; ++i) { + levels.push_back(ii->first[i]); + } + + ++i; + out_dict.insert(std::make_pair(levels, key)); + } + + return out_dict; +} + +std::string set_aux_var_name(const std::string& var_name, const int& index) { + std::stringstream aux_var; + aux_var << var_name << index; + + return aux_var.str(); +} +// generate integer random numbers between 0 and #levels-1 +//int generate_random_fault(const int& no_of_levels) { +// std::random_device dev; +// std::mt19937 rng(dev()); +// std::uniform_int_distribution rand_num(0, no_of_levels - 1); +// +// return rand_num(rng); +//} + +std::vector gen_rand(const 
int& size) { + double rand_var = 0.0; + std::vector output; + + for (int i = 0; i < size; ++i) { + rand_var = 1e-2 * (std::rand() % 10); + output.push_back(rand_var); + } + + return output; +} + +int get_size_downset(const std::vector& level_max, const int& dim) { + int size = 1; + int min_level_max = *std::min_element(level_max.begin(), level_max.end()); + + for (int i = 0; i < dim; ++i) { + size *= (min_level_max + i); + } + + size = static_cast(size / (factorial(dim))); + + return size; +} + +int l1_norm(const LevelVector& u) { + int norm = 0; + + for (auto elem : u) { + norm += abs(static_cast(elem)); + } + + return norm; +} + +int factorial(const int& dim) { + int fact = 0; + + if (dim == 0 || dim == 1) { + fact = 1; + } else { + fact = dim * factorial(dim - 1); + } + + return fact; +} + +bool test_greater(const LevelVector& b, const LevelVector& a) { + int dim = static_cast(a.size()); + bool test = true; + + for (int i = 0; i < dim; ++i) { + test *= (b[i] >= a[i]) ? true : false; + } + + return test; +} + +LevelVectorList mindex(const int& dimension, const LevelVector& level_max) { + int j = 0; + int norm = 0; + IndexType sum_level_max = 0; + + LevelVector temp(dimension, 1); + LevelVectorList mindex_result; + + auto upper_limit = std::max_element(level_max.begin(), level_max.end()); + + for (auto elem : level_max) { + sum_level_max += elem; + } + + while (true) { + norm = l1_norm(temp); + + if (norm <= sum_level_max) { + mindex_result.push_back(temp); + } + + for (j = dimension - 1; j >= 0; --j) { + if (++temp[j] <= *upper_limit) + break; + else + temp[j] = 1; + } + + if (j < 0) + break; + } + + return mindex_result; +} + +LevelVectorList check_dimensionality(const LevelVectorList& input_levels, + LevelVector& ignored_dimensions) { + LevelVector l_min = input_levels[0]; + LevelVector l_max = input_levels[1]; + + LevelVector new_l_min; + LevelVector new_l_max; + LevelVectorList new_levels; + + for ( size_t i = 0; i < l_min.size(); ++i) { + if (l_max[i] == l_min[i]) { + ignored_dimensions.push_back(i); + } else { + new_l_min.push_back(l_min[i]); + new_l_max.push_back(l_max[i]); + } + } + + new_levels.push_back(new_l_min); + new_levels.push_back(new_l_max); + + return new_levels; +} + +LevelVectorList check_faults(const LevelVectorList& input_faults, + const LevelVector& ignored_dimensions) { + LevelVectorList new_faults; + + for (unsigned int i = 0; i < input_faults.size(); ++i) { + LevelVector new_fault; + + for (unsigned int j = 0; j < input_faults[0].size(); ++j) { + if (std::find(ignored_dimensions.begin(), ignored_dimensions.end(), j) + == ignored_dimensions.end()) { + new_fault.push_back(input_faults[i][j]); + } + } + + new_faults.push_back(new_fault); + } + + return new_faults; +} + +CombigridDict set_new_given_dict(const CombigridDict& given_dict, + const LevelVector& ignored_dimensions, const int& dim) { + real key = 0.0; + CombigridDict new_given_dict; + + for (auto ii = given_dict.begin(); ii != given_dict.end(); ++ii) { + LevelVector new_level; + + for (int i = 0; i < dim; ++i) { + if (std::find(ignored_dimensions.begin(), ignored_dimensions.end(), i) + == ignored_dimensions.end()) { + new_level.push_back(ii->first[i]); + } + } + + key = ii->second; + new_given_dict.insert(std::make_pair(new_level, key)); + } + + return new_given_dict; +} + +void check_input_levels(const LevelVectorList& levels) { + LevelVector l_min = levels[0]; + LevelVector l_max = levels[1]; + LevelVector c; + + for (unsigned int i = 0; i < l_min.size(); ++i) { + c.push_back(l_max[i] - l_min[i]); + } + 
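+  // the levels are valid iff all entries of c = l_max - l_min are identical;
+  // adjacent_find with not_equal_to() returns end() exactly when no two
+  // neighbouring entries differ, i.e. when every entry is equal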
+ if (std::adjacent_find(c.begin(), c.end(), std::not_equal_to()) + == c.end()) { + std::cout << "Correct input levels!" << std::endl; + } else { + std::cout << "Input levels are incorrect!" << std::endl; + std::cout + << "Please input them of the form: l_max = l_min + c*ones(dim), c>=1, integer" + << std::endl; + exit(0); + } +} + +std::vector select_coeff_downset(const std::vector& all_c, + const CombigridDict& given_downset, const CombigridDict& aux_downset) { + int given_downset_index = 0; + std::vector donwset_c; + + for (auto ii = aux_downset.begin(); ii != aux_downset.end(); ++ii) { + if (given_downset.find(ii->first) != given_downset.end()) { + given_downset_index = static_cast(ii->second); + donwset_c.push_back(all_c.at(given_downset_index)); + } + } + + return donwset_c; +} +} // namespace combigrid diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp new file mode 100644 index 000000000..2ad94153f --- /dev/null +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp @@ -0,0 +1,149 @@ +#ifndef HELPER_HPP_ +#define HELPER_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sgpp/distributedcombigrid/utils/Types.hpp" +#include "sgpp/distributedcombigrid/utils/LevelVector.hpp" + +namespace combigrid { + +typedef std::vector> matrix; +typedef std::map IdToLevelDict; +typedef std::map CombigridDict; +typedef std::vector LevelVectorList; + +/* used to convert a string to a number in any format */ +template +T str_to_number(const std::string& no); + +/* used to remove a vector element of vec from position pos */ +template +void remove(std::vector& vec, size_t pos); + +/* used for calling the python code as python script_name level_min level_max */ +std::string python_code_caller(const std::string& script_name, + const LevelVectorList& levels, const int& dim); + +/* used to get data for the GCP when minimizing the interpolation error */ +CombigridDict get_python_data(const std::string& script_run, const int& dim); + +/* used to create the M matrix for the interpolation based problem */ +matrix create_M_matrix(const CombigridDict& aux_downset, const int& dim); + +/* used to create the inverse of M */ +matrix get_inv_M(const CombigridDict& aux_downset, const int& dim); + +/* used to create the inverse of M matrix for the interpolation based problem */ +CombigridDict set_entire_downset_dict(const LevelVectorList levels, + const CombigridDict& received_dict, const int& dim); + +/* used to get a vector of the entire downset indices */ +LevelVectorList get_downset_indices(const CombigridDict& entire_downset, + const int& dim); + +/* used to filter the input LevelVectorList faults such that only faults from the partial downset + (input from python) are considered */ +LevelVectorList filter_faults(const LevelVectorList& faults_input, const IndexType& l_max, + const CombigridDict& received_dict); + +/* used to create an entire downset dictionary used for setting up the M matrix */ +CombigridDict create_aux_entire_dict(const CombigridDict& entire_downset, + const int& dim); + +/* used to print the new dictionary after the optimization is performed */ +CombigridDict create_out_dict(const CombigridDict& given_downset, + const std::vector& new_c, const int& dim); + +/* used to set the row and column variables for the optimization problem 
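 (for example, set_aux_var_name("w", 3) simply concatenates its arguments and returns "w3")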
*/ +std::string set_aux_var_name(const std::string& var_name, const int& index); + +/* generates no_of_levels random faults */ +//int generate_random_fault(const int& no_of_levels); + +/* used to generate random variables for the W matrix in the optimization problem */ +std::vector gen_rand(const int& size); + +/* used to compute the size of the downset */ +int get_size_downset(const LevelVector& level_max, const int& dim); + +/* used to compute the L1 norm of a vector */ +int l1_norm(const LevelVector& u); + +/* used to compute factorial; needed to compute size of the downset */ +int factorial(const int& dim); + +/* test whether b >= a */ +bool test_greater(const LevelVector& b, const LevelVector& a); + +/* used to create a multi-index based on maximum level */ +LevelVectorList mindex(const int& dimension, const LevelVector& level_max); + +/* used to check input levels dimensionality */ +LevelVectorList check_dimensionality(const LevelVectorList& input_levels, + LevelVector& ignored_dimensions); + +/* used to ignore certain dimensions of the input faults based on input levels & ignored dimension */ +LevelVectorList check_faults(const LevelVectorList& input_faults, + const LevelVector& ignored_dimensions); + +/* used to create a new dictionary, based on the given dictionary and the ignored dimensions */ +CombigridDict set_new_given_dict(const CombigridDict& given_dict, + const LevelVector& ignored_dimensions, const int& dim); + +/* used to check whether the input levels are correct */ +/* i.e. they satisfy: l_max - l_min = c*ones(dim) */ +void check_input_levels(const LevelVectorList& levels); + +/* used to extract only the coefficients output by the optimization problem */ +/* which correspond to levels from the given downset */ +std::vector select_coeff_downset(const std::vector& all_c, + const CombigridDict& given_downset, const CombigridDict& aux_downset); + +/* contains information about fault simulation: +** numFaults_: how many faults occur +** iterationFaults_: vector of timesteps at which processes fail +** globalRankFaults_: global rank of process that fails +**/ +struct FaultsInfo { +public: + int numFaults_; + IndexVector iterationFaults_; + IndexVector taskFaults_; + IndexVector sdcIndex_; + CombiDataType sdcMag_; + int sdcMethod_; +private: + friend class boost::serialization::access; + // serialize + template + void serialize(Archive& ar, const unsigned int version); +}; + + +template +void FaultsInfo::serialize(Archive& ar, const unsigned int version) { + ar& numFaults_; + ar& iterationFaults_; + ar& taskFaults_; + ar& sdcIndex_; + ar& sdcMag_; + ar& sdcMethod_; +} + +} // namespace combigrid +#endif /* HELPER_HPP_ */ diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimization.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimization.hpp new file mode 100644 index 000000000..3dc8133d5 --- /dev/null +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimization.hpp @@ -0,0 +1,57 @@ +#ifndef LPOPT_HPP_ +#define LPOPT_HPP_ + +#include +#include +#include +#include +#include +#include +#include + +#include "sgpp/distributedcombigrid/fault_tolerance/FTUtils.hpp" + +#include "glpk.h" + + +namespace combigrid { + +class LP_OPT { +protected: + /* optimization type: GLP_MIN or GLP_MAX */ + int opt_type; + + /* glp problem; used in every glpk function */ + glp_prob* i_lp_prob; + + /* constraint matrix */ + std::vector constr_mat; + /* row index vector for the constraint matrix */ + std::vector 
row_index; + /* colums index vector for the constraint matrix */ + std::vector col_index; + +public: + /* used for LP optimization problem initialization */ + /* number of auxiliary and structural variables is set */ + /* as well as objective function and the constraints */ + virtual void init_opti_prob(const std::string& prob_name) = 0; + + /* used to set up the constraint matrix and its row and column indices*/ + virtual void set_constr_matrix() = 0; + + /* used to set up the constraint matrix and its row and column indices when it depends on matrix W*/ + virtual void set_constr_matrix(const std::vector& W) = 0; + + /* used to solve the linear programming problem, using the simplex algorithm */ + virtual void solve_opti_problem() const = 0; + + /* used to get the output of the LP problem, i.e. c and d vectors */ + virtual CombigridDict get_results(LevelVectorList& faults) const = 0; + + /* destructor; used to deallocate all the allocated memory */ + virtual ~LP_OPT() { + } +}; +} +#endif /* LPOPT_HPP_ */ diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.cpp new file mode 100644 index 000000000..5880de709 --- /dev/null +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.cpp @@ -0,0 +1,356 @@ +#include "sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.hpp" + +namespace combigrid { + +LP_OPT_INTERP::LP_OPT_INTERP() { + i_dim = 0; + i_levels = { {0}}; + level_max = {0}; + no_faults = 0; + l_max = 0; + size_downset = 0; +} + +LP_OPT_INTERP::LP_OPT_INTERP(const LevelVectorList& _levels, const int& _dim, + const int& _opt_type, const CombigridDict& _given_downset, + const LevelVectorList& _input_faults) { + assert(_opt_type == GLP_MIN || _opt_type == GLP_MAX); + i_levels = _levels; + i_dim = _dim; + opt_type = _opt_type; + given_downset = _given_downset; + input_faults = _input_faults; + + new_levels = check_dimensionality(_levels, ignored_dimensions); + new_faults = check_faults(_input_faults, ignored_dimensions); + new_dim = static_cast(new_levels[0].size()); + new_given_downset = set_new_given_dict(given_downset, ignored_dimensions, + _dim); + + //check_input_levels(new_levels); + + level_max = new_levels.back(); + entire_downset = set_entire_downset_dict(new_levels, new_given_downset, + new_dim); + + l_max = new_levels[1][0]; + for (int i = 1; i < new_dim; ++i) { + l_max += new_levels[0][i]; + } + + valid_input_faults = filter_faults(new_faults, l_max, new_given_downset); + + for (auto fault : new_faults ){ + if( std::find( valid_input_faults.begin(), valid_input_faults.end(), fault) == valid_input_faults.end() ) + recompute_faults.push_back(fault); + } + + no_faults = static_cast(valid_input_faults.size()); + no_faults_recompute = static_cast(recompute_faults.size()); + + if (no_faults == 0) { + std::cout << "No faults to fix." 
<< std::endl; + i_lp_prob = glp_create_prob(); + } else { + + aux_entire_dict = create_aux_entire_dict(entire_downset, new_dim); + inv_M = get_inv_M(aux_entire_dict, new_dim); + downset_indices = get_downset_indices(entire_downset, new_dim); + + size_downset = static_cast(entire_downset.size()); + total_size = no_faults * size_downset; + + constr_mat.reserve(1 + total_size); + row_index.reserve(1 + total_size); + col_index.reserve(1 + total_size); + + i_lp_prob = glp_create_prob(); + assert(i_lp_prob != NULL); + } +} + +LP_OPT_INTERP::LP_OPT_INTERP(const LP_OPT_INTERP& obj) { + i_levels = obj.i_levels; + i_dim = obj.i_dim; + opt_type = obj.opt_type; + given_downset = obj.given_downset; + input_faults = obj.input_faults; + + new_levels = obj.new_levels; + new_faults = obj.new_faults; + ignored_dimensions = obj.ignored_dimensions; + new_dim = obj.new_dim; + new_given_downset = obj.new_given_downset; + + level_max = obj.level_max; + size_downset = obj.size_downset; + + l_max = obj.l_max; + + valid_input_faults = obj.valid_input_faults; + no_faults = obj.no_faults; + + total_size = obj.total_size; + + entire_downset = obj.entire_downset; + aux_entire_dict = obj.aux_entire_dict; + inv_M = obj.inv_M; + + downset_indices = obj.downset_indices; + + constr_mat = obj.constr_mat; + row_index = obj.row_index; + col_index = obj.col_index; + + i_lp_prob = glp_create_prob(); + assert(i_lp_prob != NULL); + std::memcpy(i_lp_prob, obj.i_lp_prob, 1 * sizeof(i_lp_prob)); +} + +LP_OPT_INTERP& LP_OPT_INTERP::operator=(const LP_OPT_INTERP& rhs) { + if (&rhs == this) { + return *this; + } + + i_levels = rhs.i_levels; + i_dim = rhs.i_dim; + opt_type = rhs.opt_type; + given_downset = rhs.given_downset; + input_faults = rhs.input_faults; + + new_levels = rhs.new_levels; + new_faults = rhs.new_faults; + ignored_dimensions = rhs.ignored_dimensions; + new_dim = rhs.new_dim; + new_given_downset = rhs.new_given_downset; + + level_max = rhs.level_max; + size_downset = rhs.size_downset; + + l_max = rhs.l_max; + + valid_input_faults = rhs.valid_input_faults; + no_faults = rhs.no_faults; + + total_size = rhs.total_size; + + entire_downset = rhs.entire_downset; + aux_entire_dict = rhs.aux_entire_dict; + inv_M = rhs.inv_M; + + downset_indices = rhs.downset_indices; + + constr_mat = rhs.constr_mat; + row_index = rhs.row_index; + col_index = rhs.col_index; + + i_lp_prob = glp_create_prob(); + assert(i_lp_prob != NULL); + std::memcpy(i_lp_prob, rhs.i_lp_prob, 1 * sizeof(i_lp_prob)); + + return *this; +} + +void LP_OPT_INTERP::init_opti_prob(const std::string& prob_name) { + std::string aux_var; + double neg_norm = 0.0; + double coeff = 0.0; + + glp_set_prob_name(i_lp_prob, prob_name.c_str()); + glp_set_obj_dir(i_lp_prob, opt_type); + + glp_add_rows(i_lp_prob, no_faults); + glp_add_cols(i_lp_prob, size_downset); + + for (int i = 0; i < no_faults; ++i) { + aux_var = set_aux_var_name("eq_constr_", i + 1); + glp_set_row_name(i_lp_prob, i + 1, aux_var.c_str()); + glp_set_row_bnds(i_lp_prob, i + 1, GLP_FX, 0.0, 0.0); + } + + for (int i = 0; i < size_downset; ++i) { + neg_norm = -l1_norm(downset_indices[i]); + coeff = pow(4.0, neg_norm); + + aux_var = set_aux_var_name("w", i + 1); + glp_set_col_name(i_lp_prob, i + 1, aux_var.c_str()); + glp_set_col_bnds(i_lp_prob, i + 1, GLP_DB, 0.0, 1.0); + glp_set_obj_coef(i_lp_prob, i + 1, coeff); + glp_set_col_kind(i_lp_prob, i + 1, GLP_BV); + } +} + +void LP_OPT_INTERP::set_constr_matrix() { + int inv_M_row_index = 0; + + for (int i = 0; i < no_faults; ++i) { + auto it = 
aux_entire_dict.find(valid_input_faults[i]); + + if (it != aux_entire_dict.end()) { + inv_M_row_index = static_cast(it->second); + + for (int j = 0; j < size_downset; ++j) { + constr_mat[j + i * size_downset + 1] = inv_M[inv_M_row_index][j]; + } + } + } + + for (int i = 0; i < no_faults; ++i) { + for (int j = 0; j < size_downset; ++j) { + row_index[j + i * size_downset + 1] = i + 1; + col_index[j + i * size_downset + 1] = j + 1; + } + } +} + +void LP_OPT_INTERP::set_constr_matrix(const std::vector& W) { + /* TO DO: nothing here */ +} + +void LP_OPT_INTERP::solve_opti_problem() const { + glp_load_matrix(i_lp_prob, total_size, &row_index[0], &col_index[0], + &constr_mat[0]); + glp_simplex(i_lp_prob, NULL); + glp_intopt(i_lp_prob, NULL); +} + +CombigridDict LP_OPT_INTERP::get_results( LevelVectorList& recomp_faults ) const { + int status = 0; + + real w_i = 0.0; + real c_i = 0.0; + std::vector w; + std::vector all_c; + std::vector new_c; + + CombigridDict output; + + status = glp_mip_status(i_lp_prob); + + if (status == GLP_OPT) { + for (int i = 0; i < size_downset; ++i) { + w_i = glp_get_col_prim(i_lp_prob, i + 1); + w.push_back(w_i); + } + + for (int i = 0; i < size_downset; ++i) { + c_i = 0.0; + for (int j = 0; j < size_downset; ++j) { + c_i += inv_M[i][j] * w[j]; + } + all_c.push_back(c_i); + } + std::cout << std::endl; + + // Print failed tasks + std::cout << "The following tasks failed:" << std::endl; + for (unsigned int i = 0; i < input_faults.size(); ++i) { + std::cout << "["; + for (int j = 0; j < i_dim; ++j) { + std::cout << input_faults[i][j] << " "; + } + std::cout << "], "; + } + std::cout << std::endl; + + // Print ignored dimensions (if any) + std::cout << "Ignored dimensions" << std::endl; + if (ignored_dimensions.size() == 0) { + std::cout << "None!"; + } + for (unsigned int i = 0; i < ignored_dimensions.size(); ++i) { + std::cout << ignored_dimensions[i] + 1 << " "; + } + std::cout << std::endl; + + // Print faults again, but ignoring those in lower diagonals + std::cout << "Faults relevant for optimization problem:" << std::endl; + for (int i = 0; i < no_faults; ++i) { + std::cout << "["; + for (int j = 0; j < new_dim; ++j) { + std::cout << valid_input_faults[i][j] << " "; + } + std::cout << "], "; + } + std::cout << std::endl; + + // Print all faults from lower diagonals + std::cout << "Faults to be potentially recomputed:" << std::endl; + for ( size_t i = 0; i < recompute_faults.size(); ++i) { + std::cout << "["; + for (int j = 0; j < new_dim; ++j) { + std::cout << recompute_faults[i][j] << " "; + } + std::cout << "], "; + } + std::cout << std::endl; + + new_c = select_coeff_downset(all_c, new_given_downset, aux_entire_dict); + output = create_out_dict(given_downset, new_c, i_dim); + + // Print original combischeme + std::cout << "Combination scheme before optimization: " << std::endl; + for (auto it = given_downset.begin(); it != given_downset.end(); ++it) { + std::cout << "["; + for (int j = 0; j < i_dim; ++j) { + std::cout << it->first[j] << " "; + } + std::cout << "]: " << it->second << ", "; + } + std::cout << std::endl << std::endl; + + // Print new combination scheme + std::cout << "Combination scheme after optimization: " << std::endl; + for (auto it = output.begin(); it != output.end(); ++it) { + std::cout << "["; + for (int j = 0; j < i_dim; ++j) { + std::cout << it->first[j] << " "; + } + std::cout << "]: " << it->second << ", "; + } + std::cout << std::endl; + + // Determine tasks to be recomputed + for (auto fault : recompute_faults){ + if (output[fault] != 
0){ + recomp_faults.push_back(fault); + } + } + recompute_faults.clear(); + recompute_faults = recomp_faults; + + // Print tasks to be recomputed + std::cout << "Faults to be recomputed:" << std::endl; + for ( size_t i = 0; i < recomp_faults.size(); ++i) { + std::cout << "["; + for (int j = 0; j < new_dim; ++j) { + std::cout << recomp_faults[i][j] << " "; + } + std::cout << "], "; + } + std::cout << std::endl; + + } else { + std::cout << "Error: Optimal solution not found!" << std::endl; + exit(0); + } + + return output; +} + +int LP_OPT_INTERP::getNumFaults(){ + return no_faults; +} + +int LP_OPT_INTERP::getNumFaultsRecompute(){ + return no_faults_recompute; +} + +LevelVectorList LP_OPT_INTERP::getFaultsRecompute(){ + return recompute_faults; +} + +LP_OPT_INTERP::~LP_OPT_INTERP() { + glp_delete_prob(i_lp_prob); +} +} diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.hpp new file mode 100644 index 000000000..d3fc1910e --- /dev/null +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.hpp @@ -0,0 +1,89 @@ +#ifndef LPOPTINTERP_HPP_ +#define LPOPTINTERP_HPP_ + +#include "sgpp/distributedcombigrid/fault_tolerance/LPOptimization.hpp" + +namespace combigrid { + +class LP_OPT_INTERP: public LP_OPT { +private: + /* levels of grid indices */ + LevelVectorList i_levels; + /* top level of grid indices*/ + LevelVector level_max; + /* dimension of the problem */ + int i_dim; + /* total size of the optimization problem */ + int total_size; + + /* new dimensionality after the input levels are checked */ + int new_dim; + /* new levels based on new_dim */ + LevelVectorList new_levels; + /* if it is the case, the dimension(s) that is/are ignored */ + LevelVector ignored_dimensions; + /* if it is the case, new faults, based on ignored dimensions */ + LevelVectorList new_faults; + + /* no. of constraints */ + int no_faults; + + /* no. of faults to recompute*/ + int no_faults_recompute; + /* level max sum */ + IndexType l_max; + + // total size of the optimization problem + /* down set size */ + int size_downset; + + /* inverse of M s.t. 
w = Mc*/ + matrix inv_M; + + /* given dictionary */ + CombigridDict given_downset; + /* modified given downset based on ignored dimensions */ + CombigridDict new_given_downset; + /* entire donwset with corresponding indices */ + CombigridDict entire_downset; + /* auxiliary dictionary, used to create M and inv(M) */ + CombigridDict aux_entire_dict; + /* downset indices as a 2d vector*/ + LevelVectorList downset_indices; + /* faults input by user */ + LevelVectorList input_faults; + /* faults that are in the given downset from the set of input faults */ + LevelVectorList valid_input_faults; + /* faults that have to be recomputed */ + mutable LevelVectorList recompute_faults; + +public: + LP_OPT_INTERP(); + + LP_OPT_INTERP(const LevelVectorList& _levels, const int& _dim, const int& _opt_type, + const CombigridDict& _given_downset, const LevelVectorList& _input_faults); + + LP_OPT_INTERP(const LP_OPT_INTERP& obj); + + LP_OPT_INTERP& operator=(const LP_OPT_INTERP& rhs); + + virtual void init_opti_prob(const std::string& prob_name); + + virtual void set_constr_matrix(); + + virtual void set_constr_matrix(const std::vector& W); + + virtual int getNumFaults(); + + virtual int getNumFaultsRecompute(); + + virtual LevelVectorList getFaultsRecompute(); + + virtual void solve_opti_problem() const; + + virtual CombigridDict get_results( LevelVectorList& recomp_faults ) const; + + virtual ~LP_OPT_INTERP(); +}; +} +#endif /* LPOPTINTERP_HPP_ */ diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/fullgrid/DistributedFullGrid.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/fullgrid/DistributedFullGrid.hpp index 80787b603..2f5d6fce4 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/fullgrid/DistributedFullGrid.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/fullgrid/DistributedFullGrid.hpp @@ -946,8 +946,8 @@ class DistributedFullGrid { CombiDataType minAbsVal = std::min(std::abs(*it_sub[subFgId]), std::abs(fullgridVector_[i])); *it_sub[subFgId] += coeff * fullgridVector_[i]; // todo: what is a good tolerance? 
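// The hunk below lowers the tolerance guard from 1e-6 to 1e-10 and comments out
// the absolute-value step before dividing the accumulated entry by the smaller
// of the two magnitudes. A minimal stand-alone sketch of that relative-difference
// measure follows; the helper name and free-function form are illustrative only,
// not part of DistributedFullGrid:
#include <algorithm>
#include <cmath>

// Relative difference of two contributions a and b: the signed difference is
// normalized by the smaller magnitude, but only when that magnitude is safely
// above a tolerance, so near-zero values do not blow the ratio up.
inline double relativeDifference(double a, double b, double tol = 1e-10) {
  double diff = a - b;  // mirrors accumulating with coefficients +1 and -1
  double minAbs = std::min(std::abs(a), std::abs(b));
  if (minAbs > tol)     // guard against dividing by a near-zero value
    diff /= minAbs;     // otherwise keep the plain (absolute) difference
  return diff;
}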
- if (minAbsVal > 1e-6){ - *it_sub[subFgId] = std::abs(*it_sub[subFgId]); + if (minAbsVal > 1e-10){ +// *it_sub[subFgId] = std::abs(*it_sub[subFgId]); *it_sub[subFgId] /= minAbsVal; } ++it_sub[subFgId]; diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/hierarchization/Hierarchization.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/hierarchization/Hierarchization.hpp index b8566c718..84592d04d 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/hierarchization/Hierarchization.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/hierarchization/Hierarchization.hpp @@ -11,7 +11,6 @@ #include "boost/lexical_cast.hpp" #include #include "sgpp/distributedcombigrid/fullgrid/FullGrid.hpp" -#include "sgpp/distributedcombigrid/utils/combigrid_ultils.hpp" #include "sgpp/distributedcombigrid/utils/StatsContainer.hpp" /* @@ -83,7 +82,7 @@ class Hierarchization { IndexType jump; lldiv_t divresult; - theStatsContainer()->setTimerStart("hierarchize_dim_0"); +// theStatsContainer()->setTimerStart("hierarchize_dim_0"); // dimension 1 separate as start of each pole is easier to calculate IndexType ndim = n[0]; @@ -107,11 +106,11 @@ class Hierarchization { } // end dimension 1 - theStatsContainer()->setTimerStop("hierarchize_dim_0"); +// theStatsContainer()->setTimerStop("hierarchize_dim_0"); for (DimType dim = 1; dim < d; dim++) { // hierarchize for all dims - theStatsContainer()->setTimerStart( - "hierarchize_dim_" + boost::lexical_cast(dim)); +// theStatsContainer()->setTimerStart( +// "hierarchize_dim_" + boost::lexical_cast(dim)); stride *= ndim; ndim = n[dim]; @@ -139,8 +138,8 @@ class Hierarchization { } } - theStatsContainer()->setTimerStop( - "hierarchize_dim_" + boost::lexical_cast(dim)); +// theStatsContainer()->setTimerStop( +// "hierarchize_dim_" + boost::lexical_cast(dim)); } // end loop over dimension 2 to d diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/CombiParameters.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/CombiParameters.hpp index 7b3772ae3..49c44c975 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/CombiParameters.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/CombiParameters.hpp @@ -141,6 +141,8 @@ void CombiParameters::serialize(Archive& ar, const unsigned int version) { ar& boundary_; ar& levels_; ar& coeffs_; + ar& levelsToIDs_; + ar& combiDictionary_; } } diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.cpp index 179e77746..721871da1 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.cpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.cpp @@ -99,21 +99,19 @@ bool ProcessGroupManager::combine() { } bool ProcessGroupManager::updateCombiParameters(CombiParameters& params) { - // can only send sync signal when in wait state - assert(status_ == PROCESS_GROUP_WAIT); + // can only send sync signal when not in busy state + assert(status_ != PROCESS_GROUP_BUSY); SignalType signal = UPDATE_COMBI_PARAMETERS; MPI_Send(&signal, 1, MPI_INT, pgroupRootID_, signalTag, theMPISystem()->getGlobalComm()); // send combiparameters MPIUtils::sendClass(¶ms, pgroupRootID_, theMPISystem()->getGlobalComm()); - // set status status_ = PROCESS_GROUP_BUSY; // start non-blocking MPI_IRecv to receive status recvStatus(); - return true; } @@ -144,6 +142,56 @@ bool 
ProcessGroupManager::addTask( Task* t ) { // only return true if task successfully send to pgroup return true; } +bool ProcessGroupManager::reinitTask( Task* t ) { + // first check status + // tying to reinit a task in a busy group is an invalid operation + // and should be avoided + if (status_ != PROCESS_GROUP_WAIT) + return false; + + // send reinit task signal to pgroup + SignalType signal = REINIT_TASK; + MPI_Send(&signal, 1, MPI_INT, pgroupRootID_, signalTag, theMPISystem()->getGlobalComm()); + + // send task + Task::send(&t, pgroupRootID_, theMPISystem()->getGlobalComm()); + + // set status + status_ = PROCESS_GROUP_BUSY; + + // start non-blocking MPI_IRecv to receive status + recvStatus(); + + // only return true if task successfully send to pgroup + return true; +} + +bool ProcessGroupManager::recompute( Task* t ) { + // first check status + // tying to add a task to a busy group is an invalid operation + // and should be avoided + if (status_ != PROCESS_GROUP_WAIT) + return false; + + // add task to list of tasks managed by this pgroup + tasks_.push_back(t); + + // send add task signal to pgroup + SignalType signal = RECOMPUTE; + MPI_Send(&signal, 1, MPI_INT, pgroupRootID_, signalTag, theMPISystem()->getGlobalComm()); + + // send task + Task::send( &t, pgroupRootID_, theMPISystem()->getGlobalComm() ); + + // set status + status_ = PROCESS_GROUP_BUSY; + + // start non-blocking MPI_IRecv to receive status + recvStatus(); + + // only return true if task successfully send to pgroup + return true; +} void ProcessGroupManager::recvStatus(){ @@ -153,4 +201,33 @@ void ProcessGroupManager::recvStatus(){ } +bool ProcessGroupManager::recoverCommunicators(){ + assert( status_ == PROCESS_GROUP_WAIT ); + + // send signal to pgroup + SignalType signal = RECOVER_COMM; + MPI_Send(&signal, 1, MPI_INT, pgroupRootID_, signalTag, theMPISystem()->getGlobalComm()); + + return true; +} + +bool ProcessGroupManager::searchSDC( SDCMethodType method ){ + + if (status_ != PROCESS_GROUP_WAIT) + return false; + + // send signal to pgroup + SignalType signal = SEARCH_SDC; + MPI_Send(&signal, 1, MPI_INT, pgroupRootID_, signalTag, theMPISystem()->getGlobalComm()); + + // send method to pgroup + MPI_Send(&method, 1, MPI_INT, pgroupRootID_, infoTag, theMPISystem()->getGlobalComm()); + + status_ = PROCESS_GROUP_BUSY; + + // start non-blocking MPI_IRecv to receive status + recvStatus(); + + return true; +} } /* namespace combigrid */ diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp index d48b7a471..2111bd693 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp @@ -76,6 +76,15 @@ class ProcessGroupManager { bool addTask( Task* ); + bool reinitTask( Task* ); + + bool recompute( Task* ); + + bool recoverCommunicators(); + + bool + searchSDC( SDCMethodType method ); + private: RankType pgroupRootID_; // rank in GlobalComm of the master process of this group diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupSignals.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupSignals.hpp index 4b3cbab4b..8fa1cf18c 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupSignals.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupSignals.hpp @@ -30,6 +30,8 @@ 
const SignalType ADD_TASK = 13; const SignalType RECOMPUTE = 14; const SignalType CHECK_DEAD_PROCS = 15; // check for dead workers const SignalType RECOVER_COMM = 16; +const SignalType SEARCH_SDC = 17; +const SignalType REINIT_TASK = 18; typedef int NormalizationType; const NormalizationType NO_NORMALIZATION = 0; @@ -37,9 +39,9 @@ const NormalizationType L1_NORMALIZATION = 1; const NormalizationType L2_NORMALIZATION = 2; const NormalizationType EV_NORMALIZATION = 3; -typedef int FaultSimulationType; -const FaultSimulationType RANDOM_FAIL = 0; -const FaultSimulationType GROUPS_FAIL = 1; +typedef int SDCMethodType; +const SDCMethodType COMPARE_PAIRS = 0; +const SDCMethodType COMPARE_VALUES = 1; enum TagType { signalTag = 0, statusTag = 1, infoTag = 2 diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupWorker.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupWorker.cpp index 581d685e3..602c3f0c7 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupWorker.cpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessGroupWorker.cpp @@ -16,18 +16,20 @@ #include "sgpp/distributedcombigrid/sparsegrid/DistributedSparseGridUniform.hpp" #include "sgpp/distributedcombigrid/combicom/CombiCom.hpp" #include "sgpp/distributedcombigrid/hierarchization/DistributedHierarchization.hpp" +#include "sgpp/distributedcombigrid/hierarchization/Hierarchization.hpp" #include "sgpp/distributedcombigrid/mpi/MPIUtils.hpp" +#include namespace combigrid { ProcessGroupWorker::ProcessGroupWorker() : - currentTask_( NULL), - status_(PROCESS_GROUP_WAIT), - combinedFG_( NULL), - combinedUniDSG_(NULL), - combinedFGexists_(false), - combiParameters_(), - combiParametersSet_(false) + currentTask_( NULL), + status_(PROCESS_GROUP_WAIT), + combinedFG_( NULL), + combinedUniDSG_(NULL), + combinedFGexists_(false), + combiParameters_(), + combiParametersSet_(false) { } @@ -36,7 +38,7 @@ ProcessGroupWorker::~ProcessGroupWorker() { } SignalType ProcessGroupWorker::wait() { - if (status_ != PROCESS_GROUP_WAIT) + if (status_ == PROCESS_GROUP_BUSY) return RUN_NEXT; SignalType signal = -1; @@ -44,15 +46,15 @@ SignalType ProcessGroupWorker::wait() { MASTER_EXCLUSIVE_SECTION { // receive signal from manager MPI_Recv( &signal, 1, MPI_INT, - theMPISystem()->getManagerRank(), - signalTag, - theMPISystem()->getGlobalComm(), - MPI_STATUS_IGNORE); -} + theMPISystem()->getManagerRank(), + signalTag, + theMPISystem()->getGlobalComm(), + MPI_STATUS_IGNORE); + } // distribute signal to other processes of pgroup MPI_Bcast( &signal, 1, MPI_INT, - theMPISystem()->getMasterRank(), - theMPISystem()->getLocalComm() ); + theMPISystem()->getMasterRank(), + theMPISystem()->getLocalComm() ); // process signal if (signal == RUN_FIRST) { @@ -61,8 +63,8 @@ SignalType ProcessGroupWorker::wait() { // local root receives task MASTER_EXCLUSIVE_SECTION { Task::receive( &t, - theMPISystem()->getManagerRank(), - theMPISystem()->getGlobalComm() ); + theMPISystem()->getManagerRank(), + theMPISystem()->getGlobalComm() ); } // broadcast task to other process of pgroup @@ -162,8 +164,70 @@ SignalType ProcessGroupWorker::wait() { updateCombiParameters(); - } + } else if (signal == RECOMPUTE) { + Task* t; + + // local root receives task + MASTER_EXCLUSIVE_SECTION { + Task::receive(&t, theMPISystem()->getManagerRank(), theMPISystem()->getGlobalComm()); + } + + // broadcast task to other process of pgroup + Task::broadcast(&t, theMPISystem()->getMasterRank(), 
theMPISystem()->getLocalComm()); + + MPI_Barrier(theMPISystem()->getLocalComm()); + // add task to task storage + tasks_.push_back(t); + + status_ = PROCESS_GROUP_BUSY; + + // set currentTask + currentTask_ = tasks_.back(); + + // initalize task + currentTask_->init(theMPISystem()->getLocalComm()); + + currentTask_->setZero(); + + // fill task with combisolution + setCombinedSolutionUniform( currentTask_ ); + + // execute task + currentTask_->run(theMPISystem()->getLocalComm()); + // } else if ( signal == RECOVER_COMM ){ + // theMPISystem()->recoverCommunicators( true ); + // return signal; + } else if (signal == SEARCH_SDC) { + searchSDC(); + } else if (signal == REINIT_TASK) { + std::cout << "reinitializing a single task" << std::endl; + + Task* t; + + // local root receives task + MASTER_EXCLUSIVE_SECTION { + Task::receive(&t, theMPISystem()->getManagerRank(), theMPISystem()->getGlobalComm()); + } + + // broadcast task to other process of pgroup + Task::broadcast(&t, theMPISystem()->getMasterRank(), theMPISystem()->getLocalComm()); + + MPI_Barrier(theMPISystem()->getLocalComm()); + + for (auto tt : tasks_){ + if (tt->getID() == t->getID()){ + currentTask_ = tt; + break; + } + } + + // initalize task and set values to zero + currentTask_->init( theMPISystem()->getLocalComm() ); + currentTask_->setZero(); + setCombinedSolutionUniform( currentTask_ ); + currentTask_->setFinished(true); + } // in the general case: send ready signal. if(!omitReadySignal) @@ -173,26 +237,24 @@ SignalType ProcessGroupWorker::wait() { } void ProcessGroupWorker::ready() { - if( status_ != PROCESS_GROUP_FAIL ){ - // check if there are unfinished tasks - for (size_t i = 0; i < tasks_.size(); ++i) { - if (!tasks_[i]->isFinished()) { - status_ = PROCESS_GROUP_BUSY; - - // set currentTask - currentTask_ = tasks_[i]; - currentTask_->run(theMPISystem()->getLocalComm()); - } + + // check if there are unfinished tasks + for (size_t i = 0; i < tasks_.size(); ++i) { + if (!tasks_[i]->isFinished()) { + status_ = PROCESS_GROUP_BUSY; + + // set currentTask + currentTask_ = tasks_[i]; + currentTask_->run(theMPISystem()->getLocalComm()); } + } - // all tasks finished -> group waiting + // all tasks finished -> group waiting + if( status_ != PROCESS_GROUP_FAIL ) status_ = PROCESS_GROUP_WAIT; - } - // send ready status to manager MASTER_EXCLUSIVE_SECTION{ StatusType status = status_; - MPI_Send(&status, 1, MPI_INT, theMPISystem()->getManagerRank(), statusTag, theMPISystem()->getGlobalComm()); } @@ -236,6 +298,7 @@ void ProcessGroupWorker::combine() { // dehierarchize dfg DistributedHierarchization::dehierarchize(dfg); + } } @@ -255,39 +318,37 @@ void ProcessGroupWorker::combineUniform() { lmax[i] -= 1; // todo: delete old dsg - if (combinedUniDSG_ != NULL) - delete combinedUniDSG_; +// if (combinedUniDSG_ != NULL) +// delete combinedUniDSG_; // erzeug dsg - combinedUniDSG_ = new DistributedSparseGridUniform(dim, lmax, - lmin, boundary, - theMPISystem()->getLocalComm()); +// combinedUniDSG_ = new DistributedSparseGridUniform(dim, lmax, +// lmin, boundary, +// theMPISystem()->getLocalComm()); // todo: move to init function to avoid reregistering // register dsg in all dfgs - for (Task* t : tasks_) { - DistributedFullGrid& dfg = t->getDistributedFullGrid(); - - dfg.registerUniformSG(*combinedUniDSG_); - } +// for (Task* t : tasks_) { +// DistributedFullGrid& dfg = t->getDistributedFullGrid(); +// +// dfg.registerUniformSG(*combinedUniDSG_); +// } for (Task* t : tasks_) { DistributedFullGrid& dfg = t->getDistributedFullGrid(); // 
hierarchize dfg - DistributedHierarchization::hierarchize(dfg); +// DistributedHierarchization::hierarchize(dfg); // lokales reduce auf sg -> dfg.addToUniformSG( *combinedUniDSG_, combiParameters_.getCoeff( t->getID() ) ); } CombiCom::distributedGlobalReduce( *combinedUniDSG_ ); - for (Task* t : tasks_) { // get handle to dfg DistributedFullGrid& dfg = t->getDistributedFullGrid(); - // extract dfg vom dsg dfg.extractFromUniformSG( *combinedUniDSG_ ); @@ -323,8 +384,8 @@ void ProcessGroupWorker::gridEval() { std::vector tmp(dim); MPI_Recv( &tmp[0], bsize, MPI_INT, - theMPISystem()->getManagerRank(), 0, - theMPISystem()->getGlobalComm(), MPI_STATUS_IGNORE); + theMPISystem()->getManagerRank(), 0, + theMPISystem()->getGlobalComm(), MPI_STATUS_IGNORE); leval = LevelVector(tmp.begin(), tmp.end()); } @@ -348,8 +409,8 @@ void ProcessGroupWorker::gridEval() { } t->getFullGrid( fg, - theMPISystem()->getMasterRank(), - theMPISystem()->getLocalComm() ); + theMPISystem()->getMasterRank(), + theMPISystem()->getLocalComm() ); MASTER_EXCLUSIVE_SECTION{ fg_red.add(fg, combiParameters_.getCoeff( t->getID() ) ); @@ -358,8 +419,8 @@ void ProcessGroupWorker::gridEval() { // global reduce of f_red MASTER_EXCLUSIVE_SECTION { CombiCom::FGReduce( fg_red, - theMPISystem()->getManagerRank(), - theMPISystem()->getGlobalComm() ); + theMPISystem()->getManagerRank(), + theMPISystem()->getGlobalComm() ); } } @@ -393,6 +454,8 @@ void ProcessGroupWorker::updateCombiParameters() { combiParametersSet_ = true; + status_ = PROCESS_GROUP_BUSY; + } @@ -402,50 +465,633 @@ void ProcessGroupWorker::setCombinedSolutionUniform( Task* t ) { // get handle to dfg DistributedFullGrid& dfg = t->getDistributedFullGrid(); - // extract dfg vom dsg + // extract dfg from dsg dfg.extractFromUniformSG( *combinedUniDSG_ ); // dehierarchize dfg DistributedHierarchization::dehierarchize( dfg ); } +void ProcessGroupWorker::searchSDC(){ + SDCMethodType method = -1; + MASTER_EXCLUSIVE_SECTION { + // receive signal from manager + MPI_Recv( &method, 1, MPI_INT, + theMPISystem()->getManagerRank(), + infoTag, + theMPISystem()->getGlobalComm(), + MPI_STATUS_IGNORE); + } + // distribute signal to other processes of pgroup + MPI_Bcast( &method, 1, MPI_INT, + theMPISystem()->getMasterRank(), + theMPISystem()->getLocalComm() ); -// void addToUniformSG(DistributedSparseGridUniform& dsg, -// real coeff) { -// // test if dsg has already been registered -// if (&dsg != dsg_) -// registerUniformSG(dsg); -// -// // create iterator for each subspace in dfg -// typedef typename std::vector::iterator SubspaceIterator; -// typename std::vector it_sub( -// subspaceAssigmentList_.size()); -// -// for (size_t subFgId = 0; subFgId < it_sub.size(); ++subFgId) { -// if (subspaceAssigmentList_[subFgId] < 0) -// continue; -// -// IndexType subSgId = subspaceAssigmentList_[subFgId]; -// -// it_sub[subFgId] = dsg.getDataVector(subSgId).begin(); -// } -// -// // loop over all grid points -// for (size_t i = 0; i < fullgridVector_.size(); ++i) { -// // get subspace_fg id -// size_t subFgId(assigmentList_[i]); -// -// if (subspaceAssigmentList_[subFgId] < 0) -// continue; -// -// IndexType subSgId = subspaceAssigmentList_[subFgId]; -// -// assert(it_sub[subFgId] != dsg.getDataVector(subSgId).end()); -// -// // add grid point to subspace, mul with coeff -// *it_sub[subFgId] += coeff * fullgridVector_[i]; -// -// ++it_sub[subFgId]; -// } -// } + std::vector levelsSDC; + compareSolutions( static_cast(combiParameters_.getDim()), levelsSDC, method ); + + int numLocalSDC = 
static_cast(levelsSDC.size()); + int numGlobalSDC; + MPI_Allreduce( &numLocalSDC, &numGlobalSDC, 1, MPI_INT, MPI_SUM, theMPISystem()->getLocalComm()); + if ( numGlobalSDC > 0 ){ + MPI_Status statusSDC; + if( numLocalSDC > 0 ){ + if (!theMPISystem()->isMaster()){ + MPI_Send( &levelsSDC[0], numLocalSDC, MPI_INT, theMPISystem()->getMasterRank(), infoTag, theMPISystem()->getLocalComm() ); + } + } else { + MASTER_EXCLUSIVE_SECTION{ + levelsSDC.resize(tasks_.size()); + MPI_Recv( &levelsSDC[0], static_cast(tasks_.size()), MPI_INT, MPI_ANY_SOURCE, infoTag, theMPISystem()->getLocalComm(), &statusSDC ); + MPI_Get_count( &statusSDC, MPI_INT, &numGlobalSDC ); + levelsSDC.resize(numGlobalSDC); + } + } + MASTER_EXCLUSIVE_SECTION{ + status_ = PROCESS_GROUP_FAIL; + MPI_Send( &levelsSDC[0], numGlobalSDC, MPI_INT, theMPISystem()->getManagerRank(), infoTag, theMPISystem()->getGlobalComm()); + } + } +} + +void ProcessGroupWorker::compareSolutions( int numNearestNeighbors, std::vector &levelsSDC, SDCMethodType method ){ + + // todo: delete old dsg + if (combinedUniDSG_ != NULL) + delete combinedUniDSG_; + + // erzeug dsg + combinedUniDSG_ = new DistributedSparseGridUniform( + combiParameters_.getDim(), combiParameters_.getLMax(), combiParameters_.getLMin(), + combiParameters_.getBoundary(), theMPISystem()->getLocalComm()); + + // We use this DSG to compare component solutions and search for SDC + DistributedSparseGridUniform* SDCUniDSG = combinedUniDSG_; + + // We work in the hierarchical basis, so hierarchization is now done here + for (auto t : tasks_){ + DistributedFullGrid& dfg = t->getDistributedFullGrid(); + DistributedHierarchization::hierarchize(dfg); + dfg.registerUniformSG( *SDCUniDSG ); + } + + if ( tasks_.size() == 1 ) + return; + + // Generate all pairs of grids + std::vector> allPairs; + generatePairs( numNearestNeighbors, allPairs ); + + std::vector allBetas; + std::vector allBetasSum; + std::vector allSubs; + std::vector allJs; + + // Iterate throuh all pairs of grids. 
For each pair, we store the gridpoint with + // the largest difference + for (auto pair : allPairs){ + + DistributedFullGrid& dfg_t = pair[0]->getDistributedFullGrid(); + DistributedFullGrid& dfg_s = pair[1]->getDistributedFullGrid(); + + dfg_t.addToUniformSG( *SDCUniDSG, 1.0 ); + dfg_s.addToUniformSG( *SDCUniDSG, -1.0 ); + + CombiDataType localBetaMax(0.0); + + LevelVector subMax; + size_t jMax = 0; + + for (size_t i = 0; i < SDCUniDSG->getNumSubspaces(); ++i){ + auto subData = SDCUniDSG->getData(i); + auto subSize = SDCUniDSG->getDataSize(i); + for (size_t j = 0; j < subSize; ++j){ + if (std::abs(subData[j]) >= std::abs(localBetaMax)){ + localBetaMax = subData[j]; + subMax = SDCUniDSG->getLevelVector(i); + jMax = j; + } + } + } + + allBetas.push_back( localBetaMax ); + allSubs.push_back( subMax ); + allJs.push_back( jMax ); + + // Reset sparse grid to zero before continuing to next pair + for (size_t i = 0; i < SDCUniDSG->getNumSubspaces(); ++i){ + auto subData = SDCUniDSG->getData(i); + auto subSize = SDCUniDSG->getDataSize(i); + for (size_t j = 0; j < subSize; ++j) + subData[j] = 0.0; + } + } + + // We gather all beta values across processes in a group and store only the largest per pair + std::vector allBetasReduced; + allBetasReduced.resize(allBetas.size()); + + CombiCom::BetasReduce( allBetas, allBetasReduced, theMPISystem()->getLocalComm() ); + + // Largest beta in a given group -> potential candidate for SDC + auto globalBetaMax = std::max_element(allBetasReduced.begin(), allBetasReduced.end(), + [](CombiDataType a, CombiDataType b){ return std::abs(a) < std::abs(b); } ); + + auto b = std::find( allBetas.begin(), allBetas.end(), *globalBetaMax ); + + betas_.clear(); + subspaceValues_.clear(); + + // The following is only done by the process that contains the suspicious value + if(b != allBetas.end()) { + + size_t indMax = std::distance(allBetas.begin(), b); + + LevelVector subMax = allSubs[indMax]; + std::cout<< "Subspace with potential SDC = "<getLevelVector(); + + DistributedFullGrid& dfg = t->getDistributedFullGrid(); + + dfg.addToUniformSG( *SDCUniDSG, 1.0 ); + + auto subData = SDCUniDSG->getData(subMax); + CombiDataType localValMax = subData[jMax]; + + if ( subMax <= level ) + subspaceValues_[level] = localValMax; + + // Reset sparse grid to zero + for (size_t i = 0; i < SDCUniDSG->getNumSubspaces(); ++i){ + auto subData = SDCUniDSG->getData(i); + auto subSize = SDCUniDSG->getDataSize(i); + for (size_t j = 0; j < subSize; ++j) + subData[j] = 0.0; + } + } + if( subspaceValues_.size() < 5 ){ + std::cout<<"To few measurements. Changing strategy to search pairs."<& dfg_t = pair[0]->getDistributedFullGrid(); + DistributedFullGrid& dfg_s = pair[1]->getDistributedFullGrid(); + + LevelVector t_level = pair[0]->getLevelVector(); + LevelVector s_level = pair[1]->getLevelVector(); + + dfg_t.addToUniformSG( *SDCUniDSG, 1.0 ); + dfg_s.addToUniformSG( *SDCUniDSG, -1.0 ); + + auto subData = SDCUniDSG->getData(subMax); + CombiDataType localBetaMax = subData[jMax]; + betas_[std::make_pair(t_level, s_level)] = localBetaMax; + + // Reset sparse grid to zero + for (size_t i = 0; i < SDCUniDSG->getNumSubspaces(); ++i){ + auto subData = SDCUniDSG->getData(i); + auto subSize = SDCUniDSG->getDataSize(i); + for (size_t j = 0; j < subSize; ++j) + subData[j] = 0.0; + } + } + robustRegressionPairs( levelsSDC ); + } + } + // todo: Barrier needed? 
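// In compareSolutions() above, each pair of component grids (t, s) is compared
// by accumulating their hierarchical coefficients into the common sparse grid
// with weights +1 and -1 and keeping the entry of largest magnitude ("beta").
// A condensed, self-contained sketch of that per-pair reduction over plain
// vectors (illustrative only; the real code walks the DSG subspace by subspace):
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// u_t and u_s stand in for the hierarchical coefficients of two component
// solutions on the shared sparse grid; the returned beta is the signed entry
// of u_t - u_s with the largest magnitude, together with its position. A
// conspicuously large |beta| marks the pair as a candidate for silent data
// corruption in one of the two grids.
std::pair<double, std::size_t> maxAbsDifference(const std::vector<double>& u_t,
                                                const std::vector<double>& u_s) {
  double beta = 0.0;
  std::size_t jMax = 0;
  for (std::size_t j = 0; j < u_t.size() && j < u_s.size(); ++j) {
    double d = u_t[j] - u_s[j];           // +1 / -1 accumulation
    if (std::abs(d) >= std::abs(beta)) {  // keep the largest magnitude, signed
      beta = d;
      jMax = j;
    }
  }
  return {beta, jMax};
}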
+ MPI_Barrier(theMPISystem()->getLocalComm()); +} + +void ProcessGroupWorker::computeStandardizedResiduals( gsl_multifit_robust_workspace* regressionWsp, gsl_vector* r_stud, gsl_vector* r_lms ){ + + size_t p = regressionWsp->p; + size_t n = regressionWsp->n; + gsl_vector *r = gsl_vector_alloc( n ); + gsl_vector *r2 = gsl_vector_alloc( n ); + gsl_vector *r2_sorted = gsl_vector_alloc( n ); + gsl_vector *r_stand = gsl_vector_alloc( n ); + gsl_vector *weights = gsl_vector_alloc( n ); + gsl_vector *ones = gsl_vector_alloc( n ); + + gsl_vector_memcpy(r, r_stud); + gsl_vector_memcpy(r_stand, r_stud); + gsl_vector_memcpy(r_lms, r_stud); + + for (size_t i = 0; i < r->size; ++i) + gsl_vector_set(r2, i, std::pow(gsl_vector_get(r, i),2)); + + gsl_vector_memcpy(r2_sorted, r2); + gsl_sort_vector(r2_sorted); + + // Median of residuals squared + double median_r2 = gsl_stats_median_from_sorted_data(r2_sorted->data, r2_sorted->stride, r2_sorted->size); + + // Preliminary scale estimate + double s0 = 1.4826*( 1 + 5.0/(n-p-1))*(std::sqrt(median_r2)); + + // Standardized residuals + gsl_vector_scale(r_stand, 1.0/s0); + + for (size_t i = 0; i < r_stand->size; ++i) + gsl_vector_set(r_stand, i, fabs(gsl_vector_get(r_stand, i))); + + // Threshold for outliers + double eps = 2.5; + + // Weights for each residual: here we assume that the can + // be multiple residuals + for(size_t i = 0; i < r_stand->size; ++i){ + if(std::abs(r_stand->data[i]) <= eps) + gsl_vector_set(weights, i, 1); + else + gsl_vector_set(weights, i, 0); + } + + // Robust scale estimate + double prod; + gsl_blas_ddot( weights, r2, &prod ); + + double sum; + gsl_vector_set_all( ones, 1 ); + gsl_blas_ddot( weights, ones, &sum ); + double s_star = std::sqrt(prod/(sum-p)); + + gsl_vector_scale( r_lms, 1.0/s_star ); +} + +void ProcessGroupWorker::generatePairs( int numNearestNeighbors, std::vector> &allPairs ){ + + std::vector levels; + std::map numPairs; + + for ( auto tt: tasks_ ){ + levels.push_back(tt->getLevelVector()); + numPairs[tt->getLevelVector()] = 0; + } + + for (Task* s : tasks_ ){ + + std::sort(levels.begin(), levels.end(), [s](LevelVector const& a, LevelVector const& b) { + return l1(a - s->getLevelVector()) < l1(b - s->getLevelVector()); + }); + + int k = 0; + + for( size_t t_i = 1; t_i < levels.size(); ++t_i ){ + std::vector currentPair; + + Task* t = *std::find_if(tasks_.begin(), tasks_.end(), + [levels,t_i](Task* const &tt) -> bool { return tt->getLevelVector() == levels[t_i]; }); + + currentPair.push_back(t); + currentPair.push_back(s); + + if(std::find(allPairs.begin(), allPairs.end(), currentPair) == allPairs.end() + ){ + allPairs.push_back({currentPair[1],currentPair[0]}); + numPairs[s->getLevelVector()]++; + numPairs[t->getLevelVector()]++; + k++; + } + + if (k == numNearestNeighbors) + break; + } + } + // Check if any grid was left out with fewer neighbors than it should + for (Task* s : tasks_ ){ + + int k = numPairs[s->getLevelVector()]; + if ( k < numNearestNeighbors ){ + + std::sort(levels.begin(), levels.end(), [s](LevelVector const& a, LevelVector const& b) { + return l1(a - s->getLevelVector()) < l1(b - s->getLevelVector()); + }); + + for( size_t t_i = 1; t_i < levels.size(); ++t_i ){ + std::vector currentPair; + std::vector currentPairBack; + + Task* t = *std::find_if(tasks_.begin(), tasks_.end(), + [levels,t_i](Task* const &tt) -> bool { return tt->getLevelVector() == levels[t_i]; }); + + currentPair.push_back(s); + currentPair.push_back(t); + currentPairBack.push_back(t); + currentPairBack.push_back(s); + + 
if(std::find(allPairs.begin(), allPairs.end(), currentPair) == allPairs.end() && + std::find(allPairs.begin(), allPairs.end(), currentPairBack) == allPairs.end()){ + allPairs.push_back({currentPair[1],currentPair[0]}); + numPairs[s->getLevelVector()]++; + numPairs[t->getLevelVector()]++; + k++; + } + + if (k == numNearestNeighbors) + break; + } + } + } +} + +void ProcessGroupWorker::robustRegressionPairs( std::vector &levelsSDC ){ + + auto dim = combiParameters_.getDim(); + auto lmin = combiParameters_.getLMin(); + auto lmax = combiParameters_.getLMax(); + + // Determine which indices appear in the set of multi-indices + std::set indexSet; + for ( auto t : tasks_ ){ + LevelVector key = t->getLevelVector(); + for( IndexType k_i : key ) + indexSet.insert( k_i ); + } + + std::map indexMap; + IndexType row = 0; + for ( auto ii : indexSet ){ + indexMap[ii] = row; + row++; + } + + // Number of measurements (beta values) + size_t n = betas_.size(); + + // Number of unknowns (values of functions D_i) + size_t numIndices = indexSet.size(); + size_t p = dim*numIndices; + + // Exponent in error splitting + double ex = 2; + + if ( n < p ){ + std::cout<<"Too few measurements: SDC detection skipped."<(key_t[di])); + idx_Di = di*numIndices + indexMap[key_t[di]]; + gsl_matrix_set( X, row, idx_Di, pow(ht_i,ex) ); + + CombiDataType hs_i = 1.0/pow(2.0,static_cast(key_s[di])); + idx_Di = di*numIndices + indexMap[key_s[di]]; + val = gsl_matrix_get( X, row, idx_Di ); + gsl_matrix_set( X, row, idx_Di, val - pow(hs_i,ex) ); + } + + gsl_vector_set( y, row, beta ); + + row++; + } + + gsl_set_error_handler_off(); + gsl_multifit_robust( X, y, c, cov, regressionWsp ); + + gsl_multifit_robust_residuals(X, y, c, r_stud, regressionWsp); + + // Before checking the residuals, check the data for extremely large values + double eps = 1e50; + detectOutliers( y->data, levelsSDC, eps, COMPARE_PAIRS); + + // Now we can check for large residuals + if(levelsSDC.size() == 0){ + eps = 2.5; + detectOutliers( r_stud->data, levelsSDC, eps, COMPARE_PAIRS ); + } + + // Print betas and resiguals + row = 0; + for ( auto const &entry : betas_ ){ + LevelVector key_t = entry.first.first; + LevelVector key_s = entry.first.second; + CombiDataType beta = entry.second; + CombiDataType res = r_stud->data[row]; + std::cout< &levelsSDC ){ + + auto lmin = combiParameters_.getLMin(); + auto lmax = combiParameters_.getLMax(); + + // Number of measurements (beta values) + size_t n = subspaceValues_.size(); + + // Number of unknowns + size_t p = 1; + + gsl_multifit_robust_workspace *regressionWsp = gsl_multifit_robust_alloc(gsl_multifit_robust_cauchy, n , p ); + + gsl_matrix *X = gsl_matrix_alloc( n, p ); + gsl_vector *y = gsl_vector_alloc( n ); + gsl_vector *c = gsl_vector_alloc( p ); + gsl_vector *r_lms = gsl_vector_alloc( n ); + gsl_vector *r_stud = gsl_vector_alloc( n ); + gsl_matrix *cov = gsl_matrix_alloc( p, p ); + + // Initialize matrix with ones + gsl_matrix_set_all( X, 1.0 ); + + int row = 0; + for( auto const &entry : subspaceValues_){ + CombiDataType maxVal = entry.second; + gsl_vector_set( y, row, maxVal ); + row++; + } + + // Before checking the residuals, check the data for extremely large values + double eps = 1e50; + detectOutliers( y->data, levelsSDC, eps, COMPARE_VALUES ); + + if ( n < 5 ){ + std::cout<<"Too few measurements: Robust regression skipped."<r, r_lms ); + + // Now we can check for large residuals + if(levelsSDC.size() == 0){ + eps = 2.5; + detectOutliers( r_lms->data, levelsSDC, eps, COMPARE_VALUES, y->data[0] ); + } + + // 
Print data and residuals + row = 0; + for ( auto const &entry : subspaceValues_){ + LevelVector key = entry.first; + CombiDataType maxVal = entry.second; + CombiDataType res = r_lms->data[row]; + std::cout< &levelsSDC, double eps, SDCMethodType method, double y ){ + + std::map mapSDC; + for ( auto t : tasks_ ){ + LevelVector key = t->getLevelVector(); + mapSDC[key] = 0; + } + + if ( method == COMPARE_PAIRS ) { + size_t numSDCPairs = 0; + size_t row = 0; + for( auto const &entry : betas_ ){ + LevelVector key_t = entry.first.first; + LevelVector key_s = entry.first.second; + CombiDataType beta = entry.second; + if ( std::abs(data[row]) > eps && beta != 0 ){ + mapSDC[key_t]++; + mapSDC[key_s]++; + numSDCPairs++; + } + row++; + } + std::cout<< "SDC grid: " << std::endl; + for (auto s : mapSDC){ + if (s.second >= 2 || (s.second == 1 && numSDCPairs == 1)){ + std::cout< eps ){ + mapSDC[key]++; + } + row++; + } + std::cout<< "SDC grid: " << std::endl; + for (auto s : mapSDC){ + if (s.second == 1 ){ + std::cout<& faultsID, double u_robust ) { + + std::map IDsToLevels = combiParameters_.getLevelsDict(); + int dim = static_cast(combiParameters_.getDim()); + + const std::string prob_name = "interpolation based optimization"; + + LevelVectorList lvlminmax; + lvlminmax.push_back(combiParameters_.getLMin()); + lvlminmax.push_back(combiParameters_.getLMax()); + + CombigridDict given_dict = combiParameters_.getCombiDict(); + + std::vector faultsIDCopy = faultsID; + + // Iterate through tasks currently marked as outliers + for (auto id : faultsIDCopy){ + + LevelVectorList faultLevelVectors; + faultLevelVectors.push_back(IDsToLevels[id]); + + // Find alternative combination coefficients to exclude suspect value + LP_OPT_INTERP opt_interp(lvlminmax,dim,GLP_MAX,given_dict,faultLevelVectors); + opt_interp.init_opti_prob(prob_name); + opt_interp.set_constr_matrix(); + opt_interp.solve_opti_problem(); + + LevelVectorList recomputeLevelVectors; + CombigridDict new_dict = opt_interp.get_results(recomputeLevelVectors); + + CombiDataType u_c = 0.0; + CombiDataType ut_c = 0.0; + CombiDataType rel_err_max = 0.10; + + // Combine values with and without the suspect + for( auto const &entry : given_dict){ + LevelVector lvl = entry.first; + CombiDataType u_i; + if ( subspaceValues_.find(lvl) == subspaceValues_.end() ) + u_i = u_robust; + else + u_i = subspaceValues_[lvl]; + u_c += given_dict[lvl]*u_i; + ut_c += new_dict[lvl]*u_i; + } + + // Calculate relative error + CombiDataType rel_err = std::abs( u_c - ut_c )/u_robust; + + // If relative error is small, value was a false positive (not an outlier) + // -> Remove ID from list of SDC IDs + if ( rel_err < rel_err_max ){ + faultsID.erase(std::remove(faultsID.begin(), faultsID.end(), id), faultsID.end()); + std::cout<<"Removing id = "< +#include +#include +#include +#include namespace combigrid { @@ -45,7 +53,48 @@ class ProcessGroupWorker { void updateCombiParameters(); + void searchSDC(); + + /* Computes the difference between all pairs of combination solutions + * (or only between neighboring ones if onlyNearestNeighbors = true) + * according to the paper on SDC detection. If the difference is large, + * a soft fault might have occurred. */ + void compareSolutions( int numNearestNeighbors, std::vector &levelsSDC, SDCMethodType method ); + + /* Obtain standardized residuals from robust residuals. If a standardized residual is larger than 2.5, + * it is considered an outlier. 
+ * */ + void computeStandardizedResiduals( gsl_multifit_robust_workspace* regressionWsp, gsl_vector* r_stud, gsl_vector* r_lms ); + + /* Generates a list of pairs of tasks, so that for each task + * that a worker has, we find its K nearest neighbors. The distance + * between two tasks is the l1 norm of the difference of their level vectors: + * distance(task_s, task_t) = |s - t|_1 + * */ + void generatePairs( int numNearestNeighbors, std::vector> &allPairs); + + /* Robust fit of beta values using an error expansion model + * */ + void robustRegressionPairs( std::vector &levelsSDC ); + + /* Robust fit of function values using a constant model + * */ + void robustRegressionValues( std::vector &levelsSDC ); + + /* Determine from standardized residuals if a solution has been affected by SDC + * */ + void detectOutliers( double* residuals, std::vector &levelsSDC, double eps, SDCMethodType method, double y = 0.0 ); + + /* Check if a solution marked as outlier is actually only a false positive. We combine the values + * with the outlier (using the classical combination coefficients) and without it (adjusting the + * combination coefficients according to the FTCT) and compare the two combined values. If they're very + * similar, they suspect values is most likely a false positive, or the error is too small as to be + * safely ignored. + * */ + void removeFalsePositives( std::vector& faultsID, double u_robust ); + private: + TaskContainer tasks_; // task storage Task* currentTask_; @@ -62,6 +111,12 @@ class ProcessGroupWorker { bool combiParametersSet_; + MPI_File betasFile_; + + std::map , CombiDataType> betas_; + + std::map subspaceValues_; + void setCombinedSolutionUniform( Task* t ); }; diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.cpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.cpp index 30a7ebc90..2e784cec1 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.cpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.cpp @@ -86,30 +86,209 @@ void ProcessManager::exit() { void ProcessManager::updateCombiParameters() { { bool fail = waitAllFinished(); - - assert( !fail && "should not fail here" ); + // Commented out since not relevant when SDC occurs +// assert( !fail && "should not fail here" ); } for( auto g : pgroups_ ) g->updateCombiParameters(params_); - { bool fail = waitAllFinished(); + // Commented out since not relevant when SDC occurs +// assert( !fail && "should not fail here" ); + } +} - assert( !fail && "should not fail here" ); +/* + * Compute the group faults that occurred at this combination step using the fault simulator + */ +void +ProcessManager::getGroupFaultIDs( std::vector& faultsID ) { + for( auto p : pgroups_ ){ + StatusType status = p->waitStatus(); + + if( status == PROCESS_GROUP_FAIL ){ + TaskContainer failedTasks = p->getTaskContainer(); + for( auto task : failedTasks ) + faultsID.push_back(task->getID()); + } } } +void +ProcessManager::getSDCFaultIDs( std::vector& faultsID ) { + for( auto p : pgroups_ ){ + StatusType status = p->waitStatus(); + if( status == PROCESS_GROUP_FAIL ){ + size_t numTasks = p->getTaskContainer().size(); + faultsID.resize( numTasks ); + MPI_Status statusSDC; + MPI_Recv( &faultsID[0], numTasks , MPI_INT, MPI_ANY_SOURCE, infoTag, theMPISystem()->getGlobalComm(), &statusSDC ); + int numTasksSDC; + MPI_Get_count( &statusSDC, MPI_INT, &numTasksSDC ); + faultsID.resize( numTasksSDC ); + } + } +} + +void 
ProcessManager::redistribute( std::vector& taskID ) { + for (size_t i = 0; i < taskID.size(); ++i) { + // find id in list of tasks + Task* t = NULL; + + for ( Task* tmp : tasks_ ) { + if ( tmp->getID() == taskID[i] ) { + t = tmp; + break; + } + } + + assert( t != NULL ); + + // wait for available process group + ProcessGroupManagerID g = wait(); + + // assign instance to group + g->addTask( t ); + } + + size_t numWaiting = 0; + + while (numWaiting != pgroups_.size()) { + numWaiting = 0; + + for (size_t i = 0; i < pgroups_.size(); ++i) { + if (pgroups_[i]->getStatus() == PROCESS_GROUP_WAIT) + ++numWaiting; + } + + } + + std::cout << "Redistribute finished" << std::endl; +} + +void ProcessManager::reinit( std::vector& taskID ) { + + for (size_t i = 0; i < taskID.size(); ++i) { + // find id in list of tasks + Task* t = NULL; + + for ( Task* tmp : tasks_ ) { + if ( tmp->getID() == taskID[i] ) { + t = tmp; + break; + } + } + assert( t != NULL ); + // send signal to group where task failed + for ( auto p : pgroups_ ){ + TaskContainer groupTasks = p->getTaskContainer(); + if (std::find(groupTasks.begin(), groupTasks.end(), t) != groupTasks.end()){ + p->reinitTask(t); + break; + } + } + } + + size_t numWaiting = 0; + + while (numWaiting != pgroups_.size()) { + numWaiting = 0; + + for (size_t i = 0; i < pgroups_.size(); ++i) { + if (pgroups_[i]->getStatus() == PROCESS_GROUP_WAIT) + ++numWaiting; + } + + } + + std::cout << "Reinitialize finished" << std::endl; +} +void ProcessManager::recompute( std::vector& taskID ) { + for (size_t i = 0; i < taskID.size(); ++i) { + // find id in list of tasks + Task* t = NULL; + + for ( Task* tmp : tasks_ ) { + if ( tmp->getID() == taskID[i] ) { + t = tmp; + break; + } + } + + assert( t != NULL ); + // wait for available process group + ProcessGroupManagerID g = wait(); + + // assign instance to group + g->recompute( t ); + } + + size_t numWaiting = 0; + while (numWaiting != pgroups_.size()) { + numWaiting = 0; + + for (size_t i = 0; i < pgroups_.size(); ++i) { + if (pgroups_[i]->getStatus() == PROCESS_GROUP_WAIT) + ++numWaiting; + } + + } + + std::cout << "Recompute finished" << std::endl; +} + + +void ProcessManager::recoverCommunicators(){ + waitAllFinished(); + + // send recover communicators signal to alive groups + for( ProcessGroupManagerID g : pgroups_ ){ + if( g->getStatus() == PROCESS_GROUP_WAIT ){ + g->recoverCommunicators(); + } + } + +// theMPISystem()->recoverCommunicators( true ); + + // remove failed groups from group list and set new + // todo: this is rather error prone. 
this relies on the previous functions + // to have removed all processes of failed groups and that the order of + // processes has not changed + pgroups_.erase( + std::remove_if( pgroups_.begin(), + pgroups_.end(), + [] (const ProcessGroupManagerID& p) { + return (p->getStatus() == PROCESS_GROUP_FAIL); } ), + pgroups_.end() ); + + for( size_t i=0; isetMasterRank( int(i) ); + } +} + +void ProcessManager::restoreCombischeme() { + + LevelVector lmin = params_.getLMin(); + LevelVector lmax = params_.getLMax(); + CombiMinMaxScheme combischeme(params_.getDim(), lmin, lmax); + combischeme.createAdaptiveCombischeme(); + combischeme.makeFaultTolerant(); + std::vector levels = combischeme.getCombiSpaces(); + std::vector coeffs = combischeme.getCoeffs(); + + for (size_t i = 0; i < levels.size(); ++i){ + params_.setCoeff( params_.getID(levels[i]), coeffs[i] ); + } +} bool ProcessManager::waitAllFinished(){ bool group_failed = false; - int i = 0; for( auto p : pgroups_ ){ StatusType status = p->waitStatus(); if( status == PROCESS_GROUP_FAIL ){ group_failed = true; } - ++i; } return group_failed; diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.hpp index f3c346abd..a867b6395 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/manager/ProcessManager.hpp @@ -15,6 +15,7 @@ #include "sgpp/distributedcombigrid/sparsegrid/SGrid.hpp" #include "sgpp/distributedcombigrid/task/Task.hpp" #include "sgpp/distributedcombigrid/manager/ProcessGroupManager.hpp" +#include "sgpp/distributedcombigrid/fault_tolerance/LPOptimizationInterpolation.hpp" #include "sgpp/distributedcombigrid/combischeme/CombiMinMaxScheme.hpp" namespace combigrid { @@ -49,6 +50,9 @@ class ProcessManager { bool runnext(); + inline bool + searchSDC( SDCMethodType method ); + inline void combine(); @@ -60,13 +64,46 @@ class ProcessManager { inline void gridEval(FullGrid& fg); + /* Generates no_faults random faults from the combischeme */ + inline void + createRandomFaults( std::vector& faultIds, int no_faults ); + + inline void + recomputeOptimumCoefficients( std::string prob_name, + std::vector& faultsID, + std::vector& redistributefaultsID, + std::vector& recomputeFaultsID); + inline Task* getTask( int taskID ); void updateCombiParameters(); + + /* Computes group faults in current combi scheme step */ + void + getGroupFaultIDs( std::vector& faultsID ); + + /* Computes group faults in current combi scheme step */ + void + getSDCFaultIDs( std::vector& faultsID ); + inline CombiParameters& getCombiParameters(); + void redistribute( std::vector& taskID ); + + void reinit( std::vector& taskID ); + + void recompute( std::vector& taskID ); + + void recoverCommunicators(); + + /* After faults have been fixed, we need to return the combischeme + * to the original combination technique*/ + void restoreCombischeme(); + + + private: ProcessGroupManagerContainer& pgroups_; @@ -214,6 +251,72 @@ CombiParameters& ProcessManager::getCombiParameters() { return params_; } +/* + * Recompute coefficients for the combination technique based on given grid faults using + * an optimization scheme + */ +inline void +ProcessManager::recomputeOptimumCoefficients(std::string prob_name, + std::vector& faultsID, + std::vector& redistributeFaultsID, + std::vector& recomputeFaultsID) { + + CombigridDict given_dict = params_.getCombiDict(); + + std::map IDsToLevels = 
params_.getLevelsDict(); + LevelVectorList faultLevelVectors; + for (auto id : faultsID){ + faultLevelVectors.push_back(IDsToLevels[id]); + } + + LevelVectorList lvlminmax; + lvlminmax.push_back(params_.getLMin()); + lvlminmax.push_back(params_.getLMax()); + LP_OPT_INTERP opt_interp(lvlminmax,static_cast(params_.getDim()),GLP_MAX,given_dict,faultLevelVectors); + if ( opt_interp.getNumFaults() != 0 ) { + + opt_interp.init_opti_prob(prob_name); + opt_interp.set_constr_matrix(); + opt_interp.solve_opti_problem(); + + LevelVectorList recomputeLevelVectors; + CombigridDict new_dict = opt_interp.get_results(recomputeLevelVectors); + + LevelVectorList newLevels; + std::vector newCoeffs; + std::vector newTaskIDs; + + int numLevels = int(params_.getNumLevels()); + for( int i = 0; i < numLevels; ++i ){ + LevelVector lvl = params_.getLevel(i); + + newLevels.push_back(lvl); + newCoeffs.push_back(new_dict[lvl]); + newTaskIDs.push_back(i); + } + + params_.setLevelsCoeffs(newTaskIDs, newLevels, newCoeffs); + + std::map LevelsToIDs = params_.getLevelsToIDs(); + for(auto l : recomputeLevelVectors){ + recomputeFaultsID.push_back(LevelsToIDs[l]); + } + + std::sort(faultsID.begin(), faultsID.end()); + std::sort(recomputeFaultsID.begin(), recomputeFaultsID.end()); + + std::set_difference(faultsID.begin(), faultsID.end(), + recomputeFaultsID.begin(), recomputeFaultsID.end(), + std::inserter(redistributeFaultsID, redistributeFaultsID.begin())); + }else if( opt_interp.getNumFaultsRecompute() != 0 ){ + + LevelVectorList recomputeLevelVectors = opt_interp.getFaultsRecompute(); + std::map LevelsToIDs = params_.getLevelsToIDs(); + for(auto l : recomputeLevelVectors){ + recomputeFaultsID.push_back(LevelsToIDs[l]); + } + } +} inline Task* ProcessManager::getTask( int taskID ){ @@ -225,5 +328,32 @@ ProcessManager::getTask( int taskID ){ } return nullptr; } + +/* + */ +bool ProcessManager::searchSDC( SDCMethodType method ) { + // wait until all process groups are in wait state + // after sending the exit signal checking the status might not be possible + size_t numWaiting = 0; + + while (numWaiting != pgroups_.size()) { + numWaiting = 0; + + for (size_t i = 0; i < pgroups_.size(); ++i) { + if (pgroups_[i]->getStatus() == PROCESS_GROUP_WAIT) + ++numWaiting; + } + } + + // send signal to each group + for (size_t i = 0; i < pgroups_.size(); ++i) { + pgroups_[i]->searchSDC( method ); + } + bool group_failed = waitAllFinished(); + // return true if no group failed + return !group_failed; +} + + } /* namespace combigrid */ #endif /* PROCESSMANAGER_HPP_ */ diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPISystem.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPISystem.hpp index 8021902f7..620e26f23 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPISystem.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPISystem.hpp @@ -99,8 +99,6 @@ class MPISystem { inline size_t getNumProcs() const; - void recoverCommunicators( bool groupAlive ); - private: explicit MPISystem(); diff --git a/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPIUtils.hpp b/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPIUtils.hpp index 1c0c65418..c0392dd44 100644 --- a/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPIUtils.hpp +++ b/distributedcombigrid/src/sgpp/distributedcombigrid/mpi/MPIUtils.hpp @@ -109,6 +109,14 @@ class MPIUtils { } } + static void MAX_ABS(double *a, double *b, int *len, MPI_Datatype *dtype) { + for( int i = 0; i < *len; ++i ){ + if 
(std::abs(*a) > std::abs(*b)) + *b = *a; + a++; b++; + } + } + }; } diff --git a/distributedcombigrid/tests/test_hello.cpp b/distributedcombigrid/tests/test_hello.cpp index 7096e3c83..b715756c0 100644 --- a/distributedcombigrid/tests/test_hello.cpp +++ b/distributedcombigrid/tests/test_hello.cpp @@ -9,20 +9,20 @@ static bool CHECK_NPROCS( int nprocs ){ return size == nprocs; } - -BOOST_AUTO_TEST_CASE( bla4 ){ - if( CHECK_NPROCS(4) ){ - std::cout << "I'm with 4" << std::endl; - - BOOST_CHECK( true ); - } -} - - -BOOST_AUTO_TEST_CASE( bla8 ){ - if( CHECK_NPROCS(8) ){ - std::cout << "I'm with 8" << std::endl; - - BOOST_CHECK( true ); - } -} +// +//BOOST_AUTO_TEST_CASE( bla4 ){ +// if( CHECK_NPROCS(4) ){ +// std::cout << "I'm with 4" << std::endl; +// +// BOOST_CHECK( true ); +// } +//} +// +// +//BOOST_AUTO_TEST_CASE( bla8 ){ +// if( CHECK_NPROCS(8) ){ +// std::cout << "I'm with 8" << std::endl; +// +// BOOST_CHECK( true ); +// } +//} diff --git a/gsl-2.1.tar.gz b/gsl-2.1.tar.gz new file mode 100644 index 000000000..b4115872e Binary files /dev/null and b/gsl-2.1.tar.gz differ diff --git a/site_scons/SGppConfigure.py b/site_scons/SGppConfigure.py index 94f9eb6c1..ce9a7b1db 100644 --- a/site_scons/SGppConfigure.py +++ b/site_scons/SGppConfigure.py @@ -7,6 +7,7 @@ import os import subprocess import re +import sys import Helper @@ -18,6 +19,18 @@ def doConfigure(env, moduleFolders, languageWrapperFolders): "CheckJNI" : Helper.CheckJNI, "CheckFlag" : Helper.CheckFlag}) + # boost library + #TODO: add check and error + config.env.AppendUnique(CPPPATH=[config.env['BOOST_INCLUDE_PATH']]) + config.env.AppendUnique(LIBPATH=[config.env['BOOST_LIBRARY_PATH']]) + + # glpk library + config.env.AppendUnique(CPPPATH=[config.env['GLPK_INCLUDE_PATH']]) + config.env.AppendUnique(LIBPATH=[config.env['GLPK_LIBRARY_PATH']]) + + # gsl library + config.env.AppendUnique(CPPPATH=[config.env['GSL_INCLUDE_PATH']]) + config.env.AppendUnique(LIBPATH=[config.env['GSL_LIBRARY_PATH']]) # now set up all further environment settings that should never fail # compiler setup should be always after checking headers and flags, # as they can make the checks invalid, e.g., by setting "-Werror" @@ -29,6 +42,9 @@ def doConfigure(env, moduleFolders, languageWrapperFolders): else: env.Append(CPPFLAGS=["-g", "-O0"]) + if env['USE_STATICLIB']: + env.Append(CPPFLAGS=['-D_USE_STATICLIB']) + # make settings case-insensitive env["COMPILER"] = env["COMPILER"].lower() env["ARCH"] = env["ARCH"].lower() @@ -108,6 +124,30 @@ def doConfigure(env, moduleFolders, languageWrapperFolders): config.env['NVCCFLAGS'] = "-ccbin " + config.env["CXX"] + " -std=c++11 -Xcompiler -fpic,-Wall "# + flagsToForward # config.env.AppendUnique(LIBPATH=['/usr/local.nfs/sw/cuda/cuda-7.5/']) + if not config.CheckCXXHeader('glpk.h'): + sys.stderr.write("Warning: glpk library cannot be found.\n") + + install_res = False + while True: + sys.stdout.write("Would you like to install it? [Y/n] ") + sys.stdout.flush() + resp = sys.stdin.readline() + if len(resp) > 2: + continue # ask again + else: + if resp[0].lower() == 'n': + sys.stderr.write("\nglpk library installation skipped\n\n") + break + elif resp[0].lower() == 'y' or len(resp) == 1: + install_res = install_glpk() + if install_res == True: + sys.stderr.write("\nglpk library installed successfully!\n\n") + else: + sys.stderr.write("\nglpk library installation failed! 
Manual installation might be required or dependencies failed.\n\n") + break + else: + continue # ask again + sys.stdout.flush() env = config.Finish() print "Configuration done."
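Taken together, the manager-side additions in this patch (searchSDC, getSDCFaultIDs, recomputeOptimumCoefficients, reinit/recompute, restoreCombischeme) suggest a detect-and-recover cycle around each combination step. The sketch below is a hypothetical driver and not part of the patch: it assumes an already configured ProcessManager instance named manager, that the signal constants live in namespace combigrid, and that "sdc_recovery" is just an illustrative problem name; the call order is an interpretation of the new interface, not a guarantee.

#include <vector>
#include "sgpp/distributedcombigrid/manager/ProcessManager.hpp"

// Hypothetical recovery cycle around one combination step.
void combineWithSDCCheck(combigrid::ProcessManager& manager) {
  // ask every process group to compare its component solutions for SDC
  bool allClean = manager.searchSDC(combigrid::COMPARE_PAIRS);

  if (!allClean) {
    // IDs of the tasks flagged as affected by silent data corruption
    std::vector<int> faultsID;
    manager.getSDCFaultIDs(faultsID);

    // adapt the combination coefficients so the suspect grids drop out (FTCT)
    // and split the flagged tasks into "recompute" and "redistribute" sets
    std::vector<int> redistributeID, recomputeID;
    manager.recomputeOptimumCoefficients("sdc_recovery", faultsID,
                                         redistributeID, recomputeID);
    manager.updateCombiParameters();

    // bring the affected component grids back to a healthy state
    if (!recomputeID.empty()) manager.recompute(recomputeID);
    if (!redistributeID.empty()) manager.reinit(redistributeID);

    // return to the original combination scheme once recovery is done
    manager.restoreCombischeme();
    manager.updateCombiParameters();
  }

  // proceed with the regular combination step
  manager.combine();
}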