diff --git a/Makefile.config b/Makefile.config index 7b2af458..1c1be39f 100644 --- a/Makefile.config +++ b/Makefile.config @@ -75,8 +75,8 @@ ifdef AVX_512 VEC_GCC := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512 else ifdef AVX2 -VEC_GCC := -mavx2 -VEC_ICC := -mavx2 +VEC_GCC := -mavx2 -mfma +VEC_ICC := -mavx2 -mfma else VEC_GCC := -mavx # -fopt-info-vec-all VEC_ICC := -mavx @@ -150,11 +150,11 @@ ifdef USE_CUDA CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo LDFLAGS_HOST += -L${CUDALIBDIR} ifeq ($(CXX),icpc) - CXXFLAGS += -qopenmp - LDFLAGS += -qopenmp + CXXFLAGS += -qopenmp-simd + LDFLAGS += -qopenmp-simd else - CXXFLAGS += -fopenmp - LDFLAGS += -fopenmp + CXXFLAGS += -fopenmp-simd + LDFLAGS += -fopenmp-simd endif endif #CXXFLAGS += -qopenmp @@ -177,14 +177,14 @@ endif ifeq (${CXX}, ${ICC}) VEC_HOST := ${VEC_ICC} - CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all + CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all else VEC_HOST := ${VEC_GCC} endif ifeq ($(CXX), g++) CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi - CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp + CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd endif ifdef WITH_USOLIDS diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h index 11bad035..4eff166d 100644 --- a/Matriplex/MatriplexCommon.h +++ b/Matriplex/MatriplexCommon.h @@ -37,14 +37,15 @@ #define MUL(a, b) _mm512_mul_ps(a, b) #define FMA(a, b, v) _mm512_fmadd_ps(a, b, v) - #elif defined(__AVX2__) + #elif defined(__AVX2__) && defined(__FMA__) typedef __m256 IntrVec_t; #define MPLEX_INTRINSICS_WIDTH_BYTES 32 #define MPLEX_INTRINSICS_WIDTH_BITS 256 #define AVX2_INTRINSICS #define GATHER_INTRINSICS - #define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_epi32(arr); + // Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2 + #define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast(arr)); #define LD(a, i) _mm256_load_ps(&a[i*N+n]) #define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r) diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index ffefc6ce..db1658c4 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -392,8 +392,8 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, #if 0 double time_input = dtime(); int itrack; - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for + //omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for for (int i = beg; i < end; ++i) { itrack = i - beg; Track &trk = tracks[i]; diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 3dc26039..1ed9d5b9 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -16,7 +16,7 @@ #include "index_selection_kernels.h" #include "best_hit_kernels.h" -#include +//#include #include #define BLOCK_SIZE_X 256 diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index b317b0cd..eb90ef10 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, int itrack = 0; // FIXME: uncomment when track building is ported to GPU. -#if USE_CUDA_NOT_YET //#ifdef USE_CUDA // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i, ++itrack) { const Track &trk = tracks[i]; @@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, int itrack; //#ifdef USE_CUDA -#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i) { itrack = i - beg; const Track &trk = tracks[i]; @@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, MatriplexTrackPacker mtp(tracks[beg]); //#ifdef USE_CUDA -#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i) { int itrack = i - beg; diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index cb5b2ba7..7c81e226 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -12,7 +12,7 @@ #if USE_CUDA #include "fittestMPlex.h" #include "FitterCU.h" -#include +//#include #endif #ifndef NO_ROOT @@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector& events) #endif separate_first_call_for_meaningful_profiling_numbers(); - // Reorgnanization (copyIn) can eventually be multithreaded. - omp_set_nested(1); + // Reorganization (copyIn) can eventually be multithreaded. +// FIXME: revisit multithreading when track building is ported to GPU. +// omp_set_nested(1); - omp_set_num_threads(Config::numThreadsEvents); +// omp_set_num_threads(Config::numThreadsEvents); double total_gpu_time = dtime(); -#pragma omp parallel reduction(+:s_tmp) +//#pragma omp parallel reduction(+:s_tmp) { - int numThreadsEvents = omp_get_num_threads(); - int thr_idx = omp_get_thread_num(); +// int numThreadsEvents = omp_get_num_threads(); +// int thr_idx = omp_get_thread_num(); + int numThreadsEvents = 1; + int thr_idx = 0; // FitterCU is declared here to share allocations and deallocations // between the multiple events processed by a single thread. @@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector& events) #if 0 // 0 for timing, 1 for validation // Validation crashes for multiple threads. // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { + //if (omp_get_num_threads() <= 1) { //if (g_run_fit_std) { std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; //} - } + //} #endif } cuFitter.free_extra_addBestHit();