Skip to content

Commit

Permalink
Merge pull request #280 from trackreco/V2.0.1-0+pr241_patches
Browse files Browse the repository at this point in the history
patch from cherry-pick of commit 7bfd16f from PR279 to remove OpenMP,…
  • Loading branch information
slava77 authored Oct 13, 2020
2 parents d9dfbda + 9d0a545 commit 5af6233
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 34 deletions.
16 changes: 8 additions & 8 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ ifdef AVX_512
VEC_GCC := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
else ifdef AVX2
VEC_GCC := -mavx2
VEC_ICC := -mavx2
VEC_GCC := -mavx2 -mfma
VEC_ICC := -mavx2 -mfma
else
VEC_GCC := -mavx # -fopt-info-vec-all
VEC_ICC := -mavx
Expand Down Expand Up @@ -150,11 +150,11 @@ ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp
LDFLAGS += -qopenmp
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp
LDFLAGS += -fopenmp
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
Expand All @@ -177,14 +177,14 @@ endif

ifeq (${CXX}, ${ICC})
VEC_HOST := ${VEC_ICC}
CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all
CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all
else
VEC_HOST := ${VEC_GCC}
endif

ifeq ($(CXX), g++)
CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd
endif

ifdef WITH_USOLIDS
Expand Down
5 changes: 3 additions & 2 deletions Matriplex/MatriplexCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

#elif defined(__AVX2__)
#elif defined(__AVX2__) && defined(__FMA__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_epi32(arr);
// Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));

#define LD(a, i) _mm256_load_ps(&a[i*N+n])
#define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r)
Expand Down
4 changes: 2 additions & 2 deletions mkFit/FitterCU-imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,8 @@ void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
#if 0
double time_input = dtime();
int itrack;
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for
//omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for
for (int i = beg; i < end; ++i) {
itrack = i - beg;
Track &trk = tracks[i];
Expand Down
2 changes: 1 addition & 1 deletion mkFit/FitterCU.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "index_selection_kernels.h"
#include "best_hit_kernels.h"

#include <omp.h>
//#include <omp.h>
#include <stdexcept>

#define BLOCK_SIZE_X 256
Expand Down
21 changes: 9 additions & 12 deletions mkFit/MkFitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,
int itrack = 0;

// FIXME: uncomment when track building is ported to GPU.
#if USE_CUDA_NOT_YET
//#ifdef USE_CUDA
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i, ++itrack)
{
const Track &trk = tracks[i];
Expand Down Expand Up @@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,

int itrack;
//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i) {
itrack = i - beg;
const Track &trk = tracks[i];
Expand Down Expand Up @@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>& tracks,
MatriplexTrackPacker mtp(tracks[beg]);

//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i)
{
int itrack = i - beg;
Expand Down
21 changes: 12 additions & 9 deletions mkFit/fittestMPlex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#if USE_CUDA
#include "fittestMPlex.h"
#include "FitterCU.h"
#include <omp.h>
//#include <omp.h>
#endif

#ifndef NO_ROOT
Expand Down Expand Up @@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#endif
separate_first_call_for_meaningful_profiling_numbers();

// Reorgnanization (copyIn) can eventually be multithreaded.
omp_set_nested(1);
// Reorganization (copyIn) can eventually be multithreaded.
// FIXME: revisit multithreading when track building is ported to GPU.
// omp_set_nested(1);

omp_set_num_threads(Config::numThreadsEvents);
// omp_set_num_threads(Config::numThreadsEvents);
double total_gpu_time = dtime();
#pragma omp parallel reduction(+:s_tmp)
//#pragma omp parallel reduction(+:s_tmp)
{
int numThreadsEvents = omp_get_num_threads();
int thr_idx = omp_get_thread_num();
// int numThreadsEvents = omp_get_num_threads();
// int thr_idx = omp_get_thread_num();
int numThreadsEvents = 1;
int thr_idx = 0;

// FitterCU is declared here to share allocations and deallocations
// between the multiple events processed by a single thread.
Expand All @@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#if 0 // 0 for timing, 1 for validation
// Validation crashes for multiple threads.
// It is something in relation to ROOT. Not sure what.
if (omp_get_num_threads() <= 1) {
//if (omp_get_num_threads() <= 1) {
//if (g_run_fit_std) {
std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
//}
}
//}
#endif
}
cuFitter.free_extra_addBestHit();
Expand Down

0 comments on commit 5af6233

Please sign in to comment.