Merge pull request #291 from dan131riley/backport-tbb-cleanup
Backport of tbb cleanup
slava77 authored Jan 14, 2021
2 parents 5af6233 + eddabd4 commit 6a2ec8c
Showing 81 changed files with 195 additions and 7,184 deletions.
4 changes: 0 additions & 4 deletions BinInfoUtils.h
@@ -17,29 +17,25 @@ typedef std::pair<int, int> BinInfo;
typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
typedef std::vector<BinInfoLayerMap> BinInfoMap;

CUDA_CALLABLE
inline float downPhi(float phi)
{
while (phi >= Config::PI) {phi-=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
inline float upPhi(float phi)
{
while (phi <= -Config::PI) {phi+=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
inline float normalizedPhi(float phi)
{
// return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary!
if (std::abs(phi)>=Config::PI) {phi = (phi>0 ? downPhi(phi) : upPhi(phi));}
return phi;
}

CUDA_CALLABLE
inline int getPhiPartition(float phi)
{
//assume phi is between -PI and PI
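For readers skimming the diff: the three helpers above fold an azimuthal angle back into [-pi, pi). Below is a minimal, self-contained sketch of the same wrap-around idea; the two constants are inlined stand-ins for Config::PI and Config::TwoPI, not values taken from the repository.

```cpp
#include <cmath>
#include <cstdio>

// Illustrative stand-ins for Config::PI and Config::TwoPI.
constexpr float kPI    = 3.14159265358979323846f;
constexpr float kTwoPI = 2.0f * kPI;

// Equivalent wrap-around to downPhi/upPhi/normalizedPhi above:
// shift by 2*pi until phi lands in [-pi, pi).
inline float normalizedPhiSketch(float phi)
{
  while (phi >=  kPI) phi -= kTwoPI;
  while (phi <  -kPI) phi += kTwoPI;
  return phi;
}

int main()
{
  // 3*pi/2 is out of range; it wraps to -pi/2.
  std::printf("%f\n", normalizedPhiSketch(1.5f * kPI));  // prints -1.570796...
  return 0;
}
```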
5 changes: 1 addition & 4 deletions Config.cc
@@ -23,11 +23,8 @@ namespace Config

// Multi threading and Clone engine configuration
int numThreadsFinder = 1;

// GPU computations
int numThreadsEvents = 1;
int numThreadsReorg = 1;


#if defined(__MIC__) || defined(__AVX512F__)
int numThreadsSimulation = 60;
#else
14 changes: 1 addition & 13 deletions Config.h
@@ -6,12 +6,6 @@
#include <string> // won't compile on clang gcc for mac OS w/o this!
#include <map>

#if defined(__CUDACC__)
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

namespace mkfit {

// Cram this in here for now ...
@@ -264,7 +258,7 @@
// Config for Hit and BinInfoUtils
constexpr int nPhiPart = 1260;
constexpr float fPhiFactor = nPhiPart / TwoPI;
constexpr int nEtaPart = 11; // 1 is better for GPU best_hit
constexpr int nEtaPart = 11;
constexpr int nEtaBin = 2 * nEtaPart - 1;

constexpr float fEtaFull = 2 * Config::fEtaDet;
@@ -366,10 +360,7 @@ namespace Config
// Threading
extern int numThreadsFinder;
extern int numThreadsSimulation;

// For GPU computations
extern int numThreadsEvents;
extern int numThreadsReorg;

extern int finderReportBestOutOfN;

@@ -412,7 +403,6 @@ namespace Config

void RecalculateDependentConstants();

CUDA_CALLABLE
inline float BfieldFromZR(const float z, const float r)
{
return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
@@ -423,8 +413,6 @@
#ifndef MPT_SIZE
#if defined(__MIC__) || defined(__AVX512F__)
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 8
#elif defined(__AVX__) || defined(__AVX2__)
#define MPT_SIZE 8
#else
8 changes: 4 additions & 4 deletions Event.cc
@@ -12,7 +12,7 @@
#include "Debug.h"

#ifdef TBB
#include "tbb/tbb.h"
#include "tbb/parallel_for.h"
#endif

#include <memory>
@@ -53,15 +53,15 @@ void Event::reset_nan_n_silly_counters()

Event::Event(int evtID) :
geom_(dummyGeometry), validation_(*dummyValidation),
evtID_(evtID), threads_(1), mcHitIDCounter_(0)
evtID_(evtID), mcHitIDCounter_(0)
{
reset_nan_n_silly_counters();
layerHits_.resize(Config::nTotalLayers);
}

Event::Event(const Geometry& g, Validation& v, int evtID, int threads) :
Event::Event(const Geometry& g, Validation& v, int evtID) :
geom_(g), validation_(v),
evtID_(evtID), threads_(threads), mcHitIDCounter_(0)
evtID_(evtID), mcHitIDCounter_(0)
{
reset_nan_n_silly_counters();
layerHits_.resize(Config::nTotalLayers);
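The include change above only narrows `tbb/tbb.h` to `tbb/parallel_for.h`. As an illustration of the kind of loop that header supports, here is a minimal sketch; the vector and loop body are placeholders, not code from Event.cc, and `tbb/blocked_range.h` is assumed to be available as it ships with TBB.

```cpp
#include <cstddef>
#include <vector>

#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"

int main()
{
  std::vector<float> phi(1024, 0.f);

  // Process elements in parallel chunks; TBB chooses the chunking and threads.
  tbb::parallel_for(tbb::blocked_range<std::size_t>(0, phi.size()),
                    [&](const tbb::blocked_range<std::size_t>& r) {
                      for (std::size_t i = r.begin(); i != r.end(); ++i) {
                        phi[i] = 0.5f * static_cast<float>(i);  // stand-in for real per-hit work
                      }
                    });
  return 0;
}
```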
3 changes: 1 addition & 2 deletions Event.h
@@ -17,7 +17,7 @@ class Event
{
public:
explicit Event(int evtID);
Event(const Geometry& g, Validation& v, int evtID, int threads = 1);
Event(const Geometry& g, Validation& v, int evtID);

void Reset(int evtID);
void RemapHits(TrackVec & tracks);
@@ -60,7 +60,6 @@ class Event
void reset_nan_n_silly_counters();

public:
int threads_;
std::mutex mcGatherMutex_;
std::atomic<int> mcHitIDCounter_;
std::vector<HitVec> layerHits_;
6 changes: 0 additions & 6 deletions Hit.h
@@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
return 1.0f/(x*x + y*y);
}

CUDA_CALLABLE
inline float getPhi(float x, float y)
{
return std::atan2(y,x);
}

CUDA_CALLABLE
inline float getTheta(float r, float z){
return std::atan2(r,z);
}
@@ -199,10 +197,6 @@ class Hit

const float* posArray() const {return state_.pos_.Array();}
const float* errArray() const {return state_.err_.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector3& parameters_nc() {return state_.pos_;}
2 changes: 1 addition & 1 deletion Makefile
@@ -58,7 +58,7 @@ distclean: clean-local

${LIB_CORE}: ${CORE_OBJS}
@mkdir -p $(@D)
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}

main: ${AUTO_TGTS} ${LIB_CORE} main.o ${LIBUSOLIDS}
${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LIBUSOLIDS} ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib
33 changes: 0 additions & 33 deletions Makefile.config
@@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
TBB_PREFIX := /opt/local
endif

# 2.1 Use nvcc to compile cuda code
# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
# It's header only and potentially exported by the environment
# Maybe it is good enough to have:
# CUBROOT?=Undefined
# CUDAINCDIR and CUDALIBDIR also need to be defined
ifneq (,$(realpath /home/ml15/tools/cub))
CUBROOT?=/home/ml15/tools/cub
else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
CUBROOT?=/nfs/opt/cuda-8-0/include
else ifneq (,$(realpath /usr/local/cuda/include))
CUBROOT?=/usr/local/cuda/include
endif
NV := nvcc -prec-sqrt=true -I${CUBROOT}
#-g -G -lineinfo
# Comment out to compile for CPU
#USE_CUDA := 1
# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1

# 3. Optimization
# -O3 implies vectorization and simd (but not AVX)
OPT := -g -O3
@@ -146,20 +127,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
LDFLAGS_HOST :=
LDFLAGS_MIC := -static-intel

ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
#LDFLAGS += -qopenmp

CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT}

ifdef USE_VTUNE_NOTIFY
2 changes: 0 additions & 2 deletions Matrix.h
@@ -81,13 +81,11 @@ inline double dtime()
return( tseconds );
}

CUDA_CALLABLE
inline float hipo(float x, float y)
{
return std::sqrt(x*x + y*y);
}

CUDA_CALLABLE
inline void sincos4(const float x, float& sin, float& cos)
{
// Had this writen with explicit division by factorial.
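The full body of `sincos4` is folded out of this view; the comment refers to a low-order series expansion. As an illustration only (not the exact implementation hidden by the fold), a fourth-order Maclaurin truncation for small angles looks like:

```cpp
#include <cmath>
#include <cstdio>

// Illustrative 4th-order truncation, reasonable for roughly |x| < pi/4:
//   sin x ~ x - x^3/6,   cos x ~ 1 - x^2/2 + x^4/24
inline void sincos4_sketch(const float x, float& s, float& c)
{
  const float x2 = x * x;
  s = x * (1.0f - x2 / 6.0f);
  c = 1.0f - 0.5f * x2 + x2 * x2 / 24.0f;
}

int main()
{
  float s, c;
  sincos4_sketch(0.3f, s, c);
  std::printf("approx: %f %f   exact: %f %f\n", s, c, std::sin(0.3f), std::cos(0.3f));
  return 0;
}
```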
3 changes: 1 addition & 2 deletions README.md
@@ -52,7 +52,6 @@
- **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
- **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
- **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
- **GPUs**: to be filled out

phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.

@@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
### TO DO

- flesh out sections as needed
- GPU specific code
- GPU specific code?

### Vestigial code

4 changes: 2 additions & 2 deletions Track.cc
@@ -299,7 +299,7 @@ bool TrackExtra::isSeedHit(const int lyr, const int idx) const
if (reftracks[refTrackID].isFindable())
{
if (foundHits < minHits) refTrackID = -2;
else refTrackID = refTrackID;
//else refTrackID = refTrackID;
}
else // ref track is not findable
{
@@ -314,7 +314,7 @@ bool TrackExtra::isSeedHit(const int lyr, const int idx) const
if (reftracks[trueID].isFindable())
{
if (foundHits < minHits) refTrackID = -5;
else refTrackID = refTrackID;
//else refTrackID = refTrackID;
}
else // sim track is not findable
{
40 changes: 12 additions & 28 deletions Track.h
@@ -63,7 +63,6 @@ typedef std::vector<ReducedTrack> RedTrackVec;
struct TrackState // possible to add same accessors as track?
{
public:
CUDA_CALLABLE
TrackState() : valid(true) {}
TrackState(int charge, const SVector3& pos, const SVector3& mom, const SMatrixSym66& err) :
parameters(SVector6(pos.At(0),pos.At(1),pos.At(2),mom.At(0),mom.At(1),mom.At(2))),
@@ -125,10 +124,8 @@ struct TrackState // possible to add same accessors as track?
class Track
{
public:
CUDA_CALLABLE
Track() {}

CUDA_CALLABLE
Track(const TrackState& state, float chi2, int label, int nHits, const HitOnTrack* hits) :
state_(state),
chi2_ (chi2),
@@ -143,7 +140,6 @@ class Track
Track(int charge, const SVector3& position, const SVector3& momentum, const SMatrixSym66& errors, float chi2) :
state_(charge, position, momentum, errors), chi2_(chi2) {}

CUDA_CALLABLE
~Track(){}

bool hasSillyValues(bool dump, bool fix, const char* pref="");
@@ -154,10 +150,6 @@

const float* posArray() const {return state_.parameters.Array();}
const float* errArray() const {return state_.errors.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector6& parameters_nc() {return state_.parameters;}
@@ -167,11 +159,8 @@ class Track
SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);}
SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);}

CUDA_CALLABLE
int charge() const {return state_.charge;}
CUDA_CALLABLE
float chi2() const {return chi2_;}
CUDA_CALLABLE
int label() const {return label_;}

float x() const { return state_.parameters[0]; }
@@ -233,7 +222,6 @@ class Track
}
}

CUDA_CALLABLE
void addHitIdx(int hitIdx, int hitLyr, float chi2)
{
if (lastHitIdx_ < Config::nMaxTrkHits - 1)
@@ -268,12 +256,12 @@

HitOnTrack getHitOnTrack(int posHitIdx) const { return hitsOnTrk_[posHitIdx]; }

CUDA_CALLABLE int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
CUDA_CALLABLE int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }
int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }

CUDA_CALLABLE HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
CUDA_CALLABLE int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; }
CUDA_CALLABLE int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; }
HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; }
int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; }

int getLastFoundHitPos() const
{
@@ -318,12 +306,10 @@ class Track
}
}

CUDA_CALLABLE
void setHitIdx(int posHitIdx, int newIdx) {
hitsOnTrk_[posHitIdx].index = newIdx;
}

CUDA_CALLABLE
void setHitIdxLyr(int posHitIdx, int newIdx, int newLyr) {
hitsOnTrk_[posHitIdx] = { newIdx, newLyr };
}
@@ -335,15 +321,13 @@ class Track
}
}

CUDA_CALLABLE
void setNFoundHits(int nHits) { nFoundHits_ = nHits; }
void setNTotalHits(int nHits) { lastHitIdx_ = nHits - 1; }

CUDA_CALLABLE
void resetHits() { lastHitIdx_ = -1; nFoundHits_ = 0; }

CUDA_CALLABLE int nFoundHits() const { return nFoundHits_; }
CUDA_CALLABLE int nTotalHits() const { return lastHitIdx_+1; }
int nFoundHits() const { return nFoundHits_; }
int nTotalHits() const { return lastHitIdx_+1; }

int nStoredFoundHits() const
{
@@ -405,13 +389,13 @@ class Track
return layers;
}

CUDA_CALLABLE void setCharge(int chg) { state_.charge = chg; }
CUDA_CALLABLE void setChi2(float chi2) { chi2_ = chi2; }
CUDA_CALLABLE void setLabel(int lbl) { label_ = lbl; }
void setCharge(int chg) { state_.charge = chg; }
void setChi2(float chi2) { chi2_ = chi2; }
void setLabel(int lbl) { label_ = lbl; }

CUDA_CALLABLE void setState(const TrackState& newState) { state_ = newState; }
void setState(const TrackState& newState) { state_ = newState; }

CUDA_CALLABLE Track clone() const { return Track(state_,chi2_,label_,nTotalHits(),hitsOnTrk_); }
Track clone() const { return Track(state_,chi2_,label_,nTotalHits(),hitsOnTrk_); }

struct Status
{