Merge pull request #291 from dan131riley/backport-tbb-cleanup
Backport of tbb cleanup
slava77 authored Jan 14, 2021
2 parents 5af6233 + eddabd4 commit 6a2ec8c
Showing 81 changed files with 195 additions and 7,184 deletions.
4 changes: 0 additions & 4 deletions BinInfoUtils.h
@@ -17,29 +17,25 @@ typedef std::pair<int, int> BinInfo;
typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
typedef std::vector<BinInfoLayerMap> BinInfoMap;

CUDA_CALLABLE
inline float downPhi(float phi)
{
while (phi >= Config::PI) {phi-=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
inline float upPhi(float phi)
{
while (phi <= -Config::PI) {phi+=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
inline float normalizedPhi(float phi)
{
// return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary!
if (std::abs(phi)>=Config::PI) {phi = (phi>0 ? downPhi(phi) : upPhi(phi));}
return phi;
}

CUDA_CALLABLE
inline int getPhiPartition(float phi)
{
//assume phi is between -PI and PI
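For readers skimming the diff: the three helpers above fold an azimuthal angle back into [-pi, pi). Below is a minimal, self-contained sketch of the same wrap-around idea; the two constants are inlined stand-ins for Config::PI and Config::TwoPI, not values taken from the repository.

```cpp
#include <cmath>
#include <cstdio>

// Illustrative stand-ins for Config::PI and Config::TwoPI.
constexpr float kPI    = 3.14159265358979323846f;
constexpr float kTwoPI = 2.0f * kPI;

// Equivalent wrap-around to downPhi/upPhi/normalizedPhi above:
// shift by 2*pi until phi lands in [-pi, pi).
inline float normalizedPhiSketch(float phi)
{
  while (phi >=  kPI) phi -= kTwoPI;
  while (phi <  -kPI) phi += kTwoPI;
  return phi;
}

int main()
{
  // 3*pi/2 is out of range; it wraps to -pi/2.
  std::printf("%f\n", normalizedPhiSketch(1.5f * kPI));  // prints -1.570796...
  return 0;
}
```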
5 changes: 1 addition & 4 deletions Config.cc
@@ -23,11 +23,8 @@ namespace Config

// Multi threading and Clone engine configuration
int numThreadsFinder = 1;

// GPU computations
int numThreadsEvents = 1;
int numThreadsReorg = 1;


#if defined(__MIC__) || defined(__AVX512F__)
int numThreadsSimulation = 60;
#else
14 changes: 1 addition & 13 deletions Config.h
@@ -6,12 +6,6 @@
#include <string> // won't compile on clang gcc for mac OS w/o this!
#include <map>

#if defined(__CUDACC__)
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

namespace mkfit {

// Cram this in here for now ...
@@ -264,7 +258,7 @@
// Config for Hit and BinInfoUtils
constexpr int nPhiPart = 1260;
constexpr float fPhiFactor = nPhiPart / TwoPI;
constexpr int nEtaPart = 11; // 1 is better for GPU best_hit
constexpr int nEtaPart = 11;
constexpr int nEtaBin = 2 * nEtaPart - 1;

constexpr float fEtaFull = 2 * Config::fEtaDet;
@@ -366,10 +360,7 @@ namespace Config
// Threading
extern int numThreadsFinder;
extern int numThreadsSimulation;

// For GPU computations
extern int numThreadsEvents;
extern int numThreadsReorg;

extern int finderReportBestOutOfN;

@@ -412,7 +403,6 @@ namespace Config

void RecalculateDependentConstants();

CUDA_CALLABLE
inline float BfieldFromZR(const float z, const float r)
{
return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
@@ -423,8 +413,6 @@
#ifndef MPT_SIZE
#if defined(__MIC__) || defined(__AVX512F__)
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 8
#elif defined(__AVX__) || defined(__AVX2__)
#define MPT_SIZE 8
#else
8 changes: 4 additions & 4 deletions Event.cc
@@ -12,7 +12,7 @@
#include "Debug.h"

#ifdef TBB
#include "tbb/tbb.h"
#include "tbb/parallel_for.h"
#endif

#include <memory>
@@ -53,15 +53,15 @@ void Event::reset_nan_n_silly_counters()

Event::Event(int evtID) :
geom_(dummyGeometry), validation_(*dummyValidation),
evtID_(evtID), threads_(1), mcHitIDCounter_(0)
evtID_(evtID), mcHitIDCounter_(0)
{
reset_nan_n_silly_counters();
layerHits_.resize(Config::nTotalLayers);
}

Event::Event(const Geometry& g, Validation& v, int evtID, int threads) :
Event::Event(const Geometry& g, Validation& v, int evtID) :
geom_(g), validation_(v),
evtID_(evtID), threads_(threads), mcHitIDCounter_(0)
evtID_(evtID), mcHitIDCounter_(0)
{
reset_nan_n_silly_counters();
layerHits_.resize(Config::nTotalLayers);
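The include change above only narrows `tbb/tbb.h` to `tbb/parallel_for.h`. As an illustration of the kind of loop that header supports, here is a minimal sketch; the vector and loop body are placeholders, not code from Event.cc, and `tbb/blocked_range.h` is assumed to be available as it ships with TBB.

```cpp
#include <cstddef>
#include <vector>

#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"

int main()
{
  std::vector<float> phi(1024, 0.f);

  // Process elements in parallel chunks; TBB chooses the chunking and threads.
  tbb::parallel_for(tbb::blocked_range<std::size_t>(0, phi.size()),
                    [&](const tbb::blocked_range<std::size_t>& r) {
                      for (std::size_t i = r.begin(); i != r.end(); ++i) {
                        phi[i] = 0.5f * static_cast<float>(i);  // stand-in for real per-hit work
                      }
                    });
  return 0;
}
```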
3 changes: 1 addition & 2 deletions Event.h
@@ -17,7 +17,7 @@ class Event
{
public:
explicit Event(int evtID);
Event(const Geometry& g, Validation& v, int evtID, int threads = 1);
Event(const Geometry& g, Validation& v, int evtID);

void Reset(int evtID);
void RemapHits(TrackVec & tracks);
@@ -60,7 +60,6 @@ class Event
void reset_nan_n_silly_counters();

public:
int threads_;
std::mutex mcGatherMutex_;
std::atomic<int> mcHitIDCounter_;
std::vector<HitVec> layerHits_;
6 changes: 0 additions & 6 deletions Hit.h
@@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
return 1.0f/(x*x + y*y);
}

CUDA_CALLABLE
inline float getPhi(float x, float y)
{
return std::atan2(y,x);
}

CUDA_CALLABLE
inline float getTheta(float r, float z){
return std::atan2(r,z);
}
@@ -199,10 +197,6 @@ class Hit

const float* posArray() const {return state_.pos_.Array();}
const float* errArray() const {return state_.err_.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector3& parameters_nc() {return state_.pos_;}
2 changes: 1 addition & 1 deletion Makefile
@@ -58,7 +58,7 @@ distclean: clean-local

${LIB_CORE}: ${CORE_OBJS}
@mkdir -p $(@D)
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}

main: ${AUTO_TGTS} ${LIB_CORE} main.o ${LIBUSOLIDS}
${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LIBUSOLIDS} ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib
33 changes: 0 additions & 33 deletions Makefile.config
@@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
TBB_PREFIX := /opt/local
endif

# 2.1 Use nvcc to compile cuda code
# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
# It's header only and potentially exported by the environment
# Maybe it is good enough to have:
# CUBROOT?=Undefined
# CUDAINCDIR and CUDALIBDIR also need to be defined
ifneq (,$(realpath /home/ml15/tools/cub))
CUBROOT?=/home/ml15/tools/cub
else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
CUBROOT?=/nfs/opt/cuda-8-0/include
else ifneq (,$(realpath /usr/local/cuda/include))
CUBROOT?=/usr/local/cuda/include
endif
NV := nvcc -prec-sqrt=true -I${CUBROOT}
#-g -G -lineinfo
# Comment out to compile for CPU
#USE_CUDA := 1
# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1

# 3. Optimization
# -O3 implies vectorization and simd (but not AVX)
OPT := -g -O3
@@ -146,20 +127,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
LDFLAGS_HOST :=
LDFLAGS_MIC := -static-intel

ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
#LDFLAGS += -qopenmp

CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT}

ifdef USE_VTUNE_NOTIFY
2 changes: 0 additions & 2 deletions Matrix.h
@@ -81,13 +81,11 @@ inline double dtime()
return( tseconds );
}

CUDA_CALLABLE
inline float hipo(float x, float y)
{
return std::sqrt(x*x + y*y);
}

CUDA_CALLABLE
inline void sincos4(const float x, float& sin, float& cos)
{
// Had this writen with explicit division by factorial.
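The full body of `sincos4` is folded out of this view; the comment refers to a low-order series expansion. As an illustration only (not the exact implementation hidden by the fold), a fourth-order Maclaurin truncation for small angles looks like:

```cpp
#include <cmath>
#include <cstdio>

// Illustrative 4th-order truncation, reasonable for roughly |x| < pi/4:
//   sin x ~ x - x^3/6,   cos x ~ 1 - x^2/2 + x^4/24
inline void sincos4_sketch(const float x, float& s, float& c)
{
  const float x2 = x * x;
  s = x * (1.0f - x2 / 6.0f);
  c = 1.0f - 0.5f * x2 + x2 * x2 / 24.0f;
}

int main()
{
  float s, c;
  sincos4_sketch(0.3f, s, c);
  std::printf("approx: %f %f   exact: %f %f\n", s, c, std::sin(0.3f), std::cos(0.3f));
  return 0;
}
```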
3 changes: 1 addition & 2 deletions README.md
@@ -52,7 +52,6 @@
- **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
- **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
- **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
- **GPUs**: to be filled out

phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.

@@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
### TO DO

- flesh out sections as needed
- GPU specific code
- GPU specific code?

### Vestigial code

4 changes: 2 additions & 2 deletions Track.cc
@@ -299,7 +299,7 @@ bool TrackExtra::isSeedHit(const int lyr, const int idx) const
if (reftracks[refTrackID].isFindable())
{
if (foundHits < minHits) refTrackID = -2;
else refTrackID = refTrackID;
//else refTrackID = refTrackID;
}
else // ref track is not findable
{
@@ -314,7 +314,7 @@ bool TrackExtra::isSeedHit(const int lyr, const int idx) const
if (reftracks[trueID].isFindable())
{
if (foundHits < minHits) refTrackID = -5;
else refTrackID = refTrackID;
//else refTrackID = refTrackID;
}
else // sim track is not findable
{
40 changes: 12 additions & 28 deletions Track.h
@@ -63,7 +63,6 @@ typedef std::vector<ReducedTrack> RedTrackVec;
struct TrackState // possible to add same accessors as track?
{
public:
CUDA_CALLABLE
TrackState() : valid(true) {}
TrackState(int charge, const SVector3& pos, const SVector3& mom, const SMatrixSym66& err) :
parameters(SVector6(pos.At(0),pos.At(1),pos.At(2),mom.At(0),mom.At(1),mom.At(2))),
@@ -125,10 +124,8 @@ struct TrackState // possible to add same accessors as track?
class Track
{
public:
CUDA_CALLABLE
Track() {}

CUDA_CALLABLE
Track(const TrackState& state, float chi2, int label, int nHits, const HitOnTrack* hits) :
state_(state),
chi2_ (chi2),
@@ -143,7 +140,6 @@ class Track
Track(int charge, const SVector3& position, const SVector3& momentum, const SMatrixSym66& errors, float chi2) :
state_(charge, position, momentum, errors), chi2_(chi2) {}

CUDA_CALLABLE
~Track(){}

bool hasSillyValues(bool dump, bool fix, const char* pref="");
@@ -154,10 +150,6 @@

const float* posArray() const {return state_.parameters.Array();}
const float* errArray() const {return state_.errors.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector6& parameters_nc() {return state_.parameters;}
@@ -167,11 +159,8 @@ class Track
SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);}
SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);}

CUDA_CALLABLE
int charge() const {return state_.charge;}
CUDA_CALLABLE
float chi2() const {return chi2_;}
CUDA_CALLABLE
int label() const {return label_;}

float x() const { return state_.parameters[0]; }
@@ -233,7 +222,6 @@ class Track
}
}

CUDA_CALLABLE
void addHitIdx(int hitIdx, int hitLyr, float chi2)
{
if (lastHitIdx_ < Config::nMaxTrkHits - 1)
@@ -268,12 +256,12 @@

HitOnTrack getHitOnTrack(int posHitIdx) const { return hitsOnTrk_[posHitIdx]; }

CUDA_CALLABLE int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
CUDA_CALLABLE int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }
int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }

CUDA_CALLABLE HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
CUDA_CALLABLE int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; }
CUDA_CALLABLE int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; }
HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; }
int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; }

int getLastFoundHitPos() const
{
@@ -318,12 +306,10 @@ class Track
}
}

CUDA_CALLABLE
void setHitIdx(int posHitIdx, int newIdx) {
hitsOnTrk_[posHitIdx].index = newIdx;
}

CUDA_CALLABLE
void setHitIdxLyr(int posHitIdx, int newIdx, int newLyr) {
hitsOnTrk_[posHitIdx] = { newIdx, newLyr };
}
@@ -335,15 +321,13 @@ class Track
}
}

CUDA_CALLABLE
void setNFoundHits(int nHits) { nFoundHits_ = nHits; }
void setNTotalHits(int nHits) { lastHitIdx_ = nHits - 1; }

CUDA_CALLABLE
void resetHits() { lastHitIdx_ = -1; nFoundHits_ = 0; }

CUDA_CALLABLE int nFoundHits() const { return nFoundHits_; }
CUDA_CALLABLE int nTotalHits() const { return lastHitIdx_+1; }
int nFoundHits() const { return nFoundHits_; }
int nTotalHits() const { return lastHitIdx_+1; }

int nStoredFoundHits() const
{
@@ -405,13 +389,13 @@ class Track
return layers;
}

CUDA_CALLABLE void setCharge(int chg) { state_.charge = chg; }
CUDA_CALLABLE void setChi2(float chi2) { chi2_ = chi2; }
CUDA_CALLABLE void setLabel(int lbl) { label_ = lbl; }
void setCharge(int chg) { state_.charge = chg; }
void setChi2(float chi2) { chi2_ = chi2; }
void setLabel(int lbl) { label_ = lbl; }

CUDA_CALLABLE void setState(const TrackState& newState) { state_ = newState; }
void setState(const TrackState& newState) { state_ = newState; }

CUDA_CALLABLE Track clone() const { return Track(state_,chi2_,label_,nTotalHits(),hitsOnTrk_); }
Track clone() const { return Track(state_,chi2_,label_,nTotalHits(),hitsOnTrk_); }

struct Status
{