diff --git a/cpp/gcc/Makefile b/cpp/gcc/Makefile index 23797d0f..a9f9fb6f 100644 --- a/cpp/gcc/Makefile +++ b/cpp/gcc/Makefile @@ -48,7 +48,7 @@ CUDA_LIBS = -L$(CUDA_PATH)/lib64 -L$(CUDA_MATHLIBS_PATH)/lib64 SRW_SRC_DEF += -D_OFFLOAD_GPU -DUSE_CUDA -D_FFTW3 SRW_INCLUDES += $(CUDA_INCLUDES) -SRW_CFLAGS += -std=c++17 +#SRW_CFLAGS += -std=c++17 #HG01012024 LDFLAGS += $(CUDA_LIBS) -lcudart_static -lcudadevrt -lcufft -lrt NVCFLAGS = -O3 -arch=sm_80 -dlto -rdc=true CUDA_OBJ=gmfft_gpu.o srradstr_gpu.o sroptelm_gpu.o sroptdrf_gpu.o sroptgtr_gpu.o srradmnp_gpu.o diff --git a/cpp/py/setup.py b/cpp/py/setup.py index 075520e1..40ebd4bb 100644 --- a/cpp/py/setup.py +++ b/cpp/py/setup.py @@ -21,7 +21,10 @@ if 'MODE' in os.environ: sMode = str(os.environ['MODE']) if sMode == 'cuda': # HG30112023 - ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno']}) + ext_kwargs.update({'libraries': ['srw', 'm', 'cudart_static', 'cudadevrt', 'cufft', 'fftw3f', 'fftw3', 'rt'], 'extra_compile_args': ['-O3', '-mavx2', '-fno-math-errno', '-D_OFFLOAD_GPU']}) + ext_kwargs['define_macros'].clear() #HG13012024 + ext_kwargs['include_dirs'].append('{0}/include'.format(os.environ['CUDA_PATH'])) #HG13012024 + ext_kwargs['include_dirs'].append('{0}/include'.format(os.environ['CUDA_MATHLIBS_PATH'])) #HG13012024 ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_PATH'])) ext_kwargs['library_dirs'].append('{0}/lib64'.format(os.environ['CUDA_MATHLIBS_PATH'])) elif sMode == 'omp': diff --git a/cpp/src/core/sroptel2.cpp b/cpp/src/core/sroptel2.cpp index 73d11027..d0e2fa9a 100644 --- a/cpp/src/core/sroptel2.cpp +++ b/cpp/src/core/sroptel2.cpp @@ -116,7 +116,8 @@ int srTGenOptElem::PropagateRadiationMeth_0(srTSRWRadStructAccessData* pRadAcces //if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE, pBuf)) return result; //from derived classes //OC01102019 (restored) - if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE)) return result; //from derived classes + //if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE)) return result; //from derived classes + if(result = PropagateRadiationSingleE_Meth_0(pRadDataSingleE, pPrevRadDataSingleE, pvGPU)) return result; //from derived classes //HG12012024 if(pRadDataSingleE != pRadAccessData) { diff --git a/cpp/src/core/sroptelm.h b/cpp/src/core/sroptelm.h index a4ea6919..b04374d9 100644 --- a/cpp/src/core/sroptelm.h +++ b/cpp/src/core/sroptelm.h @@ -285,7 +285,7 @@ class srTGenOptElem : public CGenObject { void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&, void* pvGPU=0); //HG01122023 //void MakeWfrEdgeCorrection(srTSRWRadStructAccessData*, float*, float*, srTDataPtrsForWfrEdgeCorr&); #ifdef _OFFLOAD_GPU //HG01122023 - void srTGenOptElem::MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); + void MakeWfrEdgeCorrection_GPU(srTSRWRadStructAccessData* RadAccessData, float* pDataEx, float* pDataEz, srTDataPtrsForWfrEdgeCorr& DataPtrs, TGPUUsageArg* pGPU); //HG13012024 Remove 'srTGenOptElem::' at the beginning of the function name to comply with the C++ standard #endif int SetupWfrEdgeCorrData1D(srTRadSect1D*, float*, float*, srTDataPtrsForWfrEdgeCorr1D&); diff --git a/cpp/src/core/sroptgtr.h b/cpp/src/core/sroptgtr.h index 34052e34..49201255 100644 --- a/cpp/src/core/sroptgtr.h +++ b/cpp/src/core/sroptgtr.h @@ -86,7 +86,8 @@ class srTGenTransmission : public srTFocusingElem { { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically - pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y); + //pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y); + pRadAccessData->CheckAndSubtractPhaseTermsLin(TransvCenPoint.x, TransvCenPoint.y, pvGPU); //HG12012024 //} char &MethNo = ParPrecWfrPropag.MethNo; @@ -99,7 +100,8 @@ class srTGenTransmission : public srTFocusingElem { //if(ParPrecWfrPropag.AnalTreatment == 1) //{// Treating linear terms analytically - if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin(); + //if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin(); + if(!ParPrecWfrPropag.DoNotResetAnalTreatTermsAfterProp) pRadAccessData->CheckAndResetPhaseTermsLin(pvGPU); //HG12012024 //} return result; diff --git a/cpp/src/core/srradmnp_gpu.cu b/cpp/src/core/srradmnp_gpu.cu index 0cd9f633..479431ec 100644 --- a/cpp/src/core/srradmnp_gpu.cu +++ b/cpp/src/core/srradmnp_gpu.cu @@ -239,6 +239,33 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra int Int_or_ReE = RadExtract.Int_or_Phase; if (Int_or_ReE == 7) Int_or_ReE = 0; //OC150813: time/phot. energy integrated single-e intensity requires "normal" intensity here + + if (Int_or_ReE != 2) //HG13012024 Fixed bug: Output array was not allocated properly + { + if (allStokesReq) + { + RadExtract.pExtractedData = (float*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedData, 4*RadAccessData.nx*RadAccessData.nz*sizeof(float), true); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedData); + } + else + { + RadExtract.pExtractedData = (float*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedData, RadAccessData.nx*RadAccessData.nz*sizeof(float), true); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedData); + } + } + else + { + if (allStokesReq) + { + RadExtract.pExtractedDataD = (double*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedDataD, 4*RadAccessData.nx*RadAccessData.nz*sizeof(double), true); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedDataD); + } + else + { + RadExtract.pExtractedDataD = (double*)CAuxGPU::ToDevice(pGPU, RadExtract.pExtractedDataD, RadAccessData.nx*RadAccessData.nz*sizeof(double), true); + //CAuxGPU::EnsureDeviceMemoryReady(pGPU, RadExtract.pExtractedDataD); + } + } if (allStokesReq) if (intOverEnIsRequired) @@ -251,23 +278,53 @@ int srTRadGenManip::ExtractSingleElecIntensity2DvsXZ_GPU(srTRadExtract& RadExtra else ExtractSingleElecIntensity2DvsXZ_GPUSub (blocks, threads, RadExtract, RadAccessData, local_copy, arAuxInt, ie0, ie1, InvStepRelArg, Int_or_ReE); - CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true); - CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true); - CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); - CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); + if(Int_or_ReE != 2) //HG13012024 Fixed bug: Output array was not allocated properly + { + if(RadExtract.pExtractedData != NULL) + CAuxGPU::MarkUpdated(pGPU, RadExtract.pExtractedData, true, false); + } + else + { + if(RadExtract.pExtractedDataD != NULL) + CAuxGPU::MarkUpdated(pGPU, RadExtract.pExtractedDataD, true, false); + } #ifndef _DEBUG - if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadX); - if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)CAuxGPU::GetHostPtr(pGPU, RadAccessData.pBaseRadZ); + if(Int_or_ReE != 2) + { + if (RadExtract.pExtractedData != NULL) + RadExtract.pExtractedData = (float*)CAuxGPU::GetHostPtr(pGPU, RadExtract.pExtractedData); + } + else + { + if (RadExtract.pExtractedDataD != NULL) + RadExtract.pExtractedDataD = (double*)CAuxGPU::GetHostPtr(pGPU, RadExtract.pExtractedDataD); + } #endif #ifdef _DEBUG + if(Int_or_ReE != 2) + { + if (RadExtract.pExtractedData != NULL) + RadExtract.pExtractedData = (float*)CAuxGPU::ToHostAndFree(pGPU, RadExtract.pExtractedData, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + } + else + { + if (RadExtract.pExtractedDataD != NULL) + RadExtract.pExtractedDataD = (double*)CAuxGPU::ToHostAndFree(pGPU, RadExtract.pExtractedDataD, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(double)); + } +#endif + + CAuxGPU::ToHostAndFree(pGPU, local_copy, sizeof(srTRadGenManip), true); + CAuxGPU::ToHostAndFree(pGPU, arAuxInt, RadAccessData.ne*sizeof(double), true); + //CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadX, true, false); + //CAuxGPU::MarkUpdated(pGPU, RadAccessData.pBaseRadZ, true, false); if (RadAccessData.pBaseRadX != NULL) - RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + RadAccessData.pBaseRadX = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadX, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float), true); //HG13012024 Original wavefront data does not need to be copied back to CPU if (RadAccessData.pBaseRadZ != NULL) - RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float)); + RadAccessData.pBaseRadZ = (float*)CAuxGPU::ToHostAndFree(pGPU, RadAccessData.pBaseRadZ, 2*RadAccessData.ne*RadAccessData.nx*RadAccessData.nz*sizeof(float), true); //HG13012024 Original wavefront data does not need to be copied back to CPU + +#ifdef _DEBUG cudaStreamSynchronize(0); auto err = cudaGetLastError(); printf("%s\r\n", cudaGetErrorString(err)); diff --git a/cpp/src/core/srradstr.cpp b/cpp/src/core/srradstr.cpp index f7c7a87d..2e2279bb 100644 --- a/cpp/src/core/srradstr.cpp +++ b/cpp/src/core/srradstr.cpp @@ -2607,7 +2607,8 @@ void srTSRWRadStructAccessData::AddStokesAtPoint(srTEXZ& EXZ, float* pStokesVal) //************************************************************************* -void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc) +//void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc) +void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, double newZc, void *pvGPU) //HG12012024 { const double ratAllowSubtract = 0.2; const double minNumOptCycles = 10; @@ -2666,12 +2667,14 @@ void srTSRWRadStructAccessData::CheckAndSubtractPhaseTermsLin(double newXc, doub if((xMult == 0) && (zMult == 0)) return; - MultiplyElFieldByPhaseLin(xMult, zMult); + //MultiplyElFieldByPhaseLin(xMult, zMult); + MultiplyElFieldByPhaseLin(xMult, zMult, pvGPU); //HG12012024 } //************************************************************************* -void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() +//void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() +void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin(void* pvGPU) //HG12012024 { if((!m_xLinOnlyPhaseTermWasSubtracted) && (!m_zLinOnlyPhaseTermWasSubtracted)) return; @@ -2695,7 +2698,8 @@ void srTSRWRadStructAccessData::CheckAndResetPhaseTermsLin() if((xMult == 0) && (zMult == 0)) return; - MultiplyElFieldByPhaseLin(xMult, zMult); + //MultiplyElFieldByPhaseLin(xMult, zMult); + MultiplyElFieldByPhaseLin(xMult, zMult, pvGPU); //HG12012024 } //************************************************************************* diff --git a/cpp/src/core/srradstr.h b/cpp/src/core/srradstr.h index dee50e5f..9e693e54 100644 --- a/cpp/src/core/srradstr.h +++ b/cpp/src/core/srradstr.h @@ -243,8 +243,10 @@ class srTSRWRadStructAccessData : public CGenObject { int FindAverageDistanceToSource(srTTrjDat& TrjDat, double& Robs, double& RobsAbsErr, double& xElAtYsrc, double& zElAtYsrc, double* precPar); void AddStokesAtPoint(srTEXZ& EXZ, float* pStokesVal); - void CheckAndSubtractPhaseTermsLin(double newXc, double newZc); - void CheckAndResetPhaseTermsLin(); + //void CheckAndSubtractPhaseTermsLin(double newXc, double newZc); + void CheckAndSubtractPhaseTermsLin(double newXc, double newZc, void* pvGPU=0); //HG12012024 + //void CheckAndResetPhaseTermsLin(); + void CheckAndResetPhaseTermsLin(void* pvGPU=0); //HG12012024 void EstimateOversamplingFactors(double& estimOverSampX, double& estimOverSampZ); void MirrorFieldData(int sx, int sz, void* pvGPU=0); //OC28072023 diff --git a/cpp/src/ext/genmath/gmfft.h b/cpp/src/ext/genmath/gmfft.h index 09ae0bca..e6a84d1e 100644 --- a/cpp/src/ext/genmath/gmfft.h +++ b/cpp/src/ext/genmath/gmfft.h @@ -190,6 +190,14 @@ class CGenMathFFT2D : public CGenMathFFT { #endif } +#ifdef _OFFLOAD_GPU //HG13012024 + ~CGenMathFFT2D() + { + if(Plan2DFFT_cu != 0) cufftDestroy(Plan2DFFT_cu); + if(dPlan2DFFT_cu != 0) cufftDestroy(dPlan2DFFT_cu); + } +#endif + //int Make2DFFT(CGenMathFFT2DInfo&); //Modification by S.Yakubov for parallelizing SRW via OpenMP: #ifdef _FFTW3 //28012019 @@ -604,6 +612,14 @@ class CGenMathFFT1D : public CGenMathFFT { #endif } +#ifdef _OFFLOAD_GPU //HG13012024 + ~CGenMathFFT1D() + { + if(Plan1DFFT_cu != 0) cufftDestroy(Plan1DFFT_cu); + if(dPlan1DFFT_cu != 0) cufftDestroy(dPlan1DFFT_cu); + } +#endif + int Make1DFFT(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023 int Make1DFFT_InPlace(CGenMathFFT1DInfo& FFT1DInfo, void* pvGPU=0); //OC05092023