Finally, a working implementation of particle near surface in FFT mode.

- additional required memory was described in comments in calculator.c - definition of ZsumShift is now different for sparse and FFT modes in parallel (relative to the local or global bottom dipoles respectively). Related difference is in definition of local_Nz_Rm (used for Sommerfeld table and, in FFT mode, for filling Rmatrix). - Sommerfeld table is now not calculated during prognosis, but corresponding memory is properly accounted for. - Current implementation requires additional forward fftY and fftZ at each iteration (but see issue 177). - BlockTranspose_Dm in comm.c was renamed into BlockTranspose_DRm, since it works both for D- and Rmatrix. - existing functions to index Dmatrix has been slightly changed to not use DsizeYZ and use y>=DsizeY instead of y>smallY. - functions and plans, not specific to Dmatrix were renamed (Y and Z forward transforms), now they have 'slice' in their names. - transposeYZ_Dm was changed to transpose (a general function), and transposeYZ is now just a wrapper around the latter. - version incremented to 1.3b1. Main computational bottleneck is computation of the table of Sommerfeld integrals. Currently it is approximately equivalent to 200 iterations (issue 176). So should not be a problem for applications with a larger number of iterations. New implementation was extensively tested against previous sparse version. Still, quantitative comparison with published literature data is required. Remaining limitation is that it doesn't work in OpenCL mode (see issue 101 for details, explicit exception was added to param.c). Changes to timing: - added time for 'init interaction' (significant when Sommerfeld table is calculated). - Time for 'init Dmatrix' now may include the time for initialization of Rmatrix (doesn't include time for Sommerfeld table). - Precise timing for Dmatrix now includes a separate line for initialization of Rmatrix (without details) Other: - scattering at exactly 90 degrees (along surface) for non-trivial surfaces is now handled by a special case to produce exact 0. This makes the output consistent, avoiding large integration errors for -phi_integr or alldir. - added explicit exception to forbid combination of -no_reduced_fft and -iter cgnr in MPI FFT mode (issue 174). Tests in tests/2exec/ have been significantly improved - added SURF_EXT flag, which allows running a large suite of tests, adding '-surf ...' option to each of them. - added a number of ignores, which are always active (decreases false-positives) - added a couple of macros in suite files (NOMPI and NOMPISEQ), which indicates that the line should be skipped for a specific comparison modes. - added a number of tests to the default suite: '-surf ...', '-int_surf', -scat_plane, '-shape read ... -grid ...', '-no_reduced_fft -iter cgnr' (see above)
adda-team · Sep 25, 2013 · 3fd68c9 · 3fd68c9
1 parent c870ace
commit 3fd68c9
Show file tree

Hide file tree

Showing 19 changed files with 606 additions and 249 deletions.
diff --git a/src/GenerateB.c b/src/GenerateB.c
@@ -43,7 +43,7 @@ extern const int beam_Npars;
 extern const double beam_pars[];
 extern const char *beam_fnameY;
 extern const char *beam_fnameX;
-extern opt_index opt_beam;
+extern const opt_index opt_beam;
 
 // used in crosssec.c
 double beam_center_0[3]; // position of the beam center in laboratory reference frame

diff --git a/src/calculator.c b/src/calculator.c
@@ -43,7 +43,7 @@ extern const angle_set beta_int,gamma_int,theta_int,phi_int;
 extern const int avg_inc_pol;
 extern const char *alldir_parms,*scat_grid_parms;
 // defined and initialized in timing.c
-extern TIME_TYPE Timing_Init;
+extern TIME_TYPE Timing_Init,Timing_Init_Int;
 #ifdef OPENCL
 extern TIME_TYPE Timing_OCL_Init;
 #endif
@@ -445,14 +445,17 @@ static void AllocateEverything(void)
 	/* estimate of the memory (only the fastest scaling part):
 	 * MatVec - (288+384nprocs/boxX [+192/nprocs])*Ndip
 	 *          more exactly: gridX*gridY*gridZ*(36+48nprocs/boxX [+24/nprocs]) value in [] is only for parallel mode.
-	 *          For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory
+	 * For surf additionally: gridX*gridY*gridZ*(48+48nprocs/boxX)
+	 * 			+ for Sommerfeld table: 64*boxZ*(boxX*boxY-(MIN(boxX,boxY))^2)
+	 *    For OpenCL mode all MatVec part is allocated on GPU instead of main (CPU) memory
 	 * others - nvoid_Ndip*{271(CGNR,BiCG), 367(CSYM,QMR2), 415(BiCGStab,QMR), or 463(BCGS2)}
 	 *          + additional 8*nvoid_Ndip for OpenCL mode and CGNR or Bi-CGSTAB
 	 * PARALLEL: above is total; division over processors of MatVec is uniform, others - according to local_nvoid_Ndip
 	 *
 	 * Sparse mode - each processor needs (265--457, depending on iterative solver)*local_nvoid_Ndip + 60*nvoid_Ndip
 	 *               and division is uniform, i.e. local_nvoid_Ndip = nvoid_Ndip/nprocs
-	 *               Part of the memory is currently not distributed among processors - see issue 160.
+	 *               Sommerfeld table - same as above, but it is not divided among processors.
+	 *               Part of the memory is currently not distributed among processors - see issues 160,175.
 	 */
 	MAXIMIZE(memPeak,memory);
 	double memSum=AccumulateMax(memPeak,&memmax);
@@ -592,7 +595,9 @@ void Calculator (void)
 	else dtheta_deg=dtheta_rad=block_theta=0;
 	finish_avg=false;
 	// Do preliminary setup for MatVec
+	TIME_TYPE startInitInt=GET_TIME();
 	InitInteraction();
+	Timing_Init_Int=GET_TIME()-startInitInt;
 #ifndef SPARSE
 	// initialize D matrix (for matrix-vector multiplication)
 	D("InitDmatrix started");

diff --git a/src/comm.c b/src/comm.c
@@ -676,10 +676,10 @@ void BlockTranspose(doublecomplex * restrict X UOIP,TIME_TYPE *timing UOIP)
 
 //======================================================================================================================
 
-void BlockTranspose_Dm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP)
-/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D matrix. It can be updated to
- * accept timing argument for generality. But, since this is a specialized function, we keep the timing variable
- * hard-wired in the code.
+void BlockTranspose_DRm(doublecomplex * restrict X UOIP,const size_t lengthY UOIP,const size_t lengthZ UOIP)
+/* do the data-transposition, i.e. exchange, between fftX and fftY&fftZ; specialized for D or R matrix. It can be
+ * updated to accept timing argument for generality. But, since this is a specialized function, we keep the timing
+ * variable hard-wired in the code.
  */
 {
 #ifdef ADDA_MPI

diff --git a/src/comm.h b/src/comm.h
@@ -45,7 +45,7 @@ void ReadField(const char * restrict fname,doublecomplex *restrict field);
 
 #ifndef SPARSE
 void BlockTranspose(doublecomplex * restrict X,TIME_TYPE *timing);
-void BlockTranspose_Dm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
+void BlockTranspose_DRm(doublecomplex * restrict X,size_t lengthY,size_t lengthZ);
 // used by granule generator
 void SetGranulComm(double z0,double z1,double gdZ,int gZ,size_t gXY,size_t buf_size,int *lz0,int *lz1,int sm_gr);
 void CollectDomainGranul(unsigned char * restrict dom,size_t gXY,int lz0,int locgZ,TIME_TYPE *timing);

diff --git a/src/const.h b/src/const.h
@@ -18,7 +18,7 @@
 #define __const_h
 
 // version number (string)
-#define ADDA_VERSION "1.3a2"
+#define ADDA_VERSION "1.3b1"
 
 /* ADDA uses certain C99 extensions, which are widely supported by GNU and Intel compilers. However, they may be not
  * completely supported by e.g. Microsoft Visual Studio compiler. Therefore, we check the version of the standard here

diff --git a/src/crosssec.c b/src/crosssec.c
@@ -631,6 +631,11 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr
 	if (above) { // simple reflection
 		vCopy(nF,nN);
 		nN[2]*=-1;
+		// no scattering at exactly 90 degrees for non-trivial surface (to avoid randomness for this case)
+		if (fabs(nN[2])<ROUND_ERR && cabs(msub-1)>ROUND_ERR) {
+			cvInit(ebuff);
+			return;
+		}
 	}
 	else { // transmission
 		if (msubInf) { // no transmission for perfectly reflecting substrate => zero result
@@ -738,11 +743,13 @@ static void CalcFieldSurf(doublecomplex ebuff[static restrict 3], // where to wr
 		kt=NAN; // redundant to remove warnings below
 	}
 	else {
-		if (cabs(msub-1)<ROUND_ERR && fabs(ki)<ROUND_ERR) kt=ki; // special case to avoid randomness due to round-off errors
+		// special case to avoid randomness due to round-off errors
+		if (cabs(msub-1)<ROUND_ERR && fabs(ki)<ROUND_ERR) kt=ki;
 		else kt=cSqrtCut(msub*msub - (nN[0]*nN[0]+nN[1]*nN[1]));
 		if (above) {
 			/* here we test only the exact zero, since for other cases (when msub=1, and very small values of ki,kt) the
-			 * above assignment kt=ki guarantees correct results through standard functions
+			 * above assignment kt=ki guarantees correct results through standard functions. Also the case of
+			 * 90deg-scattering (for msub!=1) is taken care of in the beginning of this function.
 			 */
 			if (ki==0 && kt==0) cs=cp=0;
 			else {