diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py new file mode 100755 index 00000000..4485a289 --- /dev/null +++ b/benchmarks/stencil_tensor.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +import gpt as g +#grid = g.grid([64,64,64,64], g.double) +#grid = g.grid([32,32,32,32], g.double) +grid = g.grid([32,16,16,16], g.double) +m1 = g.mcolor(grid) +m2 = g.mcolor(grid) +m3 = g.mcolor(grid) +rng = g.random("D") +rng.cnormal([m1,m2,m3]) +m3ref = g(m1*m2) +code = [] +ti = g.stencil.tensor_instructions + +for i in range(3): + for j in range(3): + for l in range(3): + dst = 3*i + j + code.append( + (0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)]) + ) + +ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code) // 9) + +ein.memory_access_pattern(fast_osites=-3) + +ein(m3,m1,m2) +g.message(g.norm2(m3 - m3ref)) + + +for block_size in [1,4,8,-1,-4,-5,-8]: + ein.memory_access_pattern(fast_osites=block_size) + + g.message(block_size) + t=g.timer("d") + t("expr") + for i in range(300): + g.eval(m3,m1*m2) + t("stencil") + for i in range(300): + ein(m3,m1,m2) + t() + g.message(t) + g.message(g.norm2(m3 - m3ref)) + + +# D_{a2,a1} = epsilon_{a1,b1,c1}*epsilon_{a2,b2,c2}*spin_transpose(Q1_{b1,b2})*Q2_{c1,c2} +Q1 = g.mspincolor(grid) +Q2 = g.mspincolor(grid) +rng.cnormal([Q1,Q2]) +eps = g.epsilon(Q1.otype.shape[2]) +code = [] +acc = {} +for i in range(4): + for j in range(4): + for l in range(4): + for i1, sign1 in eps: + for i2, sign2 in eps: + dst = (i*4 + j)*9 + i2[0]*3 + i1[0] + aa = (4*i + l)*9 + i1[1]*3 + i2[1] + bb = (4*j + l)*9 + i1[2]*3 + i2[2] + if dst not in acc: + acc[dst] = True + mode = ti.mov if sign1 * sign2 > 0 else ti.mov_neg + else: + mode = ti.inc if sign1 * sign2 > 0 else ti.dec + assert dst >= 0 and dst < 12*12 + assert aa >= 0 and aa < 12*12 + assert bb >= 0 and bb < 12*12 + code.append( + (0,dst,mode,1.0,[(1,0,aa),(2,0,bb)]) + ) + +g.message(len(code)) +ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code) + +R = g.mspincolor(grid) +R[:] = 0 +ein(R, Q1, Q2) + +R2 = g.qcd.baryon.diquark(Q1,Q2) + +g.message(g.norm2(R - R2) / g.norm2(R)) +# +# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]]) + + +for block_size in [1,4,8,-1,-4,-5,-8]: + ein.memory_access_pattern(fast_osites=block_size) + + g.message(block_size) + + t=g.timer("d") + t("diquark") + for i in range(30): + g.qcd.baryon.diquark(Q1,Q2) + t("stencil") + for i in range(30): + ein(R, Q1, Q2) + t() + g.message(t) + diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h index 991d4711..def6cd45 100644 --- a/lib/cgpt/lib/stencil/tensor.h +++ b/lib/cgpt/lib/stencil/tensor.h @@ -139,9 +139,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { int n_code = code.size(); const cgpt_stencil_tensor_code_offload_t* p_code = &code[0]; - typedef decltype(coalescedRead(fields_v[0][0])) obj_t; - typedef decltype(coalescedReadElement(fields_v[0][0],0)) element_t; - typedef typename obj_t::scalar_type coeff_t; + typedef typename T::scalar_type coeff_t; int nd = fields[0].Grid()->Nd(); @@ -149,8 +147,9 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { int _npbs = n_code_parallel_block_size; uint64_t osites = fields[0].Grid()->oSites(); + uint64_t osite_blocks = osites; - int _fast_osites = fast_osites; + int _fast_osites, BLOCK_SIZE; if (local) { @@ -160,36 +159,13 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,); - // now loop - uint64_t osites = fields[0].Grid()->oSites(); - - ASSERT(_npb == 1); - -#define BLOCK_SIZE 16 - - // TODO: not-divisible by block_size fix - ASSERT(osites % BLOCK_SIZE == 0); - - - accelerator_for(ss_block,osites * _npb / BLOCK_SIZE,T::Nsimd(),{ - - uint64_t ss, oblock; - - MAP_INDEXING(ss, oblock); - -#define NN (sizeof(obj_t) / sizeof(element_t)) - - for (int iblock=0;iblock<_npbs;iblock++) { - - int i = oblock * _npbs + iblock; - - const auto _p = &p_code[i]; - const auto _f0 = &_p->factor[0]; - const auto _f1 = &_p->factor[1]; - - element_t* e_a = (element_t*)&fields_v[_f0->index][BLOCK_SIZE * ss]; - element_t* e_b = (element_t*)&fields_v[_f1->index][BLOCK_SIZE * ss]; - element_t* e_c = (element_t*)&fields_v[_p->target][BLOCK_SIZE * ss]; + if (fast_osites > 0) { + BLOCK_SIZE = fast_osites; + _fast_osites = 1; + } else { + BLOCK_SIZE = -fast_osites; + _fast_osites = 0; + } #define TC_MOV 0 #define TC_INC 1 @@ -200,47 +176,89 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { #define TC_MOV_NEG_CC 6 #define TC_DEC_CC 7 #define TC_MUL 8 + +#define _ID(a) a +#define _CONJ(a) adj(a) +#define _INC(a,b,c) a + b*c +#define _MOV(a,b,c) b*c +#define _MOV_NEG(a,b,c) - b*c +#define _DEC(a,b,c) a - b*c +#define _MUL(a,b,c) a*((coeff_t)_p->weight) +#define KERNEL(composition, mod_first) \ + for (int ff=0;ff < BLOCK_SIZE;ff++) \ + coalescedWriteElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], \ + composition(coalescedReadElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], _p->element), \ + mod_first(coalescedReadElement(fields_v[_f0->index][BLOCK_SIZE * ss + ff], _f0->element)), \ + coalescedReadElement(fields_v[_f1->index][BLOCK_SIZE * ss + ff], _f1->element)), \ + _p->element); + + + osite_blocks = osites / BLOCK_SIZE; + + for (int iter=0;iter<((osites % BLOCK_SIZE == 0) ? 1 : 2);iter++) { + + uint64_t osite_offset = (iter == 0) ? 0 : osite_blocks * BLOCK_SIZE; + if (iter == 1) { + BLOCK_SIZE = 1; + osite_blocks = osites - osite_offset; + } + + accelerator_forNB(ss_block,osite_blocks * _npb,T::Nsimd(),{ -#define ID(a) a -#define CONJ(a) adj(a) -#define KERNEL(signature, functor) \ - for (int ff=0;ffelement] signature functor(e_a[NN * ff +_f0->element]) * e_b[NN * ff + _f1->element]; - - switch (_p->instruction) { - case TC_INC: - KERNEL(+=,ID); - break; - case TC_MOV: - KERNEL(=,ID); - break; - case TC_DEC: - KERNEL(-=,ID); - break; - case TC_MOV_NEG: - KERNEL(=-,ID); - break; - case TC_INC_CC: - KERNEL(+=,CONJ); - break; - case TC_MOV_CC: - KERNEL(=,CONJ); - break; - case TC_DEC_CC: - KERNEL(-=,CONJ); - break; - case TC_MOV_NEG_CC: - KERNEL(=-,CONJ); - break; - case TC_MUL: - for (int ff=0;ffelement] *= ((coeff_t)_p->weight); - break; + uint64_t ss, oblock; + + if (_fast_osites) { + oblock = ss_block / osite_blocks; + ss = osite_offset + ss_block % osite_blocks; + } else { + ss = osite_offset + ss_block / _npb; + oblock = ss_block % _npb; } - } - - }); - + + for (int iblock=0;iblock<_npbs;iblock++) { + + int i = oblock * _npbs + iblock; + + const auto _p = &p_code[i]; + const auto _f0 = &_p->factor[0]; + const auto _f1 = &_p->factor[1]; + + switch (_p->instruction) { + case TC_INC: + KERNEL(_INC,_ID); + break; + case TC_MOV: + KERNEL(_MOV,_ID); + break; + case TC_DEC: + KERNEL(_DEC,_ID); + break; + case TC_MOV_NEG: + KERNEL(_MOV_NEG,_ID); + break; + case TC_INC_CC: + KERNEL(_INC,_CONJ); + break; + case TC_MOV_CC: + KERNEL(_MOV,_CONJ); + break; + case TC_DEC_CC: + KERNEL(_DEC,_CONJ); + break; + case TC_MOV_NEG_CC: + KERNEL(_MOV_NEG,_CONJ); + break; + case TC_MUL: + KERNEL(_MUL,_ID); + break; + } + } + + }); + } + + accelerator_barrier(); + // and cleanup CGPT_CARTESIAN_STENCIL_CLEANUP(T,); @@ -273,8 +291,6 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) { out.weight = get_complex(in, "weight"); cgpt_convert(get_key(in, "factor"), out.factor); - - // } template diff --git a/lib/gpt/qcd/fermion/register.py b/lib/gpt/qcd/fermion/register.py index 93d42a0a..8f065af2 100644 --- a/lib/gpt/qcd/fermion/register.py +++ b/lib/gpt/qcd/fermion/register.py @@ -11,15 +11,25 @@ def register(reg, op): reg.Mdiag = lambda dst, src: op.apply_unary_operator(2009, dst, src) reg.Dminus = lambda dst, src: op.apply_unary_operator(2010, dst, src) reg.DminusDag = lambda dst, src: op.apply_unary_operator(2011, dst, src) - reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2012, dst, src) - reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(2013, dst, src) - reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(2014, dst, src) - reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2015, dst, src) + reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator( + 2012, dst, src + ) + reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator( + 2013, dst, src + ) + reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator( + 2014, dst, src + ) + reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator( + 2015, dst, src + ) reg.Dhop = lambda dst, src: op.apply_unary_operator(3001, dst, src) reg.DhopDag = lambda dst, src: op.apply_unary_operator(4001, dst, src) reg.DhopEO = lambda dst, src: op.apply_unary_operator(3002, dst, src) reg.DhopEODag = lambda dst, src: op.apply_unary_operator(4002, dst, src) - reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(5001, dst, src, dir, disp) + reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator( + 5001, dst, src, dir, disp + ) reg.MDeriv = lambda mat, dst, src: op.apply_deriv_operator(6001, mat, dst, src) reg.MDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7001, mat, dst, src) reg.MoeDeriv = lambda mat, dst, src: op.apply_deriv_operator(6002, mat, dst, src) @@ -27,8 +37,14 @@ def register(reg, op): reg.MeoDeriv = lambda mat, dst, src: op.apply_deriv_operator(6003, mat, dst, src) reg.MeoDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7003, mat, dst, src) reg.DhopDeriv = lambda mat, dst, src: op.apply_deriv_operator(6004, mat, dst, src) - reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7004, mat, dst, src) + reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator( + 7004, mat, dst, src + ) reg.DhopDerivEO = lambda mat, dst, src: op.apply_deriv_operator(6005, mat, dst, src) - reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(7005, mat, dst, src) + reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator( + 7005, mat, dst, src + ) reg.DhopDerivOE = lambda mat, dst, src: op.apply_deriv_operator(6006, mat, dst, src) - reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(7006, mat, dst, src) + reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator( + 7006, mat, dst, src + )