Skip to content

Commit

Permalink
gpu a100 tuning, verify performance on cpu (m1)
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Oct 22, 2023
1 parent 9943958 commit 6d24604
Show file tree
Hide file tree
Showing 3 changed files with 217 additions and 83 deletions.
102 changes: 102 additions & 0 deletions benchmarks/stencil_tensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
import gpt as g
#grid = g.grid([64,64,64,64], g.double)
#grid = g.grid([32,32,32,32], g.double)
grid = g.grid([32,16,16,16], g.double)
m1 = g.mcolor(grid)
m2 = g.mcolor(grid)
m3 = g.mcolor(grid)
rng = g.random("D")
rng.cnormal([m1,m2,m3])
m3ref = g(m1*m2)
code = []
ti = g.stencil.tensor_instructions

for i in range(3):
for j in range(3):
for l in range(3):
dst = 3*i + j
code.append(
(0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)])
)

ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code) // 9)

ein.memory_access_pattern(fast_osites=-3)

ein(m3,m1,m2)
g.message(g.norm2(m3 - m3ref))


for block_size in [1,4,8,-1,-4,-5,-8]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)
t=g.timer("d")
t("expr")
for i in range(300):
g.eval(m3,m1*m2)
t("stencil")
for i in range(300):
ein(m3,m1,m2)
t()
g.message(t)
g.message(g.norm2(m3 - m3ref))


# D_{a2,a1} = epsilon_{a1,b1,c1}*epsilon_{a2,b2,c2}*spin_transpose(Q1_{b1,b2})*Q2_{c1,c2}
Q1 = g.mspincolor(grid)
Q2 = g.mspincolor(grid)
rng.cnormal([Q1,Q2])
eps = g.epsilon(Q1.otype.shape[2])
code = []
acc = {}
for i in range(4):
for j in range(4):
for l in range(4):
for i1, sign1 in eps:
for i2, sign2 in eps:
dst = (i*4 + j)*9 + i2[0]*3 + i1[0]
aa = (4*i + l)*9 + i1[1]*3 + i2[1]
bb = (4*j + l)*9 + i1[2]*3 + i2[2]
if dst not in acc:
acc[dst] = True
mode = ti.mov if sign1 * sign2 > 0 else ti.mov_neg
else:
mode = ti.inc if sign1 * sign2 > 0 else ti.dec
assert dst >= 0 and dst < 12*12
assert aa >= 0 and aa < 12*12
assert bb >= 0 and bb < 12*12
code.append(
(0,dst,mode,1.0,[(1,0,aa),(2,0,bb)])
)

g.message(len(code))
ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code)

R = g.mspincolor(grid)
R[:] = 0
ein(R, Q1, Q2)

R2 = g.qcd.baryon.diquark(Q1,Q2)

g.message(g.norm2(R - R2) / g.norm2(R))
#
# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])


for block_size in [1,4,8,-1,-4,-5,-8]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)

t=g.timer("d")
t("diquark")
for i in range(30):
g.qcd.baryon.diquark(Q1,Q2)
t("stencil")
for i in range(30):
ein(R, Q1, Q2)
t()
g.message(t)

166 changes: 91 additions & 75 deletions lib/cgpt/lib/stencil/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,18 +139,17 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
int n_code = code.size();
const cgpt_stencil_tensor_code_offload_t* p_code = &code[0];

typedef decltype(coalescedRead(fields_v[0][0])) obj_t;
typedef decltype(coalescedReadElement(fields_v[0][0],0)) element_t;
typedef typename obj_t::scalar_type coeff_t;
typedef typename T::scalar_type coeff_t;

int nd = fields[0].Grid()->Nd();

int _npb = n_code_parallel_blocks;
int _npbs = n_code_parallel_block_size;

uint64_t osites = fields[0].Grid()->oSites();
uint64_t osite_blocks = osites;

int _fast_osites = fast_osites;
int _fast_osites, BLOCK_SIZE;

if (local) {

Expand All @@ -160,36 +159,13 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {

CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);

// now loop
uint64_t osites = fields[0].Grid()->oSites();

ASSERT(_npb == 1);

#define BLOCK_SIZE 16

// TODO: not-divisible by block_size fix
ASSERT(osites % BLOCK_SIZE == 0);


accelerator_for(ss_block,osites * _npb / BLOCK_SIZE,T::Nsimd(),{

uint64_t ss, oblock;

MAP_INDEXING(ss, oblock);

#define NN (sizeof(obj_t) / sizeof(element_t))

for (int iblock=0;iblock<_npbs;iblock++) {

int i = oblock * _npbs + iblock;

const auto _p = &p_code[i];
const auto _f0 = &_p->factor[0];
const auto _f1 = &_p->factor[1];

element_t* e_a = (element_t*)&fields_v[_f0->index][BLOCK_SIZE * ss];
element_t* e_b = (element_t*)&fields_v[_f1->index][BLOCK_SIZE * ss];
element_t* e_c = (element_t*)&fields_v[_p->target][BLOCK_SIZE * ss];
if (fast_osites > 0) {
BLOCK_SIZE = fast_osites;
_fast_osites = 1;
} else {
BLOCK_SIZE = -fast_osites;
_fast_osites = 0;
}

#define TC_MOV 0
#define TC_INC 1
Expand All @@ -200,47 +176,89 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
#define TC_MOV_NEG_CC 6
#define TC_DEC_CC 7
#define TC_MUL 8

#define _ID(a) a
#define _CONJ(a) adj(a)
#define _INC(a,b,c) a + b*c
#define _MOV(a,b,c) b*c
#define _MOV_NEG(a,b,c) - b*c
#define _DEC(a,b,c) a - b*c
#define _MUL(a,b,c) a*((coeff_t)_p->weight)
#define KERNEL(composition, mod_first) \
for (int ff=0;ff < BLOCK_SIZE;ff++) \
coalescedWriteElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], \
composition(coalescedReadElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], _p->element), \
mod_first(coalescedReadElement(fields_v[_f0->index][BLOCK_SIZE * ss + ff], _f0->element)), \
coalescedReadElement(fields_v[_f1->index][BLOCK_SIZE * ss + ff], _f1->element)), \
_p->element);


osite_blocks = osites / BLOCK_SIZE;

for (int iter=0;iter<((osites % BLOCK_SIZE == 0) ? 1 : 2);iter++) {

uint64_t osite_offset = (iter == 0) ? 0 : osite_blocks * BLOCK_SIZE;
if (iter == 1) {
BLOCK_SIZE = 1;
osite_blocks = osites - osite_offset;
}

accelerator_forNB(ss_block,osite_blocks * _npb,T::Nsimd(),{

#define ID(a) a
#define CONJ(a) adj(a)
#define KERNEL(signature, functor) \
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[NN * ff + _p->element] signature functor(e_a[NN * ff +_f0->element]) * e_b[NN * ff + _f1->element];

switch (_p->instruction) {
case TC_INC:
KERNEL(+=,ID);
break;
case TC_MOV:
KERNEL(=,ID);
break;
case TC_DEC:
KERNEL(-=,ID);
break;
case TC_MOV_NEG:
KERNEL(=-,ID);
break;
case TC_INC_CC:
KERNEL(+=,CONJ);
break;
case TC_MOV_CC:
KERNEL(=,CONJ);
break;
case TC_DEC_CC:
KERNEL(-=,CONJ);
break;
case TC_MOV_NEG_CC:
KERNEL(=-,CONJ);
break;
case TC_MUL:
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[NN * ff + _p->element] *= ((coeff_t)_p->weight);
break;
uint64_t ss, oblock;

if (_fast_osites) {
oblock = ss_block / osite_blocks;
ss = osite_offset + ss_block % osite_blocks;
} else {
ss = osite_offset + ss_block / _npb;
oblock = ss_block % _npb;
}
}

});


for (int iblock=0;iblock<_npbs;iblock++) {

int i = oblock * _npbs + iblock;

const auto _p = &p_code[i];
const auto _f0 = &_p->factor[0];
const auto _f1 = &_p->factor[1];

switch (_p->instruction) {
case TC_INC:
KERNEL(_INC,_ID);
break;
case TC_MOV:
KERNEL(_MOV,_ID);
break;
case TC_DEC:
KERNEL(_DEC,_ID);
break;
case TC_MOV_NEG:
KERNEL(_MOV_NEG,_ID);
break;
case TC_INC_CC:
KERNEL(_INC,_CONJ);
break;
case TC_MOV_CC:
KERNEL(_MOV,_CONJ);
break;
case TC_DEC_CC:
KERNEL(_DEC,_CONJ);
break;
case TC_MOV_NEG_CC:
KERNEL(_MOV_NEG,_CONJ);
break;
case TC_MUL:
KERNEL(_MUL,_ID);
break;
}
}

});
}

accelerator_barrier();

// and cleanup
CGPT_CARTESIAN_STENCIL_CLEANUP(T,);

Expand Down Expand Up @@ -273,8 +291,6 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) {
out.weight = get_complex(in, "weight");

cgpt_convert(get_key(in, "factor"), out.factor);

//
}

template<typename T>
Expand Down
32 changes: 24 additions & 8 deletions lib/gpt/qcd/fermion/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,40 @@ def register(reg, op):
reg.Mdiag = lambda dst, src: op.apply_unary_operator(2009, dst, src)
reg.Dminus = lambda dst, src: op.apply_unary_operator(2010, dst, src)
reg.DminusDag = lambda dst, src: op.apply_unary_operator(2011, dst, src)
reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2012, dst, src)
reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(2013, dst, src)
reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(2014, dst, src)
reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2015, dst, src)
reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
2012, dst, src
)
reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(
2013, dst, src
)
reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(
2014, dst, src
)
reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
2015, dst, src
)
reg.Dhop = lambda dst, src: op.apply_unary_operator(3001, dst, src)
reg.DhopDag = lambda dst, src: op.apply_unary_operator(4001, dst, src)
reg.DhopEO = lambda dst, src: op.apply_unary_operator(3002, dst, src)
reg.DhopEODag = lambda dst, src: op.apply_unary_operator(4002, dst, src)
reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(5001, dst, src, dir, disp)
reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(
5001, dst, src, dir, disp
)
reg.MDeriv = lambda mat, dst, src: op.apply_deriv_operator(6001, mat, dst, src)
reg.MDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7001, mat, dst, src)
reg.MoeDeriv = lambda mat, dst, src: op.apply_deriv_operator(6002, mat, dst, src)
reg.MoeDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7002, mat, dst, src)
reg.MeoDeriv = lambda mat, dst, src: op.apply_deriv_operator(6003, mat, dst, src)
reg.MeoDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7003, mat, dst, src)
reg.DhopDeriv = lambda mat, dst, src: op.apply_deriv_operator(6004, mat, dst, src)
reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7004, mat, dst, src)
reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(
7004, mat, dst, src
)
reg.DhopDerivEO = lambda mat, dst, src: op.apply_deriv_operator(6005, mat, dst, src)
reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(7005, mat, dst, src)
reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(
7005, mat, dst, src
)
reg.DhopDerivOE = lambda mat, dst, src: op.apply_deriv_operator(6006, mat, dst, src)
reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(7006, mat, dst, src)
reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(
7006, mat, dst, src
)

0 comments on commit 6d24604

Please sign in to comment.