JIT: Improve block unrolling, enable AVX-512 #85501

Merged: 11 commits, May 3, 2023
Changes from 9 commits
75 changes: 23 additions & 52 deletions src/coreclr/jit/codegenxarch.cpp
@@ -3134,32 +3134,17 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
// INITBLK zeroes a struct that contains GC pointers and can be observed by
// other threads (i.e. when dstAddr is not an address of a local).
// For example, this can happen when initializing a struct field of an object.
const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();

#ifdef TARGET_AMD64
// On Amd64 the JIT will not use SIMD stores for such structs and instead
// will always allocate a GP register for src node.
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
#else
// On X86 the JIT will use movq for structs that are larger than 16 bytes
// since it is more beneficial than using two mov-s from a GP register.
const bool willUseSimdMov = (size >= 16);
#endif
const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);

if (!src->isContained())
{
srcIntReg = genConsumeReg(src);
}
else
{
// If src is contained then it must be 0.
assert(src->IsIntegralConst(0));
assert(willUseSimdMov);
#ifdef TARGET_AMD64
assert(size >= XMM_REGSIZE_BYTES);
#else
assert(size % 8 == 0);
#endif
}

emitter* emit = GetEmitter();
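
(Illustrative note, not part of the diff.) The unified predicate above folds the old AMD64/x86 split into a single check. A minimal standalone model of that decision, with plain booleans standing in for the JIT's IsOnHeapAndContainsReferences() and IsBaselineSimdIsaSupported() queries and 16 hard-coded for XMM_REGSIZE_BYTES:

#include <cstdio>

// Sketch of the SIMD-eligibility check for an unrolled initblk (names are stand-ins).
static bool WillUseSimdMov(bool onHeapWithGcRefs, bool baselineSimdSupported, unsigned size)
{
    const unsigned kXmmRegSizeBytes = 16;

    // Per the comment above, SIMD stores are avoided when the struct contains GC pointers
    // and can be observed by other threads (i.e. the destination is on the heap).
    const bool canUse16BytesSimdMov = !onHeapWithGcRefs && baselineSimdSupported;

    // Only profitable once at least one full XMM-sized store fits.
    return canUse16BytesSimdMov && (size >= kXmmRegSizeBytes);
}

int main()
{
    printf("%d\n", WillUseSimdMov(false, true, 40)); // 1: local struct, SIMD unroll
    printf("%d\n", WillUseSimdMov(true, true, 40));  // 0: heap object with GC refs, GP register stores
    return 0;
}
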
@@ -3178,14 +3163,24 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
regSize = min(regSize, YMM_REGSIZE_BYTES);
}

bool zeroing = false;
if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
{
// If the source is constant 0 then always use xorps; it's faster
// than copying the constant from a GPR to an XMM register.
emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg);
zeroing = true;
}
#ifdef FEATURE_SIMD
else if (src->gtSkipReloadOrCopy()->IsIntegralConst())
{
// Populate a constant vector from the fill value and save it
// to the data section so we can load it by address.
assert(regSize <= ZMM_REGSIZE_BYTES);
simd64_t constValue;
memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t));
var_types loadType = compiler->getSIMDTypeForSize(regSize);
genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue));
}
#endif
else
{
// TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
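
(Illustrative note, not part of the diff.) For a non-zero constant fill, the path above broadcasts the low byte of the value across a full-width vector constant and loads it by address from the data section. A minimal host-side sketch of that broadcast, with a plain 64-byte buffer standing in for simd64_t and 0xAB as an arbitrary fill byte:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    const uint8_t fill = 0xAB;  // plays the role of src->AsIntCon()->IconValue() & 0xFF
    uint8_t pattern[64];        // plays the role of simd64_t
    memset(pattern, fill, sizeof(pattern));

    // Codegen materializes this constant into srcXmmReg (genSetRegToConst above), so each
    // XMM/YMM/ZMM store then writes 16/32/64 copies of the fill byte at once.
    printf("%02x %02x ... %02x\n", (unsigned)pattern[0], (unsigned)pattern[1], (unsigned)pattern[63]);
    return 0;
}
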
@@ -3236,53 +3231,29 @@

while (bytesWritten < size)
{
#ifdef TARGET_X86
if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
{
simdMov = INS_movq;
regSize = 8;
}
#endif
if (bytesWritten + regSize > size)
{
// We have a remainder that is smaller than regSize.
break;
}

emitSimdMovs();
dstOffset += regSize;
bytesWritten += regSize;

if (!zeroing)
{
assert(regSize <= YMM_REGSIZE_BYTES);
}

if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}
}

size -= bytesWritten;

// Handle the remainder by overlapping with previously processed data (only for zeroing)
if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
// Handle the remainder by overlapping with previously processed data
if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
{
if (isPow2(size) && (size <= REGSIZE_BYTES))
{
// For sizes like 1,2,4 and 8 we delegate handling to normal stores
// because that will be a single instruction that is smaller than SIMD mov
}
else
{
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);

// Rewind dstOffset so we can fit a vector for the whole remainder
dstOffset -= (regSize - size);
emitSimdMovs();
size = 0;
}
// Rewind dstOffset so we can fit a vector for the whole remainder
dstOffset -= (regSize - size);
emitSimdMovs();
size = 0;
}
}
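
(Illustrative note, not part of the diff.) The remainder handling above avoids a scalar tail by rewinding the destination offset and re-covering already written bytes with one final vector store. A sketch of the same idea for a hypothetical 41-byte zeroing with 16-byte (XMM) stores, with memset standing in for the emitted SIMD stores:

#include <cstdio>
#include <cstring>

static void ZeroBlock41(unsigned char* dst)
{
    // Main loop: two full 16-byte stores cover bytes [0, 32); 9 bytes remain.
    memset(dst + 0,  0, 16);
    memset(dst + 16, 0, 16);

    // roundUpSIMDSize(9) rounds the remainder up to a 16-byte register and dstOffset is
    // rewound by (16 - 9), so one last store covers [25, 41). Bytes [25, 32) are written
    // twice, which is harmless for an init and cheaper than a scalar tail.
    memset(dst + 41 - 16, 0, 16);
}

int main()
{
    unsigned char buf[41];
    memset(buf, 0xFF, sizeof(buf));
    ZeroBlock41(buf);
    printf("%d %d\n", buf[0], buf[40]); // 0 0
    return 0;
}
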

50 changes: 21 additions & 29 deletions src/coreclr/jit/lowerxarch.cpp
@@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
}
else
{

// The fill value of an initblk is interpreted to hold a
// value of (unsigned int8) however a constant of any size
// may practically reside on the evaluation stack. So extract
@@ -337,38 +336,31 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)

ssize_t fill = src->AsIntCon()->IconValue() & 0xFF;

if (fill == 0)
const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences();
if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd))
{
if (size >= XMM_REGSIZE_BYTES)
{
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
#ifdef TARGET_AMD64
// It turns out we can't use SIMD so the default threshold is too big
goto TOO_BIG_TO_UNROLL;
}

bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0;
if (!willUseOnlySimdMov)
{
// If we have a remainder we still might only use SIMD to process it (via overlapping)
// unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8)
const unsigned remainder = size % XMM_REGSIZE_BYTES;
if (!isPow2(remainder) || (remainder > REGSIZE_BYTES))
{
willUseOnlySimdMov = true;
}
}
bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported();
if (willUseSimd)
{
// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value
// just to handle the remainder when we can do that with an overlapped SIMD load).
#ifdef FEATURE_SIMD
src->SetContained();
#else
const bool willUseOnlySimdMov = (size % 8 == 0);
#endif
if (willUseOnlySimdMov && canUse16BytesSimdMov)
{
src->SetContained();
}
else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset,
/*canUseSimd*/ canUse16BytesSimdMov))
{
// It turns out we can't use SIMD so the default threshold is too big
goto TOO_BIG_TO_UNROLL;
}
if (fill == 0)
{
// When we don't have FEATURE_SIMD we can only handle zeroing
src->SetContained();
}
#endif
}
else if (fill == 0)
{
// Leave as is - zero shouldn't be contained when we don't use SIMD.
}
#ifdef TARGET_AMD64
else if (size >= REGSIZE_BYTES)
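
(Illustrative note, not part of the diff.) A rough standalone model of the lowering decision shown above: truncate the constant to its low byte, bail out of unrolling past the size threshold, and contain the constant when the SIMD-only path will be used. The threshold and the SIMD-eligibility result are passed in as plain inputs here, and the scalar fallback cases truncated by the diff are omitted:

#include <cstdio>

// Sketch: what lowering does with an initblk's constant fill value (names are stand-ins).
static const char* LowerInitBlkFill(long long iconValue, unsigned size,
                                    bool willUseSimd, unsigned unrollThreshold)
{
    // Only the low 8 bits of the constant are meaningful as a fill value.
    const unsigned fill = (unsigned)(iconValue & 0xFF);

    // Blocks above the unroll threshold are not unrolled by this path.
    if (size > unrollThreshold)
        return "too big to unroll";

    // SIMD-only path: contain the constant so no GPR is occupied by the fill value;
    // codegen then uses xorps (zero) or a broadcast vector constant (non-zero).
    if (willUseSimd)
        return (fill == 0) ? "contain src; zero with xorps"
                           : "contain src; broadcast the fill byte into a vector constant";

    // Otherwise the fill value stays in a GPR (scalar handling not shown).
    return "keep the fill value in a GPR";
}

int main()
{
    // 0x1234AB truncates to the fill byte 0xAB.
    printf("%s\n", LowerInitBlkFill(0x1234AB, 64, /*willUseSimd*/ true, /*unrollThreshold*/ 256));
    return 0;
}
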