JIT: Improve containment for widening intrinsics (#109474)
* improve containment for widening intrinsics

* apply formatting patch

* tidying

* use tuple type for load size factor

* apply formatting patch

* fix build

* whitespace

* revert emitter changes
saucecontrol authored Nov 21, 2024
1 parent 09b30a4 commit 975f4ea
Showing 1 changed file with 91 additions and 119 deletions.
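
The core of the change: instead of hard-coding per-intrinsic size arithmetic, lowering now asks the emitter for the instruction's tuple type (its memory-operand size class) and derives the minimum containable load size from how much memory the widening instruction actually reads. Below is a minimal standalone sketch of that derivation; the enum and names are illustrative stand-ins for the emitter's insTupleType values, and only the arithmetic mirrors the diff that follows.

// Standalone sketch (not JIT code): derive the smallest containable memory
// operand for a widening intrinsic from its tuple type.
#include <cstdio>

enum TupleType
{
    TT_FULL,        // memory operand is the full vector width
    TT_FULL_MEM,
    TT_HALF,        // memory operand is half the vector width
    TT_HALF_MEM,
    TT_QUARTER_MEM, // one quarter (e.g. pmovzxbd: 4 bytes -> 16-byte result)
    TT_EIGHTH_MEM,  // one eighth  (e.g. pmovzxbq: 2 bytes -> 16-byte result)
    TT_MOVDDUP      // movddup reads 8 bytes for xmm, the full width for ymm/zmm
};

// Factor by which the memory operand is smaller than the SIMD result.
static unsigned WidenFactor(TupleType tt, unsigned simdSize)
{
    switch (tt)
    {
        case TT_FULL:
        case TT_FULL_MEM:
            return 1;
        case TT_HALF:
        case TT_HALF_MEM:
            return 2;
        case TT_QUARTER_MEM:
            return 4;
        case TT_EIGHTH_MEM:
            return 8;
        case TT_MOVDDUP:
            return (simdSize == 16) ? 2 : 1;
    }
    return 0;
}

int main()
{
    // Example: a byte->int widening convert producing a 16-byte vector only
    // reads 4 bytes, so a 4-byte (or larger) load can be contained.
    const unsigned simdSize     = 16;
    const unsigned expectedSize = simdSize / WidenFactor(TT_QUARTER_MEM, simdSize);
    printf("loads of at least %u bytes are containable\n", expectedSize);
    return 0;
}

In the actual change, when the expected size is smaller than a full vector, alignment under the legacy (non-VEX) encoding no longer matters, and a SIMD scalar load can also be contained when the child's element type is at least as large as the expected size.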
210 changes: 91 additions & 119 deletions src/coreclr/jit/lowerxarch.cpp
@@ -9155,13 +9155,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
// read more than sizeof(T) bytes).
bool supportsSIMDScalarLoads = false;

// parentNode supports nodes that read from an unaligned memory address
//
// This will generally be an explicit Load instruction and is generally false for machines
// without VEX support. This is because older hardware required that the SIMD operand always
// be aligned to the 'natural alignment' of the type.
bool supportsUnalignedSIMDLoads = false;

switch (category)
{
case HW_Category_MemoryLoad:
@@ -9190,134 +9183,124 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
break;
}

case NI_SSE2_ConvertToVector128Double:
case NI_SSE3_MoveAndDuplicate:
case NI_SSE41_ConvertToVector128Int16:
case NI_SSE41_ConvertToVector128Int32:
case NI_SSE41_ConvertToVector128Int64:
case NI_AVX_ConvertToVector256Double:
case NI_AVX2_ConvertToVector256Int16:
case NI_AVX2_ConvertToVector256Int32:
case NI_AVX2_ConvertToVector256Int64:
case NI_AVX512BW_ConvertToVector512Int16:
case NI_AVX512BW_ConvertToVector512UInt16:
case NI_AVX512F_ConvertToVector512Double:
case NI_AVX512F_ConvertToVector512Int32:
case NI_AVX512F_ConvertToVector512UInt32:
case NI_AVX512F_ConvertToVector512Int64:
case NI_AVX512F_ConvertToVector512UInt64:
case NI_AVX512F_VL_ConvertToVector128Double:
case NI_AVX512F_VL_ConvertToVector256Double:
case NI_AVX10v1_ConvertToVector128Double:
case NI_AVX10v1_ConvertToVector256Double:
{
assert(!supportsSIMDScalarLoads);
// Some of these intrinsics have pointer overloads. Treat those as normal memory loads.
if (parentNode->OperIsMemoryLoad())
{
supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
break;
}

if (!parentNode->OperIsMemoryLoad())
// These are widening instructions. A load can be contained if the source is large enough
// after taking into account the multiplier of source to target element size.

const var_types parentBaseType = parentNode->GetSimdBaseType();
const instruction parentInstruction = HWIntrinsicInfo::lookupIns(parentIntrinsicId, parentBaseType);

const insTupleType tupleType = comp->GetEmitter()->insTupleTypeInfo(parentInstruction);
const unsigned parentSize = parentNode->GetSimdSize();
unsigned widenFactor = 0;

switch (tupleType)
{
// The containable form is the one that takes a SIMD value, that may be in memory.
case INS_TT_FULL:
case INS_TT_FULL_MEM:
{
widenFactor = 1;
break;
}

if (!comp->canUseVexEncoding())
case INS_TT_HALF:
case INS_TT_HALF_MEM:
{
supportsAlignedSIMDLoads = true;
supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
widenFactor = 2;
break;
}
else

case INS_TT_QUARTER_MEM:
{
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
widenFactor = 4;
break;
}

// General loads are a bit special where we need at least `sizeof(simdType) / (sizeof(baseType)
// * 2)` elements
// For example:
// * ConvertToVector128Int16 - sizeof(simdType) = 16; sizeof(baseType) = 1; expectedSize = 8
// * ConvertToVector128Int32 - sizeof(simdType) = 16; sizeof(baseType) = 1 | 2;
// expectedSize = 8 | 4
// * ConvertToVector128Int64 - sizeof(simdType) = 16; sizeof(baseType) = 1 | 2 | 4;
// expectedSize = 8 | 4 | 2
// * ConvertToVector256Int16 - sizeof(simdType) = 32; sizeof(baseType) = 1; expectedSize = 16
// * ConvertToVector256Int32 - sizeof(simdType) = 32; sizeof(baseType) = 1 | 2;
// expectedSize = 16 | 8
// * ConvertToVector256Int64 - sizeof(simdType) = 32; sizeof(baseType) = 1 | 2 | 4;
// expectedSize = 16 | 8 | 4

const unsigned sizeof_simdType = genTypeSize(parentNode->TypeGet());
const unsigned sizeof_baseType = genTypeSize(parentNode->GetSimdBaseType());

assert((sizeof_simdType == 16) || (sizeof_simdType == 32));
assert((sizeof_baseType == 1) || (sizeof_baseType == 2) || (sizeof_baseType == 4));

const unsigned expectedSize = sizeof_simdType / (sizeof_baseType * 2);
const unsigned operandSize = genTypeSize(childNode->TypeGet());
case INS_TT_EIGHTH_MEM:
{
widenFactor = 8;
break;
}

assert((sizeof_simdType != 16) || (expectedSize == 8) || (expectedSize == 4) ||
(expectedSize == 2));
assert((sizeof_simdType != 32) || (expectedSize == 16) || (expectedSize == 8) ||
(expectedSize == 4));
case INS_TT_MOVDDUP:
{
widenFactor = parentSize == 16 ? 2 : 1;
break;
}

supportsGeneralLoads = (operandSize >= expectedSize);
}
else
{
// The memory form of this already takes a pointer and should be treated like a MemoryLoad
supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
default:
{
unreached();
break;
}
}
break;
}

case NI_SSE2_ConvertToVector128Double:
case NI_AVX_ConvertToVector256Double:
case NI_AVX512F_ConvertToVector512Double:
case NI_AVX512F_VL_ConvertToVector128Double:
case NI_AVX512F_VL_ConvertToVector256Double:
case NI_AVX10v1_ConvertToVector128Double:
case NI_AVX10v1_ConvertToVector256Double:
{
assert(!supportsSIMDScalarLoads);

// Most instructions under the non-VEX encoding require aligned operands.
// Those used for Sse2.ConvertToVector128Double (CVTDQ2PD and CVTPS2PD)
// are exceptions and don't fail for unaligned inputs as they read half
// the vector width instead

supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;

const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
const unsigned expectedSize = parentSize / widenFactor;
const unsigned operandSize = genTypeSize(childNode->TypeGet());

if (childNode->OperIsConst() || childNode->isMemoryOp())
if (expectedSize < 16)
{
// For broadcasts we can only optimize constants and memory operands
// since we're going from a smaller base type to a larger base type
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
}
break;
}
// We do not need to consider alignment for non-VEX encoding here because we're
// loading less than a full vector.
//
// We can also contain a SIMD scalar load, provided the element type is large enough.

case NI_SSE3_MoveAndDuplicate:
{
// Most instructions under the non-VEX encoding require aligned operands.
// Those used for Sse3.MoveAndDuplicate (MOVDDUP) are exceptions and don't
// fail for unaligned inputs as they read half the vector width instead
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);

supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
if (childNode->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* childIntrinsic = childNode->AsHWIntrinsic();
const unsigned sizeof_childBaseType = genTypeSize(childIntrinsic->GetSimdBaseType());

const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
const unsigned operandSize = genTypeSize(childNode->TypeGet());
supportsSIMDScalarLoads = (sizeof_childBaseType >= expectedSize);
}
}
else
{
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
}

supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsSIMDScalarLoads = true;
break;
}

default:
{
assert(!supportsSIMDScalarLoads);

if (!comp->canUseVexEncoding())
{
assert(!supportsUnalignedSIMDLoads);
supportsAlignedSIMDLoads = true;
}
else
{
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
}

const unsigned expectedSize = genTypeSize(parentNode->TypeGet());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
break;
}
}
@@ -9420,9 +9403,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
break;
}

@@ -9440,16 +9422,14 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
}
else
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
}
break;
}
@@ -9466,9 +9446,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = 16;
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);
break;
}

@@ -9482,9 +9461,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = 32;
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseEvexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseEvexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);
break;
}

@@ -9493,7 +9471,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case NI_SSE41_X64_Insert:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

if (parentNode->GetSimdBaseType() == TYP_FLOAT)
{
@@ -9555,7 +9532,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case NI_AVX10v1_RoundScaleScalar:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

supportsSIMDScalarLoads = true;
supportsGeneralLoads = supportsSIMDScalarLoads;
@@ -9567,7 +9543,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
break;
}
}
@@ -9577,7 +9552,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case HW_Category_SIMDScalar:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

switch (parentIntrinsicId)
{
@@ -9677,13 +9651,12 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(varTypeIsIntegral(childNode->TypeGet()));

assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);
assert(supportsSIMDScalarLoads == false);

unsigned expectedSize = genTypeSize(parentNode->TypeGet());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

// CRC32 codegen depends on its second oprand's type.
// CRC32 codegen depends on its second operand's type.
// Currently, we are using SIMDBaseType to store the op2Type info.
if (parentIntrinsicId == NI_SSE42_Crc32)
{
@@ -9700,7 +9673,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
break;
}
}

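Beyond the tuple-type lookup, the diff deletes the separate supportsUnalignedSIMDLoads flag and states the rule directly at each use: folding a full-width memory operand requires the VEX/EVEX encoding, since (as the removed comment in the first hunk notes) non-VEX hardware requires SIMD memory operands to be naturally aligned, while explicitly aligned operands remain containable on the legacy encoding and, with VEX, only outside MinOpts. A small sketch of that predicate follows; the struct, field, and method names are illustrative and not the JIT's actual API surface.

// Sketch of the repeated pattern in the hunks above:
//   supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
//   supportsGeneralLoads     = comp->canUseVexEncoding() && (operandSize >= expectedSize);
struct ContainmentPolicySketch
{
    bool vexAvailable; // VEX/EVEX forms tolerate unaligned memory operands
    bool minOpts;      // under MinOpts, skip alignment-sensitive folding

    // Full-width memory operands may be folded only when VEX is available and
    // the operand covers at least what the instruction reads.
    bool SupportsGeneralLoads(unsigned operandSize, unsigned expectedSize) const
    {
        return vexAvailable && (operandSize >= expectedSize);
    }

    // Aligned loads are always foldable with the legacy encoding; with VEX
    // they are only folded when optimizing.
    bool SupportsAlignedSIMDLoads() const
    {
        return !vexAvailable || !minOpts;
    }
};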