JIT: Improve containment for widening intrinsics #109474

Merged · 9 commits · Nov 21, 2024
Changes from 3 commits
210 changes: 88 additions & 122 deletions src/coreclr/jit/lowerxarch.cpp
@@ -9155,13 +9155,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
// read more than sizeof(T) bytes).
bool supportsSIMDScalarLoads = false;

// parentNode supports nodes that read from an unaligned memory address
//
// This will generally be an explicit Load instruction and is generally false for machines
// without VEX support. This is because older hardware required that the SIMD operand always
// be aligned to the 'natural alignment' of the type.
bool supportsUnalignedSIMDLoads = false;

switch (category)
{
case HW_Category_MemoryLoad:
@@ -9190,134 +9183,118 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
break;
}

case NI_SSE2_ConvertToVector128Double:
case NI_SSE3_MoveAndDuplicate:
case NI_SSE41_ConvertToVector128Int16:
case NI_SSE41_ConvertToVector128Int32:
case NI_SSE41_ConvertToVector128Int64:
case NI_AVX_ConvertToVector256Double:
case NI_AVX2_ConvertToVector256Int16:
case NI_AVX2_ConvertToVector256Int32:
case NI_AVX2_ConvertToVector256Int64:
case NI_AVX512BW_ConvertToVector512Int16:
case NI_AVX512BW_ConvertToVector512UInt16:
case NI_AVX512F_ConvertToVector512Double:
case NI_AVX512F_ConvertToVector512Int32:
case NI_AVX512F_ConvertToVector512UInt32:
case NI_AVX512F_ConvertToVector512Int64:
case NI_AVX512F_ConvertToVector512UInt64:
case NI_AVX512F_VL_ConvertToVector128Double:
case NI_AVX512F_VL_ConvertToVector256Double:
case NI_AVX10v1_ConvertToVector128Double:
case NI_AVX10v1_ConvertToVector256Double:
{
assert(!supportsSIMDScalarLoads);

if (!parentNode->OperIsMemoryLoad())
// Some of these intrinsics have pointer overloads. Treat those as normal memory loads.
if (parentNode->OperIsMemoryLoad())
{
// The containable form is the one that takes a SIMD value, that may be in memory.
supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
break;
}

// These are widening instructions. A load can be contained if the source is large enough
// after taking into account the multiplier of source to target element size. Most of these
// double the width, but a few instruction forms have higher multipliers (ex: PMOVZXBQ)

if (!comp->canUseVexEncoding())
const unsigned sizeof_baseType = genTypeSize(parentNode->GetSimdBaseType());
unsigned widenFactor = 2;

switch (parentIntrinsicId)
{
case NI_SSE41_ConvertToVector128Int32:
case NI_AVX2_ConvertToVector256Int32:
case NI_AVX512F_ConvertToVector512Int32:
case NI_AVX512F_ConvertToVector512UInt32:
{
supportsAlignedSIMDLoads = true;
supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
if (sizeof_baseType == 1)
{
widenFactor = 4;
}
break;
}
else

case NI_SSE41_ConvertToVector128Int64:
case NI_AVX2_ConvertToVector256Int64:
case NI_AVX512F_ConvertToVector512Int64:
case NI_AVX512F_ConvertToVector512UInt64:
{
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
if (sizeof_baseType == 1)
{
widenFactor = 8;
}
else if (sizeof_baseType == 2)
{
widenFactor = 4;
}
break;
}

// General loads are a bit special where we need at least `sizeof(simdType) / (sizeof(baseType)
// * 2)` elements
// For example:
// * ConvertToVector128Int16 - sizeof(simdType) = 16; sizeof(baseType) = 1; expectedSize = 8
// * ConvertToVector128Int32 - sizeof(simdType) = 16; sizeof(baseType) = 1 | 2;
// expectedSize = 8 | 4
// * ConvertToVector128Int64 - sizeof(simdType) = 16; sizeof(baseType) = 1 | 2 | 4;
// expectedSize = 8 | 4 | 2
// * ConvertToVector256Int16 - sizeof(simdType) = 32; sizeof(baseType) = 1; expectedSize = 16
// * ConvertToVector256Int32 - sizeof(simdType) = 32; sizeof(baseType) = 1 | 2;
// expectedSize = 16 | 8
// * ConvertToVector256Int64 - sizeof(simdType) = 32; sizeof(baseType) = 1 | 2 | 4;
// expectedSize = 16 | 8 | 4

const unsigned sizeof_simdType = genTypeSize(parentNode->TypeGet());
const unsigned sizeof_baseType = genTypeSize(parentNode->GetSimdBaseType());

assert((sizeof_simdType == 16) || (sizeof_simdType == 32));
assert((sizeof_baseType == 1) || (sizeof_baseType == 2) || (sizeof_baseType == 4));

const unsigned expectedSize = sizeof_simdType / (sizeof_baseType * 2);
const unsigned operandSize = genTypeSize(childNode->TypeGet());

assert((sizeof_simdType != 16) || (expectedSize == 8) || (expectedSize == 4) ||
(expectedSize == 2));
assert((sizeof_simdType != 32) || (expectedSize == 16) || (expectedSize == 8) ||
(expectedSize == 4));

supportsGeneralLoads = (operandSize >= expectedSize);
}
else
{
// The memory form of this already takes a pointer and should be treated like a MemoryLoad
supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
default:
{
break;
}
}
break;
}

case NI_SSE2_ConvertToVector128Double:
case NI_AVX_ConvertToVector256Double:
case NI_AVX512F_ConvertToVector512Double:
case NI_AVX512F_VL_ConvertToVector128Double:
case NI_AVX512F_VL_ConvertToVector256Double:
case NI_AVX10v1_ConvertToVector128Double:
case NI_AVX10v1_ConvertToVector256Double:
{
assert(!supportsSIMDScalarLoads);

// Most instructions under the non-VEX encoding require aligned operands.
// Those used for Sse2.ConvertToVector128Double (CVTDQ2PD and CVTPS2PD)
// are exceptions and don't fail for unaligned inputs as they read half
// the vector width instead

supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;

const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
const unsigned operandSize = genTypeSize(childNode->TypeGet());
const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / widenFactor;

if (childNode->OperIsConst() || childNode->isMemoryOp())
if (expectedSize < 16)
{
// For broadcasts we can only optimize constants and memory operands
// since we're going from a smaller base type to a larger base type
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
}
break;
}
// If we need less than a full vector:
// * In MinOpts, we should never contain aligned loads because they will not
// fault on hardware not supporting VEX encoding as a full vector load would.
// * We can always contain an unaligned load of sufficient size because there are
// no alignment requirements below vector size.
// * We can contain a SIMD scalar load, provided the element type is large enough.

case NI_SSE3_MoveAndDuplicate:
{
// Most instructions under the non-VEX encoding require aligned operands.
// Those used for Sse3.MoveAndDuplicate (MOVDDUP) are exceptions and don't
// fail for unaligned inputs as they read half the vector width instead
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);

supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
if (childNode->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* childIntrinsic = childNode->AsHWIntrinsic();
const unsigned sizeof_childBaseType = genTypeSize(childIntrinsic->GetSimdBaseType());

const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
const unsigned operandSize = genTypeSize(childNode->TypeGet());
supportsSIMDScalarLoads = (sizeof_childBaseType >= expectedSize);
}
}
else
{
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
}

supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsSIMDScalarLoads = true;
break;
}

default:
{
assert(!supportsSIMDScalarLoads);

if (!comp->canUseVexEncoding())
{
assert(!supportsUnalignedSIMDLoads);
supportsAlignedSIMDLoads = true;
}
else
{
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = true;
}

const unsigned expectedSize = genTypeSize(parentNode->TypeGet());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
break;
}
}
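
(Illustrative sketch, not part of the diff.) The hunk above derives a widening factor from the intrinsic and its base type, then requires a contained operand to supply at least genTypeSize(parentNode) / widenFactor bytes. The standalone C++ below models only that size check, with made-up helper names; the example sizes follow the switch above.

#include <cassert>

// Stand-in for the JIT logic: given the destination SIMD size in bytes and
// the element-widening factor, return the minimum number of source bytes a
// contained load must provide.
unsigned MinContainableLoadSize(unsigned simdSize, unsigned widenFactor)
{
    assert((widenFactor == 2) || (widenFactor == 4) || (widenFactor == 8));
    return simdSize / widenFactor;
}

int main()
{
    // ConvertToVector128Int64 from bytes (PMOVZXBQ-style): 16-byte result,
    // 8x widening, so only 2 source bytes are read.
    assert(MinContainableLoadSize(16, 8) == 2);

    // ConvertToVector256Int32 from bytes: 32-byte result, 4x widening.
    assert(MinContainableLoadSize(32, 4) == 8);

    // ConvertToVector512Int16 from bytes: 64-byte result, 2x widening.
    assert(MinContainableLoadSize(64, 2) == 32);

    return 0;
}
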
@@ -9418,9 +9395,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
break;
}

@@ -9438,16 +9414,14 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsGeneralLoads = comp->canUseVexEncoding() && (operandSize >= expectedSize);
}
else
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
}
break;
}
@@ -9464,9 +9438,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = 16;
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
[Review comment from the PR author on original lines 9467–9468: These instructions are AVX+ so they imply VEX encoding.]

supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);
break;
}

@@ -9480,9 +9453,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
const unsigned expectedSize = 32;
const unsigned operandSize = genTypeSize(childNode->TypeGet());

supportsAlignedSIMDLoads = !comp->canUseEvexEncoding() || !comp->opts.MinOpts();
supportsUnalignedSIMDLoads = comp->canUseEvexEncoding();
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
supportsGeneralLoads = (operandSize >= expectedSize);
break;
}

@@ -9491,7 +9463,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case NI_SSE41_X64_Insert:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

if (parentNode->GetSimdBaseType() == TYP_FLOAT)
{
@@ -9553,7 +9524,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case NI_AVX10v1_RoundScaleScalar:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

supportsSIMDScalarLoads = true;
supportsGeneralLoads = supportsSIMDScalarLoads;
@@ -9565,7 +9535,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
break;
}
}
@@ -9575,7 +9544,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
case HW_Category_SIMDScalar:
{
assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);

switch (parentIntrinsicId)
{
@@ -9675,13 +9643,12 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(varTypeIsIntegral(childNode->TypeGet()));

assert(supportsAlignedSIMDLoads == false);
assert(supportsUnalignedSIMDLoads == false);
assert(supportsSIMDScalarLoads == false);

unsigned expectedSize = genTypeSize(parentNode->TypeGet());
const unsigned operandSize = genTypeSize(childNode->TypeGet());

// CRC32 codegen depends on its second oprand's type.
// CRC32 codegen depends on its second operand's type.
// Currently, we are using SIMDBaseType to store the op2Type info.
if (parentIntrinsicId == NI_SSE42_Crc32)
{
@@ -9698,7 +9665,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
assert(supportsAlignedSIMDLoads == false);
assert(supportsGeneralLoads == false);
assert(supportsSIMDScalarLoads == false);
assert(supportsUnalignedSIMDLoads == false);
break;
}
}
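
(Illustrative sketch, not part of the diff.) A two-line pattern recurs throughout these hunks: aligned SIMD loads remain containable on non-VEX hardware or outside MinOpts, while general (possibly unaligned) loads are containable only with VEX encoding and a sufficiently large operand. The standalone C++ below restates that decision with stand-in names ('canUseVex', 'minOpts') for comp->canUseVexEncoding() and comp->opts.MinOpts(); the MinOpts comment is the usual rationale and an assumption on my part, not text from the PR.

struct ContainmentFlags
{
    bool supportsAlignedSIMDLoads;
    bool supportsGeneralLoads;
};

// Rough model of the recurring containment decision for full-width SIMD
// memory operands.
ContainmentFlags ComputeSimdLoadContainment(bool canUseVex, bool minOpts,
                                            unsigned operandSize, unsigned expectedSize)
{
    ContainmentFlags flags{};

    // Non-VEX instruction forms require an aligned memory operand anyway, so
    // folding an aligned load loses nothing. With VEX, aligned loads are also
    // folded, except under MinOpts, where folding would drop the alignment
    // fault an explicit aligned load is expected to raise. (Assumed rationale.)
    flags.supportsAlignedSIMDLoads = !canUseVex || !minOpts;

    // Arbitrary loads can be folded only when VEX encoding permits unaligned
    // memory operands and the operand supplies enough bytes.
    flags.supportsGeneralLoads = canUseVex && (operandSize >= expectedSize);

    return flags;
}
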