From cd6c513124ac9a4cbe43fc9eef99371303dc8460 Mon Sep 17 00:00:00 2001 From: Bradley Wood Date: Thu, 16 Jan 2025 15:04:02 -0500 Subject: [PATCH 1/5] x86: Create helpers for generating loops Signed-off-by: Bradley Wood --- compiler/x/CMakeLists.txt | 1 + compiler/x/codegen/CodegenUtils.cpp | 139 ++++++++++++++++++++++ compiler/x/codegen/CodegenUtils.hpp | 176 ++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+) create mode 100644 compiler/x/codegen/CodegenUtils.cpp create mode 100644 compiler/x/codegen/CodegenUtils.hpp diff --git a/compiler/x/CMakeLists.txt b/compiler/x/CMakeLists.txt index b247cd11d66..dccf9d70cbe 100644 --- a/compiler/x/CMakeLists.txt +++ b/compiler/x/CMakeLists.txt @@ -55,6 +55,7 @@ compiler_library(x ${CMAKE_CURRENT_LIST_DIR}/codegen/OMRSnippetDelegate.cpp ${CMAKE_CURRENT_LIST_DIR}/codegen/X86SystemLinkage.cpp ${CMAKE_CURRENT_LIST_DIR}/codegen/OMRCodeGenerator.cpp + ${CMAKE_CURRENT_LIST_DIR}/codegen/CodegenUtils.cpp ${CMAKE_CURRENT_LIST_DIR}/env/OMRCPU.cpp ${CMAKE_CURRENT_LIST_DIR}/env/OMRDebugEnv.cpp ) diff --git a/compiler/x/codegen/CodegenUtils.cpp b/compiler/x/codegen/CodegenUtils.cpp new file mode 100644 index 00000000000..414bff14b72 --- /dev/null +++ b/compiler/x/codegen/CodegenUtils.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * Copyright IBM Corp. and others 2025 + * + * This program and the accompanying materials are made available under + * the terms of the Eclipse Public License 2.0 which accompanies this + * distribution and is available at https://www.eclipse.org/legal/epl-2.0/ + * or the Apache License, Version 2.0 which accompanies this distribution + * and is available at https://www.apache.org/licenses/LICENSE-2.0. + * + * This Source Code may also be made available under the following Secondary + * Licenses when the conditions for such availability set forth in the + * Eclipse Public License, v. 
2.0 are satisfied: GNU General Public License, + * version 2 with the GNU Classpath Exception [1] and GNU General Public + * License, version 2 with the OpenJDK Assembly Exception [2]. + * + * [1] https://www.gnu.org/software/classpath/license.html + * [2] https://openjdk.org/legal/assembly-exception.html + * + * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0 + *******************************************************************************/ + +#include "CodegenUtils.hpp" +#include "OMRX86Instruction.hpp" + +namespace OMR +{ +namespace X86 +{ + +void generateLoop(int32_t unrollFactor, + int32_t elementsPerIteration, + TR::Register *indexReg, + TR::Register *maxIndexReg, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void()> loopInitializerFunction, + std::function<void(int32_t)> genBodyFunction, + std::function<void(int32_t)> residueGenBodyFunction) + { + TR_ASSERT_FATAL((unrollFactor > 0) && ((unrollFactor & (unrollFactor - 1)) == 0), "Unroll count must be power of 2"); + TR_ASSERT_FATAL(elementsPerIteration == 1 || + (elementsPerIteration > 0) && ((elementsPerIteration & (elementsPerIteration - 1)) == 0), + "elementsPerIteration must be 1 or a power of 2"); + TR_ASSERT_FATAL(unrollFactor <= 16, "Excessive unrolling detected (unrollFactor > 16)"); + + TR::RegisterDependencyConditions *deps = generateRegisterDependencyConditions(0, 1, cg); + TR::Register *loopLimitReg = cg->allocateRegister(); + TR::LabelSymbol *begLabel = generateLabelSymbol(cg); + TR::LabelSymbol *endLabel = generateLabelSymbol(cg); + TR::LabelSymbol *loopLabel = generateLabelSymbol(cg); + int32_t numElements = elementsPerIteration * unrollFactor; /* Per iteration of unrolled main loop */ + + deps->addPostCondition(loopLimitReg, RealRegister::NoReg, cg); + begLabel->setStartInternalControlFlow(); + endLabel->setEndInternalControlFlow(); + + generateLabelInstruction(InstOpCode::label, node, begLabel, cg); + +
generateRegRegInstruction(InstOpCode::MOVRegReg(), node, loopLimitReg, maxIndexReg, cg); + + if (numElements != 1) + { + // Adjust loop bound to be a multiple of (elementsPerIteration * unrollFactor); NOTE(review): the main loop can overrun maxIndexReg unless indexReg starts at such a multiple + // This ensures the main loop processes all elements up to the nearest multiple; the mask (-numElements) only fits the sign-extended 8-bit immediate form while numElements <= 128 + generateRegImmInstruction((numElements <= 128) ? InstOpCode::ANDRegImms() : InstOpCode::ANDRegImm4(), node, loopLimitReg, ~(numElements - 1), cg); + } + + generateRegRegInstruction(InstOpCode::CMPRegReg(), node, indexReg, loopLimitReg, cg); + generateLabelInstruction(InstOpCode::JGE4, node, endLabel, cg); + + if (loopInitializerFunction) + { + // Call initialization logic before first iteration. + // This might be needed if the initialization logic is too expensive + // to run before checking if the main loop even executes. + loopInitializerFunction(); + } + + generateLabelInstruction(InstOpCode::label, node, loopLabel, cg); + + for (int32_t i = 0; i < unrollFactor; i++) + { + // Unroll the loop by invoking the body function for each unroll iteration + genBodyFunction(i); + } + + // Update the loop index based on the number of elements processed in the unrolled main loop + if (numElements == 1) + { + generateRegInstruction(InstOpCode::INCReg(), node, indexReg, cg); + } + else + { + generateRegImmInstruction(InstOpCode::ADDRegImm4(), node, indexReg, numElements, cg); + } + + // Compare index with the adjusted loop limit (same operand width as the entry check above) and loop back if necessary + generateRegRegInstruction(InstOpCode::CMPRegReg(), node, indexReg, loopLimitReg, cg); + generateLabelInstruction(InstOpCode::JL4, node, loopLabel, cg); + + generateLabelInstruction(InstOpCode::label, node, endLabel, deps, cg); + cg->stopUsingRegister(loopLimitReg); + + if (residueGenBodyFunction != NULL && numElements > 1) + { + TR_ASSERT_FATAL(residueGenBodyFunction, "Missing function to generate residue"); + + // Generate a second loop to process residual iterations (loopInitializerFunction is emitted again ahead of it) + OMR::X86::generateLoop(1, 1, indexReg, maxIndexReg, node, cg, loopInitializerFunction, residueGenBodyFunction, NULL); + } + } + +void
generateLoop(int32_t begin, + int32_t end, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void(int32_t)> genBodyFunction) + { + TR::RegisterDependencyConditions *deps = generateRegisterDependencyConditions(0, 2, cg); + + TR::Register *indexReg = cg->allocateRegister(); + TR::Register *loopBoundReg = cg->allocateRegister(); + TR::LabelSymbol *label = generateLabelSymbol(cg); + + deps->addPostCondition(indexReg, RealRegister::NoReg, cg); + deps->addPostCondition(loopBoundReg, RealRegister::NoReg, cg); + + TreeEvaluator::loadConstant(node, begin, TR_RematerializableInt, cg, indexReg); + TreeEvaluator::loadConstant(node, end, TR_RematerializableInt, cg, loopBoundReg); + + OMR::X86::generateLoop(indexReg, loopBoundReg, node, cg, std::move(genBodyFunction)); + generateLabelInstruction(TR::InstOpCode::label, node, label, deps, cg); + + cg->stopUsingRegister(indexReg); + cg->stopUsingRegister(loopBoundReg); + } + +} +} diff --git a/compiler/x/codegen/CodegenUtils.hpp b/compiler/x/codegen/CodegenUtils.hpp new file mode 100644 index 00000000000..ed9cd77398d --- /dev/null +++ b/compiler/x/codegen/CodegenUtils.hpp @@ -0,0 +1,176 @@ +/******************************************************************************* + * Copyright IBM Corp. and others 2025 + * + * This program and the accompanying materials are made available under + * the terms of the Eclipse Public License 2.0 which accompanies this + * distribution and is available at https://www.eclipse.org/legal/epl-2.0/ + * or the Apache License, Version 2.0 which accompanies this distribution + * and is available at https://www.apache.org/licenses/LICENSE-2.0. + * + * This Source Code may also be made available under the following Secondary + * Licenses when the conditions for such availability set forth in the + * Eclipse Public License, v. 2.0 are satisfied: GNU General Public License, + * version 2 with the GNU Classpath Exception [1] and GNU General Public + * License, version 2 with the OpenJDK Assembly Exception [2].
+ * + * [1] https://www.gnu.org/software/classpath/license.html + * [2] https://openjdk.org/legal/assembly-exception.html + * + * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0 + *******************************************************************************/ + +#ifndef OMR_CODEGENUTILS_INCL +#define OMR_CODEGENUTILS_INCL + +#include "codegen/CodeGenerator.hpp" + +namespace OMR +{ + +namespace X86 +{ + +/** + * @brief Generates loop control flow with start and end bounds + * + * This function generates assembly code for a loop that iterates from a specified + * starting value to an ending value. The loop body is implemented by a user-provided function. + * + * @param begin The starting value of the loop index (inclusive) + * @param end The ending value of the loop index (exclusive) + * @param node The node associated with the loop + * @param cg The code generator used to emit instructions + * @param genBodyFunction A callback function that generates the body of the loop + */ +void generateLoop(int32_t begin, + int32_t end, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void(int32_t)> genBodyFunction); + +/** + * @brief Generates a loop with parameterized unrolling and optional residue element processing. + * + * This function generates an optimized loop with a specified unrolling factor and the ability to process + * multiple elements per iteration. If the total number of iterations is not divisible by the number of + * elements processed in each unrolled iteration, a secondary residue loop can be generated to handle + * the leftover elements. + * + * Constraints: + * - `unrollFactor` must be a power of 2 no greater than 16. + * - `elementsPerIteration` must be a power of 2. + * - `residueGenBodyFunction` If specified, must only process one iteration of loop index. + * + * @param unrollFactor Number of iterations to unroll in the main loop.
+ * @param elementsPerIteration Number of elements processed per loop body iteration. + * @param indexReg The register used to store the loop index. + * @param maxIndexReg The register containing the loop bound. + * @param node IL node associated with the loop generation. + * @param cg Pointer to the code generator responsible for emitting the loop code. + * @param loopInitializerFunction Function to generate initialization code prior to the first iteration, if applicable. + * @param genBodyFunction Function to generate the body of the loop for each unrolled iteration. + * @param residueGenBodyFunction Function to generate the body for the residue loop, if applicable. + */ +void generateLoop(int32_t unrollFactor, + int32_t elementsPerIteration, + TR::Register *indexReg, + TR::Register *maxIndexReg, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void()> loopInitializerFunction, + std::function<void(int32_t)> genBodyFunction, + std::function<void(int32_t)> residueGenBodyFunction = NULL); + +/** + * @brief Generates a loop with parameterized unrolling and optional residue element processing. + * + * This function generates an optimized loop with a specified unrolling factor and the ability to process + * multiple elements per iteration. If the total number of iterations is not divisible by the number of + * elements processed in each unrolled iteration, a secondary residue loop can be generated to handle + * the leftover elements. + * + * Constraints: + * - `unrollFactor` must be a power of 2 no greater than 16. + * - `elementsPerIteration` must be a power of 2. + * - `residueGenBodyFunction` If specified, must only process one iteration of loop index. + * + * @param unrollFactor Number of iterations to unroll in the main loop. + * @param elementsPerIteration Number of elements processed per loop body iteration. + * @param indexReg The register used to store the loop index. + * @param maxIndexReg The register containing the loop bound. + * @param node IL node associated with the loop generation.
+ * @param cg Pointer to the code generator responsible for emitting the loop code. + * @param genBodyFunction Function to generate the body of the loop for each unrolled iteration. + * @param residueGenBodyFunction Function to generate the body for the residue loop, if applicable. + */ +inline void generateLoop(int32_t unrollFactor, + int32_t elementsPerIteration, + TR::Register *indexReg, + TR::Register *maxIndexReg, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void(int32_t)> genBodyFunction, + std::function<void(int32_t)> residueGenBodyFunction = NULL) + { + generateLoop(unrollFactor, elementsPerIteration, indexReg, maxIndexReg, node, cg, NULL, genBodyFunction, residueGenBodyFunction); + } + +/** + * @brief Generates a loop with parameterized unrolling and residue processing. + * + * This function generates an unrolled loop where each unrolled iteration + * processes only one element. A second loop is automatically generated to + * process leftover (residue) elements, ensuring all elements are handled. + * + * Constraints: + * - `unrollFactor` must be a power of 2 no greater than 16. + * - `genBodyFunction` must only process one iteration of loop index. + * + * @param unrollFactor Number of iterations to unroll in the main loop. + * @param indexReg The register used to store the loop index. + * @param maxIndexReg The register containing the loop bound. + * @param node IL node associated with the loop generation. + * @param cg Pointer to the code generator responsible for emitting the loop code. + * @param genBodyFunction Function to generate the body of the loop for each unrolled iteration. + */ +inline void generateUnrolledLoopWithResidue(int32_t unrollFactor, + TR::Register *indexReg, + TR::Register *maxIndexReg, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void(int32_t)> genBodyFunction) + { + generateLoop(unrollFactor, 1, indexReg, maxIndexReg, node, cg, NULL, genBodyFunction, genBodyFunction); + } + +/** + * @brief Generates a simple loop without unrolling or residue processing.
+ * + * This function generates a loop where each iteration processes a single element. + * It is suitable for scenarios where the loop body function processes exactly one + * loop index per iteration. No additional residue handling is required. + * + * Constraints: + * - `genBodyFunction` must only process one iteration of loop index. + * + * @param indexReg The register used to store the loop index. + * @param maxIndexReg The register containing the loop bound. + * @param node IL node associated with the loop generation. + * @param cg Pointer to the code generator responsible for emitting the loop code. + * @param genBodyFunction Function to generate the body of the loop for each iteration. + */ +inline void generateLoop(TR::Register *indexReg, + TR::Register *maxIndexReg, + TR::Node *node, + TR::CodeGenerator *cg, + std::function<void(int32_t)> genBodyFunction) + { + generateLoop(1, 1, indexReg, maxIndexReg, node, cg, NULL, genBodyFunction, NULL); + } + + +} + +} + +#endif From 184f78938d50d9e671ff4b3f40e1043bd970e00e Mon Sep 17 00:00:00 2001 From: Bradley Wood Date: Fri, 17 Jan 2025 11:35:35 -0500 Subject: [PATCH 2/5] x86: Move loadConstant helpers into CodegenUtils Signed-off-by: Bradley Wood --- compiler/x/amd64/codegen/OMRTreeEvaluator.cpp | 5 +- compiler/x/codegen/CodegenUtils.cpp | 244 +++++++++++++++++- compiler/x/codegen/CodegenUtils.hpp | 47 ++++ compiler/x/codegen/ControlFlowEvaluator.cpp | 13 +- compiler/x/codegen/OMRMachine.cpp | 3 +- compiler/x/codegen/OMRTreeEvaluator.cpp | 226 ---------------- compiler/x/codegen/OMRTreeEvaluator.hpp | 8 - compiler/x/codegen/SubtractAnalyser.cpp | 3 +- compiler/x/codegen/UnaryEvaluator.cpp | 7 +- compiler/x/i386/codegen/OMRTreeEvaluator.cpp | 17 +- 10 files changed, 314 insertions(+), 259 deletions(-) diff --git a/compiler/x/amd64/codegen/OMRTreeEvaluator.cpp b/compiler/x/amd64/codegen/OMRTreeEvaluator.cpp index 3bf33265fec..9a4a06c30f0 100644 --- a/compiler/x/amd64/codegen/OMRTreeEvaluator.cpp +++
b/compiler/x/amd64/codegen/OMRTreeEvaluator.cpp @@ -50,6 +50,7 @@ #include "x/codegen/ConstantDataSnippet.hpp" #include "x/codegen/OutlinedInstructions.hpp" #include "x/codegen/X86Instruction.hpp" +#include "x/codegen/CodegenUtils.hpp" #include "codegen/InstOpCode.hpp" TR::Register* @@ -2264,7 +2265,7 @@ OMR::X86::AMD64::TreeEvaluator::lbitpermuteEvaluator(TR::Node *node, TR::CodeGen TR::Register *OMR::X86::AMD64::TreeEvaluator::aconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *targetRegister = TR::TreeEvaluator::loadConstant(node, node->getLongInt(), TR_RematerializableAddress, cg); + TR::Register *targetRegister = OMR::X86::loadConstant(node, node->getLongInt(), TR_RematerializableAddress, cg); node->setRegister(targetRegister); return targetRegister; @@ -2272,7 +2273,7 @@ TR::Register *OMR::X86::AMD64::TreeEvaluator::aconstEvaluator(TR::Node *node, TR TR::Register *OMR::X86::AMD64::TreeEvaluator::lconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *targetRegister = TR::TreeEvaluator::loadConstant(node, node->getLongInt(), TR_RematerializableLong, cg); + TR::Register *targetRegister = OMR::X86::loadConstant(node, node->getLongInt(), TR_RematerializableLong, cg); node->setRegister(targetRegister); return targetRegister; diff --git a/compiler/x/codegen/CodegenUtils.cpp b/compiler/x/codegen/CodegenUtils.cpp index 414bff14b72..a89ebf39ebd 100644 --- a/compiler/x/codegen/CodegenUtils.cpp +++ b/compiler/x/codegen/CodegenUtils.cpp @@ -19,14 +19,250 @@ * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0 *******************************************************************************/ -#include "CodegenUtils.hpp" -#include "OMRX86Instruction.hpp" +#include "il/Node.hpp" +#include "il/Node_inlines.hpp" +#include "x/codegen/CodegenUtils.hpp" + +extern bool existsNextInstructionToTestFlags(TR::Instruction *startInstr, + uint8_t testMask); 
namespace OMR { namespace X86 { +TR::Instruction *insertLoadConstant(TR::Node *node, + TR::Register *target, + intptr_t value, + TR_RematerializableTypes type, + TR::CodeGenerator *cg, + TR::Instruction *currentInstruction) + { + TR::Compilation *comp = cg->comp(); + static const TR::InstOpCode::Mnemonic ops[TR_NumRematerializableTypes+1][3] = + // load 0 load -1 load c + { { TR::InstOpCode::UD2, TR::InstOpCode::UD2, TR::InstOpCode::UD2 }, // LEA; should not be seen here + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Byte constant + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Short constant + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Char constant + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Int constant + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // 32-bit address constant + { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR8RegImms, TR::InstOpCode::UD2 } }; // Long address constant; MOVs handled specially + + enum { XOR = 0, OR = 1, MOV = 2 }; + + bool is64Bit = false; + + int opsRow = type; + if (cg->comp()->target().is64Bit()) + { + if (type == TR_RematerializableAddress) + { + // Treat 64-bit addresses as longs + opsRow++; + is64Bit = true; + } + else + { + is64Bit = (type == TR_RematerializableLong); + } + } + else + { + TR_ASSERT(type != TR_RematerializableLong, "Longs are rematerialized as pairs of ints on IA32"); + } + + TR_ExternalRelocationTargetKind reloKind = TR_NoRelocation; + if (cg->profiledPointersRequireRelocation() && node && node->getOpCodeValue() == TR::aconst && + (node->isClassPointerConstant() || node->isMethodPointerConstant())) + { + if (node->isClassPointerConstant()) + reloKind = TR_ClassPointer; + else if (node->isMethodPointerConstant()) + reloKind = TR_MethodPointer; + else + TR_ASSERT(0, "Unexpected
node, don't know how to relocate"); + } + + if (currentInstruction) + { + // Optimized loads inserted arbitrarily into the instruction stream must be checked + // to ensure they don't modify any eflags needed by surrounding instructions. + // + if ((value == 0 || value == -1)) + { + uint8_t EFlags = TR::InstOpCode::getModifiedEFlags(ops[opsRow][((value == 0) ? XOR : OR)]); + + if (existsNextInstructionToTestFlags(currentInstruction, EFlags) || cg->requiresCarry()) + { + // Can't alter flags, so must use MOV. Fall through. + } + else if (value == 0) + return generateRegRegInstruction(currentInstruction, ops[opsRow][XOR], target, target, cg); + else if (value == -1) + return generateRegImmInstruction(currentInstruction, ops[opsRow][OR], target, (uint32_t)-1, cg); + } + + // No luck optimizing this. Just use a MOV + // + TR::Instruction *movInstruction = NULL; + if (is64Bit) + { + if (cg->constantAddressesCanChangeSize(node) && node && node->getOpCodeValue() == TR::aconst && + (node->isClassPointerConstant() || node->isMethodPointerConstant())) + { + movInstruction = generateRegImm64Instruction(currentInstruction, TR::InstOpCode::MOV8RegImm64, target, value, cg, reloKind); + } + else if (IS_32BIT_UNSIGNED(value)) + { + // zero-extended 4-byte MOV + movInstruction = generateRegImmInstruction(currentInstruction, TR::InstOpCode::MOV4RegImm4, target, static_cast(value), cg, reloKind); + } + else if (IS_32BIT_SIGNED(value)) // TODO:AMD64: Is there some way we could get RIP too?
+ { + movInstruction = generateRegImmInstruction(currentInstruction, TR::InstOpCode::MOV8RegImm4, target, static_cast(value), cg, reloKind); + } + else + { + movInstruction = generateRegImm64Instruction(currentInstruction, TR::InstOpCode::MOV8RegImm64, target, value, cg, reloKind); + } + } + else + { + movInstruction = generateRegImmInstruction(currentInstruction, ops[opsRow][MOV], target, static_cast(value), cg, reloKind); + } + + if (target && node && + node->getOpCodeValue() == TR::aconst && + node->isClassPointerConstant() && + (cg->fe()->isUnloadAssumptionRequired((TR_OpaqueClassBlock *) node->getAddress(), + comp->getCurrentMethod()) || + cg->profiledPointersRequireRelocation())) + { + comp->getStaticPICSites()->push_front(movInstruction); + } + + if (target && node && + node->getOpCodeValue() == TR::aconst && + node->isMethodPointerConstant() && + (cg->fe()->isUnloadAssumptionRequired(cg->fe()->createResolvedMethod(cg->trMemory(), (TR_OpaqueMethodBlock *) node->getAddress(), comp->getCurrentMethod())->classOfMethod(), comp->getCurrentMethod()) || + cg->profiledPointersRequireRelocation())) + { + traceMsg(comp, "Adding instr %p to MethodPICSites for node %p\n", movInstruction, node); + comp->getStaticMethodPICSites()->push_front(movInstruction); + } + + return movInstruction; + } + else + { + // constant loads between a compare and a branch cannot clobber the EFLAGS register + bool canClobberEFLAGS = !(cg->getCurrentEvaluationTreeTop()->getNode()->getOpCode().isIf() || cg->requiresCarry()); + + if (value == 0 && canClobberEFLAGS) + { + return generateRegRegInstruction(ops[opsRow][XOR], node, target, target, cg); + } + else if (value == -1 && canClobberEFLAGS) + { + return generateRegImmInstruction(ops[opsRow][OR], node, target, (uint32_t)-1, cg); + } + else + { + TR::Instruction *movInstruction = NULL; + if (is64Bit) + { + if (cg->constantAddressesCanChangeSize(node) && node && node->getOpCodeValue() == TR::aconst && + (node->isClassPointerConstant() ||
node->isMethodPointerConstant())) + { + movInstruction = generateRegImm64Instruction(TR::InstOpCode::MOV8RegImm64, node, target, value, cg, reloKind); + } + else if (IS_32BIT_UNSIGNED(value)) + { + // zero-extended 4-byte MOV + movInstruction = generateRegImmInstruction(TR::InstOpCode::MOV4RegImm4, node, target, static_cast(value), cg, reloKind); + } + else if (IS_32BIT_SIGNED(value)) // TODO:AMD64: Is there some way we could get RIP too? + { + movInstruction = generateRegImmInstruction(TR::InstOpCode::MOV8RegImm4, node, target, static_cast(value), cg, reloKind); + } + else + { + movInstruction = generateRegImm64Instruction(TR::InstOpCode::MOV8RegImm64, node, target, value, cg, reloKind); + } + } + else + { + movInstruction = generateRegImmInstruction(ops[opsRow][MOV], node, target, static_cast(value), cg, reloKind); + } + + // HCR register PIC site in OMR::X86::insertLoadConstant + TR::Symbol *symbol = NULL; + if (node && node->getOpCode().hasSymbolReference()) + symbol = node->getSymbol(); + bool isPICCandidate = symbol ?
target && symbol->isStatic() && symbol->isClassObject() : false; + if (isPICCandidate && comp->getOption(TR_EnableHCR)) + { + comp->getStaticHCRPICSites()->push_front(movInstruction); + } + + if (target && + node && + node->getOpCodeValue() == TR::aconst && + node->isClassPointerConstant() && + (cg->fe()->isUnloadAssumptionRequired((TR_OpaqueClassBlock *) node->getAddress(), + comp->getCurrentMethod()) || + cg->profiledPointersRequireRelocation())) + { + comp->getStaticPICSites()->push_front(movInstruction); + } + + if (target && node && + node->getOpCodeValue() == TR::aconst && + node->isMethodPointerConstant() && + (cg->fe()->isUnloadAssumptionRequired(cg->fe()->createResolvedMethod(cg->trMemory(), (TR_OpaqueMethodBlock *) node->getAddress(), comp->getCurrentMethod())->classOfMethod(), comp->getCurrentMethod()) || + cg->profiledPointersRequireRelocation())) + { + traceMsg(comp, "Adding instr %p to MethodPICSites for node %p\n", movInstruction, node); + comp->getStaticMethodPICSites()->push_front(movInstruction); + } + + return movInstruction; + } + } + } + +TR::Register *loadConstant(TR::Node *node, + intptr_t value, + TR_RematerializableTypes type, + TR::CodeGenerator *cg, + TR::Register *targetRegister) + { + if (targetRegister == NULL) + { + targetRegister = cg->allocateRegister(); + } + + TR::Instruction *instr = OMR::X86::insertLoadConstant(node, targetRegister, value, type, cg); + + // Do not rematerialize register for class pointer or method pointer if + // it's AOT compilation because it doesn't have node info in register + // rematerialization to create relocation record for the class pointer + // or the method pointer.
+ if (cg->enableRematerialisation() && + !(cg->comp()->compileRelocatableCode() && node && node->getOpCodeValue() == TR::aconst && + (node->isClassPointerConstant() || node->isMethodPointerConstant()))) + { + if (node && node->getOpCode().hasSymbolReference() && node->getSymbol() && node->getSymbol()->isClassObject()) + (TR::Compiler->om.generateCompressedObjectHeaders() || cg->comp()->target().is32Bit()) + ? type = TR_RematerializableInt : type = TR_RematerializableLong; + + setDiscardableIfPossible(type, targetRegister, node, instr, value, cg); + } + + return targetRegister; + } + void generateLoop(int32_t unrollFactor, int32_t elementsPerIteration, TR::Register *indexReg, @@ -125,8 +361,8 @@ void generateLoop(int32_t begin, deps->addPostCondition(indexReg, RealRegister::NoReg, cg); deps->addPostCondition(loopBoundReg, RealRegister::NoReg, cg); - TreeEvaluator::loadConstant(node, begin, TR_RematerializableInt, cg, indexReg); - TreeEvaluator::loadConstant(node, end, TR_RematerializableInt, cg, loopBoundReg); + loadConstant(node, begin, TR_RematerializableInt, cg, indexReg); + loadConstant(node, end, TR_RematerializableInt, cg, loopBoundReg); OMR::X86::generateLoop(indexReg, loopBoundReg, node, cg, std::move(genBodyFunction)); generateLabelInstruction(TR::InstOpCode::label, node, label, deps, cg); diff --git a/compiler/x/codegen/CodegenUtils.hpp b/compiler/x/codegen/CodegenUtils.hpp index ed9cd77398d..f89d39acd12 100644 --- a/compiler/x/codegen/CodegenUtils.hpp +++ b/compiler/x/codegen/CodegenUtils.hpp @@ -22,7 +22,12 @@ #ifndef OMR_CODEGENUTILS_INCL #define OMR_CODEGENUTILS_INCL +#include "codegen/Machine.hpp" #include "codegen/CodeGenerator.hpp" +#include "x/codegen/OMRX86Instruction.hpp" +#include "x/codegen/RegisterRematerialization.hpp" + +namespace TR { class Node; } namespace OMR { @@ -30,6 +35,48 @@ namespace OMR namespace X86 { +/** + * @brief Inserts an instruction to load a constant value into a specified register.
+ * + * This function generates an instruction to load a constant value into the target register. + * The generated instruction can be appended to a specified instruction chain. + * + * @param node The IL node that generated the load constant (may be NULL) + * @param target The register into which the constant value should be loaded + * @param value The constant value to be loaded into the target register + * @param type The rematerializable type of the constant + * @param cg The code generator responsible for generating the instructions + * @param currentInstruction The instruction to which the new instruction will be appended (optional) + * + * @return A pointer to the generated instruction. + */ +TR::Instruction *insertLoadConstant(TR::Node *node, + TR::Register *target, + intptr_t value, + TR_RematerializableTypes type, + TR::CodeGenerator *cg, + TR::Instruction *currentInstruction = NULL); + +/** + * @brief Loads a constant value into a register and returns the register. + * + * This function either loads the specified constant value into the provided target register, + * or allocates a new register to hold the constant if no target is provided. + * + * @param node The IL node associated with this operation + * @param value The constant value to load into the register + * @param type The rematerializable type of the constant + * @param cg The code generator managing the instruction generation process + * @param targetRegister The target register to load the value into (optional) + * + * @return A pointer to the register containing the loaded constant value.
+ */ +TR::Register *loadConstant(TR::Node *node, + intptr_t value, + TR_RematerializableTypes type, + TR::CodeGenerator *cg, + TR::Register *targetRegister = NULL); + /** * @brief Generates loop control flow with start and end bounds * diff --git a/compiler/x/codegen/ControlFlowEvaluator.cpp b/compiler/x/codegen/ControlFlowEvaluator.cpp index 510602c4ae9..330a7cad769 100644 --- a/compiler/x/codegen/ControlFlowEvaluator.cpp +++ b/compiler/x/codegen/ControlFlowEvaluator.cpp @@ -74,6 +74,7 @@ #include "x/codegen/CompareAnalyser.hpp" #include "x/codegen/FPTreeEvaluator.hpp" #include "x/codegen/X86Instruction.hpp" +#include "x/codegen/CodegenUtils.hpp" #include "codegen/InstOpCode.hpp" class TR_OpaqueClassBlock; @@ -708,7 +709,7 @@ void OMR::X86::TreeEvaluator::compareIntegersForEquality(TR::Node *node, TR::Cod else if(andSecondChild->getSize() == 2) { TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, mask, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, mask, TR_RematerializableShort, cg, tempReg); generateMemRegInstruction(TR::InstOpCode::TEST2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } @@ -827,7 +828,7 @@ void OMR::X86::TreeEvaluator::compareIntegersForEquality(TR::Node *node, TR::Cod { //shouldn't use Imm2 instructions TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, 0, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, 0, TR_RematerializableShort, cg, tempReg); generateMemRegInstruction(TR::InstOpCode::CMP2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } @@ -889,7 +890,7 @@ void OMR::X86::TreeEvaluator::compareIntegersForEquality(TR::Node *node, TR::Cod { //shouldn't use Imm2 instructions TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, constValue, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, constValue, TR_RematerializableShort, cg, 
tempReg); generateMemRegInstruction(TR::InstOpCode::CMP2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } @@ -1054,7 +1055,7 @@ void OMR::X86::TreeEvaluator::compareIntegersForOrder( { //shouldn't use Imm2 instructions TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, constValue, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, constValue, TR_RematerializableShort, cg, tempReg); generateMemRegInstruction(TR::InstOpCode::CMP2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } @@ -1150,7 +1151,7 @@ void OMR::X86::TreeEvaluator::compare2BytesForOrder(TR::Node *node, TR::CodeGene else { TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, value, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, value, TR_RematerializableShort, cg, tempReg); generateMemRegInstruction(TR::InstOpCode::CMP2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } @@ -1905,7 +1906,7 @@ TR::Register *OMR::X86::TreeEvaluator::ifscmpeqEvaluator(TR::Node *node, TR::Cod { //try to avoid Imm2 instructions TR::Register *tempReg = cg->allocateRegister(); - TR::TreeEvaluator::loadConstant(node, value, TR_RematerializableShort, cg, tempReg); + OMR::X86::loadConstant(node, value, TR_RematerializableShort, cg, tempReg); generateMemRegInstruction(TR::InstOpCode::CMP2MemReg, node, tempMR, tempReg, cg); cg->stopUsingRegister(tempReg); } diff --git a/compiler/x/codegen/OMRMachine.cpp b/compiler/x/codegen/OMRMachine.cpp index e01ce6a01fd..0c520093150 100644 --- a/compiler/x/codegen/OMRMachine.cpp +++ b/compiler/x/codegen/OMRMachine.cpp @@ -62,6 +62,7 @@ #include "codegen/X86Instruction.hpp" #include "codegen/InstOpCode.hpp" #include "x/codegen/X86Register.hpp" +#include "x/codegen/CodegenUtils.hpp" extern bool existsNextInstructionToTestFlags(TR::Instruction *startInstr, uint8_t testMask); @@ -838,7 +839,7 @@ TR::RealRegister 
*OMR::X86::Machine::freeBestGPRegister(TR::Instruction } else { - instr = TR::TreeEvaluator::insertLoadConstant(0, best, info->getConstant(), info->getDataType(), self()->cg(), currentInstruction); + instr = OMR::X86::insertLoadConstant(0, best, info->getConstant(), info->getDataType(), self()->cg(), currentInstruction); } } diff --git a/compiler/x/codegen/OMRTreeEvaluator.cpp b/compiler/x/codegen/OMRTreeEvaluator.cpp index dc93d962373..9eb4c0ba6dd 100644 --- a/compiler/x/codegen/OMRTreeEvaluator.cpp +++ b/compiler/x/codegen/OMRTreeEvaluator.cpp @@ -174,232 +174,6 @@ void OMR::X86::TreeEvaluator::compareGPRegisterToImmediateForEquality(TR::Node generateRegImmInstruction(cmpOp, node, cmpRegister, value, cg); } -TR::Instruction *OMR::X86::TreeEvaluator::insertLoadConstant(TR::Node *node, - TR::Register *target, - intptr_t value, - TR_RematerializableTypes type, - TR::CodeGenerator *cg, - TR::Instruction *currentInstruction) - { - TR::Compilation *comp = cg->comp(); - static const TR::InstOpCode::Mnemonic ops[TR_NumRematerializableTypes+1][3] = - // load 0 load -1 load c - { { TR::InstOpCode::UD2, TR::InstOpCode::UD2, TR::InstOpCode::UD2 }, // LEA; should not seen here - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Byte constant - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Short constant - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Char constant - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // Int constant - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR4RegImms, TR::InstOpCode::MOV4RegImm4 }, // 32-bit address constant - { TR::InstOpCode::XOR4RegReg, TR::InstOpCode::OR8RegImms, TR::InstOpCode::UD2 } }; // Long address constant; MOVs handled specially - - enum { XOR = 0, OR = 1, MOV = 2 }; - - bool is64Bit = false; - - int opsRow = type; - if (cg->comp()->target().is64Bit()) - { 
- if (type == TR_RematerializableAddress) - { - // Treat 64-bit addresses as longs - opsRow++; - is64Bit = true; - } - else - { - is64Bit = (type == TR_RematerializableLong); - } - } - else - { - TR_ASSERT(type != TR_RematerializableLong, "Longs are rematerialized as pairs of ints on IA32"); - } - - TR_ExternalRelocationTargetKind reloKind = TR_NoRelocation; - if (cg->profiledPointersRequireRelocation() && node && node->getOpCodeValue() == TR::aconst && - (node->isClassPointerConstant() || node->isMethodPointerConstant())) - { - if (node->isClassPointerConstant()) - reloKind = TR_ClassPointer; - else if (node->isMethodPointerConstant()) - reloKind = TR_MethodPointer; - else - TR_ASSERT(0, "Unexpected node, don't know how to relocate"); - } - - if (currentInstruction) - { - // Optimized loads inserted arbitrarily into the instruction stream must be checked - // to ensure they don't modify any eflags needed by surrounding instructions. - // - if ((value == 0 || value == -1)) - { - uint8_t EFlags = TR::InstOpCode::getModifiedEFlags(ops[opsRow][((value == 0) ? XOR : OR)]); - - if (existsNextInstructionToTestFlags(currentInstruction, EFlags) || cg->requiresCarry()) - { - // Can't alter flags, so must use MOV. Fall through. - } - else if (value == 0) - return generateRegRegInstruction(currentInstruction, ops[opsRow][XOR], target, target, cg); - else if (value == -1) - return generateRegImmInstruction(currentInstruction, ops[opsRow][OR], target, (uint32_t)-1, cg); - } - - // No luck optimizing this. 
Just use a MOV - // - TR::Instruction *movInstruction = NULL; - if (is64Bit) - { - if (cg->constantAddressesCanChangeSize(node) && node && node->getOpCodeValue() == TR::aconst && - (node->isClassPointerConstant() || node->isMethodPointerConstant())) - { - movInstruction = generateRegImm64Instruction(currentInstruction, TR::InstOpCode::MOV8RegImm64, target, value, cg, reloKind); - } - else if (IS_32BIT_UNSIGNED(value)) - { - // zero-extended 4-byte MOV - movInstruction = generateRegImmInstruction(currentInstruction, TR::InstOpCode::MOV4RegImm4, target, static_cast(value), cg, reloKind); - } - else if (IS_32BIT_SIGNED(value)) // TODO:AMD64: Is there some way we could get RIP too? - { - movInstruction = generateRegImmInstruction(currentInstruction, TR::InstOpCode::MOV8RegImm4, target, static_cast(value), cg, reloKind); - } - else - { - movInstruction = generateRegImm64Instruction(currentInstruction, TR::InstOpCode::MOV8RegImm64, target, value, cg, reloKind); - } - } - else - { - movInstruction = generateRegImmInstruction(currentInstruction, ops[opsRow][MOV], target, static_cast(value), cg, reloKind); - } - - if (target && node && - node->getOpCodeValue() == TR::aconst && - node->isClassPointerConstant() && - (cg->fe()->isUnloadAssumptionRequired((TR_OpaqueClassBlock *) node->getAddress(), - comp->getCurrentMethod()) || - cg->profiledPointersRequireRelocation())) - { - comp->getStaticPICSites()->push_front(movInstruction); - } - - if (target && node && - node->getOpCodeValue() == TR::aconst && - node->isMethodPointerConstant() && - (cg->fe()->isUnloadAssumptionRequired(cg->fe()->createResolvedMethod(cg->trMemory(), (TR_OpaqueMethodBlock *) node->getAddress(), comp->getCurrentMethod())->classOfMethod(), comp->getCurrentMethod()) || - cg->profiledPointersRequireRelocation())) - { - traceMsg(comp, "Adding instr %p to MethodPICSites for node %p\n", movInstruction, node); - comp->getStaticMethodPICSites()->push_front(movInstruction); - } - - return movInstruction; - } - 
else - { - // constant loads between a compare and a branch cannot clobber the EFLAGS register - bool canClobberEFLAGS = !(cg->getCurrentEvaluationTreeTop()->getNode()->getOpCode().isIf() || cg->requiresCarry()); - - if (value == 0 && canClobberEFLAGS) - { - return generateRegRegInstruction(ops[opsRow][XOR], node, target, target, cg); - } - else if (value == -1 && canClobberEFLAGS) - { - return generateRegImmInstruction(ops[opsRow][OR], node, target, (uint32_t)-1, cg); - } - else - { - TR::Instruction *movInstruction = NULL; - if (is64Bit) - { - if (cg->constantAddressesCanChangeSize(node) && node && node->getOpCodeValue() == TR::aconst && - (node->isClassPointerConstant() || node->isMethodPointerConstant())) - { - movInstruction = generateRegImm64Instruction(TR::InstOpCode::MOV8RegImm64, node, target, value, cg, reloKind); - } - else if (IS_32BIT_UNSIGNED(value)) - { - // zero-extended 4-byte MOV - movInstruction = generateRegImmInstruction(TR::InstOpCode::MOV4RegImm4, node, target, static_cast(value), cg, reloKind); - } - else if (IS_32BIT_SIGNED(value)) // TODO:AMD64: Is there some way we could get RIP too? - { - movInstruction = generateRegImmInstruction(TR::InstOpCode::MOV8RegImm4, node, target, static_cast(value), cg, reloKind); - } - else - { - movInstruction = generateRegImm64Instruction(TR::InstOpCode::MOV8RegImm64, node, target, value, cg, reloKind); - } - } - else - { - movInstruction = generateRegImmInstruction(ops[opsRow][MOV], node, target, static_cast(value), cg, reloKind); - } - - // HCR register PIC site in TR::TreeEvaluator::insertLoadConstant - TR::Symbol *symbol = NULL; - if (node && node->getOpCode().hasSymbolReference()) - symbol = node->getSymbol(); - bool isPICCandidate = symbol ? 
target && symbol->isStatic() && symbol->isClassObject() : false; - if (isPICCandidate && comp->getOption(TR_EnableHCR)) - { - comp->getStaticHCRPICSites()->push_front(movInstruction); - } - - if (target && - node && - node->getOpCodeValue() == TR::aconst && - node->isClassPointerConstant() && - (cg->fe()->isUnloadAssumptionRequired((TR_OpaqueClassBlock *) node->getAddress(), - comp->getCurrentMethod()) || - cg->profiledPointersRequireRelocation())) - { - comp->getStaticPICSites()->push_front(movInstruction); - } - - if (target && node && - node->getOpCodeValue() == TR::aconst && - node->isMethodPointerConstant() && - (cg->fe()->isUnloadAssumptionRequired(cg->fe()->createResolvedMethod(cg->trMemory(), (TR_OpaqueMethodBlock *) node->getAddress(), comp->getCurrentMethod())->classOfMethod(), comp->getCurrentMethod()) || - cg->profiledPointersRequireRelocation())) - { - traceMsg(comp, "Adding instr %p to MethodPICSites for node %p\n", movInstruction, node); - comp->getStaticMethodPICSites()->push_front(movInstruction); - } - - return movInstruction; - } - } - } - -TR::Register *OMR::X86::TreeEvaluator::loadConstant(TR::Node * node, intptr_t value, TR_RematerializableTypes type, TR::CodeGenerator *cg, TR::Register *targetRegister) - { - if (targetRegister == NULL) - { - targetRegister = cg->allocateRegister(); - } - - TR::Instruction *instr = TR::TreeEvaluator::insertLoadConstant(node, targetRegister, value, type, cg); - - // Do not rematerialize register for class pointer or method pointer if - // it's AOT compilation because it doesn't have node info in register - // rematerialization to create relocation record for the class pointer - // or the method pointer. 
- if (cg->enableRematerialisation() && - !(cg->comp()->compileRelocatableCode() && node && node->getOpCodeValue() == TR::aconst && (node->isClassPointerConstant() || node->isMethodPointerConstant()))) - { - if (node && node->getOpCode().hasSymbolReference() && node->getSymbol() && node->getSymbol()->isClassObject()) - (TR::Compiler->om.generateCompressedObjectHeaders() || cg->comp()->target().is32Bit()) ? type = TR_RematerializableInt : type = TR_RematerializableLong; - - setDiscardableIfPossible(type, targetRegister, node, instr, value, cg); - } - - return targetRegister; - } - TR::Instruction * OMR::X86::TreeEvaluator::insertLoadMemory( TR::Node *node, diff --git a/compiler/x/codegen/OMRTreeEvaluator.hpp b/compiler/x/codegen/OMRTreeEvaluator.hpp index 52c571ebd0c..d892fa8bca6 100644 --- a/compiler/x/codegen/OMRTreeEvaluator.hpp +++ b/compiler/x/codegen/OMRTreeEvaluator.hpp @@ -449,12 +449,6 @@ class OMR_EXTENSIBLE TreeEvaluator: public OMR::TreeEvaluator static bool VMinlineCallEvaluator(TR::Node *node, bool isIndirect, TR::CodeGenerator *cg); static TR::Instruction *VMtestForReferenceArray(TR::Node *, TR::Register *objectReg, TR::CodeGenerator *cg); static bool genNullTestSequence(TR::Node *node, TR::Register *opReg, TR::Register *targetReg, TR::CodeGenerator *cg); - static TR::Instruction *insertLoadConstant(TR::Node *node, - TR::Register *target, - intptr_t value, - TR_RematerializableTypes type, - TR::CodeGenerator *cg, - TR::Instruction *currentInstruction = NULL); static TR::Instruction *insertLoadMemory(TR::Node *node, TR::Register *target, @@ -515,8 +509,6 @@ class OMR_EXTENSIBLE TreeEvaluator: public OMR::TreeEvaluator static TR::Register *performCall(TR::Node *node, bool isIndirect, bool spillFPRegs, TR::CodeGenerator *cg); - static TR::Register *loadConstant(TR::Node *node, intptr_t value, TR_RematerializableTypes t, TR::CodeGenerator *cg, TR::Register *targetRegister = NULL); - static bool setCarryBorrow(TR::Node *flagNode, bool invertValue, 
TR::CodeGenerator *cg); static void arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, diff --git a/compiler/x/codegen/SubtractAnalyser.cpp b/compiler/x/codegen/SubtractAnalyser.cpp index a1575b903f0..4880c8ad2b0 100644 --- a/compiler/x/codegen/SubtractAnalyser.cpp +++ b/compiler/x/codegen/SubtractAnalyser.cpp @@ -39,6 +39,7 @@ #include "infra/Assert.hpp" #include "codegen/X86Instruction.hpp" #include "codegen/InstOpCode.hpp" +#include "x/codegen/CodegenUtils.hpp" /* * \brief @@ -144,7 +145,7 @@ TR::Register* TR_X86SubtractAnalyser::integerSubtractAnalyserImpl(TR::Node * // firstchild is an inconst and it has not been evaluated. // Generate the code for an iconst. firstRegister = _cg->allocateRegister(); - TR::TreeEvaluator::insertLoadConstant(firstChild, firstRegister, firstChild->getInt(), TR_RematerializableInt, _cg); + OMR::X86::insertLoadConstant(firstChild, firstRegister, firstChild->getInt(), TR_RematerializableInt, _cg); } else { diff --git a/compiler/x/codegen/UnaryEvaluator.cpp b/compiler/x/codegen/UnaryEvaluator.cpp index 88eb02ebe4c..5739ec7b87f 100644 --- a/compiler/x/codegen/UnaryEvaluator.cpp +++ b/compiler/x/codegen/UnaryEvaluator.cpp @@ -33,6 +33,7 @@ #include "codegen/X86Instruction.hpp" #include "codegen/InstOpCode.hpp" #include "env/CompilerEnv.hpp" +#include "x/codegen/CodegenUtils.hpp" extern TR::Register *intOrLongClobberEvaluate(TR::Node *node, bool nodeIs64Bit, TR::CodeGenerator *cg); @@ -162,7 +163,7 @@ TR::Register *OMR::X86::TreeEvaluator::unaryVectorArithmeticEvaluator(TR::Node * TR::Register *OMR::X86::TreeEvaluator::bconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *reg = TR::TreeEvaluator::loadConstant(node, node->getInt(), TR_RematerializableByte, cg); + TR::Register *reg = OMR::X86::loadConstant(node, node->getInt(), TR_RematerializableByte, cg); node->setRegister(reg); if (cg->enableRegisterInterferences()) @@ -173,14 +174,14 @@ TR::Register 
*OMR::X86::TreeEvaluator::bconstEvaluator(TR::Node *node, TR::CodeG TR::Register *OMR::X86::TreeEvaluator::sconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *reg = TR::TreeEvaluator::loadConstant(node, node->getInt(), TR_RematerializableShort, cg); + TR::Register *reg = OMR::X86::loadConstant(node, node->getInt(), TR_RematerializableShort, cg); node->setRegister(reg); return reg; } TR::Register *OMR::X86::TreeEvaluator::iconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *reg = TR::TreeEvaluator::loadConstant(node, node->getInt(), TR_RematerializableInt, cg); + TR::Register *reg = OMR::X86::loadConstant(node, node->getInt(), TR_RematerializableInt, cg); node->setRegister(reg); return reg; } diff --git a/compiler/x/i386/codegen/OMRTreeEvaluator.cpp b/compiler/x/i386/codegen/OMRTreeEvaluator.cpp index 52692611982..675a5fc8256 100644 --- a/compiler/x/i386/codegen/OMRTreeEvaluator.cpp +++ b/compiler/x/i386/codegen/OMRTreeEvaluator.cpp @@ -66,6 +66,7 @@ #include "x/codegen/IntegerMultiplyDecomposer.hpp" #include "x/codegen/SubtractAnalyser.hpp" #include "x/codegen/X86Instruction.hpp" +#include "x/codegen/CodegenUtils.hpp" #include "codegen/InstOpCode.hpp" class TR_OpaqueMethodBlock; @@ -2350,7 +2351,7 @@ OMR::X86::I386::TreeEvaluator::lbitpermuteEvaluator(TR::Node *node, TR::CodeGene TR::Register *OMR::X86::I386::TreeEvaluator::aconstEvaluator(TR::Node *node, TR::CodeGenerator *cg) { - TR::Register *reg = loadConstant(node, node->getInt(), TR_RematerializableAddress, cg); + TR::Register *reg = OMR::X86::loadConstant(node, node->getInt(), TR_RematerializableAddress, cg); node->setRegister(reg); return reg; } @@ -2367,7 +2368,7 @@ TR::Register *OMR::X86::I386::TreeEvaluator::lconstEvaluator(TR::Node *node, TR: if (lowValue <= highValue) { lowRegister = cg->allocateRegister(); - highRegister = loadConstant(node, highValue, TR_RematerializableInt, cg); + highRegister = OMR::X86::loadConstant(node, highValue, TR_RematerializableInt, 
cg); if (lowValue == highValue) { generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, lowRegister, highRegister, cg); @@ -2380,7 +2381,7 @@ TR::Register *OMR::X86::I386::TreeEvaluator::lconstEvaluator(TR::Node *node, TR: } else { - lowRegister = loadConstant(node, lowValue, TR_RematerializableInt, cg); + lowRegister = OMR::X86::loadConstant(node, lowValue, TR_RematerializableInt, cg); highRegister = cg->allocateRegister(); generateRegMemInstruction(TR::InstOpCode::LEA4RegMem, node, highRegister, generateX86MemoryReference(lowRegister, highValue - lowValue, cg), cg); @@ -2388,8 +2389,8 @@ TR::Register *OMR::X86::I386::TreeEvaluator::lconstEvaluator(TR::Node *node, TR: } else { - lowRegister = loadConstant(node, lowValue, TR_RematerializableInt, cg); - highRegister = loadConstant(node, highValue, TR_RematerializableInt, cg); + lowRegister = OMR::X86::loadConstant(node, lowValue, TR_RematerializableInt, cg); + highRegister = OMR::X86::loadConstant(node, highValue, TR_RematerializableInt, cg); } TR::RegisterPair *longRegister = cg->allocateRegisterPair(lowRegister, highRegister); @@ -2480,7 +2481,7 @@ TR::Register *OMR::X86::I386::TreeEvaluator::lstoreEvaluator(TR::Node *node, TR: if (lowValue == highValue) { - TR::Register *valueReg = loadConstant(node, lowValue, TR_RematerializableInt, cg); + TR::Register *valueReg = OMR::X86::loadConstant(node, lowValue, TR_RematerializableInt, cg); instr = generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, lowMR, valueReg, cg); generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, highMR, valueReg, cg); cg->stopUsingRegister(valueReg); @@ -3189,7 +3190,7 @@ TR::Register *OMR::X86::I386::TreeEvaluator::integerPairMulEvaluator(TR::Node *n { highRegister = cg->allocateRegister(); } - lowRegister = loadConstant(node, lowValue, TR_RematerializableInt, cg); + lowRegister = OMR::X86::loadConstant(node, lowValue, TR_RematerializableInt, cg); TR::RegisterDependencyConditions *dependencies = 
generateRegisterDependencyConditions((uint8_t)2, 2, cg); dependencies->addPreCondition(lowRegister, TR::RealRegister::eax, cg); dependencies->addPreCondition(highRegister, TR::RealRegister::edx, cg); @@ -3377,7 +3378,7 @@ TR::Register *OMR::X86::I386::TreeEvaluator::integerPairMulEvaluator(TR::Node *n // Second lowOrder * lowValue - lowRegister = loadConstant(node, lowValue, TR_RematerializableInt, cg); + lowRegister = OMR::X86::loadConstant(node, lowValue, TR_RematerializableInt, cg); TR::RegisterDependencyConditions *dependencies = generateRegisterDependencyConditions((uint8_t)2, 2, cg); dependencies->addPreCondition(lowRegister, TR::RealRegister::eax, cg); dependencies->addPreCondition(highRegister, TR::RealRegister::edx, cg); From 80c5dd1449c7cf286c0039375ce5da02e3693f1d Mon Sep 17 00:00:00 2001 From: Bradley Wood Date: Fri, 17 Jan 2025 12:07:09 -0500 Subject: [PATCH 3/5] x86: create helpers macros to manage registers Signed-off-by: Bradley Wood --- compiler/x/codegen/CodegenUtils.hpp | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/compiler/x/codegen/CodegenUtils.hpp b/compiler/x/codegen/CodegenUtils.hpp index f89d39acd12..acb32cbb963 100644 --- a/compiler/x/codegen/CodegenUtils.hpp +++ b/compiler/x/codegen/CodegenUtils.hpp @@ -29,6 +29,36 @@ namespace TR { class Node; }
+/*
+ * SETUP_CUSTOM_REGISTER
+ *
+ * Allocates one register of kind `type` from `cg`, stores it in the
+ * already-declared variable `varName`, and adds it to `deps` as a
+ * post-condition with TR::RealRegister::NoReg.
+ *
+ * The expansion is a single braced block so that both statements stay
+ * together when the macro is the body of an unbraced `if`/`for`/`while`.
+ * Macro arguments are parenthesized so that expression arguments expand
+ * with the intended precedence.
+ */
+#define SETUP_CUSTOM_REGISTER(type, cg, deps, varName) \
+   { \
+   (varName) = (cg)->allocateRegister(type); \
+   (deps)->addPostCondition((varName), TR::RealRegister::NoReg, (cg)); \
+   }
+
+/*
+ * Fixed-arity helpers. The numeric suffix is the total number of macro
+ * arguments (type, cg, deps, plus one to five register variables). Each
+ * multi-register variant is wrapped in its own block for the same reason
+ * as SETUP_CUSTOM_REGISTER.
+ */
+#define SETUP_CUSTOM_REGISTERS_4(type, cg, deps, varA) \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varA)
+#define SETUP_CUSTOM_REGISTERS_5(type, cg, deps, varA, varB) \
+   { \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varA) \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varB) \
+   }
+#define SETUP_CUSTOM_REGISTERS_6(type, cg, deps, varA, varB, varC) \
+   { \
+   SETUP_CUSTOM_REGISTERS_5(type, cg, deps, varA, varB) \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varC) \
+   }
+#define SETUP_CUSTOM_REGISTERS_7(type, cg, deps, varA, varB, varC, varD) \
+   { \
+   SETUP_CUSTOM_REGISTERS_6(type, cg, deps, varA, varB, varC) \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varD) \
+   }
+#define SETUP_CUSTOM_REGISTERS_8(type, cg, deps, varA, varB, varC, varD, varE) \
+   { \
+   SETUP_CUSTOM_REGISTERS_7(type, cg, deps, varA, varB, varC, varD) \
+   SETUP_CUSTOM_REGISTER(type, cg, deps, varE) \
+   }
+
+/*
+ * GET_MACRO expands to its ninth argument. SETUP_CUSTOM_REGISTERS passes its
+ * own arguments followed by the _8 ... _4 helper names, so the ninth argument
+ * is the helper whose suffix equals the number of arguments supplied.
+ * Only 4 to 8 arguments (1 to 5 register variables) are supported.
+ */
+#define GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, NAME,...) NAME
+#define SETUP_CUSTOM_REGISTERS(...) GET_MACRO(__VA_ARGS__, SETUP_CUSTOM_REGISTERS_8, SETUP_CUSTOM_REGISTERS_7, SETUP_CUSTOM_REGISTERS_6, SETUP_CUSTOM_REGISTERS_5, SETUP_CUSTOM_REGISTERS_4)(__VA_ARGS__)
+
+
+/*
+ * STOP_USING_REGISTERS
+ *
+ * Calls cg->stopUsingRegister() on every register listed after `cg`.
+ * The index is size_t to match the unsigned result of sizeof and avoid a
+ * signed/unsigned comparison. At least one register must be supplied,
+ * because the arguments initialize a local array.
+ */
+#define STOP_USING_REGISTERS(cg, ...) \
+   { \
+   TR::Register *registers[] = { __VA_ARGS__ }; \
+   for (size_t i = 0; i < sizeof(registers) / sizeof(registers[0]); ++i) \
+      { \
+      (cg)->stopUsingRegister(registers[i]); \
+      } \
+   }
+
+/* Convenience wrappers that fix the register kind passed to SETUP_CUSTOM_REGISTERS. */
+#define SETUP_GPR_REGISTERS(cg, deps, ...) SETUP_CUSTOM_REGISTERS(TR_GPR, cg, deps, __VA_ARGS__)
+#define SETUP_VRF_REGISTERS(cg, deps, ...) SETUP_CUSTOM_REGISTERS(TR_VRF, cg, deps, __VA_ARGS__)
+#define SETUP_FPR_REGISTERS(cg, deps, ...)
SETUP_CUSTOM_REGISTERS(TR_FPR, cg, deps, __VA_ARGS__) + namespace OMR { From 251cb7bbc83f649c0e6324fc00fca662667358a3 Mon Sep 17 00:00:00 2001 From: Bradley Wood Date: Fri, 17 Jan 2025 13:23:55 -0500 Subject: [PATCH 4/5] x86: add helper to find VL for given SIMD opcodes Signed-off-by: Bradley Wood --- compiler/x/codegen/CodegenUtils.cpp | 18 ++++++++++++++++++ compiler/x/codegen/CodegenUtils.hpp | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/compiler/x/codegen/CodegenUtils.cpp b/compiler/x/codegen/CodegenUtils.cpp index a89ebf39ebd..faf32aaa8d0 100644 --- a/compiler/x/codegen/CodegenUtils.cpp +++ b/compiler/x/codegen/CodegenUtils.cpp @@ -371,5 +371,23 @@ void generateLoop(int32_t begin, cg->stopUsingRegister(loopBoundReg); }
+// Returns the widest vector length, starting from TR::VectorLength512 and
+// stepping down one enumerator at a time, for which every opcode in
+// opcodes[0 .. numOpcodes-1] reports a SIMD encoding other than
+// OMR::X86::Bad on the target CPU. Returns TR::NoVectorLength when the search
+// reaches that enumerator without success. With numOpcodes == 0 there is
+// nothing to reject, so TR::VectorLength512 is returned.
+TR::VectorLength maxVectorLength(TR::CodeGenerator *cg, TR::InstOpCode *opcodes, size_t numOpcodes)
+   {
+   TR::VectorLength maxLength = TR::VectorLength512;
+
+   while (maxLength != TR::NoVectorLength)
+      {
+      bool allSupported = true;
+
+      // size_t index: numOpcodes is unsigned, so avoid a signed/unsigned comparison.
+      for (size_t i = 0; i < numOpcodes; i++)
+         {
+         OMR::X86::Encoding encoding = opcodes[i].getSIMDEncoding(&cg->comp()->target().cpu, maxLength);
+
+         if (encoding == OMR::X86::Bad)
+            {
+            allSupported = false;
+            break;
+            }
+         }
+
+      if (allSupported)
+         break;
+
+      // At least one opcode cannot be encoded at this length; retry every
+      // opcode at the next shorter length.
+      maxLength = static_cast<TR::VectorLength>(maxLength - 1);
+      }
+
+   return maxLength;
+   }
+
 } } diff --git a/compiler/x/codegen/CodegenUtils.hpp b/compiler/x/codegen/CodegenUtils.hpp index acb32cbb963..95e79291d84 100644 --- a/compiler/x/codegen/CodegenUtils.hpp +++ b/compiler/x/codegen/CodegenUtils.hpp @@ -245,6 +245,27 @@ inline void generateLoop(TR::Register *indexReg, generateLoop(1, 1, indexReg, maxIndexReg, node, cg, NULL, genBodyFunction, NULL); } +/** + * @brief Determines the maximum vector length supported by a set of SIMD opcodes. + * + * This function iterates through a list of opcodes and checks the maximum vector length + * they support by querying the SIMD encoding capabilities of the target CPU.
If an opcode + * does not support a particular vector length, the function reduces the length and rechecks + * all opcodes until a common maximum vector length is determined. + * + * @param cg Pointer to the code generator, used to access the target CPU features + * @param opcodes Array of opcodes to evaluate + * @param numOpcodes Number of opcodes in the array + * + * @return The maximum vector length supported across all given opcodes, or `TR::NoVectorLength` if none are supported + */ +TR::VectorLength maxVectorLength(TR::CodeGenerator *cg, TR::InstOpCode *opcodes, size_t numOpcodes); + +#define MAX_VECTOR_LENGTH(cg, ...) \ + [&]() -> TR::VectorLength { \ + TR::InstOpCode opcodes[] = {__VA_ARGS__}; \ + return maxVectorLength(cg, opcodes, sizeof(opcodes) / sizeof(opcodes[0])); \ + }() } From 02b768f441704b5ea2d5abfba3a95501b8560a95 Mon Sep 17 00:00:00 2001 From: Bradley Wood Date: Fri, 17 Jan 2025 17:54:34 -0500 Subject: [PATCH 5/5] x86: Move broadcast logic into CodegenUtils helper Signed-off-by: Bradley Wood --- compiler/x/codegen/CodegenUtils.cpp | 130 +++++++++++++++++++++++ compiler/x/codegen/CodegenUtils.hpp | 108 +++++++++++++++++++ compiler/x/codegen/SIMDTreeEvaluator.cpp | 70 +----------- 3 files changed, 240 insertions(+), 68 deletions(-) diff --git a/compiler/x/codegen/CodegenUtils.cpp b/compiler/x/codegen/CodegenUtils.cpp index faf32aaa8d0..8cc28cd9142 100644 --- a/compiler/x/codegen/CodegenUtils.cpp +++ b/compiler/x/codegen/CodegenUtils.cpp @@ -389,5 +389,135 @@ TR::VectorLength maxVectorLength(TR::CodeGenerator *cg, TR::InstOpCode *opcodes, return maxLength; }
+
+// Emits the instruction sequence that replicates the scalar held in srcReg
+// into every element of a vector register and returns that vector register.
+//
+// The sequence has three stages:
+//   1. move the scalar into the low lane of targetReg,
+//   2. for 8- and 16-bit elements, replicate it across the low 64 bits,
+//   3. replicate the low 32- or 64-bit lane across the full width chosen by vl.
+//
+// targetReg may be NULL, in which case a TR_VRF register is allocated here.
+// srcReg is dereferenced unconditionally, so callers must pass a non-NULL
+// register that is not of kind TR_VRF.
+TR::Register *broadcast(TR::Node *node,
+                        TR::CodeGenerator *cg,
+                        TR::VectorLength vl,
+                        TR::DataType dt,
+                        TR::Register *targetReg,
+                        TR::Register *srcReg)
+   {
+   if (!targetReg)
+      {
+      targetReg = cg->allocateRegister(TR_VRF);
+      }
+
+   TR_ASSERT_FATAL(targetReg->getKind() == TR_VRF, "Target register must be a vector in broadcast operations");
+   TR_ASSERT_FATAL(srcReg->getKind() != TR_VRF, "Source register must not be a vector");
+
+   // 64-bit element types are replicated per quadword, everything else per doubleword.
+   bool broadcast64 = dt.isInt64() || dt.isDouble();
+
+   // Stage 1: place the scalar in the low lane of targetReg.
+   switch (dt)
+      {
+      case TR::Int8:
+      case TR::Int16:
+      case TR::Int32:
+         generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, targetReg, srcReg, cg);
+         break;
+      case TR::Int64:
+         if (cg->comp()->target().is32Bit())
+            {
+            // On 32-bit targets srcReg is used as a register pair: the high half is
+            // moved into a temporary and shifted left by 32 bits (PSLLQ 0x20), the
+            // low half is moved into targetReg, and the two are OR'ed together.
+            TR::Register* tempVectorReg = cg->allocateRegister(TR_VRF);
+            generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, tempVectorReg, srcReg->getHighOrder(), cg);
+            generateRegImmInstruction(TR::InstOpCode::PSLLQRegImm1, node, tempVectorReg, 0x20, cg);
+            generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, targetReg, srcReg->getLowOrder(), cg);
+            generateRegRegInstruction(TR::InstOpCode::PORRegReg, node, targetReg, tempVectorReg, cg);
+            cg->stopUsingRegister(tempVectorReg);
+            }
+         else
+            {
+            generateRegRegInstruction(TR::InstOpCode::MOVQRegReg8, node, targetReg, srcReg, cg);
+            }
+         break;
+      case TR::Float:
+      case TR::Double:
+         // The same MOVSD register move is emitted for both floating-point types.
+         generateRegRegInstruction(TR::InstOpCode::MOVSDRegReg, node, targetReg, srcReg, cg);
+         break;
+      default:
+         if (cg->comp()->getOption(TR_TraceCG))
+            traceMsg(cg->comp(), "Unsupported data type, Node = %p\n", node);
+         TR_ASSERT_FATAL(false, "Unsupported data type");
+         break;
+      }
+
+   // Stage 2: expand byte & word elements so the low lanes hold the replicated value.
+   // Int8 first unpacks targetReg with itself (each low byte is paired with
+   // itself, so word 0 holds the byte twice) and then falls through to the
+   // Int16 case, which copies word 0 into the four low words (PSHUFLW 0x0).
+   // NOTE(review): the Int8 -> Int16 fall-through has no break and looks
+   // intentional -- confirm.
+   switch (dt)
+      {
+      case TR::Int8:
+         generateRegRegInstruction(TR::InstOpCode::PUNPCKLBWRegReg, node, targetReg, targetReg, cg);
+      case TR::Int16:
+         generateRegRegImmInstruction(TR::InstOpCode::PSHUFLWRegRegImm1, node, targetReg, targetReg, 0x0, cg);
+      default:
+         break;
+      }
+
+   // Stage 3: replicate the low lane across the requested vector width.
+   switch (vl)
+      {
+      case TR::VectorLength128:
+         // PSHUFD immediate 0x44 selects doublewords (0,1,0,1), i.e. repeats the
+         // low quadword; immediate 0 repeats doubleword 0 four times.
+         generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, targetReg, targetReg, broadcast64 ? 0x44 : 0, cg);
+         break;
+      case TR::VectorLength256:
+         {
+         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "256-bit vsplats requires AVX2");
+         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDYmmYmm : TR::InstOpCode::VBROADCASTSSRegReg;
+         generateRegRegInstruction(opcode.getMnemonic(), node, targetReg, targetReg, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength256));
+         break;
+         }
+      case TR::VectorLength512:
+         {
+         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F), "512-bit vsplats requires AVX-512");
+         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDZmmXmm : TR::InstOpCode::VBROADCASTSSRegReg;
+         // Unlike the 256-bit case, the encoding is fixed to EVEX_L512 rather than
+         // queried through opcode.getSIMDEncoding().
+         generateRegRegInstruction(opcode.getMnemonic(), node, targetReg, targetReg, cg, OMR::X86::EVEX_L512);
+         break;
+         }
+      default:
+         TR_ASSERT_FATAL(0, "Unsupported vector length");
+         break;
+      }
+
+   return targetReg;
+   }
+
+// Loads the constant `value` into a scalar register with loadConstant(),
+// broadcasts it into a newly allocated vector register via broadcast(), and
+// releases the scalar register. The element type dt selects the
+// TR_RematerializableTypes value passed to loadConstant().
+//
+// NOTE(review): Float and Double are routed through loadConstant() with an
+// intptr_t value, and broadcast() then uses srcReg as the source of a MOVSD
+// register move -- confirm that loadConstant() produces a register suitable
+// for that instruction.
+// NOTE(review): `value` is an intptr_t, and broadcast()'s 32-bit Int64 path
+// reads srcReg->getHighOrder()/getLowOrder() -- presumably Int64 is only
+// expected on 64-bit targets; verify against callers.
+TR::Register *broadcastConst(TR::Node *node,
+                             TR::CodeGenerator *cg,
+                             TR::VectorLength vl,
+                             TR::DataType dt,
+                             intptr_t value)
+   {
+   TR_RematerializableTypes reType;
+
+   switch (dt)
+      {
+      case TR::Int8:
+         reType = TR_RematerializableByte;
+         break;
+      case TR::Int16:
+         reType = TR_RematerializableShort;
+         break;
+      case TR::Int32:
+         reType = TR_RematerializableInt;
+         break;
+      case TR::Int64:
+         reType = TR_RematerializableLong;
+         break;
+      case TR::Float:
+         reType = TR_RematerializableFloat;
+         break;
+      case TR::Double:
+         reType = TR_RematerializableDouble;
+         break;
+      default:
+         // reType is assigned in every other case; this path ends in a fatal assert.
+         TR_ASSERT_FATAL(false, "Unexpected data type");
+         break;
+      }
+
+   TR::Register *srcReg = loadConstant(node, value, reType, cg);
+   // NULL target: broadcast() allocates the result vector register itself.
+   TR::Register *resultReg = broadcast(node, cg, vl, dt, NULL, srcReg);
+
+   // The scalar copy is no longer needed once the vector has been populated.
+   cg->stopUsingRegister(srcReg);
+
+   return resultReg;
+   }
+
 } } diff --git a/compiler/x/codegen/CodegenUtils.hpp b/compiler/x/codegen/CodegenUtils.hpp index 95e79291d84..662076eece6 100644 --- a/compiler/x/codegen/CodegenUtils.hpp +++ b/compiler/x/codegen/CodegenUtils.hpp @@ -267,6 +267,114 @@ TR::VectorLength maxVectorLength(TR::CodeGenerator *cg, TR::InstOpCode *opcodes, return maxVectorLength(cg, opcodes, sizeof(opcodes) / sizeof(opcodes[0])); \ }() +/** + * @brief Broadcasts
the value in a source register across all elements of a vector register.
+ *
+ * @param node      IL node the generated instructions are attributed to
+ * @param cg        Code generator used to emit the instructions
+ * @param vl        Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param dt        Element type of the vector (e.g., Int8, Int32, Float)
+ * @param targetReg Destination vector register, or null to have one allocated
+ * @param srcReg    Register holding the scalar to replicate
+ *
+ * @return The vector register that receives the replicated value
+ */
+TR::Register *broadcast(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, TR::DataType dt, TR::Register *targetReg, TR::Register *srcReg);
+
+/**
+ * @brief Replicates a constant into every element of a vector register.
+ *
+ * @param node  IL node the generated instructions are attributed to
+ * @param cg    Code generator used to emit the instructions
+ * @param vl    Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param dt    Element type of the vector (e.g., Int8, Int32, Float)
+ * @param value Constant to replicate
+ *
+ * @return The vector register that receives the replicated constant
+ */
+TR::Register *broadcastConst(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, TR::DataType dt, intptr_t value);
+
+/**
+ * @brief Replicates an 8-bit integer constant into every element of a vector register.
+ *
+ * Forwards to broadcastConst() with element type TR::Int8.
+ *
+ * @param node  IL node the generated instructions are attributed to
+ * @param cg    Code generator used to emit the instructions
+ * @param vl    Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param value 8-bit integer constant to replicate
+ *
+ * @return The vector register that receives the replicated constant
+ */
+inline TR::Register *broadcastBConst(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, intptr_t value)
+   {
+   TR::Register *splatReg = broadcastConst(node, cg, vl, TR::Int8, value);
+   return splatReg;
+   }
+
+/**
+ * @brief Replicates a 16-bit integer constant into every element of a vector register.
+ *
+ * Forwards to broadcastConst() with element type TR::Int16.
+ *
+ * @param node  IL node the generated instructions are attributed to
+ * @param cg    Code generator used to emit the instructions
+ * @param vl    Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param value 16-bit integer constant to replicate
+ *
+ * @return The vector register that receives the replicated constant
+ */
+inline TR::Register *broadcastSConst(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, intptr_t value)
+   {
+   TR::Register *splatReg = broadcastConst(node, cg, vl, TR::Int16, value);
+   return splatReg;
+   }
+
+/**
+ * @brief Replicates a 32-bit integer constant into every element of a vector register.
+ *
+ * Forwards to broadcastConst() with element type TR::Int32.
+ *
+ * @param node  IL node the generated instructions are attributed to
+ * @param cg    Code generator used to emit the instructions
+ * @param vl    Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param value 32-bit integer constant to replicate
+ *
+ * @return The vector register that receives the replicated constant
+ */
+inline TR::Register *broadcastIConst(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, intptr_t value)
+   {
+   TR::Register *splatReg = broadcastConst(node, cg, vl, TR::Int32, value);
+   return splatReg;
+   }
+
+/**
+ * @brief Replicates a 64-bit integer constant into every element of a vector register.
+ *
+ * Forwards to broadcastConst() with element type TR::Int64. The constant is
+ * passed as an intptr_t.
+ *
+ * @param node  IL node the generated instructions are attributed to
+ * @param cg    Code generator used to emit the instructions
+ * @param vl    Width of the destination vector (e.g., 128-bit, 256-bit)
+ * @param value 64-bit integer constant to replicate
+ *
+ * @return The vector register that receives the replicated constant
+ */
+inline TR::Register *broadcastLConst(TR::Node *node, TR::CodeGenerator *cg, TR::VectorLength vl, intptr_t value)
+   {
+   TR::Register *splatReg = broadcastConst(node, cg, vl, TR::Int64, value);
+   return splatReg;
+   }
+
 } } diff --git a/compiler/x/codegen/SIMDTreeEvaluator.cpp b/compiler/x/codegen/SIMDTreeEvaluator.cpp index 9bf741b467d..35a7266098b 100644 --- a/compiler/x/codegen/SIMDTreeEvaluator.cpp +++ b/compiler/x/codegen/SIMDTreeEvaluator.cpp @@ -29,6 +29,7 @@ #include "il/Node_inlines.hpp" #include "infra/Assert.hpp" #include "x/codegen/X86Instruction.hpp" +#include "x/codegen/CodegenUtils.hpp" #include "codegen/InstOpCode.hpp" namespace TR { class Instruction; } @@ -234,75 +235,8 @@ TR::Register* OMR::X86::TreeEvaluator::SIMDsplatsEvaluator(TR::Node* node, TR::C TR::DataType et = node->getDataType().getVectorElementType(); TR::VectorLength vl = node->getDataType().getVectorLength(); TR::Register* resultReg = cg->allocateRegister(TR_VRF); - bool broadcast64 = et.isInt64() || et.isDouble(); - switch (et) - { - case TR::Int8: - case TR::Int16: - case TR::Int32: - generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, resultReg, childReg, cg); - break; - case TR::Int64: - if (cg->comp()->target().is32Bit()) - { - TR::Register* tempVectorReg = cg->allocateRegister(TR_VRF); - generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, tempVectorReg, childReg->getHighOrder(), cg); - generateRegImmInstruction(TR::InstOpCode::PSLLQRegImm1, node, tempVectorReg, 0x20, cg); - generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, resultReg, childReg->getLowOrder(), cg); - generateRegRegInstruction(TR::InstOpCode::PORRegReg, node, resultReg,
tempVectorReg, cg); - cg->stopUsingRegister(tempVectorReg); - } - else - { - generateRegRegInstruction(TR::InstOpCode::MOVQRegReg8, node, resultReg, childReg, cg); - } - break; - case TR::Float: - case TR::Double: - generateRegRegInstruction(TR::InstOpCode::MOVSDRegReg, node, resultReg, childReg, cg); - break; - default: - if (cg->comp()->getOption(TR_TraceCG)) - traceMsg(cg->comp(), "Unsupported data type, Node = %p\n", node); - TR_ASSERT_FATAL(false, "Unsupported data type"); - break; - } - - // Expand byte & word to 32-bits - switch (et) - { - case TR::Int8: - generateRegRegInstruction(TR::InstOpCode::PUNPCKLBWRegReg, node, resultReg, resultReg, cg); - case TR::Int16: - generateRegRegImmInstruction(TR::InstOpCode::PSHUFLWRegRegImm1, node, resultReg, resultReg, 0x0, cg); - default: - break; - } - - switch (vl) - { - case TR::VectorLength128: - generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, resultReg, resultReg, broadcast64 ? 0x44 : 0, cg); - break; - case TR::VectorLength256: - { - TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "256-bit vsplats requires AVX2"); - TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDYmmYmm : TR::InstOpCode::VBROADCASTSSRegReg; - generateRegRegInstruction(opcode.getMnemonic(), node, resultReg, resultReg, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength256)); - break; - } - case TR::VectorLength512: - { - TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F), "512-bit vsplats requires AVX-512"); - TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDZmmXmm : TR::InstOpCode::VBROADCASTSSRegReg; - generateRegRegInstruction(opcode.getMnemonic(), node, resultReg, resultReg, cg, OMR::X86::EVEX_L512); - break; - } - default: - TR_ASSERT_FATAL(0, "Unsupported vector length"); - break; - } + OMR::X86::broadcast(node, cg, vl, et, resultReg, childReg); node->setRegister(resultReg); cg->decReferenceCount(childNode);