JIT: Improve block unrolling, enable AVX-512 #85501

Merged: 11 commits, May 3, 2023
Changes from 9 commits
75 changes: 23 additions & 52 deletions src/coreclr/jit/codegenxarch.cpp
@@ -3134,32 +3134,17 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
// INITBLK zeroes a struct that contains GC pointers and can be observed by
// other threads (i.e. when dstAddr is not an address of a local).
// For example, this can happen when initializing a struct field of an object.
const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();

#ifdef TARGET_AMD64
// On Amd64 the JIT will not use SIMD stores for such structs and instead
// will always allocate a GP register for src node.
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
#else
// On X86 the JIT will use movq for structs that are larger than 16 bytes
// since it is more beneficial than using two mov-s from a GP register.
const bool willUseSimdMov = (size >= 16);
#endif
const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);

if (!src->isContained())
{
srcIntReg = genConsumeReg(src);
}
else
{
// If src is contained then it must be 0.
assert(src->IsIntegralConst(0));
assert(willUseSimdMov);
#ifdef TARGET_AMD64
assert(size >= XMM_REGSIZE_BYTES);
#else
assert(size % 8 == 0);
#endif
}

emitter* emit = GetEmitter();
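
(Illustrative note, not part of the diff.) The unified predicate above folds the old AMD64/x86 split into a single check. A minimal standalone model of that decision, with plain booleans standing in for the JIT's IsOnHeapAndContainsReferences() and IsBaselineSimdIsaSupported() queries and 16 hard-coded for XMM_REGSIZE_BYTES:

#include <cstdio>

// Sketch of the SIMD-eligibility check for an unrolled initblk (names are stand-ins).
static bool WillUseSimdMov(bool onHeapWithGcRefs, bool baselineSimdSupported, unsigned size)
{
    const unsigned kXmmRegSizeBytes = 16;

    // Per the comment above, SIMD stores are avoided when the struct contains GC pointers
    // and can be observed by other threads (i.e. the destination is on the heap).
    const bool canUse16BytesSimdMov = !onHeapWithGcRefs && baselineSimdSupported;

    // Only profitable once at least one full XMM-sized store fits.
    return canUse16BytesSimdMov && (size >= kXmmRegSizeBytes);
}

int main()
{
    printf("%d\n", WillUseSimdMov(false, true, 40)); // 1: local struct, SIMD unroll
    printf("%d\n", WillUseSimdMov(true, true, 40));  // 0: heap object with GC refs, GP register stores
    return 0;
}
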
@@ -3178,14 +3163,24 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
regSize = min(regSize, YMM_REGSIZE_BYTES);
}

bool zeroing = false;
if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
{
// If the source is constant 0 then always use xorps; it's faster
// than copying the constant from a GPR to an XMM register.
emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg);
zeroing = true;
}
#ifdef FEATURE_SIMD
else if (src->gtSkipReloadOrCopy()->IsIntegralConst())
{
// Populate a constant vector from the fill value and save it
// to the data section so we can load it by address.
assert(regSize <= ZMM_REGSIZE_BYTES);
simd64_t constValue;
memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t));
var_types loadType = compiler->getSIMDTypeForSize(regSize);
genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue));
}
#endif
else
{
// TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
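
(Illustrative note, not part of the diff.) For a non-zero constant fill, the path above broadcasts the low byte of the value across a full-width vector constant and loads it by address from the data section. A minimal host-side sketch of that broadcast, with a plain 64-byte buffer standing in for simd64_t and 0xAB as an arbitrary fill byte:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    const uint8_t fill = 0xAB;  // plays the role of src->AsIntCon()->IconValue() & 0xFF
    uint8_t pattern[64];        // plays the role of simd64_t
    memset(pattern, fill, sizeof(pattern));

    // Codegen materializes this constant into srcXmmReg (genSetRegToConst above), so each
    // XMM/YMM/ZMM store then writes 16/32/64 copies of the fill byte at once.
    printf("%02x %02x ... %02x\n", (unsigned)pattern[0], (unsigned)pattern[1], (unsigned)pattern[63]);
    return 0;
}
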
@@ -3236,53 +3231,29 @@

while (bytesWritten < size)
{
#ifdef TARGET_X86
if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
{
simdMov = INS_movq;
regSize = 8;
}
#endif
if (bytesWritten + regSize > size)
{
// We have a remainder that is smaller than regSize.
break;
}

emitSimdMovs();
dstOffset += regSize;
bytesWritten += regSize;

if (!zeroing)
{
assert(regSize <= YMM_REGSIZE_BYTES);
}

if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}
}

size -= bytesWritten;

// Handle the remainder by overlapping with previously processed data (only for zeroing)
if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
// Handle the remainder by overlapping with previously processed data
if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
{
if (isPow2(size) && (size <= REGSIZE_BYTES))
{
// For sizes like 1,2,4 and 8 we delegate handling to normal stores
// because that will be a single instruction that is smaller than SIMD mov
}
else
{
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);

// Rewind dstOffset so we can fit a vector for the whole remainder
dstOffset -= (regSize - size);
emitSimdMovs();
size = 0;
}
// Rewind dstOffset so we can fit a vector for the whole remainder
dstOffset -= (regSize - size);
emitSimdMovs();
size = 0;
}
}
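
(Illustrative note, not part of the diff.) The remainder handling above avoids a scalar tail by rewinding the destination offset and re-covering already written bytes with one final vector store. A sketch of the same idea for a hypothetical 41-byte zeroing with 16-byte (XMM) stores, with memset standing in for the emitted SIMD stores:

#include <cstdio>
#include <cstring>

static void ZeroBlock41(unsigned char* dst)
{
    // Main loop: two full 16-byte stores cover bytes [0, 32); 9 bytes remain.
    memset(dst + 0,  0, 16);
    memset(dst + 16, 0, 16);

    // roundUpSIMDSize(9) rounds the remainder up to a 16-byte register and dstOffset is
    // rewound by (16 - 9), so one last store covers [25, 41). Bytes [25, 32) are written
    // twice, which is harmless for an init and cheaper than a scalar tail.
    memset(dst + 41 - 16, 0, 16);
}

int main()
{
    unsigned char buf[41];
    memset(buf, 0xFF, sizeof(buf));
    ZeroBlock41(buf);
    printf("%d %d\n", buf[0], buf[40]); // 0 0
    return 0;
}
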

50 changes: 21 additions & 29 deletions src/coreclr/jit/lowerxarch.cpp
@@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
}
else
{

// The fill value of an initblk is interpreted to hold a
// value of (unsigned int8) however a constant of any size
// may practically reside on the evaluation stack. So extract
@@ -337,38 +336,31 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)

ssize_t fill = src->AsIntCon()->IconValue() & 0xFF;

if (fill == 0)
const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences();
if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd))
{
if (size >= XMM_REGSIZE_BYTES)
{
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
#ifdef TARGET_AMD64
// It turns out we can't use SIMD so the default threshold is too big
goto TOO_BIG_TO_UNROLL;
}

bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0;
if (!willUseOnlySimdMov)
{
// If we have a remainder we still might only use SIMD to process it (via overlapping)
// unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8)
const unsigned remainder = size % XMM_REGSIZE_BYTES;
if (!isPow2(remainder) || (remainder > REGSIZE_BYTES))
{
willUseOnlySimdMov = true;
}
}
bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported();
if (willUseSimd)
{
// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value
// just to handle the remainder when we can do that with an overlapped SIMD load).
#ifdef FEATURE_SIMD
src->SetContained();
#else
const bool willUseOnlySimdMov = (size % 8 == 0);
#endif
if (willUseOnlySimdMov && canUse16BytesSimdMov)
{
src->SetContained();
}
else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset,
/*canUseSimd*/ canUse16BytesSimdMov))
{
// It turns out we can't use SIMD so the default threshold is too big
goto TOO_BIG_TO_UNROLL;
}
if (fill == 0)
{
// When we don't have FEATURE_SIMD we can only handle zeroing
src->SetContained();
}
#endif
}
else if (fill == 0)
{
// Leave as is - zero shouldn't be contained when we don't use SIMD.
}
#ifdef TARGET_AMD64
else if (size >= REGSIZE_BYTES)
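
(Illustrative note, not part of the diff.) A rough standalone model of the lowering decision shown above: truncate the constant to its low byte, bail out of unrolling past the size threshold, and contain the constant when the SIMD-only path will be used. The threshold and the SIMD-eligibility result are passed in as plain inputs here, and the scalar fallback cases truncated by the diff are omitted:

#include <cstdio>

// Sketch: what lowering does with an initblk's constant fill value (names are stand-ins).
static const char* LowerInitBlkFill(long long iconValue, unsigned size,
                                    bool willUseSimd, unsigned unrollThreshold)
{
    // Only the low 8 bits of the constant are meaningful as a fill value.
    const unsigned fill = (unsigned)(iconValue & 0xFF);

    // Blocks above the unroll threshold are not unrolled by this path.
    if (size > unrollThreshold)
        return "too big to unroll";

    // SIMD-only path: contain the constant so no GPR is occupied by the fill value;
    // codegen then uses xorps (zero) or a broadcast vector constant (non-zero).
    if (willUseSimd)
        return (fill == 0) ? "contain src; zero with xorps"
                           : "contain src; broadcast the fill byte into a vector constant";

    // Otherwise the fill value stays in a GPR (scalar handling not shown).
    return "keep the fill value in a GPR";
}

int main()
{
    // 0x1234AB truncates to the fill byte 0xAB.
    printf("%s\n", LowerInitBlkFill(0x1234AB, 64, /*willUseSimd*/ true, /*unrollThreshold*/ 256));
    return 0;
}
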