From dd6acfb01ac94be4115b1d17c5e970396111975d Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Thu, 15 Jun 2023 23:32:36 +0800 Subject: [PATCH] Use a3 for widening multiply in ch6.4 This change makes the assembly code clearer and more consistent, as x10 (a0) is already used to hold the total number of elements to process, while a3 is an argument register in the function calling convention that is suitable for the multiplier. This way, the programmer can easily understand the source and purpose of the multiplier for the widening multiplication. --- v-spec.adoc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/v-spec.adoc b/v-spec.adoc index d9baec2..21d205d 100644 --- a/v-spec.adoc +++ b/v-spec.adoc @@ -1331,21 +1331,22 @@ throughput on mixed-width operations in a single loop. # a0 holds the total number of elements to process # a1 holds the address of the source array # a2 holds the address of the destination array +# a3 holds the multiplier for the widening multiplication loop: - vsetvli a3, a0, e16, m4, ta, ma # vtype = 16-bit integer vectors; - # also update a3 with vl (# of elements this iteration) + vsetvli t0, a0, e16, m4, ta, ma # vtype = 16-bit integer vectors; + # also update t0 with vl (# of elements this iteration) vle16.v v4, (a1) # Get 16b vector - slli t1, a3, 1 # Multiply # elements this iteration by 2 bytes/source element + slli t1, t0, 1 # Multiply # elements this iteration by 2 bytes/source element add a1, a1, t1 # Bump pointer - vwmul.vx v8, v4, x10 # Widening multiply into 32b in <v8--v15> + vwmul.vx v8, v4, a3 # Widening multiply into 32b in <v8--v15> vsetvli x0, x0, e32, m8, ta, ma # Operate on 32b values vsrl.vi v8, v8, 3 vse32.v v8, (a2) # Store vector of 32b elements - slli t1, a3, 2 # Multiply # elements this iteration by 4 bytes/destination element + slli t1, t0, 2 # Multiply # elements this iteration by 4 bytes/destination element add a2, a2, t1 # Bump pointer - sub a0, a0, a3 # Decrement
count by vl bnez a0, loop # Any more? ----