From dd6acfb01ac94be4115b1d17c5e970396111975d Mon Sep 17 00:00:00 2001
From: Shao-Ce SUN <sunshaoce@iscas.ac.cn>
Date: Thu, 15 Jun 2023 23:32:36 +0800
Subject: [PATCH] Use a3 for widening multiply in ch6.4

This change makes the assembly code clearer and more consistent: x10 (a0) already holds the total number of elements to process, so it should not also be used as the multiplier, whereas a3 is an argument register in the function calling convention and is suitable for passing the multiplier. The vl value that vsetvli previously wrote to a3 is now written to the temporary register t0 instead. This way, the programmer can easily understand the source and purpose of the multiplier for the widening multiplication.
---
 v-spec.adoc | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/v-spec.adoc b/v-spec.adoc
index d9baec2..21d205d 100644
--- a/v-spec.adoc
+++ b/v-spec.adoc
@@ -1331,21 +1331,22 @@ throughput on mixed-width operations in a single loop.
 #  a0 holds the total number of elements to process
 #  a1 holds the address of the source array
 #  a2 holds the address of the destination array
+#  a3 holds the scalar multiplier for the widening multiplication
 
 loop:
-    vsetvli a3, a0, e16, m4, ta, ma  # vtype = 16-bit integer vectors;
-                                     # also update a3 with vl (# of elements this iteration)
+    vsetvli t0, a0, e16, m4, ta, ma  # vtype = 16-bit integer vectors;
+                                     # also update t0 with vl (# of elements this iteration)
     vle16.v v4, (a1)        # Get 16b vector
-    slli t1, a3, 1          # Multiply # elements this iteration by 2 bytes/source element
+    slli t1, t0, 1          # Multiply # elements this iteration by 2 bytes/source element
     add a1, a1, t1          # Bump pointer
-    vwmul.vx v8, v4, x10    # Widening multiply into 32b in <v8--v15>
+    vwmul.vx v8, v4, a3     # Widening multiply by scalar a3 into 32b in <v8--v15>
 
     vsetvli x0, x0, e32, m8, ta, ma  # Operate on 32b values
     vsrl.vi v8, v8, 3
     vse32.v v8, (a2)        # Store vector of 32b elements
-    slli t1, a3, 2          # Multiply # elements this iteration by 4 bytes/destination element
+    slli t1, t0, 2          # Multiply # elements this iteration by 4 bytes/destination element
     add a2, a2, t1          # Bump pointer
-    sub a0, a0, a3          # Decrement count by vl
+    sub a0, a0, t0          # Decrement count by vl
     bnez a0, loop           # Any more?
 ----