@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
12
12
;
13
13
; CHECK-NODOT-LABEL: udot:
14
14
; CHECK-NODOT: // %bb.0:
15
- ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16
- ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17
- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18
- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19
- ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20
- ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21
- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
15
+ ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16
+ ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17
+ ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18
+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19
+ ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20
+ ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21
+ ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22
+ ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23
+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
22
24
; CHECK-NODOT-NEXT: ret
23
25
%u.wide = zext <16 x i8 > %u to <16 x i32 >
24
26
%s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -95,17 +97,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
95
97
;
96
98
; CHECK-NODOT-LABEL: udot_narrow:
97
99
; CHECK-NODOT: // %bb.0:
98
- ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
100
+ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
101
+ ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
99
102
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
100
- ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
101
- ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
102
- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
103
- ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
103
+ ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
104
+ ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
105
+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
106
+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
107
+ ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
104
108
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
105
- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
106
- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
107
- ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
109
+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
110
+ ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
108
111
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
112
+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
109
113
; CHECK-NODOT-NEXT: ret
110
114
%u.wide = zext <8 x i8 > %u to <8 x i32 >
111
115
%s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -122,13 +126,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
122
126
;
123
127
; CHECK-NODOT-LABEL: sdot:
124
128
; CHECK-NODOT: // %bb.0:
125
- ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
126
- ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
127
- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
128
- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
129
- ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
130
- ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
131
- ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
129
+ ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
130
+ ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
131
+ ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
132
+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
133
+ ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
134
+ ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
135
+ ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
136
+ ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
137
+ ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
132
138
; CHECK-NODOT-NEXT: ret
133
139
%u.wide = sext <16 x i8 > %u to <16 x i32 >
134
140
%s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -145,17 +151,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
145
151
;
146
152
; CHECK-NODOT-LABEL: sdot_narrow:
147
153
; CHECK-NODOT: // %bb.0:
148
- ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
154
+ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
155
+ ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
149
156
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
150
- ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
151
- ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
152
- ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
153
- ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
157
+ ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
158
+ ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
159
+ ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
160
+ ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
161
+ ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
154
162
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
155
- ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
156
- ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
157
- ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
163
+ ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
164
+ ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
158
165
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
166
+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
159
167
; CHECK-NODOT-NEXT: ret
160
168
%u.wide = sext <8 x i8 > %u to <8 x i32 >
161
169
%s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -407,19 +415,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
407
415
;
408
416
; CHECK-NODOT-LABEL: udot_8to64:
409
417
; CHECK-NODOT: // %bb.0: // %entry
410
- ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
411
- ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
412
- ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
413
- ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
418
+ ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
419
+ ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
420
+ ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
421
+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
422
+ ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
423
+ ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
414
424
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
415
- ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
416
- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
417
- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
418
- ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
419
- ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
420
- ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
421
- ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
422
- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
425
+ ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
426
+ ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
427
+ ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
428
+ ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
429
+ ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
430
+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
431
+ ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
432
+ ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
433
+ ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
434
+ ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
435
+ ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
436
+ ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
437
+ ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
438
+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
423
439
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
424
440
; CHECK-NODOT-NEXT: ret
425
441
entry:
@@ -442,19 +458,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
442
458
;
443
459
; CHECK-NODOT-LABEL: sdot_8to64:
444
460
; CHECK-NODOT: // %bb.0: // %entry
445
- ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
446
- ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
447
- ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
448
- ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
461
+ ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
462
+ ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
463
+ ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
464
+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
465
+ ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
466
+ ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
449
467
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
450
- ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
451
- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
452
- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
453
- ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
454
- ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
455
- ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
456
- ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
457
- ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
468
+ ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
469
+ ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
470
+ ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
471
+ ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
472
+ ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
473
+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
474
+ ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
475
+ ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
476
+ ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
477
+ ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
478
+ ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
479
+ ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
480
+ ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
481
+ ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
458
482
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
459
483
; CHECK-NODOT-NEXT: ret
460
484
entry:
@@ -771,9 +795,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
771
795
define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
772
796
; CHECK-LABEL: not_udot:
773
797
; CHECK: // %bb.0:
774
- ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
775
- ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
776
- ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
798
+ ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
799
+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
800
+ ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
801
+ ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
777
802
; CHECK-NEXT: ret
778
803
%u.wide = zext <8 x i8 > %u to <8 x i32 >
779
804
%s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments