// Text file: src/math/big/arith_s390x.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
11
// func addVV(z, x, y []Word) (c Word)
//
// addVV is only a dispatcher: it loads the implementation address currently
// stored in addvectorfacility and branches to it. The target runs on the
// caller's argument frame and returns directly to the caller.
//
// Register note kept from the original (left-hand names are x86 register
// names; NOTE(review): presumably a mapping from the amd64 version — confirm):
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1 // R1 = address of the selected implementation
	BR   (R1)                           // branch (not call) to it
18
// addVV_check is the initial target stored in addvectorfacility. It reads
// ·hasVX, stores the address of ·addVV_vec (hasVX == 1) or ·addVV_novec
// (otherwise) into addvectorfacility, and then branches to that routine.
// After it has run, addVV dispatches straight to the stored routine.
TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)                       // addvectorfacility = &addVV_novec

	// One-instruction form of the store above, left disabled in the original:
	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)                       // addvectorfacility = &addVV_vec

	// One-instruction form of the store above, left disabled in the original:
	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

// addvectorfacility is an 8-byte slot holding the address that addVV
// branches to. It starts out pointing at ·addVV_check.
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
39
// addVV_vec implements
//	func addVV(z, x, y []Word) (c Word)
// using vector instructions: z[i] = x[i] + y[i] + carry for i in
// [0, len(z)), returning the final carry (0 or 1).
//
// Three loops are used, selected by the remaining word count n:
//	UU1: 16 words per iteration, 128-bit add-with-carry (VACQ/VACCCQ)
//	U1:   4 words per iteration, ADDC/ADDE
//	L1:   1 word per iteration,  ADDC/ADDE
//
// Register use:
//	R0          constant zero
//	R2, R8, R9  base pointers of z, x, y
//	R3          n, biased by the SUB/ADD adjustments noted below
//	R4          carry saved between iterations as 0 or -1, so that
//	            ADDC R4, R4 regenerates the carry flag
//	R10         byte offset i*8 used by the scalar loops
//	R5, R6, R7  x, y, z cursors in the vector loop; data words in the scalar loops
//	R1, R11     data words in the scalar loops
//	V0          carry between vector iterations
//	V25-V31     carry chain within one vector iteration
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// Change the BLT instructions below to BR to disable the unrolled loops.
	SUB $4, R3  // n -= 4
	BLT v1      // fewer than 4 words: single-word loop only
	SUB $12, R3 // n -= 12 (n is now len(z) - 16)
	BLT A1      // fewer than 16 words: skip the vector loop

	MOVD R8, R5 // x cursor
	MOVD R9, R6 // y cursor
	MOVD R2, R7 // z cursor

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM  0(R5), V1, V4    // 64 bytes (8 words of x) into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64 bytes (8 words of y) into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	// Each pair: VACCCQ produces the carry out, VACQ the 128-bit sum.
	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes (4 words of x) into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes (4 words of y) into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes (4 words of x) into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes (4 words of y) into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0  // V0 has carry-over
	VACQ   V8, V16, V31, V24

	// Flip the sums back to memory order and store them.
	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	NEG   R4, R4              // save cf (as 0 or -1)

A1:
	ADD $12, R3 // n += 12 (n is now remaining words - 4)

	// Change the BLT below to BR to disable the 4x unrolled loop.
	BLT v1 // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4             // keep it as 0 or -1
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4            // keep it as 0 or -1

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // 0 or -1 -> 0 or 1
	MOVD R4, c+72(FP) // return c
	RET
191
// addVV_novec implements
//	func addVV(z, x, y []Word) (c Word)
// without vector instructions: z[i] = x[i] + y[i] + carry for i in
// [0, len(z)), returning the final carry (0 or 1).
//
// Register use:
//	R0          constant zero
//	R2, R8, R9  base pointers of z, x, y
//	R3          n, biased by -4 while the unrolled loop runs
//	R4          carry saved between iterations as 0 or -1, so that
//	            ADDC R4, R4 regenerates the carry flag
//	R10         byte offset i*8
//	R1, R5, R6, R7, R11  data words
TEXT ·addVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// Change the BLT below to BR to disable the unrolled loop.
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4             // keep it as 0 or -1
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n: // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4            // keep it as 0 or -1

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4       // 0 or -1 -> 0 or 1
	MOVD R4, c+72(FP) // return c
	RET
256
// func subVV(z, x, y []Word) (c Word)
//
// subVV is only a dispatcher: it loads the implementation address currently
// stored in subvectorfacility and branches to it. The target runs on the
// caller's argument frame and returns directly to the caller.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1 // R1 = address of the selected implementation
	BR   (R1)                           // branch (not call) to it
260
// subVV_check is the initial target stored in subvectorfacility. It reads
// ·hasVX, stores the address of ·subVV_vec (hasVX == 1) or ·subVV_novec
// (otherwise) into subvectorfacility, and then branches to that routine.
// After it has run, subVV dispatches straight to the stored routine.
TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)                       // subvectorfacility = &subVV_novec

	// One-instruction form of the store above, left disabled in the original:
	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)                       // subvectorfacility = &subVV_vec

	// One-instruction form of the store above, left disabled in the original:
	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

// subvectorfacility is an 8-byte slot holding the address that subVV
// branches to. It starts out pointing at ·subVV_check.
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
281
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_vec is the vector implementation: z[i] = x[i] - y[i] - borrow for i
// in [0, len(z)), returning the final borrow (0 or 1).
//
// Three loops are used, selected by the remaining word count n:
//	UU1: 16 words per iteration, 128-bit subtract-with-borrow (VSBIQ/VSBCBIQ)
//	U1:   4 words per iteration, SUBC/SUBE
//	L1:   1 word per iteration,  SUBC/SUBE
//
// Register use:
//	R0          constant zero
//	R2, R8, R9  base pointers of z, x, y
//	R3          n, biased by the SUB/ADD adjustments noted below
//	R4          borrow saved between iterations as 0 (none) or -1 (borrow),
//	            so that computing 0 - R4 with SUBC regenerates the flag
//	R10         byte offset i*8 used by the scalar loops
//	R5, R6, R7  x, y, z cursors in the vector loop; data words in the scalar loops
//	R1, R11     data words / scratch in the scalar loops
//	V0          carry between vector iterations (1 means no borrow)
//	V25-V31     carry chain within one vector iteration
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4          // c = 0
	MOVD $0, R0          // make sure it's zero
	MOVD $0, R10         // i = 0

	// Change the BLT instructions below to BR to disable the unrolled loops.
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 (n is now len(z) - 16)
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5 // x cursor
	MOVD R9, R6 // y cursor
	MOVD R2, R7 // z cursor

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM  0(R5), V1, V4    // 64 bytes (8 words of x) into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64 bytes (8 words of y) into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	// Each pair: VSBCBIQ produces the borrow indication, VSBIQ the 128-bit difference.
	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes (4 words of x) into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes (4 words of y) into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes (4 words of x) into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes (4 words of y) into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	// Flip the differences back to memory order and store them.
	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	SUB   $1, R4              // save cf (cf - 1: 0 = no borrow, -1 = borrow)

A1:
	ADD $12, R3 // n += 12 (n is now remaining words - 4)
	BLT v1      // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // 0 or -1 -> 0 or 1
	MOVD R4, c+72(FP) // return c
	RET
435
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_novec is the implementation without vector instructions:
// z[i] = x[i] - y[i] - borrow for i in [0, len(z)), returning the final
// borrow (0 or 1).
//
// Register use:
//	R0          constant zero
//	R2, R8, R9  base pointers of z, x, y
//	R3          n, biased by -4 while the unrolled loop runs
//	R4          borrow saved between iterations as 0 (none) or -1 (borrow),
//	            so that computing 0 - R4 with SUBC regenerates the flag
//	R10         byte offset i*8
//	R1, R5, R6, R7, R11  data words / scratch
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// Change the BLT below to BR to disable the unrolled loop.
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // 0 or -1 -> 0 or 1
	MOVD R4, c+72(FP) // return c
	RET
502
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to x over len(z) words, writing the result to z,
// and returns the carry out. If len(z) == 0 it returns y unchanged.
//
// The first two words are added explicitly. After that, while the carry is
// still set, each further word is x[i] + 1; as soon as the carry clears, the
// remaining words of x are copied to z with MVC and 0 is returned.
//
// Register use:
//	R5   words remaining
//	R6   x pointer
//	R7   y, then z[0]
//	R8   z pointer
//	R9   scratch word
//	R12  byte offset into x and z
//
// The carry produced by ADDC is consumed later by ADDE and BRC, so the
// MOVD, CMPBEQ and BRCTG instructions in between are relied on to leave the
// condition code untouched.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e: ADDE could be used here to do the addition).  However, since we
	// already know carry is 1 (otherwise we will go to copy section), we can use
	// ADDC here so the current iteration does not depend on the carry flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0       // R0 = 0 + 0 + carry
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time.  Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP) // len(z) == 0: the carry out is y itself
	RET
592
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from x over len(z) words, writing the result
// to z, and returns the borrow out. If len(z) == 0 it returns y unchanged.
//
// The first two words are subtracted explicitly. After that, while a borrow
// is still pending, each further word is x[i] - 1; as soon as the borrow
// clears, the remaining words of x are copied to z with MVC and 0 is returned.
//
// Register use:
//	R0   zero (temporary; overwritten only in returnResult)
//	R5   words remaining
//	R6   x pointer
//	R7   y
//	R8   z pointer
//	R9   scratch word
//	R12  byte offset into x and z
//
// The borrow produced by SUBC is consumed later by SUBE and BRC, so the
// MOVD, CMPBEQ and BRCTG instructions in between are relied on to leave the
// condition code untouched.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9               // R9 = x[0] - y
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9               // R9 = x[1] - 0 - borrow
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup    // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e: SUBE could be used here to do the subtraction). However, since we
	// already know borrow is 1 (otherwise we will go to copy section), we can
	// use SUBC here so the current iteration does not depend on the borrow flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// return the current borrow value
returnResult:
	SUBE R0, R0       // R0 = 0 - 0 - borrow (0 or -1)
	NEG  R0, R0       // 0 or -1 -> 0 or 1
	MOVD R0, c+56(FP)
	RET

// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP) // len(z) == 0: the borrow out is y itself
	RET
684
// func shlVU(z, x []Word, s uint) (c Word)
//
// No s390x-specific code: branches straight to ·shlVU_g, which runs on the
// caller's argument frame and returns to the caller.
// NOTE(review): ·shlVU_g is defined outside this file, presumably the
// portable Go implementation — confirm.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)
688
// func shrVU(z, x []Word, s uint) (c Word)
//
// No s390x-specific code: branches straight to ·shrVU_g, which runs on the
// caller's argument frame and returns to the caller.
// NOTE(review): ·shrVU_g is defined outside this file, presumably the
// portable Go implementation — confirm.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)
692
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each i in [0, len(z)): (c, z[i]) = x[i]*y + c, starting with c = r.
// Returns the final c.
//
// Register use:
//	R0   constant zero
//	R1   byte offset i*8
//	R2   z pointer
//	R4   running carry word c
//	R5   len(z)
//	R6   x[i], then the high word of the product
//	R7   word index i
//	R8   x pointer
//	R9   y
//	R11  low word of the product
//
// NOTE(review): R11 is never written explicitly in this routine. The code
// relies on MULHDU leaving the low 64 bits of the product in R11 (the
// "+ use R11" in the register note above). Confirm against the assembler's
// expansion of MULHDU before touching R11 or reordering around it.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4    // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1          // i*8 = 0
	MOVD $0, R7          // i = 0
	MOVD $0, R0          // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6  // R6 = x[i]
	MULHDU R9, R6          // R6 = high word of x[i]*y (low word expected in R11, see note)
	ADDC   R4, R11         // add to low order bits
	ADDE   R0, R6          // propagate the carry into the high word
	MOVD   R11, (R2)(R1*1) // z[i] = low word
	MOVD   R6, R4          // c = high word
	ADD    $8, R1          // i*8 + 8
	ADD    $1, R7          // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET
721
// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
//
// For each i in [0, len(z)): (c, z[i]) = x[i]*y + z[i] + c, starting with
// c = 0. Returns the final c.
//
// The A6 loop handles two words per iteration up to the even bound
// len(z) &^ 1; the L6 loop handles the remaining word, if any.
//
// Register use:
//	R0   constant zero
//	R1   byte offset i*8
//	R2   z pointer
//	R4   running carry word c
//	R5   len(z)
//	R6   x[i], then the high word of the product
//	R7   word index i
//	R8   x pointer
//	R9   y
//	R10  z[i] as loaded
//	R11  low word of the product
//	R12  len(z) rounded down to an even count
//
// NOTE(review): R11 is never written explicitly in this routine. The code
// relies on MULHDU leaving the low 64 bits of the product in R11 (the
// "+ use R11" in the register note above). Confirm against the assembler's
// expansion of MULHDU before touching R11 or reordering around it.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12    // R12 = n rounded down to an even count
	CMPBGE R5, $2, A6
	BR     E6

A6:
	// Word i.
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11         // add the incoming carry word
	ADDE   R0, R6
	MOVD   R6, R4          // c = high word
	MOVD   R11, (R2)(R1*1)

	// Word i+1.
	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11           // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11            // add the incoming carry word
	ADDE   R0, R6
	MOVD   R6, R4             // c = high word
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 + 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11         // add the incoming carry word
	ADDE   R0, R6
	MOVD   R6, R4          // c = high word
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
786
// (View as plain text)