Text file
src/math/big/arith_arm64.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// TODO: Consider re-implementing using Advanced SIMD
13// once the assembler supports those instructions.
14
15// func addVV(z, x, y []Word) (c Word)
16TEXT ·addVV(SB),NOSPLIT,$0
17 MOVD z_len+8(FP), R0
18 MOVD x+24(FP), R8
19 MOVD y+48(FP), R9
20 MOVD z+0(FP), R10
21 ADDS $0, R0 // clear carry flag
22 TBZ $0, R0, two
23 MOVD.P 8(R8), R11
24 MOVD.P 8(R9), R15
25 ADCS R15, R11
26 MOVD.P R11, 8(R10)
27 SUB $1, R0
28two:
29 TBZ $1, R0, loop
30 LDP.P 16(R8), (R11, R12)
31 LDP.P 16(R9), (R15, R16)
32 ADCS R15, R11
33 ADCS R16, R12
34 STP.P (R11, R12), 16(R10)
35 SUB $2, R0
36loop:
37 CBZ R0, done // careful not to touch the carry flag
38 LDP.P 32(R8), (R11, R12)
39 LDP -16(R8), (R13, R14)
40 LDP.P 32(R9), (R15, R16)
41 LDP -16(R9), (R17, R19)
42 ADCS R15, R11
43 ADCS R16, R12
44 ADCS R17, R13
45 ADCS R19, R14
46 STP.P (R11, R12), 32(R10)
47 STP (R13, R14), -16(R10)
48 SUB $4, R0
49 B loop
50done:
51 CSET HS, R0 // extract carry flag
52 MOVD R0, c+72(FP)
53 RET
54
55
56// func subVV(z, x, y []Word) (c Word)
57TEXT ·subVV(SB),NOSPLIT,$0
58 MOVD z_len+8(FP), R0
59 MOVD x+24(FP), R8
60 MOVD y+48(FP), R9
61 MOVD z+0(FP), R10
62 CMP R0, R0 // set carry flag
63 TBZ $0, R0, two
64 MOVD.P 8(R8), R11
65 MOVD.P 8(R9), R15
66 SBCS R15, R11
67 MOVD.P R11, 8(R10)
68 SUB $1, R0
69two:
70 TBZ $1, R0, loop
71 LDP.P 16(R8), (R11, R12)
72 LDP.P 16(R9), (R15, R16)
73 SBCS R15, R11
74 SBCS R16, R12
75 STP.P (R11, R12), 16(R10)
76 SUB $2, R0
77loop:
78 CBZ R0, done // careful not to touch the carry flag
79 LDP.P 32(R8), (R11, R12)
80 LDP -16(R8), (R13, R14)
81 LDP.P 32(R9), (R15, R16)
82 LDP -16(R9), (R17, R19)
83 SBCS R15, R11
84 SBCS R16, R12
85 SBCS R17, R13
86 SBCS R19, R14
87 STP.P (R11, R12), 32(R10)
88 STP (R13, R14), -16(R10)
89 SUB $4, R0
90 B loop
91done:
92 CSET LO, R0 // extract carry flag
93 MOVD R0, c+72(FP)
94 RET
95
96#define vwOneOp(instr, op1) \
97 MOVD.P 8(R1), R4; \
98 instr op1, R4; \
99 MOVD.P R4, 8(R3);
100
101// handle the first 1~4 elements before starting iteration in addVW/subVW
102#define vwPreIter(instr1, instr2, counter, target) \
103 vwOneOp(instr1, R2); \
104 SUB $1, counter; \
105 CBZ counter, target; \
106 vwOneOp(instr2, $0); \
107 SUB $1, counter; \
108 CBZ counter, target; \
109 vwOneOp(instr2, $0); \
110 SUB $1, counter; \
111 CBZ counter, target; \
112 vwOneOp(instr2, $0);
113
114// do one iteration of add or sub in addVW/subVW
115#define vwOneIter(instr, counter, exit) \
116 CBZ counter, exit; \ // careful not to touch the carry flag
117 LDP.P 32(R1), (R4, R5); \
118 LDP -16(R1), (R6, R7); \
119 instr $0, R4, R8; \
120 instr $0, R5, R9; \
121 instr $0, R6, R10; \
122 instr $0, R7, R11; \
123 STP.P (R8, R9), 32(R3); \
124 STP (R10, R11), -16(R3); \
125 SUB $4, counter;
126
127// do one iteration of copy in addVW/subVW
128#define vwOneIterCopy(counter, exit) \
129 CBZ counter, exit; \
130 LDP.P 32(R1), (R4, R5); \
131 LDP -16(R1), (R6, R7); \
132 STP.P (R4, R5), 32(R3); \
133 STP (R6, R7), -16(R3); \
134 SUB $4, counter;
135
136// func addVW(z, x []Word, y Word) (c Word)
137// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
138// and switches to copy if we are done with carries. The copying is skipped as well
139// if 'x' and 'z' happen to share the same underlying storage.
140// The overhead of the checking and branching is visible when 'z' are small (~5%),
141// so set a threshold of 32, and remain the small-sized part entirely untouched.
142TEXT ·addVW(SB),NOSPLIT,$0
143 MOVD z+0(FP), R3
144 MOVD z_len+8(FP), R0
145 MOVD x+24(FP), R1
146 MOVD y+48(FP), R2
147 CMP $32, R0
148 BGE large // large-sized 'z' and 'x'
149 CBZ R0, len0 // the length of z is 0
150 MOVD.P 8(R1), R4
151 ADDS R2, R4 // z[0] = x[0] + y, set carry
152 MOVD.P R4, 8(R3)
153 SUB $1, R0
154 CBZ R0, len1 // the length of z is 1
155 TBZ $0, R0, two
156 MOVD.P 8(R1), R4 // do it once
157 ADCS $0, R4
158 MOVD.P R4, 8(R3)
159 SUB $1, R0
160two: // do it twice
161 TBZ $1, R0, loop
162 LDP.P 16(R1), (R4, R5)
163 ADCS $0, R4, R8 // c, z[i] = x[i] + c
164 ADCS $0, R5, R9
165 STP.P (R8, R9), 16(R3)
166 SUB $2, R0
167loop: // do four times per round
168 vwOneIter(ADCS, R0, len1)
169 B loop
170len1:
171 CSET HS, R2 // extract carry flag
172len0:
173 MOVD R2, c+56(FP)
174done:
175 RET
176large:
177 AND $0x3, R0, R10
178 AND $~0x3, R0
179 // unrolling for the first 1~4 elements to avoid saving the carry
180 // flag in each step, adjust $R0 if we unrolled 4 elements
181 vwPreIter(ADDS, ADCS, R10, add4)
182 SUB $4, R0
183add4:
184 BCC copy
185 vwOneIter(ADCS, R0, len1)
186 B add4
187copy:
188 MOVD ZR, c+56(FP)
189 CMP R1, R3
190 BEQ done
191copy_4: // no carry flag, copy the rest
192 vwOneIterCopy(R0, done)
193 B copy_4
194
195// func subVW(z, x []Word, y Word) (c Word)
196// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
197// and switches to copy if we are done with carries. The copying is skipped as well
198// if 'x' and 'z' happen to share the same underlying storage.
199// The overhead of the checking and branching is visible when 'z' are small (~5%),
200// so set a threshold of 32, and remain the small-sized part entirely untouched.
201TEXT ·subVW(SB),NOSPLIT,$0
202 MOVD z+0(FP), R3
203 MOVD z_len+8(FP), R0
204 MOVD x+24(FP), R1
205 MOVD y+48(FP), R2
206 CMP $32, R0
207 BGE large // large-sized 'z' and 'x'
208 CBZ R0, len0 // the length of z is 0
209 MOVD.P 8(R1), R4
210 SUBS R2, R4 // z[0] = x[0] - y, set carry
211 MOVD.P R4, 8(R3)
212 SUB $1, R0
213 CBZ R0, len1 // the length of z is 1
214 TBZ $0, R0, two // do it once
215 MOVD.P 8(R1), R4
216 SBCS $0, R4
217 MOVD.P R4, 8(R3)
218 SUB $1, R0
219two: // do it twice
220 TBZ $1, R0, loop
221 LDP.P 16(R1), (R4, R5)
222 SBCS $0, R4, R8 // c, z[i] = x[i] + c
223 SBCS $0, R5, R9
224 STP.P (R8, R9), 16(R3)
225 SUB $2, R0
226loop: // do four times per round
227 vwOneIter(SBCS, R0, len1)
228 B loop
229len1:
230 CSET LO, R2 // extract carry flag
231len0:
232 MOVD R2, c+56(FP)
233done:
234 RET
235large:
236 AND $0x3, R0, R10
237 AND $~0x3, R0
238 // unrolling for the first 1~4 elements to avoid saving the carry
239 // flag in each step, adjust $R0 if we unrolled 4 elements
240 vwPreIter(SUBS, SBCS, R10, sub4)
241 SUB $4, R0
242sub4:
243 BCS copy
244 vwOneIter(SBCS, R0, len1)
245 B sub4
246copy:
247 MOVD ZR, c+56(FP)
248 CMP R1, R3
249 BEQ done
250copy_4: // no carry flag, copy the rest
251 vwOneIterCopy(R0, done)
252 B copy_4
253
254// func shlVU(z, x []Word, s uint) (c Word)
255// This implementation handles the shift operation from the high word to the low word,
256// which may be an error for the case where the low word of x overlaps with the high
257// word of z. When calling this function directly, you need to pay attention to this
258// situation.
259TEXT ·shlVU(SB),NOSPLIT,$0
260 LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z)
261 MOVD x+24(FP), R2
262 MOVD s+48(FP), R3
263 ADD R1<<3, R0 // R0 = &z[n]
264 ADD R1<<3, R2 // R2 = &x[n]
265 CBZ R1, len0
266 CBZ R3, copy // if the number of shift is 0, just copy x to z
267 MOVD $64, R4
268 SUB R3, R4
269 // handling the most significant element x[n-1]
270 MOVD.W -8(R2), R6
271 LSR R4, R6, R5 // return value
272 LSL R3, R6, R8 // x[i] << s
273 SUB $1, R1
274one: TBZ $0, R1, two
275 MOVD.W -8(R2), R6
276 LSR R4, R6, R7
277 ORR R8, R7
278 LSL R3, R6, R8
279 SUB $1, R1
280 MOVD.W R7, -8(R0)
281two:
282 TBZ $1, R1, loop
283 LDP.W -16(R2), (R6, R7)
284 LSR R4, R7, R10
285 ORR R8, R10
286 LSL R3, R7
287 LSR R4, R6, R9
288 ORR R7, R9
289 LSL R3, R6, R8
290 SUB $2, R1
291 STP.W (R9, R10), -16(R0)
292loop:
293 CBZ R1, done
294 LDP.W -32(R2), (R10, R11)
295 LDP 16(R2), (R12, R13)
296 LSR R4, R13, R23
297 ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
298 LSL R3, R13
299 LSR R4, R12, R22
300 ORR R13, R22
301 LSL R3, R12
302 LSR R4, R11, R21
303 ORR R12, R21
304 LSL R3, R11
305 LSR R4, R10, R20
306 ORR R11, R20
307 LSL R3, R10, R8
308 STP.W (R20, R21), -32(R0)
309 STP (R22, R23), 16(R0)
310 SUB $4, R1
311 B loop
312done:
313 MOVD.W R8, -8(R0) // the first element x[0]
314 MOVD R5, c+56(FP) // the part moved out from x[n-1]
315 RET
316copy:
317 CMP R0, R2
318 BEQ len0
319 TBZ $0, R1, ctwo
320 MOVD.W -8(R2), R4
321 MOVD.W R4, -8(R0)
322 SUB $1, R1
323ctwo:
324 TBZ $1, R1, cloop
325 LDP.W -16(R2), (R4, R5)
326 STP.W (R4, R5), -16(R0)
327 SUB $2, R1
328cloop:
329 CBZ R1, len0
330 LDP.W -32(R2), (R4, R5)
331 LDP 16(R2), (R6, R7)
332 STP.W (R4, R5), -32(R0)
333 STP (R6, R7), 16(R0)
334 SUB $4, R1
335 B cloop
336len0:
337 MOVD $0, c+56(FP)
338 RET
339
340// func shrVU(z, x []Word, s uint) (c Word)
341// This implementation handles the shift operation from the low word to the high word,
342// which may be an error for the case where the high word of x overlaps with the low
343// word of z. When calling this function directly, you need to pay attention to this
344// situation.
345TEXT ·shrVU(SB),NOSPLIT,$0
346 MOVD z+0(FP), R0
347 MOVD z_len+8(FP), R1
348 MOVD x+24(FP), R2
349 MOVD s+48(FP), R3
350 MOVD $0, R8
351 MOVD $64, R4
352 SUB R3, R4
353 CBZ R1, len0
354 CBZ R3, copy // if the number of shift is 0, just copy x to z
355
356 MOVD.P 8(R2), R20
357 LSR R3, R20, R8
358 LSL R4, R20
359 MOVD R20, c+56(FP) // deal with the first element
360 SUB $1, R1
361
362 TBZ $0, R1, two
363 MOVD.P 8(R2), R6
364 LSL R4, R6, R20
365 ORR R8, R20
366 LSR R3, R6, R8
367 MOVD.P R20, 8(R0)
368 SUB $1, R1
369two:
370 TBZ $1, R1, loop
371 LDP.P 16(R2), (R6, R7)
372 LSL R4, R6, R20
373 LSR R3, R6
374 ORR R8, R20
375 LSL R4, R7, R21
376 LSR R3, R7, R8
377 ORR R6, R21
378 STP.P (R20, R21), 16(R0)
379 SUB $2, R1
380loop:
381 CBZ R1, done
382 LDP.P 32(R2), (R10, R11)
383 LDP -16(R2), (R12, R13)
384 LSL R4, R10, R20
385 LSR R3, R10
386 ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
387 LSL R4, R11, R21
388 LSR R3, R11
389 ORR R10, R21
390 LSL R4, R12, R22
391 LSR R3, R12
392 ORR R11, R22
393 LSL R4, R13, R23
394 LSR R3, R13, R8
395 ORR R12, R23
396 STP.P (R20, R21), 32(R0)
397 STP (R22, R23), -16(R0)
398 SUB $4, R1
399 B loop
400done:
401 MOVD R8, (R0) // deal with the last element
402 RET
403copy:
404 CMP R0, R2
405 BEQ len0
406 TBZ $0, R1, ctwo
407 MOVD.P 8(R2), R3
408 MOVD.P R3, 8(R0)
409 SUB $1, R1
410ctwo:
411 TBZ $1, R1, cloop
412 LDP.P 16(R2), (R4, R5)
413 STP.P (R4, R5), 16(R0)
414 SUB $2, R1
415cloop:
416 CBZ R1, len0
417 LDP.P 32(R2), (R4, R5)
418 LDP -16(R2), (R6, R7)
419 STP.P (R4, R5), 32(R0)
420 STP (R6, R7), -16(R0)
421 SUB $4, R1
422 B cloop
423len0:
424 MOVD $0, c+56(FP)
425 RET
426
427
428// func mulAddVWW(z, x []Word, y, r Word) (c Word)
429TEXT ·mulAddVWW(SB),NOSPLIT,$0
430 MOVD z+0(FP), R1
431 MOVD z_len+8(FP), R0
432 MOVD x+24(FP), R2
433 MOVD y+48(FP), R3
434 MOVD r+56(FP), R4
435 // c, z = x * y + r
436 TBZ $0, R0, two
437 MOVD.P 8(R2), R5
438 MUL R3, R5, R7
439 UMULH R3, R5, R8
440 ADDS R4, R7
441 ADC $0, R8, R4 // c, z[i] = x[i] * y + r
442 MOVD.P R7, 8(R1)
443 SUB $1, R0
444two:
445 TBZ $1, R0, loop
446 LDP.P 16(R2), (R5, R6)
447 MUL R3, R5, R10
448 UMULH R3, R5, R11
449 ADDS R4, R10
450 MUL R3, R6, R12
451 UMULH R3, R6, R13
452 ADCS R12, R11
453 ADC $0, R13, R4
454
455 STP.P (R10, R11), 16(R1)
456 SUB $2, R0
457loop:
458 CBZ R0, done
459 LDP.P 32(R2), (R5, R6)
460 LDP -16(R2), (R7, R8)
461
462 MUL R3, R5, R10
463 UMULH R3, R5, R11
464 ADDS R4, R10
465 MUL R3, R6, R12
466 UMULH R3, R6, R13
467 ADCS R11, R12
468
469 MUL R3, R7, R14
470 UMULH R3, R7, R15
471 ADCS R13, R14
472 MUL R3, R8, R16
473 UMULH R3, R8, R17
474 ADCS R15, R16
475 ADC $0, R17, R4
476
477 STP.P (R10, R12), 32(R1)
478 STP (R14, R16), -16(R1)
479 SUB $4, R0
480 B loop
481done:
482 MOVD R4, c+64(FP)
483 RET
484
485
486// func addMulVVW(z, x []Word, y Word) (c Word)
487TEXT ·addMulVVW(SB),NOSPLIT,$0
488 MOVD z+0(FP), R1
489 MOVD z_len+8(FP), R0
490 MOVD x+24(FP), R2
491 MOVD y+48(FP), R3
492 MOVD $0, R4
493
494 TBZ $0, R0, two
495
496 MOVD.P 8(R2), R5
497 MOVD (R1), R6
498
499 MUL R5, R3, R7
500 UMULH R5, R3, R8
501
502 ADDS R7, R6
503 ADC $0, R8, R4
504
505 MOVD.P R6, 8(R1)
506 SUB $1, R0
507
508two:
509 TBZ $1, R0, loop
510
511 LDP.P 16(R2), (R5, R10)
512 LDP (R1), (R6, R11)
513
514 MUL R10, R3, R13
515 UMULH R10, R3, R12
516
517 MUL R5, R3, R7
518 UMULH R5, R3, R8
519
520 ADDS R4, R6
521 ADCS R13, R11
522 ADC $0, R12
523
524 ADDS R7, R6
525 ADCS R8, R11
526 ADC $0, R12, R4
527
528 STP.P (R6, R11), 16(R1)
529 SUB $2, R0
530
531// The main loop of this code operates on a block of 4 words every iteration
532// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
533// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
534// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
535loop:
536 CBZ R0, done
537
538 LDP.P 16(R2), (R5, R6)
539 LDP.P 16(R2), (R7, R8)
540
541 LDP (R1), (R9, R10)
542 ADDS R4, R9
543 MUL R6, R3, R14
544 ADCS R14, R10
545 MUL R7, R3, R15
546 LDP 16(R1), (R11, R12)
547 ADCS R15, R11
548 MUL R8, R3, R16
549 ADCS R16, R12
550 UMULH R8, R3, R20
551 ADC $0, R20
552
553 MUL R5, R3, R13
554 ADDS R13, R9
555 UMULH R5, R3, R17
556 ADCS R17, R10
557 UMULH R6, R3, R21
558 STP.P (R9, R10), 16(R1)
559 ADCS R21, R11
560 UMULH R7, R3, R19
561 ADCS R19, R12
562 STP.P (R11, R12), 16(R1)
563 ADC $0, R20, R4
564
565 SUB $4, R0
566 B loop
567
568done:
569 MOVD R4, c+56(FP)
570 RET
571
572
View as plain text