Text file
src/math/big/arith_amd64.s
1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
13// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
14// This is faster than using rotate instructions.
15
16// func addVV(z, x, y []Word) (c Word)
// addVV stores x[i] + y[i] + carry into z[i] for i in [0, len(z)) and
// returns the final carry (0 or 1) in c. Only len(z) is read; x and y are
// indexed up to that length with no check here (assumes the caller
// guarantees len(x), len(y) >= len(z) — TODO confirm).
//
// Register roles: DI = n (elements left, biased by -4 while in U1),
// SI = i, R8 = &x[0], R9 = &y[0], R10 = &z[0], R11-R14 = scratch words,
// CX = carry parked between iterations as 0 or -1.
17TEXT ·addVV(SB),NOSPLIT,$0
18 MOVQ z_len+8(FP), DI // n = len(z)
19 MOVQ x+24(FP), R8 // R8 = &x[0]
20 MOVQ y+48(FP), R9 // R9 = &y[0]
21 MOVQ z+0(FP), R10 // R10 = &z[0]
22
23 MOVQ $0, CX // c = 0
24 MOVQ $0, SI // i = 0
25
26 // s/JL/JMP/ below to disable the unrolled loop
27 SUBQ $4, DI // n -= 4
28 JL V1 // if n < 0 goto V1
29
30U1: // n >= 0
31 // regular loop body unrolled 4x
 // MOVQ does not modify flags, so CF flows unbroken through the four
 // ADCQ instructions below.
32 ADDQ CX, CX // restore CF
33 MOVQ 0(R8)(SI*8), R11
34 MOVQ 8(R8)(SI*8), R12
35 MOVQ 16(R8)(SI*8), R13
36 MOVQ 24(R8)(SI*8), R14
37 ADCQ 0(R9)(SI*8), R11
38 ADCQ 8(R9)(SI*8), R12
39 ADCQ 16(R9)(SI*8), R13
40 ADCQ 24(R9)(SI*8), R14
41 MOVQ R11, 0(R10)(SI*8)
42 MOVQ R12, 8(R10)(SI*8)
43 MOVQ R13, 16(R10)(SI*8)
44 MOVQ R14, 24(R10)(SI*8)
45 SBBQ CX, CX // save CF
46
 // The index/counter updates below overwrite CF, which is why the carry
 // was parked in CX first.
47 ADDQ $4, SI // i += 4
48 SUBQ $4, DI // n -= 4
49 JGE U1 // if n >= 0 goto U1
50
 // Undo the -4 bias: DI is now the number of elements not yet processed.
51V1: ADDQ $4, DI // n += 4
52 JLE E1 // if n <= 0 goto E1
53
54L1: // n > 0
 // one element per iteration, same carry protocol as U1
55 ADDQ CX, CX // restore CF
56 MOVQ 0(R8)(SI*8), R11
57 ADCQ 0(R9)(SI*8), R11
58 MOVQ R11, 0(R10)(SI*8)
59 SBBQ CX, CX // save CF
60
61 ADDQ $1, SI // i++
62 SUBQ $1, DI // n--
63 JG L1 // if n > 0 goto L1
64
65E1: NEGQ CX // convert saved carry from 0/-1 to 0/1
66 MOVQ CX, c+72(FP) // return c
67 RET
68
69
70// func subVV(z, x, y []Word) (c Word)
71// (same as addVV except for SBBQ instead of ADCQ and label names)
// subVV stores x[i] - y[i] - borrow into z[i] for i in [0, len(z)) and
// returns the final borrow (0 or 1) in c. Only len(z) is read; x and y are
// indexed up to that length with no check here (assumes the caller
// guarantees len(x), len(y) >= len(z) — TODO confirm).
//
// Register roles: DI = n (elements left, biased by -4 while in U2),
// SI = i, R8 = &x[0], R9 = &y[0], R10 = &z[0], R11-R14 = scratch words,
// CX = borrow parked between iterations as 0 or -1.
72TEXT ·subVV(SB),NOSPLIT,$0
73 MOVQ z_len+8(FP), DI // n = len(z)
74 MOVQ x+24(FP), R8 // R8 = &x[0]
75 MOVQ y+48(FP), R9 // R9 = &y[0]
76 MOVQ z+0(FP), R10 // R10 = &z[0]
77
78 MOVQ $0, CX // c = 0
79 MOVQ $0, SI // i = 0
80
81 // s/JL/JMP/ below to disable the unrolled loop
82 SUBQ $4, DI // n -= 4
83 JL V2 // if n < 0 goto V2
84
85U2: // n >= 0
86 // regular loop body unrolled 4x
 // MOVQ does not modify flags, so the borrow in CF flows unbroken through
 // the four SBBQ instructions below.
87 ADDQ CX, CX // restore CF
88 MOVQ 0(R8)(SI*8), R11
89 MOVQ 8(R8)(SI*8), R12
90 MOVQ 16(R8)(SI*8), R13
91 MOVQ 24(R8)(SI*8), R14
92 SBBQ 0(R9)(SI*8), R11
93 SBBQ 8(R9)(SI*8), R12
94 SBBQ 16(R9)(SI*8), R13
95 SBBQ 24(R9)(SI*8), R14
96 MOVQ R11, 0(R10)(SI*8)
97 MOVQ R12, 8(R10)(SI*8)
98 MOVQ R13, 16(R10)(SI*8)
99 MOVQ R14, 24(R10)(SI*8)
100 SBBQ CX, CX // save CF
101
 // The index/counter updates below overwrite CF, which is why the borrow
 // was parked in CX first.
102 ADDQ $4, SI // i += 4
103 SUBQ $4, DI // n -= 4
104 JGE U2 // if n >= 0 goto U2
105
 // Undo the -4 bias: DI is now the number of elements not yet processed.
106V2: ADDQ $4, DI // n += 4
107 JLE E2 // if n <= 0 goto E2
108
109L2: // n > 0
 // one element per iteration, same borrow protocol as U2
110 ADDQ CX, CX // restore CF
111 MOVQ 0(R8)(SI*8), R11
112 SBBQ 0(R9)(SI*8), R11
113 MOVQ R11, 0(R10)(SI*8)
114 SBBQ CX, CX // save CF
115
116 ADDQ $1, SI // i++
117 SUBQ $1, DI // n--
118 JG L2 // if n > 0 goto L2
119
120E2: NEGQ CX // convert saved borrow from 0/-1 to 0/1
121 MOVQ CX, c+72(FP) // return c
122 RET
123
124
125// func addVW(z, x []Word, y Word) (c Word)
// addVW adds the single word y to the vector x, storing the len(z) result
// words in z and returning the carry out of the top word in c.
// When len(z) > 32 it jumps to addVWlarge instead (not defined in this
// section; presumably it takes the same arguments — verify).
//
// Register roles: DI = n (elements left, biased by -4 while in U3),
// SI = i, R8 = &x[0], R10 = &z[0], R11-R14 = scratch words,
// CX = running carry word: y on entry, 0 or 1 after each step.
126TEXT ·addVW(SB),NOSPLIT,$0
127 MOVQ z_len+8(FP), DI // n = len(z)
128 CMPQ DI, $32
129 JG large // if n > 32 use the large-vector routine
130 MOVQ x+24(FP), R8 // R8 = &x[0]
131 MOVQ y+48(FP), CX // c = y
132 MOVQ z+0(FP), R10 // R10 = &z[0]
133
134 MOVQ $0, SI // i = 0
135
136 // s/JL/JMP/ below to disable the unrolled loop
137 SUBQ $4, DI // n -= 4
138 JL V3 // if n < 0 goto V3
139
140U3: // n >= 0
141 // regular loop body unrolled 4x
142 MOVQ 0(R8)(SI*8), R11
143 MOVQ 8(R8)(SI*8), R12
144 MOVQ 16(R8)(SI*8), R13
145 MOVQ 24(R8)(SI*8), R14
146 ADDQ CX, R11 // x[i] + c
147 ADCQ $0, R12 // ripple the carry through the next three words
148 ADCQ $0, R13
149 ADCQ $0, R14
150 SBBQ CX, CX // save CF
151 NEGQ CX // c = 0 or 1
152 MOVQ R11, 0(R10)(SI*8)
153 MOVQ R12, 8(R10)(SI*8)
154 MOVQ R13, 16(R10)(SI*8)
155 MOVQ R14, 24(R10)(SI*8)
156
157 ADDQ $4, SI // i += 4
158 SUBQ $4, DI // n -= 4
159 JGE U3 // if n >= 0 goto U3
160
 // Undo the -4 bias: DI is now the number of elements not yet processed.
161V3: ADDQ $4, DI // n += 4
162 JLE E3 // if n <= 0 goto E3
163
164L3: // n > 0
165 ADDQ 0(R8)(SI*8), CX // CX = x[i] + c
166 MOVQ CX, 0(R10)(SI*8) // z[i] = x[i] + c
167 SBBQ CX, CX // save CF
168 NEGQ CX // c = 0 or 1
169
170 ADDQ $1, SI // i++
171 SUBQ $1, DI // n--
172 JG L3 // if n > 0 goto L3
173
174E3: MOVQ CX, c+56(FP) // return c
175 RET
 // Only DI has been loaded at this point, so the argument frame is
 // untouched when control transfers.
176large:
177 JMP ·addVWlarge(SB)
178
179
180// func subVW(z, x []Word, y Word) (c Word)
181// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
// subVW subtracts the single word y from the vector x, storing the len(z)
// result words in z and returning the borrow out of the top word in c.
// When len(z) > 32 it jumps to subVWlarge instead (not defined in this
// section; presumably it takes the same arguments — verify).
//
// Register roles: DI = n (elements left, biased by -4 while in U4),
// SI = i, R8 = &x[0], R10 = &z[0], R11-R14 = scratch words,
// CX = running borrow word: y on entry, 0 or 1 after each step.
182TEXT ·subVW(SB),NOSPLIT,$0
183 MOVQ z_len+8(FP), DI // n = len(z)
184 CMPQ DI, $32
185 JG large // if n > 32 use the large-vector routine
186 MOVQ x+24(FP), R8 // R8 = &x[0]
187 MOVQ y+48(FP), CX // c = y
188 MOVQ z+0(FP), R10 // R10 = &z[0]
189
190 MOVQ $0, SI // i = 0
191
192 // s/JL/JMP/ below to disable the unrolled loop
193 SUBQ $4, DI // n -= 4
194 JL V4 // if n < 0 goto V4
195
196U4: // n >= 0
197 // regular loop body unrolled 4x
198 MOVQ 0(R8)(SI*8), R11
199 MOVQ 8(R8)(SI*8), R12
200 MOVQ 16(R8)(SI*8), R13
201 MOVQ 24(R8)(SI*8), R14
202 SUBQ CX, R11 // x[i] - c
203 SBBQ $0, R12 // ripple the borrow through the next three words
204 SBBQ $0, R13
205 SBBQ $0, R14
206 SBBQ CX, CX // save CF
207 NEGQ CX // c = 0 or 1
208 MOVQ R11, 0(R10)(SI*8)
209 MOVQ R12, 8(R10)(SI*8)
210 MOVQ R13, 16(R10)(SI*8)
211 MOVQ R14, 24(R10)(SI*8)
212
213 ADDQ $4, SI // i += 4
214 SUBQ $4, DI // n -= 4
215 JGE U4 // if n >= 0 goto U4
216
 // Undo the -4 bias: DI is now the number of elements not yet processed.
217V4: ADDQ $4, DI // n += 4
218 JLE E4 // if n <= 0 goto E4
219
220L4: // n > 0
221 MOVQ 0(R8)(SI*8), R11
222 SUBQ CX, R11 // x[i] - c
223 MOVQ R11, 0(R10)(SI*8) // z[i] = x[i] - c
224 SBBQ CX, CX // save CF
225 NEGQ CX // c = 0 or 1
226
227 ADDQ $1, SI // i++
228 SUBQ $1, DI // n--
229 JG L4 // if n > 0 goto L4
230
231E4: MOVQ CX, c+56(FP) // return c
232 RET
 // Only DI has been loaded at this point, so the argument frame is
 // untouched when control transfers.
233large:
234 JMP ·subVWlarge(SB)
235
236
237// func shlVU(z, x []Word, s uint) (c Word)
// shlVU shifts the len(z)-word vector x left by s bits into z and returns in
// c the bits shifted out of the top word, x[n-1]>>ŝ. In the comments below
// ŝ is the complementary shift 64-s, which is what the two-register SHLQ
// form fills from. Returns c = 0 without touching z when len(z) == 0.
// The loop walks from the highest index down to 0.
// NOTE(review): the hardware masks the shift count in CX to 6 bits, so this
// presumably expects 0 <= s < 64 — confirm against callers.
//
// Register roles: BX = i, R8 = &x[0], R10 = &z[0], CX = s,
// AX = w1 (the lower neighbour word), DX = w (word being assembled).
238TEXT ·shlVU(SB),NOSPLIT,$0
239 MOVQ z_len+8(FP), BX // i = len(z)
240 SUBQ $1, BX // i--
241 JL X8b // i < 0 (n <= 0)
242
243 // n > 0
244 MOVQ z+0(FP), R10
245 MOVQ x+24(FP), R8
246 MOVQ s+48(FP), CX
247 MOVQ (R8)(BX*8), AX // w1 = x[n-1]
248 MOVQ $0, DX
 // SHLQ CX, AX, DX: DX = DX<<s, with the vacated low bits filled from the
 // high bits of AX. With DX = 0 this leaves just w1>>ŝ.
249 SHLQ CX, AX, DX // w1>>ŝ
250 MOVQ DX, c+56(FP)
251
252 CMPQ BX, $0
253 JLE X8a // i <= 0
254
255 // i > 0
256L8: MOVQ AX, DX // w = w1
257 MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
258 SHLQ CX, AX, DX // w<<s | w1>>ŝ
259 MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
260 SUBQ $1, BX // i--
261 JG L8 // i > 0
262
263 // i <= 0
 // The lowest word has no lower neighbour, so zeros are shifted in.
264X8a: SHLQ CX, AX // w1<<s
265 MOVQ AX, (R10) // z[0] = w1<<s
266 RET
267
268X8b: MOVQ $0, c+56(FP) // empty vector: c = 0
269 RET
270
271
272// func shrVU(z, x []Word, s uint) (c Word)
// shrVU shifts the len(z)-word vector x right by s bits into z and returns
// in c the bits shifted out of the bottom word, x[0]<<ŝ. In the comments
// below ŝ is the complementary shift 64-s, which is what the two-register
// SHRQ form fills from. Returns c = 0 without touching z when len(z) == 0.
// The loop walks from index 0 up to n-1.
// NOTE(review): the hardware masks the shift count in CX to 6 bits, so this
// presumably expects 0 <= s < 64 — confirm against callers.
//
// Register roles: R11 = n-1, BX = i, R8 = &x[0], R10 = &z[0], CX = s,
// AX = w1 (the upper neighbour word), DX = w (word being assembled).
273TEXT ·shrVU(SB),NOSPLIT,$0
274 MOVQ z_len+8(FP), R11 // n = len(z)
275 SUBQ $1, R11 // n--
276 JL X9b // n < 0 (n <= 0)
277
278 // n > 0
279 MOVQ z+0(FP), R10
280 MOVQ x+24(FP), R8
281 MOVQ s+48(FP), CX
282 MOVQ (R8), AX // w1 = x[0]
283 MOVQ $0, DX
 // SHRQ CX, AX, DX: DX = DX>>s, with the vacated high bits filled from the
 // low bits of AX. With DX = 0 this leaves just w1<<ŝ.
284 SHRQ CX, AX, DX // w1<<ŝ
285 MOVQ DX, c+56(FP)
286
287 MOVQ $0, BX // i = 0
288 JMP E9 // test the loop condition before the first iteration
289
290 // i < n-1
291L9: MOVQ AX, DX // w = w1
292 MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
293 SHRQ CX, AX, DX // w>>s | w1<<ŝ
294 MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
295 ADDQ $1, BX // i++
296
297E9: CMPQ BX, R11
298 JL L9 // i < n-1
299
300 // i >= n-1
 // The highest word has no upper neighbour, so zeros are shifted in.
301X9a: SHRQ CX, AX // w1>>s
302 MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
303 RET
304
305X9b: MOVQ $0, c+56(FP) // empty vector: c = 0
306 RET
307
308
309// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// mulAddVWW computes, for i in [0, len(z)), the two-word value
// x[i]*y + c (c starts as r), stores its low word in z[i] and carries its
// high word into the next step; the final high word is returned in c.
//
// Register roles: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, BX = i,
// CX = c, DX:AX = product written by MULQ (DX high, AX low).
310TEXT ·mulAddVWW(SB),NOSPLIT,$0
311 MOVQ z+0(FP), R10
312 MOVQ x+24(FP), R8
313 MOVQ y+48(FP), R9
314 MOVQ r+56(FP), CX // c = r
315 MOVQ z_len+8(FP), R11 // n = len(z)
316 MOVQ $0, BX // i = 0
317
318 CMPQ R11, $4
319 JL E5 // fewer than 4 words: skip the unrolled loop
320
321U5: // i+4 <= n
322 // regular loop body unrolled 4x
 // Each step: DX:AX = x[i+k]*y; add c into the low word and propagate the
 // carry into the high word; store the low word; the high word becomes c.
323 MOVQ (0*8)(R8)(BX*8), AX
324 MULQ R9
325 ADDQ CX, AX
326 ADCQ $0, DX
327 MOVQ AX, (0*8)(R10)(BX*8)
328 MOVQ DX, CX
329 MOVQ (1*8)(R8)(BX*8), AX
330 MULQ R9
331 ADDQ CX, AX
332 ADCQ $0, DX
333 MOVQ AX, (1*8)(R10)(BX*8)
334 MOVQ DX, CX
335 MOVQ (2*8)(R8)(BX*8), AX
336 MULQ R9
337 ADDQ CX, AX
338 ADCQ $0, DX
339 MOVQ AX, (2*8)(R10)(BX*8)
340 MOVQ DX, CX
341 MOVQ (3*8)(R8)(BX*8), AX
342 MULQ R9
343 ADDQ CX, AX
344 ADCQ $0, DX
345 MOVQ AX, (3*8)(R10)(BX*8)
346 MOVQ DX, CX
347 ADDQ $4, BX // i += 4
348
 // DX is free here (its value was already copied to CX), so it is reused
 // as scratch for i+4.
349 LEAQ 4(BX), DX
350 CMPQ DX, R11
351 JLE U5 // if i+4 <= n goto U5
352 JMP E5
353
 // remaining words, one per iteration
354L5: MOVQ (R8)(BX*8), AX
355 MULQ R9
356 ADDQ CX, AX
357 ADCQ $0, DX
358 MOVQ AX, (R10)(BX*8)
359 MOVQ DX, CX
360 ADDQ $1, BX // i++
361
362E5: CMPQ BX, R11 // i < n
363 JL L5
364
365 MOVQ CX, c+64(FP) // return c
366 RET
367
368
369// func addMulVVW(z, x []Word, y Word) (c Word)
// addMulVVW computes, for i in [0, len(z)), the two-word value
// z[i] + x[i]*y + c (c starts at 0), stores its low word back in z[i] and
// carries its high word into the next step; the final high word is
// returned in c.
//
// Two implementations are selected by the byte ·support_adx (defined
// outside this section):
//   - generic path (A6/L6/E6): MULQ with ADDQ/ADCQ, unrolled 2x;
//   - ADX path (adx...): MULXQ with the independent ADCXQ (CF) and ADOXQ (OF)
//     carry chains, unrolled 8x, with a one-word tail loop.
370TEXT ·addMulVVW(SB),NOSPLIT,$0
371 CMPB ·support_adx(SB), $1
372 JEQ adx
 // Generic path. R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, BX = i, CX = c,
 // R12 = n rounded down to an even count, DX:AX = MULQ product.
373 MOVQ z+0(FP), R10
374 MOVQ x+24(FP), R8
375 MOVQ y+48(FP), R9
376 MOVQ z_len+8(FP), R11
377 MOVQ $0, BX // i = 0
378 MOVQ $0, CX // c = 0
379 MOVQ R11, R12
380 ANDQ $-2, R12 // R12 = n with the low bit cleared
381 CMPQ R11, $2
382 JAE A6 // at least 2 words: run the unrolled loop
383 JMP E6
384
 // two words per iteration: DX:AX = x[i]*y, then add z[i] and c into the
 // low word, folding each carry into the high word
385A6:
386 MOVQ (R8)(BX*8), AX
387 MULQ R9
388 ADDQ (R10)(BX*8), AX
389 ADCQ $0, DX
390 ADDQ CX, AX
391 ADCQ $0, DX
392 MOVQ DX, CX
393 MOVQ AX, (R10)(BX*8)
394
395 MOVQ (8)(R8)(BX*8), AX
396 MULQ R9
397 ADDQ (8)(R10)(BX*8), AX
398 ADCQ $0, DX
399 ADDQ CX, AX
400 ADCQ $0, DX
401 MOVQ DX, CX
402 MOVQ AX, (8)(R10)(BX*8)
403
404 ADDQ $2, BX // i += 2
405 CMPQ BX, R12
406 JL A6
407 JMP E6
408
 // one word per iteration; here the low word is added to z[i] in memory
409L6: MOVQ (R8)(BX*8), AX
410 MULQ R9
411 ADDQ CX, AX
412 ADCQ $0, DX
413 ADDQ AX, (R10)(BX*8)
414 ADCQ $0, DX
415 MOVQ DX, CX
416 ADDQ $1, BX // i++
417
418E6: CMPQ BX, R11 // i < n
419 JL L6
420
421 MOVQ CX, c+56(FP) // return c
422 RET
423
 // ADX path. R11 = n, R10/R8 = moving pointers into z/x, DX = y (the
 // implicit multiplicand of MULXQ), BX = i, CX = c,
 // R13 = n rounded down to a multiple of 8.
424adx:
425 MOVQ z_len+8(FP), R11
426 MOVQ z+0(FP), R10
427 MOVQ x+24(FP), R8
428 MOVQ y+48(FP), DX
429 MOVQ $0, BX // i = 0
430 MOVQ $0, CX // carry
431 CMPQ R11, $8
432 JAE adx_loop_header // at least 8 words: run the unrolled loop
433 CMPQ BX, R11
434 JL adx_short // 1..7 words: tail loop only
435 MOVQ CX, c+56(FP) // n == 0: return 0
436 RET
437
438adx_loop_header:
439 MOVQ R11, R13
440 ANDQ $-8, R13 // R13 = n with the low three bits cleared
441adx_loop:
 // Eight words per iteration. Each MULXQ yields a low/high pair; the code
 // uses the first destination as the low word and the second as the high
 // word fed into the next step. ADCXQ adds the previous high word through
 // CF, ADOXQ adds z[k] through OF, so the two carry chains do not disturb
 // each other. The destinations alternate (SI,DI) / (AX,CX) so the
 // previous high word stays live while the next product is formed.
442 XORQ R9, R9 // unset flags
443 MULXQ (R8), SI, DI
444 ADCXQ CX,SI
445 ADOXQ (R10), SI
446 MOVQ SI,(R10)
447
448 MULXQ 8(R8), AX, CX
449 ADCXQ DI, AX
450 ADOXQ 8(R10), AX
451 MOVQ AX, 8(R10)
452
453 MULXQ 16(R8), SI, DI
454 ADCXQ CX, SI
455 ADOXQ 16(R10), SI
456 MOVQ SI, 16(R10)
457
458 MULXQ 24(R8), AX, CX
459 ADCXQ DI, AX
460 ADOXQ 24(R10), AX
461 MOVQ AX, 24(R10)
462
463 MULXQ 32(R8), SI, DI
464 ADCXQ CX, SI
465 ADOXQ 32(R10), SI
466 MOVQ SI, 32(R10)
467
468 MULXQ 40(R8), AX, CX
469 ADCXQ DI, AX
470 ADOXQ 40(R10), AX
471 MOVQ AX, 40(R10)
472
473 MULXQ 48(R8), SI, DI
474 ADCXQ CX, SI
475 ADOXQ 48(R10), SI
476 MOVQ SI, 48(R10)
477
478 MULXQ 56(R8), AX, CX
479 ADCXQ DI, AX
480 ADOXQ 56(R10), AX
481 MOVQ AX, 56(R10)
482
 // R9 is still 0 (from the XORQ above): fold the pending CF and OF carries
 // into the last high word before the ADDQs below overwrite the flags.
483 ADCXQ R9, CX
484 ADOXQ R9, CX
485
486 ADDQ $64, R8 // advance x pointer by 8 words
487 ADDQ $64, R10 // advance z pointer by 8 words
488 ADDQ $8, BX // i += 8
489
490 CMPQ BX, R13
491 JL adx_loop
 // adx_short indexes from the slice base with BX, so reload the base
 // pointers that the loop advanced.
492 MOVQ z+0(FP), R10
493 MOVQ x+24(FP), R8
494 CMPQ BX, R11
495 JL adx_short // leftover words remain
496 MOVQ CX, c+56(FP) // return c
497 RET
498
 // one word per iteration using ordinary ADDQ/ADCQ carries
499adx_short:
500 MULXQ (R8)(BX*8), SI, DI
501 ADDQ CX, SI
502 ADCQ $0, DI
503 ADDQ SI, (R10)(BX*8)
504 ADCQ $0, DI
505 MOVQ DI, CX
506 ADDQ $1, BX // i++
507
508 CMPQ BX, R11
509 JL adx_short
510
511 MOVQ CX, c+56(FP) // return c
512 RET
513
514
515
View as plain text