Text file
src/math/big/arith_ppc64x.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go && (ppc64 || ppc64le)
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// func addVV(z, x, y []Word) (c Word)
13// z[i] = x[i] + y[i] for all i, carrying
//
// Adds the z_len words of x and y into z, propagating the carry through the
// CA bit, and returns the final carry (0 or 1) in c. When z_len is 0 the
// function returns 0 without reading x or y or writing z.
//
// Structure: the first word is added with ADDC to seed CA, groups of four
// words are then handled by the unrolled loop, and up to three leftover words
// are handled by the straight-line tail. Only ADDC/ADDE/ADDZE produce or
// consume CA here; the ADD and CMP instructions placed between them are
// relied upon to leave CA unchanged.
14TEXT ·addVV(SB), NOSPLIT, $0
15 MOVD z_len+8(FP), R7 // R7 = z_len
16 MOVD x+24(FP), R8 // R8 = x[]
17 MOVD y+48(FP), R9 // R9 = y[]
18 MOVD z+0(FP), R10 // R10 = z[]
19
20 // If z_len = 0, we are done
21 CMP R7, $0
22 MOVD R0, R4 // R4 = c = 0
23 BEQ done
24
25 // Process the first iteration out of the loop so we can
26 // use MOVDU and avoid 3 index registers updates.
27 MOVD 0(R8), R11 // R11 = x[i]
28 MOVD 0(R9), R12 // R12 = y[i]
29 ADD $-1, R7 // R7 = z_len - 1
30 ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
31 CMP R7, $0
32 MOVD R15, 0(R10) // z[i]
33 BEQ final // If z_len was 1, we are done
34
35 SRD $2, R7, R5 // R5 = (z_len - 1)/4, the number of 4-word groups left
36 CMP R5, $0
37 MOVD R5, CTR // Set up loop counter
38 BEQ tail // If R5 = 0, we can't use the loop
39
40 // Process 4 elements per iteration. Unrolling this loop
41 // means a performance trade-off: we will lose performance
42 // for small values of z_len (0.90x in the worst case), but
43 // gain significant performance as z_len increases (up to
44 // 1.45x).
45
46 PCALIGN $16
47loop:
48 MOVD 8(R8), R11 // R11 = x[i]
49 MOVD 16(R8), R12 // R12 = x[i+1]
50 MOVD 24(R8), R14 // R14 = x[i+2]
51 MOVDU 32(R8), R15 // R15 = x[i+3]; R8 is updated to point at it
52 MOVD 8(R9), R16 // R16 = y[i]
53 MOVD 16(R9), R17 // R17 = y[i+1]
54 MOVD 24(R9), R18 // R18 = y[i+2]
55 MOVDU 32(R9), R19 // R19 = y[i+3]; R9 is updated to point at it
56 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
57 ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
58 ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
59 ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
60 MOVD R20, 8(R10) // z[i]
61 MOVD R21, 16(R10) // z[i+1]
62 MOVD R22, 24(R10) // z[i+2]
63 MOVDU R23, 32(R10) // z[i+3]; R10 is updated to point at it
64 ADD $-4, R7 // R7 -= 4 (words still to process)
65 BDNZ loop
66
67 // We may have more elements to read
68 CMP R7, $0
69 BEQ final
70
71 // Process the remaining elements, one at a time
72tail:
73 MOVDU 8(R8), R11 // R11 = x[i]
74 MOVDU 8(R9), R16 // R16 = y[i]
75 ADD $-1, R7 // R7 = z_len - 1
76 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
77 CMP R7, $0
78 MOVDU R20, 8(R10) // z[i]
79 BEQ final // If R7 = 0, we are done
80
81 MOVDU 8(R8), R11 // second leftover word: R11 = x[i]
82 MOVDU 8(R9), R16 // R16 = y[i]
83 ADD $-1, R7
84 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
85 CMP R7, $0
86 MOVDU R20, 8(R10) // z[i]
87 BEQ final
88
89 MOVD 8(R8), R11 // third and last possible leftover word
90 MOVD 8(R9), R16
91 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
92 MOVD R20, 8(R10) // z[i]
93
94final:
95 ADDZE R4 // R4 = 0 + CA: capture the final carry
96
97done:
98 MOVD R4, c+72(FP) // return c
99 RET
100
101// func subVV(z, x, y []Word) (c Word)
102// z[i] = x[i] - y[i] for all i, carrying
//
// Subtracts the z_len words of y from x into z, chaining the borrow through
// the CA bit with SUBC/SUBE, and returns the final borrow in c. The returned
// value is CA XOR 1 (see the final label), i.e. CA holds the inverse of the
// borrow. When z_len is 0 the function returns 0 without reading x or y or
// writing z.
//
// Structure mirrors addVV: the first word seeds CA, an unrolled loop handles
// four words per iteration, and a straight-line tail handles up to three
// leftover words. The ADD and CMP instructions between the SUBE steps are
// relied upon to leave CA unchanged.
103TEXT ·subVV(SB), NOSPLIT, $0
104 MOVD z_len+8(FP), R7 // R7 = z_len
105 MOVD x+24(FP), R8 // R8 = x[]
106 MOVD y+48(FP), R9 // R9 = y[]
107 MOVD z+0(FP), R10 // R10 = z[]
108
109 // If z_len = 0, we are done
110 CMP R7, $0
111 MOVD R0, R4 // R4 = c = 0
112 BEQ done
113
114 // Process the first iteration out of the loop so we can
115 // use MOVDU and avoid 3 index registers updates.
116 MOVD 0(R8), R11 // R11 = x[i]
117 MOVD 0(R9), R12 // R12 = y[i]
118 ADD $-1, R7 // R7 = z_len - 1
119 SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
120 CMP R7, $0
121 MOVD R15, 0(R10) // z[i]
122 BEQ final // If z_len was 1, we are done
123
124 SRD $2, R7, R5 // R5 = (z_len - 1)/4, the number of 4-word groups left
125 CMP R5, $0
126 MOVD R5, CTR // Set up loop counter
127 BEQ tail // If R5 = 0, we can't use the loop
128
129 // Process 4 elements per iteration. Unrolling this loop
130 // means a performance trade-off: we will lose performance
131 // for small values of z_len (0.92x in the worst case), but
132 // gain significant performance as z_len increases (up to
133 // 1.45x).
134
135 PCALIGN $16
136loop:
137 MOVD 8(R8), R11 // R11 = x[i]
138 MOVD 16(R8), R12 // R12 = x[i+1]
139 MOVD 24(R8), R14 // R14 = x[i+2]
140 MOVDU 32(R8), R15 // R15 = x[i+3]; R8 is updated to point at it
141 MOVD 8(R9), R16 // R16 = y[i]
142 MOVD 16(R9), R17 // R17 = y[i+1]
143 MOVD 24(R9), R18 // R18 = y[i+2]
144 MOVDU 32(R9), R19 // R19 = y[i+3]; R9 is updated to point at it
145 SUBE R16, R11, R20 // R20 = x[i] - y[i], borrow chained through CA
146 SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1], borrow chained through CA
147 SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2], borrow chained through CA
148 SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3], borrow chained through CA
149 MOVD R20, 8(R10) // z[i]
150 MOVD R21, 16(R10) // z[i+1]
151 MOVD R22, 24(R10) // z[i+2]
152 MOVDU R23, 32(R10) // z[i+3]; R10 is updated to point at it
153 ADD $-4, R7 // R7 -= 4 (words still to process)
154 BDNZ loop
155
156 // We may have more elements to read
157 CMP R7, $0
158 BEQ final
159
160 // Process the remaining elements, one at a time
161tail:
162 MOVDU 8(R8), R11 // R11 = x[i]
163 MOVDU 8(R9), R16 // R16 = y[i]
164 ADD $-1, R7 // R7 = z_len - 1
165 SUBE R16, R11, R20 // R20 = x[i] - y[i], borrow chained through CA
166 CMP R7, $0
167 MOVDU R20, 8(R10) // z[i]
168 BEQ final // If R7 = 0, we are done
169
170 MOVDU 8(R8), R11 // second leftover word: R11 = x[i]
171 MOVDU 8(R9), R16 // R16 = y[i]
172 ADD $-1, R7
173 SUBE R16, R11, R20 // R20 = x[i] - y[i], borrow chained through CA
174 CMP R7, $0
175 MOVDU R20, 8(R10) // z[i]
176 BEQ final
177
178 MOVD 8(R8), R11 // third and last possible leftover word
179 MOVD 8(R9), R16
180 SUBE R16, R11, R20 // R20 = x[i] - y[i], borrow chained through CA
181 MOVD R20, 8(R10) // z[i]
182
183final:
184 ADDZE R4 // R4 = 0 + CA
185 XOR $1, R4 // c = CA ^ 1: the returned borrow is the inverse of CA
186
187done:
188 MOVD R4, c+72(FP) // return c
189 RET
190
191// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the z_len-word vector x, stores the result in z
// and returns the carry out of the last word in c. y is added to x[0] with
// ADDC; every later word only absorbs the incoming carry (ADDZE). When z_len
// is 0 nothing is read or written and c is y itself, because R4 still holds y
// when it is stored at the done label.
192TEXT ·addVW(SB), NOSPLIT, $0
193 MOVD z+0(FP), R10 // R10 = z[]
194 MOVD x+24(FP), R8 // R8 = x[]
195 MOVD y+48(FP), R4 // R4 = y = c
196 MOVD z_len+8(FP), R11 // R11 = z_len
197
198 CMP R11, $0 // If z_len is zero, return
199 BEQ done
200
201 // We will process the first iteration out of the loop so we capture
202 // the value of c. In the subsequent iterations, we will rely on the
203 // value of CA set here.
204 MOVD 0(R8), R20 // R20 = x[i]
205 ADD $-1, R11 // R11 = z_len - 1
206 ADDC R20, R4, R6 // R6 = x[i] + c, set CA
207 CMP R11, $0 // If z_len was 1, we are done
208 MOVD R6, 0(R10) // z[i]
209 BEQ final
210
211 // We will read 4 elements per iteration
212 SRDCC $2, R11, R9 // R9 = (z_len - 1)/4; the CC form sets CR0 for the BEQ below
213 DCBT (R8) // data cache touch (prefetch hint) on x
214 MOVD R9, CTR // Set up the loop counter
215 BEQ tail // If R9 = 0, we can't use the loop
216 PCALIGN $16
217
218loop:
219 MOVD 8(R8), R20 // R20 = x[i]
220 MOVD 16(R8), R21 // R21 = x[i+1]
221 MOVD 24(R8), R22 // R22 = x[i+2]
222 MOVDU 32(R8), R23 // R23 = x[i+3]; R8 is updated to point at it
223 ADDZE R20, R24 // R24 = x[i] + CA
224 ADDZE R21, R25 // R25 = x[i+1] + CA
225 ADDZE R22, R26 // R26 = x[i+2] + CA
226 ADDZE R23, R27 // R27 = x[i+3] + CA
227 MOVD R24, 8(R10) // z[i]
228 MOVD R25, 16(R10) // z[i+1]
229 MOVD R26, 24(R10) // z[i+2]
230 MOVDU R27, 32(R10) // z[i+3]; R10 is updated to point at it
231 ADD $-4, R11 // R11 -= 4 (words still to process)
232 BDNZ loop
233
234 // We may have some elements to read
235 CMP R11, $0
236 BEQ final
237
 // Up to three leftover words, handled one at a time.
238tail:
239 MOVDU 8(R8), R20 // R20 = x[i]
240 ADDZE R20, R24 // R24 = x[i] + CA
241 ADD $-1, R11
242 MOVDU R24, 8(R10) // z[i]
243 CMP R11, $0
244 BEQ final
245
246 MOVDU 8(R8), R20 // second leftover word
247 ADDZE R20, R24 // R24 = x[i] + CA
248 ADD $-1, R11
249 MOVDU R24, 8(R10) // z[i]
250 CMP R11, $0
251 BEQ final
252
253 MOVD 8(R8), R20 // third and last possible leftover word
254 ADDZE R20, R24 // R24 = x[i] + CA
255 MOVD R24, 8(R10) // z[i]
256
257final:
258 ADDZE R0, R4 // c = CA
259done:
260 MOVD R4, c+56(FP) // return c
261 RET
262
263// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the z_len-word vector x, stores the result
// in z and returns the borrow out of the last word in c. y is subtracted from
// x[0] with SUBC; every later word only absorbs the incoming borrow by
// subtracting R0 with SUBE. When z_len is 0 nothing is read or written and c
// is y itself, because R4 still holds y when it is stored at the done label.
264TEXT ·subVW(SB), NOSPLIT, $0
265 MOVD z+0(FP), R10 // R10 = z[]
266 MOVD x+24(FP), R8 // R8 = x[]
267 MOVD y+48(FP), R4 // R4 = y = c
268 MOVD z_len+8(FP), R11 // R11 = z_len
269
270 CMP R11, $0 // If z_len is zero, return
271 BEQ done
272
273 // We will process the first iteration out of the loop so we capture
274 // the value of c. In the subsequent iterations, we will rely on the
275 // value of CA set here.
276 MOVD 0(R8), R20 // R20 = x[i]
277 ADD $-1, R11 // R11 = z_len - 1
278 SUBC R4, R20, R6 // R6 = x[i] - c, set CA
279 CMP R11, $0 // If z_len was 1, we are done
280 MOVD R6, 0(R10) // z[i]
281 BEQ final
282
283 // We will read 4 elements per iteration
284 SRDCC $2, R11, R9 // R9 = (z_len - 1)/4; the CC form sets CR0 for the BEQ below
285 DCBT (R8) // data cache touch (prefetch hint) on x
286 MOVD R9, CTR // Set up the loop counter
287 BEQ tail // If R9 = 0, we can't use the loop
288
289 // The loop here is almost the same as the one used in s390x, but
290 // we don't need to capture CA every iteration because we've already
291 // done that above.
292
293 PCALIGN $16
294loop:
295 MOVD 8(R8), R20 // R20 = x[i]
296 MOVD 16(R8), R21 // R21 = x[i+1]
297 MOVD 24(R8), R22 // R22 = x[i+2]
298 MOVDU 32(R8), R23 // R23 = x[i+3]; R8 is updated to point at it
299 SUBE R0, R20 // R20 = x[i] - 0, borrow chained through CA
300 SUBE R0, R21 // R21 = x[i+1] - 0, borrow chained through CA
301 SUBE R0, R22 // R22 = x[i+2] - 0, borrow chained through CA
302 SUBE R0, R23 // R23 = x[i+3] - 0, borrow chained through CA
303 MOVD R20, 8(R10) // z[i]
304 MOVD R21, 16(R10) // z[i+1]
305 MOVD R22, 24(R10) // z[i+2]
306 MOVDU R23, 32(R10) // z[i+3]; R10 is updated to point at it
307 ADD $-4, R11 // R11 -= 4 (words still to process)
308 BDNZ loop
309
310 // We may have some elements to read
311 CMP R11, $0
312 BEQ final
313
 // Up to three leftover words, handled one at a time.
314tail:
315 MOVDU 8(R8), R20 // R20 = x[i]
316 SUBE R0, R20 // absorb the incoming borrow
317 ADD $-1, R11
318 MOVDU R20, 8(R10) // z[i]
319 CMP R11, $0
320 BEQ final
321
322 MOVDU 8(R8), R20 // second leftover word
323 SUBE R0, R20 // absorb the incoming borrow
324 ADD $-1, R11
325 MOVDU R20, 8(R10) // z[i]
326 CMP R11, $0
327 BEQ final
328
329 MOVD 8(R8), R20 // third and last possible leftover word
330 SUBE R0, R20 // absorb the incoming borrow
331 MOVD R20, 8(R10) // z[i]
332
333final:
334 // Capture CA
335 SUBE R4, R4 // R4 = R4 - R4 with the borrow applied: 0 if no borrow, -1 otherwise
336 NEG R4, R4 // c = 0 or 1
337
338done:
339 MOVD R4, c+56(FP) // return c
340 RET
341
342// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x left by s bits into z, working from the
// most significant word down to word 0, and returns in c the bits shifted out
// of the top word, x[len(z)-1]>>(64-s).
//
// Special cases handled here:
//   - s == 0: z is filled by a plain word copy of min(len(x), len(z)) words
//     from x (skipped when x is nil or x and z start at the same address)
//     and c is 0. The copy direction is chosen from the distance between the
//     two base pointers.
//   - len(z) == 0 with s != 0: returns c = 0 without touching memory.
343TEXT ·shlVU(SB), NOSPLIT, $0
344 MOVD z+0(FP), R3 // R3 = z[]
345 MOVD x+24(FP), R6 // R6 = x[]
346 MOVD s+48(FP), R9 // R9 = s
347 MOVD z_len+8(FP), R4 // R4 = len(z)
348 MOVD x_len+32(FP), R7 // R7 = len(x), only used on the zeroshift path
349 CMP R9, $0 // s==0 copy(z,x)
350 BEQ zeroshift
351 CMP R4, $0 // len(z)==0 return
352 BEQ done
353
354 ADD $-1, R4, R5 // len(z)-1
355 SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
356 SLD $3, R5, R7 // R7 = (len(z)-1)*8, byte offset of the last word
357 ADD R6, R7, R15 // save starting address &x[len(z)-1]
358 ADD R3, R7, R16 // save starting address &z[len(z)-1]
359 MOVD (R6)(R7), R14 // R14 = x[len(z)-1]
360 SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7
361 CMP R5, $0 // iterate from i=len(z)-1 to 0
362 BEQ loopexit // Already at end?
363 MOVD 0(R15),R10 // x[i]
364 PCALIGN $16
365shloop:
366 SLD R9, R10, R10 // x[i]<<s
367 MOVDU -8(R15), R14 // R14 = x[i-1]; R15 steps back one word
368 SRD R4, R14, R11 // x[i-1]>>ŝ
369 OR R11, R10, R10 // combine both halves
370 MOVD R10, 0(R16) // z[i]=x[i]<<s | x[i-1]>>ŝ
371 MOVD R14, R10 // reuse x[i-1] for next iteration
372 ADD $-8, R16 // i--
373 CMP R15, R6 // &x[i-1]>&x[0]?
374 BGT shloop
375loopexit:
376 MOVD 0(R6), R4 // R4 = x[0]
377 SLD R9, R4, R4 // x[0]<<s
378 MOVD R4, 0(R3) // z[0]=x[0]<<s
379 MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c
380 RET
381
382zeroshift:
383 CMP R6, $0 // x is null, nothing to copy
384 BEQ done
385 CMP R6, R3 // if x is same as z, nothing to copy
386 BEQ done
387 CMP R7, R4
388 ISEL $0, R7, R4, R7 // Take the lower bound of lengths of x,z
389 SLD $3, R7, R7 // R7 = number of bytes to copy
390 SUB R6, R3, R11 // dest - src
391 CMPU R11, R7, CR2 // unsigned (dest - src) < byte length?
392 BLT CR2, backward // there is overlap, copy backwards
393 MOVD $0, R14 // R14 = byte offset, counting up
394 // shlVU processes backwards, but added a forward copy option
395 // since its faster on POWER
 // NOTE(review): the loop below copies before it tests, so with a minimum
 // length of 0 and distinct non-nil x and z it would still move one word.
 // Presumably callers never pass that combination - confirm.
396repeat:
397 MOVD (R6)(R14), R15 // Copy 8 bytes at a time
398 MOVD R15, (R3)(R14)
399 ADD $8, R14
400 CMP R14, R7 // More 8 bytes left?
401 BLT repeat
402 BR done
403backward:
404 ADD $-8,R7, R14 // R14 = byte offset of the last word to copy
405repeatback:
406 MOVD (R6)(R14), R15 // copy x into z backwards
407 MOVD R15, (R3)(R14) // copy 8 bytes at a time
408 SUB $8, R14
409 CMP R14, $-8 // More 8 bytes left?
410 BGT repeatback
411
412done:
413 MOVD R0, c+56(FP) // c=0
414 RET
415
416// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x right by s bits into z, working from word 0
// upwards, and returns in c the bits shifted out of the bottom word,
// x[0]<<(64-s).
//
// For len(z) >= 3 the middle words are produced two at a time with vector
// instructions; a scalar step handles a single leftover pair and the top word
// z[len(z)-1] = x[len(z)-1]>>s is always written at loopexit. In this code a
// VS(32+n) operand and the matching Vn operand are used for the same value
// (VS32/V0, VS33/V1, VS37/V5, VS38/V6, VS39/V7).
//
// Special cases handled here:
//   - s == 0: z is filled by a forward word copy of min(len(x), len(z)) words
//     from x (skipped when x is nil or x and z start at the same address)
//     and c is 0.
//   - len(z) == 0 with s != 0: returns c = 0 without touching memory.
417TEXT ·shrVU(SB), NOSPLIT, $0
418 MOVD z+0(FP), R3 // R3 = z[]
419 MOVD x+24(FP), R6 // R6 = x[]
420 MOVD s+48(FP), R9 // R9 = s
421 MOVD z_len+8(FP), R4 // R4 = len(z)
422 MOVD x_len+32(FP), R7 // R7 = len(x), only used on the zeroshift path
423
424 CMP R9, $0 // s==0, copy(z,x)
425 BEQ zeroshift
426 CMP R4, $0 // len(z)==0 return
427 BEQ done
428 SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
429
430 MOVD 0(R6), R7 // R7 = x[0]
431 SLD R5, R7, R7 // compute x[0]<<ŝ
432 MOVD $1, R8 // iterate from i=1 to i<len(z)
433 CMP R8, R4
434 BGE loopexit // Already at end?
435
436 // vectorize if len(z) is >=3, else jump to scalar loop
437 CMP R4, $3
438 BLT scalar
439 MTVSRD R9, VS38 // s
440 VSPLTB $7, V6, V4 // V4 = shift count s replicated from byte 7 of V6
441 MTVSRD R5, VS39 // ŝ
442 VSPLTB $7, V7, V2 // V2 = shift count ŝ replicated from byte 7 of V7
443 ADD $-2, R4, R16 // R16 = len(z)-2, last i for which a full pair fits
444 PCALIGN $16
445loopback:
446 ADD $-1, R8, R10 // R10 = i-1
447 SLD $3, R10 // R10 = byte offset of x[i-1]
448 LXVD2X (R6)(R10), VS32 // load x[i-1], x[i]
449 SLD $3, R8, R12 // R12 = byte offset of x[i]
450 LXVD2X (R6)(R12), VS33 // load x[i], x[i+1]
451
452 VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s
453 VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ
454 VOR V3, V5, V5 // Or(|) the two registers together
455 STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
456 ADD $2, R8 // Done processing 2 entries, i and i+1
457 CMP R8, R16 // Are there at least a couple of more entries left?
458 BLE loopback
459 CMP R8, R4 // Are we at the last element?
460 BEQ loopexit
 // Reached with i = len(z)-1: either len(z) == 2, or one pair is left over
 // after the vector loop.
461scalar:
462 ADD $-1, R8, R10 // R10 = i-1
463 SLD $3, R10 // R10 = byte offset of x[i-1]
464 MOVD (R6)(R10),R11 // R11 = x[len(z)-2]
465 SRD R9, R11, R11 // x[len(z)-2] >> s
466 SLD $3, R8, R12 // R12 = byte offset of x[i]
467 MOVD (R6)(R12), R12 // R12 = x[len(z)-1]
468 SLD R5, R12, R12 // x[len(z)-1]<<ŝ
469 OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
470 MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
471loopexit:
472 ADD $-1, R4 // R4 = len(z)-1
473 SLD $3, R4 // R4 = byte offset of the last word
474 MOVD (R6)(R4), R5 // R5 = x[len(z)-1]
475 SRD R9, R5, R5 // x[len(z)-1]>>s
476 MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s
477 MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c
478 RET
479
480zeroshift:
481 CMP R6, $0 // x is null, nothing to copy
482 BEQ done
483 CMP R6, R3 // if x is same as z, nothing to copy
484 BEQ done
485 CMP R7, R4
486 ISEL $0, R7, R4, R7 // Take the lower bounds of lengths of x, z
487 SLD $3, R7, R7 // R7 = number of bytes to copy
488 MOVD $0, R14 // R14 = byte offset, counting up
 // NOTE(review): the loop below copies before it tests, so with a minimum
 // length of 0 and distinct non-nil x and z it would still move one word.
 // Presumably callers never pass that combination - confirm.
489repeat:
490 MOVD (R6)(R14), R15 // copy 8 bytes at a time
491 MOVD R15, (R3)(R14) // shrVU processes bytes only forwards
492 ADD $8, R14
493 CMP R14, R7 // More 8 bytes left?
494 BLT repeat
495done:
496 MOVD R0, c+56(FP) // c=0
497 RET
498
499// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Computes z = x*y + r over z_len words: for each word the low half of
// x[i]*y plus the running carry word is stored in z[i], and the high half
// plus the carry bit of that addition becomes the next carry word. The carry
// word starts as r and its final value is returned in c. When z_len is 0
// nothing is read or written and c is r.
//
// The first word is handled on its own, then an unrolled loop processes four
// words per iteration with a single ADDC/ADDE/ADDE/ADDE/ADDZE carry chain,
// and a straight-line tail handles up to three leftover words. The multiply
// instructions interleaved with that chain are relied upon to leave CA
// unchanged.
500TEXT ·mulAddVWW(SB), NOSPLIT, $0
501 MOVD z+0(FP), R10 // R10 = z[]
502 MOVD x+24(FP), R8 // R8 = x[]
503 MOVD y+48(FP), R9 // R9 = y
504 MOVD r+56(FP), R4 // R4 = r = c
505 MOVD z_len+8(FP), R11 // R11 = z_len
506
507 CMP R11, $0 // If z_len is zero, return c = r
508 BEQ done
509
510 MOVD 0(R8), R20 // R20 = x[i]
511 ADD $-1, R11 // R11 = z_len - 1
512 MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
513 MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
514 ADDC R4, R6 // R6 = z0 + r
515 ADDZE R7, R4 // R4 = z1 + CA
516 CMP R11, $0
517 MOVD R6, 0(R10) // z[i]
518 BEQ done // If z_len was 1, we are done
519
520 // We will read 4 elements per iteration
521 SRDCC $2, R11, R14 // R14 = (z_len - 1)/4; the CC form sets CR0 for the BEQ below
522 DCBT (R8) // data cache touch (prefetch hint) on x
523 MOVD R14, CTR // Set up the loop counter
524 BEQ tail // If R14 = 0, we can't use the loop
525 PCALIGN $16
526
527loop:
528 MOVD 8(R8), R20 // R20 = x[i]
529 MOVD 16(R8), R21 // R21 = x[i+1]
530 MOVD 24(R8), R22 // R22 = x[i+2]
531 MOVDU 32(R8), R23 // R23 = x[i+3]; R8 is updated to point at it
532 MULLD R9, R20, R24 // R24 = z0[i]
533 MULHDU R9, R20, R20 // R20 = z1[i]
534 ADDC R4, R24 // R24 = z0[i] + c
535 MULLD R9, R21, R25 // R25 = z0[i+1]
536 MULHDU R9, R21, R21 // R21 = z1[i+1]
537 ADDE R20, R25 // R25 = z0[i+1] + z1[i] + CA
538 MULLD R9, R22, R26 // R26 = z0[i+2]
539 MULHDU R9, R22, R22 // R22 = z1[i+2]
540 MULLD R9, R23, R27 // R27 = z0[i+3]
541 MULHDU R9, R23, R23 // R23 = z1[i+3]
542 ADDE R21, R26 // R26 = z0[i+2] + z1[i+1] + CA
543 MOVD R24, 8(R10) // z[i]
544 MOVD R25, 16(R10) // z[i+1]
545 ADDE R22, R27 // R27 = z0[i+3] + z1[i+2] + CA
546 ADDZE R23,R4 // update carry
547 MOVD R26, 24(R10) // z[i+2]
548 MOVDU R27, 32(R10) // z[i+3]
549 ADD $-4, R11 // R11 = z_len - 4
550 BDNZ loop
551
552 // We may have some elements to read
553 CMP R11, $0
554 BEQ done
555
556 // Process the remaining elements, one at a time
557tail:
558 MOVDU 8(R8), R20 // R20 = x[i]
559 MULLD R9, R20, R24 // R24 = z0[i]
560 MULHDU R9, R20, R25 // R25 = z1[i]
561 ADD $-1, R11 // R11 = z_len - 1
562 ADDC R4, R24 // R24 = z0[i] + c, set CA
563 ADDZE R25, R4 // c = z1[i] + CA
564 MOVDU R24, 8(R10) // z[i]
565 CMP R11, $0
566 BEQ done // If R11 = 0, we are done
567
568 MOVDU 8(R8), R20 // second leftover word: R20 = x[i]
569 MULLD R9, R20, R24 // R24 = z0[i]
570 MULHDU R9, R20, R25 // R25 = z1[i]
571 ADD $-1, R11
572 ADDC R4, R24 // R24 = z0[i] + c, set CA
573 ADDZE R25, R4 // c = z1[i] + CA
574 MOVDU R24, 8(R10) // z[i]
575 CMP R11, $0
576 BEQ done
577
578 MOVD 8(R8), R20 // third and last possible leftover word
579 MULLD R9, R20, R24 // R24 = z0[i]
580 MULHDU R9, R20, R25 // R25 = z1[i]
581 ADD $-1, R11 // R11 is not read again after this point
582 ADDC R4, R24 // R24 = z0[i] + c, set CA
583 ADDZE R25,R4 // c = z1[i] + CA
584 MOVD R24, 8(R10) // z[i]
585
586done:
587 MOVD R4, c+64(FP) // return c
588 RET
589
590// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Computes z += x*y over z_len words: for each word, the low half of x[i]*y
// is added to z[i] and to the running carry word, the sum is stored back in
// z[i], and the high half plus the carry bits of those two additions becomes
// the next carry word. The carry word starts at 0 and its final value is
// returned in c.
//
// An unrolled loop handles four words per iteration while at least four
// remain; tailloop then handles the remaining z_len mod 4 words one at a
// time (or all of them when z_len < 4).
591TEXT ·addMulVVW(SB), NOSPLIT, $0
592 MOVD z+0(FP), R3 // R3 = z[]
593 MOVD x+24(FP), R4 // R4 = x[]
594 MOVD y+48(FP), R5 // R5 = y
595 MOVD z_len+8(FP), R6 // R6 = z_len
596
597 CMP R6, $4
598 MOVD R0, R9 // R9 = c = 0
599 BLT tail // fewer than 4 words: only the one-at-a-time loop runs
600 SRD $2, R6, R7 // R7 = z_len/4
601 MOVD R7, CTR // Initialize loop counter
602 PCALIGN $16
603
604loop:
605 MOVD 0(R4), R14 // x[i]
606 MOVD 8(R4), R16 // x[i+1]
607 MOVD 16(R4), R18 // x[i+2]
608 MOVD 24(R4), R20 // x[i+3]
609 MOVD 0(R3), R15 // z[i]
610 MOVD 8(R3), R17 // z[i+1]
611 MOVD 16(R3), R19 // z[i+2]
612 MOVD 24(R3), R21 // z[i+3]
613 MULLD R5, R14, R10 // low x[i]*y
614 MULHDU R5, R14, R11 // high x[i]*y
615 ADDC R15, R10 // low += z[i], set CA
616 ADDZE R11 // high += CA
617 ADDC R9, R10 // low += c, set CA
618 ADDZE R11, R9 // c = high + CA
619 MULLD R5, R16, R14 // low x[i+1]*y
620 MULHDU R5, R16, R15 // high x[i+1]*y
621 ADDC R17, R14 // low += z[i+1], set CA
622 ADDZE R15 // high += CA
623 ADDC R9, R14 // low += c, set CA
624 ADDZE R15, R9 // c = high + CA
625 MULLD R5, R18, R16 // low x[i+2]*y
626 MULHDU R5, R18, R17 // high x[i+2]*y
627 ADDC R19, R16 // low += z[i+2], set CA
628 ADDZE R17 // high += CA
629 ADDC R9, R16 // low += c, set CA
630 ADDZE R17, R9 // c = high + CA
631 MULLD R5, R20, R18 // low x[i+3]*y
632 MULHDU R5, R20, R19 // high x[i+3]*y
633 ADDC R21, R18 // low += z[i+3], set CA
634 ADDZE R19 // high += CA
635 ADDC R9, R18 // low += c, set CA
636 ADDZE R19, R9 // c = high + CA
637 MOVD R10, 0(R3) // z[i]
638 MOVD R14, 8(R3) // z[i+1]
639 MOVD R16, 16(R3) // z[i+2]
640 MOVD R18, 24(R3) // z[i+3]
641 ADD $32, R3 // advance z by 4 words
642 ADD $32, R4 // advance x by 4 words
643 BDNZ loop
644
645 ANDCC $3, R6 // R6 = z_len mod 4, the words left for tailloop
646tail:
647 CMP R6, $0
648 BEQ done // nothing left
649 MOVD R6, CTR // one tailloop iteration per remaining word
650 PCALIGN $16
651tailloop:
652 MOVD 0(R4), R14 // x[i]
653 MOVD 0(R3), R15 // z[i]
654 MULLD R5, R14, R10 // low x[i]*y
655 MULHDU R5, R14, R11 // high x[i]*y
656 ADDC R15, R10 // low += z[i], set CA
657 ADDZE R11 // high += CA
658 ADDC R9, R10 // low += c, set CA
659 ADDZE R11, R9 // c = high + CA
660 MOVD R10, 0(R3) // z[i]
661 ADD $8, R3 // advance z by 1 word
662 ADD $8, R4 // advance x by 1 word
663 BDNZ tailloop
664
665done:
666 MOVD R9, c+56(FP) // return c
667 RET
668
View as plain text