1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !purego
6
7// This file contains constant-time, 64-bit assembly implementation of
8// P256. The optimizations performed here are described in detail in:
9// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
10// 256-bit primes"
11// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
12// https://eprint.iacr.org/2013/816.pdf
13
14#include "textflag.h"
15
// Pointer registers used by the Go-visible entry points.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7: accumulator limbs written by the multiply, square and
// subtract routines.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch registers.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1 hold p256const0/p256const1 in the field routines and the two
// low limbs of p256ord in the p256Ord* routines.
#define const0 R15
#define const1 R16

// hlp1 is the same register as res_ptr (R0): code that uses hlp1 has to
// reload the result pointer from the frame before storing through it.
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two four-limb operands of the internal routines.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 share registers with t2/t3; they hold the two high limbs of
// p256ord in the p256Ord* routines, which therefore cannot use t2/t3 as
// scratch.
#define const2 t2
#define const3 t3
50
// Limbs 1 and 3 of the field prime p. Limb 0 (all ones) and limb 2 (zero)
// are supplied as immediates / ZR by the code that needs them.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8

// Per-limb multiplier used by the reduction steps of p256OrdMul/p256OrdSqr.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8

// Modulus used by p256OrdMul/p256OrdSqr, four little-endian 64-bit limbs.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32

// 2^256 mod p, i.e. the value 1 in Montgomery form. Used by
// p256PointAddAffineAsm when the z coordinate has to be set to one.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32
67
68/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Tail-jumps to p256BigToLittle: the same 32-byte reversal serves both
// conversion directions, and the argument layout (res, in) is identical.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
 JMP ·p256BigToLittle(SB)
72/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Tail-jumps to p256BigToLittle, which performs the 32-byte reversal; the
// argument layout (res, in) is identical.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
 JMP ·p256BigToLittle(SB)
76/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
// Tail-jumps to p256BigToLittle: reversing all 32 bytes is its own inverse,
// so one routine covers both directions.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
 JMP ·p256BigToLittle(SB)
80/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
// Reverses the byte order of the 32-byte input: each 64-bit word is
// byte-swapped and the four words are written back in the opposite order.
// All of the input is read before anything is written.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
 MOVD in+8(FP), a_ptr
 MOVD res+0(FP), res_ptr

 LDP 0*16(a_ptr), (acc0, acc1)
 LDP 1*16(a_ptr), (acc2, acc3)

 // Input words 3 and 2 become output words 0 and 1.
 REV acc3, acc3
 REV acc2, acc2
 STP (acc3, acc2), 0*16(res_ptr)

 // Input words 1 and 0 become output words 2 and 3.
 REV acc1, acc1
 REV acc0, acc0
 STP (acc1, acc0), 1*16(res_ptr)
 RET
97/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// Copies a 96-byte point into res without branching on cond:
// res = a when cond != 0, res = b when cond == 0.
// The copy is done in two 48-byte halves; each half is fully loaded from
// both sources before it is stored.
TEXT ·p256MovCond(SB),NOSPLIT,$0
 MOVD cond+24(FP), R3
 MOVD b+16(FP), b_ptr
 MOVD a+8(FP), a_ptr
 MOVD res+0(FP), res_ptr

 // Flags set here stay live for every CSEL below (LDP/STP leave them alone).
 CMP $0, R3
 // Two remarks:
 // 1) Will want to revisit NEON, when support is better
 // 2) CSEL might not be constant time on all ARM processors

 // Bytes 0..47
 LDP 0*16(a_ptr), (R4, R5)
 LDP 1*16(a_ptr), (R6, R7)
 LDP 2*16(a_ptr), (R8, R9)
 LDP 0*16(b_ptr), (R16, R17)
 LDP 1*16(b_ptr), (R19, R20)
 LDP 2*16(b_ptr), (R21, R22)
 CSEL NE, R4, R16, R4
 CSEL NE, R5, R17, R5
 CSEL NE, R6, R19, R6
 CSEL NE, R7, R20, R7
 CSEL NE, R8, R21, R8
 CSEL NE, R9, R22, R9
 STP (R4, R5), 0*16(res_ptr)
 STP (R6, R7), 1*16(res_ptr)
 STP (R8, R9), 2*16(res_ptr)

 // Bytes 48..95
 LDP 3*16(a_ptr), (R4, R5)
 LDP 4*16(a_ptr), (R6, R7)
 LDP 5*16(a_ptr), (R8, R9)
 LDP 3*16(b_ptr), (R16, R17)
 LDP 4*16(b_ptr), (R19, R20)
 LDP 5*16(b_ptr), (R21, R22)
 CSEL NE, R4, R16, R4
 CSEL NE, R5, R17, R5
 CSEL NE, R6, R19, R6
 CSEL NE, R7, R20, R7
 CSEL NE, R8, R21, R8
 CSEL NE, R9, R22, R9
 STP (R4, R5), 3*16(res_ptr)
 STP (R6, R7), 4*16(res_ptr)
 STP (R8, R9), 5*16(res_ptr)

 RET
143/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
// In place: val = p - val when cond != 0, val unchanged when cond == 0.
// The subtraction is always performed; cond only selects which value is
// written back.
TEXT ·p256NegCond(SB),NOSPLIT,$0
 MOVD val+0(FP), a_ptr
 MOVD cond+8(FP), hlp0
 // Original value
 LDP 0*16(a_ptr), (t0, t1)
 LDP 1*16(a_ptr), (t2, t3)
 // Non-zero limbs of p; the zero limb is supplied by ZR below
 MOVD $-1, acc0
 MOVD p256const0<>(SB), acc1
 MOVD p256const1<>(SB), acc3
 // acc = p - val
 SUBS t0, acc0, acc0
 SBCS t1, acc1, acc1
 SBCS t2, ZR, acc2
 SBC t3, acc3, acc3
 // Take the negated value only when cond is non-zero
 CMP $0, hlp0
 CSEL NE, acc0, t0, acc0
 CSEL NE, acc1, t1, acc1
 CSEL NE, acc2, t2, acc2
 CSEL NE, acc3, t3, acc3
 // Write back over the input
 STP (acc0, acc1), 0*16(a_ptr)
 STP (acc2, acc3), 1*16(a_ptr)

 RET
173/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
// Squares in n times in a row via p256SqrInternal and stores the result in
// res. The counter is decremented before it is tested, so n is expected to
// be at least 1.
TEXT ·p256Sqr(SB),NOSPLIT,$0
 MOVD n+16(FP), b_ptr
 MOVD in+8(FP), a_ptr
 MOVD res+0(FP), res_ptr

 LDP 0*16(a_ptr), (x0, x1)
 LDP 1*16(a_ptr), (x2, x3)

 // Constants required by p256SqrInternal
 MOVD p256const0<>(SB), const0
 MOVD p256const1<>(SB), const1

squareAgain:
 CALL p256SqrInternal<>(SB)
 // Feed the result back in as the next operand
 MOVD y0, x0
 MOVD y1, x1
 MOVD y2, x2
 MOVD y3, x3
 SUB $1, b_ptr, b_ptr
 CBNZ b_ptr, squareAgain

 // x0..x3 now equal y0..y3, the final square
 STP (x0, x1), 0*16(res_ptr)
 STP (x2, x3), 1*16(res_ptr)
 RET
198/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
// res = p256MulInternal(in1, in2): loads both operands into x0..x3 and
// y0..y3, calls the internal multiply and stores y0..y3.
TEXT ·p256Mul(SB),NOSPLIT,$0
 MOVD in2+16(FP), b_ptr
 MOVD in1+8(FP), a_ptr
 MOVD res+0(FP), res_ptr

 // Second operand
 LDP 0*16(b_ptr), (y0, y1)
 LDP 1*16(b_ptr), (y2, y3)

 // First operand
 LDP 0*16(a_ptr), (x0, x1)
 LDP 1*16(a_ptr), (x2, x3)

 // Constants required by p256MulInternal
 MOVD p256const1<>(SB), const1
 MOVD p256const0<>(SB), const0

 CALL p256MulInternal<>(SB)

 STP (y0, y1), 0*16(res_ptr)
 STP (y2, y3), 1*16(res_ptr)
 RET
219/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
// Converts in out of Montgomery form: four reduction steps (one per input
// limb) followed by one conditional subtraction of p. Uses the same
// reduction sequence as p256MulInternal, without any operand multiplication.
TEXT ·p256FromMont(SB),NOSPLIT,$0
 MOVD res+0(FP), res_ptr
 MOVD in+8(FP), a_ptr

 MOVD p256const0<>(SB), const0
 MOVD p256const1<>(SB), const1

 LDP 0*16(a_ptr), (acc0, acc1)
 LDP 1*16(a_ptr), (acc2, acc3)
 // Only reduce, no multiplications are needed
 // First reduction step: eliminates acc0; the register is reused for the
 // new top limb.
 ADDS acc0<<32, acc1, acc1
 LSR $32, acc0, t0
 MUL acc0, const1, t1
 UMULH acc0, const1, acc0
 ADCS t0, acc2
 ADCS t1, acc3
 ADC $0, acc0
 // Second reduction step: eliminates acc1
 ADDS acc1<<32, acc2, acc2
 LSR $32, acc1, t0
 MUL acc1, const1, t1
 UMULH acc1, const1, acc1
 ADCS t0, acc3
 ADCS t1, acc0
 ADC $0, acc1
 // Third reduction step: eliminates acc2
 ADDS acc2<<32, acc3, acc3
 LSR $32, acc2, t0
 MUL acc2, const1, t1
 UMULH acc2, const1, acc2
 ADCS t0, acc0
 ADCS t1, acc1
 ADC $0, acc2
 // Last reduction step: eliminates acc3; result is back in acc0..acc3 order
 ADDS acc3<<32, acc0, acc0
 LSR $32, acc3, t0
 MUL acc3, const1, t1
 UMULH acc3, const1, acc3
 ADCS t0, acc1
 ADCS t1, acc2
 ADC $0, acc3

 // t = acc - p
 SUBS $-1, acc0, t0
 SBCS const0, acc1, t1
 SBCS $0, acc2, t2
 SBCS const1, acc3, t3

 // Carry set means no borrow (acc >= p): keep the subtracted value
 CSEL CS, t0, acc0, acc0
 CSEL CS, t1, acc1, acc1
 CSEL CS, t2, acc2, acc2
 CSEL CS, t3, acc3, acc3

 STP (acc0, acc1), 0*16(res_ptr)
 STP (acc2, acc3), 1*16(res_ptr)

 RET
278/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
// Copies the idx-th (1-based) 96-byte entry of table into res. All 16
// entries are read on every call and the wanted one is picked with CSEL, so
// the memory access pattern does not depend on idx. An idx that matches no
// entry (e.g. 0) yields an all-zero result.
TEXT ·p256Select(SB),NOSPLIT,$0
 MOVD res+0(FP), res_ptr
 MOVD table+8(FP), b_ptr
 MOVD idx+16(FP), const0

 // Entry counter, then the twelve result limbs
 MOVD $0, const1
 MOVD $0, x0
 MOVD $0, x1
 MOVD $0, x2
 MOVD $0, x3
 MOVD $0, y0
 MOVD $0, y1
 MOVD $0, y2
 MOVD $0, y3
 MOVD $0, t0
 MOVD $0, t1
 MOVD $0, t2
 MOVD $0, t3

selectNext:
 // First 64 bytes of the entry
 LDP.P 16(b_ptr), (acc0, acc1)
 LDP.P 16(b_ptr), (acc2, acc3)
 LDP.P 16(b_ptr), (acc4, acc5)
 LDP.P 16(b_ptr), (acc6, acc7)

 // EQ holds for the rest of this iteration iff this is entry idx
 ADD $1, const1, const1
 CMP const0, const1

 CSEL NE, x0, acc0, x0
 CSEL NE, x1, acc1, x1
 CSEL NE, x2, acc2, x2
 CSEL NE, x3, acc3, x3
 CSEL NE, y0, acc4, y0
 CSEL NE, y1, acc5, y1
 CSEL NE, y2, acc6, y2
 CSEL NE, y3, acc7, y3

 // Last 32 bytes of the entry
 LDP.P 16(b_ptr), (acc0, acc1)
 LDP.P 16(b_ptr), (acc2, acc3)
 CSEL NE, t0, acc0, t0
 CSEL NE, t1, acc1, t1
 CSEL NE, t2, acc2, t2
 CSEL NE, t3, acc3, t3

 CMP $16, const1
 BNE selectNext

 STP (x0, x1), 0*16(res_ptr)
 STP (x2, x3), 1*16(res_ptr)
 STP (y0, y1), 2*16(res_ptr)
 STP (y2, y3), 3*16(res_ptr)
 STP (t0, t1), 4*16(res_ptr)
 STP (t2, t3), 5*16(res_ptr)
 RET
332/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Copies the idx-th (1-based) 64-byte entry of table into res. All 32
// entries are read on every call and the wanted one is picked with CSEL, so
// the memory access pattern does not depend on idx. An idx that matches no
// entry (e.g. 0) yields an all-zero result.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
 MOVD res+0(FP), res_ptr
 MOVD table+8(FP), t1
 MOVD idx+16(FP), t0

 // Entry counter, then the eight result limbs
 MOVD $0, t2
 MOVD $0, x0
 MOVD $0, x1
 MOVD $0, x2
 MOVD $0, x3
 MOVD $0, y0
 MOVD $0, y1
 MOVD $0, y2
 MOVD $0, y3

affineNext:
 // Read the whole entry
 LDP.P 16(t1), (acc0, acc1)
 LDP.P 16(t1), (acc2, acc3)
 LDP.P 16(t1), (acc4, acc5)
 LDP.P 16(t1), (acc6, acc7)

 // EQ holds for the rest of this iteration iff this is entry idx
 ADD $1, t2, t2
 CMP t0, t2

 CSEL NE, x0, acc0, x0
 CSEL NE, x1, acc1, x1
 CSEL NE, x2, acc2, x2
 CSEL NE, x3, acc3, x3
 CSEL NE, y0, acc4, y0
 CSEL NE, y1, acc5, y1
 CSEL NE, y2, acc6, y2
 CSEL NE, y3, acc7, y3

 CMP $32, t2
 BNE affineNext

 STP (x0, x1), 0*16(res_ptr)
 STP (x2, x3), 1*16(res_ptr)
 STP (y0, y1), 2*16(res_ptr)
 STP (y2, y3), 3*16(res_ptr)
 RET
374/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
// Squares in n times in a row modulo p256ord (Montgomery arithmetic with
// p256ordK0 as the per-limb multiplier) and stores the result in res.
// The counter is decremented before it is tested, so n is expected to be at
// least 1.
//
// Register notes: hlp1 shares R0 with res_ptr, so res is only loaded after
// the loop. const2/const3 share R13/R14 with t2/t3, so only t0/t1 are used
// as scratch here; y0 doubles as a third temporary inside the reduction.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
 MOVD in+8(FP), a_ptr
 MOVD n+16(FP), b_ptr

 MOVD p256ordK0<>(SB), hlp1
 LDP p256ord<>+0x00(SB), (const0, const1)
 LDP p256ord<>+0x10(SB), (const2, const3)

 LDP 0*16(a_ptr), (x0, x1)
 LDP 1*16(a_ptr), (x2, x3)

ordSqrLoop:
 SUB $1, b_ptr

 // Off-diagonal products, accumulated into acc1..acc6.
 // x[1:] * x[0]
 MUL x0, x1, acc1
 UMULH x0, x1, acc2

 MUL x0, x2, t0
 ADDS t0, acc2, acc2
 UMULH x0, x2, acc3

 MUL x0, x3, t0
 ADCS t0, acc3, acc3
 UMULH x0, x3, acc4
 ADC $0, acc4, acc4
 // x[2:] * x[1]
 MUL x1, x2, t0
 ADDS t0, acc3
 UMULH x1, x2, t1
 ADCS t1, acc4
 ADC $0, ZR, acc5

 MUL x1, x3, t0
 ADDS t0, acc4
 UMULH x1, x3, t1
 ADC t1, acc5
 // x[3] * x[2]
 MUL x2, x3, t0
 ADDS t0, acc5
 UMULH x2, x3, acc6
 ADC $0, acc6

 MOVD $0, acc7
 // *2: every off-diagonal product occurs twice in the square
 ADDS acc1, acc1
 ADCS acc2, acc2
 ADCS acc3, acc3
 ADCS acc4, acc4
 ADCS acc5, acc5
 ADCS acc6, acc6
 ADC $0, acc7
 // Missing products: the diagonal terms x[i]*x[i]
 MUL x0, x0, acc0
 UMULH x0, x0, t0
 ADDS t0, acc1, acc1

 MUL x1, x1, t0
 ADCS t0, acc2, acc2
 UMULH x1, x1, t1
 ADCS t1, acc3, acc3

 MUL x2, x2, t0
 ADCS t0, acc4, acc4
 UMULH x2, x2, t1
 ADCS t1, acc5, acc5

 MUL x3, x3, t0
 ADCS t0, acc6, acc6
 UMULH x3, x3, t1
 ADC t1, acc7, acc7
 // First reduction step: hlp0 = acc0 * K0 is the multiple of the modulus
 // that gets added.
 MUL acc0, hlp1, hlp0

 // NOTE(review): t0 is const0*K0 rather than const0*hlp0. The low limb
 // produced by the ADDS is never read again (acc0 is overwritten below), so
 // only the carry matters here; this appears to rely on both forms giving
 // the same carry - confirm before changing.
 MUL const0, hlp1, t0
 ADDS t0, acc0, acc0
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc1, acc1
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc2, acc2
 UMULH const2, hlp0, acc0

 MUL const3, hlp0, t0
 ADCS t0, acc3, acc3

 UMULH const3, hlp0, hlp0
 ADC $0, hlp0

 // Add the high halves one limb up; acc0 becomes the new top limb
 ADDS t1, acc1, acc1
 ADCS y0, acc2, acc2
 ADCS acc0, acc3, acc3
 ADC $0, hlp0, acc0
 // Second reduction step (same pattern, limbs rotated by one)
 MUL acc1, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc1, acc1
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc2, acc2
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc3, acc3
 UMULH const2, hlp0, acc1

 MUL const3, hlp0, t0
 ADCS t0, acc0, acc0

 UMULH const3, hlp0, hlp0
 ADC $0, hlp0

 ADDS t1, acc2, acc2
 ADCS y0, acc3, acc3
 ADCS acc1, acc0, acc0
 ADC $0, hlp0, acc1
 // Third reduction step
 MUL acc2, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc2, acc2
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc3, acc3
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc0, acc0
 UMULH const2, hlp0, acc2

 MUL const3, hlp0, t0
 ADCS t0, acc1, acc1

 UMULH const3, hlp0, hlp0
 ADC $0, hlp0

 ADDS t1, acc3, acc3
 ADCS y0, acc0, acc0
 ADCS acc2, acc1, acc1
 ADC $0, hlp0, acc2

 // Last reduction step
 MUL acc3, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc3, acc3
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc0, acc0
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc1, acc1
 UMULH const2, hlp0, acc3

 MUL const3, hlp0, t0
 ADCS t0, acc2, acc2

 UMULH const3, hlp0, hlp0
 // Unlike the earlier steps the carry is folded into acc7, which is added
 // onto acc3 below.
 ADC $0, acc7

 ADDS t1, acc0, acc0
 ADCS y0, acc1, acc1
 ADCS acc3, acc2, acc2
 ADC $0, hlp0, acc3

 // Add the upper half of the square (acc4..acc7); acc4 receives the carry
 ADDS acc4, acc0, acc0
 ADCS acc5, acc1, acc1
 ADCS acc6, acc2, acc2
 ADCS acc7, acc3, acc3
 ADC $0, ZR, acc4

 // y = acc - modulus, with the borrow propagated through the carry limb
 SUBS const0, acc0, y0
 SBCS const1, acc1, y1
 SBCS const2, acc2, y2
 SBCS const3, acc3, y3
 SBCS $0, acc4, acc4

 // Carry set means no borrow: take the subtracted value. The result goes
 // straight into x0..x3 as the operand of the next iteration.
 CSEL CS, y0, acc0, x0
 CSEL CS, y1, acc1, x1
 CSEL CS, y2, acc2, x2
 CSEL CS, y3, acc3, x3

 CBNZ b_ptr, ordSqrLoop

 // R0 held K0 (hlp1) until now; load the result pointer
 MOVD res+0(FP), res_ptr
 STP (x0, x1), 0*16(res_ptr)
 STP (x2, x3), 1*16(res_ptr)

 RET
573/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
// res = Montgomery product of in1 and in2 modulo p256ord, with p256ordK0 as
// the per-limb multiplier. Multiplication by each limb of in2 is interleaved
// with one reduction step.
//
// Register notes: hlp1 shares R0 with res_ptr, so res is only loaded at the
// end. const2/const3 share R13/R14 with t2/t3, so the final subtraction
// overwrites them. y0 and y1 are reused as temporaries once consumed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
 MOVD in1+8(FP), a_ptr
 MOVD in2+16(FP), b_ptr

 MOVD p256ordK0<>(SB), hlp1
 LDP p256ord<>+0x00(SB), (const0, const1)
 LDP p256ord<>+0x10(SB), (const2, const3)

 LDP 0*16(a_ptr), (x0, x1)
 LDP 1*16(a_ptr), (x2, x3)
 LDP 0*16(b_ptr), (y0, y1)
 LDP 1*16(b_ptr), (y2, y3)

 // y[0] * x
 MUL y0, x0, acc0
 UMULH y0, x0, acc1

 MUL y0, x1, t0
 ADDS t0, acc1
 UMULH y0, x1, acc2

 MUL y0, x2, t0
 ADCS t0, acc2
 UMULH y0, x2, acc3

 MUL y0, x3, t0
 ADCS t0, acc3
 UMULH y0, x3, acc4
 ADC $0, acc4
 // First reduction step: hlp0 = acc0 * K0 is the multiple of the modulus
 // that gets added. y0 is free from here on and is used as a temporary.
 MUL acc0, hlp1, hlp0

 // NOTE(review): t0 is const0*K0 rather than const0*hlp0. The low limb
 // produced by the ADDS is never read again (acc0 is overwritten below), so
 // only the carry matters here; this appears to rely on both forms giving
 // the same carry - confirm before changing.
 MUL const0, hlp1, t0
 ADDS t0, acc0, acc0
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc1, acc1
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc2, acc2
 UMULH const2, hlp0, acc0

 MUL const3, hlp0, t0
 ADCS t0, acc3, acc3

 UMULH const3, hlp0, hlp0
 ADC $0, acc4

 // Add the high halves one limb up; acc0 becomes the new top limb
 ADDS t1, acc1, acc1
 ADCS y0, acc2, acc2
 ADCS acc0, acc3, acc3
 ADC $0, hlp0, acc0
 // y[1] * x (the last UMULH overwrites y1, which is not needed afterwards)
 MUL y1, x0, t0
 ADDS t0, acc1
 UMULH y1, x0, t1

 MUL y1, x1, t0
 ADCS t0, acc2
 UMULH y1, x1, hlp0

 MUL y1, x2, t0
 ADCS t0, acc3
 UMULH y1, x2, y0

 MUL y1, x3, t0
 ADCS t0, acc4
 UMULH y1, x3, y1
 ADC $0, ZR, acc5

 ADDS t1, acc2
 ADCS hlp0, acc3
 ADCS y0, acc4
 ADC y1, acc5
 // Second reduction step
 MUL acc1, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc1, acc1
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc2, acc2
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc3, acc3
 UMULH const2, hlp0, acc1

 MUL const3, hlp0, t0
 ADCS t0, acc0, acc0

 UMULH const3, hlp0, hlp0
 ADC $0, acc5

 ADDS t1, acc2, acc2
 ADCS y0, acc3, acc3
 ADCS acc1, acc0, acc0
 ADC $0, hlp0, acc1
 // y[2] * x
 MUL y2, x0, t0
 ADDS t0, acc2
 UMULH y2, x0, t1

 MUL y2, x1, t0
 ADCS t0, acc3
 UMULH y2, x1, hlp0

 MUL y2, x2, t0
 ADCS t0, acc4
 UMULH y2, x2, y0

 MUL y2, x3, t0
 ADCS t0, acc5
 UMULH y2, x3, y1
 ADC $0, ZR, acc6

 ADDS t1, acc3
 ADCS hlp0, acc4
 ADCS y0, acc5
 ADC y1, acc6
 // Third reduction step
 MUL acc2, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc2, acc2
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc3, acc3
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc0, acc0
 UMULH const2, hlp0, acc2

 MUL const3, hlp0, t0
 ADCS t0, acc1, acc1

 UMULH const3, hlp0, hlp0
 ADC $0, acc6

 ADDS t1, acc3, acc3
 ADCS y0, acc0, acc0
 ADCS acc2, acc1, acc1
 ADC $0, hlp0, acc2
 // y[3] * x
 MUL y3, x0, t0
 ADDS t0, acc3
 UMULH y3, x0, t1

 MUL y3, x1, t0
 ADCS t0, acc4
 UMULH y3, x1, hlp0

 MUL y3, x2, t0
 ADCS t0, acc5
 UMULH y3, x2, y0

 MUL y3, x3, t0
 ADCS t0, acc6
 UMULH y3, x3, y1
 ADC $0, ZR, acc7

 ADDS t1, acc4
 ADCS hlp0, acc5
 ADCS y0, acc6
 ADC y1, acc7
 // Last reduction step
 MUL acc3, hlp1, hlp0

 MUL const0, hlp1, t0
 ADDS t0, acc3, acc3
 UMULH const0, hlp0, t1

 MUL const1, hlp0, t0
 ADCS t0, acc0, acc0
 UMULH const1, hlp0, y0

 MUL const2, hlp0, t0
 ADCS t0, acc1, acc1
 UMULH const2, hlp0, acc3

 MUL const3, hlp0, t0
 ADCS t0, acc2, acc2

 UMULH const3, hlp0, hlp0
 ADC $0, acc7

 ADDS t1, acc0, acc0
 ADCS y0, acc1, acc1
 ADCS acc3, acc2, acc2
 ADC $0, hlp0, acc3

 // Add the upper accumulator half (acc4..acc7); acc4 receives the carry
 ADDS acc4, acc0, acc0
 ADCS acc5, acc1, acc1
 ADCS acc6, acc2, acc2
 ADCS acc7, acc3, acc3
 ADC $0, ZR, acc4

 // t = acc - modulus. t2/t3 are the same registers as const2/const3: each
 // is read as the subtrahend by the instruction that overwrites it.
 SUBS const0, acc0, t0
 SBCS const1, acc1, t1
 SBCS const2, acc2, t2
 SBCS const3, acc3, t3
 SBCS $0, acc4, acc4

 // Carry set means no borrow: take the subtracted value
 CSEL CS, t0, acc0, acc0
 CSEL CS, t1, acc1, acc1
 CSEL CS, t2, acc2, acc2
 CSEL CS, t3, acc3, acc3

 // R0 held K0 (hlp1) until now; load the result pointer
 MOVD res+0(FP), res_ptr
 STP (acc0, acc1), 0*16(res_ptr)
 STP (acc2, acc3), 1*16(res_ptr)

 RET
793/* ---------------------------------------*/
// p256SubInternal: (x0..x3) = (y0..y3) - (x0..x3) mod p.
// In:       x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:      x0..x3.
// Clobbers: acc0..acc7, t0, flags. y0..y3 are preserved.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
 // acc0..acc3 = y - x; t0 becomes all ones if the subtraction borrowed
 SUBS x0, y0, acc0
 SBCS x1, y1, acc1
 SBCS x2, y2, acc2
 SBCS x3, y3, acc3
 SBC $0, ZR, t0

 // acc4..acc7 = (y - x) + p
 ADDS $-1, acc0, acc4
 ADCS const0, acc1, acc5
 ADCS $0, acc2, acc6
 ADC const1, acc3, acc7

 // No borrow (EQ): keep y - x; borrow: use the value with p added back
 ANDS $1, t0
 CSEL EQ, acc0, acc4, x0
 CSEL EQ, acc1, acc5, x1
 CSEL EQ, acc2, acc6, x2
 CSEL EQ, acc3, acc7, x3

 RET
813/* ---------------------------------------*/
// p256SqrInternal: (y0..y3) = Montgomery square of (x0..x3) mod p.
// In:       x0..x3; const0/const1 must hold p256const0/p256const1.
// Out:      y0..y3.
// Clobbers: acc0..acc7, t0..t3, flags. x0..x3 are preserved.
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
 // Off-diagonal products, accumulated into acc1..acc6.
 // x[1:] * x[0]
 MUL x0, x1, acc1
 UMULH x0, x1, acc2

 MUL x0, x2, t0
 ADDS t0, acc2, acc2
 UMULH x0, x2, acc3

 MUL x0, x3, t0
 ADCS t0, acc3, acc3
 UMULH x0, x3, acc4
 ADC $0, acc4, acc4
 // x[2:] * x[1]
 MUL x1, x2, t0
 ADDS t0, acc3
 UMULH x1, x2, t1
 ADCS t1, acc4
 ADC $0, ZR, acc5

 MUL x1, x3, t0
 ADDS t0, acc4
 UMULH x1, x3, t1
 ADC t1, acc5
 // x[3] * x[2]
 MUL x2, x3, t0
 ADDS t0, acc5
 UMULH x2, x3, acc6
 ADC $0, acc6

 MOVD $0, acc7
 // *2: every off-diagonal product occurs twice in the square
 ADDS acc1, acc1
 ADCS acc2, acc2
 ADCS acc3, acc3
 ADCS acc4, acc4
 ADCS acc5, acc5
 ADCS acc6, acc6
 ADC $0, acc7
 // Missing products: the diagonal terms x[i]*x[i]
 MUL x0, x0, acc0
 UMULH x0, x0, t0
 ADDS t0, acc1, acc1

 MUL x1, x1, t0
 ADCS t0, acc2, acc2
 UMULH x1, x1, t1
 ADCS t1, acc3, acc3

 MUL x2, x2, t0
 ADCS t0, acc4, acc4
 UMULH x2, x2, t1
 ADCS t1, acc5, acc5

 MUL x3, x3, t0
 ADCS t0, acc6, acc6
 UMULH x3, x3, t1
 ADCS t1, acc7, acc7
 // First reduction step: eliminates acc0 using shifts and one multiply by
 // const1; the register is reused for the new top limb.
 ADDS acc0<<32, acc1, acc1
 LSR $32, acc0, t0
 MUL acc0, const1, t1
 UMULH acc0, const1, acc0
 ADCS t0, acc2, acc2
 ADCS t1, acc3, acc3
 ADC $0, acc0, acc0
 // Second reduction step: eliminates acc1
 ADDS acc1<<32, acc2, acc2
 LSR $32, acc1, t0
 MUL acc1, const1, t1
 UMULH acc1, const1, acc1
 ADCS t0, acc3, acc3
 ADCS t1, acc0, acc0
 ADC $0, acc1, acc1
 // Third reduction step: eliminates acc2
 ADDS acc2<<32, acc3, acc3
 LSR $32, acc2, t0
 MUL acc2, const1, t1
 UMULH acc2, const1, acc2
 ADCS t0, acc0, acc0
 ADCS t1, acc1, acc1
 ADC $0, acc2, acc2
 // Last reduction step: eliminates acc3; limbs are back in acc0..acc3 order
 ADDS acc3<<32, acc0, acc0
 LSR $32, acc3, t0
 MUL acc3, const1, t1
 UMULH acc3, const1, acc3
 ADCS t0, acc1, acc1
 ADCS t1, acc2, acc2
 ADC $0, acc3, acc3
 // Add bits [511:256] of the sqr result; acc4 receives the carry
 ADDS acc4, acc0, acc0
 ADCS acc5, acc1, acc1
 ADCS acc6, acc2, acc2
 ADCS acc7, acc3, acc3
 ADC $0, ZR, acc4

 // t = acc - p, with the borrow propagated through the carry limb
 SUBS $-1, acc0, t0
 SBCS const0, acc1, t1
 SBCS $0, acc2, t2
 SBCS const1, acc3, t3
 SBCS $0, acc4, acc4

 // Carry set means no borrow: take the subtracted value
 CSEL CS, t0, acc0, y0
 CSEL CS, t1, acc1, y1
 CSEL CS, t2, acc2, y2
 CSEL CS, t3, acc3, y3
 RET
922/* ---------------------------------------*/
// p256MulInternal: (y0..y3) = Montgomery product of (x0..x3) and (y0..y3)
// mod p. Multiplication by each limb of y is interleaved with one reduction
// step.
// In:       x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:      y0..y3.
// Clobbers: acc0..acc7, t0..t3, hlp0, flags. x0..x3 are preserved.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
 // y[0] * x
 MUL y0, x0, acc0
 UMULH y0, x0, acc1

 MUL y0, x1, t0
 ADDS t0, acc1
 UMULH y0, x1, acc2

 MUL y0, x2, t0
 ADCS t0, acc2
 UMULH y0, x2, acc3

 MUL y0, x3, t0
 ADCS t0, acc3
 UMULH y0, x3, acc4
 ADC $0, acc4
 // First reduction step: eliminates acc0; the register is reused for the
 // new top limb.
 ADDS acc0<<32, acc1, acc1
 LSR $32, acc0, t0
 MUL acc0, const1, t1
 UMULH acc0, const1, acc0
 ADCS t0, acc2
 ADCS t1, acc3
 ADC $0, acc0
 // y[1] * x: low halves are added directly, high halves are parked in
 // t1/t2/t3/hlp0 and added one limb up afterwards.
 MUL y1, x0, t0
 ADDS t0, acc1
 UMULH y1, x0, t1

 MUL y1, x1, t0
 ADCS t0, acc2
 UMULH y1, x1, t2

 MUL y1, x2, t0
 ADCS t0, acc3
 UMULH y1, x2, t3

 MUL y1, x3, t0
 ADCS t0, acc4
 UMULH y1, x3, hlp0
 ADC $0, ZR, acc5

 ADDS t1, acc2
 ADCS t2, acc3
 ADCS t3, acc4
 ADC hlp0, acc5
 // Second reduction step: eliminates acc1
 ADDS acc1<<32, acc2, acc2
 LSR $32, acc1, t0
 MUL acc1, const1, t1
 UMULH acc1, const1, acc1
 ADCS t0, acc3
 ADCS t1, acc0
 ADC $0, acc1
 // y[2] * x
 MUL y2, x0, t0
 ADDS t0, acc2
 UMULH y2, x0, t1

 MUL y2, x1, t0
 ADCS t0, acc3
 UMULH y2, x1, t2

 MUL y2, x2, t0
 ADCS t0, acc4
 UMULH y2, x2, t3

 MUL y2, x3, t0
 ADCS t0, acc5
 UMULH y2, x3, hlp0
 ADC $0, ZR, acc6

 ADDS t1, acc3
 ADCS t2, acc4
 ADCS t3, acc5
 ADC hlp0, acc6
 // Third reduction step: eliminates acc2
 ADDS acc2<<32, acc3, acc3
 LSR $32, acc2, t0
 MUL acc2, const1, t1
 UMULH acc2, const1, acc2
 ADCS t0, acc0
 ADCS t1, acc1
 ADC $0, acc2
 // y[3] * x
 MUL y3, x0, t0
 ADDS t0, acc3
 UMULH y3, x0, t1

 MUL y3, x1, t0
 ADCS t0, acc4
 UMULH y3, x1, t2

 MUL y3, x2, t0
 ADCS t0, acc5
 UMULH y3, x2, t3

 MUL y3, x3, t0
 ADCS t0, acc6
 UMULH y3, x3, hlp0
 ADC $0, ZR, acc7

 ADDS t1, acc4
 ADCS t2, acc5
 ADCS t3, acc6
 ADC hlp0, acc7
 // Last reduction step: eliminates acc3; limbs are back in acc0..acc3 order
 ADDS acc3<<32, acc0, acc0
 LSR $32, acc3, t0
 MUL acc3, const1, t1
 UMULH acc3, const1, acc3
 ADCS t0, acc1
 ADCS t1, acc2
 ADC $0, acc3
 // Add bits [511:256] of the mul result; acc4 receives the carry
 ADDS acc4, acc0, acc0
 ADCS acc5, acc1, acc1
 ADCS acc6, acc2, acc2
 ADCS acc7, acc3, acc3
 ADC $0, ZR, acc4

 // t = acc - p, with the borrow propagated through the carry limb
 SUBS $-1, acc0, t0
 SBCS const0, acc1, t1
 SBCS $0, acc2, t2
 SBCS const1, acc3, t3
 SBCS $0, acc4, acc4

 // Carry set means no borrow: take the subtracted value
 CSEL CS, t0, acc0, y0
 CSEL CS, t1, acc1, y1
 CSEL CS, t2, acc2, y2
 CSEL CS, t3, acc3, y3
 RET
1056/* ---------------------------------------*/
// p256MulBy2Inline: (x0..x3) = 2 * (y0..y3) mod p.
// The doubled value (with its carry in hlp0) has p subtracted from it; the
// subtracted value is kept unless that subtraction borrows (CC).
// Requires const0/const1 = p256const0/p256const1.
// Clobbers t0..t3, hlp0 and the flags; y0..y3 are left unchanged.
#define p256MulBy2Inline \
 ADDS y0, y0, x0; \
 ADCS y1, y1, x1; \
 ADCS y2, y2, x2; \
 ADCS y3, y3, x3; \
 ADC $0, ZR, hlp0; \
 SUBS $-1, x0, t0; \
 SBCS const0, x1, t1;\
 SBCS $0, x2, t2; \
 SBCS const1, x3, t3;\
 SBCS $0, hlp0, hlp0;\
 CSEL CC, x0, t0, x0;\
 CSEL CC, x1, t1, x1;\
 CSEL CC, x2, t2, x2;\
 CSEL CC, x3, t3, x3;
1072/* ---------------------------------------*/
// Operand macros for point coordinates. A point is laid out as x at byte
// offset 0, y at 32 and z at 64; each macro takes a byte offset within the
// coordinate and yields the memory operand.
// x1in/y1in/z1in address the point behind a_ptr, x2in/z2in the point behind
// b_ptr, and x3out/y3out/z3out the result behind res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// LDx/LDy load a four-limb value into x0..x3 / y0..y3; STx/STy store it.
// The argument is the NAME of one of the operand macros (it is invoked with
// offsets 0 and 16).
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
1085/* ---------------------------------------*/
// 32-byte stack slots for the point-addition routines, addressed relative to
// RSP starting at offset 8. The first eight slots are used by
// p256PointAddAffineAsm (y2in holds its local, possibly negated copy of the
// second point's y). p256PointAddAsm redefines y2in and uses the remaining
// slots, including the four below.
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)

// Extra slots used only by p256PointAddAsm.
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)
1099
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Adds the affine point in2 to the Jacobian point in1 and writes the sum to
// res.
//   sign != 0: y2 is negated first, so the routine adds -in2.
//   sel  == 0: the computed sum is replaced by in1.
//   zero == 0: the result is replaced by in2 (with the chosen sign) and z = 1.
// When both sel and zero are 0 the zero case wins, because it is applied
// last. All three choices are made with CSEL, without branching.
//
// hlp1 (the same register as res_ptr, R0) carries the selection bits across
// the whole routine, so res is reloaded into t0 before each store.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
 MOVD in1+8(FP), a_ptr
 MOVD in2+16(FP), b_ptr
 MOVD sign+24(FP), hlp0
 MOVD sel+32(FP), hlp1
 MOVD zero+40(FP), t2

 // Normalise zero and sel to 0/1 ...
 MOVD $1, t0
 CMP $0, t2
 CSEL EQ, ZR, t0, t2
 CMP $0, hlp1
 CSEL EQ, ZR, t0, hlp1

 MOVD p256const0<>(SB), const0
 MOVD p256const1<>(SB), const1
 // ... and pack them: hlp1 bit 0 = (sel != 0), bit 1 = (zero != 0)
 EOR t2<<1, hlp1

 // Negate y2in based on sign
 LDP 2*16(b_ptr), (y0, y1)
 LDP 3*16(b_ptr), (y2, y3)
 MOVD $-1, acc0

 // acc0..acc3 = p - y2; t0 records the borrow
 SUBS y0, acc0, acc0
 SBCS y1, const0, acc1
 SBCS y2, ZR, acc2
 SBCS y3, const1, acc3
 SBC $0, ZR, t0

 // acc4..acc7 = acc0..acc3 + p; the carry is added into t0
 ADDS $-1, acc0, acc4
 ADCS const0, acc1, acc5
 ADCS $0, acc2, acc6
 ADCS const1, acc3, acc7
 ADC $0, t0, t0

 // t0 == 0 selects the value with p added, otherwise p - y2 is kept
 CMP $0, t0
 CSEL EQ, acc4, acc0, acc0
 CSEL EQ, acc5, acc1, acc1
 CSEL EQ, acc6, acc2, acc2
 CSEL EQ, acc7, acc3, acc3
 // If condition is 0, keep original value
 CMP $0, hlp0
 CSEL EQ, y0, acc0, y0
 CSEL EQ, y1, acc1, y1
 CSEL EQ, y2, acc2, y2
 CSEL EQ, y3, acc3, y3
 // Store result
 STy(y2in)
 // Begin point add
 LDx(z1in)
 CALL p256SqrInternal<>(SB) // z1^2
 STy(z1sqr)

 LDx(x2in)
 CALL p256MulInternal<>(SB) // u2 = x2 * z1^2

 LDx(x1in)
 CALL p256SubInternal<>(SB) // h = u2 - u1 (u1 is x1 itself here)
 STx(h)

 LDy(z1in)
 CALL p256MulInternal<>(SB) // z3 = h * z1

 LDP 4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
 LDP 5*16(a_ptr), (acc2, acc3)
 ANDS $1, hlp1, ZR
 CSEL EQ, acc0, y0, y0
 CSEL EQ, acc1, y1, y1
 CSEL EQ, acc2, y2, y2
 CSEL EQ, acc3, y3, y3
 LDP p256one<>+0x00(SB), (acc0, acc1)
 LDP p256one<>+0x10(SB), (acc2, acc3)
 ANDS $2, hlp1, ZR // iff select[1] == 0, z3 = 1
 CSEL EQ, acc0, y0, y0
 CSEL EQ, acc1, y1, y1
 CSEL EQ, acc2, y2, y2
 CSEL EQ, acc3, y3, y3
 // x = z1 is loaded now, before z3 is stored, for the z1^3 product below
 LDx(z1in)
 MOVD res+0(FP), t0
 STP (y0, y1), 4*16(t0)
 STP (y2, y3), 5*16(t0)

 LDy(z1sqr)
 CALL p256MulInternal<>(SB) // z1 ^ 3

 LDx(y2in)
 CALL p256MulInternal<>(SB) // s2 = y2 * z1^3
 STy(s2)

 LDx(y1in)
 CALL p256SubInternal<>(SB) // r = s2 - s1 (s1 is y1 itself here)
 STx(r)

 CALL p256SqrInternal<>(SB) // rsqr = r^2
 STy (rsqr)

 LDx(h)
 CALL p256SqrInternal<>(SB) // hsqr = h^2
 STy(hsqr)

 // x still holds h and y holds h^2
 CALL p256MulInternal<>(SB) // hcub = h^3
 STy(hcub)

 LDx(y1in)
 CALL p256MulInternal<>(SB) // y1 * h^3
 // The s2 slot is reused for y1 * h^3
 STy(s2)

 LDP hsqr(0*8), (x0, x1)
 LDP hsqr(2*8), (x2, x3)
 LDP 0*16(a_ptr), (y0, y1)
 LDP 1*16(a_ptr), (y2, y3)
 CALL p256MulInternal<>(SB) // u1 * h^2
 // The h slot is reused for u1 * h^2
 STP (y0, y1), h(0*8)
 STP (y2, y3), h(2*8)

 p256MulBy2Inline // u1 * h^2 * 2, inline

 LDy(rsqr)
 CALL p256SubInternal<>(SB) // r^2 - u1 * h^2 * 2

 MOVD x0, y0
 MOVD x1, y1
 MOVD x2, y2
 MOVD x3, y3
 LDx(hcub)
 CALL p256SubInternal<>(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3

 LDP 0*16(a_ptr), (acc0, acc1)
 LDP 1*16(a_ptr), (acc2, acc3)
 ANDS $1, hlp1, ZR // iff select[0] == 0, x3 = x1
 CSEL EQ, acc0, x0, x0
 CSEL EQ, acc1, x1, x1
 CSEL EQ, acc2, x2, x2
 CSEL EQ, acc3, x3, x3
 LDP 0*16(b_ptr), (acc0, acc1)
 LDP 1*16(b_ptr), (acc2, acc3)
 ANDS $2, hlp1, ZR // iff select[1] == 0, x3 = x2
 CSEL EQ, acc0, x0, x0
 CSEL EQ, acc1, x1, x1
 CSEL EQ, acc2, x2, x2
 CSEL EQ, acc3, x3, x3
 MOVD res+0(FP), t0
 STP (x0, x1), 0*16(t0)
 STP (x2, x3), 1*16(t0)

 // x = (u1 * h^2) - x, where x is the selected x value stored above
 LDP h(0*8), (y0, y1)
 LDP h(2*8), (y2, y3)
 CALL p256SubInternal<>(SB)

 // y = r * x
 LDP r(0*8), (y0, y1)
 LDP r(2*8), (y2, y3)
 CALL p256MulInternal<>(SB)

 // x = y - (y1 * h^3), the candidate y3
 LDP s2(0*8), (x0, x1)
 LDP s2(2*8), (x2, x3)
 CALL p256SubInternal<>(SB)
 LDP 2*16(a_ptr), (acc0, acc1)
 LDP 3*16(a_ptr), (acc2, acc3)
 ANDS $1, hlp1, ZR // iff select[0] == 0, y3 = y1
 CSEL EQ, acc0, x0, x0
 CSEL EQ, acc1, x1, x1
 CSEL EQ, acc2, x2, x2
 CSEL EQ, acc3, x3, x3
 LDP y2in(0*8), (acc0, acc1)
 LDP y2in(2*8), (acc2, acc3)
 ANDS $2, hlp1, ZR // iff select[1] == 0, y3 = y2
 CSEL EQ, acc0, x0, x0
 CSEL EQ, acc1, x1, x1
 CSEL EQ, acc2, x2, x2
 CSEL EQ, acc3, x3, x3
 MOVD res+0(FP), t0
 STP (x0, x1), 2*16(t0)
 STP (x2, x3), 3*16(t0)

 RET
1275
// p256AddInline: (x0..x3) = (x0..x3) + (y0..y3) mod p.
// The sum (with its carry in hlp0) has p subtracted from it; the subtracted
// value is kept unless that subtraction borrows (CC).
// Requires const0/const1 = p256const0/p256const1.
// Clobbers t0..t3, hlp0 and the flags; y0..y3 are left unchanged.
#define p256AddInline \
 ADDS y0, x0, x0; \
 ADCS y1, x1, x1; \
 ADCS y2, x2, x2; \
 ADCS y3, x3, x3; \
 ADC $0, ZR, hlp0; \
 SUBS $-1, x0, t0; \
 SBCS const0, x1, t1;\
 SBCS $0, x2, t2; \
 SBCS const1, x3, t3;\
 SBCS $0, hlp0, hlp0;\
 CSEL CC, x0, t0, x0;\
 CSEL CC, x1, t1, x1;\
 CSEL CC, x2, t2, x2;\
 CSEL CC, x3, t3, x3;
1291
// 32-byte stack slots used by p256PointDoubleAsm, addressed relative to RSP
// starting at offset 8. They overlap the first four point-addition slots,
// which is fine because each routine has its own frame.
#define s(off) (32*0 + 8 + off)(RSP)
#define m(off) (32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off) (32*3 + 8 + off)(RSP)
1296
// func p256PointDoubleAsm(res, in *P256Point)
// Doubles the Jacobian point in and writes the result to res.
// res_ptr (R0) stays live for the whole routine; none of the internal
// routines or inline macros used here write to it.
// The y3out slot of res is used as scratch before the final y is stored.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
 MOVD res+0(FP), res_ptr
 MOVD in+8(FP), a_ptr

 MOVD p256const0<>(SB), const0
 MOVD p256const1<>(SB), const1

 // Begin point double
 // zsqr = z1^2
 LDP 4*16(a_ptr), (x0, x1)
 LDP 5*16(a_ptr), (x2, x3)
 CALL p256SqrInternal<>(SB)
 STP (y0, y1), zsqr(0*8)
 STP (y2, y3), zsqr(2*8)

 // m = x1 + z1^2 (y still holds z1^2)
 LDP 0*16(a_ptr), (x0, x1)
 LDP 1*16(a_ptr), (x2, x3)
 p256AddInline
 STx(m)

 // z3 = 2 * y1 * z1
 LDx(z1in)
 LDy(y1in)
 CALL p256MulInternal<>(SB)
 p256MulBy2Inline
 STx(z3out)

 // x = x1 - z1^2, then y = (x1 - z1^2) * (x1 + z1^2)
 LDy(x1in)
 LDx(zsqr)
 CALL p256SubInternal<>(SB)
 LDy(m)
 CALL p256MulInternal<>(SB)

 // Multiply by 3: x = 2*y, then x = x + y
 p256MulBy2Inline
 p256AddInline
 STx(m)

 // s = (2 * y1)^2
 LDy(y1in)
 p256MulBy2Inline
 CALL p256SqrInternal<>(SB)
 STy(s)
 // y = s^2 = (2 * y1)^4
 MOVD y0, x0
 MOVD y1, x1
 MOVD y2, x2
 MOVD y3, x3
 CALL p256SqrInternal<>(SB)

 // Divide by 2: t = y + p, with the carry in hlp0
 ADDS $-1, y0, t0
 ADCS const0, y1, t1
 ADCS $0, y2, t2
 ADCS const1, y3, t3
 ADC $0, ZR, hlp0

 // If y is even (EQ) halve y itself, otherwise halve y + p
 ANDS $1, y0, ZR
 CSEL EQ, y0, t0, t0
 CSEL EQ, y1, t1, t1
 CSEL EQ, y2, t2, t2
 CSEL EQ, y3, t3, t3
 // hlp0 is shifted in as the top bit by the last EXTR; the AND clears its
 // bit 0 when y is even
 AND y0, hlp0, hlp0

 // Shift the five-limb value hlp0:t3:t2:t1:t0 right by one bit
 EXTR $1, t0, t1, y0
 EXTR $1, t1, t2, y1
 EXTR $1, t2, t3, y2
 EXTR $1, t3, hlp0, y3
 // Parked in the y3 output slot; consumed again at the end
 STy(y3out)

 // s = x1 * (2 * y1)^2, tmp = 2 * s
 LDx(x1in)
 LDy(s)
 CALL p256MulInternal<>(SB)
 STy(s)
 p256MulBy2Inline
 STx(tmp)

 // x3 = m^2 - 2 * s
 LDx(m)
 CALL p256SqrInternal<>(SB)
 LDx(tmp)
 CALL p256SubInternal<>(SB)

 STx(x3out)

 // x = s - x3
 LDy(s)
 CALL p256SubInternal<>(SB)

 // y = m * (s - x3)
 LDy(m)
 CALL p256MulInternal<>(SB)

 // y3 = m * (s - x3) - (the value parked in y3out above)
 LDx(y3out)
 CALL p256SubInternal<>(SB)
 STx(y3out)
 RET
1388/* ---------------------------------------*/
// Re-target the operand macros for p256PointAddAsm: y2in now addresses the
// y coordinate of the second input point through b_ptr (instead of a stack
// slot), and the output macros go through b_ptr as well, because that
// routine loads the result pointer into b_ptr rather than res_ptr.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
// Adds the Jacobian points in1 and in2 and writes the sum to res.
// Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p (each is
// tested against 0 and against p), otherwise 0. The sum is computed and
// stored either way.
//
// hlp1 (the same register as R0/res_ptr) carries the r == 0 flag, so the
// result pointer is loaded into b_ptr instead, once in2 is no longer read.
TEXT ·p256PointAddAsm(SB),0,$392-32
 // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
 // Load the input pointers; intermediates are kept in the stack slots.
 MOVD in1+8(FP), a_ptr
 MOVD in2+16(FP), b_ptr

 MOVD p256const0<>(SB), const0
 MOVD p256const1<>(SB), const1

 // Begin point add
 LDx(z2in)
 CALL p256SqrInternal<>(SB) // z2^2
 STy(z2sqr)

 // x still holds z2 and y holds z2^2
 CALL p256MulInternal<>(SB) // z2^3

 LDx(y1in)
 CALL p256MulInternal<>(SB) // s1 = z2^3*y1
 STy(s1)

 LDx(z1in)
 CALL p256SqrInternal<>(SB) // z1^2
 STy(z1sqr)

 // x still holds z1 and y holds z1^2
 CALL p256MulInternal<>(SB) // z1^3

 LDx(y2in)
 CALL p256MulInternal<>(SB) // s2 = z1^3*y2

 LDx(s1)
 CALL p256SubInternal<>(SB) // r = s2 - s1
 STx(r)

 // hlp1 = 1 if r is all zero ...
 MOVD $1, t2
 ORR x0, x1, t0 // Check if zero mod p256
 ORR x2, x3, t1
 ORR t1, t0, t0
 CMP $0, t0
 CSEL EQ, t2, ZR, hlp1

 // ... or if r equals p (limbs: -1, const0, 0, const1)
 EOR $-1, x0, t0
 EOR const0, x1, t1
 EOR const1, x3, t3

 ORR t0, t1, t0
 ORR x2, t3, t1
 ORR t1, t0, t0
 CMP $0, t0
 CSEL EQ, t2, hlp1, hlp1

 LDx(z2sqr)
 LDy(x1in)
 CALL p256MulInternal<>(SB) // u1 = x1 * z2^2
 STy(u1)

 LDx(z1sqr)
 LDy(x2in)
 CALL p256MulInternal<>(SB) // u2 = x2 * z1^2
 STy(u2)

 LDx(u1)
 CALL p256SubInternal<>(SB) // h = u2 - u1
 STx(h)

 // hlp0 = 1 if h is all zero ...
 MOVD $1, t2
 ORR x0, x1, t0 // Check if zero mod p256
 ORR x2, x3, t1
 ORR t1, t0, t0
 CMP $0, t0
 CSEL EQ, t2, ZR, hlp0

 // ... or if h equals p
 EOR $-1, x0, t0
 EOR const0, x1, t1
 EOR const1, x3, t3

 ORR t0, t1, t0
 ORR x2, t3, t1
 ORR t1, t0, t0
 CMP $0, t0
 CSEL EQ, t2, hlp0, hlp0

 // Return flag: r == 0 and h == 0
 AND hlp0, hlp1, hlp1

 LDx(r)
 CALL p256SqrInternal<>(SB) // rsqr = r^2
 STy(rsqr)

 LDx(h)
 CALL p256SqrInternal<>(SB) // hsqr = h^2
 STy(hsqr)

 LDx(h)
 CALL p256MulInternal<>(SB) // hcub = h^3
 STy(hcub)

 LDx(s1)
 CALL p256MulInternal<>(SB) // s1 * h^3
 // The s2 slot is reused for s1 * h^3
 STy(s2)

 LDx(z1in)
 LDy(z2in)
 CALL p256MulInternal<>(SB) // z1 * z2
 LDx(h)
 CALL p256MulInternal<>(SB) // z1 * z2 * h
 // in2 is not read past this point: b_ptr becomes the result pointer
 MOVD res+0(FP), b_ptr
 STy(z3out)

 LDx(hsqr)
 LDy(u1)
 CALL p256MulInternal<>(SB) // h^2 * u1
 // The u2 slot is reused for u1 * h^2
 STy(u2)

 p256MulBy2Inline // u1 * h^2 * 2, inline
 LDy(rsqr)
 CALL p256SubInternal<>(SB) // r^2 - u1 * h^2 * 2

 MOVD x0, y0
 MOVD x1, y1
 MOVD x2, y2
 MOVD x3, y3
 LDx(hcub)
 CALL p256SubInternal<>(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3
 STx(x3out)

 LDy(u2)
 CALL p256SubInternal<>(SB) // u1 * h^2 - x3

 LDy(r)
 CALL p256MulInternal<>(SB) // r * (u1 * h^2 - x3)

 LDx(s2)
 CALL p256SubInternal<>(SB) // y3 = r * (u1 * h^2 - x3) - s1 * h^3
 STx(y3out)

 MOVD hlp1, R0
 MOVD R0, ret+24(FP)

 RET
View as plain text