1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !purego
6
7#include "textflag.h"
8#include "go_asm.h"
9
10DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
11DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
12DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
13DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
14DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
15DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
16DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
17DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
18DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
19DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
20DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
21DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
22DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
23DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
24DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
25DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
26DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
27DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
28DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
29DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
30DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
31DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
32DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
33DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
34DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
35DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
36DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
37DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
38DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
39DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
40DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
41DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
42DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
43DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
44DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
45DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
46DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256ordK0<>(SB), RODATA, $4
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256<>(SB), RODATA, $96
GLOBL p256mul<>(SB), RODATA, $160
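// As a reading aid, the main values in the pools above are (big-endian hex,
// most significant word first; a rough reference only, the assembly never
// builds big.Ints):
//
//	p, _   := new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16) // p256<> / p256mul<> +0x00..0x18
//	ord, _ := new(big.Int).SetString("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16) // p256ord<>
//	one, _ := new(big.Int).SetString("00000000fffffffeffffffffffffffffffffffff000000000000000000000001", 16) // p256mul<> +0x80..0x98, 2^256 mod p, i.e. 1 in Montgomery form
//
// p256ordK0 (0xee00bc4f) is the usual Montgomery constant -ord⁻¹ mod 2³²; the
// remaining entries are VPERM byte-selection masks used by the code below.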
51
52// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
53TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
54 JMP ·p256BigToLittle(SB)
55
56// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
57TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
58 JMP ·p256BigToLittle(SB)
59
60// ---------------------------------------
61// func p256LittleToBig(res *[32]byte, in *p256Element)
62TEXT ·p256LittleToBig(SB), NOSPLIT, $0
63 JMP ·p256BigToLittle(SB)
64
65// func p256BigToLittle(res *p256Element, in *[32]byte)
66#define res_ptr R1
67#define in_ptr R2
68#define T1L V2
69#define T1H V3
70
71TEXT ·p256BigToLittle(SB), NOSPLIT, $0
72 MOVD res+0(FP), res_ptr
73 MOVD in+8(FP), in_ptr
74
75 VL 0(in_ptr), T1H
76 VL 16(in_ptr), T1L
77
78 VPDI $0x4, T1L, T1L, T1L
79 VPDI $0x4, T1H, T1H, T1H
80
81 VST T1L, 0(res_ptr)
82 VST T1H, 16(res_ptr)
83 RET
84
85#undef res_ptr
86#undef in_ptr
87#undef T1L
88#undef T1H
89
90// ---------------------------------------
// If cond == 1, val <- -val (mod P256)
92// func p256NegCond(val *p256Element, cond int)
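// A rough big.Int reference of the semantics (the code below is branch-free,
// using a VCEQG mask and VSEL instead of the if; p is the field prime):
//
//	func refNegCond(val, p *big.Int, cond int) *big.Int {
//		if cond == 1 {
//			return new(big.Int).Sub(p, val) // val is assumed already reduced mod p
//		}
//		return val
//	}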
93#define P1ptr R1
94#define CPOOL R4
95
96#define Y1L V0
97#define Y1H V1
98#define T1L V2
99#define T1H V3
100
101#define PL V30
102#define PH V31
103
104#define ZER V4
105#define SEL1 V5
106#define CAR1 V6
107TEXT ·p256NegCond(SB), NOSPLIT, $0
108 MOVD val+0(FP), P1ptr
109
110 MOVD $p256mul<>+0x00(SB), CPOOL
111 VL 16(CPOOL), PL
112 VL 0(CPOOL), PH
113
114 VL 16(P1ptr), Y1H
115 VPDI $0x4, Y1H, Y1H, Y1H
116 VL 0(P1ptr), Y1L
117 VPDI $0x4, Y1L, Y1L, Y1L
118
119 VLREPG cond+8(FP), SEL1
120 VZERO ZER
121 VCEQG SEL1, ZER, SEL1
122
123 VSCBIQ Y1L, PL, CAR1
124 VSQ Y1L, PL, T1L
125 VSBIQ PH, Y1H, CAR1, T1H
126
127 VSEL Y1L, T1L, SEL1, Y1L
128 VSEL Y1H, T1H, SEL1, Y1H
129
130 VPDI $0x4, Y1H, Y1H, Y1H
131 VST Y1H, 16(P1ptr)
132 VPDI $0x4, Y1L, Y1L, Y1L
133 VST Y1L, 0(P1ptr)
134 RET
135
136#undef P1ptr
137#undef CPOOL
138#undef Y1L
139#undef Y1H
140#undef T1L
141#undef T1H
142#undef PL
143#undef PH
144#undef ZER
145#undef SEL1
146#undef CAR1
147
148// ---------------------------------------
149// if cond == 0 res <- b; else res <- a
150// func p256MovCond(res, a, b *P256Point, cond int)
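// Reference semantics in Go (the assembly derives an all-ones/all-zeros mask
// from cond with VCEQG and applies it with VSEL, so there is no branch):
//
//	func refMovCond(res, a, b *P256Point, cond int) {
//		if cond != 0 {
//			*res = *a
//		} else {
//			*res = *b
//		}
//	}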
151#define P3ptr R1
152#define P1ptr R2
153#define P2ptr R3
154
155#define X1L V0
156#define X1H V1
157#define Y1L V2
158#define Y1H V3
159#define Z1L V4
160#define Z1H V5
161#define X2L V6
162#define X2H V7
163#define Y2L V8
164#define Y2H V9
165#define Z2L V10
166#define Z2H V11
167
168#define ZER V18
169#define SEL1 V19
170TEXT ·p256MovCond(SB), NOSPLIT, $0
171 MOVD res+0(FP), P3ptr
172 MOVD a+8(FP), P1ptr
173 MOVD b+16(FP), P2ptr
174 VLREPG cond+24(FP), SEL1
175 VZERO ZER
176 VCEQG SEL1, ZER, SEL1
177
178 VL 0(P1ptr), X1H
179 VL 16(P1ptr), X1L
180 VL 32(P1ptr), Y1H
181 VL 48(P1ptr), Y1L
182 VL 64(P1ptr), Z1H
183 VL 80(P1ptr), Z1L
184
185 VL 0(P2ptr), X2H
186 VL 16(P2ptr), X2L
187 VL 32(P2ptr), Y2H
188 VL 48(P2ptr), Y2L
189 VL 64(P2ptr), Z2H
190 VL 80(P2ptr), Z2L
191
192 VSEL X2L, X1L, SEL1, X1L
193 VSEL X2H, X1H, SEL1, X1H
194 VSEL Y2L, Y1L, SEL1, Y1L
195 VSEL Y2H, Y1H, SEL1, Y1H
196 VSEL Z2L, Z1L, SEL1, Z1L
197 VSEL Z2H, Z1H, SEL1, Z1H
198
199 VST X1H, 0(P3ptr)
200 VST X1L, 16(P3ptr)
201 VST Y1H, 32(P3ptr)
202 VST Y1L, 48(P3ptr)
203 VST Z1H, 64(P3ptr)
204 VST Z1L, 80(P3ptr)
205
206 RET
207
208#undef P3ptr
209#undef P1ptr
210#undef P2ptr
211#undef X1L
212#undef X1H
213#undef Y1L
214#undef Y1H
215#undef Z1L
216#undef Z1H
217#undef X2L
218#undef X2H
219#undef Y2L
220#undef Y2H
221#undef Z2L
222#undef Z2H
223#undef ZER
224#undef SEL1
225
226// ---------------------------------------
// Constant-time table access
// Indexed from 1 to 16, with a -1 offset (entry idx lives at table[idx-1];
// the loop below scans all 16 entries unconditionally)
// (index 0 is implicitly the point at infinity)
230// func p256Select(res *P256Point, table *p256Table, idx int)
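// A (non-constant-time) Go sketch of what the loop below computes, assuming
// the table holds 16 points as the 16-iteration loop implies:
//
//	func refSelect(res *P256Point, table *p256Table, idx int) {
//		*res = P256Point{} // idx == 0 matches nothing and leaves res zeroed
//		for i := 0; i < 16; i++ {
//			if i+1 == idx {
//				*res = table[i]
//			}
//		}
//	}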
231#define P3ptr R1
232#define P1ptr R2
233#define COUNT R4
234
235#define X1L V0
236#define X1H V1
237#define Y1L V2
238#define Y1H V3
239#define Z1L V4
240#define Z1H V5
241#define X2L V6
242#define X2H V7
243#define Y2L V8
244#define Y2H V9
245#define Z2L V10
246#define Z2H V11
247
248#define ONE V18
249#define IDX V19
250#define SEL1 V20
251#define SEL2 V21
252TEXT ·p256Select(SB), NOSPLIT, $0
253 MOVD res+0(FP), P3ptr
254 MOVD table+8(FP), P1ptr
255 VLREPB idx+(16+7)(FP), IDX
256 VREPIB $1, ONE
257 VREPIB $1, SEL2
258 MOVD $1, COUNT
259
260 VZERO X1H
261 VZERO X1L
262 VZERO Y1H
263 VZERO Y1L
264 VZERO Z1H
265 VZERO Z1L
266
267loop_select:
268 VL 0(P1ptr), X2H
269 VL 16(P1ptr), X2L
270 VL 32(P1ptr), Y2H
271 VL 48(P1ptr), Y2L
272 VL 64(P1ptr), Z2H
273 VL 80(P1ptr), Z2L
274
275 VCEQG SEL2, IDX, SEL1
276
277 VSEL X2L, X1L, SEL1, X1L
278 VSEL X2H, X1H, SEL1, X1H
279 VSEL Y2L, Y1L, SEL1, Y1L
280 VSEL Y2H, Y1H, SEL1, Y1H
281 VSEL Z2L, Z1L, SEL1, Z1L
282 VSEL Z2H, Z1H, SEL1, Z1H
283
284 VAB SEL2, ONE, SEL2
285 ADDW $1, COUNT
286 ADD $96, P1ptr
287 CMPW COUNT, $17
288 BLT loop_select
289
290 VST X1H, 0(P3ptr)
291 VST X1L, 16(P3ptr)
292 VST Y1H, 32(P3ptr)
293 VST Y1L, 48(P3ptr)
294 VST Z1H, 64(P3ptr)
295 VST Z1L, 80(P3ptr)
296 RET
297
298#undef P3ptr
299#undef P1ptr
300#undef COUNT
301#undef X1L
302#undef X1H
303#undef Y1L
304#undef Y1H
305#undef Z1L
306#undef Z1H
307#undef X2L
308#undef X2H
309#undef Y2L
310#undef Y2H
311#undef Z2L
312#undef Z2H
313#undef ONE
314#undef IDX
315#undef SEL1
316#undef SEL2
317
318// ---------------------------------------
319
320// func p256FromMont(res, in *p256Element)
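// Semantically, res = in * 2⁻²⁵⁶ mod p, i.e. conversion out of the Montgomery
// domain. A big.Int sketch of the result (p as in p256<> above):
//
//	r := new(big.Int).Lsh(big.NewInt(1), 256)
//	rInv := new(big.Int).ModInverse(r, p)
//	res := new(big.Int).Mod(new(big.Int).Mul(in, rInv), p)
//
// The code below instead performs four word-wise Montgomery reduction rounds
// (one per 64-bit limb, built only from VPERM/VSLDB/add steps, which works
// because of the special form of p) followed by a final conditional
// subtraction of p.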
321#define res_ptr R1
322#define x_ptr R2
323#define CPOOL R4
324
325#define T0 V0
326#define T1 V1
327#define T2 V2
328#define TT0 V3
329#define TT1 V4
330
331#define ZER V6
332#define SEL1 V7
333#define SEL2 V8
334#define CAR1 V9
335#define CAR2 V10
336#define RED1 V11
337#define RED2 V12
338#define PL V13
339#define PH V14
340
341TEXT ·p256FromMont(SB), NOSPLIT, $0
342 MOVD res+0(FP), res_ptr
343 MOVD in+8(FP), x_ptr
344
345 VZERO T2
346 VZERO ZER
347 MOVD $p256<>+0x00(SB), CPOOL
348 VL 16(CPOOL), PL
349 VL 0(CPOOL), PH
350 VL 48(CPOOL), SEL2
351 VL 64(CPOOL), SEL1
352
353 VL (0*16)(x_ptr), T0
354 VPDI $0x4, T0, T0, T0
355 VL (1*16)(x_ptr), T1
356 VPDI $0x4, T1, T1, T1
357
358 // First round
359 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
360 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
361 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
362
363 VSLDB $8, T1, T0, T0
364 VSLDB $8, T2, T1, T1
365
366 VACCQ T0, RED1, CAR1
367 VAQ T0, RED1, T0
368 VACCCQ T1, RED2, CAR1, CAR2
369 VACQ T1, RED2, CAR1, T1
370 VAQ T2, CAR2, T2
371
372 // Second round
373 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
374 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
375 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
376
377 VSLDB $8, T1, T0, T0
378 VSLDB $8, T2, T1, T1
379
380 VACCQ T0, RED1, CAR1
381 VAQ T0, RED1, T0
382 VACCCQ T1, RED2, CAR1, CAR2
383 VACQ T1, RED2, CAR1, T1
384 VAQ T2, CAR2, T2
385
386 // Third round
387 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
388 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
389 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
390
391 VSLDB $8, T1, T0, T0
392 VSLDB $8, T2, T1, T1
393
394 VACCQ T0, RED1, CAR1
395 VAQ T0, RED1, T0
396 VACCCQ T1, RED2, CAR1, CAR2
397 VACQ T1, RED2, CAR1, T1
398 VAQ T2, CAR2, T2
399
400 // Last round
401 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
402 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
403 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
404
405 VSLDB $8, T1, T0, T0
406 VSLDB $8, T2, T1, T1
407
408 VACCQ T0, RED1, CAR1
409 VAQ T0, RED1, T0
410 VACCCQ T1, RED2, CAR1, CAR2
411 VACQ T1, RED2, CAR1, T1
412 VAQ T2, CAR2, T2
413
414 // ---------------------------------------------------
415
416 VSCBIQ PL, T0, CAR1
417 VSQ PL, T0, TT0
418 VSBCBIQ T1, PH, CAR1, CAR2
419 VSBIQ T1, PH, CAR1, TT1
420 VSBIQ T2, ZER, CAR2, T2
421
422 // what output to use, TT1||TT0 or T1||T0?
423 VSEL T0, TT0, T2, T0
424 VSEL T1, TT1, T2, T1
425
426 VPDI $0x4, T0, T0, TT0
427 VST TT0, (0*16)(res_ptr)
428 VPDI $0x4, T1, T1, TT1
429 VST TT1, (1*16)(res_ptr)
430 RET
431
432#undef res_ptr
433#undef x_ptr
434#undef CPOOL
435#undef T0
436#undef T1
437#undef T2
438#undef TT0
439#undef TT1
440#undef ZER
441#undef SEL1
442#undef SEL2
443#undef CAR1
444#undef CAR2
445#undef RED1
446#undef RED2
447#undef PL
448#undef PH
449
// Constant-time table access
// Indexed from 1 to 64, with a -1 offset (entry idx lives at table[idx-1];
// the loop below scans all 64 entries unconditionally)
// (index 0 is implicitly the point at infinity)
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
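// Same selection idea as p256Select above, but over affine (x,y) entries of
// 64 bytes each; the loop below scans 64 of them. A rough Go sketch:
//
//	func refSelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) {
//		*res = p256AffinePoint{} // idx == 0 leaves res zeroed
//		for i := 0; i < 64; i++ {
//			if i+1 == idx {
//				*res = table[i]
//			}
//		}
//	}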
455
456#define P3ptr R1
457#define P1ptr R2
458#define COUNT R4
459#define CPOOL R5
460
461#define X1L V0
462#define X1H V1
463#define Y1L V2
464#define Y1H V3
465#define Z1L V4
466#define Z1H V5
467#define X2L V6
468#define X2H V7
469#define Y2L V8
470#define Y2H V9
471#define Z2L V10
472#define Z2H V11
473#define LE2BE V12
474
475#define ONE V18
476#define IDX V19
477#define SEL1 V20
478#define SEL2 V21
479
480TEXT ·p256SelectAffine(SB), NOSPLIT, $0
481 MOVD res+0(FP), P3ptr
482 MOVD table+8(FP), P1ptr
483 MOVD $p256<>+0x00(SB), CPOOL
484 VLREPB idx+(16+7)(FP), IDX
485 VREPIB $1, ONE
486 VREPIB $1, SEL2
487 MOVD $1, COUNT
488 VL 80(CPOOL), LE2BE
489
490 VZERO X1H
491 VZERO X1L
492 VZERO Y1H
493 VZERO Y1L
494
495loop_select:
496 VL 0(P1ptr), X2H
497 VL 16(P1ptr), X2L
498 VL 32(P1ptr), Y2H
499 VL 48(P1ptr), Y2L
500
501 VCEQG SEL2, IDX, SEL1
502
503 VSEL X2L, X1L, SEL1, X1L
504 VSEL X2H, X1H, SEL1, X1H
505 VSEL Y2L, Y1L, SEL1, Y1L
506 VSEL Y2H, Y1H, SEL1, Y1H
507
508 VAB SEL2, ONE, SEL2
509 ADDW $1, COUNT
510 ADD $64, P1ptr
511 CMPW COUNT, $65
512 BLT loop_select
513 VST X1H, 0(P3ptr)
514 VST X1L, 16(P3ptr)
515 VST Y1H, 32(P3ptr)
516 VST Y1L, 48(P3ptr)
517
518 RET
519
520#undef P3ptr
521#undef P1ptr
522#undef COUNT
523#undef X1L
524#undef X1H
525#undef Y1L
526#undef Y1H
527#undef Z1L
528#undef Z1H
529#undef X2L
530#undef X2H
531#undef Y2L
532#undef Y2H
533#undef Z2L
534#undef Z2H
535#undef ONE
536#undef IDX
537#undef SEL1
538#undef SEL2
539#undef CPOOL
540
541// ---------------------------------------
542
543// func p256OrdMul(res, in1, in2 *p256OrdElement)
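// Montgomery multiplication modulo the group order:
// res = in1 * in2 * 2⁻²⁵⁶ mod ord. A big.Int sketch of the result:
//
//	r := new(big.Int).Lsh(big.NewInt(1), 256)
//	rInv := new(big.Int).ModInverse(r, ord)
//	res := new(big.Int).Mod(new(big.Int).Mul(new(big.Int).Mul(in1, in2), rInv), ord)
//
// The code below interleaves multiply and reduce: for each of the eight
// 32-bit digits of in2 it accumulates in1*digit, derives the Montgomery
// factor MK0 from the low word and K0, adds MK0*ord, and (roughly) shifts the
// accumulator down one 32-bit word via the VSLDB $12 pairs.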
544#define res_ptr R1
545#define x_ptr R2
546#define y_ptr R3
547#define X0 V0
548#define X1 V1
549#define Y0 V2
550#define Y1 V3
551#define M0 V4
552#define M1 V5
553#define T0 V6
554#define T1 V7
555#define T2 V8
556#define YDIG V9
557
558#define ADD1 V16
559#define ADD1H V17
560#define ADD2 V18
561#define ADD2H V19
562#define RED1 V20
563#define RED1H V21
564#define RED2 V22
565#define RED2H V23
566#define CAR1 V24
567#define CAR1M V25
568
569#define MK0 V30
570#define K0 V31
571TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
572 MOVD res+0(FP), res_ptr
573 MOVD in1+8(FP), x_ptr
574 MOVD in2+16(FP), y_ptr
575
576 VZERO T2
577 MOVD $p256ordK0<>+0x00(SB), R4
578
579 // VLEF $3, 0(R4), K0
580 WORD $0xE7F40000
581 BYTE $0x38
582 BYTE $0x03
583 MOVD $p256ord<>+0x00(SB), R4
584 VL 16(R4), M0
585 VL 0(R4), M1
586
587 VL (0*16)(x_ptr), X0
588 VPDI $0x4, X0, X0, X0
589 VL (1*16)(x_ptr), X1
590 VPDI $0x4, X1, X1, X1
591 VL (0*16)(y_ptr), Y0
592 VPDI $0x4, Y0, Y0, Y0
593 VL (1*16)(y_ptr), Y1
594 VPDI $0x4, Y1, Y1, Y1
595
596 // ---------------------------------------------------------------------------/
597 VREPF $3, Y0, YDIG
598 VMLF X0, YDIG, ADD1
599 VMLF ADD1, K0, MK0
600 VREPF $3, MK0, MK0
601
602 VMLF X1, YDIG, ADD2
603 VMLHF X0, YDIG, ADD1H
604 VMLHF X1, YDIG, ADD2H
605
606 VMALF M0, MK0, ADD1, RED1
607 VMALHF M0, MK0, ADD1, RED1H
608 VMALF M1, MK0, ADD2, RED2
609 VMALHF M1, MK0, ADD2, RED2H
610
611 VSLDB $12, RED2, RED1, RED1
612 VSLDB $12, T2, RED2, RED2
613
614 VACCQ RED1, ADD1H, CAR1
615 VAQ RED1, ADD1H, T0
616 VACCQ RED1H, T0, CAR1M
617 VAQ RED1H, T0, T0
618
619 // << ready for next MK0
620
621 VACQ RED2, ADD2H, CAR1, T1
622 VACCCQ RED2, ADD2H, CAR1, CAR1
623 VACCCQ RED2H, T1, CAR1M, T2
624 VACQ RED2H, T1, CAR1M, T1
625 VAQ CAR1, T2, T2
626
627 // ---------------------------------------------------
628/* *
629 * ---+--------+--------+
630 * T2| T1 | T0 |
631 * ---+--------+--------+
632 * *(add)*
633 * +--------+--------+
634 * | X1 | X0 |
635 * +--------+--------+
636 * *(mul)*
637 * +--------+--------+
638 * | YDIG | YDIG |
639 * +--------+--------+
640 * *(add)*
641 * +--------+--------+
642 * | M1 | M0 |
643 * +--------+--------+
644 * *(mul)*
645 * +--------+--------+
646 * | MK0 | MK0 |
647 * +--------+--------+
648 *
649 * ---------------------
650 *
651 * +--------+--------+
652 * | ADD2 | ADD1 |
653 * +--------+--------+
654 * +--------+--------+
655 * | ADD2H | ADD1H |
656 * +--------+--------+
657 * +--------+--------+
658 * | RED2 | RED1 |
659 * +--------+--------+
660 * +--------+--------+
661 * | RED2H | RED1H |
662 * +--------+--------+
663 */
664 VREPF $2, Y0, YDIG
665 VMALF X0, YDIG, T0, ADD1
666 VMLF ADD1, K0, MK0
667 VREPF $3, MK0, MK0
668
669 VMALF X1, YDIG, T1, ADD2
670 VMALHF X0, YDIG, T0, ADD1H
671 VMALHF X1, YDIG, T1, ADD2H
672
673 VMALF M0, MK0, ADD1, RED1
674 VMALHF M0, MK0, ADD1, RED1H
675 VMALF M1, MK0, ADD2, RED2
676 VMALHF M1, MK0, ADD2, RED2H
677
678 VSLDB $12, RED2, RED1, RED1
679 VSLDB $12, T2, RED2, RED2
680
681 VACCQ RED1, ADD1H, CAR1
682 VAQ RED1, ADD1H, T0
683 VACCQ RED1H, T0, CAR1M
684 VAQ RED1H, T0, T0
685
686 // << ready for next MK0
687
688 VACQ RED2, ADD2H, CAR1, T1
689 VACCCQ RED2, ADD2H, CAR1, CAR1
690 VACCCQ RED2H, T1, CAR1M, T2
691 VACQ RED2H, T1, CAR1M, T1
692 VAQ CAR1, T2, T2
693
694 // ---------------------------------------------------
695 VREPF $1, Y0, YDIG
696 VMALF X0, YDIG, T0, ADD1
697 VMLF ADD1, K0, MK0
698 VREPF $3, MK0, MK0
699
700 VMALF X1, YDIG, T1, ADD2
701 VMALHF X0, YDIG, T0, ADD1H
702 VMALHF X1, YDIG, T1, ADD2H
703
704 VMALF M0, MK0, ADD1, RED1
705 VMALHF M0, MK0, ADD1, RED1H
706 VMALF M1, MK0, ADD2, RED2
707 VMALHF M1, MK0, ADD2, RED2H
708
709 VSLDB $12, RED2, RED1, RED1
710 VSLDB $12, T2, RED2, RED2
711
712 VACCQ RED1, ADD1H, CAR1
713 VAQ RED1, ADD1H, T0
714 VACCQ RED1H, T0, CAR1M
715 VAQ RED1H, T0, T0
716
717 // << ready for next MK0
718
719 VACQ RED2, ADD2H, CAR1, T1
720 VACCCQ RED2, ADD2H, CAR1, CAR1
721 VACCCQ RED2H, T1, CAR1M, T2
722 VACQ RED2H, T1, CAR1M, T1
723 VAQ CAR1, T2, T2
724
725 // ---------------------------------------------------
726 VREPF $0, Y0, YDIG
727 VMALF X0, YDIG, T0, ADD1
728 VMLF ADD1, K0, MK0
729 VREPF $3, MK0, MK0
730
731 VMALF X1, YDIG, T1, ADD2
732 VMALHF X0, YDIG, T0, ADD1H
733 VMALHF X1, YDIG, T1, ADD2H
734
735 VMALF M0, MK0, ADD1, RED1
736 VMALHF M0, MK0, ADD1, RED1H
737 VMALF M1, MK0, ADD2, RED2
738 VMALHF M1, MK0, ADD2, RED2H
739
740 VSLDB $12, RED2, RED1, RED1
741 VSLDB $12, T2, RED2, RED2
742
743 VACCQ RED1, ADD1H, CAR1
744 VAQ RED1, ADD1H, T0
745 VACCQ RED1H, T0, CAR1M
746 VAQ RED1H, T0, T0
747
748 // << ready for next MK0
749
750 VACQ RED2, ADD2H, CAR1, T1
751 VACCCQ RED2, ADD2H, CAR1, CAR1
752 VACCCQ RED2H, T1, CAR1M, T2
753 VACQ RED2H, T1, CAR1M, T1
754 VAQ CAR1, T2, T2
755
756 // ---------------------------------------------------
757 VREPF $3, Y1, YDIG
758 VMALF X0, YDIG, T0, ADD1
759 VMLF ADD1, K0, MK0
760 VREPF $3, MK0, MK0
761
762 VMALF X1, YDIG, T1, ADD2
763 VMALHF X0, YDIG, T0, ADD1H
764 VMALHF X1, YDIG, T1, ADD2H
765
766 VMALF M0, MK0, ADD1, RED1
767 VMALHF M0, MK0, ADD1, RED1H
768 VMALF M1, MK0, ADD2, RED2
769 VMALHF M1, MK0, ADD2, RED2H
770
771 VSLDB $12, RED2, RED1, RED1
772 VSLDB $12, T2, RED2, RED2
773
774 VACCQ RED1, ADD1H, CAR1
775 VAQ RED1, ADD1H, T0
776 VACCQ RED1H, T0, CAR1M
777 VAQ RED1H, T0, T0
778
779 // << ready for next MK0
780
781 VACQ RED2, ADD2H, CAR1, T1
782 VACCCQ RED2, ADD2H, CAR1, CAR1
783 VACCCQ RED2H, T1, CAR1M, T2
784 VACQ RED2H, T1, CAR1M, T1
785 VAQ CAR1, T2, T2
786
787 // ---------------------------------------------------
788 VREPF $2, Y1, YDIG
789 VMALF X0, YDIG, T0, ADD1
790 VMLF ADD1, K0, MK0
791 VREPF $3, MK0, MK0
792
793 VMALF X1, YDIG, T1, ADD2
794 VMALHF X0, YDIG, T0, ADD1H
795 VMALHF X1, YDIG, T1, ADD2H
796
797 VMALF M0, MK0, ADD1, RED1
798 VMALHF M0, MK0, ADD1, RED1H
799 VMALF M1, MK0, ADD2, RED2
800 VMALHF M1, MK0, ADD2, RED2H
801
802 VSLDB $12, RED2, RED1, RED1
803 VSLDB $12, T2, RED2, RED2
804
805 VACCQ RED1, ADD1H, CAR1
806 VAQ RED1, ADD1H, T0
807 VACCQ RED1H, T0, CAR1M
808 VAQ RED1H, T0, T0
809
810 // << ready for next MK0
811
812 VACQ RED2, ADD2H, CAR1, T1
813 VACCCQ RED2, ADD2H, CAR1, CAR1
814 VACCCQ RED2H, T1, CAR1M, T2
815 VACQ RED2H, T1, CAR1M, T1
816 VAQ CAR1, T2, T2
817
818 // ---------------------------------------------------
819 VREPF $1, Y1, YDIG
820 VMALF X0, YDIG, T0, ADD1
821 VMLF ADD1, K0, MK0
822 VREPF $3, MK0, MK0
823
824 VMALF X1, YDIG, T1, ADD2
825 VMALHF X0, YDIG, T0, ADD1H
826 VMALHF X1, YDIG, T1, ADD2H
827
828 VMALF M0, MK0, ADD1, RED1
829 VMALHF M0, MK0, ADD1, RED1H
830 VMALF M1, MK0, ADD2, RED2
831 VMALHF M1, MK0, ADD2, RED2H
832
833 VSLDB $12, RED2, RED1, RED1
834 VSLDB $12, T2, RED2, RED2
835
836 VACCQ RED1, ADD1H, CAR1
837 VAQ RED1, ADD1H, T0
838 VACCQ RED1H, T0, CAR1M
839 VAQ RED1H, T0, T0
840
841 // << ready for next MK0
842
843 VACQ RED2, ADD2H, CAR1, T1
844 VACCCQ RED2, ADD2H, CAR1, CAR1
845 VACCCQ RED2H, T1, CAR1M, T2
846 VACQ RED2H, T1, CAR1M, T1
847 VAQ CAR1, T2, T2
848
849 // ---------------------------------------------------
850 VREPF $0, Y1, YDIG
851 VMALF X0, YDIG, T0, ADD1
852 VMLF ADD1, K0, MK0
853 VREPF $3, MK0, MK0
854
855 VMALF X1, YDIG, T1, ADD2
856 VMALHF X0, YDIG, T0, ADD1H
857 VMALHF X1, YDIG, T1, ADD2H
858
859 VMALF M0, MK0, ADD1, RED1
860 VMALHF M0, MK0, ADD1, RED1H
861 VMALF M1, MK0, ADD2, RED2
862 VMALHF M1, MK0, ADD2, RED2H
863
864 VSLDB $12, RED2, RED1, RED1
865 VSLDB $12, T2, RED2, RED2
866
867 VACCQ RED1, ADD1H, CAR1
868 VAQ RED1, ADD1H, T0
869 VACCQ RED1H, T0, CAR1M
870 VAQ RED1H, T0, T0
871
872 // << ready for next MK0
873
874 VACQ RED2, ADD2H, CAR1, T1
875 VACCCQ RED2, ADD2H, CAR1, CAR1
876 VACCCQ RED2H, T1, CAR1M, T2
877 VACQ RED2H, T1, CAR1M, T1
878 VAQ CAR1, T2, T2
879
880 // ---------------------------------------------------
881
882 VZERO RED1
883 VSCBIQ M0, T0, CAR1
884 VSQ M0, T0, ADD1
885 VSBCBIQ T1, M1, CAR1, CAR1M
886 VSBIQ T1, M1, CAR1, ADD2
887 VSBIQ T2, RED1, CAR1M, T2
888
889 // what output to use, ADD2||ADD1 or T1||T0?
890 VSEL T0, ADD1, T2, T0
891 VSEL T1, ADD2, T2, T1
892
893 VPDI $0x4, T0, T0, T0
894 VST T0, (0*16)(res_ptr)
895 VPDI $0x4, T1, T1, T1
896 VST T1, (1*16)(res_ptr)
897 RET
898
899#undef res_ptr
900#undef x_ptr
901#undef y_ptr
902#undef X0
903#undef X1
904#undef Y0
905#undef Y1
906#undef M0
907#undef M1
908#undef T0
909#undef T1
910#undef T2
911#undef YDIG
912
913#undef ADD1
914#undef ADD1H
915#undef ADD2
916#undef ADD2H
917#undef RED1
918#undef RED1H
919#undef RED2
920#undef RED2H
921#undef CAR1
922#undef CAR1M
923
924#undef MK0
925#undef K0
926
927// ---------------------------------------
928// p256MulInternal
929// V0-V3,V30,V31 - Not Modified
930// V4-V15 - Volatile
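// Register convention (see the #defines below): inputs in X1||X0 and Y1||Y0,
// the prime in P1||P0, CPOOL pointing at p256mul<>, result left in T1||T0.
// It computes the Montgomery product T = X * Y * 2⁻²⁵⁶ mod p; a big.Int
// sketch of the value left in T1||T0:
//
//	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p)
//	t := new(big.Int).Mod(new(big.Int).Mul(new(big.Int).Mul(x, y), rInv), p)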
931
932#define CPOOL R4
933
934// Parameters
935#define X0 V0 // Not modified
936#define X1 V1 // Not modified
937#define Y0 V2 // Not modified
938#define Y1 V3 // Not modified
939#define T0 V4
940#define T1 V5
941#define P0 V30 // Not modified
942#define P1 V31 // Not modified
943
944// Temporaries
945#define YDIG V6 // Overloaded with CAR2, ZER
946#define ADD1H V7 // Overloaded with ADD3H
947#define ADD2H V8 // Overloaded with ADD4H
948#define ADD3 V9 // Overloaded with SEL2,SEL5
949#define ADD4 V10 // Overloaded with SEL3,SEL6
950#define RED1 V11 // Overloaded with CAR2
951#define RED2 V12
952#define RED3 V13 // Overloaded with SEL1
953#define T2 V14
954// Overloaded temporaries
955#define ADD1 V4 // Overloaded with T0
956#define ADD2 V5 // Overloaded with T1
957#define ADD3H V7 // Overloaded with ADD1H
958#define ADD4H V8 // Overloaded with ADD2H
959#define ZER V6 // Overloaded with YDIG, CAR2
960#define CAR1 V6 // Overloaded with YDIG, ZER
961#define CAR2 V11 // Overloaded with RED1
962// Constant Selects
963#define SEL1 V13 // Overloaded with RED3
964#define SEL2 V9 // Overloaded with ADD3,SEL5
965#define SEL3 V10 // Overloaded with ADD4,SEL6
966#define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
967#define SEL5 V9 // Overloaded with ADD3,SEL2
968#define SEL6 V10 // Overloaded with ADD4,SEL3
969
970/* *
971 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
972 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
973 * With you, SIMD be...
974 *
975 * +--------+--------+
976 * +--------| RED2 | RED1 |
977 * | +--------+--------+
978 * | ---+--------+--------+
979 * | +---- T2| T1 | T0 |--+
980 * | | ---+--------+--------+ |
981 * | | |
982 * | | ======================= |
983 * | | |
984 * | | +--------+--------+<-+
985 * | +-------| ADD2 | ADD1 |--|-----+
986 * | | +--------+--------+ | |
987 * | | +--------+--------+<---+ |
988 * | | | ADD2H | ADD1H |--+ |
989 * | | +--------+--------+ | |
990 * | | +--------+--------+<-+ |
991 * | | | ADD4 | ADD3 |--|-+ |
992 * | | +--------+--------+ | | |
993 * | | +--------+--------+<---+ | |
994 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
995 * | | +--------+--------+ | | V
996 * | | ------------------------ | | +--------+
997 * | | | | | RED3 | [d0 0 0 d0]
998 * | | | | +--------+
999 * | +---->+--------+--------+ | | |
1000 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
1001 * | +--------+--------+ | | |
1002 * +---->---+--------+--------+ | | |
1003 * T2| T1 | T0 |----+ | |
1004 * ---+--------+--------+ | | |
1005 * ---+--------+--------+<---+ | |
1006 * +--- T2| T1 | T0 |----------+
1007 * | ---+--------+--------+ | |
1008 * | +--------+--------+<-------------+
1009 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1010 * | +--------+--------+ | | |
1011 * | +--------+<----------------------+
1012 * | | RED3 |--------------+ | [0 0 d1 d0]
1013 * | +--------+ | |
1014 * +--->+--------+--------+ | |
1015 * | T1 | T0 |--------+
1016 * +--------+--------+ | |
1017 * --------------------------- | |
1018 * | |
1019 * +--------+--------+<----+ |
1020 * | RED2 | RED1 | |
1021 * +--------+--------+ |
1022 * ---+--------+--------+<-------+
1023 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1024 * ---+--------+--------+
1025 *
 * *My 21st-century work of art @vpaprots
1027 *
1028 *
1029 * First group is special, doesn't get the two inputs:
1030 * +--------+--------+<-+
1031 * +-------| ADD2 | ADD1 |--|-----+
1032 * | +--------+--------+ | |
1033 * | +--------+--------+<---+ |
1034 * | | ADD2H | ADD1H |--+ |
1035 * | +--------+--------+ | |
1036 * | +--------+--------+<-+ |
1037 * | | ADD4 | ADD3 |--|-+ |
1038 * | +--------+--------+ | | |
1039 * | +--------+--------+<---+ | |
1040 * | | ADD4H | ADD3H |------|-+ |(+vzero)
1041 * | +--------+--------+ | | V
1042 * | ------------------------ | | +--------+
1043 * | | | | RED3 | [d0 0 0 d0]
1044 * | | | +--------+
1045 * +---->+--------+--------+ | | |
1046 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
1047 * +--------+--------+ | | |
1048 * ---+--------+--------+<---+ | |
1049 * +--- T2| T1 | T0 |----------+
1050 * | ---+--------+--------+ | |
1051 * | +--------+--------+<-------------+
1052 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1053 * | +--------+--------+ | | |
1054 * | +--------+<----------------------+
1055 * | | RED3 |--------------+ | [0 0 d1 d0]
1056 * | +--------+ | |
1057 * +--->+--------+--------+ | |
1058 * | T1 | T0 |--------+
1059 * +--------+--------+ | |
1060 * --------------------------- | |
1061 * | |
1062 * +--------+--------+<----+ |
1063 * | RED2 | RED1 | |
1064 * +--------+--------+ |
1065 * ---+--------+--------+<-------+
1066 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1067 * ---+--------+--------+
1068 *
 * The last 'group' needs RED2||RED1 shifted by less.
1070 */
1071TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
1072 VL 32(CPOOL), SEL1
1073 VL 48(CPOOL), SEL2
1074 VL 64(CPOOL), SEL3
1075 VL 80(CPOOL), SEL4
1076
1077 // ---------------------------------------------------
1078
1079 VREPF $3, Y0, YDIG
1080 VMLHF X0, YDIG, ADD1H
1081 VMLHF X1, YDIG, ADD2H
1082 VMLF X0, YDIG, ADD1
1083 VMLF X1, YDIG, ADD2
1084
1085 VREPF $2, Y0, YDIG
1086 VMALF X0, YDIG, ADD1H, ADD3
1087 VMALF X1, YDIG, ADD2H, ADD4
1088 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1089 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1090
1091 VZERO ZER
1092 VL 32(CPOOL), SEL1
1093 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1094
1095 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1096 VSLDB $12, ZER, ADD2, T1 // ADD2 Free
1097
1098 VACCQ T0, ADD3, CAR1
1099 VAQ T0, ADD3, T0 // ADD3 Free
1100 VACCCQ T1, ADD4, CAR1, T2
1101 VACQ T1, ADD4, CAR1, T1 // ADD4 Free
1102
1103 VL 48(CPOOL), SEL2
1104 VL 64(CPOOL), SEL3
1105 VL 80(CPOOL), SEL4
1106 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1107 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1108 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1109 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1110
1111 VSLDB $12, T1, T0, T0
1112 VSLDB $12, T2, T1, T1
1113
1114 VACCQ T0, ADD3H, CAR1
1115 VAQ T0, ADD3H, T0
1116 VACCCQ T1, ADD4H, CAR1, T2
1117 VACQ T1, ADD4H, CAR1, T1
1118
1119 // ---------------------------------------------------
1120
1121 VREPF $1, Y0, YDIG
1122 VMALHF X0, YDIG, T0, ADD1H
1123 VMALHF X1, YDIG, T1, ADD2H
1124 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
1125 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
1126
1127 VREPF $0, Y0, YDIG
1128 VMALF X0, YDIG, ADD1H, ADD3
1129 VMALF X1, YDIG, ADD2H, ADD4
1130 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
1131 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
1132
1133 VZERO ZER
1134 VL 32(CPOOL), SEL1
1135 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1136
1137 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
1138 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free
1139
1140 VACCQ T0, RED1, CAR1
1141 VAQ T0, RED1, T0
1142 VACCCQ T1, RED2, CAR1, T2
1143 VACQ T1, RED2, CAR1, T1
1144
1145 VACCQ T0, ADD3, CAR1
1146 VAQ T0, ADD3, T0
1147 VACCCQ T1, ADD4, CAR1, CAR2
1148 VACQ T1, ADD4, CAR1, T1
1149 VAQ T2, CAR2, T2
1150
1151 VL 48(CPOOL), SEL2
1152 VL 64(CPOOL), SEL3
1153 VL 80(CPOOL), SEL4
1154 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1155 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1156 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1157 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1158
1159 VSLDB $12, T1, T0, T0
1160 VSLDB $12, T2, T1, T1
1161
1162 VACCQ T0, ADD3H, CAR1
1163 VAQ T0, ADD3H, T0
1164 VACCCQ T1, ADD4H, CAR1, T2
1165 VACQ T1, ADD4H, CAR1, T1
1166
1167 // ---------------------------------------------------
1168
1169 VREPF $3, Y1, YDIG
1170 VMALHF X0, YDIG, T0, ADD1H
1171 VMALHF X1, YDIG, T1, ADD2H
1172 VMALF X0, YDIG, T0, ADD1
1173 VMALF X1, YDIG, T1, ADD2
1174
1175 VREPF $2, Y1, YDIG
1176 VMALF X0, YDIG, ADD1H, ADD3
1177 VMALF X1, YDIG, ADD2H, ADD4
1178 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1179 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1180
1181 VZERO ZER
1182 VL 32(CPOOL), SEL1
1183 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1184
1185 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1186 VSLDB $12, T2, ADD2, T1 // ADD2 Free
1187
1188 VACCQ T0, RED1, CAR1
1189 VAQ T0, RED1, T0
1190 VACCCQ T1, RED2, CAR1, T2
1191 VACQ T1, RED2, CAR1, T1
1192
1193 VACCQ T0, ADD3, CAR1
1194 VAQ T0, ADD3, T0
1195 VACCCQ T1, ADD4, CAR1, CAR2
1196 VACQ T1, ADD4, CAR1, T1
1197 VAQ T2, CAR2, T2
1198
1199 VL 48(CPOOL), SEL2
1200 VL 64(CPOOL), SEL3
1201 VL 80(CPOOL), SEL4
1202 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1203 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1204 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1205 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1206
1207 VSLDB $12, T1, T0, T0
1208 VSLDB $12, T2, T1, T1
1209
1210 VACCQ T0, ADD3H, CAR1
1211 VAQ T0, ADD3H, T0
1212 VACCCQ T1, ADD4H, CAR1, T2
1213 VACQ T1, ADD4H, CAR1, T1
1214
1215 // ---------------------------------------------------
1216
1217 VREPF $1, Y1, YDIG
1218 VMALHF X0, YDIG, T0, ADD1H
1219 VMALHF X1, YDIG, T1, ADD2H
1220 VMALF X0, YDIG, T0, ADD1
1221 VMALF X1, YDIG, T1, ADD2
1222
1223 VREPF $0, Y1, YDIG
1224 VMALF X0, YDIG, ADD1H, ADD3
1225 VMALF X1, YDIG, ADD2H, ADD4
1226 VMALHF X0, YDIG, ADD1H, ADD3H
1227 VMALHF X1, YDIG, ADD2H, ADD4H
1228
1229 VZERO ZER
1230 VL 32(CPOOL), SEL1
1231 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1232
1233 VSLDB $12, ADD2, ADD1, T0
1234 VSLDB $12, T2, ADD2, T1
1235
1236 VACCQ T0, RED1, CAR1
1237 VAQ T0, RED1, T0
1238 VACCCQ T1, RED2, CAR1, T2
1239 VACQ T1, RED2, CAR1, T1
1240
1241 VACCQ T0, ADD3, CAR1
1242 VAQ T0, ADD3, T0
1243 VACCCQ T1, ADD4, CAR1, CAR2
1244 VACQ T1, ADD4, CAR1, T1
1245 VAQ T2, CAR2, T2
1246
1247 VL 96(CPOOL), SEL5
1248 VL 112(CPOOL), SEL6
1249 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
1250 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
1251 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
1252
1253 VSLDB $12, T1, T0, T0
1254 VSLDB $12, T2, T1, T1
1255
1256 VACCQ T0, ADD3H, CAR1
1257 VAQ T0, ADD3H, T0
1258 VACCCQ T1, ADD4H, CAR1, T2
1259 VACQ T1, ADD4H, CAR1, T1
1260
1261 VACCQ T0, RED1, CAR1
1262 VAQ T0, RED1, T0
1263 VACCCQ T1, RED2, CAR1, CAR2
1264 VACQ T1, RED2, CAR1, T1
1265 VAQ T2, CAR2, T2
1266
1267 // ---------------------------------------------------
1268
1269 VZERO RED3
1270 VSCBIQ P0, T0, CAR1
1271 VSQ P0, T0, ADD1H
1272 VSBCBIQ T1, P1, CAR1, CAR2
1273 VSBIQ T1, P1, CAR1, ADD2H
1274 VSBIQ T2, RED3, CAR2, T2
1275
1276 // what output to use, ADD2H||ADD1H or T1||T0?
1277 VSEL T0, ADD1H, T2, T0
1278 VSEL T1, ADD2H, T2, T1
1279 RET
1280
1281#undef CPOOL
1282
1283#undef X0
1284#undef X1
1285#undef Y0
1286#undef Y1
1287#undef T0
1288#undef T1
1289#undef P0
1290#undef P1
1291
1292#undef SEL1
1293#undef SEL2
1294#undef SEL3
1295#undef SEL4
1296#undef SEL5
1297#undef SEL6
1298
1299#undef YDIG
1300#undef ADD1H
1301#undef ADD2H
1302#undef ADD3
1303#undef ADD4
1304#undef RED1
1305#undef RED2
1306#undef RED3
1307#undef T2
1308#undef ADD1
1309#undef ADD2
1310#undef ADD3H
1311#undef ADD4H
1312#undef ZER
1313#undef CAR1
1314#undef CAR2
1315
1316// ---------------------------------------
1317
1318// Parameters
1319#define X0 V0
1320#define X1 V1
1321#define Y0 V2
1322#define Y1 V3
1323
1324TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
1325 VLR X0, Y0
1326 VLR X1, Y1
1327 BR p256MulInternal<>(SB)
1328
1329#undef X0
1330#undef X1
1331#undef Y0
1332#undef Y1
1333
1334#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
1335 VZERO ZER \
1336 VSCBIQ Y0, X0, CAR1 \
1337 VSQ Y0, X0, T0 \
1338 VSBCBIQ X1, Y1, CAR1, SEL1 \
1339 VSBIQ X1, Y1, CAR1, T1 \
1340 VSQ SEL1, ZER, SEL1 \
1341 \
1342 VACCQ T0, PL, CAR1 \
1343 VAQ T0, PL, TT0 \
1344 VACQ T1, PH, CAR1, TT1 \
1345 \
1346 VSEL T0, TT0, SEL1, T0 \
1347 VSEL T1, TT1, SEL1, T1 \
1348
1349#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
1350 VACCQ X0, Y0, CAR1 \
1351 VAQ X0, Y0, T0 \
1352 VACCCQ X1, Y1, CAR1, T2 \
1353 VACQ X1, Y1, CAR1, T1 \
1354 \
1355 VZERO ZER \
1356 VSCBIQ PL, T0, CAR1 \
1357 VSQ PL, T0, TT0 \
1358 VSBCBIQ T1, PH, CAR1, CAR2 \
1359 VSBIQ T1, PH, CAR1, TT1 \
1360 VSBIQ T2, ZER, CAR2, SEL1 \
1361 \
1362 VSEL T0, TT0, SEL1, T0 \
1363 VSEL T1, TT1, SEL1, T1
1364
1365#define p256HalfInternal(T1, T0, X1, X0) \
1366 VZERO ZER \
1367 VSBIQ ZER, ZER, X0, SEL1 \
1368 \
1369 VACCQ X0, PL, CAR1 \
1370 VAQ X0, PL, T0 \
1371 VACCCQ X1, PH, CAR1, T2 \
1372 VACQ X1, PH, CAR1, T1 \
1373 \
1374 VSEL X0, T0, SEL1, T0 \
1375 VSEL X1, T1, SEL1, T1 \
1376 VSEL ZER, T2, SEL1, T2 \
1377 \
1378 VSLDB $15, T2, ZER, TT1 \
1379 VSLDB $15, T1, ZER, TT0 \
1380 VREPIB $1, SEL1 \
1381 VSRL SEL1, T0, T0 \
1382 VSRL SEL1, T1, T1 \
1383 VREPIB $7, SEL1 \
1384 VSL SEL1, TT0, TT0 \
1385 VSL SEL1, TT1, TT1 \
1386 VO T0, TT0, T0 \
1387 VO T1, TT1, T1
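
// The three macros above operate on 256-bit field elements held as H||L
// vector-register pairs, with PL/PH holding the prime; they also use the
// ZER/SEL1/CAR1/CAR2/TT0/TT1/T2 temporaries of the enclosing function.
// In big.Int terms, roughly:
//
//	p256SubInternal:  T = (X - Y) mod p
//	p256AddInternal:  T = (X + Y) mod p
//	p256HalfInternal: T = X / 2 mod p
//
// A sketch of the halving, the least obvious of the three:
//
//	t := new(big.Int).Set(x)
//	if t.Bit(0) == 1 {
//		t.Add(t, p) // make it even without changing it mod p
//	}
//	t.Rsh(t, 1)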
1388
1389// ---------------------------------------
1390// func p256Mul(res, in1, in2 *p256Element)
1391#define res_ptr R1
1392#define x_ptr R2
1393#define y_ptr R3
1394#define CPOOL R4
1395
1396// Parameters
1397#define X0 V0
1398#define X1 V1
1399#define Y0 V2
1400#define Y1 V3
1401#define T0 V4
1402#define T1 V5
1403
1404// Constants
1405#define P0 V30
1406#define P1 V31
1407TEXT ·p256Mul(SB), NOSPLIT, $0
1408 MOVD res+0(FP), res_ptr
1409 MOVD in1+8(FP), x_ptr
1410 MOVD in2+16(FP), y_ptr
1411
1412 VL (0*16)(x_ptr), X0
1413 VPDI $0x4, X0, X0, X0
1414 VL (1*16)(x_ptr), X1
1415 VPDI $0x4, X1, X1, X1
1416 VL (0*16)(y_ptr), Y0
1417 VPDI $0x4, Y0, Y0, Y0
1418 VL (1*16)(y_ptr), Y1
1419 VPDI $0x4, Y1, Y1, Y1
1420
1421 MOVD $p256mul<>+0x00(SB), CPOOL
1422 VL 16(CPOOL), P0
1423 VL 0(CPOOL), P1
1424
1425 CALL p256MulInternal<>(SB)
1426
1427 VPDI $0x4, T0, T0, T0
1428 VST T0, (0*16)(res_ptr)
1429 VPDI $0x4, T1, T1, T1
1430 VST T1, (1*16)(res_ptr)
1431 RET
1432
1433#undef res_ptr
1434#undef x_ptr
1435#undef y_ptr
1436#undef CPOOL
1437
1438#undef X0
1439#undef X1
1440#undef Y0
1441#undef Y1
1442#undef T0
1443#undef T1
1444#undef P0
1445#undef P1
1446
1447// ---------------------------------------
1448// func p256Sqr(res, in *p256Element, n int)
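// Repeated Montgomery squaring: res = in squared n times, staying in the
// Montgomery domain (if in represents a, res represents a^(2^n)). A usage
// sketch mirroring the loop below, with montSqr as a hypothetical helper for
// x*x*2⁻²⁵⁶ mod p:
//
//	x := in
//	for i := 0; i < n; i++ {
//		x = montSqr(x)
//	}
//	res = x
//
// Note the loop below always performs at least one squaring, regardless of n.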
1449#define res_ptr R1
1450#define x_ptr R2
1451#define y_ptr R3
1452#define CPOOL R4
1453#define COUNT R5
1454#define N R6
1455
1456// Parameters
1457#define X0 V0
1458#define X1 V1
1459#define T0 V4
1460#define T1 V5
1461
1462// Constants
1463#define P0 V30
1464#define P1 V31
1465TEXT ·p256Sqr(SB), NOSPLIT, $0
1466 MOVD res+0(FP), res_ptr
1467 MOVD in+8(FP), x_ptr
1468
1469 VL (0*16)(x_ptr), X0
1470 VPDI $0x4, X0, X0, X0
1471 VL (1*16)(x_ptr), X1
1472 VPDI $0x4, X1, X1, X1
1473
1474 MOVD $p256mul<>+0x00(SB), CPOOL
1475 MOVD $0, COUNT
1476 MOVD n+16(FP), N
1477 VL 16(CPOOL), P0
1478 VL 0(CPOOL), P1
1479
1480loop:
1481 CALL p256SqrInternal<>(SB)
1482 VLR T0, X0
1483 VLR T1, X1
1484 ADDW $1, COUNT
1485 CMPW COUNT, N
1486 BLT loop
1487
1488 VPDI $0x4, T0, T0, T0
1489 VST T0, (0*16)(res_ptr)
1490 VPDI $0x4, T1, T1, T1
1491 VST T1, (1*16)(res_ptr)
1492 RET
1493
1494#undef res_ptr
1495#undef x_ptr
1496#undef y_ptr
1497#undef CPOOL
1498#undef COUNT
1499#undef N
1500
1501#undef X0
1502#undef X1
1503#undef T0
1504#undef T1
1505#undef P0
1506#undef P1
1507
1508// Point add with P2 being affine point
1509// If sign == 1 -> P2 = -P2
1510// If sel == 0 -> P3 = P1
1511// if zero == 0 -> P3 = P2
1512// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
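// How the flags combine, as a rough Go-style sketch (add() stands for the
// Hankerson–Menezes–Vanstone mixed addition computed below; it is not a real
// helper in this package):
//
//	if sign == 1 {
//		in2.y = p - in2.y // negate the affine point
//	}
//	res = add(in1, in2) // Jacobian + affine
//	if sel == 0 {
//		res = in1
//	}
//	if zero == 0 {
//		res = in2 // promoted to Jacobian with z = 1 in Montgomery form (p256mul<>+0x80)
//	}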
1513#define P3ptr R1
1514#define P1ptr R2
1515#define P2ptr R3
1516#define CPOOL R4
1517
1518// Temporaries in REGs
1519#define Y2L V15
1520#define Y2H V16
1521#define T1L V17
1522#define T1H V18
1523#define T2L V19
1524#define T2H V20
1525#define T3L V21
1526#define T3H V22
1527#define T4L V23
1528#define T4H V24
1529
1530// Temps for Sub and Add
1531#define TT0 V11
1532#define TT1 V12
1533#define T2 V13
1534
1535// p256MulAsm Parameters
1536#define X0 V0
1537#define X1 V1
1538#define Y0 V2
1539#define Y1 V3
1540#define T0 V4
1541#define T1 V5
1542
1543#define PL V30
1544#define PH V31
1545
1546// Names for zero/sel selects
1547#define X1L V0
1548#define X1H V1
1549#define Y1L V2 // p256MulAsmParmY
1550#define Y1H V3 // p256MulAsmParmY
1551#define Z1L V4
1552#define Z1H V5
1553#define X2L V0
1554#define X2H V1
1555#define Z2L V4
1556#define Z2H V5
1557#define X3L V17 // T1L
1558#define X3H V18 // T1H
1559#define Y3L V21 // T3L
1560#define Y3H V22 // T3H
1561#define Z3L V28
1562#define Z3H V29
1563
1564#define ZER V6
1565#define SEL1 V7
1566#define CAR1 V8
1567#define CAR2 V9
1568/* *
 * Three-operand formula:
1570 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1571 * T1 = Z1²
1572 * T2 = T1*Z1
1573 * T1 = T1*X2
1574 * T2 = T2*Y2
1575 * T1 = T1-X1
1576 * T2 = T2-Y1
1577 * Z3 = Z1*T1
1578 * T3 = T1²
1579 * T4 = T3*T1
1580 * T3 = T3*X1
1581 * T1 = 2*T3
1582 * X3 = T2²
1583 * X3 = X3-T1
1584 * X3 = X3-T4
1585 * T3 = T3-X3
1586 * T3 = T3*T2
1587 * T4 = T4*Y1
1588 * Y3 = T3-T4
1589
 * Three-operand formulas, but with MulInternal X,Y used to store temps
1591X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1592X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1593X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1594X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1595SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1596SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1597X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1598X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1599X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1600X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1601ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1602X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1603SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1604SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1605SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1606X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1607X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1608SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1609
1610 */
1611TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1612 MOVD res+0(FP), P3ptr
1613 MOVD in1+8(FP), P1ptr
1614 MOVD in2+16(FP), P2ptr
1615
1616 MOVD $p256mul<>+0x00(SB), CPOOL
1617 VL 16(CPOOL), PL
1618 VL 0(CPOOL), PH
1619
1620 // if (sign == 1) {
1621 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
1622 // }
1623
1624 VL 48(P2ptr), Y2H
1625 VPDI $0x4, Y2H, Y2H, Y2H
1626 VL 32(P2ptr), Y2L
1627 VPDI $0x4, Y2L, Y2L, Y2L
1628
1629 VLREPG sign+24(FP), SEL1
1630 VZERO ZER
1631 VCEQG SEL1, ZER, SEL1
1632
1633 VSCBIQ Y2L, PL, CAR1
1634 VSQ Y2L, PL, T1L
1635 VSBIQ PH, Y2H, CAR1, T1H
1636
1637 VSEL Y2L, T1L, SEL1, Y2L
1638 VSEL Y2H, T1H, SEL1, Y2H
1639
1640/* *
 * Three-operand formula:
1642 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1643 */
1644 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
1645 VL 80(P1ptr), X1 // Z1H
1646 VPDI $0x4, X1, X1, X1
1647 VL 64(P1ptr), X0 // Z1L
1648 VPDI $0x4, X0, X0, X0
1649 VLR X0, Y0
1650 VLR X1, Y1
1651 CALL p256SqrInternal<>(SB)
1652
1653 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
1654 VLR T0, X0
1655 VLR T1, X1
1656 CALL p256MulInternal<>(SB)
1657 VLR T0, T2L
1658 VLR T1, T2H
1659
1660 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
1661 VL 16(P2ptr), Y1 // X2H
1662 VPDI $0x4, Y1, Y1, Y1
1663 VL 0(P2ptr), Y0 // X2L
1664 VPDI $0x4, Y0, Y0, Y0
1665 CALL p256MulInternal<>(SB)
1666 VLR T0, T1L
1667 VLR T1, T1H
1668
1669 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
1670 VLR T2L, X0
1671 VLR T2H, X1
1672 VLR Y2L, Y0
1673 VLR Y2H, Y1
1674 CALL p256MulInternal<>(SB)
1675
1676 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1677 VL 48(P1ptr), Y1H
1678 VPDI $0x4, Y1H, Y1H, Y1H
1679 VL 32(P1ptr), Y1L
1680 VPDI $0x4, Y1L, Y1L, Y1L
1681 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1682
1683 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1684 VL 16(P1ptr), X1H
1685 VPDI $0x4, X1H, X1H, X1H
1686 VL 0(P1ptr), X1L
1687 VPDI $0x4, X1L, X1L, X1L
1688 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1689
1690 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
1691 VL 80(P1ptr), X1 // Z1H
1692 VPDI $0x4, X1, X1, X1
1693 VL 64(P1ptr), X0 // Z1L
1694 VPDI $0x4, X0, X0, X0
1695 CALL p256MulInternal<>(SB)
1696
1697 // VST T1, 64(P3ptr)
1698 // VST T0, 80(P3ptr)
1699 VLR T0, Z3L
1700 VLR T1, Z3H
1701
1702 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
1703 VLR Y0, X0
1704 VLR Y1, X1
1705 CALL p256SqrInternal<>(SB)
1706 VLR T0, X0
1707 VLR T1, X1
1708
1709 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
1710 CALL p256MulInternal<>(SB)
1711 VLR T0, T4L
1712 VLR T1, T4H
1713
1714 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
1715 VL 16(P1ptr), Y1 // X1H
1716 VPDI $0x4, Y1, Y1, Y1
1717 VL 0(P1ptr), Y0 // X1L
1718 VPDI $0x4, Y0, Y0, Y0
1719 CALL p256MulInternal<>(SB)
1720 VLR T0, T3L
1721 VLR T1, T3H
1722
1723 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1724 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1725
1726 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
1727 VLR T2L, X0
1728 VLR T2H, X1
1729 VLR T2L, Y0
1730 VLR T2H, Y1
1731 CALL p256SqrInternal<>(SB)
1732
1733 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
1734 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1735
1736 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1737 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1738 VLR T0, X3L
1739 VLR T1, X3H
1740
1741 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1742 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1743
1744 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
1745 CALL p256MulInternal<>(SB)
1746 VLR T0, T3L
1747 VLR T1, T3H
1748
1749 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
1750 VLR T4L, X0
1751 VLR T4H, X1
1752 VL 48(P1ptr), Y1 // Y1H
1753 VPDI $0x4, Y1, Y1, Y1
1754 VL 32(P1ptr), Y0 // Y1L
1755 VPDI $0x4, Y0, Y0, Y0
1756 CALL p256MulInternal<>(SB)
1757
1758 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
1759 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1760
1761 // if (sel == 0) {
1762 // copy(P3.x[:], X1)
1763 // copy(P3.y[:], Y1)
1764 // copy(P3.z[:], Z1)
1765 // }
1766
1767 VL 16(P1ptr), X1H
1768 VPDI $0x4, X1H, X1H, X1H
1769 VL 0(P1ptr), X1L
1770 VPDI $0x4, X1L, X1L, X1L
1771
1772 // Y1 already loaded, left over from addition
1773 VL 80(P1ptr), Z1H
1774 VPDI $0x4, Z1H, Z1H, Z1H
1775 VL 64(P1ptr), Z1L
1776 VPDI $0x4, Z1L, Z1L, Z1L
1777
1778 VLREPG sel+32(FP), SEL1
1779 VZERO ZER
1780 VCEQG SEL1, ZER, SEL1
1781
1782 VSEL X1L, X3L, SEL1, X3L
1783 VSEL X1H, X3H, SEL1, X3H
1784 VSEL Y1L, Y3L, SEL1, Y3L
1785 VSEL Y1H, Y3H, SEL1, Y3H
1786 VSEL Z1L, Z3L, SEL1, Z3L
1787 VSEL Z1H, Z3H, SEL1, Z3H
1788
1789 // if (zero == 0) {
1790 // copy(P3.x[:], X2)
1791 // copy(P3.y[:], Y2)
1792 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1793 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
1794 // }
1795 VL 16(P2ptr), X2H
1796 VPDI $0x4, X2H, X2H, X2H
1797 VL 0(P2ptr), X2L
1798 VPDI $0x4, X2L, X2L, X2L
1799
1800 // Y2 already loaded
1801 VL 128(CPOOL), Z2H
1802 VL 144(CPOOL), Z2L
1803
1804 VLREPG zero+40(FP), SEL1
1805 VZERO ZER
1806 VCEQG SEL1, ZER, SEL1
1807
1808 VSEL X2L, X3L, SEL1, X3L
1809 VSEL X2H, X3H, SEL1, X3H
1810 VSEL Y2L, Y3L, SEL1, Y3L
1811 VSEL Y2H, Y3H, SEL1, Y3H
1812 VSEL Z2L, Z3L, SEL1, Z3L
1813 VSEL Z2H, Z3H, SEL1, Z3H
1814
1815 // All done, store out the result!!!
1816 VPDI $0x4, X3H, X3H, X3H
1817 VST X3H, 16(P3ptr)
1818 VPDI $0x4, X3L, X3L, X3L
1819 VST X3L, 0(P3ptr)
1820 VPDI $0x4, Y3H, Y3H, Y3H
1821 VST Y3H, 48(P3ptr)
1822 VPDI $0x4, Y3L, Y3L, Y3L
1823 VST Y3L, 32(P3ptr)
1824 VPDI $0x4, Z3H, Z3H, Z3H
1825 VST Z3H, 80(P3ptr)
1826 VPDI $0x4, Z3L, Z3L, Z3L
1827 VST Z3L, 64(P3ptr)
1828
1829 RET
1830
1831#undef P3ptr
1832#undef P1ptr
1833#undef P2ptr
1834#undef CPOOL
1835
1836#undef Y2L
1837#undef Y2H
1838#undef T1L
1839#undef T1H
1840#undef T2L
1841#undef T2H
1842#undef T3L
1843#undef T3H
1844#undef T4L
1845#undef T4H
1846
1847#undef TT0
1848#undef TT1
1849#undef T2
1850
1851#undef X0
1852#undef X1
1853#undef Y0
1854#undef Y1
1855#undef T0
1856#undef T1
1857
1858#undef PL
1859#undef PH
1860
1861#undef X1L
1862#undef X1H
1863#undef Y1L
1864#undef Y1H
1865#undef Z1L
1866#undef Z1H
1867#undef X2L
1868#undef X2H
1869#undef Z2L
1870#undef Z2H
1871#undef X3L
1872#undef X3H
1873#undef Y3L
1874#undef Y3H
1875#undef Z3L
1876#undef Z3H
1877
1878#undef ZER
1879#undef SEL1
1880#undef CAR1
1881#undef CAR2
1882
1883// func p256PointDoubleAsm(res, in *P256Point)
1884// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1885// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1886// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1887#define P3ptr R1
1888#define P1ptr R2
1889#define CPOOL R4
1890
1891// Temporaries in REGs
1892#define X3L V15
1893#define X3H V16
1894#define Y3L V17
1895#define Y3H V18
1896#define T1L V19
1897#define T1H V20
1898#define T2L V21
1899#define T2H V22
1900#define T3L V23
1901#define T3H V24
1902
1903#define X1L V6
1904#define X1H V7
1905#define Y1L V8
1906#define Y1H V9
1907#define Z1L V10
1908#define Z1H V11
1909
1910// Temps for Sub and Add
1911#define TT0 V11
1912#define TT1 V12
1913#define T2 V13
1914
1915// p256MulAsm Parameters
1916#define X0 V0
1917#define X1 V1
1918#define Y0 V2
1919#define Y1 V3
1920#define T0 V4
1921#define T1 V5
1922
1923#define PL V30
1924#define PH V31
1925
1926#define Z3L V23
1927#define Z3H V24
1928
1929#define ZER V26
1930#define SEL1 V27
1931#define CAR1 V28
1932#define CAR2 V29
1933/*
1934 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1935 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1936 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1937 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1938 * B = 2Y₁
1939 * Z₃ = B×Z₁
1940 * C = B²
1941 * D = C×X₁
1942 * X₃ = A²-2D
1943 * Y₃ = (D-X₃)×A-C²/2
1944 *
1945 * Three-operand formula:
1946 * T1 = Z1²
1947 * T2 = X1-T1
1948 * T1 = X1+T1
1949 * T2 = T2*T1
1950 * T2 = 3*T2
1951 * Y3 = 2*Y1
1952 * Z3 = Y3*Z1
1953 * Y3 = Y3²
1954 * T3 = Y3*X1
1955 * Y3 = Y3²
1956 * Y3 = half*Y3
1957 * X3 = T2²
1958 * T1 = 2*T3
1959 * X3 = X3-T1
1960 * T1 = T3-X3
1961 * T1 = T1*T2
1962 * Y3 = T1-Y3
1963 */
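
// The same sequence in big.Int form, useful for checking the algebra. This is
// only a sketch: sq/mul/add/sub/half/triple are hypothetical helpers working
// mod p, and the real code keeps everything in the Montgomery domain.
//
//	t1 := sq(z1)                               // T1 = Z1²
//	a := triple(mul(sub(x1, t1), add(x1, t1))) // A = 3*(X1-T1)*(X1+T1)
//	b := add(y1, y1)                           // B = 2*Y1
//	z3 := mul(b, z1)                           // Z3 = B*Z1
//	c := sq(b)                                 // C = B²
//	d := mul(c, x1)                            // D = C*X1
//	x3 := sub(sq(a), add(d, d))                // X3 = A² - 2*D
//	y3 := sub(mul(sub(d, x3), a), half(sq(c))) // Y3 = (D-X3)*A - C²/2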
1964
1965TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
1966 MOVD res+0(FP), P3ptr
1967 MOVD in+8(FP), P1ptr
1968
1969 MOVD $p256mul<>+0x00(SB), CPOOL
1970 VL 16(CPOOL), PL
1971 VL 0(CPOOL), PH
1972
1973 // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
1974 VL 80(P1ptr), X1 // Z1H
1975 VPDI $0x4, X1, X1, X1
1976 VL 64(P1ptr), X0 // Z1L
1977 VPDI $0x4, X0, X0, X0
1978 VLR X0, Y0
1979 VLR X1, Y1
1980 CALL p256SqrInternal<>(SB)
1981
1982 // SUB(X<X1-T) // T2 = X1-T1
1983 VL 16(P1ptr), X1H
1984 VPDI $0x4, X1H, X1H, X1H
1985 VL 0(P1ptr), X1L
1986 VPDI $0x4, X1L, X1L, X1L
1987 p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1988
1989 // ADD(Y<X1+T) // T1 = X1+T1
1990 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1991
1992 // X- ; Y- ; MUL; T- // T2 = T2*T1
1993 CALL p256MulInternal<>(SB)
1994
1995 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
1996 p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1997 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1998
1999 // ADD(X<Y1+Y1) // Y3 = 2*Y1
2000 VL 48(P1ptr), Y1H
2001 VPDI $0x4, Y1H, Y1H, Y1H
2002 VL 32(P1ptr), Y1L
2003 VPDI $0x4, Y1L, Y1L, Y1L
2004 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
2005
2006 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
2007 VL 80(P1ptr), Y1 // Z1H
2008 VPDI $0x4, Y1, Y1, Y1
2009 VL 64(P1ptr), Y0 // Z1L
2010 VPDI $0x4, Y0, Y0, Y0
2011 CALL p256MulInternal<>(SB)
2012 VPDI $0x4, T1, T1, TT1
2013 VST TT1, 80(P3ptr)
2014 VPDI $0x4, T0, T0, TT0
2015 VST TT0, 64(P3ptr)
2016
2017 // X- ; Y=X ; MUL; T- // Y3 = Y3²
2018 VLR X0, Y0
2019 VLR X1, Y1
2020 CALL p256SqrInternal<>(SB)
2021
2022 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
2023 VLR T0, X0
2024 VLR T1, X1
2025 VL 16(P1ptr), Y1
2026 VPDI $0x4, Y1, Y1, Y1
2027 VL 0(P1ptr), Y0
2028 VPDI $0x4, Y0, Y0, Y0
2029 CALL p256MulInternal<>(SB)
2030 VLR T0, T3L
2031 VLR T1, T3H
2032
2033 // X- ; Y=X ; MUL; T- // Y3 = Y3²
2034 VLR X0, Y0
2035 VLR X1, Y1
2036 CALL p256SqrInternal<>(SB)
2037
2038 // HAL(Y3<T) // Y3 = half*Y3
2039 p256HalfInternal(Y3H,Y3L, T1,T0)
2040
2041 // X=T2; Y=T2; MUL; T- // X3 = T2²
2042 VLR T2L, X0
2043 VLR T2H, X1
2044 VLR T2L, Y0
2045 VLR T2H, Y1
2046 CALL p256SqrInternal<>(SB)
2047
2048 // ADD(T1<T3+T3) // T1 = 2*T3
2049 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
2050
2051 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
2052 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
2053 VPDI $0x4, X3H, X3H, TT1
2054 VST TT1, 16(P3ptr)
2055 VPDI $0x4, X3L, X3L, TT0
2056 VST TT0, 0(P3ptr)
2057
2058 // SUB(X<T3-X3) // T1 = T3-X3
2059 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
2060
2061 // X- ; Y- ; MUL; T- // T1 = T1*T2
2062 CALL p256MulInternal<>(SB)
2063
2064 // SUB(Y3<T-Y3) // Y3 = T1-Y3
2065 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
2066
2067 VPDI $0x4, Y3H, Y3H, Y3H
2068 VST Y3H, 48(P3ptr)
2069 VPDI $0x4, Y3L, Y3L, Y3L
2070 VST Y3L, 32(P3ptr)
2071 RET
2072
2073#undef P3ptr
2074#undef P1ptr
2075#undef CPOOL
2076#undef X3L
2077#undef X3H
2078#undef Y3L
2079#undef Y3H
2080#undef T1L
2081#undef T1H
2082#undef T2L
2083#undef T2H
2084#undef T3L
2085#undef T3H
2086#undef X1L
2087#undef X1H
2088#undef Y1L
2089#undef Y1H
2090#undef Z1L
2091#undef Z1H
2092#undef TT0
2093#undef TT1
2094#undef T2
2095#undef X0
2096#undef X1
2097#undef Y0
2098#undef Y1
2099#undef T0
2100#undef T1
2101#undef PL
2102#undef PH
2103#undef Z3L
2104#undef Z3H
2105#undef ZER
2106#undef SEL1
2107#undef CAR1
2108#undef CAR2
2109
2110// func p256PointAddAsm(res, in1, in2 *P256Point) int
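// The return value is 1 when the two inputs have the same x and y coordinates
// (detected below as H == 0 or H == P, and R == 0 or R == P), in which case
// the point written to res is not valid and the caller must double instead.
// A rough usage sketch (the real caller must also handle inputs at infinity,
// which this ignores):
//
//	if p256PointAddAsm(res, p1, p2) == 1 {
//		p256PointDoubleAsm(res, p1)
//	}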
2111#define P3ptr R1
2112#define P1ptr R2
2113#define P2ptr R3
2114#define CPOOL R4
2115#define ISZERO R5
2116#define TRUE R6
2117
2118// Temporaries in REGs
2119#define T1L V16
2120#define T1H V17
2121#define T2L V18
2122#define T2H V19
2123#define U1L V20
2124#define U1H V21
2125#define S1L V22
2126#define S1H V23
2127#define HL V24
2128#define HH V25
2129#define RL V26
2130#define RH V27
2131
2132// Temps for Sub and Add
2133#define ZER V6
2134#define SEL1 V7
2135#define CAR1 V8
2136#define CAR2 V9
2137#define TT0 V11
2138#define TT1 V12
2139#define T2 V13
2140
2141// p256MulAsm Parameters
2142#define X0 V0
2143#define X1 V1
2144#define Y0 V2
2145#define Y1 V3
2146#define T0 V4
2147#define T1 V5
2148
2149#define PL V30
2150#define PH V31
2151/*
2152 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
2153 *
2154 * A = X₁×Z₂²
2155 * B = Y₁×Z₂³
2156 * C = X₂×Z₁²-A
2157 * D = Y₂×Z₁³-B
2158 * X₃ = D² - 2A×C² - C³
2159 * Y₃ = D×(A×C² - X₃) - B×C³
2160 * Z₃ = Z₁×Z₂×C
2161 *
2162 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
2163 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
2164 *
2165 * T1 = Z1*Z1
2166 * T2 = Z2*Z2
2167 * U1 = X1*T2
2168 * H = X2*T1
2169 * H = H-U1
2170 * Z3 = Z1*Z2
2171 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2172 *
2173 * S1 = Z2*T2
2174 * S1 = Y1*S1
2175 * R = Z1*T1
2176 * R = Y2*R
2177 * R = R-S1
2178 *
2179 * T1 = H*H
2180 * T2 = H*T1
2181 * U1 = U1*T1
2182 *
2183 * X3 = R*R
2184 * X3 = X3-T2
2185 * T1 = 2*U1
2186 * X3 = X3-T1 << store-out X3 result reg
2187 *
2188 * T2 = S1*T2
2189 * Y3 = U1-X3
2190 * Y3 = R*Y3
2191 * Y3 = Y3-T2 << store-out Y3 result reg
2192
2193 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2194 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2195 // X=X2; Y- ; MUL; H=T // H = X2*T1
2196 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2197 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2198 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2199 // SUB(H<H-T) // H = H-U1
2200 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2201 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2202 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2203 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2204 // SUB(R<T-S1) // R = R-S1
2205 // X=H ; Y=H ; MUL; T- // T1 = H*H
2206 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2207 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2208 // X=R ; Y=R ; MUL; T- // X3 = R*R
2209 // SUB(T<T-T2) // X3 = X3-T2
2210 // ADD(X<U1+U1) // T1 = 2*U1
2211 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2212 // SUB(Y<U1-T) // Y3 = U1-X3
2213 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2214 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2215 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2216 */
2217TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
2218 MOVD res+0(FP), P3ptr
2219 MOVD in1+8(FP), P1ptr
2220 MOVD in2+16(FP), P2ptr
2221
2222 MOVD $p256mul<>+0x00(SB), CPOOL
2223 VL 16(CPOOL), PL
2224 VL 0(CPOOL), PH
2225
2226 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2227 VL 80(P1ptr), X1 // Z1H
2228 VPDI $0x4, X1, X1, X1
2229 VL 64(P1ptr), X0 // Z1L
2230 VPDI $0x4, X0, X0, X0
2231 VLR X0, Y0
2232 VLR X1, Y1
2233 CALL p256SqrInternal<>(SB)
2234
2235 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2236 VLR T0, Y0
2237 VLR T1, Y1
2238 CALL p256MulInternal<>(SB)
2239 VLR T0, RL
2240 VLR T1, RH
2241
2242 // X=X2; Y- ; MUL; H=T // H = X2*T1
2243 VL 16(P2ptr), X1 // X2H
2244 VPDI $0x4, X1, X1, X1
2245 VL 0(P2ptr), X0 // X2L
2246 VPDI $0x4, X0, X0, X0
2247 CALL p256MulInternal<>(SB)
2248 VLR T0, HL
2249 VLR T1, HH
2250
2251 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2252 VL 80(P2ptr), X1 // Z2H
2253 VPDI $0x4, X1, X1, X1
2254 VL 64(P2ptr), X0 // Z2L
2255 VPDI $0x4, X0, X0, X0
2256 VLR X0, Y0
2257 VLR X1, Y1
2258 CALL p256SqrInternal<>(SB)
2259
2260 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2261 VLR T0, Y0
2262 VLR T1, Y1
2263 CALL p256MulInternal<>(SB)
2264 VLR T0, S1L
2265 VLR T1, S1H
2266
2267 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2268 VL 16(P1ptr), X1 // X1H
2269 VPDI $0x4, X1, X1, X1
2270 VL 0(P1ptr), X0 // X1L
2271 VPDI $0x4, X0, X0, X0
2272 CALL p256MulInternal<>(SB)
2273 VLR T0, U1L
2274 VLR T1, U1H
2275
2276 // SUB(H<H-T) // H = H-U1
2277 p256SubInternal(HH,HL,HH,HL,T1,T0)
2278
2279 // if H == 0 or H^P == 0 then ret=1 else ret=0
2280 // clobbers T1H and T1L
2281 MOVD $0, ISZERO
2282 MOVD $1, TRUE
2283 VZERO ZER
2284 VO HL, HH, T1H
2285 VCEQGS ZER, T1H, T1H
2286 MOVDEQ TRUE, ISZERO
2287 VX HL, PL, T1L
2288 VX HH, PH, T1H
2289 VO T1L, T1H, T1H
2290 VCEQGS ZER, T1H, T1H
2291 MOVDEQ TRUE, ISZERO
2292 MOVD ISZERO, ret+24(FP)
2293
2294 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2295 VL 80(P1ptr), X1 // Z1H
2296 VPDI $0x4, X1, X1, X1
2297 VL 64(P1ptr), X0 // Z1L
2298 VPDI $0x4, X0, X0, X0
2299 VL 80(P2ptr), Y1 // Z2H
2300 VPDI $0x4, Y1, Y1, Y1
2301 VL 64(P2ptr), Y0 // Z2L
2302 VPDI $0x4, Y0, Y0, Y0
2303 CALL p256MulInternal<>(SB)
2304
2305 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
2306 VLR T0, X0
2307 VLR T1, X1
2308 VLR HL, Y0
2309 VLR HH, Y1
2310 CALL p256MulInternal<>(SB)
2311 VPDI $0x4, T1, T1, TT1
2312 VST TT1, 80(P3ptr)
2313 VPDI $0x4, T0, T0, TT0
2314 VST TT0, 64(P3ptr)
2315
2316 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2317 VL 48(P1ptr), X1
2318 VPDI $0x4, X1, X1, X1
2319 VL 32(P1ptr), X0
2320 VPDI $0x4, X0, X0, X0
2321 VLR S1L, Y0
2322 VLR S1H, Y1
2323 CALL p256MulInternal<>(SB)
2324 VLR T0, S1L
2325 VLR T1, S1H
2326
2327 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2328 VL 48(P2ptr), X1
2329 VPDI $0x4, X1, X1, X1
2330 VL 32(P2ptr), X0
2331 VPDI $0x4, X0, X0, X0
2332 VLR RL, Y0
2333 VLR RH, Y1
2334 CALL p256MulInternal<>(SB)
2335
2336 // SUB(R<T-S1) // R = T-S1
2337 p256SubInternal(RH,RL,T1,T0,S1H,S1L)
2338
2339 // if R == 0 or R^P == 0 then ret=ret else ret=0
2340 // clobbers T1H and T1L
2341 MOVD $0, ISZERO
2342 MOVD $1, TRUE
2343 VZERO ZER
2344 VO RL, RH, T1H
2345 VCEQGS ZER, T1H, T1H
2346 MOVDEQ TRUE, ISZERO
2347 VX RL, PL, T1L
2348 VX RH, PH, T1H
2349 VO T1L, T1H, T1H
2350 VCEQGS ZER, T1H, T1H
2351 MOVDEQ TRUE, ISZERO
2352 AND ret+24(FP), ISZERO
2353 MOVD ISZERO, ret+24(FP)
2354
2355 // X=H ; Y=H ; MUL; T- // T1 = H*H
2356 VLR HL, X0
2357 VLR HH, X1
2358 VLR HL, Y0
2359 VLR HH, Y1
2360 CALL p256SqrInternal<>(SB)
2361
2362 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2363 VLR T0, Y0
2364 VLR T1, Y1
2365 CALL p256MulInternal<>(SB)
2366 VLR T0, T2L
2367 VLR T1, T2H
2368
2369 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2370 VLR U1L, X0
2371 VLR U1H, X1
2372 CALL p256MulInternal<>(SB)
2373 VLR T0, U1L
2374 VLR T1, U1H
2375
2376 // X=R ; Y=R ; MUL; T- // X3 = R*R
2377 VLR RL, X0
2378 VLR RH, X1
2379 VLR RL, Y0
2380 VLR RH, Y1
2381 CALL p256SqrInternal<>(SB)
2382
2383 // SUB(T<T-T2) // X3 = X3-T2
2384 p256SubInternal(T1,T0,T1,T0,T2H,T2L)
2385
2386 // ADD(X<U1+U1) // T1 = 2*U1
2387 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
2388
2389 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2390 p256SubInternal(T1,T0,T1,T0,X1,X0)
2391 VPDI $0x4, T1, T1, TT1
2392 VST TT1, 16(P3ptr)
2393 VPDI $0x4, T0, T0, TT0
2394 VST TT0, 0(P3ptr)
2395
2396 // SUB(Y<U1-T) // Y3 = U1-X3
2397 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
2398
2399 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2400 VLR RL, X0
2401 VLR RH, X1
2402 CALL p256MulInternal<>(SB)
2403 VLR T0, U1L
2404 VLR T1, U1H
2405
2406 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2407 VLR S1L, X0
2408 VLR S1H, X1
2409 VLR T2L, Y0
2410 VLR T2H, Y1
2411 CALL p256MulInternal<>(SB)
2412
2413 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2414 p256SubInternal(T1,T0,U1H,U1L,T1,T0)
2415 VPDI $0x4, T1, T1, T1
2416 VST T1, 48(P3ptr)
2417 VPDI $0x4, T0, T0, T0
2418 VST T0, 32(P3ptr)
2419
2420 RET