...

Text file src/crypto/internal/nistec/p256_asm_arm64.s

Documentation: crypto/internal/nistec

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7// This file contains constant-time, 64-bit assembly implementation of
     8// P256. The optimizations performed here are described in detail in:
     9// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10//                          256-bit primes"
    11// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12// https://eprint.iacr.org/2013/816.pdf
    13
    14#include "textflag.h"
    15
      // Register aliases shared by every routine in this file.
      // res_ptr/a_ptr/b_ptr receive the pointer arguments of the exported
      // functions; b_ptr also carries the repeat count n in p256Sqr/p256OrdSqr.
    16#define res_ptr R0
    17#define a_ptr R1
    18#define b_ptr R2
    19
      // acc0-acc3: low four limbs of the working accumulator.
    20#define acc0 R3
    21#define acc1 R4
    22#define acc2 R5
    23#define acc3 R6
    24
      // acc4-acc7: high four limbs (bits [511:256] of a product).
      // t0-t3: scratch. const0/const1: hold p256const0/p256const1 in the field
      // routines and the two low limbs of p256ord in the Ord routines.
    25#define acc4 R7
    26#define acc5 R8
    27#define acc6 R9
    28#define acc7 R10
    29#define t0 R11
    30#define t1 R12
    31#define t2 R13
    32#define t3 R14
    33#define const0 R15
    34#define const1 R16
    35
      // hlp0 is extra scratch. hlp1 is the SAME register as res_ptr (R0), so a
      // routine that uses hlp1 must reload res from its frame before storing.
    36#define hlp0 R17
    37#define hlp1 res_ptr
    38
      // x0-x3 / y0-y3: the two 256-bit operands of the p256*Internal routines.
      // p256MulInternal and p256SqrInternal return in y; p256SubInternal in x.
    39#define x0 R19
    40#define x1 R20
    41#define x2 R21
    42#define x3 R22
    43#define y0 R23
    44#define y1 R24
    45#define y2 R25
    46#define y3 R26
    47
      // const2/const3 alias t2/t3; the Ord routines load the two high limbs of
      // p256ord into them, so t2/t3 are not free as scratch there.
    48#define const2 t2
    49#define const3 t3
    50
      // Read-only constants, each declared next to its GLOBL. Multi-limb values
      // are stored least-significant limb first.
      //   p256const0, p256const1: limbs 1 and 3 of the field prime p
      //                           (limb 0 is all ones, limb 2 is zero).
      //   p256ordK0:              per-limb multiplier used by the reduction
      //                           steps of p256OrdSqr and p256OrdMul.
      //   p256ord:                the modulus used by the Ord routines.
      //   p256one:                2^256 - p, stored as z3 when the result is in2.
    51DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    52GLOBL p256const0<>(SB), RODATA, $8
    53DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
    54GLOBL p256const1<>(SB), RODATA, $8
    55DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    56GLOBL p256ordK0<>(SB), RODATA, $8
    57DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    58DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    59DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    60DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    61GLOBL p256ord<>(SB), RODATA, $32
    62DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    63DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    64DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    65DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    66GLOBL p256one<>(SB), RODATA, $32
    67
    68/* ---------------------------------------*/
    69// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
      // Branches straight to p256BigToLittle, which reads the same res/in frame.
    70TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
    71	B	·p256BigToLittle(SB)
    72/* ---------------------------------------*/
    73// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
      // Branches straight to p256BigToLittle, which reads the same res/in frame.
    74TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
    75	B	·p256BigToLittle(SB)
    76/* ---------------------------------------*/
    77// func p256LittleToBig(res *[32]byte, in *p256Element)
      // Branches straight to p256BigToLittle, which reads the same res/in frame.
    78TEXT ·p256LittleToBig(SB), NOSPLIT, $0
    79	B	·p256BigToLittle(SB)
    80/* ---------------------------------------*/
    81// func p256BigToLittle(res *p256Element, in *[32]byte)
      // Reverses all 32 bytes of in into res: each 64-bit word is byte-swapped
      // and the four words are written in the opposite order. Every load is
      // issued before the first store.
    82TEXT ·p256BigToLittle(SB), NOSPLIT, $0
    83	MOVD	in+8(FP), a_ptr
    84	MOVD	res+0(FP), res_ptr
    85
    86	LDP	1*16(a_ptr), (acc2, acc3)
    87	LDP	0*16(a_ptr), (acc0, acc1)
    88
    89	REV	acc3, t0	// last input word becomes the first output word
    90	REV	acc2, t1
    91	REV	acc1, t2
    92	REV	acc0, t3
    93
    94	STP	(t0, t1), 0*16(res_ptr)
    95	STP	(t2, t3), 1*16(res_ptr)
    96	RET
    97/* ---------------------------------------*/
    98// func p256MovCond(res, a, b *P256Point, cond int)
    99// If cond == 0 res=b, else res=a
      // Copies 96 bytes in two 48-byte halves. Both sources are always read and
      // the choice is made with CSEL, so there is no branch on cond.
   100TEXT ·p256MovCond(SB), NOSPLIT, $0
   101	MOVD	cond+24(FP), R3
   102	MOVD	b+16(FP), b_ptr
   103	MOVD	a+8(FP), a_ptr
   104	MOVD	res+0(FP), res_ptr
   105
   106	// Two remarks:
   107	// 1) Will want to revisit NEON, when support is better
   108	// 2) CSEL might not be constant time on all ARM processors
   109	CMP	$0, R3	// the loads and stores below leave these flags intact
   110	LDP	0*16(b_ptr), (R16, R17)
   111	LDP	1*16(b_ptr), (R19, R20)
   112	LDP	2*16(b_ptr), (R21, R22)
   113	LDP	0*16(a_ptr), (R4, R5)
   114	LDP	1*16(a_ptr), (R6, R7)
   115	LDP	2*16(a_ptr), (R8, R9)
   116	CSEL	NE, R4, R16, R16
   117	CSEL	NE, R5, R17, R17
   118	CSEL	NE, R6, R19, R19
   119	CSEL	NE, R7, R20, R20
   120	CSEL	NE, R8, R21, R21
   121	CSEL	NE, R9, R22, R22
   122	STP	(R16, R17), 0*16(res_ptr)
   123	STP	(R19, R20), 1*16(res_ptr)
   124	STP	(R21, R22), 2*16(res_ptr)
   125
   126	LDP	3*16(b_ptr), (R16, R17)
   127	LDP	4*16(b_ptr), (R19, R20)
   128	LDP	5*16(b_ptr), (R21, R22)
   129	LDP	3*16(a_ptr), (R4, R5)
   130	LDP	4*16(a_ptr), (R6, R7)
   131	LDP	5*16(a_ptr), (R8, R9)
   132	CSEL	NE, R4, R16, R16
   133	CSEL	NE, R5, R17, R17
   134	CSEL	NE, R6, R19, R19
   135	CSEL	NE, R7, R20, R20
   136	CSEL	NE, R8, R21, R21
   137	CSEL	NE, R9, R22, R22
   138	STP	(R16, R17), 3*16(res_ptr)
   139	STP	(R19, R20), 4*16(res_ptr)
   140	STP	(R21, R22), 5*16(res_ptr)
   141
   142	RET
   143/* ---------------------------------------*/
   144// func p256NegCond(val *p256Element, cond int)
      // Replaces *val with p - *val when cond != 0 and leaves it unchanged when
      // cond == 0. The difference is always computed and the choice is made with
      // CSEL. The difference is not reduced afterwards, so *val == 0 yields p.
   145TEXT ·p256NegCond(SB), NOSPLIT, $0
   146	MOVD	cond+8(FP), hlp0
   147	MOVD	val+0(FP), a_ptr
   148	MOVD	val+0(FP), res_ptr
   149	// Fetch the current value
   150	LDP	0*16(a_ptr), (t0, t1)
   151	LDP	1*16(a_ptr), (t2, t3)
   152	// acc = p (limbs: -1, p256const0, 0, p256const1)
   153	MOVD	$-1, acc0
   154	MOVD	p256const0<>(SB), acc1
   155	MOVD	ZR, acc2
   156	MOVD	p256const1<>(SB), acc3
   157	// acc = p - val, computed regardless of cond
   158	SUBS	t0, acc0, acc0
   159	SBCS	t1, acc1, acc1
   160	SBCS	t2, acc2, acc2
   161	SBC	t3, acc3, acc3
   162	// Keep the difference only when cond != 0
   163	CMP	$0, hlp0
   164	CSEL	NE, acc0, t0, acc0
   165	CSEL	NE, acc1, t1, acc1
   166	CSEL	NE, acc2, t2, acc2
   167	CSEL	NE, acc3, t3, acc3
   168	// Write back in place
   169	STP	(acc0, acc1), 0*16(res_ptr)
   170	STP	(acc2, acc3), 1*16(res_ptr)
   171
   172	RET
   173/* ---------------------------------------*/
   174// func p256Sqr(res, in *p256Element, n int)
      // Applies p256SqrInternal to in n times and stores the result in res.
      // The loop body runs before n is tested, so n must be at least 1.
   175TEXT ·p256Sqr(SB), NOSPLIT, $0
   176	MOVD	n+16(FP), b_ptr
   177	MOVD	in+8(FP), a_ptr
   178	MOVD	res+0(FP), res_ptr
   179
   180	LDP	0*16(a_ptr), (x0, x1)
   181	LDP	1*16(a_ptr), (x2, x3)
   182
   183	MOVD	p256const1<>(SB), const1
   184	MOVD	p256const0<>(SB), const0
   185
   186sqrLoop:
   187	CALL	p256SqrInternal<>(SB)	// y = x^2; does not touch b_ptr
   188	SUB	$1, b_ptr
   189	MOVD	y0, x0	// feed the square back in as the next input
   190	MOVD	y1, x1
   191	MOVD	y2, x2
   192	MOVD	y3, x3
   193	CBNZ	b_ptr, sqrLoop
   194
   195	STP	(x0, x1), 0*16(res_ptr)	// x equals y here
   196	STP	(x2, x3), 1*16(res_ptr)
   197	RET
   198/* ---------------------------------------*/
   199// func p256Mul(res, in1, in2 *p256Element)
      // Loads in1 into x and in2 into y, calls p256MulInternal, and stores the
      // product it leaves in y to res.
   200TEXT ·p256Mul(SB), NOSPLIT, $0
   201	MOVD	in2+16(FP), b_ptr
   202	MOVD	in1+8(FP), a_ptr
   203	MOVD	res+0(FP), res_ptr
   204
   205	LDP	0*16(b_ptr), (y0, y1)
   206	LDP	1*16(b_ptr), (y2, y3)
   207
   208	LDP	0*16(a_ptr), (x0, x1)
   209	LDP	1*16(a_ptr), (x2, x3)
   210
   211	MOVD	p256const1<>(SB), const1
   212	MOVD	p256const0<>(SB), const0
   213
   214	CALL	p256MulInternal<>(SB)
   215
   216	STP	(y0, y1), 0*16(res_ptr)
   217	STP	(y2, y3), 1*16(res_ptr)
   218	RET
   219/* ---------------------------------------*/
   220// func p256FromMont(res, in *p256Element)
      // Runs the four word-wise reduction steps on the 256-bit input without
      // any preceding multiplication, then subtracts p once if the result is
      // not below p, and stores the four limbs to res.
      //
      // Each step adds (lowest limb) * p to the accumulator. Because
      // p + 1 = 2^96 + p256const1 * 2^192, the lowest limb cancels and only
      // three shifted terms are added: limb<<32, limb>>32 and limb*const1.
      // The register that held the cancelled limb is reused as the new top limb.
   221TEXT ·p256FromMont(SB),NOSPLIT,$0
   222	MOVD	res+0(FP), res_ptr
   223	MOVD	in+8(FP), a_ptr
   224
   225	MOVD	p256const0<>(SB), const0
   226	MOVD	p256const1<>(SB), const1
   227
   228	LDP	0*16(a_ptr), (acc0, acc1)
   229	LDP	1*16(a_ptr), (acc2, acc3)
   230	// Only reduce, no multiplications are needed
   231	// First reduction step
   232	ADDS	acc0<<32, acc1, acc1
   233	LSR	$32, acc0, t0
   234	MUL	acc0, const1, t1
   235	UMULH	acc0, const1, acc0	// acc0 is now the top limb
   236	ADCS	t0, acc2
   237	ADCS	t1, acc3
   238	ADC	$0, acc0
   239	// Second reduction step
   240	ADDS	acc1<<32, acc2, acc2
   241	LSR	$32, acc1, t0
   242	MUL	acc1, const1, t1
   243	UMULH	acc1, const1, acc1
   244	ADCS	t0, acc3
   245	ADCS	t1, acc0
   246	ADC	$0, acc1
   247	// Third reduction step
   248	ADDS	acc2<<32, acc3, acc3
   249	LSR	$32, acc2, t0
   250	MUL	acc2, const1, t1
   251	UMULH	acc2, const1, acc2
   252	ADCS	t0, acc0
   253	ADCS	t1, acc1
   254	ADC	$0, acc2
   255	// Last reduction step
   256	ADDS	acc3<<32, acc0, acc0
   257	LSR	$32, acc3, t0
   258	MUL	acc3, const1, t1
   259	UMULH	acc3, const1, acc3
   260	ADCS	t0, acc1
   261	ADCS	t1, acc2
   262	ADC	$0, acc3
   263
      	// t = acc - p; the limbs of p are -1, const0, 0, const1
   264	SUBS	$-1, acc0, t0
   265	SBCS	const0, acc1, t1
   266	SBCS	$0, acc2, t2
   267	SBCS	const1, acc3, t3
   268
      	// Carry set means no borrow (acc >= p): keep the subtracted value
   269	CSEL	CS, t0, acc0, acc0
   270	CSEL	CS, t1, acc1, acc1
   271	CSEL	CS, t2, acc2, acc2
   272	CSEL	CS, t3, acc3, acc3
   273
   274	STP	(acc0, acc1), 0*16(res_ptr)
   275	STP	(acc2, acc3), 1*16(res_ptr)
   276
   277	RET
   278/* ---------------------------------------*/
   279// func p256Select(res *P256Point, table *p256Table, idx int)
      // Walks all 16 table entries of 96 bytes each and keeps, via CSEL, the one
      // whose 1-based position equals idx. Every entry is loaded whatever idx
      // is. If idx matches no position (e.g. 0), res is written as all zeros.
   280TEXT ·p256Select(SB), NOSPLIT, $0
   281	MOVD	res+0(FP), res_ptr
   282	MOVD	table+8(FP), b_ptr
   283	MOVD	idx+16(FP), const0
   284
   285	MOVD	ZR, x0
   286	MOVD	ZR, x1
   287	MOVD	ZR, x2
   288	MOVD	ZR, x3
   289	MOVD	ZR, y0
   290	MOVD	ZR, y1
   291	MOVD	ZR, y2
   292	MOVD	ZR, y3
   293	MOVD	ZR, t0
   294	MOVD	ZR, t1
   295	MOVD	ZR, t2
   296	MOVD	ZR, t3
   297
   298	MOVD	ZR, const1	// const1 = position of the current entry
   299
   300selectLoop:
   301		ADD	$1, const1
   302		CMP	const1, const0	// EQ when this entry is the wanted one
   303		LDP.P	16(b_ptr), (acc0, acc1)
   304		CSEL	NE, x0, acc0, x0
   305		CSEL	NE, x1, acc1, x1
   306		LDP.P	16(b_ptr), (acc2, acc3)
   307		CSEL	NE, x2, acc2, x2
   308		CSEL	NE, x3, acc3, x3
   309		LDP.P	16(b_ptr), (acc4, acc5)
   310		CSEL	NE, y0, acc4, y0
   311		CSEL	NE, y1, acc5, y1
   312		LDP.P	16(b_ptr), (acc6, acc7)
   313		CSEL	NE, y2, acc6, y2
   314		CSEL	NE, y3, acc7, y3
   315		LDP.P	16(b_ptr), (acc0, acc1)
   316		CSEL	NE, t0, acc0, t0
   317		CSEL	NE, t1, acc1, t1
   318		LDP.P	16(b_ptr), (acc2, acc3)
   319		CSEL	NE, t2, acc2, t2
   320		CSEL	NE, t3, acc3, t3
   321
   322		CMP	$16, const1
   323		BNE	selectLoop
   324
   325	STP	(x0, x1), 0*16(res_ptr)
   326	STP	(x2, x3), 1*16(res_ptr)
   327	STP	(y0, y1), 2*16(res_ptr)
   328	STP	(y2, y3), 3*16(res_ptr)
   329	STP	(t0, t1), 4*16(res_ptr)
   330	STP	(t2, t3), 5*16(res_ptr)
   331	RET
   332/* ---------------------------------------*/
   333// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      // Walks all 32 table entries of 64 bytes each and keeps, via CSEL, the one
      // whose 1-based position equals idx. Every entry is loaded whatever idx
      // is. If idx matches no position (e.g. 0), res is written as all zeros.
   334TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   335	MOVD	res+0(FP), res_ptr
   336	MOVD	table+8(FP), t1
   337	MOVD	idx+16(FP), t0
   338
   339	MOVD	ZR, x0
   340	MOVD	ZR, x1
   341	MOVD	ZR, x2
   342	MOVD	ZR, x3
   343	MOVD	ZR, y0
   344	MOVD	ZR, y1
   345	MOVD	ZR, y2
   346	MOVD	ZR, y3
   347
   348	MOVD	ZR, t2	// t2 = position of the current entry
   349
   350selectAffineLoop:
   351		ADD	$1, t2
   352		CMP	t2, t0	// EQ when this entry is the wanted one
   353		LDP.P	16(t1), (acc0, acc1)
   354		CSEL	NE, x0, acc0, x0
   355		CSEL	NE, x1, acc1, x1
   356		LDP.P	16(t1), (acc2, acc3)
   357		CSEL	NE, x2, acc2, x2
   358		CSEL	NE, x3, acc3, x3
   359		LDP.P	16(t1), (acc4, acc5)
   360		CSEL	NE, y0, acc4, y0
   361		CSEL	NE, y1, acc5, y1
   362		LDP.P	16(t1), (acc6, acc7)
   363		CSEL	NE, y2, acc6, y2
   364		CSEL	NE, y3, acc7, y3
   365
   366		CMP	$32, t2
   367		BNE	selectAffineLoop
   368
   369	STP	(x0, x1), 0*16(res_ptr)
   370	STP	(x2, x3), 1*16(res_ptr)
   371	STP	(y0, y1), 2*16(res_ptr)
   372	STP	(y2, y3), 3*16(res_ptr)
   373	RET
   374/* ---------------------------------------*/
   375// func p256OrdSqr(res, in *p256OrdElement, n int)
      // Squares in n times modulo the value stored in p256ord, keeping the
      // running value in x0-x3, and stores the final value to res.
      //
      // Each iteration builds the 512-bit square in acc0-acc7, runs four
      // reduction steps that each add (acc_i * p256ordK0) * p256ord to clear one
      // low limb, adds the high half, and conditionally subtracts p256ord.
      //
      // Notes:
      //  - hlp1 is the same register as res_ptr (R0) and holds p256ordK0 for the
      //    whole loop, so res is only loaded from the frame after the loop.
      //  - const2/const3 alias t2/t3, so only t0/t1 are free scratch here.
      //  - The loop body runs before n is tested, so n must be at least 1.
   376TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   377	MOVD	in+8(FP), a_ptr
   378	MOVD	n+16(FP), b_ptr
   379
   380	MOVD	p256ordK0<>(SB), hlp1
   381	LDP	p256ord<>+0x00(SB), (const0, const1)
   382	LDP	p256ord<>+0x10(SB), (const2, const3)
   383
   384	LDP	0*16(a_ptr), (x0, x1)
   385	LDP	1*16(a_ptr), (x2, x3)
   386
   387ordSqrLoop:
   388	SUB	$1, b_ptr
   389
   390	// x[1:] * x[0]
   391	MUL	x0, x1, acc1
   392	UMULH	x0, x1, acc2
   393
   394	MUL	x0, x2, t0
   395	ADDS	t0, acc2, acc2
   396	UMULH	x0, x2, acc3
   397
   398	MUL	x0, x3, t0
   399	ADCS	t0, acc3, acc3
   400	UMULH	x0, x3, acc4
   401	ADC	$0, acc4, acc4
   402	// x[2:] * x[1]
   403	MUL	x1, x2, t0
   404	ADDS	t0, acc3
   405	UMULH	x1, x2, t1
   406	ADCS	t1, acc4
   407	ADC	$0, ZR, acc5
   408
   409	MUL	x1, x3, t0
   410	ADDS	t0, acc4
   411	UMULH	x1, x3, t1
   412	ADC	t1, acc5
   413	// x[3] * x[2]
   414	MUL	x2, x3, t0
   415	ADDS	t0, acc5
   416	UMULH	x2, x3, acc6
   417	ADC	$0, acc6
   418
   419	MOVD	$0, acc7
   420	// *2: double the cross products accumulated above
   421	ADDS	acc1, acc1
   422	ADCS	acc2, acc2
   423	ADCS	acc3, acc3
   424	ADCS	acc4, acc4
   425	ADCS	acc5, acc5
   426	ADCS	acc6, acc6
   427	ADC	$0, acc7
   428	// Missing products: the squares x[i]*x[i]
   429	MUL	x0, x0, acc0
   430	UMULH	x0, x0, t0
   431	ADDS	t0, acc1, acc1
   432
   433	MUL	x1, x1, t0
   434	ADCS	t0, acc2, acc2
   435	UMULH	x1, x1, t1
   436	ADCS	t1, acc3, acc3
   437
   438	MUL	x2, x2, t0
   439	ADCS	t0, acc4, acc4
   440	UMULH	x2, x2, t1
   441	ADCS	t1, acc5, acc5
   442
   443	MUL	x3, x3, t0
   444	ADCS	t0, acc6, acc6
   445	UMULH	x3, x3, t1
   446	ADC	t1, acc7, acc7
   447	// First reduction step
   448	MUL	acc0, hlp1, hlp0	// hlp0 = acc0 * p256ordK0 (low 64 bits)
   449
      	// The multiplier here is hlp1, not hlp0. The sum written to acc0 is
      	// overwritten below before it is read; only the carry is consumed.
   450	MUL	const0, hlp1, t0
   451	ADDS	t0, acc0, acc0
   452	UMULH	const0, hlp0, t1
   453
   454	MUL	const1, hlp0, t0
   455	ADCS	t0, acc1, acc1
   456	UMULH	const1, hlp0, y0
   457
   458	MUL	const2, hlp0, t0
   459	ADCS	t0, acc2, acc2
   460	UMULH	const2, hlp0, acc0
   461
   462	MUL	const3, hlp0, t0
   463	ADCS	t0, acc3, acc3
   464
   465	UMULH	const3, hlp0, hlp0
   466	ADC	$0, hlp0
   467
   468	ADDS	t1, acc1, acc1
   469	ADCS	y0, acc2, acc2
   470	ADCS	acc0, acc3, acc3
   471	ADC	$0, hlp0, acc0	// acc0 becomes the new top limb
   472	// Second reduction step
   473	MUL	acc1, hlp1, hlp0
   474
   475	MUL	const0, hlp1, t0
   476	ADDS	t0, acc1, acc1
   477	UMULH	const0, hlp0, t1
   478
   479	MUL	const1, hlp0, t0
   480	ADCS	t0, acc2, acc2
   481	UMULH	const1, hlp0, y0
   482
   483	MUL	const2, hlp0, t0
   484	ADCS	t0, acc3, acc3
   485	UMULH	const2, hlp0, acc1
   486
   487	MUL	const3, hlp0, t0
   488	ADCS	t0, acc0, acc0
   489
   490	UMULH	const3, hlp0, hlp0
   491	ADC	$0, hlp0
   492
   493	ADDS	t1, acc2, acc2
   494	ADCS	y0, acc3, acc3
   495	ADCS	acc1, acc0, acc0
   496	ADC	$0, hlp0, acc1
   497	// Third reduction step
   498	MUL	acc2, hlp1, hlp0
   499
   500	MUL	const0, hlp1, t0
   501	ADDS	t0, acc2, acc2
   502	UMULH	const0, hlp0, t1
   503
   504	MUL	const1, hlp0, t0
   505	ADCS	t0, acc3, acc3
   506	UMULH	const1, hlp0, y0
   507
   508	MUL	const2, hlp0, t0
   509	ADCS	t0, acc0, acc0
   510	UMULH	const2, hlp0, acc2
   511
   512	MUL	const3, hlp0, t0
   513	ADCS	t0, acc1, acc1
   514
   515	UMULH	const3, hlp0, hlp0
   516	ADC	$0, hlp0
   517
   518	ADDS	t1, acc3, acc3
   519	ADCS	y0, acc0, acc0
   520	ADCS	acc2, acc1, acc1
   521	ADC	$0, hlp0, acc2
   522
   523	// Last reduction step
   524	MUL	acc3, hlp1, hlp0
   525
   526	MUL	const0, hlp1, t0
   527	ADDS	t0, acc3, acc3
   528	UMULH	const0, hlp0, t1
   529
   530	MUL	const1, hlp0, t0
   531	ADCS	t0, acc0, acc0
   532	UMULH	const1, hlp0, y0
   533
   534	MUL	const2, hlp0, t0
   535	ADCS	t0, acc1, acc1
   536	UMULH	const2, hlp0, acc3
   537
   538	MUL	const3, hlp0, t0
   539	ADCS	t0, acc2, acc2
   540
   541	UMULH	const3, hlp0, hlp0
      	// Unlike the earlier steps the carry goes into acc7, which is added
      	// into the same limb (acc3) a few lines below.
   542	ADC	$0, acc7
   543
   544	ADDS	t1, acc0, acc0
   545	ADCS	y0, acc1, acc1
   546	ADCS	acc3, acc2, acc2
   547	ADC	$0, hlp0, acc3
   548
      	// Add the high half of the square; acc4 receives the carry-out
   549	ADDS	acc4, acc0, acc0
   550	ADCS	acc5, acc1, acc1
   551	ADCS	acc6, acc2, acc2
   552	ADCS	acc7, acc3, acc3
   553	ADC	$0, ZR, acc4
   554
      	// y = acc - p256ord, with the borrow propagated through acc4
   555	SUBS	const0, acc0, y0
   556	SBCS	const1, acc1, y1
   557	SBCS	const2, acc2, y2
   558	SBCS	const3, acc3, y3
   559	SBCS	$0, acc4, acc4
   560
      	// No borrow: take the subtracted value as the next x
   561	CSEL	CS, y0, acc0, x0
   562	CSEL	CS, y1, acc1, x1
   563	CSEL	CS, y2, acc2, x2
   564	CSEL	CS, y3, acc3, x3
   565
   566	CBNZ	b_ptr, ordSqrLoop
   567
   568	MOVD	res+0(FP), res_ptr	// R0 held p256ordK0 until now
   569	STP	(x0, x1), 0*16(res_ptr)
   570	STP	(x2, x3), 1*16(res_ptr)
   571
   572	RET
   573/* ---------------------------------------*/
   574// func p256OrdMul(res, in1, in2 *p256OrdElement)
      // Multiplies in1 (x0-x3) by in2 (y0-y3) modulo the value stored in
      // p256ord and stores the four result limbs to res.
      //
      // The product is built one limb of y at a time. After each partial
      // product a reduction step adds (acc_i * p256ordK0) * p256ord to clear
      // the lowest live limb. The routine ends with the high-half addition and
      // a conditional subtraction of p256ord.
      //
      // Notes:
      //  - hlp1 is the same register as res_ptr (R0) and holds p256ordK0, so res
      //    is only loaded from the frame just before the final stores.
      //  - const2/const3 alias t2/t3, so y0, y1 and hlp0 serve as extra scratch
      //    once the corresponding input limb is no longer needed.
   575TEXT ·p256OrdMul(SB),NOSPLIT,$0
   576	MOVD	in1+8(FP), a_ptr
   577	MOVD	in2+16(FP), b_ptr
   578
   579	MOVD	p256ordK0<>(SB), hlp1
   580	LDP	p256ord<>+0x00(SB), (const0, const1)
   581	LDP	p256ord<>+0x10(SB), (const2, const3)
   582
   583	LDP	0*16(a_ptr), (x0, x1)
   584	LDP	1*16(a_ptr), (x2, x3)
   585	LDP	0*16(b_ptr), (y0, y1)
   586	LDP	1*16(b_ptr), (y2, y3)
   587
   588	// y[0] * x
   589	MUL	y0, x0, acc0
   590	UMULH	y0, x0, acc1
   591
   592	MUL	y0, x1, t0
   593	ADDS	t0, acc1
   594	UMULH	y0, x1, acc2
   595
   596	MUL	y0, x2, t0
   597	ADCS	t0, acc2
   598	UMULH	y0, x2, acc3
   599
   600	MUL	y0, x3, t0
   601	ADCS	t0, acc3
   602	UMULH	y0, x3, acc4
   603	ADC	$0, acc4
   604	// First reduction step
   605	MUL	acc0, hlp1, hlp0	// hlp0 = acc0 * p256ordK0 (low 64 bits)
   606
      	// The multiplier here is hlp1, not hlp0. The sum written to acc0 is
      	// overwritten below before it is read; only the carry is consumed.
   607	MUL	const0, hlp1, t0
   608	ADDS	t0, acc0, acc0
   609	UMULH	const0, hlp0, t1
   610
   611	MUL	const1, hlp0, t0
   612	ADCS	t0, acc1, acc1
   613	UMULH	const1, hlp0, y0	// y[0] is finished with; y0 is scratch from here
   614
   615	MUL	const2, hlp0, t0
   616	ADCS	t0, acc2, acc2
   617	UMULH	const2, hlp0, acc0
   618
   619	MUL	const3, hlp0, t0
   620	ADCS	t0, acc3, acc3
   621
   622	UMULH	const3, hlp0, hlp0
   623	ADC	$0, acc4
   624
   625	ADDS	t1, acc1, acc1
   626	ADCS	y0, acc2, acc2
   627	ADCS	acc0, acc3, acc3
   628	ADC	$0, hlp0, acc0
   629	// y[1] * x
   630	MUL	y1, x0, t0
   631	ADDS	t0, acc1
   632	UMULH	y1, x0, t1
   633
   634	MUL	y1, x1, t0
   635	ADCS	t0, acc2
   636	UMULH	y1, x1, hlp0
   637
   638	MUL	y1, x2, t0
   639	ADCS	t0, acc3
   640	UMULH	y1, x2, y0
   641
   642	MUL	y1, x3, t0
   643	ADCS	t0, acc4
   644	UMULH	y1, x3, y1	// last use of y[1]; y1 is scratch from here
   645	ADC	$0, ZR, acc5
   646
   647	ADDS	t1, acc2
   648	ADCS	hlp0, acc3
   649	ADCS	y0, acc4
   650	ADC	y1, acc5
   651	// Second reduction step
   652	MUL	acc1, hlp1, hlp0
   653
   654	MUL	const0, hlp1, t0
   655	ADDS	t0, acc1, acc1
   656	UMULH	const0, hlp0, t1
   657
   658	MUL	const1, hlp0, t0
   659	ADCS	t0, acc2, acc2
   660	UMULH	const1, hlp0, y0
   661
   662	MUL	const2, hlp0, t0
   663	ADCS	t0, acc3, acc3
   664	UMULH	const2, hlp0, acc1
   665
   666	MUL	const3, hlp0, t0
   667	ADCS	t0, acc0, acc0
   668
   669	UMULH	const3, hlp0, hlp0
   670	ADC	$0, acc5
   671
   672	ADDS	t1, acc2, acc2
   673	ADCS	y0, acc3, acc3
   674	ADCS	acc1, acc0, acc0
   675	ADC	$0, hlp0, acc1
   676	// y[2] * x
   677	MUL	y2, x0, t0
   678	ADDS	t0, acc2
   679	UMULH	y2, x0, t1
   680
   681	MUL	y2, x1, t0
   682	ADCS	t0, acc3
   683	UMULH	y2, x1, hlp0
   684
   685	MUL	y2, x2, t0
   686	ADCS	t0, acc4
   687	UMULH	y2, x2, y0
   688
   689	MUL	y2, x3, t0
   690	ADCS	t0, acc5
   691	UMULH	y2, x3, y1
   692	ADC	$0, ZR, acc6
   693
   694	ADDS	t1, acc3
   695	ADCS	hlp0, acc4
   696	ADCS	y0, acc5
   697	ADC	y1, acc6
   698	// Third reduction step
   699	MUL	acc2, hlp1, hlp0
   700
   701	MUL	const0, hlp1, t0
   702	ADDS	t0, acc2, acc2
   703	UMULH	const0, hlp0, t1
   704
   705	MUL	const1, hlp0, t0
   706	ADCS	t0, acc3, acc3
   707	UMULH	const1, hlp0, y0
   708
   709	MUL	const2, hlp0, t0
   710	ADCS	t0, acc0, acc0
   711	UMULH	const2, hlp0, acc2
   712
   713	MUL	const3, hlp0, t0
   714	ADCS	t0, acc1, acc1
   715
   716	UMULH	const3, hlp0, hlp0
   717	ADC	$0, acc6
   718
   719	ADDS	t1, acc3, acc3
   720	ADCS	y0, acc0, acc0
   721	ADCS	acc2, acc1, acc1
   722	ADC	$0, hlp0, acc2
   723	// y[3] * x
   724	MUL	y3, x0, t0
   725	ADDS	t0, acc3
   726	UMULH	y3, x0, t1
   727
   728	MUL	y3, x1, t0
   729	ADCS	t0, acc4
   730	UMULH	y3, x1, hlp0
   731
   732	MUL	y3, x2, t0
   733	ADCS	t0, acc5
   734	UMULH	y3, x2, y0
   735
   736	MUL	y3, x3, t0
   737	ADCS	t0, acc6
   738	UMULH	y3, x3, y1
   739	ADC	$0, ZR, acc7
   740
   741	ADDS	t1, acc4
   742	ADCS	hlp0, acc5
   743	ADCS	y0, acc6
   744	ADC	y1, acc7
   745	// Last reduction step
   746	MUL	acc3, hlp1, hlp0
   747
   748	MUL	const0, hlp1, t0
   749	ADDS	t0, acc3, acc3
   750	UMULH	const0, hlp0, t1
   751
   752	MUL	const1, hlp0, t0
   753	ADCS	t0, acc0, acc0
   754	UMULH	const1, hlp0, y0
   755
   756	MUL	const2, hlp0, t0
   757	ADCS	t0, acc1, acc1
   758	UMULH	const2, hlp0, acc3
   759
   760	MUL	const3, hlp0, t0
   761	ADCS	t0, acc2, acc2
   762
   763	UMULH	const3, hlp0, hlp0
   764	ADC	$0, acc7
   765
   766	ADDS	t1, acc0, acc0
   767	ADCS	y0, acc1, acc1
   768	ADCS	acc3, acc2, acc2
   769	ADC	$0, hlp0, acc3
   770
      	// Add the high half of the product; acc4 receives the carry-out
   771	ADDS	acc4, acc0, acc0
   772	ADCS	acc5, acc1, acc1
   773	ADCS	acc6, acc2, acc2
   774	ADCS	acc7, acc3, acc3
   775	ADC	$0, ZR, acc4
   776
      	// t = acc - p256ord. t2/t3 are also const2/const3; each is read as a
      	// constant by the same instruction that overwrites it.
   777	SUBS	const0, acc0, t0
   778	SBCS	const1, acc1, t1
   779	SBCS	const2, acc2, t2
   780	SBCS	const3, acc3, t3
   781	SBCS	$0, acc4, acc4
   782
      	// No borrow: keep the subtracted value
   783	CSEL	CS, t0, acc0, acc0
   784	CSEL	CS, t1, acc1, acc1
   785	CSEL	CS, t2, acc2, acc2
   786	CSEL	CS, t3, acc3, acc3
   787
   788	MOVD	res+0(FP), res_ptr	// R0 held p256ordK0 until now
   789	STP	(acc0, acc1), 0*16(res_ptr)
   790	STP	(acc2, acc3), 1*16(res_ptr)
   791
   792	RET
   793/* ---------------------------------------*/
      // p256SubInternal: x = y - x, with p added back when the subtraction
      // borrows.
      // In:       x0-x3, y0-y3; const0/const1 must hold p256const0/p256const1.
      // Out:      x0-x3.
      // Clobbers: acc0-acc7, t0, flags. y0-y3 are left untouched.
   794TEXT p256SubInternal<>(SB), NOSPLIT, $0
   795	SUBS	x0, y0, acc0
   796	SBCS	x1, y1, acc1
   797	SBCS	x2, y2, acc2
   798	SBCS	x3, y3, acc3
   799	SBC	ZR, ZR, t0	// t0 = 0 without borrow, all ones with borrow
   800
   801	ADDS	$-1, acc0, acc4	// acc4..acc7 = difference + p
   802	ADCS	const0, acc1, acc5
   803	ADCS	ZR, acc2, acc6
   804	ADC	const1, acc3, acc7
   805
   806	ANDS	$1, t0, t0	// NE exactly when the subtraction borrowed
   807	CSEL	NE, acc4, acc0, x0
   808	CSEL	NE, acc5, acc1, x1
   809	CSEL	NE, acc6, acc2, x2
   810	CSEL	NE, acc7, acc3, x3
   811
   812	RET
   813/* ---------------------------------------*/
      // p256SqrInternal: y = x squared, followed by four reduction steps and
      // one conditional subtraction of p.
      // In:       x0-x3; const0/const1 must hold p256const0/p256const1.
      // Out:      y0-y3.
      // Clobbers: acc0-acc7, t0-t3, flags. x0-x3 are left untouched.
      //
      // Each reduction step adds (lowest limb) * p. Because
      // p + 1 = 2^96 + const1 * 2^192, the lowest limb cancels and only
      // limb<<32, limb>>32 and limb*const1 need to be added.
   814TEXT p256SqrInternal<>(SB),NOSPLIT,$0
   815	// x[1:] * x[0]
   816	MUL	x0, x1, acc1
   817	UMULH	x0, x1, acc2
   818
   819	MUL	x0, x2, t0
   820	ADDS	t0, acc2, acc2
   821	UMULH	x0, x2, acc3
   822
   823	MUL	x0, x3, t0
   824	ADCS	t0, acc3, acc3
   825	UMULH	x0, x3, acc4
   826	ADC	$0, acc4, acc4
   827	// x[2:] * x[1]
   828	MUL	x1, x2, t0
   829	ADDS	t0, acc3
   830	UMULH	x1, x2, t1
   831	ADCS	t1, acc4
   832	ADC	$0, ZR, acc5
   833
   834	MUL	x1, x3, t0
   835	ADDS	t0, acc4
   836	UMULH	x1, x3, t1
   837	ADC	t1, acc5
   838	// x[3] * x[2]
   839	MUL	x2, x3, t0
   840	ADDS	t0, acc5
   841	UMULH	x2, x3, acc6
   842	ADC	$0, acc6
   843
   844	MOVD	$0, acc7
   845	// *2: double the cross products accumulated above
   846	ADDS	acc1, acc1
   847	ADCS	acc2, acc2
   848	ADCS	acc3, acc3
   849	ADCS	acc4, acc4
   850	ADCS	acc5, acc5
   851	ADCS	acc6, acc6
   852	ADC	$0, acc7
   853	// Missing products: the squares x[i]*x[i]
   854	MUL	x0, x0, acc0
   855	UMULH	x0, x0, t0
   856	ADDS	t0, acc1, acc1
   857
   858	MUL	x1, x1, t0
   859	ADCS	t0, acc2, acc2
   860	UMULH	x1, x1, t1
   861	ADCS	t1, acc3, acc3
   862
   863	MUL	x2, x2, t0
   864	ADCS	t0, acc4, acc4
   865	UMULH	x2, x2, t1
   866	ADCS	t1, acc5, acc5
   867
   868	MUL	x3, x3, t0
   869	ADCS	t0, acc6, acc6
   870	UMULH	x3, x3, t1
   871	ADCS	t1, acc7, acc7
   872	// First reduction step
   873	ADDS	acc0<<32, acc1, acc1
   874	LSR	$32, acc0, t0
   875	MUL	acc0, const1, t1
   876	UMULH	acc0, const1, acc0	// acc0 is reused as the new top limb
   877	ADCS	t0, acc2, acc2
   878	ADCS	t1, acc3, acc3
   879	ADC	$0, acc0, acc0
   880	// Second reduction step
   881	ADDS	acc1<<32, acc2, acc2
   882	LSR	$32, acc1, t0
   883	MUL	acc1, const1, t1
   884	UMULH	acc1, const1, acc1
   885	ADCS	t0, acc3, acc3
   886	ADCS	t1, acc0, acc0
   887	ADC	$0, acc1, acc1
   888	// Third reduction step
   889	ADDS	acc2<<32, acc3, acc3
   890	LSR	$32, acc2, t0
   891	MUL	acc2, const1, t1
   892	UMULH	acc2, const1, acc2
   893	ADCS	t0, acc0, acc0
   894	ADCS	t1, acc1, acc1
   895	ADC	$0, acc2, acc2
   896	// Last reduction step
   897	ADDS	acc3<<32, acc0, acc0
   898	LSR	$32, acc3, t0
   899	MUL	acc3, const1, t1
   900	UMULH	acc3, const1, acc3
   901	ADCS	t0, acc1, acc1
   902	ADCS	t1, acc2, acc2
   903	ADC	$0, acc3, acc3
   904	// Add bits [511:256] of the sqr result
   905	ADDS	acc4, acc0, acc0
   906	ADCS	acc5, acc1, acc1
   907	ADCS	acc6, acc2, acc2
   908	ADCS	acc7, acc3, acc3
   909	ADC	$0, ZR, acc4	// acc4 = carry-out of the addition
   910
      	// t = acc - p, with the borrow propagated through acc4
   911	SUBS	$-1, acc0, t0
   912	SBCS	const0, acc1, t1
   913	SBCS	$0, acc2, t2
   914	SBCS	const1, acc3, t3
   915	SBCS	$0, acc4, acc4
   916
      	// No borrow: return the subtracted value
   917	CSEL	CS, t0, acc0, y0
   918	CSEL	CS, t1, acc1, y1
   919	CSEL	CS, t2, acc2, y2
   920	CSEL	CS, t3, acc3, y3
   921	RET
   922/* ---------------------------------------*/
      // p256MulInternal: y = x * y, with a reduction step after each partial
      // product and one conditional subtraction of p at the end.
      // In:       x0-x3, y0-y3; const0/const1 must hold p256const0/p256const1.
      // Out:      y0-y3.
      // Clobbers: acc0-acc7, t0-t3, hlp0, flags. x0-x3 are left untouched.
      //
      // Each reduction step adds (lowest limb) * p. Because
      // p + 1 = 2^96 + const1 * 2^192, the lowest limb cancels and only
      // limb<<32, limb>>32 and limb*const1 need to be added.
   923TEXT p256MulInternal<>(SB),NOSPLIT,$0
   924	// y[0] * x
   925	MUL	y0, x0, acc0
   926	UMULH	y0, x0, acc1
   927
   928	MUL	y0, x1, t0
   929	ADDS	t0, acc1
   930	UMULH	y0, x1, acc2
   931
   932	MUL	y0, x2, t0
   933	ADCS	t0, acc2
   934	UMULH	y0, x2, acc3
   935
   936	MUL	y0, x3, t0
   937	ADCS	t0, acc3
   938	UMULH	y0, x3, acc4
   939	ADC	$0, acc4
   940	// First reduction step
   941	ADDS	acc0<<32, acc1, acc1
   942	LSR	$32, acc0, t0
   943	MUL	acc0, const1, t1
   944	UMULH	acc0, const1, acc0	// acc0 is reused as the new top limb
   945	ADCS	t0, acc2
   946	ADCS	t1, acc3
   947	ADC	$0, acc0
   948	// y[1] * x
   949	MUL	y1, x0, t0
   950	ADDS	t0, acc1
   951	UMULH	y1, x0, t1
   952
   953	MUL	y1, x1, t0
   954	ADCS	t0, acc2
   955	UMULH	y1, x1, t2
   956
   957	MUL	y1, x2, t0
   958	ADCS	t0, acc3
   959	UMULH	y1, x2, t3
   960
   961	MUL	y1, x3, t0
   962	ADCS	t0, acc4
   963	UMULH	y1, x3, hlp0
   964	ADC	$0, ZR, acc5
   965
      	// Add the high words of the partial products one limb higher
   966	ADDS	t1, acc2
   967	ADCS	t2, acc3
   968	ADCS	t3, acc4
   969	ADC	hlp0, acc5
   970	// Second reduction step
   971	ADDS	acc1<<32, acc2, acc2
   972	LSR	$32, acc1, t0
   973	MUL	acc1, const1, t1
   974	UMULH	acc1, const1, acc1
   975	ADCS	t0, acc3
   976	ADCS	t1, acc0
   977	ADC	$0, acc1
   978	// y[2] * x
   979	MUL	y2, x0, t0
   980	ADDS	t0, acc2
   981	UMULH	y2, x0, t1
   982
   983	MUL	y2, x1, t0
   984	ADCS	t0, acc3
   985	UMULH	y2, x1, t2
   986
   987	MUL	y2, x2, t0
   988	ADCS	t0, acc4
   989	UMULH	y2, x2, t3
   990
   991	MUL	y2, x3, t0
   992	ADCS	t0, acc5
   993	UMULH	y2, x3, hlp0
   994	ADC	$0, ZR, acc6
   995
   996	ADDS	t1, acc3
   997	ADCS	t2, acc4
   998	ADCS	t3, acc5
   999	ADC	hlp0, acc6
  1000	// Third reduction step
  1001	ADDS	acc2<<32, acc3, acc3
  1002	LSR	$32, acc2, t0
  1003	MUL	acc2, const1, t1
  1004	UMULH	acc2, const1, acc2
  1005	ADCS	t0, acc0
  1006	ADCS	t1, acc1
  1007	ADC	$0, acc2
  1008	// y[3] * x
  1009	MUL	y3, x0, t0
  1010	ADDS	t0, acc3
  1011	UMULH	y3, x0, t1
  1012
  1013	MUL	y3, x1, t0
  1014	ADCS	t0, acc4
  1015	UMULH	y3, x1, t2
  1016
  1017	MUL	y3, x2, t0
  1018	ADCS	t0, acc5
  1019	UMULH	y3, x2, t3
  1020
  1021	MUL	y3, x3, t0
  1022	ADCS	t0, acc6
  1023	UMULH	y3, x3, hlp0
  1024	ADC	$0, ZR, acc7
  1025
  1026	ADDS	t1, acc4
  1027	ADCS	t2, acc5
  1028	ADCS	t3, acc6
  1029	ADC	hlp0, acc7
  1030	// Last reduction step
  1031	ADDS	acc3<<32, acc0, acc0
  1032	LSR	$32, acc3, t0
  1033	MUL	acc3, const1, t1
  1034	UMULH	acc3, const1, acc3
  1035	ADCS	t0, acc1
  1036	ADCS	t1, acc2
  1037	ADC	$0, acc3
  1038	// Add bits [511:256] of the mul result
  1039	ADDS	acc4, acc0, acc0
  1040	ADCS	acc5, acc1, acc1
  1041	ADCS	acc6, acc2, acc2
  1042	ADCS	acc7, acc3, acc3
  1043	ADC	$0, ZR, acc4	// acc4 = carry-out of the addition
  1044
      	// t = acc - p, with the borrow propagated through acc4
  1045	SUBS	$-1, acc0, t0
  1046	SBCS	const0, acc1, t1
  1047	SBCS	$0, acc2, t2
  1048	SBCS	const1, acc3, t3
  1049	SBCS	$0, acc4, acc4
  1050
      	// No borrow: return the subtracted value
  1051	CSEL	CS, t0, acc0, y0
  1052	CSEL	CS, t1, acc1, y1
  1053	CSEL	CS, t2, acc2, y2
  1054	CSEL	CS, t3, acc3, y3
  1055	RET
  1056/* ---------------------------------------*/
      // p256MulBy2Inline: x = 2*y, followed by one conditional subtraction of p.
      // In:       y0-y3; const0/const1 must hold p256const0/p256const1.
      // Out:      x0-x3.
      // Clobbers: t0-t3, hlp0, flags. y0-y3 are left untouched.
      // hlp0 captures the carry-out of the doubling so the trial subtraction
      // covers five limbs. CC (borrow) keeps the unsubtracted sum.
  1057#define p256MulBy2Inline       \
  1058	ADDS	y0, y0, x0;    \
  1059	ADCS	y1, y1, x1;    \
  1060	ADCS	y2, y2, x2;    \
  1061	ADCS	y3, y3, x3;    \
  1062	ADC	$0, ZR, hlp0;  \
  1063	SUBS	$-1, x0, t0;   \
  1064	SBCS	const0, x1, t1;\
  1065	SBCS	$0, x2, t2;    \
  1066	SBCS	const1, x3, t3;\
  1067	SBCS	$0, hlp0, hlp0;\
  1068	CSEL	CC, x0, t0, x0;\
  1069	CSEL	CC, x1, t1, x1;\
  1070	CSEL	CC, x2, t2, x2;\
  1071	CSEL	CC, x3, t3, x3;
  1072/* ---------------------------------------*/
      // Address macros for 32-byte coordinates inside a point. off is a byte
      // offset within the coordinate.
      //   x1in/y1in/z1in: coordinates at +0/+32/+64 from a_ptr.
      //   x2in/z2in:      coordinates at +0/+64 from b_ptr.
      //   x3out/y3out/z3out: coordinates at +0/+32/+64 from res_ptr. res_ptr is
      //                   R0, which hlp1 also names, so these are only valid
      //                   while R0 still holds the result pointer.
  1073#define x1in(off) (off)(a_ptr)
  1074#define y1in(off) (off + 32)(a_ptr)
  1075#define z1in(off) (off + 64)(a_ptr)
  1076#define x2in(off) (off)(b_ptr)
  1077#define z2in(off) (off + 64)(b_ptr)
  1078#define x3out(off) (off)(res_ptr)
  1079#define y3out(off) (off + 32)(res_ptr)
  1080#define z3out(off) (off + 64)(res_ptr)
      // LDx/LDy load 32 bytes from the location named by an address macro into
      // x0-x3 / y0-y3; STx/STy store x0-x3 / y0-y3 to it.
  1081#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1082#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1083#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1084#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1085/* ---------------------------------------*/
      // 32-byte stack temporaries, addressed from RSP. Slot k starts at byte
      // 8 + 32*k; off is a byte offset within the slot.
      // Slots 0-7 end at byte 264, which matches the $264 frame of
      // p256PointAddAffineAsm.
  1086#define y2in(off)  (32*0 + 8 + off)(RSP)
  1087#define s2(off)    (32*1 + 8 + off)(RSP)
  1088#define z1sqr(off) (32*2 + 8 + off)(RSP)
  1089#define h(off)	   (32*3 + 8 + off)(RSP)
  1090#define r(off)	   (32*4 + 8 + off)(RSP)
  1091#define hsqr(off)  (32*5 + 8 + off)(RSP)
  1092#define rsqr(off)  (32*6 + 8 + off)(RSP)
  1093#define hcub(off)  (32*7 + 8 + off)(RSP)
  1094
      // Slots 8-11 lie beyond a 264-byte frame and are not referenced by
      // p256PointAddAffineAsm; presumably they serve a routine with a larger
      // frame defined later in the file (TODO confirm).
  1095#define z2sqr(off) (32*8 + 8 + off)(RSP)
  1096#define s1(off) (32*9 + 8 + off)(RSP)
  1097#define u1(off) (32*10 + 8 + off)(RSP)
  1098#define u2(off) (32*11 + 8 + off)(RSP)
  1099
  1100// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
      // Adds the affine point in2 to the point in1 (coordinates x, y, z at byte
      // offsets 0, 32, 64) and writes the result to res.
      //
      //   sign != 0: y2 is replaced by its negation before the addition.
      //   zero == 0: the result is in2, with z3 set to p256one.
      //   sel  == 0: the result is in1 (zero == 0 takes precedence).
      //
      // sel and zero are each normalised to 0/1 and packed into hlp1 as
      // bit 0 = (sel != 0) and bit 1 = (zero != 0). The full addition is always
      // computed and every choice is made with CSEL.
      //
      // hlp1 is the same register as res_ptr (R0), so res is reloaded from the
      // frame into t0 before each store. The p256*Internal routines leave R0,
      // a_ptr and b_ptr untouched.
  1101TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1102	MOVD	in1+8(FP), a_ptr
  1103	MOVD	in2+16(FP), b_ptr
  1104	MOVD	sign+24(FP), hlp0
  1105	MOVD	sel+32(FP), hlp1
  1106	MOVD	zero+40(FP), t2
  1107
      	// Normalise zero and sel to 0 or 1
  1108	MOVD	$1, t0
  1109	CMP	$0, t2
  1110	CSEL	EQ, ZR, t0, t2
  1111	CMP	$0, hlp1
  1112	CSEL	EQ, ZR, t0, hlp1
  1113
  1114	MOVD	p256const0<>(SB), const0
  1115	MOVD	p256const1<>(SB), const1
  1116	EOR	t2<<1, hlp1	// hlp1 = (sel != 0) | (zero != 0) << 1
  1117
  1118	// Negate y2in based on sign
  1119	LDP	2*16(b_ptr), (y0, y1)
  1120	LDP	3*16(b_ptr), (y2, y3)
  1121	MOVD	$-1, acc0
  1122
      	// acc0..acc3 = p - y2; t0 records the borrow
  1123	SUBS	y0, acc0, acc0
  1124	SBCS	y1, const0, acc1
  1125	SBCS	y2, ZR, acc2
  1126	SBCS	y3, const1, acc3
  1127	SBC	$0, ZR, t0
  1128
      	// acc4..acc7 = (p - y2) + p; the carry-out is folded into t0
  1129	ADDS	$-1, acc0, acc4
  1130	ADCS	const0, acc1, acc5
  1131	ADCS	$0, acc2, acc6
  1132	ADCS	const1, acc3, acc7
  1133	ADC	$0, t0, t0
  1134
      	// t0 == 0 picks the p-adjusted value
  1135	CMP	$0, t0
  1136	CSEL	EQ, acc4, acc0, acc0
  1137	CSEL	EQ, acc5, acc1, acc1
  1138	CSEL	EQ, acc6, acc2, acc2
  1139	CSEL	EQ, acc7, acc3, acc3
  1140	// If condition is 0, keep original value
  1141	CMP	$0, hlp0
  1142	CSEL	EQ, y0, acc0, y0
  1143	CSEL	EQ, y1, acc1, y1
  1144	CSEL	EQ, y2, acc2, y2
  1145	CSEL	EQ, y3, acc3, y3
  1146	// Store result
  1147	STy(y2in)
  1148	// Begin point add
  1149	LDx(z1in)
  1150	CALL	p256SqrInternal<>(SB)    // z1^2
  1151	STy(z1sqr)
  1152
  1153	LDx(x2in)
  1154	CALL	p256MulInternal<>(SB)    // x2 * z1^2
  1155
  1156	LDx(x1in)
  1157	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1158	STx(h)
  1159
  1160	LDy(z1in)
  1161	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1162
  1163	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1164	LDP	5*16(a_ptr), (acc2, acc3)
  1165	ANDS	$1, hlp1, ZR
  1166	CSEL	EQ, acc0, y0, y0
  1167	CSEL	EQ, acc1, y1, y1
  1168	CSEL	EQ, acc2, y2, y2
  1169	CSEL	EQ, acc3, y3, y3
  1170	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1171	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1172	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1173	CSEL	EQ, acc0, y0, y0
  1174	CSEL	EQ, acc1, y1, y1
  1175	CSEL	EQ, acc2, y2, y2
  1176	CSEL	EQ, acc3, y3, y3
  1177	LDx(z1in)
  1178	MOVD	res+0(FP), t0
  1179	STP	(y0, y1), 4*16(t0)
  1180	STP	(y2, y3), 5*16(t0)
  1181
  1182	LDy(z1sqr)
  1183	CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1184
  1185	LDx(y2in)
  1186	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
  1187	STy(s2)
  1188
  1189	LDx(y1in)
  1190	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1191	STx(r)
  1192
  1193	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1194	STy	(rsqr)
  1195
  1196	LDx(h)
  1197	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1198	STy(hsqr)
  1199
  1200	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1201	STy(hcub)
  1202
  1203	LDx(y1in)
  1204	CALL	p256MulInternal<>(SB)    // y1 * h^3
  1205	STy(s2)                        // the s2 slot is reused for y1 * h^3
  1206
  1207	LDP	hsqr(0*8), (x0, x1)
  1208	LDP	hsqr(2*8), (x2, x3)
  1209	LDP	0*16(a_ptr), (y0, y1)
  1210	LDP	1*16(a_ptr), (y2, y3)
  1211	CALL	p256MulInternal<>(SB)    // u1 * h^2
  1212	STP	(y0, y1), h(0*8)         // the h slot is reused for u1 * h^2
  1213	STP	(y2, y3), h(2*8)
  1214
  1215	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1216
  1217	LDy(rsqr)
  1218	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1219
  1220	MOVD	x0, y0
  1221	MOVD	x1, y1
  1222	MOVD	x2, y2
  1223	MOVD	x3, y3
  1224	LDx(hcub)
  1225	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1226
  1227	LDP	0*16(a_ptr), (acc0, acc1)
  1228	LDP	1*16(a_ptr), (acc2, acc3)
  1229	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1230	CSEL	EQ, acc0, x0, x0
  1231	CSEL	EQ, acc1, x1, x1
  1232	CSEL	EQ, acc2, x2, x2
  1233	CSEL	EQ, acc3, x3, x3
  1234	LDP	0*16(b_ptr), (acc0, acc1)
  1235	LDP	1*16(b_ptr), (acc2, acc3)
  1236	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1237	CSEL	EQ, acc0, x0, x0
  1238	CSEL	EQ, acc1, x1, x1
  1239	CSEL	EQ, acc2, x2, x2
  1240	CSEL	EQ, acc3, x3, x3
  1241	MOVD	res+0(FP), t0
  1242	STP	(x0, x1), 0*16(t0)
  1243	STP	(x2, x3), 1*16(t0)
  1244
  1245	LDP	h(0*8), (y0, y1)
  1246	LDP	h(2*8), (y2, y3)
  1247	CALL	p256SubInternal<>(SB)    // u1 * h^2 - (selected) x3
  1248
  1249	LDP	r(0*8), (y0, y1)
  1250	LDP	r(2*8), (y2, y3)
  1251	CALL	p256MulInternal<>(SB)    // multiply the difference by r
  1252
  1253	LDP	s2(0*8), (x0, x1)
  1254	LDP	s2(2*8), (x2, x3)
  1255	CALL	p256SubInternal<>(SB)    // y3 = product - y1 * h^3
  1256	LDP	2*16(a_ptr), (acc0, acc1)
  1257	LDP	3*16(a_ptr), (acc2, acc3)
  1258	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1259	CSEL	EQ, acc0, x0, x0
  1260	CSEL	EQ, acc1, x1, x1
  1261	CSEL	EQ, acc2, x2, x2
  1262	CSEL	EQ, acc3, x3, x3
  1263	LDP	y2in(0*8), (acc0, acc1)
  1264	LDP	y2in(2*8), (acc2, acc3)
  1265	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1266	CSEL	EQ, acc0, x0, x0
  1267	CSEL	EQ, acc1, x1, x1
  1268	CSEL	EQ, acc2, x2, x2
  1269	CSEL	EQ, acc3, x3, x3
  1270	MOVD	res+0(FP), t0
  1271	STP	(x0, x1), 2*16(t0)
  1272	STP	(x2, x3), 3*16(t0)
  1273
  1274	RET
  1275
  // p256AddInline: x = x + y, reduced with one conditional subtraction of p.
  //
  // Inputs:   x0..x3 and y0..y3, four 64-bit limbs each, limb 0 least
  //           significant. const0 and const1 must already hold the values of
  //           p256const0 and p256const1 (limbs 1 and 3 of p).
  // Output:   x0..x3.
  // Clobbers: t0..t3 (t2/t3 are also const2/const3), hlp0, condition flags.
  //
  // The limbs are added with the carry-out captured in hlp0. Then
  // p = {-1, const0, 0, const1} is subtracted from that five-limb sum. If the
  // subtraction borrows (CC) the plain sum is kept, otherwise the difference
  // is kept. Selection uses CSEL only, so there is no data-dependent branch.
  // The result is fully reduced only when x + y < 2p, i.e. when both inputs
  // were already below p.
  1276#define p256AddInline          \
  1277	ADDS	y0, x0, x0;    \
  1278	ADCS	y1, x1, x1;    \
  1279	ADCS	y2, x2, x2;    \
  1280	ADCS	y3, x3, x3;    \
  1281	ADC	$0, ZR, hlp0;  \
  1282	SUBS	$-1, x0, t0;   \
  1283	SBCS	const0, x1, t1;\
  1284	SBCS	$0, x2, t2;    \
  1285	SBCS	const1, x3, t3;\
  1286	SBCS	$0, hlp0, hlp0;\
  1287	CSEL	CC, x0, t0, x0;\
  1288	CSEL	CC, x1, t1, x1;\
  1289	CSEL	CC, x2, t2, x2;\
  1290	CSEL	CC, x3, t3, x3;
  1291
  // Stack temporaries used by p256PointDoubleAsm. Each slot is 32 bytes (four
  // 64-bit limbs); off is a byte offset inside the slot. The slots start at
  // RSP+8, RSP+40, RSP+72 and RSP+104, so together they end at RSP+136, which
  // is the frame size declared on p256PointDoubleAsm's TEXT line.
  1292#define s(off)	(32*0 + 8 + off)(RSP)
  1293#define m(off)	(32*1 + 8 + off)(RSP)
  1294#define zsqr(off) (32*2 + 8 + off)(RSP)
  1295#define tmp(off)  (32*3 + 8 + off)(RSP)
  1296
  1297//func p256PointDoubleAsm(res, in *P256Point)
      //
      // Doubles the point `in` (Jacobian coordinates x1, y1, z1 at byte offsets
      // 0, 32 and 64) and stores the result through the output macros. The
      // straight-line sequence below evaluates, modulo the P-256 prime p:
      //
      //   z3 = 2*y1*z1
      //   m  = 3*(x1 - z1^2)*(x1 + z1^2)
      //   s  = 4*x1*y1^2
      //   x3 = m^2 - 2*s
      //   y3 = m*(s - x3) - 8*y1^4
      //
      // The per-line comments assume the helper conventions that the annotated
      // call sites elsewhere in this file show (helpers are defined above):
      //   p256MulInternal:  y = x * y        p256SqrInternal:  y = x^2
      //   p256SubInternal:  x = y - x        p256MulBy2Inline: x = 2 * y
      //   p256AddInline:    x = x + y
      //   LDx/LDy/STx/STy:  load/store the four-limb x or y register group.
      // NOTE(review): x1in/y1in/z1in and x3out/y3out/z3out are addressing macros
      // defined earlier in the file, presumably relative to a_ptr and res_ptr;
      // confirm there.
  1298TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1299	MOVD	res+0(FP), res_ptr
  1300	MOVD	in+8(FP), a_ptr
  1301
  1302	MOVD	p256const0<>(SB), const0    // limb 1 of p, needed by the inline macros
  1303	MOVD	p256const1<>(SB), const1    // limb 3 of p
  1304
  1305	// Begin point double
  1306	LDP	4*16(a_ptr), (x0, x1)    // x = z1 (byte offset 64 of in)
  1307	LDP	5*16(a_ptr), (x2, x3)
  1308	CALL	p256SqrInternal<>(SB)    // y = z1^2
  1309	STP	(y0, y1), zsqr(0*8)
  1310	STP	(y2, y3), zsqr(2*8)      // zsqr = z1^2
  1311
  1312	LDP	0*16(a_ptr), (x0, x1)    // x = x1 (byte offset 0 of in)
  1313	LDP	1*16(a_ptr), (x2, x3)
  1314	p256AddInline                  // x = x1 + z1^2
  1315	STx(m)                         // m slot temporarily holds x1 + z1^2
  1316
  1317	LDx(z1in)
  1318	LDy(y1in)
  1319	CALL	p256MulInternal<>(SB)    // y = y1 * z1
  1320	p256MulBy2Inline               // x = 2 * y1 * z1
  1321	STx(z3out)                     // z3 = 2 * y1 * z1
  1322
  1323	LDy(x1in)
  1324	LDx(zsqr)
  1325	CALL	p256SubInternal<>(SB)    // x = x1 - z1^2
  1326	LDy(m)
  1327	CALL	p256MulInternal<>(SB)    // y = (x1 - z1^2) * (x1 + z1^2)
  1328
  1329	// Multiply by 3
  1330	p256MulBy2Inline               // x = 2 * y
  1331	p256AddInline                  // x = 2 * y + y
  1332	STx(m)                         // m = 3 * (x1 - z1^2) * (x1 + z1^2)
  1333
  1334	LDy(y1in)
  1335	p256MulBy2Inline               // x = 2 * y1
  1336	CALL	p256SqrInternal<>(SB)    // y = 4 * y1^2
  1337	STy(s)                         // s slot temporarily holds 4 * y1^2
  1338	MOVD	y0, x0
  1339	MOVD	y1, x1
  1340	MOVD	y2, x2
  1341	MOVD	y3, x3
  1342	CALL	p256SqrInternal<>(SB)    // y = 16 * y1^4
  1343
  1344	// Divide by 2 (mod p): an odd y has p added first so the right shift is exact
  1345	ADDS	$-1, y0, t0              // t = y + p, limb by limb
  1346	ADCS	const0, y1, t1
  1347	ADCS	$0, y2, t2
  1348	ADCS	const1, y3, t3
  1349	ADC	$0, ZR, hlp0             // hlp0 = carry out of y + p
  1350
  1351	ANDS	$1, y0, ZR               // EQ iff y is even
  1352	CSEL	EQ, y0, t0, t0           // t = y when even, y + p when odd
  1353	CSEL	EQ, y1, t1, t1
  1354	CSEL	EQ, y2, t2, t2
  1355	CSEL	EQ, y3, t3, t3
  1356	AND	y0, hlp0, hlp0           // keep the carry bit only when y was odd
  1357
  1358	EXTR	$1, t0, t1, y0           // y = (hlp0:t3:t2:t1:t0) >> 1
  1359	EXTR	$1, t1, t2, y1
  1360	EXTR	$1, t2, t3, y2
  1361	EXTR	$1, t3, hlp0, y3
  1362	STy(y3out)                     // y3out temporarily holds 8 * y1^4
  1363
  1364	LDx(x1in)
  1365	LDy(s)
  1366	CALL	p256MulInternal<>(SB)    // y = 4 * x1 * y1^2
  1367	STy(s)                         // s = 4 * x1 * y1^2
  1368	p256MulBy2Inline               // x = 2 * s
  1369	STx(tmp)                       // tmp = 2 * s
  1370
  1371	LDx(m)
  1372	CALL	p256SqrInternal<>(SB)    // y = m^2
  1373	LDx(tmp)
  1374	CALL	p256SubInternal<>(SB)    // x = m^2 - 2 * s
  1375
  1376	STx(x3out)                     // x3 = m^2 - 2 * s
  1377
  1378	LDy(s)
  1379	CALL	p256SubInternal<>(SB)    // x = s - x3
  1380
  1381	LDy(m)
  1382	CALL	p256MulInternal<>(SB)    // y = m * (s - x3)
  1383
  1384	LDx(y3out)                     // x = 8 * y1^4 saved earlier
  1385	CALL	p256SubInternal<>(SB)    // x = m * (s - x3) - 8 * y1^4
  1386	STx(y3out)                     // y3 final
  1387	RET
  1388/* ---------------------------------------*/
      // Re-point y2in and the three output accessors at b_ptr for
      // p256PointAddAsm. That function first loads in2 into b_ptr, so y2in(off)
      // addresses byte offset 32 of in2. It later reloads b_ptr with res before
      // its first output store, so x3out/y3out/z3out then address byte offsets
      // 0, 32 and 64 of res.
  1389#undef y2in
  1390#undef x3out
  1391#undef y3out
  1392#undef z3out
  1393#define y2in(off) (off + 32)(b_ptr)
  1394#define x3out(off) (off)(b_ptr)
  1395#define y3out(off) (off + 32)(b_ptr)
  1396#define z3out(off) (off + 64)(b_ptr)
  1397// func p256PointAddAsm(res, in1, in2 *P256Point) int
      //
      // Adds the Jacobian points in1 = (x1, y1, z1) and in2 = (x2, y2, z2) and
      // stores the sum (x3, y3, z3) in res. Modulo the P-256 prime p the code
      // below evaluates:
      //
      //   u1 = x1*z2^2      u2 = x2*z1^2
      //   s1 = y1*z2^3      s2 = y2*z1^3
      //   h  = u2 - u1      r  = s2 - s1
      //   x3 = r^2 - 2*u1*h^2 - h^3
      //   y3 = r*(u1*h^2 - x3) - s1*h^3
      //   z3 = z1*z2*h
      //
      // Return value: 1 when both r and h are zero mod p, otherwise 0. Each is
      // tested against the two limb patterns 0 and p.
      // NOTE(review): presumably this flags equal inputs so the caller can
      // double instead; confirm at the call site.
      //
      // Register notes: hlp1 aliases res_ptr, i.e. R0 (see the #defines at the top
      // of the file), and carries the return flag. res is therefore not kept in
      // res_ptr; b_ptr is reloaded with res before the first output store.
      // Helper conventions assumed by the comments (helpers are defined above):
      //   p256MulInternal:  y = x * y        p256SqrInternal:  y = x^2
      //   p256SubInternal:  x = y - x        p256MulBy2Inline: x = 2 * y
      //   LDx/LDy/STx/STy:  load/store the four-limb x or y register group.
  1398TEXT ·p256PointAddAsm(SB),0,$392-32
  1399	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1400	// Move input to stack in order to free registers
  1401	MOVD	in1+8(FP), a_ptr
  1402	MOVD	in2+16(FP), b_ptr
  1403
  1404	MOVD	p256const0<>(SB), const0    // limb 1 of p
  1405	MOVD	p256const1<>(SB), const1    // limb 3 of p
  1406
  1407	// Begin point add
  1408	LDx(z2in)
  1409	CALL	p256SqrInternal<>(SB)    // z2^2
  1410	STy(z2sqr)
  1411
  1412	CALL	p256MulInternal<>(SB)    // z2^3 (relies on x still holding z2 after the square)
  1413
  1414	LDx(y1in)
  1415	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1416	STy(s1)
  1417
  1418	LDx(z1in)
  1419	CALL	p256SqrInternal<>(SB)    // z1^2
  1420	STy(z1sqr)
  1421
  1422	CALL	p256MulInternal<>(SB)    // z1^3 (relies on x still holding z1 after the square)
  1423
  1424	LDx(y2in)
  1425	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2 (left in y, not stored)
  1426
  1427	LDx(s1)
  1428	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1429	STx(r)
  1430
  1431	MOVD	$1, t2
  1432	ORR	x0, x1, t0             // Check if zero mod p256
  1433	ORR	x2, x3, t1
  1434	ORR	t1, t0, t0
  1435	CMP	$0, t0
  1436	CSEL	EQ, t2, ZR, hlp1        // hlp1 = 1 if all limbs of r are 0, else 0
  1437
  1438	EOR	$-1, x0, t0             // second form of zero: r equal to p limb by limb
  1439	EOR	const0, x1, t1
  1440	EOR	const1, x3, t3          // limb 2 of p is 0, so x2 is ORed in unchanged below
  1441
  1442	ORR	t0, t1, t0
  1443	ORR	x2, t3, t1
  1444	ORR	t1, t0, t0
  1445	CMP	$0, t0
  1446	CSEL	EQ, t2, hlp1, hlp1      // hlp1 = 1 if r == p, otherwise unchanged
  1447
  1448	LDx(z2sqr)
  1449	LDy(x1in)
  1450	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1451	STy(u1)
  1452
  1453	LDx(z1sqr)
  1454	LDy(x2in)
  1455	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1456	STy(u2)
  1457
  1458	LDx(u1)
  1459	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1460	STx(h)
  1461
  1462	MOVD	$1, t2
  1463	ORR	x0, x1, t0             // Check if zero mod p256
  1464	ORR	x2, x3, t1
  1465	ORR	t1, t0, t0
  1466	CMP	$0, t0
  1467	CSEL	EQ, t2, ZR, hlp0        // hlp0 = 1 if all limbs of h are 0, else 0
  1468
  1469	EOR	$-1, x0, t0             // same comparison against p as for r above
  1470	EOR	const0, x1, t1
  1471	EOR	const1, x3, t3
  1472
  1473	ORR	t0, t1, t0
  1474	ORR	x2, t3, t1
  1475	ORR	t1, t0, t0
  1476	CMP	$0, t0
  1477	CSEL	EQ, t2, hlp0, hlp0      // hlp0 = 1 if h == p, otherwise unchanged
  1478
  1479	AND	hlp0, hlp1, hlp1        // hlp1 = 1 only when both r and h are zero mod p
  1480
  1481	LDx(r)
  1482	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1483	STy(rsqr)
  1484
  1485	LDx(h)
  1486	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1487	STy(hsqr)
  1488
  1489	LDx(h)
  1490	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1491	STy(hcub)
  1492
  1493	LDx(s1)
  1494	CALL	p256MulInternal<>(SB)    // s1 * h^3
  1495	STy(s2)                        // the slot named s2 holds s1 * h^3 from here on
  1496
  1497	LDx(z1in)
  1498	LDy(z2in)
  1499	CALL	p256MulInternal<>(SB)    // z1 * z2
  1500	LDx(h)
  1501	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1502	MOVD	res+0(FP), b_ptr        // b_ptr now points at res for the x3out/y3out/z3out stores
  1503	STy(z3out)
  1504
  1505	LDx(hsqr)
  1506	LDy(u1)
  1507	CALL	p256MulInternal<>(SB)    // h^2 * u1
  1508	STy(u2)                        // the slot named u2 is reused for u1 * h^2
  1509
  1510	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1511	LDy(rsqr)
  1512	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1513
  1514	MOVD	x0, y0                  // move the difference into y so h^3 can be subtracted from it
  1515	MOVD	x1, y1
  1516	MOVD	x2, y2
  1517	MOVD	x3, y3
  1518	LDx(hcub)
  1519	CALL	p256SubInternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3
  1520	STx(x3out)
  1521
  1522	LDy(u2)
  1523	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
  1524
  1525	LDy(r)
  1526	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1527
  1528	LDx(s2)
  1529	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1530	STx(y3out)
  1531
  1532	MOVD	hlp1, R0                // hlp1 is R0 (via res_ptr), so this is a self-move
  1533	MOVD	R0, ret+24(FP)          // return the r == 0 && h == 0 flag
  1534
  1535	RET

View as plain text