...

Text file src/crypto/internal/fips140/nistec/p256_asm_arm64.s

Documentation: crypto/internal/fips140/nistec

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7// This file contains constant-time, 64-bit assembly implementation of
     8// P256. The optimizations performed here are described in detail in:
     9// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10//                          256-bit primes"
    11// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12// https://eprint.iacr.org/2013/816.pdf
    13
    14#include "textflag.h"
    15
    16#define res_ptr R0
    17#define a_ptr R1
    18#define b_ptr R2
    19
    20#define acc0 R3
    21#define acc1 R4
    22#define acc2 R5
    23#define acc3 R6
    24
    25#define acc4 R7
    26#define acc5 R8
    27#define acc6 R9
    28#define acc7 R10
    29#define t0 R11
    30#define t1 R12
    31#define t2 R13
    32#define t3 R14
    33#define const0 R15
    34#define const1 R16
    35
    36#define hlp0 R17
    37#define hlp1 res_ptr
    38
    39#define x0 R19
    40#define x1 R20
    41#define x2 R21
    42#define x3 R22
    43#define y0 R23
    44#define y1 R24
    45#define y2 R25
    46#define y3 R26
    47
    48#define const2 t2
    49#define const3 t3
    50
    51DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    52DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
    53DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    54DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    55DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    56DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    57DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    58DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    59DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    60DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    61DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    62GLOBL p256const0<>(SB), 8, $8
    63GLOBL p256const1<>(SB), 8, $8
    64GLOBL p256ordK0<>(SB), 8, $8
    65GLOBL p256ord<>(SB), 8, $32
    66GLOBL p256one<>(SB), 8, $32
    67
    68/* ---------------------------------------*/
    69// func p256MovCond(res, a, b *P256Point, cond int)
    70// If cond == 0 res=b, else res=a
    71TEXT ·p256MovCond(SB),NOSPLIT,$0
    72	MOVD	res+0(FP), res_ptr
    73	MOVD	a+8(FP), a_ptr
    74	MOVD	b+16(FP), b_ptr
    75	MOVD	cond+24(FP), R3
    76
    77	CMP	$0, R3
    78	// Two remarks:
    79	// 1) Will want to revisit NEON, when support is better
    80	// 2) CSEL might not be constant time on all ARM processors
    81	LDP	0*16(a_ptr), (R4, R5)
    82	LDP	1*16(a_ptr), (R6, R7)
    83	LDP	2*16(a_ptr), (R8, R9)
    84	LDP	0*16(b_ptr), (R16, R17)
    85	LDP	1*16(b_ptr), (R19, R20)
    86	LDP	2*16(b_ptr), (R21, R22)
    87	CSEL	EQ, R16, R4, R4
    88	CSEL	EQ, R17, R5, R5
    89	CSEL	EQ, R19, R6, R6
    90	CSEL	EQ, R20, R7, R7
    91	CSEL	EQ, R21, R8, R8
    92	CSEL	EQ, R22, R9, R9
    93	STP	(R4, R5), 0*16(res_ptr)
    94	STP	(R6, R7), 1*16(res_ptr)
    95	STP	(R8, R9), 2*16(res_ptr)
    96
    97	LDP	3*16(a_ptr), (R4, R5)
    98	LDP	4*16(a_ptr), (R6, R7)
    99	LDP	5*16(a_ptr), (R8, R9)
   100	LDP	3*16(b_ptr), (R16, R17)
   101	LDP	4*16(b_ptr), (R19, R20)
   102	LDP	5*16(b_ptr), (R21, R22)
   103	CSEL	EQ, R16, R4, R4
   104	CSEL	EQ, R17, R5, R5
   105	CSEL	EQ, R19, R6, R6
   106	CSEL	EQ, R20, R7, R7
   107	CSEL	EQ, R21, R8, R8
   108	CSEL	EQ, R22, R9, R9
   109	STP	(R4, R5), 3*16(res_ptr)
   110	STP	(R6, R7), 4*16(res_ptr)
   111	STP	(R8, R9), 5*16(res_ptr)
   112
   113	RET
   114/* ---------------------------------------*/
   115// func p256NegCond(val *p256Element, cond int)
   116TEXT ·p256NegCond(SB),NOSPLIT,$0
   117	MOVD	val+0(FP), a_ptr
   118	MOVD	cond+8(FP), hlp0
   119	MOVD	a_ptr, res_ptr
   120	// acc = poly
   121	MOVD	$-1, acc0
   122	MOVD	p256const0<>(SB), acc1
   123	MOVD	$0, acc2
   124	MOVD	p256const1<>(SB), acc3
   125	// Load the original value
   126	LDP	0*16(a_ptr), (t0, t1)
   127	LDP	1*16(a_ptr), (t2, t3)
   128	// Speculatively subtract
   129	SUBS	t0, acc0
   130	SBCS	t1, acc1
   131	SBCS	t2, acc2
   132	SBC	t3, acc3
   133	// If condition is 0, keep original value
   134	CMP	$0, hlp0
   135	CSEL	EQ, t0, acc0, acc0
   136	CSEL	EQ, t1, acc1, acc1
   137	CSEL	EQ, t2, acc2, acc2
   138	CSEL	EQ, t3, acc3, acc3
   139	// Store result
   140	STP	(acc0, acc1), 0*16(res_ptr)
   141	STP	(acc2, acc3), 1*16(res_ptr)
   142
   143	RET
   144/* ---------------------------------------*/
   145// func p256Sqr(res, in *p256Element, n int)
   146TEXT ·p256Sqr(SB),NOSPLIT,$0
   147	MOVD	res+0(FP), res_ptr
   148	MOVD	in+8(FP), a_ptr
   149	MOVD	n+16(FP), b_ptr
   150
   151	MOVD	p256const0<>(SB), const0
   152	MOVD	p256const1<>(SB), const1
   153
   154	LDP	0*16(a_ptr), (x0, x1)
   155	LDP	1*16(a_ptr), (x2, x3)
   156
   157sqrLoop:
   158	SUB	$1, b_ptr
   159	CALL	p256SqrInternal<>(SB)
   160	MOVD	y0, x0
   161	MOVD	y1, x1
   162	MOVD	y2, x2
   163	MOVD	y3, x3
   164	CBNZ	b_ptr, sqrLoop
   165
   166	STP	(y0, y1), 0*16(res_ptr)
   167	STP	(y2, y3), 1*16(res_ptr)
   168	RET
   169/* ---------------------------------------*/
   170// func p256Mul(res, in1, in2 *p256Element)
   171TEXT ·p256Mul(SB),NOSPLIT,$0
   172	MOVD	res+0(FP), res_ptr
   173	MOVD	in1+8(FP), a_ptr
   174	MOVD	in2+16(FP), b_ptr
   175
   176	MOVD	p256const0<>(SB), const0
   177	MOVD	p256const1<>(SB), const1
   178
   179	LDP	0*16(a_ptr), (x0, x1)
   180	LDP	1*16(a_ptr), (x2, x3)
   181
   182	LDP	0*16(b_ptr), (y0, y1)
   183	LDP	1*16(b_ptr), (y2, y3)
   184
   185	CALL	p256MulInternal<>(SB)
   186
   187	STP	(y0, y1), 0*16(res_ptr)
   188	STP	(y2, y3), 1*16(res_ptr)
   189	RET
   190/* ---------------------------------------*/
   191// func p256FromMont(res, in *p256Element)
   192TEXT ·p256FromMont(SB),NOSPLIT,$0
   193	MOVD	res+0(FP), res_ptr
   194	MOVD	in+8(FP), a_ptr
   195
   196	MOVD	p256const0<>(SB), const0
   197	MOVD	p256const1<>(SB), const1
   198
   199	LDP	0*16(a_ptr), (acc0, acc1)
   200	LDP	1*16(a_ptr), (acc2, acc3)
   201	// Only reduce, no multiplications are needed
   202	// First reduction step
   203	ADDS	acc0<<32, acc1, acc1
   204	LSR	$32, acc0, t0
   205	MUL	acc0, const1, t1
   206	UMULH	acc0, const1, acc0
   207	ADCS	t0, acc2
   208	ADCS	t1, acc3
   209	ADC	$0, acc0
   210	// Second reduction step
   211	ADDS	acc1<<32, acc2, acc2
   212	LSR	$32, acc1, t0
   213	MUL	acc1, const1, t1
   214	UMULH	acc1, const1, acc1
   215	ADCS	t0, acc3
   216	ADCS	t1, acc0
   217	ADC	$0, acc1
   218	// Third reduction step
   219	ADDS	acc2<<32, acc3, acc3
   220	LSR	$32, acc2, t0
   221	MUL	acc2, const1, t1
   222	UMULH	acc2, const1, acc2
   223	ADCS	t0, acc0
   224	ADCS	t1, acc1
   225	ADC	$0, acc2
   226	// Last reduction step
   227	ADDS	acc3<<32, acc0, acc0
   228	LSR	$32, acc3, t0
   229	MUL	acc3, const1, t1
   230	UMULH	acc3, const1, acc3
   231	ADCS	t0, acc1
   232	ADCS	t1, acc2
   233	ADC	$0, acc3
   234
   235	SUBS	$-1, acc0, t0
   236	SBCS	const0, acc1, t1
   237	SBCS	$0, acc2, t2
   238	SBCS	const1, acc3, t3
   239
   240	CSEL	CS, t0, acc0, acc0
   241	CSEL	CS, t1, acc1, acc1
   242	CSEL	CS, t2, acc2, acc2
   243	CSEL	CS, t3, acc3, acc3
   244
   245	STP	(acc0, acc1), 0*16(res_ptr)
   246	STP	(acc2, acc3), 1*16(res_ptr)
   247
   248	RET
   249/* ---------------------------------------*/
   250// func p256Select(res *P256Point, table *p256Table, idx int)
   251TEXT ·p256Select(SB),NOSPLIT,$0
   252	MOVD	idx+16(FP), const0
   253	MOVD	table+8(FP), b_ptr
   254	MOVD	res+0(FP), res_ptr
   255
   256	EOR	x0, x0, x0
   257	EOR	x1, x1, x1
   258	EOR	x2, x2, x2
   259	EOR	x3, x3, x3
   260	EOR	y0, y0, y0
   261	EOR	y1, y1, y1
   262	EOR	y2, y2, y2
   263	EOR	y3, y3, y3
   264	EOR	t0, t0, t0
   265	EOR	t1, t1, t1
   266	EOR	t2, t2, t2
   267	EOR	t3, t3, t3
   268
   269	MOVD	$0, const1
   270
   271loop_select:
   272		ADD	$1, const1
   273		CMP	const0, const1
   274		LDP.P	16(b_ptr), (acc0, acc1)
   275		CSEL	EQ, acc0, x0, x0
   276		CSEL	EQ, acc1, x1, x1
   277		LDP.P	16(b_ptr), (acc2, acc3)
   278		CSEL	EQ, acc2, x2, x2
   279		CSEL	EQ, acc3, x3, x3
   280		LDP.P	16(b_ptr), (acc4, acc5)
   281		CSEL	EQ, acc4, y0, y0
   282		CSEL	EQ, acc5, y1, y1
   283		LDP.P	16(b_ptr), (acc6, acc7)
   284		CSEL	EQ, acc6, y2, y2
   285		CSEL	EQ, acc7, y3, y3
   286		LDP.P	16(b_ptr), (acc0, acc1)
   287		CSEL	EQ, acc0, t0, t0
   288		CSEL	EQ, acc1, t1, t1
   289		LDP.P	16(b_ptr), (acc2, acc3)
   290		CSEL	EQ, acc2, t2, t2
   291		CSEL	EQ, acc3, t3, t3
   292
   293		CMP	$16, const1
   294		BNE	loop_select
   295
   296	STP	(x0, x1), 0*16(res_ptr)
   297	STP	(x2, x3), 1*16(res_ptr)
   298	STP	(y0, y1), 2*16(res_ptr)
   299	STP	(y2, y3), 3*16(res_ptr)
   300	STP	(t0, t1), 4*16(res_ptr)
   301	STP	(t2, t3), 5*16(res_ptr)
   302	RET
   303/* ---------------------------------------*/
   304// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   305TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   306	MOVD	idx+16(FP), t0
   307	MOVD	table+8(FP), t1
   308	MOVD	res+0(FP), res_ptr
   309
   310	EOR	x0, x0, x0
   311	EOR	x1, x1, x1
   312	EOR	x2, x2, x2
   313	EOR	x3, x3, x3
   314	EOR	y0, y0, y0
   315	EOR	y1, y1, y1
   316	EOR	y2, y2, y2
   317	EOR	y3, y3, y3
   318
   319	MOVD	$0, t2
   320
   321loop_select:
   322		ADD	$1, t2
   323		CMP	t0, t2
   324		LDP.P	16(t1), (acc0, acc1)
   325		CSEL	EQ, acc0, x0, x0
   326		CSEL	EQ, acc1, x1, x1
   327		LDP.P	16(t1), (acc2, acc3)
   328		CSEL	EQ, acc2, x2, x2
   329		CSEL	EQ, acc3, x3, x3
   330		LDP.P	16(t1), (acc4, acc5)
   331		CSEL	EQ, acc4, y0, y0
   332		CSEL	EQ, acc5, y1, y1
   333		LDP.P	16(t1), (acc6, acc7)
   334		CSEL	EQ, acc6, y2, y2
   335		CSEL	EQ, acc7, y3, y3
   336
   337		CMP	$32, t2
   338		BNE	loop_select
   339
   340	STP	(x0, x1), 0*16(res_ptr)
   341	STP	(x2, x3), 1*16(res_ptr)
   342	STP	(y0, y1), 2*16(res_ptr)
   343	STP	(y2, y3), 3*16(res_ptr)
   344	RET
   345/* ---------------------------------------*/
   346// func p256OrdSqr(res, in *p256OrdElement, n int)
   347TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   348	MOVD	in+8(FP), a_ptr
   349	MOVD	n+16(FP), b_ptr
   350
   351	MOVD	p256ordK0<>(SB), hlp1
   352	LDP	p256ord<>+0x00(SB), (const0, const1)
   353	LDP	p256ord<>+0x10(SB), (const2, const3)
   354
   355	LDP	0*16(a_ptr), (x0, x1)
   356	LDP	1*16(a_ptr), (x2, x3)
   357
   358ordSqrLoop:
   359	SUB	$1, b_ptr
   360
   361	// x[1:] * x[0]
   362	MUL	x0, x1, acc1
   363	UMULH	x0, x1, acc2
   364
   365	MUL	x0, x2, t0
   366	ADDS	t0, acc2, acc2
   367	UMULH	x0, x2, acc3
   368
   369	MUL	x0, x3, t0
   370	ADCS	t0, acc3, acc3
   371	UMULH	x0, x3, acc4
   372	ADC	$0, acc4, acc4
   373	// x[2:] * x[1]
   374	MUL	x1, x2, t0
   375	ADDS	t0, acc3
   376	UMULH	x1, x2, t1
   377	ADCS	t1, acc4
   378	ADC	$0, ZR, acc5
   379
   380	MUL	x1, x3, t0
   381	ADDS	t0, acc4
   382	UMULH	x1, x3, t1
   383	ADC	t1, acc5
   384	// x[3] * x[2]
   385	MUL	x2, x3, t0
   386	ADDS	t0, acc5
   387	UMULH	x2, x3, acc6
   388	ADC	$0, acc6
   389
   390	MOVD	$0, acc7
   391	// *2
   392	ADDS	acc1, acc1
   393	ADCS	acc2, acc2
   394	ADCS	acc3, acc3
   395	ADCS	acc4, acc4
   396	ADCS	acc5, acc5
   397	ADCS	acc6, acc6
   398	ADC	$0, acc7
   399	// Missing products
   400	MUL	x0, x0, acc0
   401	UMULH	x0, x0, t0
   402	ADDS	t0, acc1, acc1
   403
   404	MUL	x1, x1, t0
   405	ADCS	t0, acc2, acc2
   406	UMULH	x1, x1, t1
   407	ADCS	t1, acc3, acc3
   408
   409	MUL	x2, x2, t0
   410	ADCS	t0, acc4, acc4
   411	UMULH	x2, x2, t1
   412	ADCS	t1, acc5, acc5
   413
   414	MUL	x3, x3, t0
   415	ADCS	t0, acc6, acc6
   416	UMULH	x3, x3, t1
   417	ADC	t1, acc7, acc7
   418	// First reduction step
   419	MUL	acc0, hlp1, hlp0
   420
   421	MUL	const0, hlp1, t0
   422	ADDS	t0, acc0, acc0
   423	UMULH	const0, hlp0, t1
   424
   425	MUL	const1, hlp0, t0
   426	ADCS	t0, acc1, acc1
   427	UMULH	const1, hlp0, y0
   428
   429	MUL	const2, hlp0, t0
   430	ADCS	t0, acc2, acc2
   431	UMULH	const2, hlp0, acc0
   432
   433	MUL	const3, hlp0, t0
   434	ADCS	t0, acc3, acc3
   435
   436	UMULH	const3, hlp0, hlp0
   437	ADC	$0, hlp0
   438
   439	ADDS	t1, acc1, acc1
   440	ADCS	y0, acc2, acc2
   441	ADCS	acc0, acc3, acc3
   442	ADC	$0, hlp0, acc0
   443	// Second reduction step
   444	MUL	acc1, hlp1, hlp0
   445
   446	MUL	const0, hlp1, t0
   447	ADDS	t0, acc1, acc1
   448	UMULH	const0, hlp0, t1
   449
   450	MUL	const1, hlp0, t0
   451	ADCS	t0, acc2, acc2
   452	UMULH	const1, hlp0, y0
   453
   454	MUL	const2, hlp0, t0
   455	ADCS	t0, acc3, acc3
   456	UMULH	const2, hlp0, acc1
   457
   458	MUL	const3, hlp0, t0
   459	ADCS	t0, acc0, acc0
   460
   461	UMULH	const3, hlp0, hlp0
   462	ADC	$0, hlp0
   463
   464	ADDS	t1, acc2, acc2
   465	ADCS	y0, acc3, acc3
   466	ADCS	acc1, acc0, acc0
   467	ADC	$0, hlp0, acc1
   468	// Third reduction step
   469	MUL	acc2, hlp1, hlp0
   470
   471	MUL	const0, hlp1, t0
   472	ADDS	t0, acc2, acc2
   473	UMULH	const0, hlp0, t1
   474
   475	MUL	const1, hlp0, t0
   476	ADCS	t0, acc3, acc3
   477	UMULH	const1, hlp0, y0
   478
   479	MUL	const2, hlp0, t0
   480	ADCS	t0, acc0, acc0
   481	UMULH	const2, hlp0, acc2
   482
   483	MUL	const3, hlp0, t0
   484	ADCS	t0, acc1, acc1
   485
   486	UMULH	const3, hlp0, hlp0
   487	ADC	$0, hlp0
   488
   489	ADDS	t1, acc3, acc3
   490	ADCS	y0, acc0, acc0
   491	ADCS	acc2, acc1, acc1
   492	ADC	$0, hlp0, acc2
   493
   494	// Last reduction step
   495	MUL	acc3, hlp1, hlp0
   496
   497	MUL	const0, hlp1, t0
   498	ADDS	t0, acc3, acc3
   499	UMULH	const0, hlp0, t1
   500
   501	MUL	const1, hlp0, t0
   502	ADCS	t0, acc0, acc0
   503	UMULH	const1, hlp0, y0
   504
   505	MUL	const2, hlp0, t0
   506	ADCS	t0, acc1, acc1
   507	UMULH	const2, hlp0, acc3
   508
   509	MUL	const3, hlp0, t0
   510	ADCS	t0, acc2, acc2
   511
   512	UMULH	const3, hlp0, hlp0
   513	ADC	$0, acc7
   514
   515	ADDS	t1, acc0, acc0
   516	ADCS	y0, acc1, acc1
   517	ADCS	acc3, acc2, acc2
   518	ADC	$0, hlp0, acc3
   519
   520	ADDS	acc4, acc0, acc0
   521	ADCS	acc5, acc1, acc1
   522	ADCS	acc6, acc2, acc2
   523	ADCS	acc7, acc3, acc3
   524	ADC	$0, ZR, acc4
   525
   526	SUBS	const0, acc0, y0
   527	SBCS	const1, acc1, y1
   528	SBCS	const2, acc2, y2
   529	SBCS	const3, acc3, y3
   530	SBCS	$0, acc4, acc4
   531
   532	CSEL	CS, y0, acc0, x0
   533	CSEL	CS, y1, acc1, x1
   534	CSEL	CS, y2, acc2, x2
   535	CSEL	CS, y3, acc3, x3
   536
   537	CBNZ	b_ptr, ordSqrLoop
   538
   539	MOVD	res+0(FP), res_ptr
   540	STP	(x0, x1), 0*16(res_ptr)
   541	STP	(x2, x3), 1*16(res_ptr)
   542
   543	RET
   544/* ---------------------------------------*/
   545// func p256OrdMul(res, in1, in2 *p256OrdElement)
   546TEXT ·p256OrdMul(SB),NOSPLIT,$0
   547	MOVD	in1+8(FP), a_ptr
   548	MOVD	in2+16(FP), b_ptr
   549
   550	MOVD	p256ordK0<>(SB), hlp1
   551	LDP	p256ord<>+0x00(SB), (const0, const1)
   552	LDP	p256ord<>+0x10(SB), (const2, const3)
   553
   554	LDP	0*16(a_ptr), (x0, x1)
   555	LDP	1*16(a_ptr), (x2, x3)
   556	LDP	0*16(b_ptr), (y0, y1)
   557	LDP	1*16(b_ptr), (y2, y3)
   558
   559	// y[0] * x
   560	MUL	y0, x0, acc0
   561	UMULH	y0, x0, acc1
   562
   563	MUL	y0, x1, t0
   564	ADDS	t0, acc1
   565	UMULH	y0, x1, acc2
   566
   567	MUL	y0, x2, t0
   568	ADCS	t0, acc2
   569	UMULH	y0, x2, acc3
   570
   571	MUL	y0, x3, t0
   572	ADCS	t0, acc3
   573	UMULH	y0, x3, acc4
   574	ADC	$0, acc4
   575	// First reduction step
   576	MUL	acc0, hlp1, hlp0
   577
   578	MUL	const0, hlp1, t0
   579	ADDS	t0, acc0, acc0
   580	UMULH	const0, hlp0, t1
   581
   582	MUL	const1, hlp0, t0
   583	ADCS	t0, acc1, acc1
   584	UMULH	const1, hlp0, y0
   585
   586	MUL	const2, hlp0, t0
   587	ADCS	t0, acc2, acc2
   588	UMULH	const2, hlp0, acc0
   589
   590	MUL	const3, hlp0, t0
   591	ADCS	t0, acc3, acc3
   592
   593	UMULH	const3, hlp0, hlp0
   594	ADC	$0, acc4
   595
   596	ADDS	t1, acc1, acc1
   597	ADCS	y0, acc2, acc2
   598	ADCS	acc0, acc3, acc3
   599	ADC	$0, hlp0, acc0
   600	// y[1] * x
   601	MUL	y1, x0, t0
   602	ADDS	t0, acc1
   603	UMULH	y1, x0, t1
   604
   605	MUL	y1, x1, t0
   606	ADCS	t0, acc2
   607	UMULH	y1, x1, hlp0
   608
   609	MUL	y1, x2, t0
   610	ADCS	t0, acc3
   611	UMULH	y1, x2, y0
   612
   613	MUL	y1, x3, t0
   614	ADCS	t0, acc4
   615	UMULH	y1, x3, y1
   616	ADC	$0, ZR, acc5
   617
   618	ADDS	t1, acc2
   619	ADCS	hlp0, acc3
   620	ADCS	y0, acc4
   621	ADC	y1, acc5
   622	// Second reduction step
   623	MUL	acc1, hlp1, hlp0
   624
   625	MUL	const0, hlp1, t0
   626	ADDS	t0, acc1, acc1
   627	UMULH	const0, hlp0, t1
   628
   629	MUL	const1, hlp0, t0
   630	ADCS	t0, acc2, acc2
   631	UMULH	const1, hlp0, y0
   632
   633	MUL	const2, hlp0, t0
   634	ADCS	t0, acc3, acc3
   635	UMULH	const2, hlp0, acc1
   636
   637	MUL	const3, hlp0, t0
   638	ADCS	t0, acc0, acc0
   639
   640	UMULH	const3, hlp0, hlp0
   641	ADC	$0, acc5
   642
   643	ADDS	t1, acc2, acc2
   644	ADCS	y0, acc3, acc3
   645	ADCS	acc1, acc0, acc0
   646	ADC	$0, hlp0, acc1
   647	// y[2] * x
   648	MUL	y2, x0, t0
   649	ADDS	t0, acc2
   650	UMULH	y2, x0, t1
   651
   652	MUL	y2, x1, t0
   653	ADCS	t0, acc3
   654	UMULH	y2, x1, hlp0
   655
   656	MUL	y2, x2, t0
   657	ADCS	t0, acc4
   658	UMULH	y2, x2, y0
   659
   660	MUL	y2, x3, t0
   661	ADCS	t0, acc5
   662	UMULH	y2, x3, y1
   663	ADC	$0, ZR, acc6
   664
   665	ADDS	t1, acc3
   666	ADCS	hlp0, acc4
   667	ADCS	y0, acc5
   668	ADC	y1, acc6
   669	// Third reduction step
   670	MUL	acc2, hlp1, hlp0
   671
   672	MUL	const0, hlp1, t0
   673	ADDS	t0, acc2, acc2
   674	UMULH	const0, hlp0, t1
   675
   676	MUL	const1, hlp0, t0
   677	ADCS	t0, acc3, acc3
   678	UMULH	const1, hlp0, y0
   679
   680	MUL	const2, hlp0, t0
   681	ADCS	t0, acc0, acc0
   682	UMULH	const2, hlp0, acc2
   683
   684	MUL	const3, hlp0, t0
   685	ADCS	t0, acc1, acc1
   686
   687	UMULH	const3, hlp0, hlp0
   688	ADC	$0, acc6
   689
   690	ADDS	t1, acc3, acc3
   691	ADCS	y0, acc0, acc0
   692	ADCS	acc2, acc1, acc1
   693	ADC	$0, hlp0, acc2
   694	// y[3] * x
   695	MUL	y3, x0, t0
   696	ADDS	t0, acc3
   697	UMULH	y3, x0, t1
   698
   699	MUL	y3, x1, t0
   700	ADCS	t0, acc4
   701	UMULH	y3, x1, hlp0
   702
   703	MUL	y3, x2, t0
   704	ADCS	t0, acc5
   705	UMULH	y3, x2, y0
   706
   707	MUL	y3, x3, t0
   708	ADCS	t0, acc6
   709	UMULH	y3, x3, y1
   710	ADC	$0, ZR, acc7
   711
   712	ADDS	t1, acc4
   713	ADCS	hlp0, acc5
   714	ADCS	y0, acc6
   715	ADC	y1, acc7
   716	// Last reduction step
   717	MUL	acc3, hlp1, hlp0
   718
   719	MUL	const0, hlp1, t0
   720	ADDS	t0, acc3, acc3
   721	UMULH	const0, hlp0, t1
   722
   723	MUL	const1, hlp0, t0
   724	ADCS	t0, acc0, acc0
   725	UMULH	const1, hlp0, y0
   726
   727	MUL	const2, hlp0, t0
   728	ADCS	t0, acc1, acc1
   729	UMULH	const2, hlp0, acc3
   730
   731	MUL	const3, hlp0, t0
   732	ADCS	t0, acc2, acc2
   733
   734	UMULH	const3, hlp0, hlp0
   735	ADC	$0, acc7
   736
   737	ADDS	t1, acc0, acc0
   738	ADCS	y0, acc1, acc1
   739	ADCS	acc3, acc2, acc2
   740	ADC	$0, hlp0, acc3
   741
   742	ADDS	acc4, acc0, acc0
   743	ADCS	acc5, acc1, acc1
   744	ADCS	acc6, acc2, acc2
   745	ADCS	acc7, acc3, acc3
   746	ADC	$0, ZR, acc4
   747
   748	SUBS	const0, acc0, t0
   749	SBCS	const1, acc1, t1
   750	SBCS	const2, acc2, t2
   751	SBCS	const3, acc3, t3
   752	SBCS	$0, acc4, acc4
   753
   754	CSEL	CS, t0, acc0, acc0
   755	CSEL	CS, t1, acc1, acc1
   756	CSEL	CS, t2, acc2, acc2
   757	CSEL	CS, t3, acc3, acc3
   758
   759	MOVD	res+0(FP), res_ptr
   760	STP	(acc0, acc1), 0*16(res_ptr)
   761	STP	(acc2, acc3), 1*16(res_ptr)
   762
   763	RET
   764/* ---------------------------------------*/
   765TEXT p256SubInternal<>(SB),NOSPLIT,$0
   766	SUBS	x0, y0, acc0
   767	SBCS	x1, y1, acc1
   768	SBCS	x2, y2, acc2
   769	SBCS	x3, y3, acc3
   770	SBC	$0, ZR, t0
   771
   772	ADDS	$-1, acc0, acc4
   773	ADCS	const0, acc1, acc5
   774	ADCS	$0, acc2, acc6
   775	ADC	const1, acc3, acc7
   776
   777	ANDS	$1, t0
   778	CSEL	EQ, acc0, acc4, x0
   779	CSEL	EQ, acc1, acc5, x1
   780	CSEL	EQ, acc2, acc6, x2
   781	CSEL	EQ, acc3, acc7, x3
   782
   783	RET
   784/* ---------------------------------------*/
   785TEXT p256SqrInternal<>(SB),NOSPLIT,$0
   786	// x[1:] * x[0]
   787	MUL	x0, x1, acc1
   788	UMULH	x0, x1, acc2
   789
   790	MUL	x0, x2, t0
   791	ADDS	t0, acc2, acc2
   792	UMULH	x0, x2, acc3
   793
   794	MUL	x0, x3, t0
   795	ADCS	t0, acc3, acc3
   796	UMULH	x0, x3, acc4
   797	ADC	$0, acc4, acc4
   798	// x[2:] * x[1]
   799	MUL	x1, x2, t0
   800	ADDS	t0, acc3
   801	UMULH	x1, x2, t1
   802	ADCS	t1, acc4
   803	ADC	$0, ZR, acc5
   804
   805	MUL	x1, x3, t0
   806	ADDS	t0, acc4
   807	UMULH	x1, x3, t1
   808	ADC	t1, acc5
   809	// x[3] * x[2]
   810	MUL	x2, x3, t0
   811	ADDS	t0, acc5
   812	UMULH	x2, x3, acc6
   813	ADC	$0, acc6
   814
   815	MOVD	$0, acc7
   816	// *2
   817	ADDS	acc1, acc1
   818	ADCS	acc2, acc2
   819	ADCS	acc3, acc3
   820	ADCS	acc4, acc4
   821	ADCS	acc5, acc5
   822	ADCS	acc6, acc6
   823	ADC	$0, acc7
   824	// Missing products
   825	MUL	x0, x0, acc0
   826	UMULH	x0, x0, t0
   827	ADDS	t0, acc1, acc1
   828
   829	MUL	x1, x1, t0
   830	ADCS	t0, acc2, acc2
   831	UMULH	x1, x1, t1
   832	ADCS	t1, acc3, acc3
   833
   834	MUL	x2, x2, t0
   835	ADCS	t0, acc4, acc4
   836	UMULH	x2, x2, t1
   837	ADCS	t1, acc5, acc5
   838
   839	MUL	x3, x3, t0
   840	ADCS	t0, acc6, acc6
   841	UMULH	x3, x3, t1
   842	ADCS	t1, acc7, acc7
   843	// First reduction step
   844	ADDS	acc0<<32, acc1, acc1
   845	LSR	$32, acc0, t0
   846	MUL	acc0, const1, t1
   847	UMULH	acc0, const1, acc0
   848	ADCS	t0, acc2, acc2
   849	ADCS	t1, acc3, acc3
   850	ADC	$0, acc0, acc0
   851	// Second reduction step
   852	ADDS	acc1<<32, acc2, acc2
   853	LSR	$32, acc1, t0
   854	MUL	acc1, const1, t1
   855	UMULH	acc1, const1, acc1
   856	ADCS	t0, acc3, acc3
   857	ADCS	t1, acc0, acc0
   858	ADC	$0, acc1, acc1
   859	// Third reduction step
   860	ADDS	acc2<<32, acc3, acc3
   861	LSR	$32, acc2, t0
   862	MUL	acc2, const1, t1
   863	UMULH	acc2, const1, acc2
   864	ADCS	t0, acc0, acc0
   865	ADCS	t1, acc1, acc1
   866	ADC	$0, acc2, acc2
   867	// Last reduction step
   868	ADDS	acc3<<32, acc0, acc0
   869	LSR	$32, acc3, t0
   870	MUL	acc3, const1, t1
   871	UMULH	acc3, const1, acc3
   872	ADCS	t0, acc1, acc1
   873	ADCS	t1, acc2, acc2
   874	ADC	$0, acc3, acc3
   875	// Add bits [511:256] of the sqr result
   876	ADDS	acc4, acc0, acc0
   877	ADCS	acc5, acc1, acc1
   878	ADCS	acc6, acc2, acc2
   879	ADCS	acc7, acc3, acc3
   880	ADC	$0, ZR, acc4
   881
   882	SUBS	$-1, acc0, t0
   883	SBCS	const0, acc1, t1
   884	SBCS	$0, acc2, t2
   885	SBCS	const1, acc3, t3
   886	SBCS	$0, acc4, acc4
   887
   888	CSEL	CS, t0, acc0, y0
   889	CSEL	CS, t1, acc1, y1
   890	CSEL	CS, t2, acc2, y2
   891	CSEL	CS, t3, acc3, y3
   892	RET
   893/* ---------------------------------------*/
   894TEXT p256MulInternal<>(SB),NOSPLIT,$0
   895	// y[0] * x
   896	MUL	y0, x0, acc0
   897	UMULH	y0, x0, acc1
   898
   899	MUL	y0, x1, t0
   900	ADDS	t0, acc1
   901	UMULH	y0, x1, acc2
   902
   903	MUL	y0, x2, t0
   904	ADCS	t0, acc2
   905	UMULH	y0, x2, acc3
   906
   907	MUL	y0, x3, t0
   908	ADCS	t0, acc3
   909	UMULH	y0, x3, acc4
   910	ADC	$0, acc4
   911	// First reduction step
   912	ADDS	acc0<<32, acc1, acc1
   913	LSR	$32, acc0, t0
   914	MUL	acc0, const1, t1
   915	UMULH	acc0, const1, acc0
   916	ADCS	t0, acc2
   917	ADCS	t1, acc3
   918	ADC	$0, acc0
   919	// y[1] * x
   920	MUL	y1, x0, t0
   921	ADDS	t0, acc1
   922	UMULH	y1, x0, t1
   923
   924	MUL	y1, x1, t0
   925	ADCS	t0, acc2
   926	UMULH	y1, x1, t2
   927
   928	MUL	y1, x2, t0
   929	ADCS	t0, acc3
   930	UMULH	y1, x2, t3
   931
   932	MUL	y1, x3, t0
   933	ADCS	t0, acc4
   934	UMULH	y1, x3, hlp0
   935	ADC	$0, ZR, acc5
   936
   937	ADDS	t1, acc2
   938	ADCS	t2, acc3
   939	ADCS	t3, acc4
   940	ADC	hlp0, acc5
   941	// Second reduction step
   942	ADDS	acc1<<32, acc2, acc2
   943	LSR	$32, acc1, t0
   944	MUL	acc1, const1, t1
   945	UMULH	acc1, const1, acc1
   946	ADCS	t0, acc3
   947	ADCS	t1, acc0
   948	ADC	$0, acc1
   949	// y[2] * x
   950	MUL	y2, x0, t0
   951	ADDS	t0, acc2
   952	UMULH	y2, x0, t1
   953
   954	MUL	y2, x1, t0
   955	ADCS	t0, acc3
   956	UMULH	y2, x1, t2
   957
   958	MUL	y2, x2, t0
   959	ADCS	t0, acc4
   960	UMULH	y2, x2, t3
   961
   962	MUL	y2, x3, t0
   963	ADCS	t0, acc5
   964	UMULH	y2, x3, hlp0
   965	ADC	$0, ZR, acc6
   966
   967	ADDS	t1, acc3
   968	ADCS	t2, acc4
   969	ADCS	t3, acc5
   970	ADC	hlp0, acc6
   971	// Third reduction step
   972	ADDS	acc2<<32, acc3, acc3
   973	LSR	$32, acc2, t0
   974	MUL	acc2, const1, t1
   975	UMULH	acc2, const1, acc2
   976	ADCS	t0, acc0
   977	ADCS	t1, acc1
   978	ADC	$0, acc2
   979	// y[3] * x
   980	MUL	y3, x0, t0
   981	ADDS	t0, acc3
   982	UMULH	y3, x0, t1
   983
   984	MUL	y3, x1, t0
   985	ADCS	t0, acc4
   986	UMULH	y3, x1, t2
   987
   988	MUL	y3, x2, t0
   989	ADCS	t0, acc5
   990	UMULH	y3, x2, t3
   991
   992	MUL	y3, x3, t0
   993	ADCS	t0, acc6
   994	UMULH	y3, x3, hlp0
   995	ADC	$0, ZR, acc7
   996
   997	ADDS	t1, acc4
   998	ADCS	t2, acc5
   999	ADCS	t3, acc6
  1000	ADC	hlp0, acc7
  1001	// Last reduction step
  1002	ADDS	acc3<<32, acc0, acc0
  1003	LSR	$32, acc3, t0
  1004	MUL	acc3, const1, t1
  1005	UMULH	acc3, const1, acc3
  1006	ADCS	t0, acc1
  1007	ADCS	t1, acc2
  1008	ADC	$0, acc3
  1009	// Add bits [511:256] of the mul result
  1010	ADDS	acc4, acc0, acc0
  1011	ADCS	acc5, acc1, acc1
  1012	ADCS	acc6, acc2, acc2
  1013	ADCS	acc7, acc3, acc3
  1014	ADC	$0, ZR, acc4
  1015
  1016	SUBS	$-1, acc0, t0
  1017	SBCS	const0, acc1, t1
  1018	SBCS	$0, acc2, t2
  1019	SBCS	const1, acc3, t3
  1020	SBCS	$0, acc4, acc4
  1021
  1022	CSEL	CS, t0, acc0, y0
  1023	CSEL	CS, t1, acc1, y1
  1024	CSEL	CS, t2, acc2, y2
  1025	CSEL	CS, t3, acc3, y3
  1026	RET
  1027/* ---------------------------------------*/
  1028#define p256MulBy2Inline       \
  1029	ADDS	y0, y0, x0;    \
  1030	ADCS	y1, y1, x1;    \
  1031	ADCS	y2, y2, x2;    \
  1032	ADCS	y3, y3, x3;    \
  1033	ADC	$0, ZR, hlp0;  \
  1034	SUBS	$-1, x0, t0;   \
  1035	SBCS	const0, x1, t1;\
  1036	SBCS	$0, x2, t2;    \
  1037	SBCS	const1, x3, t3;\
  1038	SBCS	$0, hlp0, hlp0;\
  1039	CSEL	CC, x0, t0, x0;\
  1040	CSEL	CC, x1, t1, x1;\
  1041	CSEL	CC, x2, t2, x2;\
  1042	CSEL	CC, x3, t3, x3;
  1043/* ---------------------------------------*/
  1044#define x1in(off) (off)(a_ptr)
  1045#define y1in(off) (off + 32)(a_ptr)
  1046#define z1in(off) (off + 64)(a_ptr)
  1047#define x2in(off) (off)(b_ptr)
  1048#define z2in(off) (off + 64)(b_ptr)
  1049#define x3out(off) (off)(res_ptr)
  1050#define y3out(off) (off + 32)(res_ptr)
  1051#define z3out(off) (off + 64)(res_ptr)
  1052#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1053#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1054#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1055#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1056/* ---------------------------------------*/
  1057#define y2in(off)  (32*0 + 8 + off)(RSP)
  1058#define s2(off)    (32*1 + 8 + off)(RSP)
  1059#define z1sqr(off) (32*2 + 8 + off)(RSP)
  1060#define h(off)	   (32*3 + 8 + off)(RSP)
  1061#define r(off)	   (32*4 + 8 + off)(RSP)
  1062#define hsqr(off)  (32*5 + 8 + off)(RSP)
  1063#define rsqr(off)  (32*6 + 8 + off)(RSP)
  1064#define hcub(off)  (32*7 + 8 + off)(RSP)
  1065
  1066#define z2sqr(off) (32*8 + 8 + off)(RSP)
  1067#define s1(off) (32*9 + 8 + off)(RSP)
  1068#define u1(off) (32*10 + 8 + off)(RSP)
  1069#define u2(off) (32*11 + 8 + off)(RSP)
  1070
  1071// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1072TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1073	MOVD	in1+8(FP), a_ptr
  1074	MOVD	in2+16(FP), b_ptr
  1075	MOVD	sign+24(FP), hlp0
  1076	MOVD	sel+32(FP), hlp1
  1077	MOVD	zero+40(FP), t2
  1078
  1079	MOVD	$1, t0
  1080	CMP	$0, t2
  1081	CSEL	EQ, ZR, t0, t2
  1082	CMP	$0, hlp1
  1083	CSEL	EQ, ZR, t0, hlp1
  1084
  1085	MOVD	p256const0<>(SB), const0
  1086	MOVD	p256const1<>(SB), const1
  1087	EOR	t2<<1, hlp1
  1088
  1089	// Negate y2in based on sign
  1090	LDP	2*16(b_ptr), (y0, y1)
  1091	LDP	3*16(b_ptr), (y2, y3)
  1092	MOVD	$-1, acc0
  1093
  1094	SUBS	y0, acc0, acc0
  1095	SBCS	y1, const0, acc1
  1096	SBCS	y2, ZR, acc2
  1097	SBCS	y3, const1, acc3
  1098	SBC	$0, ZR, t0
  1099
  1100	ADDS	$-1, acc0, acc4
  1101	ADCS	const0, acc1, acc5
  1102	ADCS	$0, acc2, acc6
  1103	ADCS	const1, acc3, acc7
  1104	ADC	$0, t0, t0
  1105
  1106	CMP	$0, t0
  1107	CSEL	EQ, acc4, acc0, acc0
  1108	CSEL	EQ, acc5, acc1, acc1
  1109	CSEL	EQ, acc6, acc2, acc2
  1110	CSEL	EQ, acc7, acc3, acc3
  1111	// If condition is 0, keep original value
  1112	CMP	$0, hlp0
  1113	CSEL	EQ, y0, acc0, y0
  1114	CSEL	EQ, y1, acc1, y1
  1115	CSEL	EQ, y2, acc2, y2
  1116	CSEL	EQ, y3, acc3, y3
  1117	// Store result
  1118	STy(y2in)
  1119	// Begin point add
  1120	LDx(z1in)
  1121	CALL	p256SqrInternal<>(SB)    // z1ˆ2
  1122	STy(z1sqr)
  1123
  1124	LDx(x2in)
  1125	CALL	p256MulInternal<>(SB)    // x2 * z1ˆ2
  1126
  1127	LDx(x1in)
  1128	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1129	STx(h)
  1130
  1131	LDy(z1in)
  1132	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1133
  1134	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1135	LDP	5*16(a_ptr), (acc2, acc3)
  1136	ANDS	$1, hlp1, ZR
  1137	CSEL	EQ, acc0, y0, y0
  1138	CSEL	EQ, acc1, y1, y1
  1139	CSEL	EQ, acc2, y2, y2
  1140	CSEL	EQ, acc3, y3, y3
  1141	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1142	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1143	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1144	CSEL	EQ, acc0, y0, y0
  1145	CSEL	EQ, acc1, y1, y1
  1146	CSEL	EQ, acc2, y2, y2
  1147	CSEL	EQ, acc3, y3, y3
  1148	LDx(z1in)
  1149	MOVD	res+0(FP), t0
  1150	STP	(y0, y1), 4*16(t0)
  1151	STP	(y2, y3), 5*16(t0)
  1152
  1153	LDy(z1sqr)
  1154	CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1155
  1156	LDx(y2in)
  1157	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
  1158	STy(s2)
  1159
  1160	LDx(y1in)
  1161	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1162	STx(r)
  1163
  1164	CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1165	STy	(rsqr)
  1166
  1167	LDx(h)
  1168	CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1169	STy(hsqr)
  1170
  1171	CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1172	STy(hcub)
  1173
  1174	LDx(y1in)
  1175	CALL	p256MulInternal<>(SB)    // y1 * hˆ3
  1176	STy(s2)
  1177
  1178	LDP	hsqr(0*8), (x0, x1)
  1179	LDP	hsqr(2*8), (x2, x3)
  1180	LDP	0*16(a_ptr), (y0, y1)
  1181	LDP	1*16(a_ptr), (y2, y3)
  1182	CALL	p256MulInternal<>(SB)    // u1 * hˆ2
  1183	STP	(y0, y1), h(0*8)
  1184	STP	(y2, y3), h(2*8)
  1185
  1186	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1187
  1188	LDy(rsqr)
  1189	CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1190
  1191	MOVD	x0, y0
  1192	MOVD	x1, y1
  1193	MOVD	x2, y2
  1194	MOVD	x3, y3
  1195	LDx(hcub)
  1196	CALL	p256SubInternal<>(SB)
  1197
  1198	LDP	0*16(a_ptr), (acc0, acc1)
  1199	LDP	1*16(a_ptr), (acc2, acc3)
  1200	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1201	CSEL	EQ, acc0, x0, x0
  1202	CSEL	EQ, acc1, x1, x1
  1203	CSEL	EQ, acc2, x2, x2
  1204	CSEL	EQ, acc3, x3, x3
  1205	LDP	0*16(b_ptr), (acc0, acc1)
  1206	LDP	1*16(b_ptr), (acc2, acc3)
  1207	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1208	CSEL	EQ, acc0, x0, x0
  1209	CSEL	EQ, acc1, x1, x1
  1210	CSEL	EQ, acc2, x2, x2
  1211	CSEL	EQ, acc3, x3, x3
  1212	MOVD	res+0(FP), t0
  1213	STP	(x0, x1), 0*16(t0)
  1214	STP	(x2, x3), 1*16(t0)
  1215
  1216	LDP	h(0*8), (y0, y1)
  1217	LDP	h(2*8), (y2, y3)
  1218	CALL	p256SubInternal<>(SB)
  1219
  1220	LDP	r(0*8), (y0, y1)
  1221	LDP	r(2*8), (y2, y3)
  1222	CALL	p256MulInternal<>(SB)
  1223
  1224	LDP	s2(0*8), (x0, x1)
  1225	LDP	s2(2*8), (x2, x3)
  1226	CALL	p256SubInternal<>(SB)
  1227	LDP	2*16(a_ptr), (acc0, acc1)
  1228	LDP	3*16(a_ptr), (acc2, acc3)
  1229	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1230	CSEL	EQ, acc0, x0, x0
  1231	CSEL	EQ, acc1, x1, x1
  1232	CSEL	EQ, acc2, x2, x2
  1233	CSEL	EQ, acc3, x3, x3
  1234	LDP	y2in(0*8), (acc0, acc1)
  1235	LDP	y2in(2*8), (acc2, acc3)
  1236	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1237	CSEL	EQ, acc0, x0, x0
  1238	CSEL	EQ, acc1, x1, x1
  1239	CSEL	EQ, acc2, x2, x2
  1240	CSEL	EQ, acc3, x3, x3
  1241	MOVD	res+0(FP), t0
  1242	STP	(x0, x1), 2*16(t0)
  1243	STP	(x2, x3), 3*16(t0)
  1244
  1245	RET
  1246
  // p256AddInline: (x3:x2:x1:x0) = (x3:x2:x1:x0) + (y3:y2:y1:y0), followed by
  // one conditional subtraction of the P-256 prime p.
  //
  // The raw 256-bit sum is formed in x with its carry-out in hlp0. Then
  // t = sum - p is computed, where p's limbs (low to high) are
  // -1, const0, 0, const1, and the borrow is propagated through hlp0.
  // If the subtraction borrows (carry clear, CC) the unreduced sum in x is
  // kept; otherwise t is selected. Selection is done with CSEL, not branches.
  //
  // Requires const0/const1 to hold p256const0/p256const1.
  // Clobbers t0-t3, hlp0 and the condition flags; y0-y3 are left unchanged.
  1247#define p256AddInline          \
  1248	ADDS	y0, x0, x0;    \
  1249	ADCS	y1, x1, x1;    \
  1250	ADCS	y2, x2, x2;    \
  1251	ADCS	y3, x3, x3;    \
  1252	ADC	$0, ZR, hlp0;  \
  1253	SUBS	$-1, x0, t0;   \
  1254	SBCS	const0, x1, t1;\
  1255	SBCS	$0, x2, t2;    \
  1256	SBCS	const1, x3, t3;\
  1257	SBCS	$0, hlp0, hlp0;\
  1258	CSEL	CC, x0, t0, x0;\
  1259	CSEL	CC, x1, t1, x1;\
  1260	CSEL	CC, x2, t2, x2;\
  1261	CSEL	CC, x3, t3, x3;
  1262
  // Scratch slots in the stack frame of p256PointDoubleAsm (below).
  // Each slot is 32 bytes (four 64-bit limbs); slot k starts at 32*k + 8
  // bytes from RSP, and `off` is a byte offset inside the slot.
  // Four slots plus the leading 8 bytes add up to the 136-byte frame
  // declared on that function's TEXT line.
  1263#define s(off)	(32*0 + 8 + off)(RSP)
  1264#define m(off)	(32*1 + 8 + off)(RSP)
  1265#define zsqr(off) (32*2 + 8 + off)(RSP)
  1266#define tmp(off)  (32*3 + 8 + off)(RSP)
  1267
  1268//func p256PointDoubleAsm(res, in *P256Point)
  //
  // Doubles the point at `in` and writes the result through `res`.
  // Frame: 136 bytes of locals (slots s, m, zsqr, tmp), 16 bytes of arguments.
  //
  // Register conventions relied on below (inferred from how results are
  // stored here and from comments at other call sites in this file; the
  // helpers themselves are defined outside this section - confirm there):
  //   p256SqrInternal:  y = x^2
  //   p256MulInternal:  y = x * y
  //   p256SubInternal:  x = y - x
  //   p256MulBy2Inline: x = 2 * y
  //   p256AddInline:    x = x + y
  // LDx/LDy/STx/STy and x1in/y1in/z1in/x3out/y3out/z3out are macros defined
  // outside this section; presumably they load/store a 4-limb value into the
  // x or y register group from/to the named location.
  //
  // With (x1, y1, z1) the input coordinates, the sequence computes:
  //   z3 = 2*y1*z1
  //   m  = 3*(x1 - z1^2)*(x1 + z1^2)
  //   s  = 4*x1*y1^2
  //   x3 = m^2 - 2*s
  //   y3 = m*(s - x3) - 8*y1^4
  1269TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1270	MOVD	res+0(FP), res_ptr
  1271	MOVD	in+8(FP), a_ptr
  1272
  1273	MOVD	p256const0<>(SB), const0
  1274	MOVD	p256const1<>(SB), const1
  1275
  1276	// Begin point double
  1277	LDP	4*16(a_ptr), (x0, x1)    // x = limbs at byte offset 64 of in (presumably z1)
  1278	LDP	5*16(a_ptr), (x2, x3)
  1279	CALL	p256SqrInternal<>(SB)    // y = z1^2
  1280	STP	(y0, y1), zsqr(0*8)
  1281	STP	(y2, y3), zsqr(2*8)
  1282
  1283	LDP	0*16(a_ptr), (x0, x1)    // x = limbs at byte offset 0 of in (presumably x1)
  1284	LDP	1*16(a_ptr), (x2, x3)
  1285	p256AddInline                  // x = x1 + z1^2
  1286	STx(m)
  1287
  1288	LDx(z1in)
  1289	LDy(y1in)
  1290	CALL	p256MulInternal<>(SB)    // y = z1 * y1
  1291	p256MulBy2Inline               // x = 2 * y1 * z1
  1292	STx(z3out)
  1293
  1294	LDy(x1in)
  1295	LDx(zsqr)
  1296	CALL	p256SubInternal<>(SB)    // x = x1 - z1^2
  1297	LDy(m)
  1298	CALL	p256MulInternal<>(SB)    // y = (x1 - z1^2) * (x1 + z1^2)
  1299
  1300	// Multiply by 3
  1301	p256MulBy2Inline               // x = 2 * y
  1302	p256AddInline                  // x = 2 * y + y
  1303	STx(m)
  1304
  1305	LDy(y1in)
  1306	p256MulBy2Inline               // x = 2 * y1
  1307	CALL	p256SqrInternal<>(SB)    // y = 4 * y1^2
  1308	STy(s)
  1309	MOVD	y0, x0
  1310	MOVD	y1, x1
  1311	MOVD	y2, x2
  1312	MOVD	y3, x3
  1313	CALL	p256SqrInternal<>(SB)    // y = 16 * y1^4
  1314
  1315	// Divide by 2
  1316	ADDS	$-1, y0, t0            // t = y + p (limbs of p: -1, const0, 0, const1)
  1317	ADCS	const0, y1, t1
  1318	ADCS	$0, y2, t2
  1319	ADCS	const1, y3, t3
  1320	ADC	$0, ZR, hlp0           // hlp0 = carry out of the 256-bit addition
  1321
  1322	ANDS	$1, y0, ZR             // test the low bit of y
  1323	CSEL	EQ, y0, t0, t0         // y even: t = y; y odd: t = y + p (which is even)
  1324	CSEL	EQ, y1, t1, t1
  1325	CSEL	EQ, y2, t2, t2
  1326	CSEL	EQ, y3, t3, t3
  1327	AND	y0, hlp0, hlp0         // hlp0 is 0 or 1, so this keeps the carry only when y was odd
  1328
  1329	EXTR	$1, t0, t1, y0         // y = (hlp0:t3:t2:t1:t0) >> 1
  1330	EXTR	$1, t1, t2, y1
  1331	EXTR	$1, t2, t3, y2
  1332	EXTR	$1, t3, hlp0, y3
  1333	STy(y3out)                     // park 8 * y1^4 in the y3 output slot; re-read below
  1334
  1335	LDx(x1in)
  1336	LDy(s)
  1337	CALL	p256MulInternal<>(SB)    // y = x1 * 4 * y1^2
  1338	STy(s)
  1339	p256MulBy2Inline               // x = 2 * s
  1340	STx(tmp)
  1341
  1342	LDx(m)
  1343	CALL	p256SqrInternal<>(SB)    // y = m^2
  1344	LDx(tmp)
  1345	CALL	p256SubInternal<>(SB)    // x = m^2 - 2 * s
  1346
  1347	STx(x3out)
  1348
  1349	LDy(s)
  1350	CALL	p256SubInternal<>(SB)    // x = s - x3
  1351
  1352	LDy(m)
  1353	CALL	p256MulInternal<>(SB)    // y = m * (s - x3)
  1354
  1355	LDx(y3out)                     // x = 8 * y1^4 stored earlier
  1356	CALL	p256SubInternal<>(SB)    // x = m * (s - x3) - 8 * y1^4
  1357	STx(y3out)
  1358	RET
  1359/* ---------------------------------------*/
  // Re-point the second-input Y macro and the output macros at b_ptr for
  // p256PointAddAsm below. Offsets 0, 32 and 64 from b_ptr address three
  // consecutive 32-byte fields.
  //
  // In p256PointAddAsm b_ptr changes meaning part-way through: it first
  // holds in2 (so y2in reads the second input) and is later reloaded with
  // res before any of x3out/y3out/z3out is used.
  1360#undef y2in
  1361#undef x3out
  1362#undef y3out
  1363#undef z3out
  1364#define y2in(off) (off + 32)(b_ptr)
  1365#define x3out(off) (off)(b_ptr)
  1366#define y3out(off) (off + 32)(b_ptr)
  1367#define z3out(off) (off + 64)(b_ptr)
  1368// func p256PointAddAsm(res, in1, in2 *P256Point) int
  //
  // Adds the points at in1 and in2 and writes the sum through res.
  // Frame: 392 bytes of locals, 32 bytes of arguments and result.
  //
  // Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p, else 0.
  // Each is compared against both 0 and p, since the value may be left
  // unreduced. NOTE(review): presumably a result of 1 tells the caller the
  // inputs were the same point and doubling must be used instead - confirm
  // against the Go caller.
  //
  // The flag is accumulated in hlp1, which is the same register as res_ptr
  // (R0), and must survive every CALL below; res itself is reloaded from
  // the frame into b_ptr when the outputs are written.
  //
  // Helper register conventions, as implied by the step comments:
  //   p256SqrInternal: y = x^2      p256MulInternal:  y = x * y
  //   p256SubInternal: x = y - x    p256MulBy2Inline: x = 2 * y
  1369TEXT ·p256PointAddAsm(SB),0,$392-32
  1370	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1371	// Load the input pointers. NOTE(review): this comment used to say the inputs are moved to the stack; no such copy is visible here (y2in, for one, is read through b_ptr).
  1372	MOVD	in1+8(FP), a_ptr
  1373	MOVD	in2+16(FP), b_ptr
  1374
  1375	MOVD	p256const0<>(SB), const0
  1376	MOVD	p256const1<>(SB), const1
  1377
  1378	// Begin point add
  1379	LDx(z2in)
  1380	CALL	p256SqrInternal<>(SB)    // z2^2
  1381	STy(z2sqr)
  1382
  1383	CALL	p256MulInternal<>(SB)    // z2^3
  1384
  1385	LDx(y1in)
  1386	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1387	STy(s1)
  1388
  1389	LDx(z1in)
  1390	CALL	p256SqrInternal<>(SB)    // z1^2
  1391	STy(z1sqr)
  1392
  1393	CALL	p256MulInternal<>(SB)    // z1^3
  1394
  1395	LDx(y2in)
  1396	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2
  1397
  1398	LDx(s1)
  1399	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1400	STx(r)
  1401
  1402	MOVD	$1, t2
  1403	ORR	x0, x1, t0             // Check if zero mod p256
  1404	ORR	x2, x3, t1
  1405	ORR	t1, t0, t0
  1406	CMP	$0, t0
  1407	CSEL	EQ, t2, ZR, hlp1       // hlp1 = 1 if all limbs of r are zero, else 0
  1408
  1409	EOR	$-1, x0, t0            // Compare r against p limb by limb (-1, const0, 0, const1)
  1410	EOR	const0, x1, t1
  1411	EOR	const1, x3, t3
  1412
  1413	ORR	t0, t1, t0
  1414	ORR	x2, t3, t1             // x2 is used directly because that limb of p is 0
  1415	ORR	t1, t0, t0
  1416	CMP	$0, t0
  1417	CSEL	EQ, t2, hlp1, hlp1     // hlp1 = 1 if r == p as well
  1418
  1419	LDx(z2sqr)
  1420	LDy(x1in)
  1421	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1422	STy(u1)
  1423
  1424	LDx(z1sqr)
  1425	LDy(x2in)
  1426	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1427	STy(u2)
  1428
  1429	LDx(u1)
  1430	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1431	STx(h)
  1432
  1433	MOVD	$1, t2
  1434	ORR	x0, x1, t0             // Check if zero mod p256
  1435	ORR	x2, x3, t1
  1436	ORR	t1, t0, t0
  1437	CMP	$0, t0
  1438	CSEL	EQ, t2, ZR, hlp0       // hlp0 = 1 if all limbs of h are zero, else 0
  1439
  1440	EOR	$-1, x0, t0            // Compare h against p, same as for r above
  1441	EOR	const0, x1, t1
  1442	EOR	const1, x3, t3
  1443
  1444	ORR	t0, t1, t0
  1445	ORR	x2, t3, t1
  1446	ORR	t1, t0, t0
  1447	CMP	$0, t0
  1448	CSEL	EQ, t2, hlp0, hlp0     // hlp0 = 1 if h == p as well
  1449
  1450	AND	hlp0, hlp1, hlp1       // hlp1 = (r == 0 mod p) AND (h == 0 mod p)
  1451
  1452	LDx(r)
  1453	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1454	STy(rsqr)
  1455
  1456	LDx(h)
  1457	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1458	STy(hsqr)
  1459
  1460	LDx(h)
  1461	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1462	STy(hcub)
  1463
  1464	LDx(s1)
  1465	CALL	p256MulInternal<>(SB)    // s1 * h^3
  1466	STy(s2)                        // slot s2 now holds s1 * h^3
  1467
  1468	LDx(z1in)
  1469	LDy(z2in)
  1470	CALL	p256MulInternal<>(SB)    // z1 * z2
  1471	LDx(h)
  1472	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1473	MOVD	res+0(FP), b_ptr       // from here on b_ptr is res; in2 is no longer addressable through it
  1474	STy(z3out)
  1475
  1476	LDx(hsqr)
  1477	LDy(u1)
  1478	CALL	p256MulInternal<>(SB)    // h^2 * u1
  1479	STy(u2)                        // slot u2 now holds u1 * h^2
  1480
  1481	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1482	LDy(rsqr)
  1483	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1484
  1485	MOVD	x0, y0
  1486	MOVD	x1, y1
  1487	MOVD	x2, y2
  1488	MOVD	x3, y3
  1489	LDx(hcub)
  1490	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1491	STx(x3out)
  1492
  1493	LDy(u2)
  1494	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
  1495
  1496	LDy(r)
  1497	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1498
  1499	LDx(s2)
  1500	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1501	STx(y3out)
  1502
  1503	MOVD	hlp1, R0               // hlp1 is res_ptr, i.e. R0, so this is a self-move under the current register map
  1504	MOVD	R0, ret+24(FP)         // return the equality flag
  1505
  1506	RET

View as plain text