...

Text file src/crypto/internal/nistec/p256_asm_amd64.s

Documentation: crypto/internal/nistec

     1// Copyright 2015 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7// This file contains constant-time, 64-bit assembly implementation of
     8// P256. The optimizations performed here are described in detail in:
     9// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10//                          256-bit primes"
    11// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12// https://eprint.iacr.org/2013/816.pdf
    13
    14#include "textflag.h"
    15
      // Register aliases for the exported routines below. They are #undef'd and
      // redefined further down for the internal helpers.
      // x_ptr and y_ptr hold operand pointers on entry, but several routines
      // reuse them as scratch limbs once the operands have been loaded.
    16#define res_ptr DI
    17#define x_ptr SI
    18#define y_ptr CX
    19
      // Accumulator limbs (acc0..acc5) and temporaries (t0, t1).
    20#define acc0 R8
    21#define acc1 R9
    22#define acc2 R10
    23#define acc3 R11
    24#define acc4 R12
    25#define acc5 R13
    26#define t0 R14
    27#define t1 R15
    28
      // Constants, stored least-significant 64-bit limb first.
      // p256NegCond loads the field prime ("poly") as the limbs
      // -1, p256const0, 0, p256const1, so only the two non-trivial limbs
      // need a symbol.
    29DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    30DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
      // Per-limb multiplier for the p256Ord* reduction steps: the low
      // accumulator limb times p256ordK0 gives the multiple of p256ord that is
      // added to the accumulator in that step.
    31DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
      // Modulus used by p256OrdMul and p256OrdSqr.
    32DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    33DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    34DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    35DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
      // Numerically 2^256 minus the field prime above. Not referenced in this
      // part of the file; presumably the Montgomery form of 1 - confirm
      // against the code that uses it.
    36DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    37DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    38DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    39DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
      // GLOBL operands: symbol, flags (8 is RODATA in textflag.h), size in bytes.
    40GLOBL p256const0<>(SB), 8, $8
    41GLOBL p256const1<>(SB), 8, $8
    42GLOBL p256ordK0<>(SB), 8, $8
    43GLOBL p256ord<>(SB), 8, $32
    44GLOBL p256one<>(SB), 8, $32
    45
    46/* ---------------------------------------*/
    47// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
      // Tail-jumps to p256BigToLittle, which writes the 32 input bytes to res in
      // reversed order; the same reversal converts in either direction. No
      // argument shuffling is needed because both routines read res and in from
      // the same frame offsets.
    48TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    49	JMP ·p256BigToLittle(SB)
    50/* ---------------------------------------*/
    51// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
      // Tail-jumps to p256BigToLittle, which writes the 32 input bytes to res in
      // reversed order. Both routines read res and in from the same frame
      // offsets, so the arguments are used as passed.
    52TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    53	JMP ·p256BigToLittle(SB)
    54/* ---------------------------------------*/
    55// func p256LittleToBig(res *[32]byte, in *p256Element)
      // Tail-jumps to p256BigToLittle, which writes the 32 input bytes to res in
      // reversed order; the same reversal converts in either direction. Both
      // routines read res and in from the same frame offsets.
    56TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    57	JMP ·p256BigToLittle(SB)
    58/* ---------------------------------------*/
    59// func p256BigToLittle(res *p256Element, in *[32]byte)
      // Copies the 32 bytes at in to res in reversed byte order: each 8-byte word
      // is byte-swapped and the four words are stored in reverse order.
      // All four words are loaded before the first store.
    60TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    61	MOVQ res+0(FP), res_ptr
    62	MOVQ in+8(FP), x_ptr
    63
    64	MOVQ (8*0)(x_ptr), acc0
    65	MOVQ (8*1)(x_ptr), acc1
    66	MOVQ (8*2)(x_ptr), acc2
    67	MOVQ (8*3)(x_ptr), acc3
    68
      	// Reverse the bytes inside each word.
    69	BSWAPQ acc0
    70	BSWAPQ acc1
    71	BSWAPQ acc2
    72	BSWAPQ acc3
    73
      	// Reverse the word order: input word i becomes output word 3-i.
    74	MOVQ acc3, (8*0)(res_ptr)
    75	MOVQ acc2, (8*1)(res_ptr)
    76	MOVQ acc1, (8*2)(res_ptr)
    77	MOVQ acc0, (8*3)(res_ptr)
    78
    79	RET
    80/* ---------------------------------------*/
    81// func p256MovCond(res, a, b *P256Point, cond int)
      // Branch-free select over 96 bytes (six 16-byte lanes): res receives b when
      // cond is zero and a otherwise. Both inputs are always read in full.
      // Only the low 32 bits of cond take part in the zero test, because
      // PSHUFD $0 broadcasts the lowest doubleword before the compare.
    82TEXT ·p256MovCond(SB),NOSPLIT,$0
    83	MOVQ res+0(FP), res_ptr
    84	MOVQ a+8(FP), x_ptr
    85	MOVQ b+16(FP), y_ptr
    86	MOVQ cond+24(FP), X12
    87
      	// X12 = all ones if cond == 0, all zeros otherwise.
    88	PXOR X13, X13
    89	PSHUFD $0, X12, X12
    90	PCMPEQL X13, X12
    91
      	// X0..X5 = a AND NOT mask (PANDN complements its destination operand).
    92	MOVOU X12, X0
    93	MOVOU (16*0)(x_ptr), X6
    94	PANDN X6, X0
    95	MOVOU X12, X1
    96	MOVOU (16*1)(x_ptr), X7
    97	PANDN X7, X1
    98	MOVOU X12, X2
    99	MOVOU (16*2)(x_ptr), X8
   100	PANDN X8, X2
   101	MOVOU X12, X3
   102	MOVOU (16*3)(x_ptr), X9
   103	PANDN X9, X3
   104	MOVOU X12, X4
   105	MOVOU (16*4)(x_ptr), X10
   106	PANDN X10, X4
   107	MOVOU X12, X5
   108	MOVOU (16*5)(x_ptr), X11
   109	PANDN X11, X5
   110
   111	MOVOU (16*0)(y_ptr), X6
   112	MOVOU (16*1)(y_ptr), X7
   113	MOVOU (16*2)(y_ptr), X8
   114	MOVOU (16*3)(y_ptr), X9
   115	MOVOU (16*4)(y_ptr), X10
   116	MOVOU (16*5)(y_ptr), X11
   117
      	// X6..X11 = b AND mask.
   118	PAND X12, X6
   119	PAND X12, X7
   120	PAND X12, X8
   121	PAND X12, X9
   122	PAND X12, X10
   123	PAND X12, X11
   124
      	// Exactly one of the two masked values is non-zero, so XOR merges them.
   125	PXOR X6, X0
   126	PXOR X7, X1
   127	PXOR X8, X2
   128	PXOR X9, X3
   129	PXOR X10, X4
   130	PXOR X11, X5
   131
   132	MOVOU X0, (16*0)(res_ptr)
   133	MOVOU X1, (16*1)(res_ptr)
   134	MOVOU X2, (16*2)(res_ptr)
   135	MOVOU X3, (16*3)(res_ptr)
   136	MOVOU X4, (16*4)(res_ptr)
   137	MOVOU X5, (16*5)(res_ptr)
   138
   139	RET
   140/* ---------------------------------------*/
   141// func p256NegCond(val *p256Element, cond int)
      // Replaces *val with (prime - *val) when cond is non-zero and leaves it
      // unchanged when cond is zero. The subtraction is always performed and the
      // result is chosen with conditional moves, so there is no branch on cond.
      // x_ptr and y_ptr are used here as plain scratch registers for limbs 1 and 2.
   142TEXT ·p256NegCond(SB),NOSPLIT,$0
   143	MOVQ val+0(FP), res_ptr
   144	MOVQ cond+8(FP), t0
   145	// acc = poly
   146	MOVQ $-1, acc0
   147	MOVQ p256const0<>(SB), acc1
   148	MOVQ $0, acc2
   149	MOVQ p256const1<>(SB), acc3
   150	// Load the original value
   151	MOVQ (8*0)(res_ptr), acc5
   152	MOVQ (8*1)(res_ptr), x_ptr
   153	MOVQ (8*2)(res_ptr), y_ptr
   154	MOVQ (8*3)(res_ptr), t1
   155	// Speculatively subtract
   156	SUBQ acc5, acc0
   157	SBBQ x_ptr, acc1
   158	SBBQ y_ptr, acc2
   159	SBBQ t1, acc3
   160	// If condition is 0, keep original value
   161	TESTQ t0, t0
   162	CMOVQEQ acc5, acc0
   163	CMOVQEQ x_ptr, acc1
   164	CMOVQEQ y_ptr, acc2
   165	CMOVQEQ t1, acc3
   166	// Store result
   167	MOVQ acc0, (8*0)(res_ptr)
   168	MOVQ acc1, (8*1)(res_ptr)
   169	MOVQ acc2, (8*2)(res_ptr)
   170	MOVQ acc3, (8*3)(res_ptr)
   171
   172	RET
   173/* ---------------------------------------*/
   174// func p256Sqr(res, in *p256Element, n int)
      // Squares in modulo the field prime n times in a row, with a four-step
      // word-wise reduction after each squaring, and stores the result in res.
      // After the first pass the input pointer is switched to res, so every
      // further pass squares the previous result.
      // The counter is tested after the body, so the body always runs at least
      // once; n is expected to be >= 1.
      // x_ptr and y_ptr are reused as scratch limbs inside the loop.
   175TEXT ·p256Sqr(SB),NOSPLIT,$0
   176	MOVQ res+0(FP), res_ptr
   177	MOVQ in+8(FP), x_ptr
   178	MOVQ n+16(FP), BX
   179
   180sqrLoop:
   181
   182	// x[1:] * x[0]  (x = current input; off-diagonal products only)
   183	MOVQ (8*0)(x_ptr), t0
   184
   185	MOVQ (8*1)(x_ptr), AX
   186	MULQ t0
   187	MOVQ AX, acc1
   188	MOVQ DX, acc2
   189
   190	MOVQ (8*2)(x_ptr), AX
   191	MULQ t0
   192	ADDQ AX, acc2
   193	ADCQ $0, DX
   194	MOVQ DX, acc3
   195
   196	MOVQ (8*3)(x_ptr), AX
   197	MULQ t0
   198	ADDQ AX, acc3
   199	ADCQ $0, DX
   200	MOVQ DX, acc4
   201	// x[2:] * x[1]
   202	MOVQ (8*1)(x_ptr), t0
   203
   204	MOVQ (8*2)(x_ptr), AX
   205	MULQ t0
   206	ADDQ AX, acc3
   207	ADCQ $0, DX
   208	MOVQ DX, t1
   209
   210	MOVQ (8*3)(x_ptr), AX
   211	MULQ t0
   212	ADDQ t1, acc4
   213	ADCQ $0, DX
   214	ADDQ AX, acc4
   215	ADCQ $0, DX
   216	MOVQ DX, acc5
   217	// x[3] * x[2]
   218	MOVQ (8*2)(x_ptr), t0
   219
   220	MOVQ (8*3)(x_ptr), AX
   221	MULQ t0
   222	ADDQ AX, acc5
   223	ADCQ $0, DX
   224	MOVQ DX, y_ptr
   225	XORQ t1, t1
   226	// *2  (each off-diagonal product occurs twice in the square)
   227	ADDQ acc1, acc1
   228	ADCQ acc2, acc2
   229	ADCQ acc3, acc3
   230	ADCQ acc4, acc4
   231	ADCQ acc5, acc5
   232	ADCQ y_ptr, y_ptr
   233	ADCQ $0, t1
   234	// Missing products: the diagonal terms x[i]*x[i]
   235	MOVQ (8*0)(x_ptr), AX
   236	MULQ AX
   237	MOVQ AX, acc0
   238	MOVQ DX, t0
   239
   240	MOVQ (8*1)(x_ptr), AX
   241	MULQ AX
   242	ADDQ t0, acc1
   243	ADCQ AX, acc2
   244	ADCQ $0, DX
   245	MOVQ DX, t0
   246
   247	MOVQ (8*2)(x_ptr), AX
   248	MULQ AX
   249	ADDQ t0, acc3
   250	ADCQ AX, acc4
   251	ADCQ $0, DX
   252	MOVQ DX, t0
   253
   254	MOVQ (8*3)(x_ptr), AX
   255	MULQ AX
   256	ADDQ t0, acc5
   257	ADCQ AX, y_ptr
   258	ADCQ DX, t1
      	// The input has been fully consumed; x_ptr now holds the top limb of the
      	// 512-bit square (limbs: acc0..acc5, y_ptr, x_ptr).
   259	MOVQ t1, x_ptr
   260	// First reduction step
      	// Adds acc0 * prime so that the lowest limb cancels: acc0<<32 and acc0>>32
      	// cover the 2^96 part of the product, acc0*p256const1 the top limb of the
      	// prime. The freed register receives the new top limb.
   261	MOVQ acc0, AX
   262	MOVQ acc0, t1
   263	SHLQ $32, acc0
   264	MULQ p256const1<>(SB)
   265	SHRQ $32, t1
   266	ADDQ acc0, acc1
   267	ADCQ t1, acc2
   268	ADCQ AX, acc3
   269	ADCQ $0, DX
   270	MOVQ DX, acc0
   271	// Second reduction step
   272	MOVQ acc1, AX
   273	MOVQ acc1, t1
   274	SHLQ $32, acc1
   275	MULQ p256const1<>(SB)
   276	SHRQ $32, t1
   277	ADDQ acc1, acc2
   278	ADCQ t1, acc3
   279	ADCQ AX, acc0
   280	ADCQ $0, DX
   281	MOVQ DX, acc1
   282	// Third reduction step
   283	MOVQ acc2, AX
   284	MOVQ acc2, t1
   285	SHLQ $32, acc2
   286	MULQ p256const1<>(SB)
   287	SHRQ $32, t1
   288	ADDQ acc2, acc3
   289	ADCQ t1, acc0
   290	ADCQ AX, acc1
   291	ADCQ $0, DX
   292	MOVQ DX, acc2
   293	// Last reduction step
   294	XORQ t0, t0
   295	MOVQ acc3, AX
   296	MOVQ acc3, t1
   297	SHLQ $32, acc3
   298	MULQ p256const1<>(SB)
   299	SHRQ $32, t1
   300	ADDQ acc3, acc0
   301	ADCQ t1, acc1
   302	ADCQ AX, acc2
   303	ADCQ $0, DX
   304	MOVQ DX, acc3
   305	// Add bits [511:256] of the sqr result
      	// The chain starts with ADCQ and so consumes the carry flag left by the
      	// preceding ADCQ $0, DX; MOVQ does not modify flags.
   306	ADCQ acc4, acc0
   307	ADCQ acc5, acc1
   308	ADCQ y_ptr, acc2
   309	ADCQ x_ptr, acc3
   310	ADCQ $0, t0
   311
      	// Keep a copy of the unreduced value for the conditional select below.
   312	MOVQ acc0, acc4
   313	MOVQ acc1, acc5
   314	MOVQ acc2, y_ptr
   315	MOVQ acc3, t1
   316	// Subtract p256
   317	SUBQ $-1, acc0
   318	SBBQ p256const0<>(SB) ,acc1
   319	SBBQ $0, acc2
   320	SBBQ p256const1<>(SB), acc3
   321	SBBQ $0, t0
   322
      	// A final borrow means the value was already below the prime: restore it.
   323	CMOVQCS acc4, acc0
   324	CMOVQCS acc5, acc1
   325	CMOVQCS y_ptr, acc2
   326	CMOVQCS t1, acc3
   327
   328	MOVQ acc0, (8*0)(res_ptr)
   329	MOVQ acc1, (8*1)(res_ptr)
   330	MOVQ acc2, (8*2)(res_ptr)
   331	MOVQ acc3, (8*3)(res_ptr)
      	// The next pass squares the value just stored.
   332	MOVQ res_ptr, x_ptr
   333	DECQ BX
   334	JNE  sqrLoop
   335
   336	RET
   337/* ---------------------------------------*/
   338// func p256Mul(res, in1, in2 *p256Element)
      // Multiplies in1 by in2 modulo the field prime with a word-wise reduction
      // interleaved between the partial products: after each in1 * in2[i] row is
      // accumulated, one reduction step clears the lowest live limb. A final
      // conditional subtraction of the prime brings the result into range.
      // The accumulator registers rotate roles: the limb cleared by a reduction
      // step is zeroed and reused as the new top limb.
   339TEXT ·p256Mul(SB),NOSPLIT,$0
   340	MOVQ res+0(FP), res_ptr
   341	MOVQ in1+8(FP), x_ptr
   342	MOVQ in2+16(FP), y_ptr
   343	// x * y[0]
   344	MOVQ (8*0)(y_ptr), t0
   345
   346	MOVQ (8*0)(x_ptr), AX
   347	MULQ t0
   348	MOVQ AX, acc0
   349	MOVQ DX, acc1
   350
   351	MOVQ (8*1)(x_ptr), AX
   352	MULQ t0
   353	ADDQ AX, acc1
   354	ADCQ $0, DX
   355	MOVQ DX, acc2
   356
   357	MOVQ (8*2)(x_ptr), AX
   358	MULQ t0
   359	ADDQ AX, acc2
   360	ADCQ $0, DX
   361	MOVQ DX, acc3
   362
   363	MOVQ (8*3)(x_ptr), AX
   364	MULQ t0
   365	ADDQ AX, acc3
   366	ADCQ $0, DX
   367	MOVQ DX, acc4
   368	XORQ acc5, acc5
   369	// First reduction step
      	// Adds acc0 * prime so that the lowest limb cancels: acc0<<32 and acc0>>32
      	// cover the 2^96 part of the product, acc0*p256const1 the top limb of the
      	// prime.
   370	MOVQ acc0, AX
   371	MOVQ acc0, t1
   372	SHLQ $32, acc0
   373	MULQ p256const1<>(SB)
   374	SHRQ $32, t1
   375	ADDQ acc0, acc1
   376	ADCQ t1, acc2
   377	ADCQ AX, acc3
   378	ADCQ DX, acc4
   379	ADCQ $0, acc5
   380	XORQ acc0, acc0
   381	// x * y[1]
   382	MOVQ (8*1)(y_ptr), t0
   383
   384	MOVQ (8*0)(x_ptr), AX
   385	MULQ t0
   386	ADDQ AX, acc1
   387	ADCQ $0, DX
   388	MOVQ DX, t1
   389
   390	MOVQ (8*1)(x_ptr), AX
   391	MULQ t0
   392	ADDQ t1, acc2
   393	ADCQ $0, DX
   394	ADDQ AX, acc2
   395	ADCQ $0, DX
   396	MOVQ DX, t1
   397
   398	MOVQ (8*2)(x_ptr), AX
   399	MULQ t0
   400	ADDQ t1, acc3
   401	ADCQ $0, DX
   402	ADDQ AX, acc3
   403	ADCQ $0, DX
   404	MOVQ DX, t1
   405
   406	MOVQ (8*3)(x_ptr), AX
   407	MULQ t0
   408	ADDQ t1, acc4
   409	ADCQ $0, DX
   410	ADDQ AX, acc4
   411	ADCQ DX, acc5
   412	ADCQ $0, acc0
   413	// Second reduction step
   414	MOVQ acc1, AX
   415	MOVQ acc1, t1
   416	SHLQ $32, acc1
   417	MULQ p256const1<>(SB)
   418	SHRQ $32, t1
   419	ADDQ acc1, acc2
   420	ADCQ t1, acc3
   421	ADCQ AX, acc4
   422	ADCQ DX, acc5
   423	ADCQ $0, acc0
   424	XORQ acc1, acc1
   425	// x * y[2]
   426	MOVQ (8*2)(y_ptr), t0
   427
   428	MOVQ (8*0)(x_ptr), AX
   429	MULQ t0
   430	ADDQ AX, acc2
   431	ADCQ $0, DX
   432	MOVQ DX, t1
   433
   434	MOVQ (8*1)(x_ptr), AX
   435	MULQ t0
   436	ADDQ t1, acc3
   437	ADCQ $0, DX
   438	ADDQ AX, acc3
   439	ADCQ $0, DX
   440	MOVQ DX, t1
   441
   442	MOVQ (8*2)(x_ptr), AX
   443	MULQ t0
   444	ADDQ t1, acc4
   445	ADCQ $0, DX
   446	ADDQ AX, acc4
   447	ADCQ $0, DX
   448	MOVQ DX, t1
   449
   450	MOVQ (8*3)(x_ptr), AX
   451	MULQ t0
   452	ADDQ t1, acc5
   453	ADCQ $0, DX
   454	ADDQ AX, acc5
   455	ADCQ DX, acc0
   456	ADCQ $0, acc1
   457	// Third reduction step
   458	MOVQ acc2, AX
   459	MOVQ acc2, t1
   460	SHLQ $32, acc2
   461	MULQ p256const1<>(SB)
   462	SHRQ $32, t1
   463	ADDQ acc2, acc3
   464	ADCQ t1, acc4
   465	ADCQ AX, acc5
   466	ADCQ DX, acc0
   467	ADCQ $0, acc1
   468	XORQ acc2, acc2
   469	// x * y[3]
   470	MOVQ (8*3)(y_ptr), t0
   471
   472	MOVQ (8*0)(x_ptr), AX
   473	MULQ t0
   474	ADDQ AX, acc3
   475	ADCQ $0, DX
   476	MOVQ DX, t1
   477
   478	MOVQ (8*1)(x_ptr), AX
   479	MULQ t0
   480	ADDQ t1, acc4
   481	ADCQ $0, DX
   482	ADDQ AX, acc4
   483	ADCQ $0, DX
   484	MOVQ DX, t1
   485
   486	MOVQ (8*2)(x_ptr), AX
   487	MULQ t0
   488	ADDQ t1, acc5
   489	ADCQ $0, DX
   490	ADDQ AX, acc5
   491	ADCQ $0, DX
   492	MOVQ DX, t1
   493
   494	MOVQ (8*3)(x_ptr), AX
   495	MULQ t0
   496	ADDQ t1, acc0
   497	ADCQ $0, DX
   498	ADDQ AX, acc0
   499	ADCQ DX, acc1
   500	ADCQ $0, acc2
   501	// Last reduction step
   502	MOVQ acc3, AX
   503	MOVQ acc3, t1
   504	SHLQ $32, acc3
   505	MULQ p256const1<>(SB)
   506	SHRQ $32, t1
   507	ADDQ acc3, acc4
   508	ADCQ t1, acc5
   509	ADCQ AX, acc0
   510	ADCQ DX, acc1
   511	ADCQ $0, acc2
   512	// Copy result [255:0]
      	// The result lives in acc4, acc5, acc0, acc1 with acc2 as the overflow
      	// limb; x_ptr is reused as scratch because the inputs are no longer needed.
   513	MOVQ acc4, x_ptr
   514	MOVQ acc5, acc3
   515	MOVQ acc0, t0
   516	MOVQ acc1, t1
   517	// Subtract p256
   518	SUBQ $-1, acc4
   519	SBBQ p256const0<>(SB) ,acc5
   520	SBBQ $0, acc0
   521	SBBQ p256const1<>(SB), acc1
   522	SBBQ $0, acc2
   523
      	// A final borrow means the value was already below the prime: restore it.
   524	CMOVQCS x_ptr, acc4
   525	CMOVQCS acc3, acc5
   526	CMOVQCS t0, acc0
   527	CMOVQCS t1, acc1
   528
   529	MOVQ acc4, (8*0)(res_ptr)
   530	MOVQ acc5, (8*1)(res_ptr)
   531	MOVQ acc0, (8*2)(res_ptr)
   532	MOVQ acc1, (8*3)(res_ptr)
   533
   534	RET
   535/* ---------------------------------------*/
   536// func p256FromMont(res, in *p256Element)
      // Runs the four word-wise reduction steps on in without any preceding
      // multiplication, then conditionally subtracts the prime once and stores
      // the result in res. The steps are the same as the reduction steps in
      // p256Mul.
   537TEXT ·p256FromMont(SB),NOSPLIT,$0
   538	MOVQ res+0(FP), res_ptr
   539	MOVQ in+8(FP), x_ptr
   540
   541	MOVQ (8*0)(x_ptr), acc0
   542	MOVQ (8*1)(x_ptr), acc1
   543	MOVQ (8*2)(x_ptr), acc2
   544	MOVQ (8*3)(x_ptr), acc3
   545	XORQ acc4, acc4
   546
   547	// Only reduce, no multiplications are needed
   548	// First stage
      	// Adds acc0 * prime so that the lowest limb cancels; the register that
      	// will serve as the next top limb is zeroed at the end of each stage.
   549	MOVQ acc0, AX
   550	MOVQ acc0, t1
   551	SHLQ $32, acc0
   552	MULQ p256const1<>(SB)
   553	SHRQ $32, t1
   554	ADDQ acc0, acc1
   555	ADCQ t1, acc2
   556	ADCQ AX, acc3
   557	ADCQ DX, acc4
   558	XORQ acc5, acc5
   559	// Second stage
   560	MOVQ acc1, AX
   561	MOVQ acc1, t1
   562	SHLQ $32, acc1
   563	MULQ p256const1<>(SB)
   564	SHRQ $32, t1
   565	ADDQ acc1, acc2
   566	ADCQ t1, acc3
   567	ADCQ AX, acc4
   568	ADCQ DX, acc5
   569	XORQ acc0, acc0
   570	// Third stage
   571	MOVQ acc2, AX
   572	MOVQ acc2, t1
   573	SHLQ $32, acc2
   574	MULQ p256const1<>(SB)
   575	SHRQ $32, t1
   576	ADDQ acc2, acc3
   577	ADCQ t1, acc4
   578	ADCQ AX, acc5
   579	ADCQ DX, acc0
   580	XORQ acc1, acc1
   581	// Last stage
   582	MOVQ acc3, AX
   583	MOVQ acc3, t1
   584	SHLQ $32, acc3
   585	MULQ p256const1<>(SB)
   586	SHRQ $32, t1
   587	ADDQ acc3, acc4
   588	ADCQ t1, acc5
   589	ADCQ AX, acc0
   590	ADCQ DX, acc1
   591
      	// Save the value (limbs acc4, acc5, acc0, acc1) before the trial
      	// subtraction; x_ptr is reused as scratch.
   592	MOVQ acc4, x_ptr
   593	MOVQ acc5, acc3
   594	MOVQ acc0, t0
   595	MOVQ acc1, t1
   596
      	// Trial subtraction of the prime; no overflow limb is tracked here.
   597	SUBQ $-1, acc4
   598	SBBQ p256const0<>(SB), acc5
   599	SBBQ $0, acc0
   600	SBBQ p256const1<>(SB), acc1
   601
      	// A borrow means the value was already below the prime: restore it.
   602	CMOVQCS x_ptr, acc4
   603	CMOVQCS acc3, acc5
   604	CMOVQCS t0, acc0
   605	CMOVQCS t1, acc1
   606
   607	MOVQ acc4, (8*0)(res_ptr)
   608	MOVQ acc5, (8*1)(res_ptr)
   609	MOVQ acc0, (8*2)(res_ptr)
   610	MOVQ acc1, (8*3)(res_ptr)
   611
   612	RET
   613/* ---------------------------------------*/
   614// func p256Select(res *P256Point, table *p256Table, idx int)
      // Copies the idx-th 96-byte entry of table (counting from 1) to res.
      // All 16 entries are read on every call; each is ANDed with a mask that is
      // all ones only for the matching position and XORed into the accumulator,
      // so the sequence of loads does not depend on idx.
      // An idx that matches no position (0, or above 16) leaves res all zero.
      // Only the low 32 bits of idx are compared (MOVL).
   615TEXT ·p256Select(SB),NOSPLIT,$0
   616	MOVQ idx+16(FP),AX
   617	MOVQ table+8(FP),DI
   618	MOVQ res+0(FP),DX
   619
   620	PXOR X15, X15	// X15 = 0
   621	PCMPEQL X14, X14 // X14 = -1
   622	PSUBL X14, X15   // X15 = 1
      	// X14 = idx broadcast to all four 32-bit lanes.
   623	MOVL AX, X14
   624	PSHUFD $0, X14, X14
   625
      	// X0..X5 accumulate the selected entry.
   626	PXOR X0, X0
   627	PXOR X1, X1
   628	PXOR X2, X2
   629	PXOR X3, X3
   630	PXOR X4, X4
   631	PXOR X5, X5
      	// AX is now the loop counter: 16 table entries.
   632	MOVQ $16, AX
   633
      	// X13 = position of the current entry, starting at 1.
   634	MOVOU X15, X13
   635
   636loop_select:
   637
      		// X12 = all ones if the current position equals idx, else zero.
   638		MOVOU X13, X12
   639		PADDL X15, X13
   640		PCMPEQL X14, X12
   641
   642		MOVOU (16*0)(DI), X6
   643		MOVOU (16*1)(DI), X7
   644		MOVOU (16*2)(DI), X8
   645		MOVOU (16*3)(DI), X9
   646		MOVOU (16*4)(DI), X10
   647		MOVOU (16*5)(DI), X11
   648		ADDQ $(16*6), DI
   649
   650		PAND X12, X6
   651		PAND X12, X7
   652		PAND X12, X8
   653		PAND X12, X9
   654		PAND X12, X10
   655		PAND X12, X11
   656
   657		PXOR X6, X0
   658		PXOR X7, X1
   659		PXOR X8, X2
   660		PXOR X9, X3
   661		PXOR X10, X4
   662		PXOR X11, X5
   663
   664		DECQ AX
   665		JNE loop_select
   666
   667	MOVOU X0, (16*0)(DX)
   668	MOVOU X1, (16*1)(DX)
   669	MOVOU X2, (16*2)(DX)
   670	MOVOU X3, (16*3)(DX)
   671	MOVOU X4, (16*4)(DX)
   672	MOVOU X5, (16*5)(DX)
   673
   674	RET
   675/* ---------------------------------------*/
   676// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      // Copies the idx-th 64-byte entry of table (counting from 1) to res.
      // Each of the 16 loop iterations handles two consecutive entries, so 32
      // entries are read on every call; each is masked by a position/idx compare
      // and XORed into the accumulator, so the loads do not depend on idx.
      // An idx that matches no position (0, or above 32) leaves res all zero.
      // Only the low 32 bits of idx are compared (MOVL).
   677TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   678	MOVQ idx+16(FP),AX
   679	MOVQ table+8(FP),DI
   680	MOVQ res+0(FP),DX
   681
   682	PXOR X15, X15	// X15 = 0
   683	PCMPEQL X14, X14 // X14 = -1
   684	PSUBL X14, X15   // X15 = 1
      	// X14 = idx broadcast to all four 32-bit lanes.
   685	MOVL AX, X14
   686	PSHUFD $0, X14, X14
   687
      	// X0..X3 accumulate the selected entry.
   688	PXOR X0, X0
   689	PXOR X1, X1
   690	PXOR X2, X2
   691	PXOR X3, X3
      	// AX is now the loop counter: 16 iterations of two entries each.
   692	MOVQ $16, AX
   693
      	// X13 = position of the current entry, starting at 1.
   694	MOVOU X15, X13
   695
   696loop_select_base:
   697
      		// Mask for the first entry of the pair.
   698		MOVOU X13, X12
   699		PADDL X15, X13
   700		PCMPEQL X14, X12
   701
   702		MOVOU (16*0)(DI), X4
   703		MOVOU (16*1)(DI), X5
   704		MOVOU (16*2)(DI), X6
   705		MOVOU (16*3)(DI), X7
   706
   707		MOVOU (16*4)(DI), X8
   708		MOVOU (16*5)(DI), X9
   709		MOVOU (16*6)(DI), X10
   710		MOVOU (16*7)(DI), X11
   711
   712		ADDQ $(16*8), DI
   713
   714		PAND X12, X4
   715		PAND X12, X5
   716		PAND X12, X6
   717		PAND X12, X7
   718
      		// Mask for the second entry of the pair.
   719		MOVOU X13, X12
   720		PADDL X15, X13
   721		PCMPEQL X14, X12
   722
   723		PAND X12, X8
   724		PAND X12, X9
   725		PAND X12, X10
   726		PAND X12, X11
   727
   728		PXOR X4, X0
   729		PXOR X5, X1
   730		PXOR X6, X2
   731		PXOR X7, X3
   732
   733		PXOR X8, X0
   734		PXOR X9, X1
   735		PXOR X10, X2
   736		PXOR X11, X3
   737
   738		DECQ AX
   739		JNE loop_select_base
   740
   741	MOVOU X0, (16*0)(DX)
   742	MOVOU X1, (16*1)(DX)
   743	MOVOU X2, (16*2)(DX)
   744	MOVOU X3, (16*3)(DX)
   745
   746	RET
   747/* ---------------------------------------*/
   748// func p256OrdMul(res, in1, in2 *p256OrdElement)
      // Multiplies in1 by in2 modulo p256ord with a word-wise reduction
      // interleaved between the partial products. Each reduction step computes
      // t0 = (lowest live limb * p256ordK0) mod 2^64 and adds t0 * p256ord to the
      // accumulator, which clears that limb. A final conditional subtraction of
      // p256ord brings the result into range.
      // The accumulator registers rotate roles: a limb cleared by a reduction
      // step is reused as the new top limb.
   749TEXT ·p256OrdMul(SB),NOSPLIT,$0
   750	MOVQ res+0(FP), res_ptr
   751	MOVQ in1+8(FP), x_ptr
   752	MOVQ in2+16(FP), y_ptr
   753	// x * y[0]
   754	MOVQ (8*0)(y_ptr), t0
   755
   756	MOVQ (8*0)(x_ptr), AX
   757	MULQ t0
   758	MOVQ AX, acc0
   759	MOVQ DX, acc1
   760
   761	MOVQ (8*1)(x_ptr), AX
   762	MULQ t0
   763	ADDQ AX, acc1
   764	ADCQ $0, DX
   765	MOVQ DX, acc2
   766
   767	MOVQ (8*2)(x_ptr), AX
   768	MULQ t0
   769	ADDQ AX, acc2
   770	ADCQ $0, DX
   771	MOVQ DX, acc3
   772
   773	MOVQ (8*3)(x_ptr), AX
   774	MULQ t0
   775	ADDQ AX, acc3
   776	ADCQ $0, DX
   777	MOVQ DX, acc4
   778	XORQ acc5, acc5
   779	// First reduction step
      	// t0 = low 64 bits of acc0 * p256ordK0 (the high half in DX is discarded).
   780	MOVQ acc0, AX
   781	MULQ p256ordK0<>(SB)
   782	MOVQ AX, t0
   783
   784	MOVQ p256ord<>+0x00(SB), AX
   785	MULQ t0
   786	ADDQ AX, acc0
   787	ADCQ $0, DX
   788	MOVQ DX, t1
   789
   790	MOVQ p256ord<>+0x08(SB), AX
   791	MULQ t0
   792	ADDQ t1, acc1
   793	ADCQ $0, DX
   794	ADDQ AX, acc1
   795	ADCQ $0, DX
   796	MOVQ DX, t1
   797
   798	MOVQ p256ord<>+0x10(SB), AX
   799	MULQ t0
   800	ADDQ t1, acc2
   801	ADCQ $0, DX
   802	ADDQ AX, acc2
   803	ADCQ $0, DX
   804	MOVQ DX, t1
   805
   806	MOVQ p256ord<>+0x18(SB), AX
   807	MULQ t0
   808	ADDQ t1, acc3
   809	ADCQ $0, DX
   810	ADDQ AX, acc3
   811	ADCQ DX, acc4
   812	ADCQ $0, acc5
   813	// x * y[1]
      	// acc0 is reused below as the new top limb without being zeroed first;
      	// presumably the reduction step above left it at zero - confirm.
   814	MOVQ (8*1)(y_ptr), t0
   815
   816	MOVQ (8*0)(x_ptr), AX
   817	MULQ t0
   818	ADDQ AX, acc1
   819	ADCQ $0, DX
   820	MOVQ DX, t1
   821
   822	MOVQ (8*1)(x_ptr), AX
   823	MULQ t0
   824	ADDQ t1, acc2
   825	ADCQ $0, DX
   826	ADDQ AX, acc2
   827	ADCQ $0, DX
   828	MOVQ DX, t1
   829
   830	MOVQ (8*2)(x_ptr), AX
   831	MULQ t0
   832	ADDQ t1, acc3
   833	ADCQ $0, DX
   834	ADDQ AX, acc3
   835	ADCQ $0, DX
   836	MOVQ DX, t1
   837
   838	MOVQ (8*3)(x_ptr), AX
   839	MULQ t0
   840	ADDQ t1, acc4
   841	ADCQ $0, DX
   842	ADDQ AX, acc4
   843	ADCQ DX, acc5
   844	ADCQ $0, acc0
   845	// Second reduction step
   846	MOVQ acc1, AX
   847	MULQ p256ordK0<>(SB)
   848	MOVQ AX, t0
   849
   850	MOVQ p256ord<>+0x00(SB), AX
   851	MULQ t0
   852	ADDQ AX, acc1
   853	ADCQ $0, DX
   854	MOVQ DX, t1
   855
   856	MOVQ p256ord<>+0x08(SB), AX
   857	MULQ t0
   858	ADDQ t1, acc2
   859	ADCQ $0, DX
   860	ADDQ AX, acc2
   861	ADCQ $0, DX
   862	MOVQ DX, t1
   863
   864	MOVQ p256ord<>+0x10(SB), AX
   865	MULQ t0
   866	ADDQ t1, acc3
   867	ADCQ $0, DX
   868	ADDQ AX, acc3
   869	ADCQ $0, DX
   870	MOVQ DX, t1
   871
   872	MOVQ p256ord<>+0x18(SB), AX
   873	MULQ t0
   874	ADDQ t1, acc4
   875	ADCQ $0, DX
   876	ADDQ AX, acc4
   877	ADCQ DX, acc5
   878	ADCQ $0, acc0
   879	// x * y[2]
   880	MOVQ (8*2)(y_ptr), t0
   881
   882	MOVQ (8*0)(x_ptr), AX
   883	MULQ t0
   884	ADDQ AX, acc2
   885	ADCQ $0, DX
   886	MOVQ DX, t1
   887
   888	MOVQ (8*1)(x_ptr), AX
   889	MULQ t0
   890	ADDQ t1, acc3
   891	ADCQ $0, DX
   892	ADDQ AX, acc3
   893	ADCQ $0, DX
   894	MOVQ DX, t1
   895
   896	MOVQ (8*2)(x_ptr), AX
   897	MULQ t0
   898	ADDQ t1, acc4
   899	ADCQ $0, DX
   900	ADDQ AX, acc4
   901	ADCQ $0, DX
   902	MOVQ DX, t1
   903
   904	MOVQ (8*3)(x_ptr), AX
   905	MULQ t0
   906	ADDQ t1, acc5
   907	ADCQ $0, DX
   908	ADDQ AX, acc5
   909	ADCQ DX, acc0
   910	ADCQ $0, acc1
   911	// Third reduction step
   912	MOVQ acc2, AX
   913	MULQ p256ordK0<>(SB)
   914	MOVQ AX, t0
   915
   916	MOVQ p256ord<>+0x00(SB), AX
   917	MULQ t0
   918	ADDQ AX, acc2
   919	ADCQ $0, DX
   920	MOVQ DX, t1
   921
   922	MOVQ p256ord<>+0x08(SB), AX
   923	MULQ t0
   924	ADDQ t1, acc3
   925	ADCQ $0, DX
   926	ADDQ AX, acc3
   927	ADCQ $0, DX
   928	MOVQ DX, t1
   929
   930	MOVQ p256ord<>+0x10(SB), AX
   931	MULQ t0
   932	ADDQ t1, acc4
   933	ADCQ $0, DX
   934	ADDQ AX, acc4
   935	ADCQ $0, DX
   936	MOVQ DX, t1
   937
   938	MOVQ p256ord<>+0x18(SB), AX
   939	MULQ t0
   940	ADDQ t1, acc5
   941	ADCQ $0, DX
   942	ADDQ AX, acc5
   943	ADCQ DX, acc0
   944	ADCQ $0, acc1
   945	// x * y[3]
   946	MOVQ (8*3)(y_ptr), t0
   947
   948	MOVQ (8*0)(x_ptr), AX
   949	MULQ t0
   950	ADDQ AX, acc3
   951	ADCQ $0, DX
   952	MOVQ DX, t1
   953
   954	MOVQ (8*1)(x_ptr), AX
   955	MULQ t0
   956	ADDQ t1, acc4
   957	ADCQ $0, DX
   958	ADDQ AX, acc4
   959	ADCQ $0, DX
   960	MOVQ DX, t1
   961
   962	MOVQ (8*2)(x_ptr), AX
   963	MULQ t0
   964	ADDQ t1, acc5
   965	ADCQ $0, DX
   966	ADDQ AX, acc5
   967	ADCQ $0, DX
   968	MOVQ DX, t1
   969
   970	MOVQ (8*3)(x_ptr), AX
   971	MULQ t0
   972	ADDQ t1, acc0
   973	ADCQ $0, DX
   974	ADDQ AX, acc0
   975	ADCQ DX, acc1
   976	ADCQ $0, acc2
   977	// Last reduction step
   978	MOVQ acc3, AX
   979	MULQ p256ordK0<>(SB)
   980	MOVQ AX, t0
   981
   982	MOVQ p256ord<>+0x00(SB), AX
   983	MULQ t0
   984	ADDQ AX, acc3
   985	ADCQ $0, DX
   986	MOVQ DX, t1
   987
   988	MOVQ p256ord<>+0x08(SB), AX
   989	MULQ t0
   990	ADDQ t1, acc4
   991	ADCQ $0, DX
   992	ADDQ AX, acc4
   993	ADCQ $0, DX
   994	MOVQ DX, t1
   995
   996	MOVQ p256ord<>+0x10(SB), AX
   997	MULQ t0
   998	ADDQ t1, acc5
   999	ADCQ $0, DX
  1000	ADDQ AX, acc5
  1001	ADCQ $0, DX
  1002	MOVQ DX, t1
  1003
  1004	MOVQ p256ord<>+0x18(SB), AX
  1005	MULQ t0
  1006	ADDQ t1, acc0
  1007	ADCQ $0, DX
  1008	ADDQ AX, acc0
  1009	ADCQ DX, acc1
  1010	ADCQ $0, acc2
  1011	// Copy result [255:0]
      	// The result lives in acc4, acc5, acc0, acc1 with acc2 as the overflow
      	// limb; x_ptr is reused as scratch because the inputs are no longer needed.
  1012	MOVQ acc4, x_ptr
  1013	MOVQ acc5, acc3
  1014	MOVQ acc0, t0
  1015	MOVQ acc1, t1
  1016	// Subtract the modulus p256ord
  1017	SUBQ p256ord<>+0x00(SB), acc4
  1018	SBBQ p256ord<>+0x08(SB) ,acc5
  1019	SBBQ p256ord<>+0x10(SB), acc0
  1020	SBBQ p256ord<>+0x18(SB), acc1
  1021	SBBQ $0, acc2
  1022
      	// A final borrow means the value was already below p256ord: restore it.
  1023	CMOVQCS x_ptr, acc4
  1024	CMOVQCS acc3, acc5
  1025	CMOVQCS t0, acc0
  1026	CMOVQCS t1, acc1
  1027
  1028	MOVQ acc4, (8*0)(res_ptr)
  1029	MOVQ acc5, (8*1)(res_ptr)
  1030	MOVQ acc0, (8*2)(res_ptr)
  1031	MOVQ acc1, (8*3)(res_ptr)
  1032
  1033	RET
  1034/* ---------------------------------------*/
  1035// func p256OrdSqr(res, in *p256OrdElement, n int)
      // Squares in modulo p256ord n times in a row, with a four-step word-wise
      // reduction after each squaring, and stores the result in res. After the
      // first pass the input pointer is switched to res, so every further pass
      // squares the previous result.
      // The counter is tested after the body, so the body always runs at least
      // once; n is expected to be >= 1.
      // x_ptr and y_ptr are reused as scratch limbs inside the loop.
  1036TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1037	MOVQ res+0(FP), res_ptr
  1038	MOVQ in+8(FP), x_ptr
  1039	MOVQ n+16(FP), BX
  1040
  1041ordSqrLoop:
  1042
  1043	// x[1:] * x[0]  (x = current input; off-diagonal products only)
  1044	MOVQ (8*0)(x_ptr), t0
  1045
  1046	MOVQ (8*1)(x_ptr), AX
  1047	MULQ t0
  1048	MOVQ AX, acc1
  1049	MOVQ DX, acc2
  1050
  1051	MOVQ (8*2)(x_ptr), AX
  1052	MULQ t0
  1053	ADDQ AX, acc2
  1054	ADCQ $0, DX
  1055	MOVQ DX, acc3
  1056
  1057	MOVQ (8*3)(x_ptr), AX
  1058	MULQ t0
  1059	ADDQ AX, acc3
  1060	ADCQ $0, DX
  1061	MOVQ DX, acc4
  1062	// x[2:] * x[1]
  1063	MOVQ (8*1)(x_ptr), t0
  1064
  1065	MOVQ (8*2)(x_ptr), AX
  1066	MULQ t0
  1067	ADDQ AX, acc3
  1068	ADCQ $0, DX
  1069	MOVQ DX, t1
  1070
  1071	MOVQ (8*3)(x_ptr), AX
  1072	MULQ t0
  1073	ADDQ t1, acc4
  1074	ADCQ $0, DX
  1075	ADDQ AX, acc4
  1076	ADCQ $0, DX
  1077	MOVQ DX, acc5
  1078	// x[3] * x[2]
  1079	MOVQ (8*2)(x_ptr), t0
  1080
  1081	MOVQ (8*3)(x_ptr), AX
  1082	MULQ t0
  1083	ADDQ AX, acc5
  1084	ADCQ $0, DX
  1085	MOVQ DX, y_ptr
  1086	XORQ t1, t1
  1087	// *2  (each off-diagonal product occurs twice in the square)
  1088	ADDQ acc1, acc1
  1089	ADCQ acc2, acc2
  1090	ADCQ acc3, acc3
  1091	ADCQ acc4, acc4
  1092	ADCQ acc5, acc5
  1093	ADCQ y_ptr, y_ptr
  1094	ADCQ $0, t1
  1095	// Missing products: the diagonal terms x[i]*x[i]
  1096	MOVQ (8*0)(x_ptr), AX
  1097	MULQ AX
  1098	MOVQ AX, acc0
  1099	MOVQ DX, t0
  1100
  1101	MOVQ (8*1)(x_ptr), AX
  1102	MULQ AX
  1103	ADDQ t0, acc1
  1104	ADCQ AX, acc2
  1105	ADCQ $0, DX
  1106	MOVQ DX, t0
  1107
  1108	MOVQ (8*2)(x_ptr), AX
  1109	MULQ AX
  1110	ADDQ t0, acc3
  1111	ADCQ AX, acc4
  1112	ADCQ $0, DX
  1113	MOVQ DX, t0
  1114
  1115	MOVQ (8*3)(x_ptr), AX
  1116	MULQ AX
  1117	ADDQ t0, acc5
  1118	ADCQ AX, y_ptr
  1119	ADCQ DX, t1
      	// The input has been fully consumed; x_ptr now holds the top limb of the
      	// 512-bit square (limbs: acc0..acc5, y_ptr, x_ptr).
  1120	MOVQ t1, x_ptr
  1121	// First reduction step
      	// t0 = low 64 bits of acc0 * p256ordK0; t0 * p256ord is then added.
      	// The two low limbs of p256ord are multiplied with MULQ. The two high
      	// limbs are 2^64-1 and 2^64-2^32, so their products with t0 are formed
      	// as (t0<<64) - t0 and (t0<<64) - (t0<<32) using moves, shifts and
      	// add/subtract pairs instead of multiplications.
  1122	MOVQ acc0, AX
  1123	MULQ p256ordK0<>(SB)
  1124	MOVQ AX, t0
  1125
  1126	MOVQ p256ord<>+0x00(SB), AX
  1127	MULQ t0
  1128	ADDQ AX, acc0
  1129	ADCQ $0, DX
  1130	MOVQ DX, t1
  1131
  1132	MOVQ p256ord<>+0x08(SB), AX
  1133	MULQ t0
  1134	ADDQ t1, acc1
  1135	ADCQ $0, DX
  1136	ADDQ AX, acc1
  1137
      	// MOVQ leaves the carry from the ADDQ above intact for the ADCQ below.
  1138	MOVQ t0, t1
  1139	ADCQ DX, acc2
  1140	ADCQ $0, t1
  1141	SUBQ t0, acc2
  1142	SBBQ $0, t1
  1143
  1144	MOVQ t0, AX
  1145	MOVQ t0, DX
  1146	MOVQ t0, acc0
  1147	SHLQ $32, AX
  1148	SHRQ $32, DX
  1149
  1150	ADDQ t1, acc3
  1151	ADCQ $0, acc0
  1152	SUBQ AX, acc3
  1153	SBBQ DX, acc0
  1154	// Second reduction step
  1155	MOVQ acc1, AX
  1156	MULQ p256ordK0<>(SB)
  1157	MOVQ AX, t0
  1158
  1159	MOVQ p256ord<>+0x00(SB), AX
  1160	MULQ t0
  1161	ADDQ AX, acc1
  1162	ADCQ $0, DX
  1163	MOVQ DX, t1
  1164
  1165	MOVQ p256ord<>+0x08(SB), AX
  1166	MULQ t0
  1167	ADDQ t1, acc2
  1168	ADCQ $0, DX
  1169	ADDQ AX, acc2
  1170
  1171	MOVQ t0, t1
  1172	ADCQ DX, acc3
  1173	ADCQ $0, t1
  1174	SUBQ t0, acc3
  1175	SBBQ $0, t1
  1176
  1177	MOVQ t0, AX
  1178	MOVQ t0, DX
  1179	MOVQ t0, acc1
  1180	SHLQ $32, AX
  1181	SHRQ $32, DX
  1182
  1183	ADDQ t1, acc0
  1184	ADCQ $0, acc1
  1185	SUBQ AX, acc0
  1186	SBBQ DX, acc1
  1187	// Third reduction step
  1188	MOVQ acc2, AX
  1189	MULQ p256ordK0<>(SB)
  1190	MOVQ AX, t0
  1191
  1192	MOVQ p256ord<>+0x00(SB), AX
  1193	MULQ t0
  1194	ADDQ AX, acc2
  1195	ADCQ $0, DX
  1196	MOVQ DX, t1
  1197
  1198	MOVQ p256ord<>+0x08(SB), AX
  1199	MULQ t0
  1200	ADDQ t1, acc3
  1201	ADCQ $0, DX
  1202	ADDQ AX, acc3
  1203
  1204	MOVQ t0, t1
  1205	ADCQ DX, acc0
  1206	ADCQ $0, t1
  1207	SUBQ t0, acc0
  1208	SBBQ $0, t1
  1209
  1210	MOVQ t0, AX
  1211	MOVQ t0, DX
  1212	MOVQ t0, acc2
  1213	SHLQ $32, AX
  1214	SHRQ $32, DX
  1215
  1216	ADDQ t1, acc1
  1217	ADCQ $0, acc2
  1218	SUBQ AX, acc1
  1219	SBBQ DX, acc2
  1220	// Last reduction step
  1221	MOVQ acc3, AX
  1222	MULQ p256ordK0<>(SB)
  1223	MOVQ AX, t0
  1224
  1225	MOVQ p256ord<>+0x00(SB), AX
  1226	MULQ t0
  1227	ADDQ AX, acc3
  1228	ADCQ $0, DX
  1229	MOVQ DX, t1
  1230
  1231	MOVQ p256ord<>+0x08(SB), AX
  1232	MULQ t0
  1233	ADDQ t1, acc0
  1234	ADCQ $0, DX
  1235	ADDQ AX, acc0
      	// NOTE(review): unlike the three earlier steps, the carry is folded into
      	// DX here, and the copy of DX into t1 is overwritten by the very next
      	// instruction. The amount added to acc1 appears to be the same either
      	// way - confirm before simplifying.
  1236	ADCQ $0, DX
  1237	MOVQ DX, t1
  1238
  1239	MOVQ t0, t1
  1240	ADCQ DX, acc1
  1241	ADCQ $0, t1
  1242	SUBQ t0, acc1
  1243	SBBQ $0, t1
  1244
  1245	MOVQ t0, AX
  1246	MOVQ t0, DX
  1247	MOVQ t0, acc3
  1248	SHLQ $32, AX
  1249	SHRQ $32, DX
  1250
  1251	ADDQ t1, acc2
  1252	ADCQ $0, acc3
  1253	SUBQ AX, acc2
  1254	SBBQ DX, acc3
      	// XORQ zeroes t0 and also clears the carry flag, so the ADCQ chain below
      	// starts without an incoming carry.
  1255	XORQ t0, t0
  1256	// Add bits [511:256] of the sqr result
  1257	ADCQ acc4, acc0
  1258	ADCQ acc5, acc1
  1259	ADCQ y_ptr, acc2
  1260	ADCQ x_ptr, acc3
  1261	ADCQ $0, t0
  1262
      	// Keep a copy of the unreduced value for the conditional select below.
  1263	MOVQ acc0, acc4
  1264	MOVQ acc1, acc5
  1265	MOVQ acc2, y_ptr
  1266	MOVQ acc3, t1
  1267	// Subtract the modulus p256ord
  1268	SUBQ p256ord<>+0x00(SB), acc0
  1269	SBBQ p256ord<>+0x08(SB) ,acc1
  1270	SBBQ p256ord<>+0x10(SB), acc2
  1271	SBBQ p256ord<>+0x18(SB), acc3
  1272	SBBQ $0, t0
  1273
      	// A final borrow means the value was already below p256ord: restore it.
  1274	CMOVQCS acc4, acc0
  1275	CMOVQCS acc5, acc1
  1276	CMOVQCS y_ptr, acc2
  1277	CMOVQCS t1, acc3
  1278
  1279	MOVQ acc0, (8*0)(res_ptr)
  1280	MOVQ acc1, (8*1)(res_ptr)
  1281	MOVQ acc2, (8*2)(res_ptr)
  1282	MOVQ acc3, (8*3)(res_ptr)
      	// The next pass squares the value just stored.
  1283	MOVQ res_ptr, x_ptr
  1284	DECQ BX
  1285	JNE ordSqrLoop
  1286
  1287	RET
  1288/* ---------------------------------------*/
      // Drop the register aliases used by the exported routines above ...
  1289#undef res_ptr
  1290#undef x_ptr
  1291#undef y_ptr
  1292
  1293#undef acc0
  1294#undef acc1
  1295#undef acc2
  1296#undef acc3
  1297#undef acc4
  1298#undef acc5
  1299#undef t0
  1300#undef t1
  1301/* ---------------------------------------*/
      // ... and define the set used by the internal helpers that follow.
      // These helpers take their operands in registers: acc4..acc7 hold one
      // 4-limb value and t0..t3 the other; acc0..acc3 are working limbs.
      // mul0/mul1 name the implicit MULQ operand/result pair (AX, DX).
      // Note that the accN names map to different registers than before.
  1302#define mul0 AX
  1303#define mul1 DX
  1304#define acc0 BX
  1305#define acc1 CX
  1306#define acc2 R8
  1307#define acc3 R9
  1308#define acc4 R10
  1309#define acc5 R11
  1310#define acc6 R12
  1311#define acc7 R13
  1312#define t0 R14
  1313#define t1 R15
  1314#define t2 DI
  1315#define t3 SI
  1316#define hlp BP
  1317/* ---------------------------------------*/
      // p256SubInternal: register-based helper (no frame arguments).
      //   In:  acc4..acc7 = minuend limbs, t0..t3 = subtrahend limbs.
      //   Out: acc4..acc7 = difference, with the field prime added back when the
      //        raw subtraction borrowed.
      //   Clobbers: mul0 (AX), acc0..acc3 (left holding the raw difference), flags.
      // Both candidate results are always computed and one is chosen with
      // conditional moves, so there is no branch on the borrow.
  1318TEXT p256SubInternal(SB),NOSPLIT,$0
  1319	XORQ mul0, mul0
  1320	SUBQ t0, acc4
  1321	SBBQ t1, acc5
  1322	SBBQ t2, acc6
  1323	SBBQ t3, acc7
      	// mul0 becomes all ones if the subtraction borrowed, zero otherwise.
  1324	SBBQ $0, mul0
  1325
      	// Save the raw difference before speculatively adding the prime.
  1326	MOVQ acc4, acc0
  1327	MOVQ acc5, acc1
  1328	MOVQ acc6, acc2
  1329	MOVQ acc7, acc3
  1330
      	// Add the prime (limbs -1, p256const0, 0, p256const1).
  1331	ADDQ $-1, acc4
  1332	ADCQ p256const0<>(SB), acc5
  1333	ADCQ $0, acc6
  1334	ADCQ p256const1<>(SB), acc7
      	// Sets ZF when there was no borrow; the CMOVs then restore the raw
      	// difference.
  1335	ANDQ $1, mul0
  1336
  1337	CMOVQEQ acc0, acc4
  1338	CMOVQEQ acc1, acc5
  1339	CMOVQEQ acc2, acc6
  1340	CMOVQEQ acc3, acc7
  1341
  1342	RET
  1343/* ---------------------------------------*/
  1344TEXT p256MulInternal(SB),NOSPLIT,$8
  1345	MOVQ acc4, mul0
  1346	MULQ t0
  1347	MOVQ mul0, acc0
  1348	MOVQ mul1, acc1
  1349
  1350	MOVQ acc4, mul0
  1351	MULQ t1
  1352	ADDQ mul0, acc1
  1353	ADCQ $0, mul1
  1354	MOVQ mul1, acc2
  1355
  1356	MOVQ acc4, mul0
  1357	MULQ t2
  1358	ADDQ mul0, acc2
  1359	ADCQ $0, mul1
  1360	MOVQ mul1, acc3
  1361
  1362	MOVQ acc4, mul0
  1363	MULQ t3
  1364	ADDQ mul0, acc3
  1365	ADCQ $0, mul1
  1366	MOVQ mul1, acc4
  1367
  1368	MOVQ acc5, mul0
  1369	MULQ t0
  1370	ADDQ mul0, acc1
  1371	ADCQ $0, mul1
  1372	MOVQ mul1, hlp
  1373
  1374	MOVQ acc5, mul0
  1375	MULQ t1
  1376	ADDQ hlp, acc2
  1377	ADCQ $0, mul1
  1378	ADDQ mul0, acc2
  1379	ADCQ $0, mul1
  1380	MOVQ mul1, hlp
  1381
  1382	MOVQ acc5, mul0
  1383	MULQ t2
  1384	ADDQ hlp, acc3
  1385	ADCQ $0, mul1
  1386	ADDQ mul0, acc3
  1387	ADCQ $0, mul1
  1388	MOVQ mul1, hlp
  1389
  1390	MOVQ acc5, mul0
  1391	MULQ t3
  1392	ADDQ hlp, acc4
  1393	ADCQ $0, mul1
  1394	ADDQ mul0, acc4
  1395	ADCQ $0, mul1
  1396	MOVQ mul1, acc5
  1397
  1398	MOVQ acc6, mul0
  1399	MULQ t0
  1400	ADDQ mul0, acc2
  1401	ADCQ $0, mul1
  1402	MOVQ mul1, hlp
  1403
  1404	MOVQ acc6, mul0
  1405	MULQ t1
  1406	ADDQ hlp, acc3
  1407	ADCQ $0, mul1
  1408	ADDQ mul0, acc3
  1409	ADCQ $0, mul1
  1410	MOVQ mul1, hlp
  1411
  1412	MOVQ acc6, mul0
  1413	MULQ t2
  1414	ADDQ hlp, acc4
  1415	ADCQ $0, mul1
  1416	ADDQ mul0, acc4
  1417	ADCQ $0, mul1
  1418	MOVQ mul1, hlp
  1419
  1420	MOVQ acc6, mul0
  1421	MULQ t3
  1422	ADDQ hlp, acc5
  1423	ADCQ $0, mul1
  1424	ADDQ mul0, acc5
  1425	ADCQ $0, mul1
  1426	MOVQ mul1, acc6
  1427
  1428	MOVQ acc7, mul0
  1429	MULQ t0
  1430	ADDQ mul0, acc3
  1431	ADCQ $0, mul1
  1432	MOVQ mul1, hlp
  1433
  1434	MOVQ acc7, mul0
  1435	MULQ t1
  1436	ADDQ hlp, acc4
  1437	ADCQ $0, mul1
  1438	ADDQ mul0, acc4
  1439	ADCQ $0, mul1
  1440	MOVQ mul1, hlp
  1441
  1442	MOVQ acc7, mul0
  1443	MULQ t2
  1444	ADDQ hlp, acc5
  1445	ADCQ $0, mul1
  1446	ADDQ mul0, acc5
  1447	ADCQ $0, mul1
  1448	MOVQ mul1, hlp
  1449
  1450	MOVQ acc7, mul0
  1451	MULQ t3
  1452	ADDQ hlp, acc6
  1453	ADCQ $0, mul1
  1454	ADDQ mul0, acc6
  1455	ADCQ $0, mul1
  1456	MOVQ mul1, acc7
  1457	// First reduction step
  1458	MOVQ acc0, mul0
  1459	MOVQ acc0, hlp
  1460	SHLQ $32, acc0
  1461	MULQ p256const1<>(SB)
  1462	SHRQ $32, hlp
  1463	ADDQ acc0, acc1
  1464	ADCQ hlp, acc2
  1465	ADCQ mul0, acc3
  1466	ADCQ $0, mul1
  1467	MOVQ mul1, acc0
  1468	// Second reduction step
  1469	MOVQ acc1, mul0
  1470	MOVQ acc1, hlp
  1471	SHLQ $32, acc1
  1472	MULQ p256const1<>(SB)
  1473	SHRQ $32, hlp
  1474	ADDQ acc1, acc2
  1475	ADCQ hlp, acc3
  1476	ADCQ mul0, acc0
  1477	ADCQ $0, mul1
  1478	MOVQ mul1, acc1
  1479	// Third reduction step
  1480	MOVQ acc2, mul0
  1481	MOVQ acc2, hlp
  1482	SHLQ $32, acc2
  1483	MULQ p256const1<>(SB)
  1484	SHRQ $32, hlp
  1485	ADDQ acc2, acc3
  1486	ADCQ hlp, acc0
  1487	ADCQ mul0, acc1
  1488	ADCQ $0, mul1
  1489	MOVQ mul1, acc2
  1490	// Last reduction step
  1491	MOVQ acc3, mul0
  1492	MOVQ acc3, hlp
  1493	SHLQ $32, acc3
  1494	MULQ p256const1<>(SB)
  1495	SHRQ $32, hlp
  1496	ADDQ acc3, acc0
  1497	ADCQ hlp, acc1
  1498	ADCQ mul0, acc2
  1499	ADCQ $0, mul1
  1500	MOVQ mul1, acc3
  1501	MOVQ $0, BP
  1502	// Add bits [511:256] of the result
  1503	ADCQ acc0, acc4
  1504	ADCQ acc1, acc5
  1505	ADCQ acc2, acc6
  1506	ADCQ acc3, acc7
  1507	ADCQ $0, hlp
  1508	// Copy result
  1509	MOVQ acc4, acc0
  1510	MOVQ acc5, acc1
  1511	MOVQ acc6, acc2
  1512	MOVQ acc7, acc3
  1513	// Subtract p256
  1514	SUBQ $-1, acc4
  1515	SBBQ p256const0<>(SB) ,acc5
  1516	SBBQ $0, acc6
  1517	SBBQ p256const1<>(SB), acc7
  1518	SBBQ $0, hlp
  1519	// If the result of the subtraction is negative, restore the previous result
  1520	CMOVQCS acc0, acc4
  1521	CMOVQCS acc1, acc5
  1522	CMOVQCS acc2, acc6
  1523	CMOVQCS acc3, acc7
  1524
  1525	RET
  1526/* ---------------------------------------*/
  1527TEXT p256SqrInternal(SB),NOSPLIT,$8
  1528
  1529	MOVQ acc4, mul0
  1530	MULQ acc5
  1531	MOVQ mul0, acc1
  1532	MOVQ mul1, acc2
  1533
  1534	MOVQ acc4, mul0
  1535	MULQ acc6
  1536	ADDQ mul0, acc2
  1537	ADCQ $0, mul1
  1538	MOVQ mul1, acc3
  1539
  1540	MOVQ acc4, mul0
  1541	MULQ acc7
  1542	ADDQ mul0, acc3
  1543	ADCQ $0, mul1
  1544	MOVQ mul1, t0
  1545
  1546	MOVQ acc5, mul0
  1547	MULQ acc6
  1548	ADDQ mul0, acc3
  1549	ADCQ $0, mul1
  1550	MOVQ mul1, hlp
  1551
  1552	MOVQ acc5, mul0
  1553	MULQ acc7
  1554	ADDQ hlp, t0
  1555	ADCQ $0, mul1
  1556	ADDQ mul0, t0
  1557	ADCQ $0, mul1
  1558	MOVQ mul1, t1
  1559
  1560	MOVQ acc6, mul0
  1561	MULQ acc7
  1562	ADDQ mul0, t1
  1563	ADCQ $0, mul1
  1564	MOVQ mul1, t2
  1565	XORQ t3, t3
  1566	// *2
  1567	ADDQ acc1, acc1
  1568	ADCQ acc2, acc2
  1569	ADCQ acc3, acc3
  1570	ADCQ t0, t0
  1571	ADCQ t1, t1
  1572	ADCQ t2, t2
  1573	ADCQ $0, t3
  1574	// Missing products
  1575	MOVQ acc4, mul0
  1576	MULQ mul0
  1577	MOVQ mul0, acc0
  1578	MOVQ DX, acc4
  1579
  1580	MOVQ acc5, mul0
  1581	MULQ mul0
  1582	ADDQ acc4, acc1
  1583	ADCQ mul0, acc2
  1584	ADCQ $0, DX
  1585	MOVQ DX, acc4
  1586
  1587	MOVQ acc6, mul0
  1588	MULQ mul0
  1589	ADDQ acc4, acc3
  1590	ADCQ mul0, t0
  1591	ADCQ $0, DX
  1592	MOVQ DX, acc4
  1593
  1594	MOVQ acc7, mul0
  1595	MULQ mul0
  1596	ADDQ acc4, t1
  1597	ADCQ mul0, t2
  1598	ADCQ DX, t3
  1599	// First reduction step
  1600	MOVQ acc0, mul0
  1601	MOVQ acc0, hlp
  1602	SHLQ $32, acc0
  1603	MULQ p256const1<>(SB)
  1604	SHRQ $32, hlp
  1605	ADDQ acc0, acc1
  1606	ADCQ hlp, acc2
  1607	ADCQ mul0, acc3
  1608	ADCQ $0, mul1
  1609	MOVQ mul1, acc0
  1610	// Second reduction step
  1611	MOVQ acc1, mul0
  1612	MOVQ acc1, hlp
  1613	SHLQ $32, acc1
  1614	MULQ p256const1<>(SB)
  1615	SHRQ $32, hlp
  1616	ADDQ acc1, acc2
  1617	ADCQ hlp, acc3
  1618	ADCQ mul0, acc0
  1619	ADCQ $0, mul1
  1620	MOVQ mul1, acc1
  1621	// Third reduction step
  1622	MOVQ acc2, mul0
  1623	MOVQ acc2, hlp
  1624	SHLQ $32, acc2
  1625	MULQ p256const1<>(SB)
  1626	SHRQ $32, hlp
  1627	ADDQ acc2, acc3
  1628	ADCQ hlp, acc0
  1629	ADCQ mul0, acc1
  1630	ADCQ $0, mul1
  1631	MOVQ mul1, acc2
  1632	// Last reduction step
  1633	MOVQ acc3, mul0
  1634	MOVQ acc3, hlp
  1635	SHLQ $32, acc3
  1636	MULQ p256const1<>(SB)
  1637	SHRQ $32, hlp
  1638	ADDQ acc3, acc0
  1639	ADCQ hlp, acc1
  1640	ADCQ mul0, acc2
  1641	ADCQ $0, mul1
  1642	MOVQ mul1, acc3
  1643	MOVQ $0, BP
  1644	// Add bits [511:256] of the result
  1645	ADCQ acc0, t0
  1646	ADCQ acc1, t1
  1647	ADCQ acc2, t2
  1648	ADCQ acc3, t3
  1649	ADCQ $0, hlp
  1650	// Copy result
  1651	MOVQ t0, acc4
  1652	MOVQ t1, acc5
  1653	MOVQ t2, acc6
  1654	MOVQ t3, acc7
  1655	// Subtract p256
  1656	SUBQ $-1, acc4
  1657	SBBQ p256const0<>(SB) ,acc5
  1658	SBBQ $0, acc6
  1659	SBBQ p256const1<>(SB), acc7
  1660	SBBQ $0, hlp
  1661	// If the result of the subtraction is negative, restore the previous result
  1662	CMOVQCS t0, acc4
  1663	CMOVQCS t1, acc5
  1664	CMOVQCS t2, acc6
  1665	CMOVQCS t3, acc7
  1666
  1667	RET
  1668/* ---------------------------------------*/
  1669#define p256MulBy2Inline\
  1670	XORQ mul0, mul0;\
  1671	ADDQ acc4, acc4;\
  1672	ADCQ acc5, acc5;\
  1673	ADCQ acc6, acc6;\
  1674	ADCQ acc7, acc7;\
  1675	ADCQ $0, mul0;\
  1676	MOVQ acc4, t0;\
  1677	MOVQ acc5, t1;\
  1678	MOVQ acc6, t2;\
  1679	MOVQ acc7, t3;\
  1680	SUBQ $-1, t0;\
  1681	SBBQ p256const0<>(SB), t1;\
  1682	SBBQ $0, t2;\
  1683	SBBQ p256const1<>(SB), t3;\
  1684	SBBQ $0, mul0;\
  1685	CMOVQCS acc4, t0;\
  1686	CMOVQCS acc5, t1;\
  1687	CMOVQCS acc6, t2;\
  1688	CMOVQCS acc7, t3;
  1689/* ---------------------------------------*/
  1690#define p256AddInline \
  1691	XORQ mul0, mul0;\
  1692	ADDQ t0, acc4;\
  1693	ADCQ t1, acc5;\
  1694	ADCQ t2, acc6;\
  1695	ADCQ t3, acc7;\
  1696	ADCQ $0, mul0;\
  1697	MOVQ acc4, t0;\
  1698	MOVQ acc5, t1;\
  1699	MOVQ acc6, t2;\
  1700	MOVQ acc7, t3;\
  1701	SUBQ $-1, t0;\
  1702	SBBQ p256const0<>(SB), t1;\
  1703	SBBQ $0, t2;\
  1704	SBBQ p256const1<>(SB), t3;\
  1705	SBBQ $0, mul0;\
  1706	CMOVQCS acc4, t0;\
  1707	CMOVQCS acc5, t1;\
  1708	CMOVQCS acc6, t2;\
  1709	CMOVQCS acc7, t3;
  1710/* ---------------------------------------*/
  1711#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1712#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1713#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1714#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1715#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1716#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1717/* ---------------------------------------*/
  1718#define x1in(off) (32*0 + off)(SP)
  1719#define y1in(off) (32*1 + off)(SP)
  1720#define z1in(off) (32*2 + off)(SP)
  1721#define x2in(off) (32*3 + off)(SP)
  1722#define y2in(off) (32*4 + off)(SP)
  1723#define xout(off) (32*5 + off)(SP)
  1724#define yout(off) (32*6 + off)(SP)
  1725#define zout(off) (32*7 + off)(SP)
  1726#define s2(off)   (32*8 + off)(SP)
  1727#define z1sqr(off) (32*9 + off)(SP)
  1728#define h(off)	  (32*10 + off)(SP)
  1729#define r(off)	  (32*11 + off)(SP)
  1730#define hsqr(off) (32*12 + off)(SP)
  1731#define rsqr(off) (32*13 + off)(SP)
  1732#define hcub(off) (32*14 + off)(SP)
  1733#define rptr	  (32*15)(SP)
  1734#define sel_save  (32*15 + 8)(SP)
  1735#define zero_save (32*15 + 8 + 4)(SP)
  1736
  1737// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1738TEXT ·p256PointAddAffineAsm(SB),0,$512-48
  1739	// Move input to stack in order to free registers
  1740	MOVQ res+0(FP), AX
  1741	MOVQ in1+8(FP), BX
  1742	MOVQ in2+16(FP), CX
  1743	MOVQ sign+24(FP), DX
  1744	MOVQ sel+32(FP), t1
  1745	MOVQ zero+40(FP), t2
  1746
  1747	MOVOU (16*0)(BX), X0
  1748	MOVOU (16*1)(BX), X1
  1749	MOVOU (16*2)(BX), X2
  1750	MOVOU (16*3)(BX), X3
  1751	MOVOU (16*4)(BX), X4
  1752	MOVOU (16*5)(BX), X5
  1753
  1754	MOVOU X0, x1in(16*0)
  1755	MOVOU X1, x1in(16*1)
  1756	MOVOU X2, y1in(16*0)
  1757	MOVOU X3, y1in(16*1)
  1758	MOVOU X4, z1in(16*0)
  1759	MOVOU X5, z1in(16*1)
  1760
  1761	MOVOU (16*0)(CX), X0
  1762	MOVOU (16*1)(CX), X1
  1763
  1764	MOVOU X0, x2in(16*0)
  1765	MOVOU X1, x2in(16*1)
  1766	// Store pointer to result
  1767	MOVQ mul0, rptr
  1768	MOVL t1, sel_save
  1769	MOVL t2, zero_save
  1770	// Negate y2in based on sign
  1771	MOVQ (16*2 + 8*0)(CX), acc4
  1772	MOVQ (16*2 + 8*1)(CX), acc5
  1773	MOVQ (16*2 + 8*2)(CX), acc6
  1774	MOVQ (16*2 + 8*3)(CX), acc7
  1775	MOVQ $-1, acc0
  1776	MOVQ p256const0<>(SB), acc1
  1777	MOVQ $0, acc2
  1778	MOVQ p256const1<>(SB), acc3
  1779	XORQ mul0, mul0
  1780	// Speculatively subtract
  1781	SUBQ acc4, acc0
  1782	SBBQ acc5, acc1
  1783	SBBQ acc6, acc2
  1784	SBBQ acc7, acc3
  1785	SBBQ $0, mul0
  1786	MOVQ acc0, t0
  1787	MOVQ acc1, t1
  1788	MOVQ acc2, t2
  1789	MOVQ acc3, t3
  1790	// Add in case the operand was > p256
  1791	ADDQ $-1, acc0
  1792	ADCQ p256const0<>(SB), acc1
  1793	ADCQ $0, acc2
  1794	ADCQ p256const1<>(SB), acc3
  1795	ADCQ $0, mul0
  1796	CMOVQNE t0, acc0
  1797	CMOVQNE t1, acc1
  1798	CMOVQNE t2, acc2
  1799	CMOVQNE t3, acc3
  1800	// If condition is 0, keep original value
  1801	TESTQ DX, DX
  1802	CMOVQEQ acc4, acc0
  1803	CMOVQEQ acc5, acc1
  1804	CMOVQEQ acc6, acc2
  1805	CMOVQEQ acc7, acc3
  1806	// Store result
  1807	MOVQ acc0, y2in(8*0)
  1808	MOVQ acc1, y2in(8*1)
  1809	MOVQ acc2, y2in(8*2)
  1810	MOVQ acc3, y2in(8*3)
  1811	// Begin point add
  1812	LDacc (z1in)
  1813	CALL p256SqrInternal(SB)	// z1ˆ2
  1814	ST (z1sqr)
  1815
  1816	LDt (x2in)
  1817	CALL p256MulInternal(SB)	// x2 * z1ˆ2
  1818
  1819	LDt (x1in)
  1820	CALL p256SubInternal(SB)	// h = u2 - u1
  1821	ST (h)
  1822
  1823	LDt (z1in)
  1824	CALL p256MulInternal(SB)	// z3 = h * z1
  1825	ST (zout)
  1826
  1827	LDacc (z1sqr)
  1828	CALL p256MulInternal(SB)	// z1ˆ3
  1829
  1830	LDt (y2in)
  1831	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1832	ST (s2)
  1833
  1834	LDt (y1in)
  1835	CALL p256SubInternal(SB)	// r = s2 - s1
  1836	ST (r)
  1837
  1838	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  1839	ST (rsqr)
  1840
  1841	LDacc (h)
  1842	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  1843	ST (hsqr)
  1844
  1845	LDt (h)
  1846	CALL p256MulInternal(SB)	// hcub = hˆ3
  1847	ST (hcub)
  1848
  1849	LDt (y1in)
  1850	CALL p256MulInternal(SB)	// y1 * hˆ3
  1851	ST (s2)
  1852
  1853	LDacc (x1in)
  1854	LDt (hsqr)
  1855	CALL p256MulInternal(SB)	// u1 * hˆ2
  1856	ST (h)
  1857
  1858	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1859	LDacc (rsqr)
  1860	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1861
  1862	LDt (hcub)
  1863	CALL p256SubInternal(SB)
  1864	ST (xout)
  1865
  1866	MOVQ acc4, t0
  1867	MOVQ acc5, t1
  1868	MOVQ acc6, t2
  1869	MOVQ acc7, t3
  1870	LDacc (h)
  1871	CALL p256SubInternal(SB)
  1872
  1873	LDt (r)
  1874	CALL p256MulInternal(SB)
  1875
  1876	LDt (s2)
  1877	CALL p256SubInternal(SB)
  1878	ST (yout)
  1879	// Load stored values from stack
  1880	MOVQ rptr, AX
  1881	MOVL sel_save, BX
  1882	MOVL zero_save, CX
  1883	// The result is not valid if (sel == 0), conditional choose
  1884	MOVOU xout(16*0), X0
  1885	MOVOU xout(16*1), X1
  1886	MOVOU yout(16*0), X2
  1887	MOVOU yout(16*1), X3
  1888	MOVOU zout(16*0), X4
  1889	MOVOU zout(16*1), X5
  1890
  1891	MOVL BX, X6
  1892	MOVL CX, X7
  1893
  1894	PXOR X8, X8
  1895	PCMPEQL X9, X9
  1896
  1897	PSHUFD $0, X6, X6
  1898	PSHUFD $0, X7, X7
  1899
  1900	PCMPEQL X8, X6
  1901	PCMPEQL X8, X7
  1902
  1903	MOVOU X6, X15
  1904	PANDN X9, X15
  1905
  1906	MOVOU x1in(16*0), X9
  1907	MOVOU x1in(16*1), X10
  1908	MOVOU y1in(16*0), X11
  1909	MOVOU y1in(16*1), X12
  1910	MOVOU z1in(16*0), X13
  1911	MOVOU z1in(16*1), X14
  1912
  1913	PAND X15, X0
  1914	PAND X15, X1
  1915	PAND X15, X2
  1916	PAND X15, X3
  1917	PAND X15, X4
  1918	PAND X15, X5
  1919
  1920	PAND X6, X9
  1921	PAND X6, X10
  1922	PAND X6, X11
  1923	PAND X6, X12
  1924	PAND X6, X13
  1925	PAND X6, X14
  1926
  1927	PXOR X9, X0
  1928	PXOR X10, X1
  1929	PXOR X11, X2
  1930	PXOR X12, X3
  1931	PXOR X13, X4
  1932	PXOR X14, X5
  1933	// Similarly if zero == 0
  1934	PCMPEQL X9, X9
  1935	MOVOU X7, X15
  1936	PANDN X9, X15
  1937
  1938	MOVOU x2in(16*0), X9
  1939	MOVOU x2in(16*1), X10
  1940	MOVOU y2in(16*0), X11
  1941	MOVOU y2in(16*1), X12
  1942	MOVOU p256one<>+0x00(SB), X13
  1943	MOVOU p256one<>+0x10(SB), X14
  1944
  1945	PAND X15, X0
  1946	PAND X15, X1
  1947	PAND X15, X2
  1948	PAND X15, X3
  1949	PAND X15, X4
  1950	PAND X15, X5
  1951
  1952	PAND X7, X9
  1953	PAND X7, X10
  1954	PAND X7, X11
  1955	PAND X7, X12
  1956	PAND X7, X13
  1957	PAND X7, X14
  1958
  1959	PXOR X9, X0
  1960	PXOR X10, X1
  1961	PXOR X11, X2
  1962	PXOR X12, X3
  1963	PXOR X13, X4
  1964	PXOR X14, X5
  1965	// Finally output the result
  1966	MOVOU X0, (16*0)(AX)
  1967	MOVOU X1, (16*1)(AX)
  1968	MOVOU X2, (16*2)(AX)
  1969	MOVOU X3, (16*3)(AX)
  1970	MOVOU X4, (16*4)(AX)
  1971	MOVOU X5, (16*5)(AX)
  1972	MOVQ $0, rptr
  1973
  1974	RET
  1975#undef x1in
  1976#undef y1in
  1977#undef z1in
  1978#undef x2in
  1979#undef y2in
  1980#undef xout
  1981#undef yout
  1982#undef zout
  1983#undef s2
  1984#undef z1sqr
  1985#undef h
  1986#undef r
  1987#undef hsqr
  1988#undef rsqr
  1989#undef hcub
  1990#undef rptr
  1991#undef sel_save
  1992#undef zero_save
  1993
  1994// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1995// otherwise. It writes to [acc4..acc7], t0 and t1.
  1996TEXT p256IsZero(SB),NOSPLIT,$0
  1997	// AX contains a flag that is set if the input is zero.
  1998	XORQ AX, AX
  1999	MOVQ $1, t1
  2000
  2001	// Check whether [acc4..acc7] are all zero.
  2002	MOVQ acc4, t0
  2003	ORQ acc5, t0
  2004	ORQ acc6, t0
  2005	ORQ acc7, t0
  2006
  2007	// Set the zero flag if so. (CMOV of a constant to a register doesn't
  2008	// appear to be supported in Go. Thus t1 = 1.)
  2009	CMOVQEQ t1, AX
  2010
  2011	// XOR [acc4..acc7] with P and compare with zero again.
  2012	XORQ $-1, acc4
  2013	XORQ p256const0<>(SB), acc5
  2014	XORQ p256const1<>(SB), acc7
  2015	ORQ acc5, acc4
  2016	ORQ acc6, acc4
  2017	ORQ acc7, acc4
  2018
  2019	// Set the zero flag if so.
  2020	CMOVQEQ t1, AX
  2021	RET
  2022
  2023/* ---------------------------------------*/
  2024#define x1in(off) (32*0 + off)(SP)
  2025#define y1in(off) (32*1 + off)(SP)
  2026#define z1in(off) (32*2 + off)(SP)
  2027#define x2in(off) (32*3 + off)(SP)
  2028#define y2in(off) (32*4 + off)(SP)
  2029#define z2in(off) (32*5 + off)(SP)
  2030
  2031#define xout(off) (32*6 + off)(SP)
  2032#define yout(off) (32*7 + off)(SP)
  2033#define zout(off) (32*8 + off)(SP)
  2034
  2035#define u1(off)    (32*9 + off)(SP)
  2036#define u2(off)    (32*10 + off)(SP)
  2037#define s1(off)    (32*11 + off)(SP)
  2038#define s2(off)    (32*12 + off)(SP)
  2039#define z1sqr(off) (32*13 + off)(SP)
  2040#define z2sqr(off) (32*14 + off)(SP)
  2041#define h(off)     (32*15 + off)(SP)
  2042#define r(off)     (32*16 + off)(SP)
  2043#define hsqr(off)  (32*17 + off)(SP)
  2044#define rsqr(off)  (32*18 + off)(SP)
  2045#define hcub(off)  (32*19 + off)(SP)
  2046#define rptr       (32*20)(SP)
  2047#define points_eq  (32*20+8)(SP)
  2048
  2049//func p256PointAddAsm(res, in1, in2 *P256Point) int
  2050TEXT ·p256PointAddAsm(SB),0,$680-32
  2051	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2052	// Move input to stack in order to free registers
  2053	MOVQ res+0(FP), AX
  2054	MOVQ in1+8(FP), BX
  2055	MOVQ in2+16(FP), CX
  2056
  2057	MOVOU (16*0)(BX), X0
  2058	MOVOU (16*1)(BX), X1
  2059	MOVOU (16*2)(BX), X2
  2060	MOVOU (16*3)(BX), X3
  2061	MOVOU (16*4)(BX), X4
  2062	MOVOU (16*5)(BX), X5
  2063
  2064	MOVOU X0, x1in(16*0)
  2065	MOVOU X1, x1in(16*1)
  2066	MOVOU X2, y1in(16*0)
  2067	MOVOU X3, y1in(16*1)
  2068	MOVOU X4, z1in(16*0)
  2069	MOVOU X5, z1in(16*1)
  2070
  2071	MOVOU (16*0)(CX), X0
  2072	MOVOU (16*1)(CX), X1
  2073	MOVOU (16*2)(CX), X2
  2074	MOVOU (16*3)(CX), X3
  2075	MOVOU (16*4)(CX), X4
  2076	MOVOU (16*5)(CX), X5
  2077
  2078	MOVOU X0, x2in(16*0)
  2079	MOVOU X1, x2in(16*1)
  2080	MOVOU X2, y2in(16*0)
  2081	MOVOU X3, y2in(16*1)
  2082	MOVOU X4, z2in(16*0)
  2083	MOVOU X5, z2in(16*1)
  2084	// Store pointer to result
  2085	MOVQ AX, rptr
  2086	// Begin point add
  2087	LDacc (z2in)
  2088	CALL p256SqrInternal(SB)	// z2ˆ2
  2089	ST (z2sqr)
  2090	LDt (z2in)
  2091	CALL p256MulInternal(SB)	// z2ˆ3
  2092	LDt (y1in)
  2093	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
  2094	ST (s1)
  2095
  2096	LDacc (z1in)
  2097	CALL p256SqrInternal(SB)	// z1ˆ2
  2098	ST (z1sqr)
  2099	LDt (z1in)
  2100	CALL p256MulInternal(SB)	// z1ˆ3
  2101	LDt (y2in)
  2102	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
  2103	ST (s2)
  2104
  2105	LDt (s1)
  2106	CALL p256SubInternal(SB)	// r = s2 - s1
  2107	ST (r)
  2108	CALL p256IsZero(SB)
  2109	MOVQ AX, points_eq
  2110
  2111	LDacc (z2sqr)
  2112	LDt (x1in)
  2113	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2114	ST (u1)
  2115	LDacc (z1sqr)
  2116	LDt (x2in)
  2117	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2118	ST (u2)
  2119
  2120	LDt (u1)
  2121	CALL p256SubInternal(SB)	// h = u2 - u1
  2122	ST (h)
  2123	CALL p256IsZero(SB)
  2124	ANDQ points_eq, AX
  2125	MOVQ AX, points_eq
  2126
  2127	LDacc (r)
  2128	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  2129	ST (rsqr)
  2130
  2131	LDacc (h)
  2132	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  2133	ST (hsqr)
  2134
  2135	LDt (h)
  2136	CALL p256MulInternal(SB)	// hcub = hˆ3
  2137	ST (hcub)
  2138
  2139	LDt (s1)
  2140	CALL p256MulInternal(SB)
  2141	ST (s2)
  2142
  2143	LDacc (z1in)
  2144	LDt (z2in)
  2145	CALL p256MulInternal(SB)	// z1 * z2
  2146	LDt (h)
  2147	CALL p256MulInternal(SB)	// z1 * z2 * h
  2148	ST (zout)
  2149
  2150	LDacc (hsqr)
  2151	LDt (u1)
  2152	CALL p256MulInternal(SB)	// hˆ2 * u1
  2153	ST (u2)
  2154
  2155	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2156	LDacc (rsqr)
  2157	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2158
  2159	LDt (hcub)
  2160	CALL p256SubInternal(SB)
  2161	ST (xout)
  2162
  2163	MOVQ acc4, t0
  2164	MOVQ acc5, t1
  2165	MOVQ acc6, t2
  2166	MOVQ acc7, t3
  2167	LDacc (u2)
  2168	CALL p256SubInternal(SB)
  2169
  2170	LDt (r)
  2171	CALL p256MulInternal(SB)
  2172
  2173	LDt (s2)
  2174	CALL p256SubInternal(SB)
  2175	ST (yout)
  2176
  2177	MOVOU xout(16*0), X0
  2178	MOVOU xout(16*1), X1
  2179	MOVOU yout(16*0), X2
  2180	MOVOU yout(16*1), X3
  2181	MOVOU zout(16*0), X4
  2182	MOVOU zout(16*1), X5
  2183	// Finally output the result
  2184	MOVQ rptr, AX
  2185	MOVQ $0, rptr
  2186	MOVOU X0, (16*0)(AX)
  2187	MOVOU X1, (16*1)(AX)
  2188	MOVOU X2, (16*2)(AX)
  2189	MOVOU X3, (16*3)(AX)
  2190	MOVOU X4, (16*4)(AX)
  2191	MOVOU X5, (16*5)(AX)
  2192
  2193	MOVQ points_eq, AX
  2194	MOVQ AX, ret+24(FP)
  2195
  2196	RET
  2197#undef x1in
  2198#undef y1in
  2199#undef z1in
  2200#undef x2in
  2201#undef y2in
  2202#undef z2in
  2203#undef xout
  2204#undef yout
  2205#undef zout
  2206#undef s1
  2207#undef s2
  2208#undef u1
  2209#undef u2
  2210#undef z1sqr
  2211#undef z2sqr
  2212#undef h
  2213#undef r
  2214#undef hsqr
  2215#undef rsqr
  2216#undef hcub
  2217#undef rptr
  2218/* ---------------------------------------*/
  2219#define x(off) (32*0 + off)(SP)
  2220#define y(off) (32*1 + off)(SP)
  2221#define z(off) (32*2 + off)(SP)
  2222
  2223#define s(off)	(32*3 + off)(SP)
  2224#define m(off)	(32*4 + off)(SP)
  2225#define zsqr(off) (32*5 + off)(SP)
  2226#define tmp(off)  (32*6 + off)(SP)
  2227#define rptr	  (32*7)(SP)
  2228
  2229//func p256PointDoubleAsm(res, in *P256Point)
  2230TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
  2231	// Move input to stack in order to free registers
  2232	MOVQ res+0(FP), AX
  2233	MOVQ in+8(FP), BX
  2234
  2235	MOVOU (16*0)(BX), X0
  2236	MOVOU (16*1)(BX), X1
  2237	MOVOU (16*2)(BX), X2
  2238	MOVOU (16*3)(BX), X3
  2239	MOVOU (16*4)(BX), X4
  2240	MOVOU (16*5)(BX), X5
  2241
  2242	MOVOU X0, x(16*0)
  2243	MOVOU X1, x(16*1)
  2244	MOVOU X2, y(16*0)
  2245	MOVOU X3, y(16*1)
  2246	MOVOU X4, z(16*0)
  2247	MOVOU X5, z(16*1)
  2248	// Store pointer to result
  2249	MOVQ AX, rptr
  2250	// Begin point double
  2251	LDacc (z)
  2252	CALL p256SqrInternal(SB)
  2253	ST (zsqr)
  2254
  2255	LDt (x)
  2256	p256AddInline
  2257	STt (m)
  2258
  2259	LDacc (z)
  2260	LDt (y)
  2261	CALL p256MulInternal(SB)
  2262	p256MulBy2Inline
  2263	MOVQ rptr, AX
  2264	// Store z
  2265	MOVQ t0, (16*4 + 8*0)(AX)
  2266	MOVQ t1, (16*4 + 8*1)(AX)
  2267	MOVQ t2, (16*4 + 8*2)(AX)
  2268	MOVQ t3, (16*4 + 8*3)(AX)
  2269
  2270	LDacc (x)
  2271	LDt (zsqr)
  2272	CALL p256SubInternal(SB)
  2273	LDt (m)
  2274	CALL p256MulInternal(SB)
  2275	ST (m)
  2276	// Multiply by 3
  2277	p256MulBy2Inline
  2278	LDacc (m)
  2279	p256AddInline
  2280	STt (m)
  2281	////////////////////////
  2282	LDacc (y)
  2283	p256MulBy2Inline
  2284	t2acc
  2285	CALL p256SqrInternal(SB)
  2286	ST (s)
  2287	CALL p256SqrInternal(SB)
  2288	// Divide by 2
  2289	XORQ mul0, mul0
  2290	MOVQ acc4, t0
  2291	MOVQ acc5, t1
  2292	MOVQ acc6, t2
  2293	MOVQ acc7, t3
  2294
  2295	ADDQ $-1, acc4
  2296	ADCQ p256const0<>(SB), acc5
  2297	ADCQ $0, acc6
  2298	ADCQ p256const1<>(SB), acc7
  2299	ADCQ $0, mul0
  2300	TESTQ $1, t0
  2301
  2302	CMOVQEQ t0, acc4
  2303	CMOVQEQ t1, acc5
  2304	CMOVQEQ t2, acc6
  2305	CMOVQEQ t3, acc7
  2306	ANDQ t0, mul0
  2307
  2308	SHRQ $1, acc5, acc4
  2309	SHRQ $1, acc6, acc5
  2310	SHRQ $1, acc7, acc6
  2311	SHRQ $1, mul0, acc7
  2312	ST (y)
  2313	/////////////////////////
  2314	LDacc (x)
  2315	LDt (s)
  2316	CALL p256MulInternal(SB)
  2317	ST (s)
  2318	p256MulBy2Inline
  2319	STt (tmp)
  2320
  2321	LDacc (m)
  2322	CALL p256SqrInternal(SB)
  2323	LDt (tmp)
  2324	CALL p256SubInternal(SB)
  2325
  2326	MOVQ rptr, AX
  2327	// Store x
  2328	MOVQ acc4, (16*0 + 8*0)(AX)
  2329	MOVQ acc5, (16*0 + 8*1)(AX)
  2330	MOVQ acc6, (16*0 + 8*2)(AX)
  2331	MOVQ acc7, (16*0 + 8*3)(AX)
  2332
  2333	acc2t
  2334	LDacc (s)
  2335	CALL p256SubInternal(SB)
  2336
  2337	LDt (m)
  2338	CALL p256MulInternal(SB)
  2339
  2340	LDt (y)
  2341	CALL p256SubInternal(SB)
  2342	MOVQ rptr, AX
  2343	// Store y
  2344	MOVQ acc4, (16*2 + 8*0)(AX)
  2345	MOVQ acc5, (16*2 + 8*1)(AX)
  2346	MOVQ acc6, (16*2 + 8*2)(AX)
  2347	MOVQ acc7, (16*2 + 8*3)(AX)
  2348	///////////////////////
  2349	MOVQ $0, rptr
  2350
  2351	RET
  2352/* ---------------------------------------*/

View as plain text