...

Text file src/crypto/internal/fips140/nistec/p256_asm_amd64.s

Documentation: crypto/internal/fips140/nistec

     1// Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT.
     2
     3//go:build !purego
     4
     5#include "textflag.h"
     6
     7// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
     8// Requires: SSE2
      //
      // p256MovCond writes 96 bytes to res: the 96 bytes at a when cond is
      // non-zero, or the 96 bytes at b when cond is zero. The choice is made
      // with SSE masks instead of a branch, and both inputs are always read in
      // full. Only the low 32 bits of cond are compared against zero, because
      // PSHUFD $0x00 broadcasts dword 0 of X12 before the comparison.
     9TEXT ·p256MovCond(SB), NOSPLIT, $0-32
    10	MOVQ    res+0(FP), DI
    11	MOVQ    a+8(FP), SI
    12	MOVQ    b+16(FP), CX
    13	MOVQ    cond+24(FP), X12
      	// X12 = mask: all ones in every lane when cond's low dword is 0, else all zeros.
    14	PXOR    X13, X13
    15	PSHUFD  $0x00, X12, X12
    16	PCMPEQL X13, X12
      	// X0..X5 = a AND NOT mask (PANDN inverts its destination operand).
    17	MOVOU   X12, X0
    18	MOVOU   (SI), X6
    19	PANDN   X6, X0
    20	MOVOU   X12, X1
    21	MOVOU   16(SI), X7
    22	PANDN   X7, X1
    23	MOVOU   X12, X2
    24	MOVOU   32(SI), X8
    25	PANDN   X8, X2
    26	MOVOU   X12, X3
    27	MOVOU   48(SI), X9
    28	PANDN   X9, X3
    29	MOVOU   X12, X4
    30	MOVOU   64(SI), X10
    31	PANDN   X10, X4
    32	MOVOU   X12, X5
    33	MOVOU   80(SI), X11
    34	PANDN   X11, X5
      	// X6..X11 = b AND mask.
    35	MOVOU   (CX), X6
    36	MOVOU   16(CX), X7
    37	MOVOU   32(CX), X8
    38	MOVOU   48(CX), X9
    39	MOVOU   64(CX), X10
    40	MOVOU   80(CX), X11
    41	PAND    X12, X6
    42	PAND    X12, X7
    43	PAND    X12, X8
    44	PAND    X12, X9
    45	PAND    X12, X10
    46	PAND    X12, X11
      	// Combine the two masked copies (one of them is all zeros) and store.
    47	PXOR    X6, X0
    48	PXOR    X7, X1
    49	PXOR    X8, X2
    50	PXOR    X9, X3
    51	PXOR    X10, X4
    52	PXOR    X11, X5
    53	MOVOU   X0, (DI)
    54	MOVOU   X1, 16(DI)
    55	MOVOU   X2, 32(DI)
    56	MOVOU   X3, 48(DI)
    57	MOVOU   X4, 64(DI)
    58	MOVOU   X5, 80(DI)
    59	RET
    60
    61// func p256NegCond(val *p256Element, cond int)
    62// Requires: CMOV
      //
      // p256NegCond rewrites the four 64-bit limbs at val in place. When cond is
      // non-zero they become (poly - val), where poly is the constant loaded
      // under "acc = poly" below; when cond is zero the original limbs are
      // stored back unchanged. The subtraction is always performed and the
      // result is picked with CMOV, so there is no branch on cond. The
      // difference is stored as computed; no further reduction is applied.
    63TEXT ·p256NegCond(SB), NOSPLIT, $0-16
    64	MOVQ val+0(FP), DI
    65	MOVQ cond+8(FP), R14
    66
    67	// acc = poly
    68	MOVQ $-1, R8
    69	MOVQ p256const0<>+0(SB), R9
    70	MOVQ $+0, R10
    71	MOVQ p256const1<>+0(SB), R11
    72
    73	// Load the original value
    74	MOVQ (DI), R13
    75	MOVQ 8(DI), SI
    76	MOVQ 16(DI), CX
    77	MOVQ 24(DI), R15
    78
    79	// Speculatively subtract
      	// R11:R8 = poly - val, computed regardless of cond.
    80	SUBQ R13, R8
    81	SBBQ SI, R9
    82	SBBQ CX, R10
    83	SBBQ R15, R11
    84
    85	// If condition is 0, keep original value
      	// TESTQ sets ZF from the full 64-bit cond; CMOVQEQ restores val when ZF is set.
    86	TESTQ   R14, R14
    87	CMOVQEQ R13, R8
    88	CMOVQEQ SI, R9
    89	CMOVQEQ CX, R10
    90	CMOVQEQ R15, R11
    91
    92	// Store result
    93	MOVQ R8, (DI)
    94	MOVQ R9, 8(DI)
    95	MOVQ R10, 16(DI)
    96	MOVQ R11, 24(DI)
    97	RET
    98
    99DATA p256const0<>+0(SB)/8, $0x00000000ffffffff // second-lowest 64-bit limb of "poly" as loaded in p256NegCond
   100GLOBL p256const0<>(SB), RODATA, $8
   101
   102DATA p256const1<>+0(SB)/8, $0xffffffff00000001 // highest 64-bit limb of "poly"; also the MULQ operand in the field reduction steps
   103GLOBL p256const1<>(SB), RODATA, $8
   104
   105// func p256Sqr(res *p256Element, in *p256Element, n int)
   106// Requires: CMOV
      //
      // p256Sqr squares the four-limb value at in, runs four reduction steps and
      // one conditional subtraction on the 512-bit square, and stores the
      // four-limb result at res. It repeats this n times in total; from the
      // second iteration on, the input is the value just written to res.
      //
      // The loop counter is decremented and tested after the body, so the body
      // always runs at least once. NOTE(review): callers are presumably expected
      // to pass n >= 1; with n == 0 the counter would wrap instead of stopping.
   107TEXT ·p256Sqr(SB), NOSPLIT, $0-24
   108	MOVQ res+0(FP), DI
   109	MOVQ in+8(FP), SI
   110	MOVQ n+16(FP), BX
   111
   112sqrLoop:
   113	// y[1:] * y[0]
   114	MOVQ (SI), R14
   115	MOVQ 8(SI), AX
   116	MULQ R14
   117	MOVQ AX, R9
   118	MOVQ DX, R10
   119	MOVQ 16(SI), AX
   120	MULQ R14
   121	ADDQ AX, R10
   122	ADCQ $0x00, DX
   123	MOVQ DX, R11
   124	MOVQ 24(SI), AX
   125	MULQ R14
   126	ADDQ AX, R11
   127	ADCQ $0x00, DX
   128	MOVQ DX, R12
   129
   130	// y[2:] * y[1]
   131	MOVQ 8(SI), R14
   132	MOVQ 16(SI), AX
   133	MULQ R14
   134	ADDQ AX, R11
   135	ADCQ $0x00, DX
   136	MOVQ DX, R15
   137	MOVQ 24(SI), AX
   138	MULQ R14
   139	ADDQ R15, R12
   140	ADCQ $0x00, DX
   141	ADDQ AX, R12
   142	ADCQ $0x00, DX
   143	MOVQ DX, R13
   144
   145	// y[3] * y[2]
   146	MOVQ 16(SI), R14
   147	MOVQ 24(SI), AX
   148	MULQ R14
   149	ADDQ AX, R13
   150	ADCQ $0x00, DX
   151	MOVQ DX, CX
   152	XORQ R15, R15
   153
   154	// *2
      	// Double the sum of the cross products; the bit shifted out lands in R15.
   155	ADDQ R9, R9
   156	ADCQ R10, R10
   157	ADCQ R11, R11
   158	ADCQ R12, R12
   159	ADCQ R13, R13
   160	ADCQ CX, CX
   161	ADCQ $0x00, R15
   162
   163	// Missing products
      	// Add the squares y[i]*y[i]; R8 receives the lowest limb of the result.
   164	MOVQ (SI), AX
   165	MULQ AX
   166	MOVQ AX, R8
   167	MOVQ DX, R14
   168	MOVQ 8(SI), AX
   169	MULQ AX
   170	ADDQ R14, R9
   171	ADCQ AX, R10
   172	ADCQ $0x00, DX
   173	MOVQ DX, R14
   174	MOVQ 16(SI), AX
   175	MULQ AX
   176	ADDQ R14, R11
   177	ADCQ AX, R12
   178	ADCQ $0x00, DX
   179	MOVQ DX, R14
   180	MOVQ 24(SI), AX
   181	MULQ AX
   182	ADDQ R14, R13
   183	ADCQ AX, CX
   184	ADCQ DX, R15
      	// The input pointer in SI is no longer needed for this iteration, so SI
      	// is reused for the top limb. It is reloaded from DI before the loop repeats.
   185	MOVQ R15, SI
   186
   187	// First reduction step
   188	MOVQ R8, AX
   189	MOVQ R8, R15
   190	SHLQ $0x20, R8
   191	MULQ p256const1<>+0(SB)
   192	SHRQ $0x20, R15
   193	ADDQ R8, R9
   194	ADCQ R15, R10
   195	ADCQ AX, R11
   196	ADCQ $0x00, DX
   197	MOVQ DX, R8
   198
   199	// Second reduction step
   200	MOVQ R9, AX
   201	MOVQ R9, R15
   202	SHLQ $0x20, R9
   203	MULQ p256const1<>+0(SB)
   204	SHRQ $0x20, R15
   205	ADDQ R9, R10
   206	ADCQ R15, R11
   207	ADCQ AX, R8
   208	ADCQ $0x00, DX
   209	MOVQ DX, R9
   210
   211	// Third reduction step
   212	MOVQ R10, AX
   213	MOVQ R10, R15
   214	SHLQ $0x20, R10
   215	MULQ p256const1<>+0(SB)
   216	SHRQ $0x20, R15
   217	ADDQ R10, R11
   218	ADCQ R15, R8
   219	ADCQ AX, R9
   220	ADCQ $0x00, DX
   221	MOVQ DX, R10
   222
   223	// Last reduction step
      	// R14 is cleared here, before the carry chain, so the XORQ does not
      	// disturb the flags consumed further down.
   224	XORQ R14, R14
   225	MOVQ R11, AX
   226	MOVQ R11, R15
   227	SHLQ $0x20, R11
   228	MULQ p256const1<>+0(SB)
   229	SHRQ $0x20, R15
   230	ADDQ R11, R8
   231	ADCQ R15, R9
   232	ADCQ AX, R10
   233	ADCQ $0x00, DX
   234	MOVQ DX, R11
   235
   236	// Add bits [511:256] of the sqr result
      	// The first ADCQ also consumes CF left by "ADCQ $0x00, DX" above (MOVQ
      	// does not modify flags). NOTE(review): that carry is expected to be 0,
      	// since the high half of a 64x64 product plus one cannot overflow.
   237	ADCQ R12, R8
   238	ADCQ R13, R9
   239	ADCQ CX, R10
   240	ADCQ SI, R11
   241	ADCQ $0x00, R14
   242	MOVQ R8, R12
   243	MOVQ R9, R13
   244	MOVQ R10, CX
   245	MOVQ R11, R15
   246
   247	// Subtract p256
      	// R12, R13, CX, R15 keep the unsubtracted value; it is restored by
      	// CMOVQCS when the five-limb subtraction borrows.
   248	SUBQ    $-1, R8
   249	SBBQ    p256const0<>+0(SB), R9
   250	SBBQ    $0x00, R10
   251	SBBQ    p256const1<>+0(SB), R11
   252	SBBQ    $0x00, R14
   253	CMOVQCS R12, R8
   254	CMOVQCS R13, R9
   255	CMOVQCS CX, R10
   256	CMOVQCS R15, R11
   257	MOVQ    R8, (DI)
   258	MOVQ    R9, 8(DI)
   259	MOVQ    R10, 16(DI)
   260	MOVQ    R11, 24(DI)
      	// Later iterations square the value that was just stored at res.
   261	MOVQ    DI, SI
   262	DECQ    BX
   263	JNE     sqrLoop
   264	RET
   265
   266// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
   267// Requires: CMOV
      //
      // p256Mul multiplies the four-limb values at in1 (x) and in2 (y) and
      // stores a four-limb result at res. The four partial products x*y[i] are
      // interleaved with four reduction steps that use p256const1, followed by
      // one conditional subtraction selected with CMOV. There are no branches.
      //
      // Both inputs are fully read into the accumulator chain before the stores
      // at the end, so res is only written after the last load from in1/in2.
   268TEXT ·p256Mul(SB), NOSPLIT, $0-24
   269	MOVQ res+0(FP), DI
   270	MOVQ in1+8(FP), SI
   271	MOVQ in2+16(FP), CX
   272
   273	// x * y[0]
   274	MOVQ (CX), R14
   275	MOVQ (SI), AX
   276	MULQ R14
   277	MOVQ AX, R8
   278	MOVQ DX, R9
   279	MOVQ 8(SI), AX
   280	MULQ R14
   281	ADDQ AX, R9
   282	ADCQ $0x00, DX
   283	MOVQ DX, R10
   284	MOVQ 16(SI), AX
   285	MULQ R14
   286	ADDQ AX, R10
   287	ADCQ $0x00, DX
   288	MOVQ DX, R11
   289	MOVQ 24(SI), AX
   290	MULQ R14
   291	ADDQ AX, R11
   292	ADCQ $0x00, DX
   293	MOVQ DX, R12
   294	XORQ R13, R13
   295
   296	// First reduction step
      	// Consumes the low limb R8; R8 is then zeroed and becomes the new top limb.
   297	MOVQ R8, AX
   298	MOVQ R8, R15
   299	SHLQ $0x20, R8
   300	MULQ p256const1<>+0(SB)
   301	SHRQ $0x20, R15
   302	ADDQ R8, R9
   303	ADCQ R15, R10
   304	ADCQ AX, R11
   305	ADCQ DX, R12
   306	ADCQ $0x00, R13
   307	XORQ R8, R8
   308
   309	// x * y[1]
   310	MOVQ 8(CX), R14
   311	MOVQ (SI), AX
   312	MULQ R14
   313	ADDQ AX, R9
   314	ADCQ $0x00, DX
   315	MOVQ DX, R15
   316	MOVQ 8(SI), AX
   317	MULQ R14
   318	ADDQ R15, R10
   319	ADCQ $0x00, DX
   320	ADDQ AX, R10
   321	ADCQ $0x00, DX
   322	MOVQ DX, R15
   323	MOVQ 16(SI), AX
   324	MULQ R14
   325	ADDQ R15, R11
   326	ADCQ $0x00, DX
   327	ADDQ AX, R11
   328	ADCQ $0x00, DX
   329	MOVQ DX, R15
   330	MOVQ 24(SI), AX
   331	MULQ R14
   332	ADDQ R15, R12
   333	ADCQ $0x00, DX
   334	ADDQ AX, R12
   335	ADCQ DX, R13
   336	ADCQ $0x00, R8
   337
   338	// Second reduction step
   339	MOVQ R9, AX
   340	MOVQ R9, R15
   341	SHLQ $0x20, R9
   342	MULQ p256const1<>+0(SB)
   343	SHRQ $0x20, R15
   344	ADDQ R9, R10
   345	ADCQ R15, R11
   346	ADCQ AX, R12
   347	ADCQ DX, R13
   348	ADCQ $0x00, R8
   349	XORQ R9, R9
   350
   351	// x * y[2]
   352	MOVQ 16(CX), R14
   353	MOVQ (SI), AX
   354	MULQ R14
   355	ADDQ AX, R10
   356	ADCQ $0x00, DX
   357	MOVQ DX, R15
   358	MOVQ 8(SI), AX
   359	MULQ R14
   360	ADDQ R15, R11
   361	ADCQ $0x00, DX
   362	ADDQ AX, R11
   363	ADCQ $0x00, DX
   364	MOVQ DX, R15
   365	MOVQ 16(SI), AX
   366	MULQ R14
   367	ADDQ R15, R12
   368	ADCQ $0x00, DX
   369	ADDQ AX, R12
   370	ADCQ $0x00, DX
   371	MOVQ DX, R15
   372	MOVQ 24(SI), AX
   373	MULQ R14
   374	ADDQ R15, R13
   375	ADCQ $0x00, DX
   376	ADDQ AX, R13
   377	ADCQ DX, R8
   378	ADCQ $0x00, R9
   379
   380	// Third reduction step
   381	MOVQ R10, AX
   382	MOVQ R10, R15
   383	SHLQ $0x20, R10
   384	MULQ p256const1<>+0(SB)
   385	SHRQ $0x20, R15
   386	ADDQ R10, R11
   387	ADCQ R15, R12
   388	ADCQ AX, R13
   389	ADCQ DX, R8
   390	ADCQ $0x00, R9
   391	XORQ R10, R10
   392
   393	// x * y[3]
   394	MOVQ 24(CX), R14
   395	MOVQ (SI), AX
   396	MULQ R14
   397	ADDQ AX, R11
   398	ADCQ $0x00, DX
   399	MOVQ DX, R15
   400	MOVQ 8(SI), AX
   401	MULQ R14
   402	ADDQ R15, R12
   403	ADCQ $0x00, DX
   404	ADDQ AX, R12
   405	ADCQ $0x00, DX
   406	MOVQ DX, R15
   407	MOVQ 16(SI), AX
   408	MULQ R14
   409	ADDQ R15, R13
   410	ADCQ $0x00, DX
   411	ADDQ AX, R13
   412	ADCQ $0x00, DX
   413	MOVQ DX, R15
   414	MOVQ 24(SI), AX
   415	MULQ R14
   416	ADDQ R15, R8
   417	ADCQ $0x00, DX
   418	ADDQ AX, R8
   419	ADCQ DX, R9
   420	ADCQ $0x00, R10
   421
   422	// Last reduction step
   423	MOVQ R11, AX
   424	MOVQ R11, R15
   425	SHLQ $0x20, R11
   426	MULQ p256const1<>+0(SB)
   427	SHRQ $0x20, R15
   428	ADDQ R11, R12
   429	ADCQ R15, R13
   430	ADCQ AX, R8
   431	ADCQ DX, R9
   432	ADCQ $0x00, R10
   433
   434	// Copy result [255:0]
      	// The result now lives in R12, R13, R8, R9 with R10 as the overflow limb.
      	// SI is free to use as scratch because in1 is not read again.
   435	MOVQ R12, SI
   436	MOVQ R13, R11
   437	MOVQ R8, R14
   438	MOVQ R9, R15
   439
   440	// Subtract p256
      	// If the five-limb subtraction borrows, CMOVQCS restores the saved copy.
   441	SUBQ    $-1, R12
   442	SBBQ    p256const0<>+0(SB), R13
   443	SBBQ    $0x00, R8
   444	SBBQ    p256const1<>+0(SB), R9
   445	SBBQ    $0x00, R10
   446	CMOVQCS SI, R12
   447	CMOVQCS R11, R13
   448	CMOVQCS R14, R8
   449	CMOVQCS R15, R9
   450	MOVQ    R12, (DI)
   451	MOVQ    R13, 8(DI)
   452	MOVQ    R8, 16(DI)
   453	MOVQ    R9, 24(DI)
   454	RET
   455
   456// func p256FromMont(res *p256Element, in *p256Element)
   457// Requires: CMOV
      //
      // p256FromMont loads the four-limb value at in, applies the same four
      // reduction stages used by the multiplication routines but without any
      // partial products, performs one conditional subtraction, and stores the
      // four-limb result at res. There are no branches.
   458TEXT ·p256FromMont(SB), NOSPLIT, $0-16
   459	MOVQ res+0(FP), DI
   460	MOVQ in+8(FP), SI
   461	MOVQ (SI), R8
   462	MOVQ 8(SI), R9
   463	MOVQ 16(SI), R10
   464	MOVQ 24(SI), R11
   465	XORQ R12, R12
   466
   467	// Only reduce, no multiplications are needed
   468	// First stage
      	// Each stage consumes the current low limb and frees its register; that
      	// register is zeroed (XORQ) one stage before it is needed as a new top limb.
   469	MOVQ R8, AX
   470	MOVQ R8, R15
   471	SHLQ $0x20, R8
   472	MULQ p256const1<>+0(SB)
   473	SHRQ $0x20, R15
   474	ADDQ R8, R9
   475	ADCQ R15, R10
   476	ADCQ AX, R11
   477	ADCQ DX, R12
   478	XORQ R13, R13
   479
   480	// Second stage
   481	MOVQ R9, AX
   482	MOVQ R9, R15
   483	SHLQ $0x20, R9
   484	MULQ p256const1<>+0(SB)
   485	SHRQ $0x20, R15
   486	ADDQ R9, R10
   487	ADCQ R15, R11
   488	ADCQ AX, R12
   489	ADCQ DX, R13
   490	XORQ R8, R8
   491
   492	// Third stage
   493	MOVQ R10, AX
   494	MOVQ R10, R15
   495	SHLQ $0x20, R10
   496	MULQ p256const1<>+0(SB)
   497	SHRQ $0x20, R15
   498	ADDQ R10, R11
   499	ADCQ R15, R12
   500	ADCQ AX, R13
   501	ADCQ DX, R8
   502	XORQ R9, R9
   503
   504	// Last stage
   505	MOVQ    R11, AX
   506	MOVQ    R11, R15
   507	SHLQ    $0x20, R11
   508	MULQ    p256const1<>+0(SB)
   509	SHRQ    $0x20, R15
   510	ADDQ    R11, R12
   511	ADCQ    R15, R13
   512	ADCQ    AX, R8
   513	ADCQ    DX, R9
      	// Save the value (R12, R13, R8, R9), subtract the modulus limbs, and
      	// restore the saved copy if the four-limb subtraction borrows. Unlike
      	// p256Mul, no fifth overflow limb takes part in the borrow chain here.
   514	MOVQ    R12, SI
   515	MOVQ    R13, R11
   516	MOVQ    R8, R14
   517	MOVQ    R9, R15
   518	SUBQ    $-1, R12
   519	SBBQ    p256const0<>+0(SB), R13
   520	SBBQ    $0x00, R8
   521	SBBQ    p256const1<>+0(SB), R9
   522	CMOVQCS SI, R12
   523	CMOVQCS R11, R13
   524	CMOVQCS R14, R8
   525	CMOVQCS R15, R9
   526	MOVQ    R12, (DI)
   527	MOVQ    R13, 8(DI)
   528	MOVQ    R8, 16(DI)
   529	MOVQ    R9, 24(DI)
   530	RET
   531
   532// func p256Select(res *P256Point, table *p256Table, idx int)
   533// Requires: SSE2
      //
      // p256Select scans 16 consecutive 96-byte entries starting at table and
      // writes to res the entry whose 1-based position equals idx. Every entry
      // is loaded and masked on every call; no branch or address depends on
      // idx. The comparison uses the low 32 bits of idx (MOVL). If idx matches
      // none of the positions 1..16 (for example idx == 0), res receives 96
      // zero bytes.
   534TEXT ·p256Select(SB), NOSPLIT, $0-24
   535	MOVQ    idx+16(FP), AX
   536	MOVQ    table+8(FP), DI
   537	MOVQ    res+0(FP), DX
      	// X15 = 1 in every dword lane (0 - (-1)); X14 = idx broadcast to every lane.
   538	PXOR    X15, X15
   539	PCMPEQL X14, X14
   540	PSUBL   X14, X15
   541	MOVL    AX, X14
   542	PSHUFD  $0x00, X14, X14
      	// X0..X5 accumulate the selected entry; they start as zero.
   543	PXOR    X0, X0
   544	PXOR    X1, X1
   545	PXOR    X2, X2
   546	PXOR    X3, X3
   547	PXOR    X4, X4
   548	PXOR    X5, X5
      	// AX = number of entries to scan; X13 = position counter, starting at 1.
   549	MOVQ    $0x00000010, AX
   550	MOVOU   X15, X13
   551
   552loop_select:
      	// X12 = all ones if the current position equals idx, else zero.
   553	MOVOU   X13, X12
   554	PADDL   X15, X13
   555	PCMPEQL X14, X12
   556	MOVOU   (DI), X6
   557	MOVOU   16(DI), X7
   558	MOVOU   32(DI), X8
   559	MOVOU   48(DI), X9
   560	MOVOU   64(DI), X10
   561	MOVOU   80(DI), X11
   562	ADDQ    $0x60, DI
   563	PAND    X12, X6
   564	PAND    X12, X7
   565	PAND    X12, X8
   566	PAND    X12, X9
   567	PAND    X12, X10
   568	PAND    X12, X11
   569	PXOR    X6, X0
   570	PXOR    X7, X1
   571	PXOR    X8, X2
   572	PXOR    X9, X3
   573	PXOR    X10, X4
   574	PXOR    X11, X5
   575	DECQ    AX
   576	JNE     loop_select
   577	MOVOU   X0, (DX)
   578	MOVOU   X1, 16(DX)
   579	MOVOU   X2, 32(DX)
   580	MOVOU   X3, 48(DX)
   581	MOVOU   X4, 64(DX)
   582	MOVOU   X5, 80(DX)
   583	RET
   584
   585// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   586// Requires: SSE2
      //
      // p256SelectAffine scans 32 consecutive 64-byte entries starting at table
      // and writes to res the entry whose 1-based position equals idx. The loop
      // runs 16 times and handles two entries (128 bytes) per iteration. Every
      // entry is loaded and masked on every call; no branch or address depends
      // on idx. The comparison uses the low 32 bits of idx (MOVL). If idx
      // matches none of the positions 1..32 (for example idx == 0), res
      // receives 64 zero bytes.
   587TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
   588	MOVQ    idx+16(FP), AX
   589	MOVQ    table+8(FP), DI
   590	MOVQ    res+0(FP), DX
      	// X15 = 1 in every dword lane (0 - (-1)); X14 = idx broadcast to every lane.
   591	PXOR    X15, X15
   592	PCMPEQL X14, X14
   593	PSUBL   X14, X15
   594	MOVL    AX, X14
   595	PSHUFD  $0x00, X14, X14
   596	PXOR    X0, X0
   597	PXOR    X1, X1
   598	PXOR    X2, X2
   599	PXOR    X3, X3
      	// AX = iteration count (two entries each); X13 = position counter, starting at 1.
   600	MOVQ    $0x00000010, AX
   601	MOVOU   X15, X13
   602
   603loop_select_base:
      	// Mask for the first entry of the pair (bytes 0..63).
   604	MOVOU   X13, X12
   605	PADDL   X15, X13
   606	PCMPEQL X14, X12
   607	MOVOU   (DI), X4
   608	MOVOU   16(DI), X5
   609	MOVOU   32(DI), X6
   610	MOVOU   48(DI), X7
   611	MOVOU   64(DI), X8
   612	MOVOU   80(DI), X9
   613	MOVOU   96(DI), X10
   614	MOVOU   112(DI), X11
   615	ADDQ    $0x80, DI
   616	PAND    X12, X4
   617	PAND    X12, X5
   618	PAND    X12, X6
   619	PAND    X12, X7
      	// Mask for the second entry of the pair (bytes 64..127).
   620	MOVOU   X13, X12
   621	PADDL   X15, X13
   622	PCMPEQL X14, X12
   623	PAND    X12, X8
   624	PAND    X12, X9
   625	PAND    X12, X10
   626	PAND    X12, X11
      	// Both masked entries fold into the same four accumulators.
   627	PXOR    X4, X0
   628	PXOR    X5, X1
   629	PXOR    X6, X2
   630	PXOR    X7, X3
   631	PXOR    X8, X0
   632	PXOR    X9, X1
   633	PXOR    X10, X2
   634	PXOR    X11, X3
   635	DECQ    AX
   636	JNE     loop_select_base
   637	MOVOU   X0, (DX)
   638	MOVOU   X1, 16(DX)
   639	MOVOU   X2, 32(DX)
   640	MOVOU   X3, 48(DX)
   641	RET
   642
   643// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
   644// Requires: CMOV
      //
      // p256OrdMul multiplies the four-limb values at in1 (x) and in2 (y) and
      // stores a four-limb result at res, reducing with respect to the value in
      // p256ord. The four partial products x*y[i] are interleaved with four
      // reduction steps, followed by one conditional subtraction of p256ord
      // selected with CMOV. There are no branches.
      //
      // Each reduction step multiplies the current low accumulator limb by
      // p256ordK0 (keeping the low 64 bits) and adds that multiplier times all
      // four limbs of p256ord to the accumulator. The low limb's register is
      // afterwards reused as the new top limb without being cleared, i.e. the
      // code relies on that addition leaving it zero.
   645TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
   646	MOVQ res+0(FP), DI
   647	MOVQ in1+8(FP), SI
   648	MOVQ in2+16(FP), CX
   649
   650	// x * y[0]
   651	MOVQ (CX), R14
   652	MOVQ (SI), AX
   653	MULQ R14
   654	MOVQ AX, R8
   655	MOVQ DX, R9
   656	MOVQ 8(SI), AX
   657	MULQ R14
   658	ADDQ AX, R9
   659	ADCQ $0x00, DX
   660	MOVQ DX, R10
   661	MOVQ 16(SI), AX
   662	MULQ R14
   663	ADDQ AX, R10
   664	ADCQ $0x00, DX
   665	MOVQ DX, R11
   666	MOVQ 24(SI), AX
   667	MULQ R14
   668	ADDQ AX, R11
   669	ADCQ $0x00, DX
   670	MOVQ DX, R12
   671	XORQ R13, R13
   672
   673	// First reduction step
      	// R14 = low 64 bits of R8 * p256ordK0: the multiplier for this step.
   674	MOVQ R8, AX
   675	MULQ p256ordK0<>+0(SB)
   676	MOVQ AX, R14
   677	MOVQ p256ord<>+0(SB), AX
   678	MULQ R14
   679	ADDQ AX, R8
   680	ADCQ $0x00, DX
   681	MOVQ DX, R15
   682	MOVQ p256ord<>+8(SB), AX
   683	MULQ R14
   684	ADDQ R15, R9
   685	ADCQ $0x00, DX
   686	ADDQ AX, R9
   687	ADCQ $0x00, DX
   688	MOVQ DX, R15
   689	MOVQ p256ord<>+16(SB), AX
   690	MULQ R14
   691	ADDQ R15, R10
   692	ADCQ $0x00, DX
   693	ADDQ AX, R10
   694	ADCQ $0x00, DX
   695	MOVQ DX, R15
   696	MOVQ p256ord<>+24(SB), AX
   697	MULQ R14
   698	ADDQ R15, R11
   699	ADCQ $0x00, DX
   700	ADDQ AX, R11
   701	ADCQ DX, R12
   702	ADCQ $0x00, R13
   703
   704	// x * y[1]
   705	MOVQ 8(CX), R14
   706	MOVQ (SI), AX
   707	MULQ R14
   708	ADDQ AX, R9
   709	ADCQ $0x00, DX
   710	MOVQ DX, R15
   711	MOVQ 8(SI), AX
   712	MULQ R14
   713	ADDQ R15, R10
   714	ADCQ $0x00, DX
   715	ADDQ AX, R10
   716	ADCQ $0x00, DX
   717	MOVQ DX, R15
   718	MOVQ 16(SI), AX
   719	MULQ R14
   720	ADDQ R15, R11
   721	ADCQ $0x00, DX
   722	ADDQ AX, R11
   723	ADCQ $0x00, DX
   724	MOVQ DX, R15
   725	MOVQ 24(SI), AX
   726	MULQ R14
   727	ADDQ R15, R12
   728	ADCQ $0x00, DX
   729	ADDQ AX, R12
   730	ADCQ DX, R13
   731	ADCQ $0x00, R8
   732
   733	// Second reduction step
   734	MOVQ R9, AX
   735	MULQ p256ordK0<>+0(SB)
   736	MOVQ AX, R14
   737	MOVQ p256ord<>+0(SB), AX
   738	MULQ R14
   739	ADDQ AX, R9
   740	ADCQ $0x00, DX
   741	MOVQ DX, R15
   742	MOVQ p256ord<>+8(SB), AX
   743	MULQ R14
   744	ADDQ R15, R10
   745	ADCQ $0x00, DX
   746	ADDQ AX, R10
   747	ADCQ $0x00, DX
   748	MOVQ DX, R15
   749	MOVQ p256ord<>+16(SB), AX
   750	MULQ R14
   751	ADDQ R15, R11
   752	ADCQ $0x00, DX
   753	ADDQ AX, R11
   754	ADCQ $0x00, DX
   755	MOVQ DX, R15
   756	MOVQ p256ord<>+24(SB), AX
   757	MULQ R14
   758	ADDQ R15, R12
   759	ADCQ $0x00, DX
   760	ADDQ AX, R12
   761	ADCQ DX, R13
   762	ADCQ $0x00, R8
   763
   764	// x * y[2]
   765	MOVQ 16(CX), R14
   766	MOVQ (SI), AX
   767	MULQ R14
   768	ADDQ AX, R10
   769	ADCQ $0x00, DX
   770	MOVQ DX, R15
   771	MOVQ 8(SI), AX
   772	MULQ R14
   773	ADDQ R15, R11
   774	ADCQ $0x00, DX
   775	ADDQ AX, R11
   776	ADCQ $0x00, DX
   777	MOVQ DX, R15
   778	MOVQ 16(SI), AX
   779	MULQ R14
   780	ADDQ R15, R12
   781	ADCQ $0x00, DX
   782	ADDQ AX, R12
   783	ADCQ $0x00, DX
   784	MOVQ DX, R15
   785	MOVQ 24(SI), AX
   786	MULQ R14
   787	ADDQ R15, R13
   788	ADCQ $0x00, DX
   789	ADDQ AX, R13
   790	ADCQ DX, R8
   791	ADCQ $0x00, R9
   792
   793	// Third reduction step
   794	MOVQ R10, AX
   795	MULQ p256ordK0<>+0(SB)
   796	MOVQ AX, R14
   797	MOVQ p256ord<>+0(SB), AX
   798	MULQ R14
   799	ADDQ AX, R10
   800	ADCQ $0x00, DX
   801	MOVQ DX, R15
   802	MOVQ p256ord<>+8(SB), AX
   803	MULQ R14
   804	ADDQ R15, R11
   805	ADCQ $0x00, DX
   806	ADDQ AX, R11
   807	ADCQ $0x00, DX
   808	MOVQ DX, R15
   809	MOVQ p256ord<>+16(SB), AX
   810	MULQ R14
   811	ADDQ R15, R12
   812	ADCQ $0x00, DX
   813	ADDQ AX, R12
   814	ADCQ $0x00, DX
   815	MOVQ DX, R15
   816	MOVQ p256ord<>+24(SB), AX
   817	MULQ R14
   818	ADDQ R15, R13
   819	ADCQ $0x00, DX
   820	ADDQ AX, R13
   821	ADCQ DX, R8
   822	ADCQ $0x00, R9
   823
   824	// x * y[3]
   825	MOVQ 24(CX), R14
   826	MOVQ (SI), AX
   827	MULQ R14
   828	ADDQ AX, R11
   829	ADCQ $0x00, DX
   830	MOVQ DX, R15
   831	MOVQ 8(SI), AX
   832	MULQ R14
   833	ADDQ R15, R12
   834	ADCQ $0x00, DX
   835	ADDQ AX, R12
   836	ADCQ $0x00, DX
   837	MOVQ DX, R15
   838	MOVQ 16(SI), AX
   839	MULQ R14
   840	ADDQ R15, R13
   841	ADCQ $0x00, DX
   842	ADDQ AX, R13
   843	ADCQ $0x00, DX
   844	MOVQ DX, R15
   845	MOVQ 24(SI), AX
   846	MULQ R14
   847	ADDQ R15, R8
   848	ADCQ $0x00, DX
   849	ADDQ AX, R8
   850	ADCQ DX, R9
   851	ADCQ $0x00, R10
   852
   853	// Last reduction step
   854	MOVQ R11, AX
   855	MULQ p256ordK0<>+0(SB)
   856	MOVQ AX, R14
   857	MOVQ p256ord<>+0(SB), AX
   858	MULQ R14
   859	ADDQ AX, R11
   860	ADCQ $0x00, DX
   861	MOVQ DX, R15
   862	MOVQ p256ord<>+8(SB), AX
   863	MULQ R14
   864	ADDQ R15, R12
   865	ADCQ $0x00, DX
   866	ADDQ AX, R12
   867	ADCQ $0x00, DX
   868	MOVQ DX, R15
   869	MOVQ p256ord<>+16(SB), AX
   870	MULQ R14
   871	ADDQ R15, R13
   872	ADCQ $0x00, DX
   873	ADDQ AX, R13
   874	ADCQ $0x00, DX
   875	MOVQ DX, R15
   876	MOVQ p256ord<>+24(SB), AX
   877	MULQ R14
   878	ADDQ R15, R8
   879	ADCQ $0x00, DX
   880	ADDQ AX, R8
   881	ADCQ DX, R9
   882	ADCQ $0x00, R10
   883
   884	// Copy result [255:0]
   885	MOVQ R12, SI
   886	MOVQ R13, R11
   887	MOVQ R8, R14
   888	MOVQ R9, R15
   889
   890	// Subtract p256ord (the four limbs at p256ord<>, not the field constants
      	// used by p256Mul). If the five-limb subtraction borrows, CMOVQCS
      	// restores the saved copy.
   891	SUBQ    p256ord<>+0(SB), R12
   892	SBBQ    p256ord<>+8(SB), R13
   893	SBBQ    p256ord<>+16(SB), R8
   894	SBBQ    p256ord<>+24(SB), R9
   895	SBBQ    $0x00, R10
   896	CMOVQCS SI, R12
   897	CMOVQCS R11, R13
   898	CMOVQCS R14, R8
   899	CMOVQCS R15, R9
   900	MOVQ    R12, (DI)
   901	MOVQ    R13, 8(DI)
   902	MOVQ    R8, 16(DI)
   903	MOVQ    R9, 24(DI)
   904	RET
   905
   906DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f // per-step reduction constant: multiplied by the low accumulator limb in p256OrdMul/p256OrdSqr; presumably -p256ord^-1 mod 2^64 (confirm against the generator)
   907GLOBL p256ordK0<>(SB), RODATA, $8
   908
   909DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551 // p256ord: four 64-bit limbs, least significant first
   910DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
   911DATA p256ord<>+16(SB)/8, $0xffffffffffffffff // 2^64 - 1
   912DATA p256ord<>+24(SB)/8, $0xffffffff00000000 // 2^64 - 2^32
   913GLOBL p256ord<>(SB), RODATA, $32
   914
   915// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
   916// Requires: CMOV
      //
      // p256OrdSqr squares the four-limb value at in, runs four reduction steps
      // with respect to p256ord and one conditional subtraction of p256ord, and
      // stores the four-limb result at res. It repeats this n times in total;
      // from the second iteration on, the input is the value just written to res.
      //
      // In each reduction step only the two low limbs of p256ord are multiplied
      // with MULQ. The contribution of the two high limbs (2^64-1 and
      // 2^64-2^32 in the p256ord DATA definition) is formed from the multiplier
      // itself with moves, 32-bit shifts and add/subtract chains.
      //
      // The loop counter is decremented and tested after the body, so the body
      // always runs at least once. NOTE(review): callers are presumably expected
      // to pass n >= 1; with n == 0 the counter would wrap instead of stopping.
   917TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
   918	MOVQ res+0(FP), DI
   919	MOVQ in+8(FP), SI
   920	MOVQ n+16(FP), BX
   921
   922ordSqrLoop:
   923	// y[1:] * y[0]
   924	MOVQ (SI), R14
   925	MOVQ 8(SI), AX
   926	MULQ R14
   927	MOVQ AX, R9
   928	MOVQ DX, R10
   929	MOVQ 16(SI), AX
   930	MULQ R14
   931	ADDQ AX, R10
   932	ADCQ $0x00, DX
   933	MOVQ DX, R11
   934	MOVQ 24(SI), AX
   935	MULQ R14
   936	ADDQ AX, R11
   937	ADCQ $0x00, DX
   938	MOVQ DX, R12
   939
   940	// y[2:] * y[1]
   941	MOVQ 8(SI), R14
   942	MOVQ 16(SI), AX
   943	MULQ R14
   944	ADDQ AX, R11
   945	ADCQ $0x00, DX
   946	MOVQ DX, R15
   947	MOVQ 24(SI), AX
   948	MULQ R14
   949	ADDQ R15, R12
   950	ADCQ $0x00, DX
   951	ADDQ AX, R12
   952	ADCQ $0x00, DX
   953	MOVQ DX, R13
   954
   955	// y[3] * y[2]
   956	MOVQ 16(SI), R14
   957	MOVQ 24(SI), AX
   958	MULQ R14
   959	ADDQ AX, R13
   960	ADCQ $0x00, DX
   961	MOVQ DX, CX
   962	XORQ R15, R15
   963
   964	// *2
      	// Double the sum of the cross products; the bit shifted out lands in R15.
   965	ADDQ R9, R9
   966	ADCQ R10, R10
   967	ADCQ R11, R11
   968	ADCQ R12, R12
   969	ADCQ R13, R13
   970	ADCQ CX, CX
   971	ADCQ $0x00, R15
   972
   973	// Missing products
      	// Add the squares y[i]*y[i]; R8 receives the lowest limb of the result.
   974	MOVQ (SI), AX
   975	MULQ AX
   976	MOVQ AX, R8
   977	MOVQ DX, R14
   978	MOVQ 8(SI), AX
   979	MULQ AX
   980	ADDQ R14, R9
   981	ADCQ AX, R10
   982	ADCQ $0x00, DX
   983	MOVQ DX, R14
   984	MOVQ 16(SI), AX
   985	MULQ AX
   986	ADDQ R14, R11
   987	ADCQ AX, R12
   988	ADCQ $0x00, DX
   989	MOVQ DX, R14
   990	MOVQ 24(SI), AX
   991	MULQ AX
   992	ADDQ R14, R13
   993	ADCQ AX, CX
   994	ADCQ DX, R15
      	// The input pointer in SI is no longer needed for this iteration, so SI
      	// is reused for the top limb. It is reloaded from DI before the loop repeats.
   995	MOVQ R15, SI
   996
   997	// First reduction step
      	// R14 = low 64 bits of R8 * p256ordK0: the multiplier for this step.
   998	MOVQ R8, AX
   999	MULQ p256ordK0<>+0(SB)
  1000	MOVQ AX, R14
  1001	MOVQ p256ord<>+0(SB), AX
  1002	MULQ R14
  1003	ADDQ AX, R8
  1004	ADCQ $0x00, DX
  1005	MOVQ DX, R15
  1006	MOVQ p256ord<>+8(SB), AX
  1007	MULQ R14
  1008	ADDQ R15, R9
  1009	ADCQ $0x00, DX
  1010	ADDQ AX, R9
      	// From here on the multiplier itself (R14) stands in for the products
      	// with the two high limbs of p256ord. MOVQ does not modify flags, so the
      	// carry from the ADDQ above still feeds the ADCQ below.
  1011	MOVQ R14, R15
  1012	ADCQ DX, R10
  1013	ADCQ $0x00, R15
  1014	SUBQ R14, R10
  1015	SBBQ $0x00, R15
  1016	MOVQ R14, AX
  1017	MOVQ R14, DX
  1018	MOVQ R14, R8
  1019	SHLQ $0x20, AX
  1020	SHRQ $0x20, DX
  1021	ADDQ R15, R11
  1022	ADCQ $0x00, R8
  1023	SUBQ AX, R11
  1024	SBBQ DX, R8
  1025
  1026	// Second reduction step
  1027	MOVQ R9, AX
  1028	MULQ p256ordK0<>+0(SB)
  1029	MOVQ AX, R14
  1030	MOVQ p256ord<>+0(SB), AX
  1031	MULQ R14
  1032	ADDQ AX, R9
  1033	ADCQ $0x00, DX
  1034	MOVQ DX, R15
  1035	MOVQ p256ord<>+8(SB), AX
  1036	MULQ R14
  1037	ADDQ R15, R10
  1038	ADCQ $0x00, DX
  1039	ADDQ AX, R10
  1040	MOVQ R14, R15
  1041	ADCQ DX, R11
  1042	ADCQ $0x00, R15
  1043	SUBQ R14, R11
  1044	SBBQ $0x00, R15
  1045	MOVQ R14, AX
  1046	MOVQ R14, DX
  1047	MOVQ R14, R9
  1048	SHLQ $0x20, AX
  1049	SHRQ $0x20, DX
  1050	ADDQ R15, R8
  1051	ADCQ $0x00, R9
  1052	SUBQ AX, R8
  1053	SBBQ DX, R9
  1054
  1055	// Third reduction step
  1056	MOVQ R10, AX
  1057	MULQ p256ordK0<>+0(SB)
  1058	MOVQ AX, R14
  1059	MOVQ p256ord<>+0(SB), AX
  1060	MULQ R14
  1061	ADDQ AX, R10
  1062	ADCQ $0x00, DX
  1063	MOVQ DX, R15
  1064	MOVQ p256ord<>+8(SB), AX
  1065	MULQ R14
  1066	ADDQ R15, R11
  1067	ADCQ $0x00, DX
  1068	ADDQ AX, R11
  1069	MOVQ R14, R15
  1070	ADCQ DX, R8
  1071	ADCQ $0x00, R15
  1072	SUBQ R14, R8
  1073	SBBQ $0x00, R15
  1074	MOVQ R14, AX
  1075	MOVQ R14, DX
  1076	MOVQ R14, R10
  1077	SHLQ $0x20, AX
  1078	SHRQ $0x20, DX
  1079	ADDQ R15, R9
  1080	ADCQ $0x00, R10
  1081	SUBQ AX, R9
  1082	SBBQ DX, R10
  1083
  1084	// Last reduction step
  1085	MOVQ R11, AX
  1086	MULQ p256ordK0<>+0(SB)
  1087	MOVQ AX, R14
  1088	MOVQ p256ord<>+0(SB), AX
  1089	MULQ R14
  1090	ADDQ AX, R11
  1091	ADCQ $0x00, DX
  1092	MOVQ DX, R15
  1093	MOVQ p256ord<>+8(SB), AX
  1094	MULQ R14
  1095	ADDQ R15, R8
  1096	ADCQ $0x00, DX
  1097	ADDQ AX, R8
      	// NOTE(review): this step differs from the three above. The carry is
      	// folded into DX first, and the "MOVQ DX, R15" that follows is a dead
      	// store: R15 is overwritten by the very next instruction. The ADCQ DX, R9
      	// below therefore sees DX with the carry already included.
  1098	ADCQ $0x00, DX
  1099	MOVQ DX, R15
  1100	MOVQ R14, R15
  1101	ADCQ DX, R9
  1102	ADCQ $0x00, R15
  1103	SUBQ R14, R9
  1104	SBBQ $0x00, R15
  1105	MOVQ R14, AX
  1106	MOVQ R14, DX
  1107	MOVQ R14, R11
  1108	SHLQ $0x20, AX
  1109	SHRQ $0x20, DX
  1110	ADDQ R15, R10
  1111	ADCQ $0x00, R11
  1112	SUBQ AX, R10
  1113	SBBQ DX, R11
      	// XORQ clears CF as well as R14, so the ADCQ chain below starts without
      	// an incoming carry.
  1114	XORQ R14, R14
  1115
  1116	// Add bits [511:256] of the sqr result
  1117	ADCQ R12, R8
  1118	ADCQ R13, R9
  1119	ADCQ CX, R10
  1120	ADCQ SI, R11
  1121	ADCQ $0x00, R14
  1122	MOVQ R8, R12
  1123	MOVQ R9, R13
  1124	MOVQ R10, CX
  1125	MOVQ R11, R15
  1126
  1127	// Subtract p256ord (the four limbs at p256ord<>, not the field constants
      	// used by p256Sqr). R12, R13, CX, R15 keep the unsubtracted value; it is
      	// restored by CMOVQCS when the five-limb subtraction borrows.
  1128	SUBQ    p256ord<>+0(SB), R8
  1129	SBBQ    p256ord<>+8(SB), R9
  1130	SBBQ    p256ord<>+16(SB), R10
  1131	SBBQ    p256ord<>+24(SB), R11
  1132	SBBQ    $0x00, R14
  1133	CMOVQCS R12, R8
  1134	CMOVQCS R13, R9
  1135	CMOVQCS CX, R10
  1136	CMOVQCS R15, R11
  1137	MOVQ    R8, (DI)
  1138	MOVQ    R9, 8(DI)
  1139	MOVQ    R10, 16(DI)
  1140	MOVQ    R11, 24(DI)
      	// Later iterations square the value that was just stored at res.
  1141	MOVQ    DI, SI
  1142	DECQ    BX
  1143	JNE     ordSqrLoop
  1144	RET
  1145
  1146// func p256SubInternal()
  1147// Requires: CMOV
      //
      // p256SubInternal is a file-local helper that takes and returns its
      // operands in registers rather than on the Go stack.
      //
      //   In:       minuend    in R10, R11, R12, R13 (least significant limb first)
      //             subtrahend in R14, R15, DI, SI   (least significant limb first)
      //   Out:      R10..R13 = minuend - subtrahend, with the constant limbs
      //             (-1, p256const0, 0, p256const1) added back when the
      //             subtraction borrowed
      //   Clobbers: AX, BX, CX, R8, R9, flags
      //
      // The subtrahend registers are only read. The correction is always
      // computed and then kept or discarded with CMOV; there is no branch.
  1148TEXT p256SubInternal(SB), NOSPLIT, $0
  1149	XORQ    AX, AX
  1150	SUBQ    R14, R10
  1151	SBBQ    R15, R11
  1152	SBBQ    DI, R12
  1153	SBBQ    SI, R13
      	// AX = 0 if there was no borrow, all ones otherwise.
  1154	SBBQ    $0x00, AX
      	// Keep the raw difference so it can be restored below.
  1155	MOVQ    R10, BX
  1156	MOVQ    R11, CX
  1157	MOVQ    R12, R8
  1158	MOVQ    R13, R9
  1159	ADDQ    $-1, R10
  1160	ADCQ    p256const0<>+0(SB), R11
  1161	ADCQ    $0x00, R12
  1162	ADCQ    p256const1<>+0(SB), R13
      	// ANDQ sets ZF when there was no borrow; in that case CMOVQEQ puts the
      	// raw difference back.
  1163	ANDQ    $0x01, AX
  1164	CMOVQEQ BX, R10
  1165	CMOVQEQ CX, R11
  1166	CMOVQEQ R8, R12
  1167	CMOVQEQ R9, R13
  1168	RET
  1169
  1170// func p256MulInternal()
  1171// Requires: CMOV
      //
      // p256MulInternal is a file-local helper that takes and returns its
      // operands in registers rather than on the Go stack.
      //
      //   In:       first operand  in R10, R11, R12, R13 (least significant limb first)
      //             second operand in R14, R15, DI, SI   (least significant limb first)
      //   Out:      R10..R13 = product after four reduction steps and one
      //             conditional subtraction
      //   Clobbers: AX, BX, CX, DX, R8, R9, BP, flags
      //
      // The second operand's registers are only read. The full 512-bit product
      // is built first (low half in BX, CX, R8, R9; high half in R10..R13), then
      // the low half is reduced and folded into the high half.
      //
      // NOTE(review): BP is used as scratch; presumably the 8-byte frame is what
      // makes the assembler save and restore it. Confirm against the generator.
  1172TEXT p256MulInternal(SB), NOSPLIT, $8
  1173	MOVQ R10, AX
  1174	MULQ R14
  1175	MOVQ AX, BX
  1176	MOVQ DX, CX
  1177	MOVQ R10, AX
  1178	MULQ R15
  1179	ADDQ AX, CX
  1180	ADCQ $0x00, DX
  1181	MOVQ DX, R8
  1182	MOVQ R10, AX
  1183	MULQ DI
  1184	ADDQ AX, R8
  1185	ADCQ $0x00, DX
  1186	MOVQ DX, R9
  1187	MOVQ R10, AX
  1188	MULQ SI
  1189	ADDQ AX, R9
  1190	ADCQ $0x00, DX
      	// R10 has been fully consumed as an input limb; it now holds limb 4 of
      	// the product. R11, R12 and R13 are recycled the same way below.
  1191	MOVQ DX, R10
  1192	MOVQ R11, AX
  1193	MULQ R14
  1194	ADDQ AX, CX
  1195	ADCQ $0x00, DX
  1196	MOVQ DX, BP
  1197	MOVQ R11, AX
  1198	MULQ R15
  1199	ADDQ BP, R8
  1200	ADCQ $0x00, DX
  1201	ADDQ AX, R8
  1202	ADCQ $0x00, DX
  1203	MOVQ DX, BP
  1204	MOVQ R11, AX
  1205	MULQ DI
  1206	ADDQ BP, R9
  1207	ADCQ $0x00, DX
  1208	ADDQ AX, R9
  1209	ADCQ $0x00, DX
  1210	MOVQ DX, BP
  1211	MOVQ R11, AX
  1212	MULQ SI
  1213	ADDQ BP, R10
  1214	ADCQ $0x00, DX
  1215	ADDQ AX, R10
  1216	ADCQ $0x00, DX
  1217	MOVQ DX, R11
  1218	MOVQ R12, AX
  1219	MULQ R14
  1220	ADDQ AX, R8
  1221	ADCQ $0x00, DX
  1222	MOVQ DX, BP
  1223	MOVQ R12, AX
  1224	MULQ R15
  1225	ADDQ BP, R9
  1226	ADCQ $0x00, DX
  1227	ADDQ AX, R9
  1228	ADCQ $0x00, DX
  1229	MOVQ DX, BP
  1230	MOVQ R12, AX
  1231	MULQ DI
  1232	ADDQ BP, R10
  1233	ADCQ $0x00, DX
  1234	ADDQ AX, R10
  1235	ADCQ $0x00, DX
  1236	MOVQ DX, BP
  1237	MOVQ R12, AX
  1238	MULQ SI
  1239	ADDQ BP, R11
  1240	ADCQ $0x00, DX
  1241	ADDQ AX, R11
  1242	ADCQ $0x00, DX
  1243	MOVQ DX, R12
  1244	MOVQ R13, AX
  1245	MULQ R14
  1246	ADDQ AX, R9
  1247	ADCQ $0x00, DX
  1248	MOVQ DX, BP
  1249	MOVQ R13, AX
  1250	MULQ R15
  1251	ADDQ BP, R10
  1252	ADCQ $0x00, DX
  1253	ADDQ AX, R10
  1254	ADCQ $0x00, DX
  1255	MOVQ DX, BP
  1256	MOVQ R13, AX
  1257	MULQ DI
  1258	ADDQ BP, R11
  1259	ADCQ $0x00, DX
  1260	ADDQ AX, R11
  1261	ADCQ $0x00, DX
  1262	MOVQ DX, BP
  1263	MOVQ R13, AX
  1264	MULQ SI
  1265	ADDQ BP, R12
  1266	ADCQ $0x00, DX
  1267	ADDQ AX, R12
  1268	ADCQ $0x00, DX
  1269	MOVQ DX, R13
  1270
  1271	// First reduction step
  1272	MOVQ BX, AX
  1273	MOVQ BX, BP
  1274	SHLQ $0x20, BX
  1275	MULQ p256const1<>+0(SB)
  1276	SHRQ $0x20, BP
  1277	ADDQ BX, CX
  1278	ADCQ BP, R8
  1279	ADCQ AX, R9
  1280	ADCQ $0x00, DX
  1281	MOVQ DX, BX
  1282
  1283	// Second reduction step
  1284	MOVQ CX, AX
  1285	MOVQ CX, BP
  1286	SHLQ $0x20, CX
  1287	MULQ p256const1<>+0(SB)
  1288	SHRQ $0x20, BP
  1289	ADDQ CX, R8
  1290	ADCQ BP, R9
  1291	ADCQ AX, BX
  1292	ADCQ $0x00, DX
  1293	MOVQ DX, CX
  1294
  1295	// Third reduction step
  1296	MOVQ R8, AX
  1297	MOVQ R8, BP
  1298	SHLQ $0x20, R8
  1299	MULQ p256const1<>+0(SB)
  1300	SHRQ $0x20, BP
  1301	ADDQ R8, R9
  1302	ADCQ BP, BX
  1303	ADCQ AX, CX
  1304	ADCQ $0x00, DX
  1305	MOVQ DX, R8
  1306
  1307	// Last reduction step
  1308	MOVQ R9, AX
  1309	MOVQ R9, BP
  1310	SHLQ $0x20, R9
  1311	MULQ p256const1<>+0(SB)
  1312	SHRQ $0x20, BP
  1313	ADDQ R9, BX
  1314	ADCQ BP, CX
  1315	ADCQ AX, R8
  1316	ADCQ $0x00, DX
  1317	MOVQ DX, R9
      	// BP is zeroed with MOVQ rather than XORQ so that the flags are left
      	// untouched for the ADCQ chain that follows.
  1318	MOVQ $0x00000000, BP
  1319
  1320	// Add bits [511:256] of the result
      	// The first ADCQ also consumes CF left by "ADCQ $0x00, DX" above.
      	// NOTE(review): that carry is expected to be 0, since the high half of a
      	// 64x64 product plus one cannot overflow.
  1321	ADCQ BX, R10
  1322	ADCQ CX, R11
  1323	ADCQ R8, R12
  1324	ADCQ R9, R13
  1325	ADCQ $0x00, BP
  1326
  1327	// Copy result
  1328	MOVQ R10, BX
  1329	MOVQ R11, CX
  1330	MOVQ R12, R8
  1331	MOVQ R13, R9
  1332
  1333	// Subtract p256
  1334	SUBQ $-1, R10
  1335	SBBQ p256const0<>+0(SB), R11
  1336	SBBQ $0x00, R12
  1337	SBBQ p256const1<>+0(SB), R13
  1338	SBBQ $0x00, BP
  1339
  1340	// If the result of the subtraction is negative, restore the previous result
  1341	CMOVQCS BX, R10
  1342	CMOVQCS CX, R11
  1343	CMOVQCS R8, R12
  1344	CMOVQCS R9, R13
  1345	RET
  1346
  1347// func p256SqrInternal()
  1348// Requires: CMOV
      //
      // p256SqrInternal is a file-local helper that takes and returns its
      // operand in registers rather than on the Go stack.
      //
      //   In:       operand in R10, R11, R12, R13 (least significant limb first)
      //   Out:      R10..R13 = square after four reduction steps and one
      //             conditional subtraction
      //   Clobbers: AX, BX, CX, DX, R8, R9, R14, R15, DI, SI, BP, flags
      //
      // The 512-bit square is built as: cross products, doubled, plus the four
      // limb squares. Its low half ends up in BX, CX, R8, R9 and its high half
      // in R14, R15, DI, SI; the low half is then reduced and folded into the
      // high half.
      //
      // NOTE(review): BP is used as scratch; presumably the 8-byte frame is what
      // makes the assembler save and restore it. Confirm against the generator.
  1349TEXT p256SqrInternal(SB), NOSPLIT, $8
  1350	MOVQ R10, AX
  1351	MULQ R11
  1352	MOVQ AX, CX
  1353	MOVQ DX, R8
  1354	MOVQ R10, AX
  1355	MULQ R12
  1356	ADDQ AX, R8
  1357	ADCQ $0x00, DX
  1358	MOVQ DX, R9
  1359	MOVQ R10, AX
  1360	MULQ R13
  1361	ADDQ AX, R9
  1362	ADCQ $0x00, DX
  1363	MOVQ DX, R14
  1364	MOVQ R11, AX
  1365	MULQ R12
  1366	ADDQ AX, R9
  1367	ADCQ $0x00, DX
  1368	MOVQ DX, BP
  1369	MOVQ R11, AX
  1370	MULQ R13
  1371	ADDQ BP, R14
  1372	ADCQ $0x00, DX
  1373	ADDQ AX, R14
  1374	ADCQ $0x00, DX
  1375	MOVQ DX, R15
  1376	MOVQ R12, AX
  1377	MULQ R13
  1378	ADDQ AX, R15
  1379	ADCQ $0x00, DX
  1380	MOVQ DX, DI
  1381	XORQ SI, SI
  1382
  1383	// *2
      	// Double the sum of the cross products; the bit shifted out lands in SI.
  1384	ADDQ CX, CX
  1385	ADCQ R8, R8
  1386	ADCQ R9, R9
  1387	ADCQ R14, R14
  1388	ADCQ R15, R15
  1389	ADCQ DI, DI
  1390	ADCQ $0x00, SI
  1391
  1392	// Missing products
      	// Add the limb squares. R10 is consumed by the first MULQ and is then
      	// reused as the carry limb between the squares.
  1393	MOVQ R10, AX
  1394	MULQ AX
  1395	MOVQ AX, BX
  1396	MOVQ DX, R10
  1397	MOVQ R11, AX
  1398	MULQ AX
  1399	ADDQ R10, CX
  1400	ADCQ AX, R8
  1401	ADCQ $0x00, DX
  1402	MOVQ DX, R10
  1403	MOVQ R12, AX
  1404	MULQ AX
  1405	ADDQ R10, R9
  1406	ADCQ AX, R14
  1407	ADCQ $0x00, DX
  1408	MOVQ DX, R10
  1409	MOVQ R13, AX
  1410	MULQ AX
  1411	ADDQ R10, R15
  1412	ADCQ AX, DI
  1413	ADCQ DX, SI
  1414
  1415	// First reduction step
  1416	MOVQ BX, AX
  1417	MOVQ BX, BP
  1418	SHLQ $0x20, BX
  1419	MULQ p256const1<>+0(SB)
  1420	SHRQ $0x20, BP
  1421	ADDQ BX, CX
  1422	ADCQ BP, R8
  1423	ADCQ AX, R9
  1424	ADCQ $0x00, DX
  1425	MOVQ DX, BX
  1426
  1427	// Second reduction step
  1428	MOVQ CX, AX
  1429	MOVQ CX, BP
  1430	SHLQ $0x20, CX
  1431	MULQ p256const1<>+0(SB)
  1432	SHRQ $0x20, BP
  1433	ADDQ CX, R8
  1434	ADCQ BP, R9
  1435	ADCQ AX, BX
  1436	ADCQ $0x00, DX
  1437	MOVQ DX, CX
  1438
  1439	// Third reduction step
  1440	MOVQ R8, AX
  1441	MOVQ R8, BP
  1442	SHLQ $0x20, R8
  1443	MULQ p256const1<>+0(SB)
  1444	SHRQ $0x20, BP
  1445	ADDQ R8, R9
  1446	ADCQ BP, BX
  1447	ADCQ AX, CX
  1448	ADCQ $0x00, DX
  1449	MOVQ DX, R8
  1450
  1451	// Last reduction step
  1452	MOVQ R9, AX
  1453	MOVQ R9, BP
  1454	SHLQ $0x20, R9
  1455	MULQ p256const1<>+0(SB)
  1456	SHRQ $0x20, BP
  1457	ADDQ R9, BX
  1458	ADCQ BP, CX
  1459	ADCQ AX, R8
  1460	ADCQ $0x00, DX
  1461	MOVQ DX, R9
      	// BP is zeroed with MOVQ rather than XORQ so that the flags are left
      	// untouched for the ADCQ chain that follows.
  1462	MOVQ $0x00000000, BP
  1463
  1464	// Add bits [511:256] of the result
      	// The first ADCQ also consumes CF left by "ADCQ $0x00, DX" above.
      	// NOTE(review): that carry is expected to be 0, since the high half of a
      	// 64x64 product plus one cannot overflow.
  1465	ADCQ BX, R14
  1466	ADCQ CX, R15
  1467	ADCQ R8, DI
  1468	ADCQ R9, SI
  1469	ADCQ $0x00, BP
  1470
  1471	// Copy result
  1472	MOVQ R14, R10
  1473	MOVQ R15, R11
  1474	MOVQ DI, R12
  1475	MOVQ SI, R13
  1476
  1477	// Subtract p256
  1478	SUBQ $-1, R10
  1479	SBBQ p256const0<>+0(SB), R11
  1480	SBBQ $0x00, R12
  1481	SBBQ p256const1<>+0(SB), R13
  1482	SBBQ $0x00, BP
  1483
  1484	// If the result of the subtraction is negative, restore the previous result
  1485	CMOVQCS R14, R10
  1486	CMOVQCS R15, R11
  1487	CMOVQCS DI, R12
  1488	CMOVQCS SI, R13
  1489	RET
  1490
  1491// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
  1492// Requires: CMOV, SSE2
      //
      // Mixed addition of the point in1 (96 bytes: x, y, z) and the affine point
      // in2 (64 bytes: x, y); 96 bytes are written to res. The y coordinate of
      // in2 is replaced by p - y when sign != 0. The output is then selected with
      // SSE masks instead of branches:
      //   - sel == 0:  res = in1
      //   - zero == 0: res = (in2.x, in2.y after sign handling, p256one)
      //   - otherwise: res = computed sum
      // The zero test is applied last, so it wins when both are 0. Only the low
      // 32 bits of sel and zero are inspected.
      //
      // Frame layout (byte offsets from SP, 32 bytes per field element):
      //     0 x1    32 y1    64 z1    96 x2   128 y2 (after sign handling)
      //   160 x3   192 y3   224 z3   256 s2   288 z1^2
      //   320 h    352 r    384 h^2  416 r^2  448 h^3
      //   480 saved res pointer, 488 sel (32 bit), 492 zero (32 bit)
      //
      // NOTE(review): p256MulInternal and p256SubInternal are defined outside this
      // excerpt. The step comments below assume: one operand in R10..R13, the
      // other in R14,R15,DI,SI, result in R10..R13; Sub computes
      // (R10..R13) - (R14,R15,DI,SI); Mul leaves R14,R15,DI,SI intact (the z1^3
      // step relies on that). Confirm against the helper bodies.
  1493TEXT ·p256PointAddAffineAsm(SB), $512-48
  1494	MOVQ  res+0(FP), AX
  1495	MOVQ  in1+8(FP), BX
  1496	MOVQ  in2+16(FP), CX
  1497	MOVQ  sign+24(FP), DX
  1498	MOVQ  sel+32(FP), R15
  1499	MOVQ  zero+40(FP), DI
  1500	MOVOU (BX), X0 // copy the 96 bytes of in1 to 0..95(SP)
  1501	MOVOU 16(BX), X1
  1502	MOVOU 32(BX), X2
  1503	MOVOU 48(BX), X3
  1504	MOVOU 64(BX), X4
  1505	MOVOU 80(BX), X5
  1506	MOVOU X0, (SP)
  1507	MOVOU X1, 16(SP)
  1508	MOVOU X2, 32(SP)
  1509	MOVOU X3, 48(SP)
  1510	MOVOU X4, 64(SP)
  1511	MOVOU X5, 80(SP)
  1512	MOVOU (CX), X0 // copy the first 32 bytes of in2 (x2) to 96..127(SP)
  1513	MOVOU 16(CX), X1
  1514	MOVOU X0, 96(SP)
  1515	MOVOU X1, 112(SP)
  1516
  1517	// Save the result pointer and the low 32 bits of sel and zero in the frame
  1518	MOVQ AX, 480(SP)
  1519	MOVL R15, 488(SP)
  1520	MOVL DI, 492(SP)
  1521
  1522	// Negate y2in based on sign: load y2 into R10..R13 and p into BX,CX,R8,R9
  1523	MOVQ 32(CX), R10
  1524	MOVQ 40(CX), R11
  1525	MOVQ 48(CX), R12
  1526	MOVQ 56(CX), R13
  1527	MOVQ $-1, BX
  1528	MOVQ p256const0<>+0(SB), CX
  1529	MOVQ $0x00000000, R8
  1530	MOVQ p256const1<>+0(SB), R9
  1531	XORQ AX, AX // AX collects the borrow / carry of the next two steps
  1532
  1533	// Speculatively subtract: BX,CX,R8,R9 = p - y2, borrow goes to AX
  1534	SUBQ R10, BX
  1535	SBBQ R11, CX
  1536	SBBQ R12, R8
  1537	SBBQ R13, R9
  1538	SBBQ $0x00, AX
  1539	MOVQ BX, R14 // keep a copy of p - y2
  1540	MOVQ CX, R15
  1541	MOVQ R8, DI
  1542	MOVQ R9, SI
  1543
  1544	// Add in case the operand was > p256; restore the saved difference when AX ends up non-zero
  1545	ADDQ    $-1, BX
  1546	ADCQ    p256const0<>+0(SB), CX
  1547	ADCQ    $0x00, R8
  1548	ADCQ    p256const1<>+0(SB), R9
  1549	ADCQ    $0x00, AX // sets ZF iff AX == 0
  1550	CMOVQNE R14, BX
  1551	CMOVQNE R15, CX
  1552	CMOVQNE DI, R8
  1553	CMOVQNE SI, R9
  1554
  1555	// If condition is 0, keep original value (sign == 0 leaves y2 unchanged)
  1556	TESTQ   DX, DX
  1557	CMOVQEQ R10, BX
  1558	CMOVQEQ R11, CX
  1559	CMOVQEQ R12, R8
  1560	CMOVQEQ R13, R9
  1561
  1562	// Store result as y2 at 128..159(SP)
  1563	MOVQ BX, 128(SP)
  1564	MOVQ CX, 136(SP)
  1565	MOVQ R8, 144(SP)
  1566	MOVQ R9, 152(SP)
  1567
  1568	// Begin point add (see the helper-contract note in the header)
  1569	MOVQ    64(SP), R10 // load z1
  1570	MOVQ    72(SP), R11
  1571	MOVQ    80(SP), R12
  1572	MOVQ    88(SP), R13
  1573	CALL    p256SqrInternal(SB) // z1^2
  1574	MOVQ    R10, 288(SP) // store z1^2
  1575	MOVQ    R11, 296(SP)
  1576	MOVQ    R12, 304(SP)
  1577	MOVQ    R13, 312(SP)
  1578	MOVQ    96(SP), R14 // load x2
  1579	MOVQ    104(SP), R15
  1580	MOVQ    112(SP), DI
  1581	MOVQ    120(SP), SI
  1582	CALL    p256MulInternal(SB) // x2 * z1^2
  1583	MOVQ    (SP), R14 // load x1
  1584	MOVQ    8(SP), R15
  1585	MOVQ    16(SP), DI
  1586	MOVQ    24(SP), SI
  1587	CALL    p256SubInternal(SB) // h = x2 * z1^2 - x1
  1588	MOVQ    R10, 320(SP) // store h
  1589	MOVQ    R11, 328(SP)
  1590	MOVQ    R12, 336(SP)
  1591	MOVQ    R13, 344(SP)
  1592	MOVQ    64(SP), R14 // load z1
  1593	MOVQ    72(SP), R15
  1594	MOVQ    80(SP), DI
  1595	MOVQ    88(SP), SI
  1596	CALL    p256MulInternal(SB) // z3 = h * z1
  1597	MOVQ    R10, 224(SP) // store z3
  1598	MOVQ    R11, 232(SP)
  1599	MOVQ    R12, 240(SP)
  1600	MOVQ    R13, 248(SP)
  1601	MOVQ    288(SP), R10 // load z1^2; R14,R15,DI,SI are not reloaded here
  1602	MOVQ    296(SP), R11
  1603	MOVQ    304(SP), R12
  1604	MOVQ    312(SP), R13
  1605	CALL    p256MulInternal(SB) // z1^3, assuming the second operand is still z1
  1606	MOVQ    128(SP), R14 // load y2
  1607	MOVQ    136(SP), R15
  1608	MOVQ    144(SP), DI
  1609	MOVQ    152(SP), SI
  1610	CALL    p256MulInternal(SB) // s2 = y2 * z1^3
  1611	MOVQ    R10, 256(SP) // store s2
  1612	MOVQ    R11, 264(SP)
  1613	MOVQ    R12, 272(SP)
  1614	MOVQ    R13, 280(SP)
  1615	MOVQ    32(SP), R14 // load y1
  1616	MOVQ    40(SP), R15
  1617	MOVQ    48(SP), DI
  1618	MOVQ    56(SP), SI
  1619	CALL    p256SubInternal(SB) // r = s2 - y1
  1620	MOVQ    R10, 352(SP) // store r
  1621	MOVQ    R11, 360(SP)
  1622	MOVQ    R12, 368(SP)
  1623	MOVQ    R13, 376(SP)
  1624	CALL    p256SqrInternal(SB) // r^2
  1625	MOVQ    R10, 416(SP) // store r^2
  1626	MOVQ    R11, 424(SP)
  1627	MOVQ    R12, 432(SP)
  1628	MOVQ    R13, 440(SP)
  1629	MOVQ    320(SP), R10 // load h
  1630	MOVQ    328(SP), R11
  1631	MOVQ    336(SP), R12
  1632	MOVQ    344(SP), R13
  1633	CALL    p256SqrInternal(SB) // h^2
  1634	MOVQ    R10, 384(SP) // store h^2
  1635	MOVQ    R11, 392(SP)
  1636	MOVQ    R12, 400(SP)
  1637	MOVQ    R13, 408(SP)
  1638	MOVQ    320(SP), R14 // load h
  1639	MOVQ    328(SP), R15
  1640	MOVQ    336(SP), DI
  1641	MOVQ    344(SP), SI
  1642	CALL    p256MulInternal(SB) // h^3
  1643	MOVQ    R10, 448(SP) // store h^3
  1644	MOVQ    R11, 456(SP)
  1645	MOVQ    R12, 464(SP)
  1646	MOVQ    R13, 472(SP)
  1647	MOVQ    32(SP), R14 // load y1
  1648	MOVQ    40(SP), R15
  1649	MOVQ    48(SP), DI
  1650	MOVQ    56(SP), SI
  1651	CALL    p256MulInternal(SB) // y1 * h^3
  1652	MOVQ    R10, 256(SP) // store y1 * h^3 (the s2 slot is reused)
  1653	MOVQ    R11, 264(SP)
  1654	MOVQ    R12, 272(SP)
  1655	MOVQ    R13, 280(SP)
  1656	MOVQ    (SP), R10 // load x1
  1657	MOVQ    8(SP), R11
  1658	MOVQ    16(SP), R12
  1659	MOVQ    24(SP), R13
  1660	MOVQ    384(SP), R14 // load h^2
  1661	MOVQ    392(SP), R15
  1662	MOVQ    400(SP), DI
  1663	MOVQ    408(SP), SI
  1664	CALL    p256MulInternal(SB) // x1 * h^2
  1665	MOVQ    R10, 320(SP) // store x1 * h^2 (the h slot is reused)
  1666	MOVQ    R11, 328(SP)
  1667	MOVQ    R12, 336(SP)
  1668	MOVQ    R13, 344(SP)
  1669	XORQ    AX, AX // inline doubling mod p: R14,R15,DI,SI = 2 * (R10..R13)
  1670	ADDQ    R10, R10
  1671	ADCQ    R11, R11
  1672	ADCQ    R12, R12
  1673	ADCQ    R13, R13
  1674	ADCQ    $+0, AX // AX = carry out of the doubling
  1675	MOVQ    R10, R14
  1676	MOVQ    R11, R15
  1677	MOVQ    R12, DI
  1678	MOVQ    R13, SI
  1679	SUBQ    $-1, R14 // trial subtraction of p
  1680	SBBQ    p256const0<>+0(SB), R15
  1681	SBBQ    $+0, DI
  1682	SBBQ    p256const1<>+0(SB), SI
  1683	SBBQ    $+0, AX
  1684	CMOVQCS R10, R14 // on borrow keep the unsubtracted value
  1685	CMOVQCS R11, R15
  1686	CMOVQCS R12, DI
  1687	CMOVQCS R13, SI
  1688	MOVQ    416(SP), R10 // load r^2
  1689	MOVQ    424(SP), R11
  1690	MOVQ    432(SP), R12
  1691	MOVQ    440(SP), R13
  1692	CALL    p256SubInternal(SB) // r^2 - 2 * x1 * h^2
  1693	MOVQ    448(SP), R14 // load h^3
  1694	MOVQ    456(SP), R15
  1695	MOVQ    464(SP), DI
  1696	MOVQ    472(SP), SI
  1697	CALL    p256SubInternal(SB) // x3 = r^2 - 2 * x1 * h^2 - h^3
  1698	MOVQ    R10, 160(SP) // store x3
  1699	MOVQ    R11, 168(SP)
  1700	MOVQ    R12, 176(SP)
  1701	MOVQ    R13, 184(SP)
  1702	MOVQ    R10, R14 // second operand = x3
  1703	MOVQ    R11, R15
  1704	MOVQ    R12, DI
  1705	MOVQ    R13, SI
  1706	MOVQ    320(SP), R10 // load x1 * h^2
  1707	MOVQ    328(SP), R11
  1708	MOVQ    336(SP), R12
  1709	MOVQ    344(SP), R13
  1710	CALL    p256SubInternal(SB) // x1 * h^2 - x3
  1711	MOVQ    352(SP), R14 // load r
  1712	MOVQ    360(SP), R15
  1713	MOVQ    368(SP), DI
  1714	MOVQ    376(SP), SI
  1715	CALL    p256MulInternal(SB) // r * (x1 * h^2 - x3)
  1716	MOVQ    256(SP), R14 // load y1 * h^3
  1717	MOVQ    264(SP), R15
  1718	MOVQ    272(SP), DI
  1719	MOVQ    280(SP), SI
  1720	CALL    p256SubInternal(SB) // y3 = r * (x1 * h^2 - x3) - y1 * h^3
  1721	MOVQ    R10, 192(SP) // store y3
  1722	MOVQ    R11, 200(SP)
  1723	MOVQ    R12, 208(SP)
  1724	MOVQ    R13, 216(SP)
  1725
  1726	// Load stored values from stack: AX = res, BX = sel, CX = zero
  1727	MOVQ 480(SP), AX
  1728	MOVL 488(SP), BX
  1729	MOVL 492(SP), CX
  1730
  1731	// The result is not valid if (sel == 0), conditional choose: take in1 instead, via masks
  1732	MOVOU   160(SP), X0 // X0..X5 = computed (x3, y3, z3)
  1733	MOVOU   176(SP), X1
  1734	MOVOU   192(SP), X2
  1735	MOVOU   208(SP), X3
  1736	MOVOU   224(SP), X4
  1737	MOVOU   240(SP), X5
  1738	MOVL    BX, X6
  1739	MOVL    CX, X7
  1740	PXOR    X8, X8 // X8 = 0
  1741	PCMPEQL X9, X9 // X9 = all ones
  1742	PSHUFD  $0x00, X6, X6 // broadcast sel to all four 32-bit lanes
  1743	PSHUFD  $0x00, X7, X7 // broadcast zero to all four 32-bit lanes
  1744	PCMPEQL X8, X6 // X6 = all ones iff sel == 0
  1745	PCMPEQL X8, X7 // X7 = all ones iff zero == 0
  1746	MOVOU   X6, X15
  1747	PANDN   X9, X15 // X15 = NOT X6
  1748	MOVOU   (SP), X9 // X9..X14 = in1 (x1, y1, z1)
  1749	MOVOU   16(SP), X10
  1750	MOVOU   32(SP), X11
  1751	MOVOU   48(SP), X12
  1752	MOVOU   64(SP), X13
  1753	MOVOU   80(SP), X14
  1754	PAND    X15, X0 // keep the computed value only when sel != 0
  1755	PAND    X15, X1
  1756	PAND    X15, X2
  1757	PAND    X15, X3
  1758	PAND    X15, X4
  1759	PAND    X15, X5
  1760	PAND    X6, X9 // keep in1 only when sel == 0
  1761	PAND    X6, X10
  1762	PAND    X6, X11
  1763	PAND    X6, X12
  1764	PAND    X6, X13
  1765	PAND    X6, X14
  1766	PXOR    X9, X0 // merge: exactly one side of each pair is non-masked
  1767	PXOR    X10, X1
  1768	PXOR    X11, X2
  1769	PXOR    X12, X3
  1770	PXOR    X13, X4
  1771	PXOR    X14, X5
  1772
  1773	// Similarly if zero == 0: take (x2, y2 after sign handling, p256one) instead
  1774	PCMPEQL X9, X9 // X9 = all ones again
  1775	MOVOU   X7, X15
  1776	PANDN   X9, X15 // X15 = NOT X7
  1777	MOVOU   96(SP), X9 // x2
  1778	MOVOU   112(SP), X10
  1779	MOVOU   128(SP), X11 // y2 after sign handling
  1780	MOVOU   144(SP), X12
  1781	MOVOU   p256one<>+0(SB), X13 // constant used as the third coordinate
  1782	MOVOU   p256one<>+16(SB), X14
  1783	PAND    X15, X0
  1784	PAND    X15, X1
  1785	PAND    X15, X2
  1786	PAND    X15, X3
  1787	PAND    X15, X4
  1788	PAND    X15, X5
  1789	PAND    X7, X9
  1790	PAND    X7, X10
  1791	PAND    X7, X11
  1792	PAND    X7, X12
  1793	PAND    X7, X13
  1794	PAND    X7, X14
  1795	PXOR    X9, X0
  1796	PXOR    X10, X1
  1797	PXOR    X11, X2
  1798	PXOR    X12, X3
  1799	PXOR    X13, X4
  1800	PXOR    X14, X5
  1801
  1802	// Finally output the result: write the selected 96 bytes to res
  1803	MOVOU X0, (AX)
  1804	MOVOU X1, 16(AX)
  1805	MOVOU X2, 32(AX)
  1806	MOVOU X3, 48(AX)
  1807	MOVOU X4, 64(AX)
  1808	MOVOU X5, 80(AX)
  1809	MOVQ  $0x00000000, 480(SP) // clear the saved res pointer slot before returning
  1810	RET
  1811
  1812DATA p256one<>+0(SB)/8, $0x0000000000000001 // p256one: file-local 32-byte constant, four 64-bit words at offsets 0, 8, 16, 24
  1813DATA p256one<>+8(SB)/8, $0xffffffff00000000 // NOTE(review): presumably the field element 1 in Montgomery form (2^256 mod p) — confirm against p256const0/p256const1
  1814DATA p256one<>+16(SB)/8, $0xffffffffffffffff // loaded by p256PointAddAffineAsm as the third coordinate of the output when zero == 0
  1815DATA p256one<>+24(SB)/8, $0x00000000fffffffe
  1816GLOBL p256one<>(SB), RODATA, $32 // read-only data, 32 bytes
  1817
  1818// func p256IsZero()
  1819// Requires: CMOV
      //
      // Register-based helper (no Go arguments, no frame).
      // In:  R10..R13 = four 64-bit limbs of the value to test.
      // Out: AX = 1 if all limbs are zero, or if the limbs equal
      //      (-1, p256const0, 0, p256const1), the constant named P below; else 0.
      // Overwrites R10, R11, R13, R14, R15 and the flags; R12 is only read.
      // Uses CMOV rather than branches for both tests.
  1820TEXT p256IsZero(SB), NOSPLIT, $0
  1821	// AX contains a flag that is set if the input is zero.
  1822	XORQ AX, AX
  1823	MOVQ $0x00000001, R15 // R15 = 1, the source operand for the CMOVs below
  1824
  1825	// Check whether [acc4..acc7] (R10..R13) are all zero by OR-ing them into R14.
  1826	MOVQ R10, R14
  1827	ORQ  R11, R14
  1828	ORQ  R12, R14
  1829	ORQ  R13, R14
  1830
  1831	// Set the zero flag if so. (CMOV of a constant to a register doesn't
  1832	// appear to be supported in Go. Thus t1 (R15) = 1.)
  1833	CMOVQEQ R15, AX
  1834
  1835	// XOR [acc4..acc7] with P and compare with zero again. R12 needs no XOR: it is OR-ed in unchanged.
  1836	XORQ $-1, R10
  1837	XORQ p256const0<>+0(SB), R11
  1838	XORQ p256const1<>+0(SB), R13
  1839	ORQ  R11, R10
  1840	ORQ  R12, R10
  1841	ORQ  R13, R10
  1842
  1843	// Set the zero flag if so.
  1844	CMOVQEQ R15, AX
  1845	RET
  1846
  1847// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
  1848// Requires: CMOV, SSE2
      //
      // Adds the points in1 and in2 (96 bytes each: x, y, z) and writes 96 bytes
      // to res. The return value is the AND of two p256IsZero results, one taken
      // on r and one on h (see the frame layout): 1 only when both are reported
      // zero, otherwise 0. The computed sum is written to res in either case.
      // NOTE(review): presumably the caller handles the ret == 1 case separately
      // — confirm against the Go caller.
      //
      // Frame layout (byte offsets from SP, 32 bytes per field element):
      //     0 x1    32 y1    64 z1    96 x2   128 y2   160 z2
      //   192 x3   224 y3   256 z3   288 u1   320 u2   352 s1   384 s2
      //   416 z1^2 448 z2^2 480 h    512 r    544 h^2  576 r^2  608 h^3
      //   640 saved res pointer, 648 equality flag (return value)
      //
      // NOTE(review): p256MulInternal and p256SubInternal are defined outside this
      // excerpt. The step comments below assume: one operand in R10..R13, the
      // other in R14,R15,DI,SI, result in R10..R13; Sub computes
      // (R10..R13) - (R14,R15,DI,SI). Confirm against the helper bodies.
  1849TEXT ·p256PointAddAsm(SB), $680-32
  1850	// Move input to stack in order to free registers
  1851	MOVQ  res+0(FP), AX
  1852	MOVQ  in1+8(FP), BX
  1853	MOVQ  in2+16(FP), CX
  1854	MOVOU (BX), X0 // in1 -> 0..95(SP)
  1855	MOVOU 16(BX), X1
  1856	MOVOU 32(BX), X2
  1857	MOVOU 48(BX), X3
  1858	MOVOU 64(BX), X4
  1859	MOVOU 80(BX), X5
  1860	MOVOU X0, (SP)
  1861	MOVOU X1, 16(SP)
  1862	MOVOU X2, 32(SP)
  1863	MOVOU X3, 48(SP)
  1864	MOVOU X4, 64(SP)
  1865	MOVOU X5, 80(SP)
  1866	MOVOU (CX), X0 // in2 -> 96..191(SP)
  1867	MOVOU 16(CX), X1
  1868	MOVOU 32(CX), X2
  1869	MOVOU 48(CX), X3
  1870	MOVOU 64(CX), X4
  1871	MOVOU 80(CX), X5
  1872	MOVOU X0, 96(SP)
  1873	MOVOU X1, 112(SP)
  1874	MOVOU X2, 128(SP)
  1875	MOVOU X3, 144(SP)
  1876	MOVOU X4, 160(SP)
  1877	MOVOU X5, 176(SP)
  1878
  1879	// Store pointer to result
  1880	MOVQ AX, 640(SP)
  1881
  1882	// Begin point add (see the helper-contract note in the header)
  1883	MOVQ    160(SP), R10 // load z2
  1884	MOVQ    168(SP), R11
  1885	MOVQ    176(SP), R12
  1886	MOVQ    184(SP), R13
  1887	CALL    p256SqrInternal(SB) // z2^2
  1888	MOVQ    R10, 448(SP) // store z2^2
  1889	MOVQ    R11, 456(SP)
  1890	MOVQ    R12, 464(SP)
  1891	MOVQ    R13, 472(SP)
  1892	MOVQ    160(SP), R14 // load z2
  1893	MOVQ    168(SP), R15
  1894	MOVQ    176(SP), DI
  1895	MOVQ    184(SP), SI
  1896	CALL    p256MulInternal(SB) // z2^3
  1897	MOVQ    32(SP), R14 // load y1
  1898	MOVQ    40(SP), R15
  1899	MOVQ    48(SP), DI
  1900	MOVQ    56(SP), SI
  1901	CALL    p256MulInternal(SB) // s1 = y1 * z2^3
  1902	MOVQ    R10, 352(SP) // store s1
  1903	MOVQ    R11, 360(SP)
  1904	MOVQ    R12, 368(SP)
  1905	MOVQ    R13, 376(SP)
  1906	MOVQ    64(SP), R10 // load z1
  1907	MOVQ    72(SP), R11
  1908	MOVQ    80(SP), R12
  1909	MOVQ    88(SP), R13
  1910	CALL    p256SqrInternal(SB) // z1^2
  1911	MOVQ    R10, 416(SP) // store z1^2
  1912	MOVQ    R11, 424(SP)
  1913	MOVQ    R12, 432(SP)
  1914	MOVQ    R13, 440(SP)
  1915	MOVQ    64(SP), R14 // load z1
  1916	MOVQ    72(SP), R15
  1917	MOVQ    80(SP), DI
  1918	MOVQ    88(SP), SI
  1919	CALL    p256MulInternal(SB) // z1^3
  1920	MOVQ    128(SP), R14 // load y2
  1921	MOVQ    136(SP), R15
  1922	MOVQ    144(SP), DI
  1923	MOVQ    152(SP), SI
  1924	CALL    p256MulInternal(SB) // s2 = y2 * z1^3
  1925	MOVQ    R10, 384(SP) // store s2
  1926	MOVQ    R11, 392(SP)
  1927	MOVQ    R12, 400(SP)
  1928	MOVQ    R13, 408(SP)
  1929	MOVQ    352(SP), R14 // load s1
  1930	MOVQ    360(SP), R15
  1931	MOVQ    368(SP), DI
  1932	MOVQ    376(SP), SI
  1933	CALL    p256SubInternal(SB) // r = s2 - s1
  1934	MOVQ    R10, 512(SP) // store r
  1935	MOVQ    R11, 520(SP)
  1936	MOVQ    R12, 528(SP)
  1937	MOVQ    R13, 536(SP)
  1938	CALL    p256IsZero(SB) // AX = 1 if r is zero; overwrites R10, R11, R13, R14, R15
  1939	MOVQ    AX, 648(SP) // equality flag = (r is zero)
  1940	MOVQ    448(SP), R10 // load z2^2
  1941	MOVQ    456(SP), R11
  1942	MOVQ    464(SP), R12
  1943	MOVQ    472(SP), R13
  1944	MOVQ    (SP), R14 // load x1
  1945	MOVQ    8(SP), R15
  1946	MOVQ    16(SP), DI
  1947	MOVQ    24(SP), SI
  1948	CALL    p256MulInternal(SB) // u1 = x1 * z2^2
  1949	MOVQ    R10, 288(SP) // store u1
  1950	MOVQ    R11, 296(SP)
  1951	MOVQ    R12, 304(SP)
  1952	MOVQ    R13, 312(SP)
  1953	MOVQ    416(SP), R10 // load z1^2
  1954	MOVQ    424(SP), R11
  1955	MOVQ    432(SP), R12
  1956	MOVQ    440(SP), R13
  1957	MOVQ    96(SP), R14 // load x2
  1958	MOVQ    104(SP), R15
  1959	MOVQ    112(SP), DI
  1960	MOVQ    120(SP), SI
  1961	CALL    p256MulInternal(SB) // u2 = x2 * z1^2
  1962	MOVQ    R10, 320(SP) // store u2
  1963	MOVQ    R11, 328(SP)
  1964	MOVQ    R12, 336(SP)
  1965	MOVQ    R13, 344(SP)
  1966	MOVQ    288(SP), R14 // load u1
  1967	MOVQ    296(SP), R15
  1968	MOVQ    304(SP), DI
  1969	MOVQ    312(SP), SI
  1970	CALL    p256SubInternal(SB) // h = u2 - u1
  1971	MOVQ    R10, 480(SP) // store h
  1972	MOVQ    R11, 488(SP)
  1973	MOVQ    R12, 496(SP)
  1974	MOVQ    R13, 504(SP)
  1975	CALL    p256IsZero(SB) // AX = 1 if h is zero
  1976	ANDQ    648(SP), AX // combine with the earlier test on r
  1977	MOVQ    AX, 648(SP) // equality flag = (r is zero) AND (h is zero)
  1978	MOVQ    512(SP), R10 // load r
  1979	MOVQ    520(SP), R11
  1980	MOVQ    528(SP), R12
  1981	MOVQ    536(SP), R13
  1982	CALL    p256SqrInternal(SB) // r^2
  1983	MOVQ    R10, 576(SP) // store r^2
  1984	MOVQ    R11, 584(SP)
  1985	MOVQ    R12, 592(SP)
  1986	MOVQ    R13, 600(SP)
  1987	MOVQ    480(SP), R10 // load h
  1988	MOVQ    488(SP), R11
  1989	MOVQ    496(SP), R12
  1990	MOVQ    504(SP), R13
  1991	CALL    p256SqrInternal(SB) // h^2
  1992	MOVQ    R10, 544(SP) // store h^2
  1993	MOVQ    R11, 552(SP)
  1994	MOVQ    R12, 560(SP)
  1995	MOVQ    R13, 568(SP)
  1996	MOVQ    480(SP), R14 // load h
  1997	MOVQ    488(SP), R15
  1998	MOVQ    496(SP), DI
  1999	MOVQ    504(SP), SI
  2000	CALL    p256MulInternal(SB) // h^3
  2001	MOVQ    R10, 608(SP) // store h^3
  2002	MOVQ    R11, 616(SP)
  2003	MOVQ    R12, 624(SP)
  2004	MOVQ    R13, 632(SP)
  2005	MOVQ    352(SP), R14 // load s1
  2006	MOVQ    360(SP), R15
  2007	MOVQ    368(SP), DI
  2008	MOVQ    376(SP), SI
  2009	CALL    p256MulInternal(SB) // s1 * h^3
  2010	MOVQ    R10, 384(SP) // store s1 * h^3 (the s2 slot is reused)
  2011	MOVQ    R11, 392(SP)
  2012	MOVQ    R12, 400(SP)
  2013	MOVQ    R13, 408(SP)
  2014	MOVQ    64(SP), R10 // load z1
  2015	MOVQ    72(SP), R11
  2016	MOVQ    80(SP), R12
  2017	MOVQ    88(SP), R13
  2018	MOVQ    160(SP), R14 // load z2
  2019	MOVQ    168(SP), R15
  2020	MOVQ    176(SP), DI
  2021	MOVQ    184(SP), SI
  2022	CALL    p256MulInternal(SB) // z1 * z2
  2023	MOVQ    480(SP), R14 // load h
  2024	MOVQ    488(SP), R15
  2025	MOVQ    496(SP), DI
  2026	MOVQ    504(SP), SI
  2027	CALL    p256MulInternal(SB) // z3 = z1 * z2 * h
  2028	MOVQ    R10, 256(SP) // store z3
  2029	MOVQ    R11, 264(SP)
  2030	MOVQ    R12, 272(SP)
  2031	MOVQ    R13, 280(SP)
  2032	MOVQ    544(SP), R10 // load h^2
  2033	MOVQ    552(SP), R11
  2034	MOVQ    560(SP), R12
  2035	MOVQ    568(SP), R13
  2036	MOVQ    288(SP), R14 // load u1
  2037	MOVQ    296(SP), R15
  2038	MOVQ    304(SP), DI
  2039	MOVQ    312(SP), SI
  2040	CALL    p256MulInternal(SB) // u1 * h^2
  2041	MOVQ    R10, 320(SP) // store u1 * h^2 (the u2 slot is reused)
  2042	MOVQ    R11, 328(SP)
  2043	MOVQ    R12, 336(SP)
  2044	MOVQ    R13, 344(SP)
  2045	XORQ    AX, AX // inline doubling mod p: R14,R15,DI,SI = 2 * (R10..R13)
  2046	ADDQ    R10, R10
  2047	ADCQ    R11, R11
  2048	ADCQ    R12, R12
  2049	ADCQ    R13, R13
  2050	ADCQ    $+0, AX // AX = carry out of the doubling
  2051	MOVQ    R10, R14
  2052	MOVQ    R11, R15
  2053	MOVQ    R12, DI
  2054	MOVQ    R13, SI
  2055	SUBQ    $-1, R14 // trial subtraction of p
  2056	SBBQ    p256const0<>+0(SB), R15
  2057	SBBQ    $+0, DI
  2058	SBBQ    p256const1<>+0(SB), SI
  2059	SBBQ    $+0, AX
  2060	CMOVQCS R10, R14 // on borrow keep the unsubtracted value
  2061	CMOVQCS R11, R15
  2062	CMOVQCS R12, DI
  2063	CMOVQCS R13, SI
  2064	MOVQ    576(SP), R10 // load r^2
  2065	MOVQ    584(SP), R11
  2066	MOVQ    592(SP), R12
  2067	MOVQ    600(SP), R13
  2068	CALL    p256SubInternal(SB) // r^2 - 2 * u1 * h^2
  2069	MOVQ    608(SP), R14 // load h^3
  2070	MOVQ    616(SP), R15
  2071	MOVQ    624(SP), DI
  2072	MOVQ    632(SP), SI
  2073	CALL    p256SubInternal(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3
  2074	MOVQ    R10, 192(SP) // store x3
  2075	MOVQ    R11, 200(SP)
  2076	MOVQ    R12, 208(SP)
  2077	MOVQ    R13, 216(SP)
  2078	MOVQ    R10, R14 // second operand = x3
  2079	MOVQ    R11, R15
  2080	MOVQ    R12, DI
  2081	MOVQ    R13, SI
  2082	MOVQ    320(SP), R10 // load u1 * h^2
  2083	MOVQ    328(SP), R11
  2084	MOVQ    336(SP), R12
  2085	MOVQ    344(SP), R13
  2086	CALL    p256SubInternal(SB) // u1 * h^2 - x3
  2087	MOVQ    512(SP), R14 // load r
  2088	MOVQ    520(SP), R15
  2089	MOVQ    528(SP), DI
  2090	MOVQ    536(SP), SI
  2091	CALL    p256MulInternal(SB) // r * (u1 * h^2 - x3)
  2092	MOVQ    384(SP), R14 // load s1 * h^3
  2093	MOVQ    392(SP), R15
  2094	MOVQ    400(SP), DI
  2095	MOVQ    408(SP), SI
  2096	CALL    p256SubInternal(SB) // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  2097	MOVQ    R10, 224(SP) // store y3
  2098	MOVQ    R11, 232(SP)
  2099	MOVQ    R12, 240(SP)
  2100	MOVQ    R13, 248(SP)
  2101	MOVOU   192(SP), X0 // X0..X5 = (x3, y3, z3) from 192..287(SP)
  2102	MOVOU   208(SP), X1
  2103	MOVOU   224(SP), X2
  2104	MOVOU   240(SP), X3
  2105	MOVOU   256(SP), X4
  2106	MOVOU   272(SP), X5
  2107
  2108	// Finally output the result
  2109	MOVQ  640(SP), AX // reload the res pointer
  2110	MOVQ  $0x00000000, 640(SP) // clear the saved res pointer slot
  2111	MOVOU X0, (AX)
  2112	MOVOU X1, 16(AX)
  2113	MOVOU X2, 32(AX)
  2114	MOVOU X3, 48(AX)
  2115	MOVOU X4, 64(AX)
  2116	MOVOU X5, 80(AX)
  2117	MOVQ  648(SP), AX // return the equality flag
  2118	MOVQ  AX, ret+24(FP)
  2119	RET
  2120
  2121// func p256PointDoubleAsm(res *P256Point, in *P256Point)
  2122// Requires: CMOV, SSE2
      //
      // Doubles the point in (96 bytes: x, y, z) and writes x, y, z to res.
      // With the helper contracts assumed below, the computation is:
      //   m  = 3 * (x - z^2) * (x + z^2)
      //   z' = 2 * y * z
      //   s  = 4 * x * y^2
      //   x' = m^2 - 2 * s
      //   y' = m * (s - x') - 8 * y^4
      //
      // Frame layout (byte offsets from SP, 32 bytes per field element):
      //     0 x     32 y (later overwritten with 8 * y^4)    64 z
      //    96 s    128 m    160 z^2    192 tmp (2 * s)    224 saved res pointer
      //
      // The inline "add / double mod p" sequences leave their result in
      // R14,R15,DI,SI: the sum is formed in R10..R13 with the carry in AX, p is
      // subtracted from a copy, and CMOVQCS restores the sum if that borrowed.
      //
      // NOTE(review): p256MulInternal and p256SubInternal are defined outside this
      // excerpt. The step comments below assume: one operand in R10..R13, the
      // other in R14,R15,DI,SI, result in R10..R13; Sub computes
      // (R10..R13) - (R14,R15,DI,SI). Confirm against the helper bodies.
  2123TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
  2124	MOVQ  res+0(FP), AX
  2125	MOVQ  in+8(FP), BX
  2126	MOVOU (BX), X0 // copy the 96 bytes of in to 0..95(SP)
  2127	MOVOU 16(BX), X1
  2128	MOVOU 32(BX), X2
  2129	MOVOU 48(BX), X3
  2130	MOVOU 64(BX), X4
  2131	MOVOU 80(BX), X5
  2132	MOVOU X0, (SP)
  2133	MOVOU X1, 16(SP)
  2134	MOVOU X2, 32(SP)
  2135	MOVOU X3, 48(SP)
  2136	MOVOU X4, 64(SP)
  2137	MOVOU X5, 80(SP)
  2138
  2139	// Store pointer to result
  2140	MOVQ AX, 224(SP)
  2141
  2142	// Begin point double
  2143	MOVQ    64(SP), R10 // load z
  2144	MOVQ    72(SP), R11
  2145	MOVQ    80(SP), R12
  2146	MOVQ    88(SP), R13
  2147	CALL    p256SqrInternal(SB) // z^2
  2148	MOVQ    R10, 160(SP) // store z^2
  2149	MOVQ    R11, 168(SP)
  2150	MOVQ    R12, 176(SP)
  2151	MOVQ    R13, 184(SP)
  2152	MOVQ    (SP), R14 // load x
  2153	MOVQ    8(SP), R15
  2154	MOVQ    16(SP), DI
  2155	MOVQ    24(SP), SI
  2156	XORQ    AX, AX // inline add mod p: R14,R15,DI,SI = z^2 + x
  2157	ADDQ    R14, R10
  2158	ADCQ    R15, R11
  2159	ADCQ    DI, R12
  2160	ADCQ    SI, R13
  2161	ADCQ    $+0, AX // AX = carry out of the addition
  2162	MOVQ    R10, R14
  2163	MOVQ    R11, R15
  2164	MOVQ    R12, DI
  2165	MOVQ    R13, SI
  2166	SUBQ    $-1, R14 // trial subtraction of p
  2167	SBBQ    p256const0<>+0(SB), R15
  2168	SBBQ    $+0, DI
  2169	SBBQ    p256const1<>+0(SB), SI
  2170	SBBQ    $+0, AX
  2171	CMOVQCS R10, R14 // on borrow keep the unsubtracted sum
  2172	CMOVQCS R11, R15
  2173	CMOVQCS R12, DI
  2174	CMOVQCS R13, SI
  2175	MOVQ    R14, 128(SP) // m slot = x + z^2 for now
  2176	MOVQ    R15, 136(SP)
  2177	MOVQ    DI, 144(SP)
  2178	MOVQ    SI, 152(SP)
  2179	MOVQ    64(SP), R10 // load z
  2180	MOVQ    72(SP), R11
  2181	MOVQ    80(SP), R12
  2182	MOVQ    88(SP), R13
  2183	MOVQ    32(SP), R14 // load y
  2184	MOVQ    40(SP), R15
  2185	MOVQ    48(SP), DI
  2186	MOVQ    56(SP), SI
  2187	CALL    p256MulInternal(SB) // y * z
  2188	XORQ    AX, AX // inline doubling mod p: R14,R15,DI,SI = 2 * y * z
  2189	ADDQ    R10, R10
  2190	ADCQ    R11, R11
  2191	ADCQ    R12, R12
  2192	ADCQ    R13, R13
  2193	ADCQ    $+0, AX
  2194	MOVQ    R10, R14
  2195	MOVQ    R11, R15
  2196	MOVQ    R12, DI
  2197	MOVQ    R13, SI
  2198	SUBQ    $-1, R14
  2199	SBBQ    p256const0<>+0(SB), R15
  2200	SBBQ    $+0, DI
  2201	SBBQ    p256const1<>+0(SB), SI
  2202	SBBQ    $+0, AX
  2203	CMOVQCS R10, R14
  2204	CMOVQCS R11, R15
  2205	CMOVQCS R12, DI
  2206	CMOVQCS R13, SI
  2207	MOVQ    224(SP), AX // reload the res pointer
  2208
  2209	// Store z (z' = 2 * y * z) into res, then start computing m
  2210	MOVQ R14, 64(AX)
  2211	MOVQ R15, 72(AX)
  2212	MOVQ DI, 80(AX)
  2213	MOVQ SI, 88(AX)
  2214	MOVQ (SP), R10 // load x
  2215	MOVQ 8(SP), R11
  2216	MOVQ 16(SP), R12
  2217	MOVQ 24(SP), R13
  2218	MOVQ 160(SP), R14 // load z^2
  2219	MOVQ 168(SP), R15
  2220	MOVQ 176(SP), DI
  2221	MOVQ 184(SP), SI
  2222	CALL p256SubInternal(SB) // x - z^2
  2223	MOVQ 128(SP), R14 // load x + z^2
  2224	MOVQ 136(SP), R15
  2225	MOVQ 144(SP), DI
  2226	MOVQ 152(SP), SI
  2227	CALL p256MulInternal(SB) // (x - z^2) * (x + z^2)
  2228	MOVQ R10, 128(SP) // store the product in the m slot
  2229	MOVQ R11, 136(SP)
  2230	MOVQ R12, 144(SP)
  2231	MOVQ R13, 152(SP)
  2232
  2233	// Multiply by 3: double the product mod p, then add the stored product mod p
  2234	XORQ    AX, AX
  2235	ADDQ    R10, R10
  2236	ADCQ    R11, R11
  2237	ADCQ    R12, R12
  2238	ADCQ    R13, R13
  2239	ADCQ    $+0, AX
  2240	MOVQ    R10, R14
  2241	MOVQ    R11, R15
  2242	MOVQ    R12, DI
  2243	MOVQ    R13, SI
  2244	SUBQ    $-1, R14
  2245	SBBQ    p256const0<>+0(SB), R15
  2246	SBBQ    $+0, DI
  2247	SBBQ    p256const1<>+0(SB), SI
  2248	SBBQ    $+0, AX
  2249	CMOVQCS R10, R14
  2250	CMOVQCS R11, R15
  2251	CMOVQCS R12, DI
  2252	CMOVQCS R13, SI
  2253	MOVQ    128(SP), R10 // reload the undoubled product
  2254	MOVQ    136(SP), R11
  2255	MOVQ    144(SP), R12
  2256	MOVQ    152(SP), R13
  2257	XORQ    AX, AX // inline add mod p: product + 2 * product
  2258	ADDQ    R14, R10
  2259	ADCQ    R15, R11
  2260	ADCQ    DI, R12
  2261	ADCQ    SI, R13
  2262	ADCQ    $+0, AX
  2263	MOVQ    R10, R14
  2264	MOVQ    R11, R15
  2265	MOVQ    R12, DI
  2266	MOVQ    R13, SI
  2267	SUBQ    $-1, R14
  2268	SBBQ    p256const0<>+0(SB), R15
  2269	SBBQ    $+0, DI
  2270	SBBQ    p256const1<>+0(SB), SI
  2271	SBBQ    $+0, AX
  2272	CMOVQCS R10, R14
  2273	CMOVQCS R11, R15
  2274	CMOVQCS R12, DI
  2275	CMOVQCS R13, SI
  2276	MOVQ    R14, 128(SP) // store m = 3 * (x - z^2) * (x + z^2)
  2277	MOVQ    R15, 136(SP)
  2278	MOVQ    DI, 144(SP)
  2279	MOVQ    SI, 152(SP)
  2280
  2281	// Double y mod p, then square twice: (2*y)^2 goes to the s slot, (2*y)^4 stays in R10..R13
  2282	MOVQ    32(SP), R10 // load y
  2283	MOVQ    40(SP), R11
  2284	MOVQ    48(SP), R12
  2285	MOVQ    56(SP), R13
  2286	XORQ    AX, AX
  2287	ADDQ    R10, R10
  2288	ADCQ    R11, R11
  2289	ADCQ    R12, R12
  2290	ADCQ    R13, R13
  2291	ADCQ    $+0, AX
  2292	MOVQ    R10, R14
  2293	MOVQ    R11, R15
  2294	MOVQ    R12, DI
  2295	MOVQ    R13, SI
  2296	SUBQ    $-1, R14
  2297	SBBQ    p256const0<>+0(SB), R15
  2298	SBBQ    $+0, DI
  2299	SBBQ    p256const1<>+0(SB), SI
  2300	SBBQ    $+0, AX
  2301	CMOVQCS R10, R14
  2302	CMOVQCS R11, R15
  2303	CMOVQCS R12, DI
  2304	CMOVQCS R13, SI
  2305	MOVQ    R14, R10 // move 2 * y into the squaring input registers
  2306	MOVQ    R15, R11
  2307	MOVQ    DI, R12
  2308	MOVQ    SI, R13
  2309	CALL    p256SqrInternal(SB) // (2 * y)^2 = 4 * y^2
  2310	MOVQ    R10, 96(SP) // s slot = 4 * y^2 for now
  2311	MOVQ    R11, 104(SP)
  2312	MOVQ    R12, 112(SP)
  2313	MOVQ    R13, 120(SP)
  2314	CALL    p256SqrInternal(SB) // 16 * y^4
  2315
  2316	// Divide by 2 mod p: add p first when the value is odd, then shift the 257-bit sum right by one
  2317	XORQ    AX, AX
  2318	MOVQ    R10, R14 // keep the original value in R14,R15,DI,SI
  2319	MOVQ    R11, R15
  2320	MOVQ    R12, DI
  2321	MOVQ    R13, SI
  2322	ADDQ    $-1, R10 // R10..R13 = value + p, carry goes to AX
  2323	ADCQ    p256const0<>+0(SB), R11
  2324	ADCQ    $0x00, R12
  2325	ADCQ    p256const1<>+0(SB), R13
  2326	ADCQ    $0x00, AX
  2327	TESTQ   $0x00000001, R14 // ZF set when the original value is even
  2328	CMOVQEQ R14, R10 // even: use the original value instead of value + p
  2329	CMOVQEQ R15, R11
  2330	CMOVQEQ DI, R12
  2331	CMOVQEQ SI, R13
  2332	ANDQ    R14, AX // keep the carry bit only when the original value was odd
  2333	SHRQ    $0x01, R11, R10 // double-register shifts: each limb takes its top bit from the next one
  2334	SHRQ    $0x01, R12, R11
  2335	SHRQ    $0x01, R13, R12
  2336	SHRQ    $0x01, AX, R13
  2337	MOVQ    R10, 32(SP) // store 8 * y^4 over the y slot
  2338	MOVQ    R11, 40(SP)
  2339	MOVQ    R12, 48(SP)
  2340	MOVQ    R13, 56(SP)
  2341
  2342	// s = x * 4 * y^2, tmp = 2 * s, x' = m^2 - tmp
  2343	MOVQ    (SP), R10 // load x
  2344	MOVQ    8(SP), R11
  2345	MOVQ    16(SP), R12
  2346	MOVQ    24(SP), R13
  2347	MOVQ    96(SP), R14 // load 4 * y^2
  2348	MOVQ    104(SP), R15
  2349	MOVQ    112(SP), DI
  2350	MOVQ    120(SP), SI
  2351	CALL    p256MulInternal(SB) // s = 4 * x * y^2
  2352	MOVQ    R10, 96(SP) // store s
  2353	MOVQ    R11, 104(SP)
  2354	MOVQ    R12, 112(SP)
  2355	MOVQ    R13, 120(SP)
  2356	XORQ    AX, AX // inline doubling mod p: R14,R15,DI,SI = 2 * s
  2357	ADDQ    R10, R10
  2358	ADCQ    R11, R11
  2359	ADCQ    R12, R12
  2360	ADCQ    R13, R13
  2361	ADCQ    $+0, AX
  2362	MOVQ    R10, R14
  2363	MOVQ    R11, R15
  2364	MOVQ    R12, DI
  2365	MOVQ    R13, SI
  2366	SUBQ    $-1, R14
  2367	SBBQ    p256const0<>+0(SB), R15
  2368	SBBQ    $+0, DI
  2369	SBBQ    p256const1<>+0(SB), SI
  2370	SBBQ    $+0, AX
  2371	CMOVQCS R10, R14
  2372	CMOVQCS R11, R15
  2373	CMOVQCS R12, DI
  2374	CMOVQCS R13, SI
  2375	MOVQ    R14, 192(SP) // store tmp = 2 * s
  2376	MOVQ    R15, 200(SP)
  2377	MOVQ    DI, 208(SP)
  2378	MOVQ    SI, 216(SP)
  2379	MOVQ    128(SP), R10 // load m
  2380	MOVQ    136(SP), R11
  2381	MOVQ    144(SP), R12
  2382	MOVQ    152(SP), R13
  2383	CALL    p256SqrInternal(SB) // m^2
  2384	MOVQ    192(SP), R14 // load tmp
  2385	MOVQ    200(SP), R15
  2386	MOVQ    208(SP), DI
  2387	MOVQ    216(SP), SI
  2388	CALL    p256SubInternal(SB) // x' = m^2 - 2 * s
  2389	MOVQ    224(SP), AX // reload the res pointer
  2390
  2391	// Store x into res, then compute y' = m * (s - x') - 8 * y^4
  2392	MOVQ R10, (AX)
  2393	MOVQ R11, 8(AX)
  2394	MOVQ R12, 16(AX)
  2395	MOVQ R13, 24(AX)
  2396	MOVQ R10, R14 // second operand = x'
  2397	MOVQ R11, R15
  2398	MOVQ R12, DI
  2399	MOVQ R13, SI
  2400	MOVQ 96(SP), R10 // load s
  2401	MOVQ 104(SP), R11
  2402	MOVQ 112(SP), R12
  2403	MOVQ 120(SP), R13
  2404	CALL p256SubInternal(SB) // s - x'
  2405	MOVQ 128(SP), R14 // load m
  2406	MOVQ 136(SP), R15
  2407	MOVQ 144(SP), DI
  2408	MOVQ 152(SP), SI
  2409	CALL p256MulInternal(SB) // m * (s - x')
  2410	MOVQ 32(SP), R14 // load 8 * y^4
  2411	MOVQ 40(SP), R15
  2412	MOVQ 48(SP), DI
  2413	MOVQ 56(SP), SI
  2414	CALL p256SubInternal(SB) // y' = m * (s - x') - 8 * y^4
  2415	MOVQ 224(SP), AX // reload the res pointer
  2416
  2417	// Store y into res
  2418	MOVQ R10, 32(AX)
  2419	MOVQ R11, 40(AX)
  2420	MOVQ R12, 48(AX)
  2421	MOVQ R13, 56(AX)
  2422
  2423	// Clear the saved res pointer slot before returning
  2424	MOVQ $0x00000000, 224(SP)
  2425	RET

View as plain text