...

Text file src/crypto/internal/nistec/p256_asm_s390x.s

Documentation: crypto/internal/nistec

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7#include "textflag.h"
     8#include "go_asm.h"
     9
    10DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // 32-bit word loaded into K0 by p256OrdMul; multiplied into each round's low product word to form MK0
      // p256ord<>: 256-bit modulus used by p256OrdMul, most significant
      // doubleword first (bytes 0-15 are loaded as M1, bytes 16-31 as M0).
    11DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
    12DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
    13DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
    14DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
      // p256<>: the P256 modulus followed by byte-selector masks. p256FromMont
      // loads the modulus (offsets 0 and 16) and the selectors at offsets 48
      // and 64.
    15DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    16DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    17DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    18DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    19DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    20DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    21DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    22DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    23DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    24DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    25DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
    26DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
      // p256mul<>: the P256 modulus, selector masks and the constant
      // (1*2^256)%P256. p256NegCond reads the modulus from offsets 0 and 16.
    27DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    28DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    29DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    30DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    31DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    32DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    33DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    34DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    35DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    36DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    37DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    38DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    39DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    40DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    41DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    42DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    43DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    44DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    45DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    46DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
      // All pools are file-local (<>) and placed in read-only data. RODATA is
      // the textflag.h name for the flag value 8.
    47GLOBL p256ordK0<>(SB), RODATA, $4
    48GLOBL p256ord<>(SB), RODATA, $32
    49GLOBL p256<>(SB), RODATA, $96
    50GLOBL p256mul<>(SB), RODATA, $160
    51
    52// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
      // Tail-jumps to p256BigToLittle, which reads res+0(FP) and in+8(FP)
      // itself. That routine reverses the order of the four 8-byte doublewords,
      // an operation that is its own inverse, so it serves both directions.
    53TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
    54	JMP ·p256BigToLittle(SB)
    55
    56// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
      // Tail-jumps to p256BigToLittle, which reads res+0(FP) and in+8(FP)
      // itself and performs the doubleword-order reversal.
    57TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
    58	JMP ·p256BigToLittle(SB)
    59
    60// ---------------------------------------
    61// func p256LittleToBig(res *[32]byte, in *p256Element)
      // Tail-jumps to p256BigToLittle. The doubleword-order reversal done there
      // is its own inverse, so the same code converts in this direction too.
    62TEXT ·p256LittleToBig(SB), NOSPLIT, $0
    63	JMP ·p256BigToLittle(SB)
    64
    65// func p256BigToLittle(res *p256Element, in *[32]byte)
      // Copies the 32 bytes at in to res with the order of the four 8-byte
      // doublewords reversed; bytes inside each doubleword are not reordered.
      // Both loads are issued before either store.
    66#define res_ptr   R1
    67#define in_ptr   R2
    68#define T1L   V2
    69#define T1H   V3
    70
    71TEXT ·p256BigToLittle(SB), NOSPLIT, $0
    72	MOVD res+0(FP), res_ptr
    73	MOVD in+8(FP), in_ptr
    74
    75	VL 0(in_ptr), T1H // input bytes 0-15
    76	VL 16(in_ptr), T1L // input bytes 16-31
    77
      	// VPDI $0x4 with the same register in every operand swaps that
      	// register's two doublewords.
    78	VPDI $0x4, T1L, T1L, T1L
    79	VPDI $0x4, T1H, T1H, T1H
    80
      	// The halves are stored in the opposite order from which they were loaded.
    81	VST T1L, 0(res_ptr)
    82	VST T1H, 16(res_ptr)
    83	RET
    84
    85#undef res_ptr
    86#undef in_ptr
    87#undef T1L
    88#undef T1H
    89
    90// ---------------------------------------
    91// iff cond == 1  val <- -val
      // More precisely: val is left unchanged when cond == 0 and replaced by
      // P - val for any non-zero cond (the mask is built by comparing cond
      // with zero). P is the P256 modulus read from p256mul<>. No further
      // reduction is applied, so val == 0 with cond != 0 stores P itself.
    92// func p256NegCond(val *p256Element, cond int)
    93#define P1ptr   R1
    94#define CPOOL   R4
    95
    96#define Y1L   V0
    97#define Y1H   V1
    98#define T1L   V2
    99#define T1H   V3
   100
   101#define PL    V30
   102#define PH    V31
   103
   104#define ZER   V4
   105#define SEL1  V5
   106#define CAR1  V6
   107TEXT ·p256NegCond(SB), NOSPLIT, $0
   108	MOVD val+0(FP), P1ptr
   109
   110	MOVD $p256mul<>+0x00(SB), CPOOL
   111	VL   16(CPOOL), PL // low 128 bits of P
   112	VL   0(CPOOL), PH // high 128 bits of P
   113
      	// Load val; each 16-byte half has its two doublewords swapped (VPDI $0x4)
      	// on the way in and again on the way out.
   114	VL   16(P1ptr), Y1H
   115	VPDI $0x4, Y1H, Y1H, Y1H
   116	VL   0(P1ptr), Y1L
   117	VPDI $0x4, Y1L, Y1L, Y1L
   118
      	// SEL1 = all ones when cond == 0, all zeros otherwise.
   119	VLREPG cond+8(FP), SEL1
   120	VZERO  ZER
   121	VCEQG  SEL1, ZER, SEL1
   122
      	// T1H||T1L = P - val, computed unconditionally with borrow propagation.
   123	VSCBIQ Y1L, PL, CAR1
   124	VSQ    Y1L, PL, T1L
   125	VSBIQ  PH, Y1H, CAR1, T1H
   126
      	// Keep val where SEL1 is set (cond == 0), otherwise take P - val.
   127	VSEL Y1L, T1L, SEL1, Y1L
   128	VSEL Y1H, T1H, SEL1, Y1H
   129
   130	VPDI $0x4, Y1H, Y1H, Y1H
   131	VST  Y1H, 16(P1ptr)
   132	VPDI $0x4, Y1L, Y1L, Y1L
   133	VST  Y1L, 0(P1ptr)
   134	RET
   135
   136#undef P1ptr
   137#undef CPOOL
   138#undef Y1L
   139#undef Y1H
   140#undef T1L
   141#undef T1H
   142#undef PL
   143#undef PH
   144#undef ZER
   145#undef SEL1
   146#undef CAR1
   147
   148// ---------------------------------------
   149// if cond == 0 res <- b; else res <- a
      // Copies 96 bytes (six 16-byte vectors). Both a and b are always loaded
      // in full and the choice is made with a bit mask (VSEL), not a branch.
   150// func p256MovCond(res, a, b *P256Point, cond int)
   151#define P3ptr   R1
   152#define P1ptr   R2
   153#define P2ptr   R3
   154
   155#define X1L    V0
   156#define X1H    V1
   157#define Y1L    V2
   158#define Y1H    V3
   159#define Z1L    V4
   160#define Z1H    V5
   161#define X2L    V6
   162#define X2H    V7
   163#define Y2L    V8
   164#define Y2H    V9
   165#define Z2L    V10
   166#define Z2H    V11
   167
   168#define ZER   V18
   169#define SEL1  V19
   170TEXT ·p256MovCond(SB), NOSPLIT, $0
   171	MOVD   res+0(FP), P3ptr
   172	MOVD   a+8(FP), P1ptr
   173	MOVD   b+16(FP), P2ptr
      	// SEL1 = all ones when cond == 0, all zeros otherwise.
   174	VLREPG cond+24(FP), SEL1
   175	VZERO  ZER
   176	VCEQG  SEL1, ZER, SEL1
   177
      	// X1*/Y1*/Z1* <- a
   178	VL 0(P1ptr), X1H
   179	VL 16(P1ptr), X1L
   180	VL 32(P1ptr), Y1H
   181	VL 48(P1ptr), Y1L
   182	VL 64(P1ptr), Z1H
   183	VL 80(P1ptr), Z1L
   184
      	// X2*/Y2*/Z2* <- b
   185	VL 0(P2ptr), X2H
   186	VL 16(P2ptr), X2L
   187	VL 32(P2ptr), Y2H
   188	VL 48(P2ptr), Y2L
   189	VL 64(P2ptr), Z2H
   190	VL 80(P2ptr), Z2L
   191
      	// Take b's bits where SEL1 is set (cond == 0), a's bits otherwise.
   192	VSEL X2L, X1L, SEL1, X1L
   193	VSEL X2H, X1H, SEL1, X1H
   194	VSEL Y2L, Y1L, SEL1, Y1L
   195	VSEL Y2H, Y1H, SEL1, Y1H
   196	VSEL Z2L, Z1L, SEL1, Z1L
   197	VSEL Z2H, Z1H, SEL1, Z1H
   198
   199	VST X1H, 0(P3ptr)
   200	VST X1L, 16(P3ptr)
   201	VST Y1H, 32(P3ptr)
   202	VST Y1L, 48(P3ptr)
   203	VST Z1H, 64(P3ptr)
   204	VST Z1L, 80(P3ptr)
   205
   206	RET
   207
   208#undef P3ptr
   209#undef P1ptr
   210#undef P2ptr
   211#undef X1L
   212#undef X1H
   213#undef Y1L
   214#undef Y1H
   215#undef Z1L
   216#undef Z1H
   217#undef X2L
   218#undef X2H
   219#undef Y2L
   220#undef Y2H
   221#undef Z2L
   222#undef Z2H
   223#undef ZER
   224#undef SEL1
   225
   226// ---------------------------------------
   227// Constant time table access
   228// The loop visits 16 consecutive 96-byte entries; entry k-1 is taken when idx == k (k = 1..16)
   229// (idx 0 matches no entry, so res is all zero: implicitly point at infinity)
      // Only the byte at idx+(16+7)(FP) is read, i.e. the last byte of the
      // 8-byte idx argument. Every entry is loaded regardless of idx and merged
      // under a mask, so no branch or address depends on idx.
   230// func p256Select(res *P256Point, table *p256Table, idx int)
   231#define P3ptr   R1
   232#define P1ptr   R2
   233#define COUNT   R4
   234
   235#define X1L    V0
   236#define X1H    V1
   237#define Y1L    V2
   238#define Y1H    V3
   239#define Z1L    V4
   240#define Z1H    V5
   241#define X2L    V6
   242#define X2H    V7
   243#define Y2L    V8
   244#define Y2H    V9
   245#define Z2L    V10
   246#define Z2H    V11
   247
   248#define ONE   V18
   249#define IDX   V19
   250#define SEL1  V20
   251#define SEL2  V21
   252TEXT ·p256Select(SB), NOSPLIT, $0
   253	MOVD   res+0(FP), P3ptr
   254	MOVD   table+8(FP), P1ptr
   255	VLREPB idx+(16+7)(FP), IDX // idx byte replicated into all 16 byte lanes
   256	VREPIB $1, ONE
   257	VREPIB $1, SEL2 // running entry number, starts at 1, same value in every byte lane
   258	MOVD   $1, COUNT
   259
      	// Result accumulator starts as all zero.
   260	VZERO X1H
   261	VZERO X1L
   262	VZERO Y1H
   263	VZERO Y1L
   264	VZERO Z1H
   265	VZERO Z1L
   266
   267loop_select:
   268	VL 0(P1ptr), X2H
   269	VL 16(P1ptr), X2L
   270	VL 32(P1ptr), Y2H
   271	VL 48(P1ptr), Y2L
   272	VL 64(P1ptr), Z2H
   273	VL 80(P1ptr), Z2L
   274
      	// Doubleword compare; both operands hold one byte value replicated, so
      	// SEL1 is all ones exactly when the entry number equals the idx byte.
   275	VCEQG SEL2, IDX, SEL1
   276
   277	VSEL X2L, X1L, SEL1, X1L
   278	VSEL X2H, X1H, SEL1, X1H
   279	VSEL Y2L, Y1L, SEL1, Y1L
   280	VSEL Y2H, Y1H, SEL1, Y1H
   281	VSEL Z2L, Z1L, SEL1, Z1L
   282	VSEL Z2H, Z1H, SEL1, Z1H
   283
   284	VAB  SEL2, ONE, SEL2 // next entry number
   285	ADDW $1, COUNT
   286	ADD  $96, P1ptr // advance to the next 96-byte entry
   287	CMPW COUNT, $17
   288	BLT  loop_select // COUNT runs 1..16
   289
   290	VST X1H, 0(P3ptr)
   291	VST X1L, 16(P3ptr)
   292	VST Y1H, 32(P3ptr)
   293	VST Y1L, 48(P3ptr)
   294	VST Z1H, 64(P3ptr)
   295	VST Z1L, 80(P3ptr)
   296	RET
   297
   298#undef P3ptr
   299#undef P1ptr
   300#undef COUNT
   301#undef X1L
   302#undef X1H
   303#undef Y1L
   304#undef Y1H
   305#undef Z1L
   306#undef Z1H
   307#undef X2L
   308#undef X2H
   309#undef Y2L
   310#undef Y2H
   311#undef Z2L
   312#undef Z2H
   313#undef ONE
   314#undef IDX
   315#undef SEL1
   316#undef SEL2
   317
   318// ---------------------------------------
   319
   320//  func p256FromMont(res, in *p256Element)
      // Runs four identical reduction rounds on the 256-bit value at in and
      // then conditionally subtracts P once, storing 32 bytes at res. Each
      // round shifts the running value T2||T1||T0 right by 64 bits and adds a
      // correction term built from its low words, so four rounds remove 256
      // bits (presumably multiplication by 2^-256 mod P, i.e. leaving the
      // Montgomery domain - TODO confirm against the Go caller).
   321#define res_ptr R1
   322#define x_ptr   R2
   323#define CPOOL   R4
   324
   325#define T0   V0
   326#define T1   V1
   327#define T2   V2
   328#define TT0  V3
   329#define TT1  V4
   330
   331#define ZER   V6
   332#define SEL1  V7
   333#define SEL2  V8
   334#define CAR1  V9
   335#define CAR2  V10
   336#define RED1  V11
   337#define RED2  V12
   338#define PL    V13
   339#define PH    V14
   340
   341TEXT ·p256FromMont(SB), NOSPLIT, $0
   342	MOVD res+0(FP), res_ptr
   343	MOVD in+8(FP), x_ptr
   344
   345	VZERO T2
   346	VZERO ZER
   347	MOVD  $p256<>+0x00(SB), CPOOL
   348	VL    16(CPOOL), PL // low 128 bits of P
   349	VL    0(CPOOL), PH // high 128 bits of P
   350	VL    48(CPOOL), SEL2 // selector "0 d1 d0 0"
   351	VL    64(CPOOL), SEL1 // selector "d1 d0 d1 d0"
   352
      	// Load the input, swapping the doublewords of each 16-byte half.
   353	VL   (0*16)(x_ptr), T0
   354	VPDI $0x4, T0, T0, T0
   355	VL   (1*16)(x_ptr), T1
   356	VPDI $0x4, T1, T1, T1
   357
   358	// First round
   359	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   360	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   361	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   362
      	// Shift T2||T1||T0 right by 8 bytes.
   363	VSLDB $8, T1, T0, T0
   364	VSLDB $8, T2, T1, T1
   365
      	// T2||T1||T0 += RED2||RED1, carry collected in T2.
   366	VACCQ  T0, RED1, CAR1
   367	VAQ    T0, RED1, T0
   368	VACCCQ T1, RED2, CAR1, CAR2
   369	VACQ   T1, RED2, CAR1, T1
   370	VAQ    T2, CAR2, T2
   371
   372	// Second round
   373	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   374	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   375	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   376
   377	VSLDB $8, T1, T0, T0
   378	VSLDB $8, T2, T1, T1
   379
   380	VACCQ  T0, RED1, CAR1
   381	VAQ    T0, RED1, T0
   382	VACCCQ T1, RED2, CAR1, CAR2
   383	VACQ   T1, RED2, CAR1, T1
   384	VAQ    T2, CAR2, T2
   385
   386	// Third round
   387	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   388	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   389	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   390
   391	VSLDB $8, T1, T0, T0
   392	VSLDB $8, T2, T1, T1
   393
   394	VACCQ  T0, RED1, CAR1
   395	VAQ    T0, RED1, T0
   396	VACCCQ T1, RED2, CAR1, CAR2
   397	VACQ   T1, RED2, CAR1, T1
   398	VAQ    T2, CAR2, T2
   399
   400	// Last round
   401	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   402	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   403	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   404
   405	VSLDB $8, T1, T0, T0
   406	VSLDB $8, T2, T1, T1
   407
   408	VACCQ  T0, RED1, CAR1
   409	VAQ    T0, RED1, T0
   410	VACCCQ T1, RED2, CAR1, CAR2
   411	VACQ   T1, RED2, CAR1, T1
   412	VAQ    T2, CAR2, T2
   413
   414	// ---------------------------------------------------
   415
      	// TT1||TT0 = T1||T0 - P; the final borrow is folded into T2, which then
      	// serves as the select mask below.
   416	VSCBIQ  PL, T0, CAR1
   417	VSQ     PL, T0, TT0
   418	VSBCBIQ T1, PH, CAR1, CAR2
   419	VSBIQ   T1, PH, CAR1, TT1
   420	VSBIQ   T2, ZER, CAR2, T2
   421
   422	// what output to use, TT1||TT0 or T1||T0?
   423	VSEL T0, TT0, T2, T0
   424	VSEL T1, TT1, T2, T1
   425
      	// Swap doublewords back and store.
   426	VPDI $0x4, T0, T0, TT0
   427	VST  TT0, (0*16)(res_ptr)
   428	VPDI $0x4, T1, T1, TT1
   429	VST  TT1, (1*16)(res_ptr)
   430	RET
   431
   432#undef res_ptr
   433#undef x_ptr
   434#undef CPOOL
   435#undef T0
   436#undef T1
   437#undef T2
   438#undef TT0
   439#undef TT1
   440#undef ZER
   441#undef SEL1
   442#undef SEL2
   443#undef CAR1
   444#undef CAR2
   445#undef RED1
   446#undef RED2
   447#undef PL
   448#undef PH
   449
   450// Constant time table access
   451// The loop visits 64 consecutive 64-byte entries; entry k-1 is taken when idx == k (k = 1..64)
   452// (idx 0 matches no entry, so res is all zero: implicitly point at infinity)
   453// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   454// (formerly: func p256SelectBase(point *p256Point, table []p256Point, idx int))
      // Only the byte at idx+(16+7)(FP) is read, i.e. the last byte of the
      // 8-byte idx argument. Every entry is loaded regardless of idx and merged
      // under a mask, so no branch or address depends on idx.
      // NOTE(review): the loop reads 64 * 64 = 4096 bytes starting at table;
      // confirm that p256AffineTable (declared outside this file) is that large.
   455
   456#define P3ptr   R1
   457#define P1ptr   R2
   458#define COUNT   R4
   460
   461#define X1L    V0
   462#define X1H    V1
   463#define Y1L    V2
   464#define Y1H    V3
   467#define X2L    V6
   468#define X2H    V7
   469#define Y2L    V8
   470#define Y2H    V9
      // LE2BE is not referenced by this function (the load of the permute
      // mask into it was dead and has been removed). The definition is kept
      // only because it was never #undef'd here, so later code in the file may
      // still see it - TODO confirm and delete.
   473#define LE2BE  V12
   474
   475#define ONE   V18
   476#define IDX   V19
   477#define SEL1  V20
   478#define SEL2  V21
   479
   480TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   481	MOVD   res+0(FP), P3ptr
   482	MOVD   table+8(FP), P1ptr
   484	VLREPB idx+(16+7)(FP), IDX // idx byte replicated into all 16 byte lanes
   485	VREPIB $1, ONE
   486	VREPIB $1, SEL2 // running entry number, starts at 1, same value in every byte lane
   487	MOVD   $1, COUNT
   489
      	// Result accumulator starts as all zero.
   490	VZERO X1H
   491	VZERO X1L
   492	VZERO Y1H
   493	VZERO Y1L
   494
   495loop_select:
   496	VL 0(P1ptr), X2H
   497	VL 16(P1ptr), X2L
   498	VL 32(P1ptr), Y2H
   499	VL 48(P1ptr), Y2L
   500
      	// Doubleword compare; both operands hold one byte value replicated, so
      	// SEL1 is all ones exactly when the entry number equals the idx byte.
   501	VCEQG SEL2, IDX, SEL1
   502
   503	VSEL X2L, X1L, SEL1, X1L
   504	VSEL X2H, X1H, SEL1, X1H
   505	VSEL Y2L, Y1L, SEL1, Y1L
   506	VSEL Y2H, Y1H, SEL1, Y1H
   507
   508	VAB  SEL2, ONE, SEL2 // next entry number
   509	ADDW $1, COUNT
   510	ADD  $64, P1ptr // advance to the next 64-byte entry
   511	CMPW COUNT, $65
   512	BLT  loop_select // COUNT runs 1..64
   513	VST  X1H, 0(P3ptr)
   514	VST  X1L, 16(P3ptr)
   515	VST  Y1H, 32(P3ptr)
   516	VST  Y1L, 48(P3ptr)
   517
   518	RET
   519
   520#undef P3ptr
   521#undef P1ptr
   522#undef COUNT
   523#undef X1L
   524#undef X1H
   525#undef Y1L
   526#undef Y1H
   529#undef X2L
   530#undef X2H
   531#undef Y2L
   532#undef Y2H
   535#undef ONE
   536#undef IDX
   537#undef SEL1
   538#undef SEL2
   540
   541// ---------------------------------------
   542
   543// func p256OrdMul(res, in1, in2 *p256OrdElement)
      // Computes a product of the 256-bit values at in1 and in2 reduced against
      // the modulus in p256ord<> and stores 32 bytes at res. The product is
      // built in eight rounds, one per 32-bit word of in2. Each round adds
      // X*YDIG and M*MK0 to the running value T2||T1||T0 and shifts the sum
      // right by one 32-bit word (presumably word-wise Montgomery
      // multiplication - TODO confirm). A single conditional subtraction of the
      // modulus follows.
      // NOTE(review): the TEXT symbol carries the <> suffix, which makes it
      // file-local; confirm that no Go declaration is expected to bind to it.
   544#define res_ptr R1
   545#define x_ptr R2
   546#define y_ptr R3
   547#define X0    V0
   548#define X1    V1
   549#define Y0    V2
   550#define Y1    V3
   551#define M0    V4
   552#define M1    V5
   553#define T0    V6
   554#define T1    V7
   555#define T2    V8
   556#define YDIG  V9
   557
   558#define ADD1  V16
   559#define ADD1H V17
   560#define ADD2  V18
   561#define ADD2H V19
   562#define RED1  V20
   563#define RED1H V21
   564#define RED2  V22
   565#define RED2H V23
   566#define CAR1  V24
   567#define CAR1M V25
   568
   569#define MK0   V30
   570#define K0    V31
   571TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
   572	MOVD res+0(FP), res_ptr
   573	MOVD in1+8(FP), x_ptr
   574	MOVD in2+16(FP), y_ptr
   575
   576	VZERO T2
   577	MOVD  $p256ordK0<>+0x00(SB), R4
   578
      	// The WORD/BYTE sequence below is the hand-encoded form of the VLEF in
      	// the following comment; it relies on R4 holding the address of
      	// p256ordK0<> and on K0 being V31.
   579	// VLEF    $3, 0(R4), K0
   580	WORD $0xE7F40000
   581	BYTE $0x38
   582	BYTE $0x03
   583	MOVD $p256ord<>+0x00(SB), R4
   584	VL   16(R4), M0 // low 128 bits of the modulus
   585	VL   0(R4), M1 // high 128 bits of the modulus
   586
      	// Load both operands, swapping the doublewords of each 16-byte half.
   587	VL   (0*16)(x_ptr), X0
   588	VPDI $0x4, X0, X0, X0
   589	VL   (1*16)(x_ptr), X1
   590	VPDI $0x4, X1, X1, X1
   591	VL   (0*16)(y_ptr), Y0
   592	VPDI $0x4, Y0, Y0, Y0
   593	VL   (1*16)(y_ptr), Y1
   594	VPDI $0x4, Y1, Y1, Y1
   595
   596	// ---------------------------------------------------------------------------/
      	// Round 1 (word 3 of Y0). T0/T1 are not yet initialized, so this round
      	// uses the non-accumulating VMLF/VMLHF forms for X*YDIG.
   597	VREPF $3, Y0, YDIG
   598	VMLF  X0, YDIG, ADD1
   599	VMLF  ADD1, K0, MK0
   600	VREPF $3, MK0, MK0
   601
   602	VMLF  X1, YDIG, ADD2
   603	VMLHF X0, YDIG, ADD1H
   604	VMLHF X1, YDIG, ADD2H
   605
   606	VMALF  M0, MK0, ADD1, RED1
   607	VMALHF M0, MK0, ADD1, RED1H
   608	VMALF  M1, MK0, ADD2, RED2
   609	VMALHF M1, MK0, ADD2, RED2H
   610
      	// Shift T2||RED2||RED1 right by 4 bytes (one 32-bit word).
   611	VSLDB $12, RED2, RED1, RED1
   612	VSLDB $12, T2, RED2, RED2
   613
   614	VACCQ RED1, ADD1H, CAR1
   615	VAQ   RED1, ADD1H, T0
   616	VACCQ RED1H, T0, CAR1M
   617	VAQ   RED1H, T0, T0
   618
   619	// << ready for next MK0
   620
   621	VACQ   RED2, ADD2H, CAR1, T1
   622	VACCCQ RED2, ADD2H, CAR1, CAR1
   623	VACCCQ RED2H, T1, CAR1M, T2
   624	VACQ   RED2H, T1, CAR1M, T1
   625	VAQ    CAR1, T2, T2
   626
   627	// ---------------------------------------------------
   628/* *
   629 * ---+--------+--------+
   630 *  T2|   T1   |   T0   |
   631 * ---+--------+--------+
   632 *           *(add)*
   633 *    +--------+--------+
   634 *    |   X1   |   X0   |
   635 *    +--------+--------+
   636 *           *(mul)*
   637 *    +--------+--------+
   638 *    |  YDIG  |  YDIG  |
   639 *    +--------+--------+
   640 *           *(add)*
   641 *    +--------+--------+
   642 *    |   M1   |   M0   |
   643 *    +--------+--------+
   644 *           *(mul)*
   645 *    +--------+--------+
   646 *    |   MK0  |   MK0  |
   647 *    +--------+--------+
   648 *
   649 *   ---------------------
   650 *
   651 *    +--------+--------+
   652 *    |  ADD2  |  ADD1  |
   653 *    +--------+--------+
   654 *  +--------+--------+
   655 *  | ADD2H  | ADD1H  |
   656 *  +--------+--------+
   657 *    +--------+--------+
   658 *    |  RED2  |  RED1  |
   659 *    +--------+--------+
   660 *  +--------+--------+
   661 *  | RED2H  | RED1H  |
   662 *  +--------+--------+
   663 */
      	// Round 2 (word 2 of Y0). From here on X*YDIG is accumulated onto T1||T0
      	// with VMALF/VMALHF; the remaining rounds differ only in the word chosen.
   664	VREPF $2, Y0, YDIG
   665	VMALF X0, YDIG, T0, ADD1
   666	VMLF  ADD1, K0, MK0
   667	VREPF $3, MK0, MK0
   668
   669	VMALF  X1, YDIG, T1, ADD2
   670	VMALHF X0, YDIG, T0, ADD1H
   671	VMALHF X1, YDIG, T1, ADD2H
   672
   673	VMALF  M0, MK0, ADD1, RED1
   674	VMALHF M0, MK0, ADD1, RED1H
   675	VMALF  M1, MK0, ADD2, RED2
   676	VMALHF M1, MK0, ADD2, RED2H
   677
   678	VSLDB $12, RED2, RED1, RED1
   679	VSLDB $12, T2, RED2, RED2
   680
   681	VACCQ RED1, ADD1H, CAR1
   682	VAQ   RED1, ADD1H, T0
   683	VACCQ RED1H, T0, CAR1M
   684	VAQ   RED1H, T0, T0
   685
   686	// << ready for next MK0
   687
   688	VACQ   RED2, ADD2H, CAR1, T1
   689	VACCCQ RED2, ADD2H, CAR1, CAR1
   690	VACCCQ RED2H, T1, CAR1M, T2
   691	VACQ   RED2H, T1, CAR1M, T1
   692	VAQ    CAR1, T2, T2
   693
   694	// ---------------------------------------------------
      	// Round 3 (word 1 of Y0).
   695	VREPF $1, Y0, YDIG
   696	VMALF X0, YDIG, T0, ADD1
   697	VMLF  ADD1, K0, MK0
   698	VREPF $3, MK0, MK0
   699
   700	VMALF  X1, YDIG, T1, ADD2
   701	VMALHF X0, YDIG, T0, ADD1H
   702	VMALHF X1, YDIG, T1, ADD2H
   703
   704	VMALF  M0, MK0, ADD1, RED1
   705	VMALHF M0, MK0, ADD1, RED1H
   706	VMALF  M1, MK0, ADD2, RED2
   707	VMALHF M1, MK0, ADD2, RED2H
   708
   709	VSLDB $12, RED2, RED1, RED1
   710	VSLDB $12, T2, RED2, RED2
   711
   712	VACCQ RED1, ADD1H, CAR1
   713	VAQ   RED1, ADD1H, T0
   714	VACCQ RED1H, T0, CAR1M
   715	VAQ   RED1H, T0, T0
   716
   717	// << ready for next MK0
   718
   719	VACQ   RED2, ADD2H, CAR1, T1
   720	VACCCQ RED2, ADD2H, CAR1, CAR1
   721	VACCCQ RED2H, T1, CAR1M, T2
   722	VACQ   RED2H, T1, CAR1M, T1
   723	VAQ    CAR1, T2, T2
   724
   725	// ---------------------------------------------------
      	// Round 4 (word 0 of Y0).
   726	VREPF $0, Y0, YDIG
   727	VMALF X0, YDIG, T0, ADD1
   728	VMLF  ADD1, K0, MK0
   729	VREPF $3, MK0, MK0
   730
   731	VMALF  X1, YDIG, T1, ADD2
   732	VMALHF X0, YDIG, T0, ADD1H
   733	VMALHF X1, YDIG, T1, ADD2H
   734
   735	VMALF  M0, MK0, ADD1, RED1
   736	VMALHF M0, MK0, ADD1, RED1H
   737	VMALF  M1, MK0, ADD2, RED2
   738	VMALHF M1, MK0, ADD2, RED2H
   739
   740	VSLDB $12, RED2, RED1, RED1
   741	VSLDB $12, T2, RED2, RED2
   742
   743	VACCQ RED1, ADD1H, CAR1
   744	VAQ   RED1, ADD1H, T0
   745	VACCQ RED1H, T0, CAR1M
   746	VAQ   RED1H, T0, T0
   747
   748	// << ready for next MK0
   749
   750	VACQ   RED2, ADD2H, CAR1, T1
   751	VACCCQ RED2, ADD2H, CAR1, CAR1
   752	VACCCQ RED2H, T1, CAR1M, T2
   753	VACQ   RED2H, T1, CAR1M, T1
   754	VAQ    CAR1, T2, T2
   755
   756	// ---------------------------------------------------
      	// Round 5 (word 3 of Y1).
   757	VREPF $3, Y1, YDIG
   758	VMALF X0, YDIG, T0, ADD1
   759	VMLF  ADD1, K0, MK0
   760	VREPF $3, MK0, MK0
   761
   762	VMALF  X1, YDIG, T1, ADD2
   763	VMALHF X0, YDIG, T0, ADD1H
   764	VMALHF X1, YDIG, T1, ADD2H
   765
   766	VMALF  M0, MK0, ADD1, RED1
   767	VMALHF M0, MK0, ADD1, RED1H
   768	VMALF  M1, MK0, ADD2, RED2
   769	VMALHF M1, MK0, ADD2, RED2H
   770
   771	VSLDB $12, RED2, RED1, RED1
   772	VSLDB $12, T2, RED2, RED2
   773
   774	VACCQ RED1, ADD1H, CAR1
   775	VAQ   RED1, ADD1H, T0
   776	VACCQ RED1H, T0, CAR1M
   777	VAQ   RED1H, T0, T0
   778
   779	// << ready for next MK0
   780
   781	VACQ   RED2, ADD2H, CAR1, T1
   782	VACCCQ RED2, ADD2H, CAR1, CAR1
   783	VACCCQ RED2H, T1, CAR1M, T2
   784	VACQ   RED2H, T1, CAR1M, T1
   785	VAQ    CAR1, T2, T2
   786
   787	// ---------------------------------------------------
      	// Round 6 (word 2 of Y1).
   788	VREPF $2, Y1, YDIG
   789	VMALF X0, YDIG, T0, ADD1
   790	VMLF  ADD1, K0, MK0
   791	VREPF $3, MK0, MK0
   792
   793	VMALF  X1, YDIG, T1, ADD2
   794	VMALHF X0, YDIG, T0, ADD1H
   795	VMALHF X1, YDIG, T1, ADD2H
   796
   797	VMALF  M0, MK0, ADD1, RED1
   798	VMALHF M0, MK0, ADD1, RED1H
   799	VMALF  M1, MK0, ADD2, RED2
   800	VMALHF M1, MK0, ADD2, RED2H
   801
   802	VSLDB $12, RED2, RED1, RED1
   803	VSLDB $12, T2, RED2, RED2
   804
   805	VACCQ RED1, ADD1H, CAR1
   806	VAQ   RED1, ADD1H, T0
   807	VACCQ RED1H, T0, CAR1M
   808	VAQ   RED1H, T0, T0
   809
   810	// << ready for next MK0
   811
   812	VACQ   RED2, ADD2H, CAR1, T1
   813	VACCCQ RED2, ADD2H, CAR1, CAR1
   814	VACCCQ RED2H, T1, CAR1M, T2
   815	VACQ   RED2H, T1, CAR1M, T1
   816	VAQ    CAR1, T2, T2
   817
   818	// ---------------------------------------------------
      	// Round 7 (word 1 of Y1).
   819	VREPF $1, Y1, YDIG
   820	VMALF X0, YDIG, T0, ADD1
   821	VMLF  ADD1, K0, MK0
   822	VREPF $3, MK0, MK0
   823
   824	VMALF  X1, YDIG, T1, ADD2
   825	VMALHF X0, YDIG, T0, ADD1H
   826	VMALHF X1, YDIG, T1, ADD2H
   827
   828	VMALF  M0, MK0, ADD1, RED1
   829	VMALHF M0, MK0, ADD1, RED1H
   830	VMALF  M1, MK0, ADD2, RED2
   831	VMALHF M1, MK0, ADD2, RED2H
   832
   833	VSLDB $12, RED2, RED1, RED1
   834	VSLDB $12, T2, RED2, RED2
   835
   836	VACCQ RED1, ADD1H, CAR1
   837	VAQ   RED1, ADD1H, T0
   838	VACCQ RED1H, T0, CAR1M
   839	VAQ   RED1H, T0, T0
   840
   841	// << ready for next MK0
   842
   843	VACQ   RED2, ADD2H, CAR1, T1
   844	VACCCQ RED2, ADD2H, CAR1, CAR1
   845	VACCCQ RED2H, T1, CAR1M, T2
   846	VACQ   RED2H, T1, CAR1M, T1
   847	VAQ    CAR1, T2, T2
   848
   849	// ---------------------------------------------------
      	// Round 8 (word 0 of Y1).
   850	VREPF $0, Y1, YDIG
   851	VMALF X0, YDIG, T0, ADD1
   852	VMLF  ADD1, K0, MK0
   853	VREPF $3, MK0, MK0
   854
   855	VMALF  X1, YDIG, T1, ADD2
   856	VMALHF X0, YDIG, T0, ADD1H
   857	VMALHF X1, YDIG, T1, ADD2H
   858
   859	VMALF  M0, MK0, ADD1, RED1
   860	VMALHF M0, MK0, ADD1, RED1H
   861	VMALF  M1, MK0, ADD2, RED2
   862	VMALHF M1, MK0, ADD2, RED2H
   863
   864	VSLDB $12, RED2, RED1, RED1
   865	VSLDB $12, T2, RED2, RED2
   866
   867	VACCQ RED1, ADD1H, CAR1
   868	VAQ   RED1, ADD1H, T0
   869	VACCQ RED1H, T0, CAR1M
   870	VAQ   RED1H, T0, T0
   871
   872	// << ready for next MK0
   873
   874	VACQ   RED2, ADD2H, CAR1, T1
   875	VACCCQ RED2, ADD2H, CAR1, CAR1
   876	VACCCQ RED2H, T1, CAR1M, T2
   877	VACQ   RED2H, T1, CAR1M, T1
   878	VAQ    CAR1, T2, T2
   879
   880	// ---------------------------------------------------
   881
      	// ADD2||ADD1 = T1||T0 - modulus; the final borrow is folded into T2,
      	// which then serves as the select mask below. RED1 is reused as zero.
   882	VZERO   RED1
   883	VSCBIQ  M0, T0, CAR1
   884	VSQ     M0, T0, ADD1
   885	VSBCBIQ T1, M1, CAR1, CAR1M
   886	VSBIQ   T1, M1, CAR1, ADD2
   887	VSBIQ   T2, RED1, CAR1M, T2
   888
   889	// what output to use, ADD2||ADD1 or T1||T0?
   890	VSEL T0, ADD1, T2, T0
   891	VSEL T1, ADD2, T2, T1
   892
      	// Swap doublewords back and store.
   893	VPDI $0x4, T0, T0, T0
   894	VST  T0, (0*16)(res_ptr)
   895	VPDI $0x4, T1, T1, T1
   896	VST  T1, (1*16)(res_ptr)
   897	RET
   898
   899#undef res_ptr
   900#undef x_ptr
   901#undef y_ptr
   902#undef X0
   903#undef X1
   904#undef Y0
   905#undef Y1
   906#undef M0
   907#undef M1
   908#undef T0
   909#undef T1
   910#undef T2
   911#undef YDIG
   912
   913#undef ADD1
   914#undef ADD1H
   915#undef ADD2
   916#undef ADD2H
   917#undef RED1
   918#undef RED1H
   919#undef RED2
   920#undef RED2H
   921#undef CAR1
   922#undef CAR1M
   923
   924#undef MK0
   925#undef K0
   926
   927// ---------------------------------------
   928// p256MulInternal
   929// V0-V3,V30,V31 - Not Modified
   930// V4-V15 - Volatile
   931
   932#define CPOOL   R4
   933
   934// Parameters
   935#define X0    V0 // Not modified
   936#define X1    V1 // Not modified
   937#define Y0    V2 // Not modified
   938#define Y1    V3 // Not modified
   939#define T0    V4
   940#define T1    V5
   941#define P0    V30 // Not modified
   942#define P1    V31 // Not modified
   943
   944// Temporaries
   945#define YDIG  V6 // Overloaded with CAR2, ZER
   946#define ADD1H V7 // Overloaded with ADD3H
   947#define ADD2H V8 // Overloaded with ADD4H
   948#define ADD3  V9 // Overloaded with SEL2,SEL5
   949#define ADD4  V10 // Overloaded with SEL3,SEL6
   950#define RED1  V11 // Overloaded with CAR2
   951#define RED2  V12
   952#define RED3  V13 // Overloaded with SEL1
   953#define T2    V14
   954// Overloaded temporaries
   955#define ADD1  V4 // Overloaded with T0
   956#define ADD2  V5 // Overloaded with T1
   957#define ADD3H V7 // Overloaded with ADD1H
   958#define ADD4H V8 // Overloaded with ADD2H
   959#define ZER   V6 // Overloaded with YDIG, CAR2
   960#define CAR1  V6 // Overloaded with YDIG, ZER
   961#define CAR2  V11 // Overloaded with RED1
   962// Constant Selects
   963#define SEL1  V13 // Overloaded with RED3
   964#define SEL2  V9 // Overloaded with ADD3,SEL5
   965#define SEL3  V10 // Overloaded with ADD4,SEL6
   966#define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   967#define SEL5  V9 // Overloaded with ADD3,SEL2
   968#define SEL6  V10 // Overloaded with ADD4,SEL3
   969
   970/* *
   971 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   972 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   973 * With you, SIMD be...
   974 *
   975 *                                           +--------+--------+
   976 *                                  +--------|  RED2  |  RED1  |
   977 *                                  |        +--------+--------+
   978 *                                  |       ---+--------+--------+
   979 *                                  |  +---- T2|   T1   |   T0   |--+
   980 *                                  |  |    ---+--------+--------+  |
   981 *                                  |  |                            |
   982 *                                  |  |    ======================= |
   983 *                                  |  |                            |
   984 *                                  |  |       +--------+--------+<-+
   985 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   986 *                                  |  |       +--------+--------+  |     |
   987 *                                  |  |     +--------+--------+<---+     |
   988 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   989 *                                  |  |     +--------+--------+  |       |
   990 *                                  |  |     +--------+--------+<-+       |
   991 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   992 *                                  |  |     +--------+--------+  | |     |
   993 *                                  |  |   +--------+--------+<---+ |     |
   994 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   995 *                                  |  |   +--------+--------+      | |   V
   996 *                                  |  | ------------------------   | | +--------+
   997 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   998 *                                  |  |                            | | +--------+
   999 *                                  |  +---->+--------+--------+    | |   |
  1000 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
  1001 *                                  |        +--------+--------+    | |   |
  1002 *                                  +---->---+--------+--------+    | |   |
  1003 *                                         T2|   T1   |   T0   |----+ |   |
  1004 *                                        ---+--------+--------+    | |   |
  1005 *                                        ---+--------+--------+<---+ |   |
  1006 *                                    +--- T2|   T1   |   T0   |----------+
  1007 *                                    |   ---+--------+--------+      |   |
  1008 *                                    |  +--------+--------+<-------------+
  1009 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1010 *                                    |  +--------+--------+     |    |   |
  1011 *                                    |  +--------+<----------------------+
  1012 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1013 *                                    |  +--------+              |    |
  1014 *                                    +--->+--------+--------+   |    |
  1015 *                                         |   T1   |   T0   |--------+
  1016 *                                         +--------+--------+   |    |
  1017 *                                   --------------------------- |    |
  1018 *                                                               |    |
  1019 *                                       +--------+--------+<----+    |
  1020 *                                       |  RED2  |  RED1  |          |
  1021 *                                       +--------+--------+          |
  1022 *                                      ---+--------+--------+<-------+
  1023 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1024 *                                      ---+--------+--------+
  1025 *
  1026 *                                                                *Mi obra de arte de siglo XXI @vpaprots
  1027 *
  1028 *
  1029 * First group is special, doesn't get the two inputs:
  1030 *                                             +--------+--------+<-+
  1031 *                                     +-------|  ADD2  |  ADD1  |--|-----+
  1032 *                                     |       +--------+--------+  |     |
  1033 *                                     |     +--------+--------+<---+     |
  1034 *                                     |     | ADD2H  | ADD1H  |--+       |
  1035 *                                     |     +--------+--------+  |       |
  1036 *                                     |     +--------+--------+<-+       |
  1037 *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1038 *                                     |     +--------+--------+  | |     |
  1039 *                                     |   +--------+--------+<---+ |     |
  1040 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1041 *                                     |   +--------+--------+      | |   V
  1042 *                                     | ------------------------   | | +--------+
  1043 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1044 *                                     |                            | | +--------+
  1045 *                                     +---->+--------+--------+    | |   |
  1046 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1047 *                                           +--------+--------+    | |   |
  1048 *                                        ---+--------+--------+<---+ |   |
  1049 *                                    +--- T2|   T1   |   T0   |----------+
  1050 *                                    |   ---+--------+--------+      |   |
  1051 *                                    |  +--------+--------+<-------------+
  1052 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1053 *                                    |  +--------+--------+     |    |   |
  1054 *                                    |  +--------+<----------------------+
  1055 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1056 *                                    |  +--------+              |    |
  1057 *                                    +--->+--------+--------+   |    |
  1058 *                                         |   T1   |   T0   |--------+
  1059 *                                         +--------+--------+   |    |
  1060 *                                   --------------------------- |    |
  1061 *                                                               |    |
  1062 *                                       +--------+--------+<----+    |
  1063 *                                       |  RED2  |  RED1  |          |
  1064 *                                       +--------+--------+          |
  1065 *                                      ---+--------+--------+<-------+
  1066 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1067 *                                      ---+--------+--------+
  1068 *
   1069 * The last 'group' needs RED2||RED1 shifted less
  1070 */
   1071TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
	// Field multiplication helper shared by the exported routines in this file.
	//   In:  X1||X0 and Y1||Y0 (operands), P1||P0 (modulus). CPOOL must point
	//        at the p256mul<> constant pool; the SEL permute masks are loaded
	//        from offsets 32..112 of it.
	//   Out: T1||T0.
	// Each of the four sections separated by dashed lines consumes two 32-bit
	// elements of Y (Y0[3],Y0[2] / Y0[1],Y0[0] / Y1[3],Y1[2] / Y1[1],Y1[0]),
	// accumulates the partial products with X, and folds in RED terms built
	// from the low words of the accumulator. A final conditional subtraction
	// of P picks the output.
	// NOTE(review): the RED steps look like a word-wise Montgomery reduction
	// (result = X*Y*2^-256 mod P) -- confirm against the file-level docs.
	// NOTE(review): the "Free"/"Free->" remarks below suggest that several
	// symbolic names share physical registers; the statement order and the
	// repeated VL reloads of the SEL masks presumably depend on that -- check
	// the register #defines before reordering anything.
	// NOTE(review): SEL1..SEL4 are loaded again right before their first use
	// below; confirm whether these four initial loads are still needed.
   1072	VL 32(CPOOL), SEL1
   1073	VL 48(CPOOL), SEL2
   1074	VL 64(CPOOL), SEL3
   1075	VL 80(CPOOL), SEL4
   1076
   1077	// ---------------------------------------------------
   1078
	// Group 1: elements 3 and 2 of Y0. There is no accumulator yet, so the
	// first digit uses plain multiplies (VMLF/VMLHF) and no RED terms are added.
   1079	VREPF $3, Y0, YDIG
   1080	VMLHF X0, YDIG, ADD1H
   1081	VMLHF X1, YDIG, ADD2H
   1082	VMLF  X0, YDIG, ADD1
   1083	VMLF  X1, YDIG, ADD2
   1084
   1085	VREPF  $2, Y0, YDIG
   1086	VMALF  X0, YDIG, ADD1H, ADD3
   1087	VMALF  X1, YDIG, ADD2H, ADD4
   1088	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1089	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1090
   1091	VZERO ZER
   1092	VL    32(CPOOL), SEL1
   1093	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1094
	// Shift ADD2||ADD1 right by one 32-bit word (zero-filled from ZER).
   1095	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1096	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
   1097
	// T2||T1||T0 += ADD4||ADD3, with T2 receiving the carry out.
   1098	VACCQ  T0, ADD3, CAR1
   1099	VAQ    T0, ADD3, T0       // ADD3 Free
   1100	VACCCQ T1, ADD4, CAR1, T2
   1101	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
   1102
	// Build the reduction terms for this group from RED3 and the low words of T0.
   1103	VL    48(CPOOL), SEL2
   1104	VL    64(CPOOL), SEL3
   1105	VL    80(CPOOL), SEL4
   1106	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1107	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1108	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1109	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1110
	// Shift the accumulator T2||T1||T0 right by one 32-bit word.
   1111	VSLDB $12, T1, T0, T0
   1112	VSLDB $12, T2, T1, T1
   1113
	// T2||T1||T0 = (T1||T0) + (ADD4H||ADD3H), T2 = carry out.
   1114	VACCQ  T0, ADD3H, CAR1
   1115	VAQ    T0, ADD3H, T0
   1116	VACCCQ T1, ADD4H, CAR1, T2
   1117	VACQ   T1, ADD4H, CAR1, T1
   1118
   1119	// ---------------------------------------------------
   1120
	// Group 2: elements 1 and 0 of Y0. The multiply-adds take the previous
	// accumulator T1||T0 as addend; RED2||RED1 from group 1 is added below.
   1121	VREPF  $1, Y0, YDIG
   1122	VMALHF X0, YDIG, T0, ADD1H
   1123	VMALHF X1, YDIG, T1, ADD2H
   1124	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   1125	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   1126
   1127	VREPF  $0, Y0, YDIG
   1128	VMALF  X0, YDIG, ADD1H, ADD3
   1129	VMALF  X1, YDIG, ADD2H, ADD4
   1130	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   1131	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   1132
   1133	VZERO ZER
   1134	VL    32(CPOOL), SEL1
   1135	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1136
   1137	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
   1138	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
   1139
	// Add the reduction terms of the previous group; T2 = carry out.
   1140	VACCQ  T0, RED1, CAR1
   1141	VAQ    T0, RED1, T0
   1142	VACCCQ T1, RED2, CAR1, T2
   1143	VACQ   T1, RED2, CAR1, T1
   1144
	// Add ADD4||ADD3; its carry out (CAR2) is accumulated into T2.
   1145	VACCQ  T0, ADD3, CAR1
   1146	VAQ    T0, ADD3, T0
   1147	VACCCQ T1, ADD4, CAR1, CAR2
   1148	VACQ   T1, ADD4, CAR1, T1
   1149	VAQ    T2, CAR2, T2
   1150
   1151	VL    48(CPOOL), SEL2
   1152	VL    64(CPOOL), SEL3
   1153	VL    80(CPOOL), SEL4
   1154	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1155	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1156	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1157	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1158
   1159	VSLDB $12, T1, T0, T0
   1160	VSLDB $12, T2, T1, T1
   1161
   1162	VACCQ  T0, ADD3H, CAR1
   1163	VAQ    T0, ADD3H, T0
   1164	VACCCQ T1, ADD4H, CAR1, T2
   1165	VACQ   T1, ADD4H, CAR1, T1
   1166
   1167	// ---------------------------------------------------
   1168
	// Group 3: elements 3 and 2 of Y1; same instruction sequence as group 2.
   1169	VREPF  $3, Y1, YDIG
   1170	VMALHF X0, YDIG, T0, ADD1H
   1171	VMALHF X1, YDIG, T1, ADD2H
   1172	VMALF  X0, YDIG, T0, ADD1
   1173	VMALF  X1, YDIG, T1, ADD2
   1174
   1175	VREPF  $2, Y1, YDIG
   1176	VMALF  X0, YDIG, ADD1H, ADD3
   1177	VMALF  X1, YDIG, ADD2H, ADD4
   1178	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1179	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1180
   1181	VZERO ZER
   1182	VL    32(CPOOL), SEL1
   1183	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1184
   1185	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1186	VSLDB $12, T2, ADD2, T1   // ADD2 Free
   1187
   1188	VACCQ  T0, RED1, CAR1
   1189	VAQ    T0, RED1, T0
   1190	VACCCQ T1, RED2, CAR1, T2
   1191	VACQ   T1, RED2, CAR1, T1
   1192
   1193	VACCQ  T0, ADD3, CAR1
   1194	VAQ    T0, ADD3, T0
   1195	VACCCQ T1, ADD4, CAR1, CAR2
   1196	VACQ   T1, ADD4, CAR1, T1
   1197	VAQ    T2, CAR2, T2
   1198
   1199	VL    48(CPOOL), SEL2
   1200	VL    64(CPOOL), SEL3
   1201	VL    80(CPOOL), SEL4
   1202	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1203	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1204	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1205	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1206
   1207	VSLDB $12, T1, T0, T0
   1208	VSLDB $12, T2, T1, T1
   1209
   1210	VACCQ  T0, ADD3H, CAR1
   1211	VAQ    T0, ADD3H, T0
   1212	VACCCQ T1, ADD4H, CAR1, T2
   1213	VACQ   T1, ADD4H, CAR1, T1
   1214
   1215	// ---------------------------------------------------
   1216
	// Group 4: elements 1 and 0 of Y1. This last group uses the SEL5/SEL6
	// masks and adds its own RED2||RED1 directly after the final shift.
   1217	VREPF  $1, Y1, YDIG
   1218	VMALHF X0, YDIG, T0, ADD1H
   1219	VMALHF X1, YDIG, T1, ADD2H
   1220	VMALF  X0, YDIG, T0, ADD1
   1221	VMALF  X1, YDIG, T1, ADD2
   1222
   1223	VREPF  $0, Y1, YDIG
   1224	VMALF  X0, YDIG, ADD1H, ADD3
   1225	VMALF  X1, YDIG, ADD2H, ADD4
   1226	VMALHF X0, YDIG, ADD1H, ADD3H
   1227	VMALHF X1, YDIG, ADD2H, ADD4H
   1228
   1229	VZERO ZER
   1230	VL    32(CPOOL), SEL1
   1231	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1232
   1233	VSLDB $12, ADD2, ADD1, T0
   1234	VSLDB $12, T2, ADD2, T1
   1235
   1236	VACCQ  T0, RED1, CAR1
   1237	VAQ    T0, RED1, T0
   1238	VACCCQ T1, RED2, CAR1, T2
   1239	VACQ   T1, RED2, CAR1, T1
   1240
   1241	VACCQ  T0, ADD3, CAR1
   1242	VAQ    T0, ADD3, T0
   1243	VACCCQ T1, ADD4, CAR1, CAR2
   1244	VACQ   T1, ADD4, CAR1, T1
   1245	VAQ    T2, CAR2, T2
   1246
   1247	VL    96(CPOOL), SEL5
   1248	VL    112(CPOOL), SEL6
   1249	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   1250	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   1251	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
   1252
   1253	VSLDB $12, T1, T0, T0
   1254	VSLDB $12, T2, T1, T1
   1255
   1256	VACCQ  T0, ADD3H, CAR1
   1257	VAQ    T0, ADD3H, T0
   1258	VACCCQ T1, ADD4H, CAR1, T2
   1259	VACQ   T1, ADD4H, CAR1, T1
   1260
   1261	VACCQ  T0, RED1, CAR1
   1262	VAQ    T0, RED1, T0
   1263	VACCCQ T1, RED2, CAR1, CAR2
   1264	VACQ   T1, RED2, CAR1, T1
   1265	VAQ    T2, CAR2, T2
   1266
   1267	// ---------------------------------------------------
   1268
	// Final conditional subtraction: ADD2H||ADD1H = T1||T0 - P1||P0, with the
	// borrow propagated into T2 (RED3 is used as a zero operand). T2 then
	// serves as the VSEL mask that chooses between the two candidates.
   1269	VZERO   RED3
   1270	VSCBIQ  P0, T0, CAR1
   1271	VSQ     P0, T0, ADD1H
   1272	VSBCBIQ T1, P1, CAR1, CAR2
   1273	VSBIQ   T1, P1, CAR1, ADD2H
   1274	VSBIQ   T2, RED3, CAR2, T2
   1275
   1276	// what output to use, ADD2H||ADD1H or T1||T0?
   1277	VSEL T0, ADD1H, T2, T0
   1278	VSEL T1, ADD2H, T2, T1
   1279	RET
  1280
  1281#undef CPOOL
  1282
  1283#undef X0
  1284#undef X1
  1285#undef Y0
  1286#undef Y1
  1287#undef T0
  1288#undef T1
  1289#undef P0
  1290#undef P1
  1291
  1292#undef SEL1
  1293#undef SEL2
  1294#undef SEL3
  1295#undef SEL4
  1296#undef SEL5
  1297#undef SEL6
  1298
  1299#undef YDIG
  1300#undef ADD1H
  1301#undef ADD2H
  1302#undef ADD3
  1303#undef ADD4
  1304#undef RED1
  1305#undef RED2
  1306#undef RED3
  1307#undef T2
  1308#undef ADD1
  1309#undef ADD2
  1310#undef ADD3H
  1311#undef ADD4H
  1312#undef ZER
  1313#undef CAR1
  1314#undef CAR2
  1315
  1316// ---------------------------------------
  1317
  1318// Parameters
  1319#define X0    V0
  1320#define X1    V1
  1321#define Y0    V2
  1322#define Y1    V3
  1323
   1324TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	// Squaring helper: copies X1||X0 into Y1||Y0 and tail-branches to
	// p256MulInternal, so the result (T1||T0) and the clobbered registers are
	// those of p256MulInternal. Because this is a BR rather than a CALL and
	// the function is NOFRAME, p256MulInternal's RET returns to our caller.
   1325	VLR X0, Y0
   1326	VLR X1, Y1
   1327	BR  p256MulInternal<>(SB)
  1328
  1329#undef X0
  1330#undef X1
  1331#undef Y0
  1332#undef Y1
  1333
   1334#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	/* T1||T0 = (X1||X0) - (Y1||Y0), with PH||PL added back when the subtraction borrows. */ \
	/* Clobbers ZER, CAR1, SEL1, TT0, TT1. Call sites pass outputs that alias inputs, so keep the statement order. */ \
   1335	VZERO   ZER                \
   1336	VSCBIQ  Y0, X0, CAR1       \
   1337	VSQ     Y0, X0, T0         \
   1338	VSBCBIQ X1, Y1, CAR1, SEL1 \
   1339	VSBIQ   X1, Y1, CAR1, T1   \
	/* SEL1 = 0 - (borrow indication of the high half): used as the VSEL mask below. */ \
   1340	VSQ     SEL1, ZER, SEL1    \
   1341	                           \
	/* TT1||TT0 = T1||T0 + PH||PL (carry out of the high half is discarded). */ \
   1342	VACCQ   T0, PL, CAR1       \
   1343	VAQ     T0, PL, TT0        \
   1344	VACQ    T1, PH, CAR1, TT1  \
   1345	                           \
	/* Branch-free choice between the plain difference and the difference plus P. */ \
   1346	VSEL    T0, TT0, SEL1, T0  \
   1347	VSEL    T1, TT1, SEL1, T1  \
   1348
   1349#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	/* T1||T0 = (X1||X0) + (Y1||Y0), with PH||PL subtracted when the sum (including its carry in T2) is not below P. */ \
	/* Clobbers CAR1, CAR2, T2, ZER, TT0, TT1, SEL1. Call sites pass outputs that alias inputs, so keep the statement order. */ \
   1350	VACCQ   X0, Y0, CAR1        \
   1351	VAQ     X0, Y0, T0          \
   1352	VACCCQ  X1, Y1, CAR1, T2    \
   1353	VACQ    X1, Y1, CAR1, T1    \
   1354	                            \
	/* TT1||TT0 = T1||T0 - PH||PL; the borrow is folded into T2 to form the VSEL mask SEL1. */ \
   1355	VZERO   ZER                 \
   1356	VSCBIQ  PL, T0, CAR1        \
   1357	VSQ     PL, T0, TT0         \
   1358	VSBCBIQ T1, PH, CAR1, CAR2  \
   1359	VSBIQ   T1, PH, CAR1, TT1   \
   1360	VSBIQ   T2, ZER, CAR2, SEL1 \
   1361	                            \
	/* Branch-free choice between the plain sum and the sum minus P. */ \
   1362	VSEL    T0, TT0, SEL1, T0   \
   1363	VSEL    T1, TT1, SEL1, T1
  1364
   1365#define p256HalfInternal(T1, T0, X1, X0) \
	/* T1||T0 = (X1||X0) / 2 mod PH||PL: P is added when X is odd, then the 257-bit value T2||T1||T0 is shifted right by one bit. */ \
	/* Clobbers ZER, SEL1, CAR1, T2, TT0, TT1. */ \
   1366	VZERO  ZER                \
	/* SEL1 = 0 - 0 with the low bit of X0 as borrow input: all-ones when X0 is even, zero when it is odd. */ \
   1367	VSBIQ  ZER, ZER, X0, SEL1 \
   1368	                          \
	/* T2||T1||T0 = X + P (T2 holds the carry out). */ \
   1369	VACCQ  X0, PL, CAR1       \
   1370	VAQ    X0, PL, T0         \
   1371	VACCCQ X1, PH, CAR1, T2   \
   1372	VACQ   X1, PH, CAR1, T1   \
   1373	                          \
	/* Keep X (and a zero carry) when X is even, X + P otherwise. */ \
   1374	VSEL   X0, T0, SEL1, T0   \
   1375	VSEL   X1, T1, SEL1, T1   \
   1376	VSEL   ZER, T2, SEL1, T2  \
   1377	                          \
	/* One-bit right shift: TT1/TT0 carry the low bit of T2/T1 into the top bit of T1/T0. */ \
   1378	VSLDB  $15, T2, ZER, TT1  \
   1379	VSLDB  $15, T1, ZER, TT0  \
   1380	VREPIB $1, SEL1           \
   1381	VSRL   SEL1, T0, T0       \
   1382	VSRL   SEL1, T1, T1       \
   1383	VREPIB $7, SEL1           \
   1384	VSL    SEL1, TT0, TT0     \
   1385	VSL    SEL1, TT1, TT1     \
   1386	VO     T0, TT0, T0        \
   1387	VO     T1, TT1, T1
  1388
  1389// ---------------------------------------
  1390// func p256Mul(res, in1, in2 *p256Element)
  1391#define res_ptr R1
  1392#define x_ptr   R2
  1393#define y_ptr   R3
  1394#define CPOOL   R4
  1395
  1396// Parameters
  1397#define X0    V0
  1398#define X1    V1
  1399#define Y0    V2
  1400#define Y1    V3
  1401#define T0    V4
  1402#define T1    V5
  1403
  1404// Constants
  1405#define P0    V30
  1406#define P1    V31
   1407TEXT ·p256Mul(SB), NOSPLIT, $0
	// Go entry point: res = p256MulInternal(in1, in2).
	// Each 32-byte element is handled as two 16-byte vectors (low half at
	// offset 0, high half at offset 16). VPDI $0x4 with identical operands
	// swaps the two 64-bit halves of a vector; presumably this converts
	// between the in-memory limb order of p256Element and the 128-bit values
	// the vector arithmetic works on -- TODO confirm against the Go type.
   1408	MOVD res+0(FP), res_ptr
   1409	MOVD in1+8(FP), x_ptr
   1410	MOVD in2+16(FP), y_ptr
   1411
   1412	VL   (0*16)(x_ptr), X0
   1413	VPDI $0x4, X0, X0, X0
   1414	VL   (1*16)(x_ptr), X1
   1415	VPDI $0x4, X1, X1, X1
   1416	VL   (0*16)(y_ptr), Y0
   1417	VPDI $0x4, Y0, Y0, Y0
   1418	VL   (1*16)(y_ptr), Y1
   1419	VPDI $0x4, Y1, Y1, Y1
   1420
	// CPOOL is also read by p256MulInternal for its SEL masks.
   1421	MOVD $p256mul<>+0x00(SB), CPOOL
   1422	VL   16(CPOOL), P0 // low 128 bits of the P256 prime
   1423	VL   0(CPOOL), P1  // high 128 bits of the P256 prime
   1424
   1425	CALL p256MulInternal<>(SB)
   1426
	// Swap the halves back and store the result T1||T0.
   1427	VPDI $0x4, T0, T0, T0
   1428	VST  T0, (0*16)(res_ptr)
   1429	VPDI $0x4, T1, T1, T1
   1430	VST  T1, (1*16)(res_ptr)
   1431	RET
  1432
  1433#undef res_ptr
  1434#undef x_ptr
  1435#undef y_ptr
  1436#undef CPOOL
  1437
  1438#undef X0
  1439#undef X1
  1440#undef Y0
  1441#undef Y1
  1442#undef T0
  1443#undef T1
  1444#undef P0
  1445#undef P1
  1446
  1447// ---------------------------------------
  1448//  func p256Sqr(res, in *p256Element, n int)
  1449#define res_ptr R1
  1450#define x_ptr   R2
  1451#define y_ptr   R3
  1452#define CPOOL   R4
  1453#define COUNT   R5
  1454#define N       R6
  1455
  1456// Parameters
  1457#define X0    V0
  1458#define X1    V1
  1459#define T0    V4
  1460#define T1    V5
  1461
  1462// Constants
  1463#define P0    V30
  1464#define P1    V31
   1465TEXT ·p256Sqr(SB), NOSPLIT, $0
	// Go entry point: squares in repeatedly (n passes through the loop below)
	// and stores the final value to res. The operand is loaded once; each
	// pass feeds the previous result T1||T0 back in as X1||X0.
   1466	MOVD res+0(FP), res_ptr
   1467	MOVD in+8(FP), x_ptr
   1468
   1469	VL   (0*16)(x_ptr), X0
   1470	VPDI $0x4, X0, X0, X0
   1471	VL   (1*16)(x_ptr), X1
   1472	VPDI $0x4, X1, X1, X1
   1473
   1474	MOVD $p256mul<>+0x00(SB), CPOOL
   1475	MOVD $0, COUNT
   1476	MOVD n+16(FP), N
   1477	VL   16(CPOOL), P0
   1478	VL   0(CPOOL), P1
   1479
	// NOTE(review): the loop body runs before the first comparison, so one
	// squaring is performed even when n <= 0 -- confirm callers always pass
	// n >= 1. COUNT and N are incremented/compared as 32-bit values
	// (ADDW/CMPW) although n is loaded with a 64-bit MOVD.
   1480loop:
   1481	CALL p256SqrInternal<>(SB)
   1482	VLR  T0, X0
   1483	VLR  T1, X1
   1484	ADDW $1, COUNT
   1485	CMPW COUNT, N
   1486	BLT  loop
   1487
	// Swap the 64-bit halves back and store the result T1||T0.
   1488	VPDI $0x4, T0, T0, T0
   1489	VST  T0, (0*16)(res_ptr)
   1490	VPDI $0x4, T1, T1, T1
   1491	VST  T1, (1*16)(res_ptr)
   1492	RET
  1493
  1494#undef res_ptr
  1495#undef x_ptr
  1496#undef y_ptr
  1497#undef CPOOL
  1498#undef COUNT
  1499#undef N
  1500
  1501#undef X0
  1502#undef X1
  1503#undef T0
  1504#undef T1
  1505#undef P0
  1506#undef P1
  1507
  1508// Point add with P2 being affine point
  1509// If sign == 1 -> P2 = -P2
  1510// If sel == 0 -> P3 = P1
  1511// if zero == 0 -> P3 = P2
  1512// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1513#define P3ptr   R1
  1514#define P1ptr   R2
  1515#define P2ptr   R3
  1516#define CPOOL   R4
  1517
  1518// Temporaries in REGs
  1519#define Y2L    V15
  1520#define Y2H    V16
  1521#define T1L    V17
  1522#define T1H    V18
  1523#define T2L    V19
  1524#define T2H    V20
  1525#define T3L    V21
  1526#define T3H    V22
  1527#define T4L    V23
  1528#define T4H    V24
  1529
  1530// Temps for Sub and Add
  1531#define TT0  V11
  1532#define TT1  V12
  1533#define T2   V13
  1534
  1535// p256MulAsm Parameters
  1536#define X0    V0
  1537#define X1    V1
  1538#define Y0    V2
  1539#define Y1    V3
  1540#define T0    V4
  1541#define T1    V5
  1542
  1543#define PL    V30
  1544#define PH    V31
  1545
  1546// Names for zero/sel selects
  1547#define X1L    V0
  1548#define X1H    V1
  1549#define Y1L    V2 // p256MulAsmParmY
  1550#define Y1H    V3 // p256MulAsmParmY
  1551#define Z1L    V4
  1552#define Z1H    V5
  1553#define X2L    V0
  1554#define X2H    V1
  1555#define Z2L    V4
  1556#define Z2H    V5
  1557#define X3L    V17 // T1L
  1558#define X3H    V18 // T1H
  1559#define Y3L    V21 // T3L
  1560#define Y3H    V22 // T3H
  1561#define Z3L    V28
  1562#define Z3H    V29
  1563
  1564#define ZER   V6
  1565#define SEL1  V7
  1566#define CAR1  V8
  1567#define CAR2  V9
  1568/* *
  1569 * Three operand formula:
  1570 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1571 * T1 = Z1²
  1572 * T2 = T1*Z1
  1573 * T1 = T1*X2
  1574 * T2 = T2*Y2
  1575 * T1 = T1-X1
  1576 * T2 = T2-Y1
  1577 * Z3 = Z1*T1
  1578 * T3 = T1²
  1579 * T4 = T3*T1
  1580 * T3 = T3*X1
  1581 * T1 = 2*T3
  1582 * X3 = T2²
  1583 * X3 = X3-T1
  1584 * X3 = X3-T4
  1585 * T3 = T3-X3
  1586 * T3 = T3*T2
  1587 * T4 = T4*Y1
  1588 * Y3 = T3-T4
  1589
  1590 * Three operand formulas, but with MulInternal X,Y used to store temps
  1591X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1592X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1593X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1594X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1595SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1596SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1597X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1598X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1599X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1600X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1601ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1602X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1603SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1604SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1605SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1606X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1607X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1608SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1609
  1610	*/
   1611TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	// Mixed addition res = in1 + in2, where in2 has only X and Y coordinates.
	// Point layout as read here: X at offsets 0/16, Y at 32/48, Z at 64/80
	// (low half first). The three flag arguments are applied without
	// branching, through VSEL masks:
	//   sign != 0 : Y2 is replaced by P - Y2 before the addition
	//   sel  == 0 : the result is replaced by in1
	//   zero == 0 : the result is replaced by in2 (with a constant Z),
	//               applied after the sel selection, so it takes precedence.
	// All loads from in1/in2 happen before the first store to res.
   1612	MOVD res+0(FP), P3ptr
   1613	MOVD in1+8(FP), P1ptr
   1614	MOVD in2+16(FP), P2ptr
   1615
   1616	MOVD $p256mul<>+0x00(SB), CPOOL
   1617	VL   16(CPOOL), PL
   1618	VL   0(CPOOL), PH
   1619
   1620	//	if (sign == 1) {
   1621	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
   1622	//	}
   1623
   1624	VL   48(P2ptr), Y2H
   1625	VPDI $0x4, Y2H, Y2H, Y2H
   1626	VL   32(P2ptr), Y2L
   1627	VPDI $0x4, Y2L, Y2L, Y2L
   1628
   1629	VLREPG sign+24(FP), SEL1
   1630	VZERO  ZER
   1631	VCEQG  SEL1, ZER, SEL1 // SEL1 = all-ones iff sign == 0
   1632
	// T1H||T1L = PH||PL - Y2H||Y2L
   1633	VSCBIQ Y2L, PL, CAR1
   1634	VSQ    Y2L, PL, T1L
   1635	VSBIQ  PH, Y2H, CAR1, T1H
   1636
	// Keep Y2 when sign == 0, otherwise use P - Y2.
   1637	VSEL Y2L, T1L, SEL1, Y2L
   1638	VSEL Y2H, T1H, SEL1, Y2H
   1639
   1640/* *
   1641 * Three operand formula:
   1642 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
   1643 */
   1644	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
   1645	VL   80(P1ptr), X1       // Z1H
   1646	VPDI $0x4, X1, X1, X1
   1647	VL   64(P1ptr), X0       // Z1L
   1648	VPDI $0x4, X0, X0, X0
   1649	VLR  X0, Y0
   1650	VLR  X1, Y1
   1651	CALL p256SqrInternal<>(SB)
   1652
   1653	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
   1654	VLR  T0, X0
   1655	VLR  T1, X1
   1656	CALL p256MulInternal<>(SB)
   1657	VLR  T0, T2L
   1658	VLR  T1, T2H
   1659
   1660	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
   1661	VL   16(P2ptr), Y1       // X2H
   1662	VPDI $0x4, Y1, Y1, Y1
   1663	VL   0(P2ptr), Y0        // X2L
   1664	VPDI $0x4, Y0, Y0, Y0
   1665	CALL p256MulInternal<>(SB)
   1666	VLR  T0, T1L
   1667	VLR  T1, T1H
   1668
   1669	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
   1670	VLR  T2L, X0
   1671	VLR  T2H, X1
   1672	VLR  Y2L, Y0
   1673	VLR  Y2H, Y1
   1674	CALL p256MulInternal<>(SB)
   1675
   1676	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
   1677	VL   48(P1ptr), Y1H
   1678	VPDI $0x4, Y1H, Y1H, Y1H
   1679	VL   32(P1ptr), Y1L
   1680	VPDI $0x4, Y1L, Y1L, Y1L
   1681	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
   1682
   1683	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
   1684	VL   16(P1ptr), X1H
   1685	VPDI $0x4, X1H, X1H, X1H
   1686	VL   0(P1ptr), X1L
   1687	VPDI $0x4, X1L, X1L, X1L
   1688	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
   1689
   1690	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
   1691	VL   80(P1ptr), X1       // Z1H
   1692	VPDI $0x4, X1, X1, X1
   1693	VL   64(P1ptr), X0       // Z1L
   1694	VPDI $0x4, X0, X0, X0
   1695	CALL p256MulInternal<>(SB)
   1696
	// Z3 is kept in registers (not stored yet) so the sel/zero selection
	// below can still replace it.
   1697	// VST T1, 64(P3ptr)
   1698	// VST T0, 80(P3ptr)
   1699	VLR T0, Z3L
   1700	VLR T1, Z3H
   1701
   1702	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
   1703	VLR  Y0, X0
   1704	VLR  Y1, X1
   1705	CALL p256SqrInternal<>(SB)
   1706	VLR  T0, X0
   1707	VLR  T1, X1
   1708
   1709	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
   1710	CALL p256MulInternal<>(SB)
   1711	VLR  T0, T4L
   1712	VLR  T1, T4H
   1713
   1714	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
   1715	VL   16(P1ptr), Y1       // X1H
   1716	VPDI $0x4, Y1, Y1, Y1
   1717	VL   0(P1ptr), Y0        // X1L
   1718	VPDI $0x4, Y0, Y0, Y0
   1719	CALL p256MulInternal<>(SB)
   1720	VLR  T0, T3L
   1721	VLR  T1, T3H
   1722
   1723	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
   1724	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
   1725
   1726	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
   1727	VLR  T2L, X0
   1728	VLR  T2H, X1
   1729	VLR  T2L, Y0
   1730	VLR  T2H, Y1
   1731	CALL p256SqrInternal<>(SB)
   1732
   1733	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
   1734	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
   1735
   1736	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
   1737	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
   1738	VLR T0, X3L
   1739	VLR T1, X3H
   1740
   1741	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
   1742	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
   1743
   1744	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
   1745	CALL p256MulInternal<>(SB)
   1746	VLR  T0, T3L
   1747	VLR  T1, T3H
   1748
   1749	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
   1750	VLR  T4L, X0
   1751	VLR  T4H, X1
   1752	VL   48(P1ptr), Y1       // Y1H
   1753	VPDI $0x4, Y1, Y1, Y1
   1754	VL   32(P1ptr), Y0       // Y1L
   1755	VPDI $0x4, Y0, Y0, Y0
   1756	CALL p256MulInternal<>(SB)
   1757
   1758	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
   1759	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
   1760
   1761	//	if (sel == 0) {
   1762	//		copy(P3.x[:], X1)
   1763	//		copy(P3.y[:], Y1)
   1764	//		copy(P3.z[:], Z1)
   1765	//	}
   1766
   1767	VL   16(P1ptr), X1H
   1768	VPDI $0x4, X1H, X1H, X1H
   1769	VL   0(P1ptr), X1L
   1770	VPDI $0x4, X1L, X1L, X1L
   1771
   1772	// Y1 already loaded, left over from addition
   1773	VL   80(P1ptr), Z1H
   1774	VPDI $0x4, Z1H, Z1H, Z1H
   1775	VL   64(P1ptr), Z1L
   1776	VPDI $0x4, Z1L, Z1L, Z1L
   1777
   1778	VLREPG sel+32(FP), SEL1
   1779	VZERO  ZER
   1780	VCEQG  SEL1, ZER, SEL1 // SEL1 = all-ones iff sel == 0
   1781
	// sel == 0: take (X1, Y1, Z1); otherwise keep the computed (X3, Y3, Z3).
   1782	VSEL X1L, X3L, SEL1, X3L
   1783	VSEL X1H, X3H, SEL1, X3H
   1784	VSEL Y1L, Y3L, SEL1, Y3L
   1785	VSEL Y1H, Y3H, SEL1, Y3H
   1786	VSEL Z1L, Z3L, SEL1, Z3L
   1787	VSEL Z1H, Z3H, SEL1, Z3H
   1788
   1789	//	if (zero == 0) {
   1790	//		copy(P3.x[:], X2)
   1791	//		copy(P3.y[:], Y2)
   1792	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   1793	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
   1794	//	}
   1795	VL   16(P2ptr), X2H
   1796	VPDI $0x4, X2H, X2H, X2H
   1797	VL   0(P2ptr), X2L
   1798	VPDI $0x4, X2L, X2L, X2L
   1799
   1800	// Y2 already loaded
	// Z2 comes from the constant pool; presumably the value quoted in the
	// pseudo-code above -- TODO confirm against the p256mul<> data at 0x80/0x90.
   1801	VL 128(CPOOL), Z2H
   1802	VL 144(CPOOL), Z2L
   1803
   1804	VLREPG zero+40(FP), SEL1
   1805	VZERO  ZER
   1806	VCEQG  SEL1, ZER, SEL1 // SEL1 = all-ones iff zero == 0
   1807
	// zero == 0: take (X2, Y2, Z2); otherwise keep the current (X3, Y3, Z3).
   1808	VSEL X2L, X3L, SEL1, X3L
   1809	VSEL X2H, X3H, SEL1, X3H
   1810	VSEL Y2L, Y3L, SEL1, Y3L
   1811	VSEL Y2H, Y3H, SEL1, Y3H
   1812	VSEL Z2L, Z3L, SEL1, Z3L
   1813	VSEL Z2H, Z3H, SEL1, Z3H
   1814
   1815	// All done, store out the result!!!
   1816	VPDI $0x4, X3H, X3H, X3H
   1817	VST  X3H, 16(P3ptr)
   1818	VPDI $0x4, X3L, X3L, X3L
   1819	VST  X3L, 0(P3ptr)
   1820	VPDI $0x4, Y3H, Y3H, Y3H
   1821	VST  Y3H, 48(P3ptr)
   1822	VPDI $0x4, Y3L, Y3L, Y3L
   1823	VST  Y3L, 32(P3ptr)
   1824	VPDI $0x4, Z3H, Z3H, Z3H
   1825	VST  Z3H, 80(P3ptr)
   1826	VPDI $0x4, Z3L, Z3L, Z3L
   1827	VST  Z3L, 64(P3ptr)
   1828
   1829	RET
  1830
  1831#undef P3ptr
  1832#undef P1ptr
  1833#undef P2ptr
  1834#undef CPOOL
  1835
  1836#undef Y2L
  1837#undef Y2H
  1838#undef T1L
  1839#undef T1H
  1840#undef T2L
  1841#undef T2H
  1842#undef T3L
  1843#undef T3H
  1844#undef T4L
  1845#undef T4H
  1846
  1847#undef TT0
  1848#undef TT1
  1849#undef T2
  1850
  1851#undef X0
  1852#undef X1
  1853#undef Y0
  1854#undef Y1
  1855#undef T0
  1856#undef T1
  1857
  1858#undef PL
  1859#undef PH
  1860
  1861#undef X1L
  1862#undef X1H
  1863#undef Y1L
  1864#undef Y1H
  1865#undef Z1L
  1866#undef Z1H
  1867#undef X2L
  1868#undef X2H
  1869#undef Z2L
  1870#undef Z2H
  1871#undef X3L
  1872#undef X3H
  1873#undef Y3L
  1874#undef Y3H
  1875#undef Z3L
  1876#undef Z3H
  1877
  1878#undef ZER
  1879#undef SEL1
  1880#undef CAR1
  1881#undef CAR2
  1882
  1883// func p256PointDoubleAsm(res, in *P256Point)
  1884// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1885// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1886// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1887#define P3ptr   R1
  1888#define P1ptr   R2
  1889#define CPOOL   R4
  1890
  1891// Temporaries in REGs
  1892#define X3L    V15
  1893#define X3H    V16
  1894#define Y3L    V17
  1895#define Y3H    V18
  1896#define T1L    V19
  1897#define T1H    V20
  1898#define T2L    V21
  1899#define T2H    V22
  1900#define T3L    V23
  1901#define T3H    V24
  1902
  1903#define X1L    V6
  1904#define X1H    V7
  1905#define Y1L    V8
  1906#define Y1H    V9
  1907#define Z1L    V10
  1908#define Z1H    V11
  1909
  1910// Temps for Sub and Add
  1911#define TT0  V11
  1912#define TT1  V12
  1913#define T2   V13
  1914
  1915// p256MulAsm Parameters
  1916#define X0    V0
  1917#define X1    V1
  1918#define Y0    V2
  1919#define Y1    V3
  1920#define T0    V4
  1921#define T1    V5
  1922
  1923#define PL    V30
  1924#define PH    V31
  1925
  1926#define Z3L    V23
  1927#define Z3H    V24
  1928
  1929#define ZER   V26
  1930#define SEL1  V27
  1931#define CAR1  V28
  1932#define CAR2  V29
  1933/*
  1934 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1935 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1936 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1937 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1938 * 	B  = 2Y₁
  1939 * 	Z₃ = B×Z₁
  1940 * 	C  = B²
  1941 * 	D  = C×X₁
  1942 * 	X₃ = A²-2D
  1943 * 	Y₃ = (D-X₃)×A-C²/2
  1944 *
  1945 * Three-operand formula:
  1946 *       T1 = Z1²
  1947 *       T2 = X1-T1
  1948 *       T1 = X1+T1
  1949 *       T2 = T2*T1
  1950 *       T2 = 3*T2
  1951 *       Y3 = 2*Y1
  1952 *       Z3 = Y3*Z1
  1953 *       Y3 = Y3²
  1954 *       T3 = Y3*X1
  1955 *       Y3 = Y3²
  1956 *       Y3 = half*Y3
  1957 *       X3 = T2²
  1958 *       T1 = 2*T3
  1959 *       X3 = X3-T1
  1960 *       T1 = T3-X3
  1961 *       T1 = T1*T2
  1962 *       Y3 = T1-Y3
  1963 */
  1964
   1965TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	// Point doubling res = 2*in, following the three-operand formula in the
	// comment above. Point layout as read here: X at offsets 0/16, Y at
	// 32/48, Z at 64/80 (low half first).
	// Results are stored as soon as they are final: Z3 first, then X3, then
	// Y3. After the Z3 store only the X field of in is re-read, and nothing
	// is read from in after the X3 store, so res == in looks safe --
	// NOTE(review): re-check this if the operation order is ever changed.
   1966	MOVD res+0(FP), P3ptr
   1967	MOVD in+8(FP), P1ptr
   1968
   1969	MOVD $p256mul<>+0x00(SB), CPOOL
   1970	VL   16(CPOOL), PL
   1971	VL   0(CPOOL), PH
   1972
   1973	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
   1974	VL   80(P1ptr), X1        // Z1H
   1975	VPDI $0x4, X1, X1, X1
   1976	VL   64(P1ptr), X0        // Z1L
   1977	VPDI $0x4, X0, X0, X0
   1978	VLR  X0, Y0
   1979	VLR  X1, Y1
   1980	CALL p256SqrInternal<>(SB)
   1981
   1982	// SUB(X<X1-T)            // T2 = X1-T1
   1983	VL   16(P1ptr), X1H
   1984	VPDI $0x4, X1H, X1H, X1H
   1985	VL   0(P1ptr), X1L
   1986	VPDI $0x4, X1L, X1L, X1L
   1987	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
   1988
   1989	// ADD(Y<X1+T)            // T1 = X1+T1
   1990	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
   1991
   1992	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
   1993	CALL p256MulInternal<>(SB)
   1994
   1995	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
   1996	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
   1997	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
   1998
   1999	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
   2000	VL   48(P1ptr), Y1H
   2001	VPDI $0x4, Y1H, Y1H, Y1H
   2002	VL   32(P1ptr), Y1L
   2003	VPDI $0x4, Y1L, Y1L, Y1L
   2004	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
   2005
   2006	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
   2007	VL   80(P1ptr), Y1        // Z1H
   2008	VPDI $0x4, Y1, Y1, Y1
   2009	VL   64(P1ptr), Y0        // Z1L
   2010	VPDI $0x4, Y0, Y0, Y0
   2011	CALL p256MulInternal<>(SB)
	// Store Z3 through the TT temporaries; T1||T0 itself is left unswapped.
   2012	VPDI $0x4, T1, T1, TT1
   2013	VST  TT1, 80(P3ptr)
   2014	VPDI $0x4, T0, T0, TT0
   2015	VST  TT0, 64(P3ptr)
   2016
   2017	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
   2018	VLR  X0, Y0
   2019	VLR  X1, Y1
   2020	CALL p256SqrInternal<>(SB)
   2021
   2022	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
   2023	VLR  T0, X0
   2024	VLR  T1, X1
   2025	VL   16(P1ptr), Y1
   2026	VPDI $0x4, Y1, Y1, Y1
   2027	VL   0(P1ptr), Y0
   2028	VPDI $0x4, Y0, Y0, Y0
   2029	CALL p256MulInternal<>(SB)
   2030	VLR  T0, T3L
   2031	VLR  T1, T3H
   2032
   2033	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
   2034	VLR  X0, Y0
   2035	VLR  X1, Y1
   2036	CALL p256SqrInternal<>(SB)
   2037
   2038	// HAL(Y3<T)              // Y3 = half*Y3
   2039	p256HalfInternal(Y3H,Y3L, T1,T0)
   2040
   2041	// X=T2; Y=T2; MUL; T-    // X3 = T2²
   2042	VLR  T2L, X0
   2043	VLR  T2H, X1
   2044	VLR  T2L, Y0
   2045	VLR  T2H, Y1
   2046	CALL p256SqrInternal<>(SB)
   2047
   2048	// ADD(T1<T3+T3)          // T1 = 2*T3
   2049	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
   2050
   2051	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
   2052	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	// Store X3 through the TT temporaries; X3H||X3L stays usable below.
   2053	VPDI $0x4, X3H, X3H, TT1
   2054	VST  TT1, 16(P3ptr)
   2055	VPDI $0x4, X3L, X3L, TT0
   2056	VST  TT0, 0(P3ptr)
   2057
   2058	// SUB(X<T3-X3)           // T1 = T3-X3
   2059	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
   2060
   2061	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
   2062	CALL p256MulInternal<>(SB)
   2063
   2064	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
   2065	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
   2066
   2067	VPDI $0x4, Y3H, Y3H, Y3H
   2068	VST  Y3H, 48(P3ptr)
   2069	VPDI $0x4, Y3L, Y3L, Y3L
   2070	VST  Y3L, 32(P3ptr)
   2071	RET
  2072
  2073#undef P3ptr
  2074#undef P1ptr
  2075#undef CPOOL
  2076#undef X3L
  2077#undef X3H
  2078#undef Y3L
  2079#undef Y3H
  2080#undef T1L
  2081#undef T1H
  2082#undef T2L
  2083#undef T2H
  2084#undef T3L
  2085#undef T3H
  2086#undef X1L
  2087#undef X1H
  2088#undef Y1L
  2089#undef Y1H
  2090#undef Z1L
  2091#undef Z1H
  2092#undef TT0
  2093#undef TT1
  2094#undef T2
  2095#undef X0
  2096#undef X1
  2097#undef Y0
  2098#undef Y1
  2099#undef T0
  2100#undef T1
  2101#undef PL
  2102#undef PH
  2103#undef Z3L
  2104#undef Z3H
  2105#undef ZER
  2106#undef SEL1
  2107#undef CAR1
  2108#undef CAR2
  2109
  2110// func p256PointAddAsm(res, in1, in2 *P256Point) int
  2111#define P3ptr  R1
  2112#define P1ptr  R2
  2113#define P2ptr  R3
  2114#define CPOOL  R4
  2115#define ISZERO R5
  2116#define TRUE   R6
  2117
  2118// Temporaries in REGs
  2119#define T1L   V16
  2120#define T1H   V17
  2121#define T2L   V18
  2122#define T2H   V19
  2123#define U1L   V20
  2124#define U1H   V21
  2125#define S1L   V22
  2126#define S1H   V23
  2127#define HL    V24
  2128#define HH    V25
  2129#define RL    V26
  2130#define RH    V27
  2131
  2132// Temps for Sub and Add
  2133#define ZER   V6
  2134#define SEL1  V7
  2135#define CAR1  V8
  2136#define CAR2  V9
  2137#define TT0  V11
  2138#define TT1  V12
  2139#define T2   V13
  2140
  2141// p256MulAsm Parameters
  2142#define X0    V0
  2143#define X1    V1
  2144#define Y0    V2
  2145#define Y1    V3
  2146#define T0    V4
  2147#define T1    V5
  2148
  2149#define PL    V30
  2150#define PH    V31
  2151/*
  2152 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2153 *
  2154 * A = X₁×Z₂²
  2155 * B = Y₁×Z₂³
  2156 * C = X₂×Z₁²-A
  2157 * D = Y₂×Z₁³-B
  2158 * X₃ = D² - 2A×C² - C³
  2159 * Y₃ = D×(A×C² - X₃) - B×C³
  2160 * Z₃ = Z₁×Z₂×C
  2161 *
  2162 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2163 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2164 *
  2165 * T1 = Z1*Z1
  2166 * T2 = Z2*Z2
  2167 * U1 = X1*T2
  2168 * H  = X2*T1
  2169 * H  = H-U1
  2170 * Z3 = Z1*Z2
  2171 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2172 *
  2173 * S1 = Z2*T2
  2174 * S1 = Y1*S1
  2175 * R  = Z1*T1
  2176 * R  = Y2*R
  2177 * R  = R-S1
  2178 *
  2179 * T1 = H*H
  2180 * T2 = H*T1
  2181 * U1 = U1*T1
  2182 *
  2183 * X3 = R*R
  2184 * X3 = X3-T2
  2185 * T1 = 2*U1
  2186 * X3 = X3-T1 << store-out X3 result reg
  2187 *
  2188 * T2 = S1*T2
  2189 * Y3 = U1-X3
  2190 * Y3 = R*Y3
  2191 * Y3 = Y3-T2 << store-out Y3 result reg
  2192
  2193 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2194	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2195	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2196	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2197	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2198	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2199	// SUB(H<H-T)            // H  = H-U1
  2200	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2201	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could overwrite Z1, if slices have same backing array
  2202	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2203	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2204	// SUB(R<T-S1)           // R  = R-S1
  2205	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2206	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2207	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2208	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2209	// SUB(T<T-T2)           // X3 = X3-T2
  2210	// ADD(X<U1+U1)          // T1 = 2*U1
  2211	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2212	// SUB(Y<U1-T)           // Y3 = U1-X3
  2213	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2214	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2215	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2216	*/
  2217TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	// Computes res = in1 + in2 by following, step by step, the operation
	// schedule listed in the block comment above this function.
	//
	// Return value (ret+24(FP)): 1 only when both H and R come out as 0 or P,
	// otherwise 0.
	// NOTE(review): presumably this tells the Go caller that the generic
	// addition result is unusable (equal inputs) and it must double instead -
	// confirm against the caller.
	//
	// Coordinates are read at byte offsets 0/16 (X), 32/48 (Y) and 64/80 (Z)
	// of each point, low 16 bytes first. Every 16-byte load is followed by
	// VPDI $0x4, which swaps the two doublewords of the vector; the same swap
	// is applied before every store to res.
	//
	// Within this function each coordinate of res is stored only after the
	// last load of that same coordinate from in1 and in2.
	//
	// Shorthand in the step comments: "X-" / "Y-" means the operand registers
	// are left as they were for the previous call, "T-" means the result is
	// left in T0/T1 without being copied to a named temporary.
  2218	MOVD res+0(FP), P3ptr
  2219	MOVD in1+8(FP), P1ptr
  2220	MOVD in2+16(FP), P2ptr
  2221
	// Load the prime P from the p256mul<> constant table (low half at +16,
	// high half at +0); it is used by the H^P and R^P checks below.
  2222	MOVD $p256mul<>+0x00(SB), CPOOL
  2223	VL   16(CPOOL), PL
  2224	VL   0(CPOOL), PH
  2225
  2226	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2227	VL   80(P1ptr), X1       // Z1H
  2228	VPDI $0x4, X1, X1, X1
  2229	VL   64(P1ptr), X0       // Z1L
  2230	VPDI $0x4, X0, X0, X0
  2231	VLR  X0, Y0
  2232	VLR  X1, Y1
  2233	CALL p256SqrInternal<>(SB)
  2234
  2235	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	// X0/X1 are not reloaded: this step relies on them still holding Z1.
  2236	VLR  T0, Y0
  2237	VLR  T1, Y1
  2238	CALL p256MulInternal<>(SB)
  2239	VLR  T0, RL
  2240	VLR  T1, RH
  2241
  2242	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	// Y0/Y1 are not reloaded: this step relies on them still holding T1.
  2243	VL   16(P2ptr), X1       // X2H
  2244	VPDI $0x4, X1, X1, X1
  2245	VL   0(P2ptr), X0        // X2L
  2246	VPDI $0x4, X0, X0, X0
  2247	CALL p256MulInternal<>(SB)
  2248	VLR  T0, HL
  2249	VLR  T1, HH
  2250
  2251	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2252	VL   80(P2ptr), X1       // Z2H
  2253	VPDI $0x4, X1, X1, X1
  2254	VL   64(P2ptr), X0       // Z2L
  2255	VPDI $0x4, X0, X0, X0
  2256	VLR  X0, Y0
  2257	VLR  X1, Y1
  2258	CALL p256SqrInternal<>(SB)
  2259
  2260	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2261	VLR  T0, Y0
  2262	VLR  T1, Y1
  2263	CALL p256MulInternal<>(SB)
  2264	VLR  T0, S1L
  2265	VLR  T1, S1H
  2266
  2267	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2268	VL   16(P1ptr), X1       // X1H
  2269	VPDI $0x4, X1, X1, X1
  2270	VL   0(P1ptr), X0        // X1L
  2271	VPDI $0x4, X0, X0, X0
  2272	CALL p256MulInternal<>(SB)
  2273	VLR  T0, U1L
  2274	VLR  T1, U1H
  2275
  2276	// SUB(H<H-T)            // H  = H-U1
	// T0/T1 still hold U1 from the multiplication above.
  2277	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2278
  2279	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2280	// clobbers T1H and T1L
	// Each VCEQGS compares against ZER and sets the condition code that the
	// following MOVDEQ tests; the first pair checks H == 0, the second H == P.
  2281	MOVD   $0, ISZERO
  2282	MOVD   $1, TRUE
  2283	VZERO  ZER
  2284	VO     HL, HH, T1H
  2285	VCEQGS ZER, T1H, T1H
  2286	MOVDEQ TRUE, ISZERO
  2287	VX     HL, PL, T1L
  2288	VX     HH, PH, T1H
  2289	VO     T1L, T1H, T1H
  2290	VCEQGS ZER, T1H, T1H
  2291	MOVDEQ TRUE, ISZERO
	// Park the H result in the return slot; the R check below reads it back.
  2292	MOVD   ISZERO, ret+24(FP)
  2293
  2294	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	// Z1 and Z2 are reloaded from memory; these are the last loads of the Z
	// coordinates before Z3 is stored to res.
  2295	VL   80(P1ptr), X1       // Z1H
  2296	VPDI $0x4, X1, X1, X1
  2297	VL   64(P1ptr), X0       // Z1L
  2298	VPDI $0x4, X0, X0, X0
  2299	VL   80(P2ptr), Y1       // Z2H
  2300	VPDI $0x4, Y1, Y1, Y1
  2301	VL   64(P2ptr), Y0       // Z2L
  2302	VPDI $0x4, Y0, Y0, Y0
  2303	CALL p256MulInternal<>(SB)
  2304
  2305	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2306	VLR  T0, X0
  2307	VLR  T1, X1
  2308	VLR  HL, Y0
  2309	VLR  HH, Y1
  2310	CALL p256MulInternal<>(SB)
	// Store Z3 to res at offsets 64/80, swapping doublewords via TT0/TT1.
  2311	VPDI $0x4, T1, T1, TT1
  2312	VST  TT1, 80(P3ptr)
  2313	VPDI $0x4, T0, T0, TT0
  2314	VST  TT0, 64(P3ptr)
  2315
  2316	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	// Offsets 32/48 hold the Y coordinate (low half, high half).
  2317	VL   48(P1ptr), X1
  2318	VPDI $0x4, X1, X1, X1
  2319	VL   32(P1ptr), X0
  2320	VPDI $0x4, X0, X0, X0
  2321	VLR  S1L, Y0
  2322	VLR  S1H, Y1
  2323	CALL p256MulInternal<>(SB)
  2324	VLR  T0, S1L
  2325	VLR  T1, S1H
  2326
  2327	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2328	VL   48(P2ptr), X1
  2329	VPDI $0x4, X1, X1, X1
  2330	VL   32(P2ptr), X0
  2331	VPDI $0x4, X0, X0, X0
  2332	VLR  RL, Y0
  2333	VLR  RH, Y1
  2334	CALL p256MulInternal<>(SB)
  2335
  2336	// SUB(R<T-S1)           // R  = T-S1
  2337	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2338
  2339	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2340	// clobbers T1H and T1L
	// Same test as for H above, now applied to R.
  2341	MOVD   $0, ISZERO
  2342	MOVD   $1, TRUE
  2343	VZERO  ZER
  2344	VO     RL, RH, T1H
  2345	VCEQGS ZER, T1H, T1H
  2346	MOVDEQ TRUE, ISZERO
  2347	VX     RL, PL, T1L
  2348	VX     RH, PH, T1H
  2349	VO     T1L, T1H, T1H
  2350	VCEQGS ZER, T1H, T1H
  2351	MOVDEQ TRUE, ISZERO
	// AND with the H result stored earlier, so ret stays 1 only if both
	// checks succeeded.
  2352	AND    ret+24(FP), ISZERO
  2353	MOVD   ISZERO, ret+24(FP)
  2354
  2355	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2356	VLR  HL, X0
  2357	VLR  HH, X1
  2358	VLR  HL, Y0
  2359	VLR  HH, Y1
  2360	CALL p256SqrInternal<>(SB)
  2361
  2362	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2363	VLR  T0, Y0
  2364	VLR  T1, Y1
  2365	CALL p256MulInternal<>(SB)
  2366	VLR  T0, T2L
  2367	VLR  T1, T2H
  2368
  2369	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	// Y0/Y1 are not reloaded: this step relies on them still holding H*H.
  2370	VLR  U1L, X0
  2371	VLR  U1H, X1
  2372	CALL p256MulInternal<>(SB)
  2373	VLR  T0, U1L
  2374	VLR  T1, U1H
  2375
  2376	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2377	VLR  RL, X0
  2378	VLR  RH, X1
  2379	VLR  RL, Y0
  2380	VLR  RH, Y1
  2381	CALL p256SqrInternal<>(SB)
  2382
  2383	// SUB(T<T-T2)           // X3 = X3-T2
  2384	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2385
  2386	// ADD(X<U1+U1)          // T1 = 2*U1
	// The doubled value is placed in X1:X0, which are free at this point.
  2387	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2388
  2389	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2390	p256SubInternal(T1,T0,T1,T0,X1,X0)
	// Swap into TT1/TT0 for the store so that T1:T0 keep X3 for the
	// subtraction that follows.
  2391	VPDI $0x4, T1, T1, TT1
  2392	VST  TT1, 16(P3ptr)
  2393	VPDI $0x4, T0, T0, TT0
  2394	VST  TT0, 0(P3ptr)
  2395
  2396	// SUB(Y<U1-T)           // Y3 = U1-X3
	// The difference is written straight into Y1:Y0, ready for the next call.
  2397	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2398
  2399	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2400	VLR  RL, X0
  2401	VLR  RH, X1
  2402	CALL p256MulInternal<>(SB)
  2403	VLR  T0, U1L
  2404	VLR  T1, U1H
  2405
  2406	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2407	VLR  S1L, X0
  2408	VLR  S1H, X1
  2409	VLR  T2L, Y0
  2410	VLR  T2H, Y1
  2411	CALL p256MulInternal<>(SB)
  2412
  2413	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2414	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	// Last use of T1:T0, so the doubleword swap is done in place before
	// storing Y3 at offsets 32/48.
  2415	VPDI $0x4, T1, T1, T1
  2416	VST  T1, 48(P3ptr)
  2417	VPDI $0x4, T0, T0, T0
  2418	VST  T0, 32(P3ptr)
  2419
  2420	RET

View as plain text