
Text file src/crypto/internal/fips140/nistec/p256_asm_s390x.s

Documentation: crypto/internal/fips140/nistec

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7#include "textflag.h"
     8#include "go_asm.h"
     9
    10DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
    11DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
    12DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
    13DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
    14DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
    15DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    16DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    17DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    18DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    19DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    20DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    21DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    22DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    23DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    24DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    25DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
    26DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
    27DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    28DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    29DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    30DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    31DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    32DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    33DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    34DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    35DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    36DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    37DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    38DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    39DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    40DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    41DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    42DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    43DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    44DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    45DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    46DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    47GLOBL p256ordK0<>(SB), 8, $4
    48GLOBL p256ord<>(SB), 8, $32
    49GLOBL p256<>(SB), 8, $96
    50GLOBL p256mul<>(SB), 8, $160
    51
    52// ---------------------------------------
     53// iff cond == 1, val <- -val (mod P256)
    54// func p256NegCond(val *p256Element, cond int)
    55#define P1ptr   R1
    56#define CPOOL   R4
    57
    58#define Y1L   V0
    59#define Y1H   V1
    60#define T1L   V2
    61#define T1H   V3
    62
    63#define PL    V30
    64#define PH    V31
    65
    66#define ZER   V4
    67#define SEL1  V5
    68#define CAR1  V6
    69TEXT ·p256NegCond(SB), NOSPLIT, $0
    70	MOVD val+0(FP), P1ptr
    71
    72	MOVD $p256mul<>+0x00(SB), CPOOL
    73	VL   16(CPOOL), PL
    74	VL   0(CPOOL), PH
    75
    76	VL   16(P1ptr), Y1H
    77	VPDI $0x4, Y1H, Y1H, Y1H
    78	VL   0(P1ptr), Y1L
    79	VPDI $0x4, Y1L, Y1L, Y1L
    80
    81	VLREPG cond+8(FP), SEL1
    82	VZERO  ZER
    83	VCEQG  SEL1, ZER, SEL1
    84
    85	VSCBIQ Y1L, PL, CAR1
    86	VSQ    Y1L, PL, T1L
    87	VSBIQ  PH, Y1H, CAR1, T1H
    88
    89	VSEL Y1L, T1L, SEL1, Y1L
    90	VSEL Y1H, T1H, SEL1, Y1H
    91
    92	VPDI $0x4, Y1H, Y1H, Y1H
    93	VST  Y1H, 16(P1ptr)
    94	VPDI $0x4, Y1L, Y1L, Y1L
    95	VST  Y1L, 0(P1ptr)
    96	RET
    97
    98#undef P1ptr
    99#undef CPOOL
   100#undef Y1L
   101#undef Y1H
   102#undef T1L
   103#undef T1H
   104#undef PL
   105#undef PH
   106#undef ZER
   107#undef SEL1
   108#undef CAR1
   109
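// For reading convenience, a rough value-level Go model of what p256NegCond
// does (variable-time math/big; negCondModel and the variable names are ours,
// not part of this package -- the constant-time work is the vector code above):
//
//	var p256P, _ = new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
//
//	// negCondModel returns the value val holds after p256NegCond(&val, cond),
//	// assuming 0 <= val <= P256.
//	func negCondModel(val *big.Int, cond int) *big.Int {
//		if cond == 1 {
//			return new(big.Int).Sub(p256P, val) // P - val, i.e. -val mod P
//		}
//		return new(big.Int).Set(val)
//	}
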
   110// ---------------------------------------
   111// if cond == 0 res <- b; else res <- a
   112// func p256MovCond(res, a, b *P256Point, cond int)
   113#define P3ptr   R1
   114#define P1ptr   R2
   115#define P2ptr   R3
   116
   117#define X1L    V0
   118#define X1H    V1
   119#define Y1L    V2
   120#define Y1H    V3
   121#define Z1L    V4
   122#define Z1H    V5
   123#define X2L    V6
   124#define X2H    V7
   125#define Y2L    V8
   126#define Y2H    V9
   127#define Z2L    V10
   128#define Z2H    V11
   129
   130#define ZER   V18
   131#define SEL1  V19
   132TEXT ·p256MovCond(SB), NOSPLIT, $0
   133	MOVD   res+0(FP), P3ptr
   134	MOVD   a+8(FP), P1ptr
   135	MOVD   b+16(FP), P2ptr
   136	VLREPG cond+24(FP), SEL1
   137	VZERO  ZER
   138	VCEQG  SEL1, ZER, SEL1
   139
   140	VL 0(P1ptr), X1H
   141	VL 16(P1ptr), X1L
   142	VL 32(P1ptr), Y1H
   143	VL 48(P1ptr), Y1L
   144	VL 64(P1ptr), Z1H
   145	VL 80(P1ptr), Z1L
   146
   147	VL 0(P2ptr), X2H
   148	VL 16(P2ptr), X2L
   149	VL 32(P2ptr), Y2H
   150	VL 48(P2ptr), Y2L
   151	VL 64(P2ptr), Z2H
   152	VL 80(P2ptr), Z2L
   153
   154	VSEL X2L, X1L, SEL1, X1L
   155	VSEL X2H, X1H, SEL1, X1H
   156	VSEL Y2L, Y1L, SEL1, Y1L
   157	VSEL Y2H, Y1H, SEL1, Y1H
   158	VSEL Z2L, Z1L, SEL1, Z1L
   159	VSEL Z2H, Z1H, SEL1, Z1H
   160
   161	VST X1H, 0(P3ptr)
   162	VST X1L, 16(P3ptr)
   163	VST Y1H, 32(P3ptr)
   164	VST Y1L, 48(P3ptr)
   165	VST Z1H, 64(P3ptr)
   166	VST Z1L, 80(P3ptr)
   167
   168	RET
   169
   170#undef P3ptr
   171#undef P1ptr
   172#undef P2ptr
   173#undef X1L
   174#undef X1H
   175#undef Y1L
   176#undef Y1H
   177#undef Z1L
   178#undef Z1H
   179#undef X2L
   180#undef X2H
   181#undef Y2L
   182#undef Y2H
   183#undef Z2L
   184#undef Z2H
   185#undef ZER
   186#undef SEL1
   187
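// The VCEQG/VSEL pair above is the usual constant-time select: build an
// all-ones mask when cond == 0 and pick bits through it, so both inputs are
// always read regardless of cond. A single-limb Go sketch (movCondLimb is a
// made-up name, and the branch here is only for the model; the asm is
// branch-free):
//
//	func movCondLimb(a, b uint64, cond int) uint64 {
//		var mask uint64 // all ones when cond == 0, as VCEQG against a zero vector produces
//		if cond == 0 {
//			mask = ^uint64(0)
//		}
//		return (b & mask) | (a &^ mask) // res = b if cond == 0, else a
//	}
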
   188// ---------------------------------------
   189// Constant time table access
    190// Indexed from 1 to 16, with -1 offset
   191// (index 0 is implicitly point at infinity)
   192// func p256Select(res *P256Point, table *p256Table, idx int)
   193#define P3ptr   R1
   194#define P1ptr   R2
   195#define COUNT   R4
   196
   197#define X1L    V0
   198#define X1H    V1
   199#define Y1L    V2
   200#define Y1H    V3
   201#define Z1L    V4
   202#define Z1H    V5
   203#define X2L    V6
   204#define X2H    V7
   205#define Y2L    V8
   206#define Y2H    V9
   207#define Z2L    V10
   208#define Z2H    V11
   209
   210#define ONE   V18
   211#define IDX   V19
   212#define SEL1  V20
   213#define SEL2  V21
   214TEXT ·p256Select(SB), NOSPLIT, $0
   215	MOVD   res+0(FP), P3ptr
   216	MOVD   table+8(FP), P1ptr
   217	VLREPB idx+(16+7)(FP), IDX
   218	VREPIB $1, ONE
   219	VREPIB $1, SEL2
   220	MOVD   $1, COUNT
   221
   222	VZERO X1H
   223	VZERO X1L
   224	VZERO Y1H
   225	VZERO Y1L
   226	VZERO Z1H
   227	VZERO Z1L
   228
   229loop_select:
   230	VL 0(P1ptr), X2H
   231	VL 16(P1ptr), X2L
   232	VL 32(P1ptr), Y2H
   233	VL 48(P1ptr), Y2L
   234	VL 64(P1ptr), Z2H
   235	VL 80(P1ptr), Z2L
   236
   237	VCEQG SEL2, IDX, SEL1
   238
   239	VSEL X2L, X1L, SEL1, X1L
   240	VSEL X2H, X1H, SEL1, X1H
   241	VSEL Y2L, Y1L, SEL1, Y1L
   242	VSEL Y2H, Y1H, SEL1, Y1H
   243	VSEL Z2L, Z1L, SEL1, Z1L
   244	VSEL Z2H, Z1H, SEL1, Z1H
   245
   246	VAB  SEL2, ONE, SEL2
   247	ADDW $1, COUNT
   248	ADD  $96, P1ptr
   249	CMPW COUNT, $17
   250	BLT  loop_select
   251
   252	VST X1H, 0(P3ptr)
   253	VST X1L, 16(P3ptr)
   254	VST Y1H, 32(P3ptr)
   255	VST Y1L, 48(P3ptr)
   256	VST Z1H, 64(P3ptr)
   257	VST Z1L, 80(P3ptr)
   258	RET
   259
   260#undef P3ptr
   261#undef P1ptr
   262#undef COUNT
   263#undef X1L
   264#undef X1H
   265#undef Y1L
   266#undef Y1H
   267#undef Z1L
   268#undef Z1H
   269#undef X2L
   270#undef X2H
   271#undef Y2L
   272#undef Y2H
   273#undef Z2L
   274#undef Z2H
   275#undef ONE
   276#undef IDX
   277#undef SEL1
   278#undef SEL2
   279
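// p256Select scans all 16 table entries (96 bytes each) and masks in only the
// one whose position matches idx, so the memory access pattern is independent
// of the secret index. A rough Go equivalent, with made-up names and the point
// flattened to 12 limbs (the asm does the comparison with VCEQG, not a branch):
//
//	func selectModel(table *[16][12]uint64, idx int) (res [12]uint64) {
//		for i := range table {
//			mask := uint64(0)
//			if i+1 == idx { // entries are stored with a -1 offset
//				mask = ^uint64(0)
//			}
//			for j := range res {
//				res[j] |= table[i][j] & mask
//			}
//		}
//		return res
//	}
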
   280// ---------------------------------------
   281
   282//  func p256FromMont(res, in *p256Element)
   283#define res_ptr R1
   284#define x_ptr   R2
   285#define CPOOL   R4
   286
   287#define T0   V0
   288#define T1   V1
   289#define T2   V2
   290#define TT0  V3
   291#define TT1  V4
   292
   293#define ZER   V6
   294#define SEL1  V7
   295#define SEL2  V8
   296#define CAR1  V9
   297#define CAR2  V10
   298#define RED1  V11
   299#define RED2  V12
   300#define PL    V13
   301#define PH    V14
   302
   303TEXT ·p256FromMont(SB), NOSPLIT, $0
   304	MOVD res+0(FP), res_ptr
   305	MOVD in+8(FP), x_ptr
   306
   307	VZERO T2
   308	VZERO ZER
   309	MOVD  $p256<>+0x00(SB), CPOOL
   310	VL    16(CPOOL), PL
   311	VL    0(CPOOL), PH
   312	VL    48(CPOOL), SEL2
   313	VL    64(CPOOL), SEL1
   314
   315	VL   (0*16)(x_ptr), T0
   316	VPDI $0x4, T0, T0, T0
   317	VL   (1*16)(x_ptr), T1
   318	VPDI $0x4, T1, T1, T1
   319
   320	// First round
   321	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   322	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   323	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   324
   325	VSLDB $8, T1, T0, T0
   326	VSLDB $8, T2, T1, T1
   327
   328	VACCQ  T0, RED1, CAR1
   329	VAQ    T0, RED1, T0
   330	VACCCQ T1, RED2, CAR1, CAR2
   331	VACQ   T1, RED2, CAR1, T1
   332	VAQ    T2, CAR2, T2
   333
   334	// Second round
   335	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   336	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   337	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   338
   339	VSLDB $8, T1, T0, T0
   340	VSLDB $8, T2, T1, T1
   341
   342	VACCQ  T0, RED1, CAR1
   343	VAQ    T0, RED1, T0
   344	VACCCQ T1, RED2, CAR1, CAR2
   345	VACQ   T1, RED2, CAR1, T1
   346	VAQ    T2, CAR2, T2
   347
   348	// Third round
   349	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   350	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   351	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   352
   353	VSLDB $8, T1, T0, T0
   354	VSLDB $8, T2, T1, T1
   355
   356	VACCQ  T0, RED1, CAR1
   357	VAQ    T0, RED1, T0
   358	VACCCQ T1, RED2, CAR1, CAR2
   359	VACQ   T1, RED2, CAR1, T1
   360	VAQ    T2, CAR2, T2
   361
   362	// Last round
   363	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   364	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   365	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   366
   367	VSLDB $8, T1, T0, T0
   368	VSLDB $8, T2, T1, T1
   369
   370	VACCQ  T0, RED1, CAR1
   371	VAQ    T0, RED1, T0
   372	VACCCQ T1, RED2, CAR1, CAR2
   373	VACQ   T1, RED2, CAR1, T1
   374	VAQ    T2, CAR2, T2
   375
   376	// ---------------------------------------------------
   377
   378	VSCBIQ  PL, T0, CAR1
   379	VSQ     PL, T0, TT0
   380	VSBCBIQ T1, PH, CAR1, CAR2
   381	VSBIQ   T1, PH, CAR1, TT1
   382	VSBIQ   T2, ZER, CAR2, T2
   383
   384	// what output to use, TT1||TT0 or T1||T0?
   385	VSEL T0, TT0, T2, T0
   386	VSEL T1, TT1, T2, T1
   387
   388	VPDI $0x4, T0, T0, TT0
   389	VST  TT0, (0*16)(res_ptr)
   390	VPDI $0x4, T1, T1, TT1
   391	VST  TT1, (1*16)(res_ptr)
   392	RET
   393
   394#undef res_ptr
   395#undef x_ptr
   396#undef CPOOL
   397#undef T0
   398#undef T1
   399#undef T2
   400#undef TT0
   401#undef TT1
   402#undef ZER
   403#undef SEL1
   404#undef SEL2
   405#undef CAR1
   406#undef CAR2
   407#undef RED1
   408#undef RED2
   409#undef PL
   410#undef PH
   411
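// Each of the four rounds above folds 64 bits of the value into the modulus,
// so the whole routine divides by R = 2^256 modulo P256, taking a value out of
// the Montgomery domain (the pool's "(1*2^256)%P256" entry is exactly 1 in
// that domain). A math/big model, with made-up names:
//
//	var (
//		p256P, _ = new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
//		p256R    = new(big.Int).Lsh(big.NewInt(1), 256) // R = 2^256
//	)
//
//	// fromMontModel returns in * R^-1 mod P, the value p256FromMont stores in res.
//	func fromMontModel(in *big.Int) *big.Int {
//		rInv := new(big.Int).ModInverse(p256R, p256P)
//		return rInv.Mod(rInv.Mul(in, rInv), p256P)
//	}
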
    412// Constant time table access
    413// Indexed from 1 to 32, with -1 offset
    414// (index 0 is implicitly point at infinity)
    415// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
    416// (formerly: func p256SelectBase(point *p256Point, table []p256Point, idx int))
   417
   418#define P3ptr   R1
   419#define P1ptr   R2
   420#define COUNT   R4
   421#define CPOOL   R5
   422
   423#define X1L    V0
   424#define X1H    V1
   425#define Y1L    V2
   426#define Y1H    V3
   427#define Z1L    V4
   428#define Z1H    V5
   429#define X2L    V6
   430#define X2H    V7
   431#define Y2L    V8
   432#define Y2H    V9
   433#define Z2L    V10
   434#define Z2H    V11
   435#define LE2BE  V12
   436
   437#define ONE   V18
   438#define IDX   V19
   439#define SEL1  V20
   440#define SEL2  V21
   441
   442TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   443	MOVD   res+0(FP), P3ptr
   444	MOVD   table+8(FP), P1ptr
   445	MOVD   $p256<>+0x00(SB), CPOOL
   446	VLREPB idx+(16+7)(FP), IDX
   447	VREPIB $1, ONE
   448	VREPIB $1, SEL2
   449	MOVD   $1, COUNT
   450	VL     80(CPOOL), LE2BE
   451
   452	VZERO X1H
   453	VZERO X1L
   454	VZERO Y1H
   455	VZERO Y1L
   456
   457loop_select:
   458	VL 0(P1ptr), X2H
   459	VL 16(P1ptr), X2L
   460	VL 32(P1ptr), Y2H
   461	VL 48(P1ptr), Y2L
   462
   463	VCEQG SEL2, IDX, SEL1
   464
   465	VSEL X2L, X1L, SEL1, X1L
   466	VSEL X2H, X1H, SEL1, X1H
   467	VSEL Y2L, Y1L, SEL1, Y1L
   468	VSEL Y2H, Y1H, SEL1, Y1H
   469
   470	VAB  SEL2, ONE, SEL2
   471	ADDW $1, COUNT
   472	ADD  $64, P1ptr
   473	CMPW COUNT, $33 // len(p256AffineTable) + 1
   474	BLT  loop_select
   475	VST  X1H, 0(P3ptr)
   476	VST  X1L, 16(P3ptr)
   477	VST  Y1H, 32(P3ptr)
   478	VST  Y1L, 48(P3ptr)
   479
   480	RET
   481
   482#undef P3ptr
   483#undef P1ptr
   484#undef COUNT
   485#undef X1L
   486#undef X1H
   487#undef Y1L
   488#undef Y1H
   489#undef Z1L
   490#undef Z1H
   491#undef X2L
   492#undef X2H
   493#undef Y2L
   494#undef Y2H
   495#undef Z2L
   496#undef Z2H
   497#undef ONE
   498#undef IDX
   499#undef SEL1
   500#undef SEL2
   501#undef CPOOL
   502
   503// ---------------------------------------
   504
   505// func p256OrdMul(res, in1, in2 *p256OrdElement)
   506#define res_ptr R1
   507#define x_ptr R2
   508#define y_ptr R3
   509#define X0    V0
   510#define X1    V1
   511#define Y0    V2
   512#define Y1    V3
   513#define M0    V4
   514#define M1    V5
   515#define T0    V6
   516#define T1    V7
   517#define T2    V8
   518#define YDIG  V9
   519
   520#define ADD1  V16
   521#define ADD1H V17
   522#define ADD2  V18
   523#define ADD2H V19
   524#define RED1  V20
   525#define RED1H V21
   526#define RED2  V22
   527#define RED2H V23
   528#define CAR1  V24
   529#define CAR1M V25
   530
   531#define MK0   V30
   532#define K0    V31
   533TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
   534	MOVD res+0(FP), res_ptr
   535	MOVD in1+8(FP), x_ptr
   536	MOVD in2+16(FP), y_ptr
   537
   538	VZERO T2
   539	MOVD  $p256ordK0<>+0x00(SB), R4
   540
   541	// VLEF    $3, 0(R4), K0
   542	WORD $0xE7F40000
   543	BYTE $0x38
   544	BYTE $0x03
   545	MOVD $p256ord<>+0x00(SB), R4
   546	VL   16(R4), M0
   547	VL   0(R4), M1
   548
   549	VL   (0*16)(x_ptr), X0
   550	VPDI $0x4, X0, X0, X0
   551	VL   (1*16)(x_ptr), X1
   552	VPDI $0x4, X1, X1, X1
   553	VL   (0*16)(y_ptr), Y0
   554	VPDI $0x4, Y0, Y0, Y0
   555	VL   (1*16)(y_ptr), Y1
   556	VPDI $0x4, Y1, Y1, Y1
   557
   558	// ---------------------------------------------------------------------------/
   559	VREPF $3, Y0, YDIG
   560	VMLF  X0, YDIG, ADD1
   561	VMLF  ADD1, K0, MK0
   562	VREPF $3, MK0, MK0
   563
   564	VMLF  X1, YDIG, ADD2
   565	VMLHF X0, YDIG, ADD1H
   566	VMLHF X1, YDIG, ADD2H
   567
   568	VMALF  M0, MK0, ADD1, RED1
   569	VMALHF M0, MK0, ADD1, RED1H
   570	VMALF  M1, MK0, ADD2, RED2
   571	VMALHF M1, MK0, ADD2, RED2H
   572
   573	VSLDB $12, RED2, RED1, RED1
   574	VSLDB $12, T2, RED2, RED2
   575
   576	VACCQ RED1, ADD1H, CAR1
   577	VAQ   RED1, ADD1H, T0
   578	VACCQ RED1H, T0, CAR1M
   579	VAQ   RED1H, T0, T0
   580
   581	// << ready for next MK0
   582
   583	VACQ   RED2, ADD2H, CAR1, T1
   584	VACCCQ RED2, ADD2H, CAR1, CAR1
   585	VACCCQ RED2H, T1, CAR1M, T2
   586	VACQ   RED2H, T1, CAR1M, T1
   587	VAQ    CAR1, T2, T2
   588
   589	// ---------------------------------------------------
   590/* *
   591 * ---+--------+--------+
   592 *  T2|   T1   |   T0   |
   593 * ---+--------+--------+
   594 *           *(add)*
   595 *    +--------+--------+
   596 *    |   X1   |   X0   |
   597 *    +--------+--------+
   598 *           *(mul)*
   599 *    +--------+--------+
   600 *    |  YDIG  |  YDIG  |
   601 *    +--------+--------+
   602 *           *(add)*
   603 *    +--------+--------+
   604 *    |   M1   |   M0   |
   605 *    +--------+--------+
   606 *           *(mul)*
   607 *    +--------+--------+
   608 *    |   MK0  |   MK0  |
   609 *    +--------+--------+
   610 *
   611 *   ---------------------
   612 *
   613 *    +--------+--------+
   614 *    |  ADD2  |  ADD1  |
   615 *    +--------+--------+
   616 *  +--------+--------+
   617 *  | ADD2H  | ADD1H  |
   618 *  +--------+--------+
   619 *    +--------+--------+
   620 *    |  RED2  |  RED1  |
   621 *    +--------+--------+
   622 *  +--------+--------+
   623 *  | RED2H  | RED1H  |
   624 *  +--------+--------+
   625 */
   626	VREPF $2, Y0, YDIG
   627	VMALF X0, YDIG, T0, ADD1
   628	VMLF  ADD1, K0, MK0
   629	VREPF $3, MK0, MK0
   630
   631	VMALF  X1, YDIG, T1, ADD2
   632	VMALHF X0, YDIG, T0, ADD1H
   633	VMALHF X1, YDIG, T1, ADD2H
   634
   635	VMALF  M0, MK0, ADD1, RED1
   636	VMALHF M0, MK0, ADD1, RED1H
   637	VMALF  M1, MK0, ADD2, RED2
   638	VMALHF M1, MK0, ADD2, RED2H
   639
   640	VSLDB $12, RED2, RED1, RED1
   641	VSLDB $12, T2, RED2, RED2
   642
   643	VACCQ RED1, ADD1H, CAR1
   644	VAQ   RED1, ADD1H, T0
   645	VACCQ RED1H, T0, CAR1M
   646	VAQ   RED1H, T0, T0
   647
   648	// << ready for next MK0
   649
   650	VACQ   RED2, ADD2H, CAR1, T1
   651	VACCCQ RED2, ADD2H, CAR1, CAR1
   652	VACCCQ RED2H, T1, CAR1M, T2
   653	VACQ   RED2H, T1, CAR1M, T1
   654	VAQ    CAR1, T2, T2
   655
   656	// ---------------------------------------------------
   657	VREPF $1, Y0, YDIG
   658	VMALF X0, YDIG, T0, ADD1
   659	VMLF  ADD1, K0, MK0
   660	VREPF $3, MK0, MK0
   661
   662	VMALF  X1, YDIG, T1, ADD2
   663	VMALHF X0, YDIG, T0, ADD1H
   664	VMALHF X1, YDIG, T1, ADD2H
   665
   666	VMALF  M0, MK0, ADD1, RED1
   667	VMALHF M0, MK0, ADD1, RED1H
   668	VMALF  M1, MK0, ADD2, RED2
   669	VMALHF M1, MK0, ADD2, RED2H
   670
   671	VSLDB $12, RED2, RED1, RED1
   672	VSLDB $12, T2, RED2, RED2
   673
   674	VACCQ RED1, ADD1H, CAR1
   675	VAQ   RED1, ADD1H, T0
   676	VACCQ RED1H, T0, CAR1M
   677	VAQ   RED1H, T0, T0
   678
   679	// << ready for next MK0
   680
   681	VACQ   RED2, ADD2H, CAR1, T1
   682	VACCCQ RED2, ADD2H, CAR1, CAR1
   683	VACCCQ RED2H, T1, CAR1M, T2
   684	VACQ   RED2H, T1, CAR1M, T1
   685	VAQ    CAR1, T2, T2
   686
   687	// ---------------------------------------------------
   688	VREPF $0, Y0, YDIG
   689	VMALF X0, YDIG, T0, ADD1
   690	VMLF  ADD1, K0, MK0
   691	VREPF $3, MK0, MK0
   692
   693	VMALF  X1, YDIG, T1, ADD2
   694	VMALHF X0, YDIG, T0, ADD1H
   695	VMALHF X1, YDIG, T1, ADD2H
   696
   697	VMALF  M0, MK0, ADD1, RED1
   698	VMALHF M0, MK0, ADD1, RED1H
   699	VMALF  M1, MK0, ADD2, RED2
   700	VMALHF M1, MK0, ADD2, RED2H
   701
   702	VSLDB $12, RED2, RED1, RED1
   703	VSLDB $12, T2, RED2, RED2
   704
   705	VACCQ RED1, ADD1H, CAR1
   706	VAQ   RED1, ADD1H, T0
   707	VACCQ RED1H, T0, CAR1M
   708	VAQ   RED1H, T0, T0
   709
   710	// << ready for next MK0
   711
   712	VACQ   RED2, ADD2H, CAR1, T1
   713	VACCCQ RED2, ADD2H, CAR1, CAR1
   714	VACCCQ RED2H, T1, CAR1M, T2
   715	VACQ   RED2H, T1, CAR1M, T1
   716	VAQ    CAR1, T2, T2
   717
   718	// ---------------------------------------------------
   719	VREPF $3, Y1, YDIG
   720	VMALF X0, YDIG, T0, ADD1
   721	VMLF  ADD1, K0, MK0
   722	VREPF $3, MK0, MK0
   723
   724	VMALF  X1, YDIG, T1, ADD2
   725	VMALHF X0, YDIG, T0, ADD1H
   726	VMALHF X1, YDIG, T1, ADD2H
   727
   728	VMALF  M0, MK0, ADD1, RED1
   729	VMALHF M0, MK0, ADD1, RED1H
   730	VMALF  M1, MK0, ADD2, RED2
   731	VMALHF M1, MK0, ADD2, RED2H
   732
   733	VSLDB $12, RED2, RED1, RED1
   734	VSLDB $12, T2, RED2, RED2
   735
   736	VACCQ RED1, ADD1H, CAR1
   737	VAQ   RED1, ADD1H, T0
   738	VACCQ RED1H, T0, CAR1M
   739	VAQ   RED1H, T0, T0
   740
   741	// << ready for next MK0
   742
   743	VACQ   RED2, ADD2H, CAR1, T1
   744	VACCCQ RED2, ADD2H, CAR1, CAR1
   745	VACCCQ RED2H, T1, CAR1M, T2
   746	VACQ   RED2H, T1, CAR1M, T1
   747	VAQ    CAR1, T2, T2
   748
   749	// ---------------------------------------------------
   750	VREPF $2, Y1, YDIG
   751	VMALF X0, YDIG, T0, ADD1
   752	VMLF  ADD1, K0, MK0
   753	VREPF $3, MK0, MK0
   754
   755	VMALF  X1, YDIG, T1, ADD2
   756	VMALHF X0, YDIG, T0, ADD1H
   757	VMALHF X1, YDIG, T1, ADD2H
   758
   759	VMALF  M0, MK0, ADD1, RED1
   760	VMALHF M0, MK0, ADD1, RED1H
   761	VMALF  M1, MK0, ADD2, RED2
   762	VMALHF M1, MK0, ADD2, RED2H
   763
   764	VSLDB $12, RED2, RED1, RED1
   765	VSLDB $12, T2, RED2, RED2
   766
   767	VACCQ RED1, ADD1H, CAR1
   768	VAQ   RED1, ADD1H, T0
   769	VACCQ RED1H, T0, CAR1M
   770	VAQ   RED1H, T0, T0
   771
   772	// << ready for next MK0
   773
   774	VACQ   RED2, ADD2H, CAR1, T1
   775	VACCCQ RED2, ADD2H, CAR1, CAR1
   776	VACCCQ RED2H, T1, CAR1M, T2
   777	VACQ   RED2H, T1, CAR1M, T1
   778	VAQ    CAR1, T2, T2
   779
   780	// ---------------------------------------------------
   781	VREPF $1, Y1, YDIG
   782	VMALF X0, YDIG, T0, ADD1
   783	VMLF  ADD1, K0, MK0
   784	VREPF $3, MK0, MK0
   785
   786	VMALF  X1, YDIG, T1, ADD2
   787	VMALHF X0, YDIG, T0, ADD1H
   788	VMALHF X1, YDIG, T1, ADD2H
   789
   790	VMALF  M0, MK0, ADD1, RED1
   791	VMALHF M0, MK0, ADD1, RED1H
   792	VMALF  M1, MK0, ADD2, RED2
   793	VMALHF M1, MK0, ADD2, RED2H
   794
   795	VSLDB $12, RED2, RED1, RED1
   796	VSLDB $12, T2, RED2, RED2
   797
   798	VACCQ RED1, ADD1H, CAR1
   799	VAQ   RED1, ADD1H, T0
   800	VACCQ RED1H, T0, CAR1M
   801	VAQ   RED1H, T0, T0
   802
   803	// << ready for next MK0
   804
   805	VACQ   RED2, ADD2H, CAR1, T1
   806	VACCCQ RED2, ADD2H, CAR1, CAR1
   807	VACCCQ RED2H, T1, CAR1M, T2
   808	VACQ   RED2H, T1, CAR1M, T1
   809	VAQ    CAR1, T2, T2
   810
   811	// ---------------------------------------------------
   812	VREPF $0, Y1, YDIG
   813	VMALF X0, YDIG, T0, ADD1
   814	VMLF  ADD1, K0, MK0
   815	VREPF $3, MK0, MK0
   816
   817	VMALF  X1, YDIG, T1, ADD2
   818	VMALHF X0, YDIG, T0, ADD1H
   819	VMALHF X1, YDIG, T1, ADD2H
   820
   821	VMALF  M0, MK0, ADD1, RED1
   822	VMALHF M0, MK0, ADD1, RED1H
   823	VMALF  M1, MK0, ADD2, RED2
   824	VMALHF M1, MK0, ADD2, RED2H
   825
   826	VSLDB $12, RED2, RED1, RED1
   827	VSLDB $12, T2, RED2, RED2
   828
   829	VACCQ RED1, ADD1H, CAR1
   830	VAQ   RED1, ADD1H, T0
   831	VACCQ RED1H, T0, CAR1M
   832	VAQ   RED1H, T0, T0
   833
   834	// << ready for next MK0
   835
   836	VACQ   RED2, ADD2H, CAR1, T1
   837	VACCCQ RED2, ADD2H, CAR1, CAR1
   838	VACCCQ RED2H, T1, CAR1M, T2
   839	VACQ   RED2H, T1, CAR1M, T1
   840	VAQ    CAR1, T2, T2
   841
   842	// ---------------------------------------------------
   843
   844	VZERO   RED1
   845	VSCBIQ  M0, T0, CAR1
   846	VSQ     M0, T0, ADD1
   847	VSBCBIQ T1, M1, CAR1, CAR1M
   848	VSBIQ   T1, M1, CAR1, ADD2
   849	VSBIQ   T2, RED1, CAR1M, T2
   850
   851	// what output to use, ADD2||ADD1 or T1||T0?
   852	VSEL T0, ADD1, T2, T0
   853	VSEL T1, ADD2, T2, T1
   854
   855	VPDI $0x4, T0, T0, T0
   856	VST  T0, (0*16)(res_ptr)
   857	VPDI $0x4, T1, T1, T1
   858	VST  T1, (1*16)(res_ptr)
   859	RET
   860
   861#undef res_ptr
   862#undef x_ptr
   863#undef y_ptr
   864#undef X0
   865#undef X1
   866#undef Y0
   867#undef Y1
   868#undef M0
   869#undef M1
   870#undef T0
   871#undef T1
   872#undef T2
   873#undef YDIG
   874
   875#undef ADD1
   876#undef ADD1H
   877#undef ADD2
   878#undef ADD2H
   879#undef RED1
   880#undef RED1H
   881#undef RED2
   882#undef RED2H
   883#undef CAR1
   884#undef CAR1M
   885
   886#undef MK0
   887#undef K0
   888
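// The eight rounds above are a 32-bit-digit Montgomery multiplication modulo
// the group order n: each round adds X*y_i, then adds a multiple m*n chosen so
// that the low 32 bits become zero (m = (t mod 2^32) * k0 mod 2^32, where k0 =
// 0xee00bc4f from p256ordK0 appears to be -n^-1 mod 2^32), and shifts right by
// 32 bits. The net effect, as a math/big model with made-up names:
//
//	var p256N, _ = new(big.Int).SetString("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16)
//
//	// ordMulModel returns in1 * in2 * 2^-256 mod n, the value p256OrdMul stores in res.
//	func ordMulModel(in1, in2 *big.Int) *big.Int {
//		rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p256N)
//		t := new(big.Int).Mul(in1, in2)
//		return t.Mod(t.Mul(t, rInv), p256N)
//	}
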
   889// ---------------------------------------
   890// p256MulInternal
   891// V0-V3,V30,V31 - Not Modified
   892// V4-V15 - Volatile
   893
   894#define CPOOL   R4
   895
   896// Parameters
   897#define X0    V0 // Not modified
   898#define X1    V1 // Not modified
   899#define Y0    V2 // Not modified
   900#define Y1    V3 // Not modified
   901#define T0    V4
   902#define T1    V5
   903#define P0    V30 // Not modified
   904#define P1    V31 // Not modified
   905
   906// Temporaries
   907#define YDIG  V6 // Overloaded with CAR2, ZER
   908#define ADD1H V7 // Overloaded with ADD3H
   909#define ADD2H V8 // Overloaded with ADD4H
   910#define ADD3  V9 // Overloaded with SEL2,SEL5
   911#define ADD4  V10 // Overloaded with SEL3,SEL6
   912#define RED1  V11 // Overloaded with CAR2
   913#define RED2  V12
   914#define RED3  V13 // Overloaded with SEL1
   915#define T2    V14
   916// Overloaded temporaries
   917#define ADD1  V4 // Overloaded with T0
   918#define ADD2  V5 // Overloaded with T1
   919#define ADD3H V7 // Overloaded with ADD1H
   920#define ADD4H V8 // Overloaded with ADD2H
   921#define ZER   V6 // Overloaded with YDIG, CAR2
   922#define CAR1  V6 // Overloaded with YDIG, ZER
   923#define CAR2  V11 // Overloaded with RED1
   924// Constant Selects
   925#define SEL1  V13 // Overloaded with RED3
   926#define SEL2  V9 // Overloaded with ADD3,SEL5
   927#define SEL3  V10 // Overloaded with ADD4,SEL6
   928#define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   929#define SEL5  V9 // Overloaded with ADD3,SEL2
   930#define SEL6  V10 // Overloaded with ADD4,SEL3
   931
   932/* *
   933 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   934 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   935 * With you, SIMD be...
   936 *
   937 *                                           +--------+--------+
   938 *                                  +--------|  RED2  |  RED1  |
   939 *                                  |        +--------+--------+
   940 *                                  |       ---+--------+--------+
   941 *                                  |  +---- T2|   T1   |   T0   |--+
   942 *                                  |  |    ---+--------+--------+  |
   943 *                                  |  |                            |
   944 *                                  |  |    ======================= |
   945 *                                  |  |                            |
   946 *                                  |  |       +--------+--------+<-+
   947 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   948 *                                  |  |       +--------+--------+  |     |
   949 *                                  |  |     +--------+--------+<---+     |
   950 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   951 *                                  |  |     +--------+--------+  |       |
   952 *                                  |  |     +--------+--------+<-+       |
   953 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   954 *                                  |  |     +--------+--------+  | |     |
   955 *                                  |  |   +--------+--------+<---+ |     |
   956 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   957 *                                  |  |   +--------+--------+      | |   V
   958 *                                  |  | ------------------------   | | +--------+
   959 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   960 *                                  |  |                            | | +--------+
   961 *                                  |  +---->+--------+--------+    | |   |
   962 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   963 *                                  |        +--------+--------+    | |   |
   964 *                                  +---->---+--------+--------+    | |   |
   965 *                                         T2|   T1   |   T0   |----+ |   |
   966 *                                        ---+--------+--------+    | |   |
   967 *                                        ---+--------+--------+<---+ |   |
   968 *                                    +--- T2|   T1   |   T0   |----------+
   969 *                                    |   ---+--------+--------+      |   |
   970 *                                    |  +--------+--------+<-------------+
   971 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   972 *                                    |  +--------+--------+     |    |   |
   973 *                                    |  +--------+<----------------------+
   974 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   975 *                                    |  +--------+              |    |
   976 *                                    +--->+--------+--------+   |    |
   977 *                                         |   T1   |   T0   |--------+
   978 *                                         +--------+--------+   |    |
   979 *                                   --------------------------- |    |
   980 *                                                               |    |
   981 *                                       +--------+--------+<----+    |
   982 *                                       |  RED2  |  RED1  |          |
   983 *                                       +--------+--------+          |
   984 *                                      ---+--------+--------+<-------+
   985 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   986 *                                      ---+--------+--------+
   987 *
    988 *                                                                *My 21st-century work of art @vpaprots
   989 *
   990 *
   991 * First group is special, doesn't get the two inputs:
   992 *                                             +--------+--------+<-+
   993 *                                     +-------|  ADD2  |  ADD1  |--|-----+
   994 *                                     |       +--------+--------+  |     |
   995 *                                     |     +--------+--------+<---+     |
   996 *                                     |     | ADD2H  | ADD1H  |--+       |
   997 *                                     |     +--------+--------+  |       |
   998 *                                     |     +--------+--------+<-+       |
   999 *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1000 *                                     |     +--------+--------+  | |     |
  1001 *                                     |   +--------+--------+<---+ |     |
  1002 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1003 *                                     |   +--------+--------+      | |   V
  1004 *                                     | ------------------------   | | +--------+
  1005 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1006 *                                     |                            | | +--------+
  1007 *                                     +---->+--------+--------+    | |   |
  1008 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1009 *                                           +--------+--------+    | |   |
  1010 *                                        ---+--------+--------+<---+ |   |
  1011 *                                    +--- T2|   T1   |   T0   |----------+
  1012 *                                    |   ---+--------+--------+      |   |
  1013 *                                    |  +--------+--------+<-------------+
  1014 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1015 *                                    |  +--------+--------+     |    |   |
  1016 *                                    |  +--------+<----------------------+
  1017 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1018 *                                    |  +--------+              |    |
  1019 *                                    +--->+--------+--------+   |    |
  1020 *                                         |   T1   |   T0   |--------+
  1021 *                                         +--------+--------+   |    |
  1022 *                                   --------------------------- |    |
  1023 *                                                               |    |
  1024 *                                       +--------+--------+<----+    |
  1025 *                                       |  RED2  |  RED1  |          |
  1026 *                                       +--------+--------+          |
  1027 *                                      ---+--------+--------+<-------+
  1028 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1029 *                                      ---+--------+--------+
  1030 *
   1031 * The last 'group' needs RED2||RED1 to be shifted by less.
  1032 */
  1033TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
  1034	VL 32(CPOOL), SEL1
  1035	VL 48(CPOOL), SEL2
  1036	VL 64(CPOOL), SEL3
  1037	VL 80(CPOOL), SEL4
  1038
  1039	// ---------------------------------------------------
  1040
  1041	VREPF $3, Y0, YDIG
  1042	VMLHF X0, YDIG, ADD1H
  1043	VMLHF X1, YDIG, ADD2H
  1044	VMLF  X0, YDIG, ADD1
  1045	VMLF  X1, YDIG, ADD2
  1046
  1047	VREPF  $2, Y0, YDIG
  1048	VMALF  X0, YDIG, ADD1H, ADD3
  1049	VMALF  X1, YDIG, ADD2H, ADD4
  1050	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1051	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1052
  1053	VZERO ZER
  1054	VL    32(CPOOL), SEL1
  1055	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1056
  1057	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1058	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1059
  1060	VACCQ  T0, ADD3, CAR1
  1061	VAQ    T0, ADD3, T0       // ADD3 Free
  1062	VACCCQ T1, ADD4, CAR1, T2
  1063	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1064
  1065	VL    48(CPOOL), SEL2
  1066	VL    64(CPOOL), SEL3
  1067	VL    80(CPOOL), SEL4
  1068	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1069	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1070	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1071	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1072
  1073	VSLDB $12, T1, T0, T0
  1074	VSLDB $12, T2, T1, T1
  1075
  1076	VACCQ  T0, ADD3H, CAR1
  1077	VAQ    T0, ADD3H, T0
  1078	VACCCQ T1, ADD4H, CAR1, T2
  1079	VACQ   T1, ADD4H, CAR1, T1
  1080
  1081	// ---------------------------------------------------
  1082
  1083	VREPF  $1, Y0, YDIG
  1084	VMALHF X0, YDIG, T0, ADD1H
  1085	VMALHF X1, YDIG, T1, ADD2H
  1086	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1087	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1088
  1089	VREPF  $0, Y0, YDIG
  1090	VMALF  X0, YDIG, ADD1H, ADD3
  1091	VMALF  X1, YDIG, ADD2H, ADD4
  1092	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1093	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1094
  1095	VZERO ZER
  1096	VL    32(CPOOL), SEL1
  1097	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1098
  1099	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1100	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1101
  1102	VACCQ  T0, RED1, CAR1
  1103	VAQ    T0, RED1, T0
  1104	VACCCQ T1, RED2, CAR1, T2
  1105	VACQ   T1, RED2, CAR1, T1
  1106
  1107	VACCQ  T0, ADD3, CAR1
  1108	VAQ    T0, ADD3, T0
  1109	VACCCQ T1, ADD4, CAR1, CAR2
  1110	VACQ   T1, ADD4, CAR1, T1
  1111	VAQ    T2, CAR2, T2
  1112
  1113	VL    48(CPOOL), SEL2
  1114	VL    64(CPOOL), SEL3
  1115	VL    80(CPOOL), SEL4
  1116	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1117	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1118	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1119	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1120
  1121	VSLDB $12, T1, T0, T0
  1122	VSLDB $12, T2, T1, T1
  1123
  1124	VACCQ  T0, ADD3H, CAR1
  1125	VAQ    T0, ADD3H, T0
  1126	VACCCQ T1, ADD4H, CAR1, T2
  1127	VACQ   T1, ADD4H, CAR1, T1
  1128
  1129	// ---------------------------------------------------
  1130
  1131	VREPF  $3, Y1, YDIG
  1132	VMALHF X0, YDIG, T0, ADD1H
  1133	VMALHF X1, YDIG, T1, ADD2H
  1134	VMALF  X0, YDIG, T0, ADD1
  1135	VMALF  X1, YDIG, T1, ADD2
  1136
  1137	VREPF  $2, Y1, YDIG
  1138	VMALF  X0, YDIG, ADD1H, ADD3
  1139	VMALF  X1, YDIG, ADD2H, ADD4
  1140	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1141	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1142
  1143	VZERO ZER
  1144	VL    32(CPOOL), SEL1
  1145	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1146
  1147	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1148	VSLDB $12, T2, ADD2, T1   // ADD2 Free
  1149
  1150	VACCQ  T0, RED1, CAR1
  1151	VAQ    T0, RED1, T0
  1152	VACCCQ T1, RED2, CAR1, T2
  1153	VACQ   T1, RED2, CAR1, T1
  1154
  1155	VACCQ  T0, ADD3, CAR1
  1156	VAQ    T0, ADD3, T0
  1157	VACCCQ T1, ADD4, CAR1, CAR2
  1158	VACQ   T1, ADD4, CAR1, T1
  1159	VAQ    T2, CAR2, T2
  1160
  1161	VL    48(CPOOL), SEL2
  1162	VL    64(CPOOL), SEL3
  1163	VL    80(CPOOL), SEL4
  1164	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1165	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1166	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1167	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1168
  1169	VSLDB $12, T1, T0, T0
  1170	VSLDB $12, T2, T1, T1
  1171
  1172	VACCQ  T0, ADD3H, CAR1
  1173	VAQ    T0, ADD3H, T0
  1174	VACCCQ T1, ADD4H, CAR1, T2
  1175	VACQ   T1, ADD4H, CAR1, T1
  1176
  1177	// ---------------------------------------------------
  1178
  1179	VREPF  $1, Y1, YDIG
  1180	VMALHF X0, YDIG, T0, ADD1H
  1181	VMALHF X1, YDIG, T1, ADD2H
  1182	VMALF  X0, YDIG, T0, ADD1
  1183	VMALF  X1, YDIG, T1, ADD2
  1184
  1185	VREPF  $0, Y1, YDIG
  1186	VMALF  X0, YDIG, ADD1H, ADD3
  1187	VMALF  X1, YDIG, ADD2H, ADD4
  1188	VMALHF X0, YDIG, ADD1H, ADD3H
  1189	VMALHF X1, YDIG, ADD2H, ADD4H
  1190
  1191	VZERO ZER
  1192	VL    32(CPOOL), SEL1
  1193	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1194
  1195	VSLDB $12, ADD2, ADD1, T0
  1196	VSLDB $12, T2, ADD2, T1
  1197
  1198	VACCQ  T0, RED1, CAR1
  1199	VAQ    T0, RED1, T0
  1200	VACCCQ T1, RED2, CAR1, T2
  1201	VACQ   T1, RED2, CAR1, T1
  1202
  1203	VACCQ  T0, ADD3, CAR1
  1204	VAQ    T0, ADD3, T0
  1205	VACCCQ T1, ADD4, CAR1, CAR2
  1206	VACQ   T1, ADD4, CAR1, T1
  1207	VAQ    T2, CAR2, T2
  1208
  1209	VL    96(CPOOL), SEL5
  1210	VL    112(CPOOL), SEL6
  1211	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
  1212	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
  1213	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
  1214
  1215	VSLDB $12, T1, T0, T0
  1216	VSLDB $12, T2, T1, T1
  1217
  1218	VACCQ  T0, ADD3H, CAR1
  1219	VAQ    T0, ADD3H, T0
  1220	VACCCQ T1, ADD4H, CAR1, T2
  1221	VACQ   T1, ADD4H, CAR1, T1
  1222
  1223	VACCQ  T0, RED1, CAR1
  1224	VAQ    T0, RED1, T0
  1225	VACCCQ T1, RED2, CAR1, CAR2
  1226	VACQ   T1, RED2, CAR1, T1
  1227	VAQ    T2, CAR2, T2
  1228
  1229	// ---------------------------------------------------
  1230
  1231	VZERO   RED3
  1232	VSCBIQ  P0, T0, CAR1
  1233	VSQ     P0, T0, ADD1H
  1234	VSBCBIQ T1, P1, CAR1, CAR2
  1235	VSBIQ   T1, P1, CAR1, ADD2H
  1236	VSBIQ   T2, RED3, CAR2, T2
  1237
  1238	// what output to use, ADD2H||ADD1H or T1||T0?
  1239	VSEL T0, ADD1H, T2, T0
  1240	VSEL T1, ADD2H, T2, T1
  1241	RET
  1242
  1243#undef CPOOL
  1244
  1245#undef X0
  1246#undef X1
  1247#undef Y0
  1248#undef Y1
  1249#undef T0
  1250#undef T1
  1251#undef P0
  1252#undef P1
  1253
  1254#undef SEL1
  1255#undef SEL2
  1256#undef SEL3
  1257#undef SEL4
  1258#undef SEL5
  1259#undef SEL6
  1260
  1261#undef YDIG
  1262#undef ADD1H
  1263#undef ADD2H
  1264#undef ADD3
  1265#undef ADD4
  1266#undef RED1
  1267#undef RED2
  1268#undef RED3
  1269#undef T2
  1270#undef ADD1
  1271#undef ADD2
  1272#undef ADD3H
  1273#undef ADD4H
  1274#undef ZER
  1275#undef CAR1
  1276#undef CAR2
  1277
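// The tail of p256MulInternal (from VSCBIQ P0, T0 on) is a single conditional
// subtraction: compute T - P while tracking the borrow, then use the borrow to
// select either T or T - P, so the result is reduced once without branching on
// secret data. A two-limb Go sketch of the same pattern (made-up helper,
// assumes math/bits is imported):
//
//	func reduceOnce(t0, t1, p0, p1 uint64) (uint64, uint64) {
//		s0, borrow := bits.Sub64(t0, p0, 0)
//		s1, borrow := bits.Sub64(t1, p1, borrow)
//		mask := -borrow // all ones when t < p, zero when t >= p
//		return (t0 & mask) | (s0 &^ mask), (t1 & mask) | (s1 &^ mask)
//	}
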
  1278// ---------------------------------------
  1279
  1280// Parameters
  1281#define X0    V0
  1282#define X1    V1
  1283#define Y0    V2
  1284#define Y1    V3
  1285
  1286TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
  1287	VLR X0, Y0
  1288	VLR X1, Y1
  1289	BR  p256MulInternal<>(SB)
  1290
  1291#undef X0
  1292#undef X1
  1293#undef Y0
  1294#undef Y1
  1295
  1296#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1297	VZERO   ZER                \
  1298	VSCBIQ  Y0, X0, CAR1       \
  1299	VSQ     Y0, X0, T0         \
  1300	VSBCBIQ X1, Y1, CAR1, SEL1 \
  1301	VSBIQ   X1, Y1, CAR1, T1   \
  1302	VSQ     SEL1, ZER, SEL1    \
  1303	                           \
  1304	VACCQ   T0, PL, CAR1       \
  1305	VAQ     T0, PL, TT0        \
  1306	VACQ    T1, PH, CAR1, TT1  \
  1307	                           \
  1308	VSEL    T0, TT0, SEL1, T0  \
  1309	VSEL    T1, TT1, SEL1, T1  \
  1310
  1311#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1312	VACCQ   X0, Y0, CAR1        \
  1313	VAQ     X0, Y0, T0          \
  1314	VACCCQ  X1, Y1, CAR1, T2    \
  1315	VACQ    X1, Y1, CAR1, T1    \
  1316	                            \
  1317	VZERO   ZER                 \
  1318	VSCBIQ  PL, T0, CAR1        \
  1319	VSQ     PL, T0, TT0         \
  1320	VSBCBIQ T1, PH, CAR1, CAR2  \
  1321	VSBIQ   T1, PH, CAR1, TT1   \
  1322	VSBIQ   T2, ZER, CAR2, SEL1 \
  1323	                            \
  1324	VSEL    T0, TT0, SEL1, T0   \
  1325	VSEL    T1, TT1, SEL1, T1
  1326
  1327#define p256HalfInternal(T1, T0, X1, X0) \
  1328	VZERO  ZER                \
  1329	VSBIQ  ZER, ZER, X0, SEL1 \
  1330	                          \
  1331	VACCQ  X0, PL, CAR1       \
  1332	VAQ    X0, PL, T0         \
  1333	VACCCQ X1, PH, CAR1, T2   \
  1334	VACQ   X1, PH, CAR1, T1   \
  1335	                          \
  1336	VSEL   X0, T0, SEL1, T0   \
  1337	VSEL   X1, T1, SEL1, T1   \
  1338	VSEL   ZER, T2, SEL1, T2  \
  1339	                          \
  1340	VSLDB  $15, T2, ZER, TT1  \
  1341	VSLDB  $15, T1, ZER, TT0  \
  1342	VREPIB $1, SEL1           \
  1343	VSRL   SEL1, T0, T0       \
  1344	VSRL   SEL1, T1, T1       \
  1345	VREPIB $7, SEL1           \
  1346	VSL    SEL1, TT0, TT0     \
  1347	VSL    SEL1, TT1, TT1     \
  1348	VO     T0, TT0, T0        \
  1349	VO     T1, TT1, T1
  1350
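// Value-level models of the three macros above (variable-time math/big, only
// meant to make the point formulas below easier to follow; names are ours):
//
//	func subMod(x, y, p *big.Int) *big.Int { // p256SubInternal: x - y mod p
//		return new(big.Int).Mod(new(big.Int).Sub(x, y), p)
//	}
//
//	func addMod(x, y, p *big.Int) *big.Int { // p256AddInternal: x + y mod p
//		return new(big.Int).Mod(new(big.Int).Add(x, y), p)
//	}
//
//	func halfMod(x, p *big.Int) *big.Int { // p256HalfInternal: x/2 mod p, i.e. (x + (x odd ? p : 0)) >> 1
//		if x.Bit(0) == 1 {
//			x = new(big.Int).Add(x, p)
//		}
//		return new(big.Int).Rsh(x, 1)
//	}
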
  1351// ---------------------------------------
  1352// func p256Mul(res, in1, in2 *p256Element)
  1353#define res_ptr R1
  1354#define x_ptr   R2
  1355#define y_ptr   R3
  1356#define CPOOL   R4
  1357
  1358// Parameters
  1359#define X0    V0
  1360#define X1    V1
  1361#define Y0    V2
  1362#define Y1    V3
  1363#define T0    V4
  1364#define T1    V5
  1365
  1366// Constants
  1367#define P0    V30
  1368#define P1    V31
  1369TEXT ·p256Mul(SB), NOSPLIT, $0
  1370	MOVD res+0(FP), res_ptr
  1371	MOVD in1+8(FP), x_ptr
  1372	MOVD in2+16(FP), y_ptr
  1373
  1374	VL   (0*16)(x_ptr), X0
  1375	VPDI $0x4, X0, X0, X0
  1376	VL   (1*16)(x_ptr), X1
  1377	VPDI $0x4, X1, X1, X1
  1378	VL   (0*16)(y_ptr), Y0
  1379	VPDI $0x4, Y0, Y0, Y0
  1380	VL   (1*16)(y_ptr), Y1
  1381	VPDI $0x4, Y1, Y1, Y1
  1382
  1383	MOVD $p256mul<>+0x00(SB), CPOOL
  1384	VL   16(CPOOL), P0
  1385	VL   0(CPOOL), P1
  1386
  1387	CALL p256MulInternal<>(SB)
  1388
  1389	VPDI $0x4, T0, T0, T0
  1390	VST  T0, (0*16)(res_ptr)
  1391	VPDI $0x4, T1, T1, T1
  1392	VST  T1, (1*16)(res_ptr)
  1393	RET
  1394
  1395#undef res_ptr
  1396#undef x_ptr
  1397#undef y_ptr
  1398#undef CPOOL
  1399
  1400#undef X0
  1401#undef X1
  1402#undef Y0
  1403#undef Y1
  1404#undef T0
  1405#undef T1
  1406#undef P0
  1407#undef P1
  1408
  1409// ---------------------------------------
  1410//  func p256Sqr(res, in *p256Element, n int)
  1411#define res_ptr R1
  1412#define x_ptr   R2
  1413#define y_ptr   R3
  1414#define CPOOL   R4
  1415#define COUNT   R5
  1416#define N       R6
  1417
  1418// Parameters
  1419#define X0    V0
  1420#define X1    V1
  1421#define T0    V4
  1422#define T1    V5
  1423
  1424// Constants
  1425#define P0    V30
  1426#define P1    V31
  1427TEXT ·p256Sqr(SB), NOSPLIT, $0
  1428	MOVD res+0(FP), res_ptr
  1429	MOVD in+8(FP), x_ptr
  1430
  1431	VL   (0*16)(x_ptr), X0
  1432	VPDI $0x4, X0, X0, X0
  1433	VL   (1*16)(x_ptr), X1
  1434	VPDI $0x4, X1, X1, X1
  1435
  1436	MOVD $p256mul<>+0x00(SB), CPOOL
  1437	MOVD $0, COUNT
  1438	MOVD n+16(FP), N
  1439	VL   16(CPOOL), P0
  1440	VL   0(CPOOL), P1
  1441
  1442loop:
  1443	CALL p256SqrInternal<>(SB)
  1444	VLR  T0, X0
  1445	VLR  T1, X1
  1446	ADDW $1, COUNT
  1447	CMPW COUNT, N
  1448	BLT  loop
  1449
  1450	VPDI $0x4, T0, T0, T0
  1451	VST  T0, (0*16)(res_ptr)
  1452	VPDI $0x4, T1, T1, T1
  1453	VST  T1, (1*16)(res_ptr)
  1454	RET
  1455
  1456#undef res_ptr
  1457#undef x_ptr
  1458#undef y_ptr
  1459#undef CPOOL
  1460#undef COUNT
  1461#undef N
  1462
  1463#undef X0
  1464#undef X1
  1465#undef T0
  1466#undef T1
  1467#undef P0
  1468#undef P1
  1469
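// p256Sqr squares n times in a row, feeding each result back in as the next
// input, so (reading values through the Montgomery encoding) res = in^(2^n).
// Repeated-squaring runs like this are the building blocks of modular-inversion
// addition chains. A plain-integer model with made-up names:
//
//	func sqrTimesModel(in, p *big.Int, n int) *big.Int {
//		x := new(big.Int).Set(in)
//		for i := 0; i < n; i++ {
//			x.Mod(x.Mul(x, x), p)
//		}
//		return x
//	}
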
   1470// Point add with P2 being an affine point
   1471// If sign == 1 -> P2 = -P2
   1472// If sel == 0 -> P3 = P1
   1473// If zero == 0 -> P3 = P2
  1474// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1475#define P3ptr   R1
  1476#define P1ptr   R2
  1477#define P2ptr   R3
  1478#define CPOOL   R4
  1479
  1480// Temporaries in REGs
  1481#define Y2L    V15
  1482#define Y2H    V16
  1483#define T1L    V17
  1484#define T1H    V18
  1485#define T2L    V19
  1486#define T2H    V20
  1487#define T3L    V21
  1488#define T3H    V22
  1489#define T4L    V23
  1490#define T4H    V24
  1491
  1492// Temps for Sub and Add
  1493#define TT0  V11
  1494#define TT1  V12
  1495#define T2   V13
  1496
  1497// p256MulAsm Parameters
  1498#define X0    V0
  1499#define X1    V1
  1500#define Y0    V2
  1501#define Y1    V3
  1502#define T0    V4
  1503#define T1    V5
  1504
  1505#define PL    V30
  1506#define PH    V31
  1507
  1508// Names for zero/sel selects
  1509#define X1L    V0
  1510#define X1H    V1
  1511#define Y1L    V2 // p256MulAsmParmY
  1512#define Y1H    V3 // p256MulAsmParmY
  1513#define Z1L    V4
  1514#define Z1H    V5
  1515#define X2L    V0
  1516#define X2H    V1
  1517#define Z2L    V4
  1518#define Z2H    V5
  1519#define X3L    V17 // T1L
  1520#define X3H    V18 // T1H
  1521#define Y3L    V21 // T3L
  1522#define Y3H    V22 // T3H
  1523#define Z3L    V28
  1524#define Z3H    V29
  1525
  1526#define ZER   V6
  1527#define SEL1  V7
  1528#define CAR1  V8
  1529#define CAR2  V9
  1530/* *
  1531 * Three operand formula:
  1532 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1533 * T1 = Z1²
  1534 * T2 = T1*Z1
  1535 * T1 = T1*X2
  1536 * T2 = T2*Y2
  1537 * T1 = T1-X1
  1538 * T2 = T2-Y1
  1539 * Z3 = Z1*T1
  1540 * T3 = T1²
  1541 * T4 = T3*T1
  1542 * T3 = T3*X1
  1543 * T1 = 2*T3
  1544 * X3 = T2²
  1545 * X3 = X3-T1
  1546 * X3 = X3-T4
  1547 * T3 = T3-X3
  1548 * T3 = T3*T2
  1549 * T4 = T4*Y1
  1550 * Y3 = T3-T4
  1551
  1552 * Three operand formulas, but with MulInternal X,Y used to store temps
  1553X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1554X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1555X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1556X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1557SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1558SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1559X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1560X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1561X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1562X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1563ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1564X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1565SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1566SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1567SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1568X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1569X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1570SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1571
  1572	*/
  1573TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  1574	MOVD res+0(FP), P3ptr
  1575	MOVD in1+8(FP), P1ptr
  1576	MOVD in2+16(FP), P2ptr
  1577
  1578	MOVD $p256mul<>+0x00(SB), CPOOL
  1579	VL   16(CPOOL), PL
  1580	VL   0(CPOOL), PH
  1581
  1582	//	if (sign == 1) {
  1583	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1584	//	}
  1585
  1586	VL   48(P2ptr), Y2H
  1587	VPDI $0x4, Y2H, Y2H, Y2H
  1588	VL   32(P2ptr), Y2L
  1589	VPDI $0x4, Y2L, Y2L, Y2L
  1590
  1591	VLREPG sign+24(FP), SEL1
  1592	VZERO  ZER
  1593	VCEQG  SEL1, ZER, SEL1
  1594
  1595	VSCBIQ Y2L, PL, CAR1
  1596	VSQ    Y2L, PL, T1L
  1597	VSBIQ  PH, Y2H, CAR1, T1H
  1598
  1599	VSEL Y2L, T1L, SEL1, Y2L
  1600	VSEL Y2H, T1H, SEL1, Y2H
  1601
  1602/* *
  1603 * Three operand formula:
  1604 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1605 */
  1606	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1607	VL   80(P1ptr), X1       // Z1H
  1608	VPDI $0x4, X1, X1, X1
  1609	VL   64(P1ptr), X0       // Z1L
  1610	VPDI $0x4, X0, X0, X0
  1611	VLR  X0, Y0
  1612	VLR  X1, Y1
  1613	CALL p256SqrInternal<>(SB)
  1614
  1615	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1616	VLR  T0, X0
  1617	VLR  T1, X1
  1618	CALL p256MulInternal<>(SB)
  1619	VLR  T0, T2L
  1620	VLR  T1, T2H
  1621
  1622	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1623	VL   16(P2ptr), Y1       // X2H
  1624	VPDI $0x4, Y1, Y1, Y1
  1625	VL   0(P2ptr), Y0        // X2L
  1626	VPDI $0x4, Y0, Y0, Y0
  1627	CALL p256MulInternal<>(SB)
  1628	VLR  T0, T1L
  1629	VLR  T1, T1H
  1630
  1631	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1632	VLR  T2L, X0
  1633	VLR  T2H, X1
  1634	VLR  Y2L, Y0
  1635	VLR  Y2H, Y1
  1636	CALL p256MulInternal<>(SB)
  1637
  1638	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1639	VL   48(P1ptr), Y1H
  1640	VPDI $0x4, Y1H, Y1H, Y1H
  1641	VL   32(P1ptr), Y1L
  1642	VPDI $0x4, Y1L, Y1L, Y1L
  1643	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1644
  1645	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1646	VL   16(P1ptr), X1H
  1647	VPDI $0x4, X1H, X1H, X1H
  1648	VL   0(P1ptr), X1L
  1649	VPDI $0x4, X1L, X1L, X1L
  1650	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1651
  1652	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1653	VL   80(P1ptr), X1       // Z1H
  1654	VPDI $0x4, X1, X1, X1
  1655	VL   64(P1ptr), X0       // Z1L
  1656	VPDI $0x4, X0, X0, X0
  1657	CALL p256MulInternal<>(SB)
  1658
  1659	// VST T1, 64(P3ptr)
  1660	// VST T0, 80(P3ptr)
  1661	VLR T0, Z3L
  1662	VLR T1, Z3H
  1663
  1664	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1665	VLR  Y0, X0
  1666	VLR  Y1, X1
  1667	CALL p256SqrInternal<>(SB)
  1668	VLR  T0, X0
  1669	VLR  T1, X1
  1670
  1671	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1672	CALL p256MulInternal<>(SB)
  1673	VLR  T0, T4L
  1674	VLR  T1, T4H
  1675
  1676	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1677	VL   16(P1ptr), Y1       // X1H
  1678	VPDI $0x4, Y1, Y1, Y1
  1679	VL   0(P1ptr), Y0        // X1L
  1680	VPDI $0x4, Y0, Y0, Y0
  1681	CALL p256MulInternal<>(SB)
  1682	VLR  T0, T3L
  1683	VLR  T1, T3H
  1684
  1685	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1686	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1687
  1688	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1689	VLR  T2L, X0
  1690	VLR  T2H, X1
  1691	VLR  T2L, Y0
  1692	VLR  T2H, Y1
  1693	CALL p256SqrInternal<>(SB)
  1694
  1695	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1696	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1697
  1698	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1699	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1700	VLR T0, X3L
  1701	VLR T1, X3H
  1702
  1703	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1704	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1705
  1706	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1707	CALL p256MulInternal<>(SB)
  1708	VLR  T0, T3L
  1709	VLR  T1, T3H
  1710
  1711	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1712	VLR  T4L, X0
  1713	VLR  T4H, X1
  1714	VL   48(P1ptr), Y1       // Y1H
  1715	VPDI $0x4, Y1, Y1, Y1
  1716	VL   32(P1ptr), Y0       // Y1L
  1717	VPDI $0x4, Y0, Y0, Y0
  1718	CALL p256MulInternal<>(SB)
  1719
  1720	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1721	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1722
  1723	//	if (sel == 0) {
  1724	//		copy(P3.x[:], X1)
  1725	//		copy(P3.y[:], Y1)
  1726	//		copy(P3.z[:], Z1)
  1727	//	}
  1728
  1729	VL   16(P1ptr), X1H
  1730	VPDI $0x4, X1H, X1H, X1H
  1731	VL   0(P1ptr), X1L
  1732	VPDI $0x4, X1L, X1L, X1L
  1733
  1734	// Y1 already loaded, left over from addition
  1735	VL   80(P1ptr), Z1H
  1736	VPDI $0x4, Z1H, Z1H, Z1H
  1737	VL   64(P1ptr), Z1L
  1738	VPDI $0x4, Z1L, Z1L, Z1L
  1739
  1740	VLREPG sel+32(FP), SEL1
  1741	VZERO  ZER
  1742	VCEQG  SEL1, ZER, SEL1
  1743
  1744	VSEL X1L, X3L, SEL1, X3L
  1745	VSEL X1H, X3H, SEL1, X3H
  1746	VSEL Y1L, Y3L, SEL1, Y3L
  1747	VSEL Y1H, Y3H, SEL1, Y3H
  1748	VSEL Z1L, Z3L, SEL1, Z3L
  1749	VSEL Z1H, Z3H, SEL1, Z3H
  1750
  1751	//	if (zero == 0) {
  1752	//		copy(P3.x[:], X2)
  1753	//		copy(P3.y[:], Y2)
  1754	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  1755	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  1756	//	}
  1757	VL   16(P2ptr), X2H
  1758	VPDI $0x4, X2H, X2H, X2H
  1759	VL   0(P2ptr), X2L
  1760	VPDI $0x4, X2L, X2L, X2L
  1761
  1762	// Y2 already loaded
  1763	VL 128(CPOOL), Z2H
  1764	VL 144(CPOOL), Z2L
  1765
  1766	VLREPG zero+40(FP), SEL1
  1767	VZERO  ZER
  1768	VCEQG  SEL1, ZER, SEL1
  1769
  1770	VSEL X2L, X3L, SEL1, X3L
  1771	VSEL X2H, X3H, SEL1, X3H
  1772	VSEL Y2L, Y3L, SEL1, Y3L
  1773	VSEL Y2H, Y3H, SEL1, Y3H
  1774	VSEL Z2L, Z3L, SEL1, Z3L
  1775	VSEL Z2H, Z3H, SEL1, Z3H
  1776
  1777	// All done, store out the result!!!
  1778	VPDI $0x4, X3H, X3H, X3H
  1779	VST  X3H, 16(P3ptr)
  1780	VPDI $0x4, X3L, X3L, X3L
  1781	VST  X3L, 0(P3ptr)
  1782	VPDI $0x4, Y3H, Y3H, Y3H
  1783	VST  Y3H, 48(P3ptr)
  1784	VPDI $0x4, Y3L, Y3L, Y3L
  1785	VST  Y3L, 32(P3ptr)
  1786	VPDI $0x4, Z3H, Z3H, Z3H
  1787	VST  Z3H, 80(P3ptr)
  1788	VPDI $0x4, Z3L, Z3L, Z3L
  1789	VST  Z3L, 64(P3ptr)
  1790
  1791	RET
  1792
  1793#undef P3ptr
  1794#undef P1ptr
  1795#undef P2ptr
  1796#undef CPOOL
  1797
  1798#undef Y2L
  1799#undef Y2H
  1800#undef T1L
  1801#undef T1H
  1802#undef T2L
  1803#undef T2H
  1804#undef T3L
  1805#undef T3H
  1806#undef T4L
  1807#undef T4H
  1808
  1809#undef TT0
  1810#undef TT1
  1811#undef T2
  1812
  1813#undef X0
  1814#undef X1
  1815#undef Y0
  1816#undef Y1
  1817#undef T0
  1818#undef T1
  1819
  1820#undef PL
  1821#undef PH
  1822
  1823#undef X1L
  1824#undef X1H
  1825#undef Y1L
  1826#undef Y1H
  1827#undef Z1L
  1828#undef Z1H
  1829#undef X2L
  1830#undef X2H
  1831#undef Z2L
  1832#undef Z2H
  1833#undef X3L
  1834#undef X3H
  1835#undef Y3L
  1836#undef Y3H
  1837#undef Z3L
  1838#undef Z3H
  1839
  1840#undef ZER
  1841#undef SEL1
  1842#undef CAR1
  1843#undef CAR2
  1844
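// For reference, the three-operand schedule listed at the top of
// p256PointAddAffineAsm, replayed with math/big (variable-time, Jacobian
// (X1,Y1,Z1) plus affine (X2,Y2); it ignores the sign/sel/zero selection at
// the end, and the helper name is ours, not part of the package):
//
//	func addAffineModel(x1, y1, z1, x2, y2, p *big.Int) (x3, y3, z3 *big.Int) {
//		mod := func(v *big.Int) *big.Int { return v.Mod(v, p) }
//		t1 := mod(new(big.Int).Mul(z1, z1)) // T1 = Z1²
//		t2 := mod(new(big.Int).Mul(t1, z1)) // T2 = T1*Z1
//		t1 = mod(t1.Mul(t1, x2))            // T1 = T1*X2
//		t2 = mod(t2.Mul(t2, y2))            // T2 = T2*Y2
//		t1 = mod(t1.Sub(t1, x1))            // T1 = T1-X1
//		t2 = mod(t2.Sub(t2, y1))            // T2 = T2-Y1
//		z3 = mod(new(big.Int).Mul(z1, t1))  // Z3 = Z1*T1
//		t3 := mod(new(big.Int).Mul(t1, t1)) // T3 = T1²
//		t4 := mod(new(big.Int).Mul(t3, t1)) // T4 = T3*T1
//		t3 = mod(t3.Mul(t3, x1))            // T3 = T3*X1
//		t1 = mod(new(big.Int).Lsh(t3, 1))   // T1 = 2*T3
//		x3 = mod(new(big.Int).Mul(t2, t2))  // X3 = T2²
//		x3 = mod(x3.Sub(x3, t1))            // X3 = X3-T1
//		x3 = mod(x3.Sub(x3, t4))            // X3 = X3-T4
//		t3 = mod(t3.Sub(t3, x3))            // T3 = T3-X3
//		t3 = mod(t3.Mul(t3, t2))            // T3 = T3*T2
//		t4 = mod(t4.Mul(t4, y1))            // T4 = T4*Y1
//		y3 = mod(new(big.Int).Sub(t3, t4))  // Y3 = T3-T4
//		return x3, y3, z3
//	}
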
  1845// func p256PointDoubleAsm(res, in *P256Point)
  1846// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1847// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1848// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1849#define P3ptr   R1
  1850#define P1ptr   R2
  1851#define CPOOL   R4
  1852
  1853// Temporaries in REGs
  1854#define X3L    V15
  1855#define X3H    V16
  1856#define Y3L    V17
  1857#define Y3H    V18
  1858#define T1L    V19
  1859#define T1H    V20
  1860#define T2L    V21
  1861#define T2H    V22
  1862#define T3L    V23
  1863#define T3H    V24
  1864
  1865#define X1L    V6
  1866#define X1H    V7
  1867#define Y1L    V8
  1868#define Y1H    V9
  1869#define Z1L    V10
  1870#define Z1H    V11
  1871
  1872// Temps for Sub and Add
  1873#define TT0  V11
  1874#define TT1  V12
  1875#define T2   V13
  1876
  1877// p256MulAsm Parameters
  1878#define X0    V0
  1879#define X1    V1
  1880#define Y0    V2
  1881#define Y1    V3
  1882#define T0    V4
  1883#define T1    V5
  1884
  1885#define PL    V30
  1886#define PH    V31
  1887
  1888#define Z3L    V23
  1889#define Z3H    V24
  1890
  1891#define ZER   V26
  1892#define SEL1  V27
  1893#define CAR1  V28
  1894#define CAR2  V29
  1895/*
  1896 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1897 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1898 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1899 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1900 * 	B  = 2Y₁
  1901 * 	Z₃ = B×Z₁
  1902 * 	C  = B²
  1903 * 	D  = C×X₁
  1904 * 	X₃ = A²-2D
  1905 * 	Y₃ = (D-X₃)×A-C²/2
  1906 *
  1907 * Three-operand formula:
  1908 *       T1 = Z1²
  1909 *       T2 = X1-T1
  1910 *       T1 = X1+T1
  1911 *       T2 = T2*T1
  1912 *       T2 = 3*T2
  1913 *       Y3 = 2*Y1
  1914 *       Z3 = Y3*Z1
  1915 *       Y3 = Y3²
  1916 *       T3 = Y3*X1
  1917 *       Y3 = Y3²
  1918 *       Y3 = half*Y3
  1919 *       X3 = T2²
  1920 *       T1 = 2*T3
  1921 *       X3 = X3-T1
  1922 *       T1 = T3-X3
  1923 *       T1 = T1*T2
  1924 *       Y3 = T1-Y3
  1925 */
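//
// A minimal math/big reference for the three-operand sequence above, written
// in plain (non-Montgomery) modular arithmetic; the assembly below performs
// the same steps on Montgomery-domain limbs via p256SqrInternal and
// p256MulInternal. The function name is illustrative, not part of this file:
//
//	func p256DoubleRef(x1, y1, z1, p *big.Int) (x3, y3, z3 *big.Int) {
//		mod := func(a *big.Int) *big.Int { return a.Mod(a, p) }
//		t1 := mod(new(big.Int).Mul(z1, z1))          // T1 = Z1²
//		t2 := mod(new(big.Int).Sub(x1, t1))          // T2 = X1-T1
//		t1 = mod(new(big.Int).Add(x1, t1))           // T1 = X1+T1
//		t2 = mod(t2.Mul(t2, t1))                     // T2 = T2*T1
//		t2 = mod(t2.Mul(t2, big.NewInt(3)))          // T2 = 3*T2
//		y3 = mod(new(big.Int).Lsh(y1, 1))            // Y3 = 2*Y1
//		z3 = mod(new(big.Int).Mul(y3, z1))           // Z3 = Y3*Z1
//		y3 = mod(y3.Mul(y3, y3))                     // Y3 = Y3²
//		t3 := mod(new(big.Int).Mul(y3, x1))          // T3 = Y3*X1
//		y3 = mod(y3.Mul(y3, y3))                     // Y3 = Y3²
//		half := new(big.Int).ModInverse(big.NewInt(2), p)
//		y3 = mod(y3.Mul(y3, half))                   // Y3 = half*Y3
//		x3 = mod(new(big.Int).Mul(t2, t2))           // X3 = T2²
//		t1 = mod(new(big.Int).Lsh(t3, 1))            // T1 = 2*T3
//		x3 = mod(x3.Sub(x3, t1))                     // X3 = X3-T1
//		t1 = mod(new(big.Int).Sub(t3, x3))           // T1 = T3-X3
//		t1 = mod(t1.Mul(t1, t2))                     // T1 = T1*T2
//		y3 = mod(new(big.Int).Sub(t1, y3))           // Y3 = T1-Y3
//		return x3, y3, z3
//	}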
  1926
  1927TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
  1928	MOVD res+0(FP), P3ptr
  1929	MOVD in+8(FP), P1ptr
  1930
  1931	MOVD $p256mul<>+0x00(SB), CPOOL
  1932	VL   16(CPOOL), PL
  1933	VL   0(CPOOL), PH
  1934
  1935	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1936	VL   80(P1ptr), X1        // Z1H
  1937	VPDI $0x4, X1, X1, X1
  1938	VL   64(P1ptr), X0        // Z1L
  1939	VPDI $0x4, X0, X0, X0
  1940	VLR  X0, Y0
  1941	VLR  X1, Y1
  1942	CALL p256SqrInternal<>(SB)
  1943
  1944	// SUB(X<X1-T)            // T2 = X1-T1
  1945	VL   16(P1ptr), X1H
  1946	VPDI $0x4, X1H, X1H, X1H
  1947	VL   0(P1ptr), X1L
  1948	VPDI $0x4, X1L, X1L, X1L
  1949	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  1950
  1951	// ADD(Y<X1+T)            // T1 = X1+T1
  1952	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  1953
  1954	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1955	CALL p256MulInternal<>(SB)
  1956
  1957	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1958	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  1959	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  1960
  1961	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1962	VL   48(P1ptr), Y1H
  1963	VPDI $0x4, Y1H, Y1H, Y1H
  1964	VL   32(P1ptr), Y1L
  1965	VPDI $0x4, Y1L, Y1L, Y1L
  1966	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  1967
  1968	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  1969	VL   80(P1ptr), Y1        // Z1H
  1970	VPDI $0x4, Y1, Y1, Y1
  1971	VL   64(P1ptr), Y0        // Z1L
  1972	VPDI $0x4, Y0, Y0, Y0
  1973	CALL p256MulInternal<>(SB)
  1974	VPDI $0x4, T1, T1, TT1
  1975	VST  TT1, 80(P3ptr)
  1976	VPDI $0x4, T0, T0, TT0
  1977	VST  TT0, 64(P3ptr)
  1978
  1979	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1980	VLR  X0, Y0
  1981	VLR  X1, Y1
  1982	CALL p256SqrInternal<>(SB)
  1983
  1984	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  1985	VLR  T0, X0
  1986	VLR  T1, X1
  1987	VL   16(P1ptr), Y1
  1988	VPDI $0x4, Y1, Y1, Y1
  1989	VL   0(P1ptr), Y0
  1990	VPDI $0x4, Y0, Y0, Y0
  1991	CALL p256MulInternal<>(SB)
  1992	VLR  T0, T3L
  1993	VLR  T1, T3H
  1994
  1995	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1996	VLR  X0, Y0
  1997	VLR  X1, Y1
  1998	CALL p256SqrInternal<>(SB)
  1999
  2000	// HAL(Y3<T)              // Y3 = half*Y3
  2001	p256HalfInternal(Y3H,Y3L, T1,T0)
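	// p256HalfInternal computes the modular halving used for C²/2: since P is
	// odd, Y3/2 mod P is Y3>>1 when Y3 is even and (Y3+P)>>1 when it is odd.
	// A math/big sketch of that rule (illustrative, not part of this file):
	//
	//	func halfModP(y, p *big.Int) *big.Int {
	//		h := new(big.Int).Set(y)
	//		if h.Bit(0) == 1 {
	//			h.Add(h, p) // odd: add P, the sum is even
	//		}
	//		return h.Rsh(h, 1) // divide the even value by two
	//	}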
  2002
  2003	// X=T2; Y=T2; MUL; T-    // X3 = T2²
  2004	VLR  T2L, X0
  2005	VLR  T2H, X1
  2006	VLR  T2L, Y0
  2007	VLR  T2H, Y1
  2008	CALL p256SqrInternal<>(SB)
  2009
  2010	// ADD(T1<T3+T3)          // T1 = 2*T3
  2011	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  2012
  2013	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  2014	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  2015	VPDI $0x4, X3H, X3H, TT1
  2016	VST  TT1, 16(P3ptr)
  2017	VPDI $0x4, X3L, X3L, TT0
  2018	VST  TT0, 0(P3ptr)
  2019
  2020	// SUB(X<T3-X3)           // T1 = T3-X3
  2021	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  2022
  2023	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  2024	CALL p256MulInternal<>(SB)
  2025
  2026	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  2027	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  2028
  2029	VPDI $0x4, Y3H, Y3H, Y3H
  2030	VST  Y3H, 48(P3ptr)
  2031	VPDI $0x4, Y3L, Y3L, Y3L
  2032	VST  Y3L, 32(P3ptr)
  2033	RET
  2034
  2035#undef P3ptr
  2036#undef P1ptr
  2037#undef CPOOL
  2038#undef X3L
  2039#undef X3H
  2040#undef Y3L
  2041#undef Y3H
  2042#undef T1L
  2043#undef T1H
  2044#undef T2L
  2045#undef T2H
  2046#undef T3L
  2047#undef T3H
  2048#undef X1L
  2049#undef X1H
  2050#undef Y1L
  2051#undef Y1H
  2052#undef Z1L
  2053#undef Z1H
  2054#undef TT0
  2055#undef TT1
  2056#undef T2
  2057#undef X0
  2058#undef X1
  2059#undef Y0
  2060#undef Y1
  2061#undef T0
  2062#undef T1
  2063#undef PL
  2064#undef PH
  2065#undef Z3L
  2066#undef Z3H
  2067#undef ZER
  2068#undef SEL1
  2069#undef CAR1
  2070#undef CAR2
  2071
  2072// func p256PointAddAsm(res, in1, in2 *P256Point) int
  2073#define P3ptr  R1
  2074#define P1ptr  R2
  2075#define P2ptr  R3
  2076#define CPOOL  R4
  2077#define ISZERO R5
  2078#define TRUE   R6
  2079
  2080// Temporaries in REGs
  2081#define T1L   V16
  2082#define T1H   V17
  2083#define T2L   V18
  2084#define T2H   V19
  2085#define U1L   V20
  2086#define U1H   V21
  2087#define S1L   V22
  2088#define S1H   V23
  2089#define HL    V24
  2090#define HH    V25
  2091#define RL    V26
  2092#define RH    V27
  2093
  2094// Temps for Sub and Add
  2095#define ZER   V6
  2096#define SEL1  V7
  2097#define CAR1  V8
  2098#define CAR2  V9
  2099#define TT0  V11
  2100#define TT1  V12
  2101#define T2   V13
  2102
  2103// p256MulAsm Parameters
  2104#define X0    V0
  2105#define X1    V1
  2106#define Y0    V2
  2107#define Y1    V3
  2108#define T0    V4
  2109#define T1    V5
  2110
  2111#define PL    V30
  2112#define PH    V31
  2113/*
  2114 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2115 *
  2116 * A = X₁×Z₂²
  2117 * B = Y₁×Z₂³
  2118 * C = X₂×Z₁²-A
  2119 * D = Y₂×Z₁³-B
  2120 * X₃ = D² - 2A×C² - C³
  2121 * Y₃ = D×(A×C² - X₃) - B×C³
  2122 * Z₃ = Z₁×Z₂×C
  2123 *
  2124 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2125 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2126 *
  2127 * T1 = Z1*Z1
  2128 * T2 = Z2*Z2
  2129 * U1 = X1*T2
  2130 * H  = X2*T1
  2131 * H  = H-U1
  2132 * Z3 = Z1*Z2
  2133	 * Z3 = Z3*H << store-out Z3 result reg; could overwrite Z1 if the slices share the same backing array
  2134 *
  2135 * S1 = Z2*T2
  2136 * S1 = Y1*S1
  2137 * R  = Z1*T1
  2138 * R  = Y2*R
  2139 * R  = R-S1
  2140 *
  2141 * T1 = H*H
  2142 * T2 = H*T1
  2143 * U1 = U1*T1
  2144 *
  2145 * X3 = R*R
  2146 * X3 = X3-T2
  2147 * T1 = 2*U1
  2148 * X3 = X3-T1 << store-out X3 result reg
  2149 *
  2150 * T2 = S1*T2
  2151 * Y3 = U1-X3
  2152 * Y3 = R*Y3
  2153 * Y3 = Y3-T2 << store-out Y3 result reg
  2154
  2155 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2156	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2157	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2158	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2159	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2160	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2161	// SUB(H<H-T)            // H  = H-U1
  2162	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2163	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg; could overwrite Z1 if the slices share the same backing array
  2164	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2165	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2166	// SUB(R<T-S1)           // R  = R-S1
  2167	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2168	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2169	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2170	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2171	// SUB(T<T-T2)           // X3 = X3-T2
  2172	// ADD(X<U1+U1)          // T1 = 2*U1
  2173	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2174	// SUB(Y<U1-T)           // Y3 = U1-X3
  2175	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2176	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2177	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2178	*/
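//
// A minimal math/big reference for the add-1998-cmo-2 sequence above, written
// in plain (non-Montgomery) modular arithmetic; the assembly below performs
// the same steps on Montgomery-domain limbs. The function name is
// illustrative, not part of this file:
//
//	func p256AddRef(x1, y1, z1, x2, y2, z2, p *big.Int) (x3, y3, z3 *big.Int) {
//		mod := func(a *big.Int) *big.Int { return a.Mod(a, p) }
//		t1 := mod(new(big.Int).Mul(z1, z1)) // T1 = Z1*Z1
//		t2 := mod(new(big.Int).Mul(z2, z2)) // T2 = Z2*Z2
//		u1 := mod(new(big.Int).Mul(x1, t2)) // U1 = X1*T2
//		h := mod(new(big.Int).Mul(x2, t1))  // H  = X2*T1
//		h = mod(h.Sub(h, u1))               // H  = H-U1
//		z3 = mod(new(big.Int).Mul(z1, z2))  // Z3 = Z1*Z2
//		z3 = mod(z3.Mul(z3, h))             // Z3 = Z3*H
//		s1 := mod(new(big.Int).Mul(z2, t2)) // S1 = Z2*T2
//		s1 = mod(s1.Mul(y1, s1))            // S1 = Y1*S1
//		r := mod(new(big.Int).Mul(z1, t1))  // R  = Z1*T1
//		r = mod(r.Mul(y2, r))               // R  = Y2*R
//		r = mod(r.Sub(r, s1))               // R  = R-S1
//		t1 = mod(t1.Mul(h, h))              // T1 = H*H
//		t2 = mod(t2.Mul(h, t1))             // T2 = H*T1
//		u1 = mod(u1.Mul(u1, t1))            // U1 = U1*T1
//		x3 = mod(new(big.Int).Mul(r, r))    // X3 = R*R
//		x3 = mod(x3.Sub(x3, t2))            // X3 = X3-T2
//		t1 = mod(new(big.Int).Lsh(u1, 1))   // T1 = 2*U1
//		x3 = mod(x3.Sub(x3, t1))            // X3 = X3-T1
//		t2 = mod(t2.Mul(s1, t2))            // T2 = S1*T2
//		y3 = mod(new(big.Int).Sub(u1, x3))  // Y3 = U1-X3
//		y3 = mod(y3.Mul(r, y3))             // Y3 = R*Y3
//		y3 = mod(y3.Sub(y3, t2))            // Y3 = Y3-T2
//		return x3, y3, z3
//	}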
  2179TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
  2180	MOVD res+0(FP), P3ptr
  2181	MOVD in1+8(FP), P1ptr
  2182	MOVD in2+16(FP), P2ptr
  2183
  2184	MOVD $p256mul<>+0x00(SB), CPOOL
  2185	VL   16(CPOOL), PL
  2186	VL   0(CPOOL), PH
  2187
  2188	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2189	VL   80(P1ptr), X1       // Z1H
  2190	VPDI $0x4, X1, X1, X1
  2191	VL   64(P1ptr), X0       // Z1L
  2192	VPDI $0x4, X0, X0, X0
  2193	VLR  X0, Y0
  2194	VLR  X1, Y1
  2195	CALL p256SqrInternal<>(SB)
  2196
  2197	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2198	VLR  T0, Y0
  2199	VLR  T1, Y1
  2200	CALL p256MulInternal<>(SB)
  2201	VLR  T0, RL
  2202	VLR  T1, RH
  2203
  2204	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2205	VL   16(P2ptr), X1       // X2H
  2206	VPDI $0x4, X1, X1, X1
  2207	VL   0(P2ptr), X0        // X2L
  2208	VPDI $0x4, X0, X0, X0
  2209	CALL p256MulInternal<>(SB)
  2210	VLR  T0, HL
  2211	VLR  T1, HH
  2212
  2213	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2214	VL   80(P2ptr), X1       // Z2H
  2215	VPDI $0x4, X1, X1, X1
  2216	VL   64(P2ptr), X0       // Z2L
  2217	VPDI $0x4, X0, X0, X0
  2218	VLR  X0, Y0
  2219	VLR  X1, Y1
  2220	CALL p256SqrInternal<>(SB)
  2221
  2222	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2223	VLR  T0, Y0
  2224	VLR  T1, Y1
  2225	CALL p256MulInternal<>(SB)
  2226	VLR  T0, S1L
  2227	VLR  T1, S1H
  2228
  2229	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2230	VL   16(P1ptr), X1       // X1H
  2231	VPDI $0x4, X1, X1, X1
  2232	VL   0(P1ptr), X0        // X1L
  2233	VPDI $0x4, X0, X0, X0
  2234	CALL p256MulInternal<>(SB)
  2235	VLR  T0, U1L
  2236	VLR  T1, U1H
  2237
  2238	// SUB(H<H-T)            // H  = H-U1
  2239	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2240
  2241	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2242	// clobbers T1H and T1L
  2243	MOVD   $0, ISZERO
  2244	MOVD   $1, TRUE
  2245	VZERO  ZER
  2246	VO     HL, HH, T1H
  2247	VCEQGS ZER, T1H, T1H
  2248	MOVDEQ TRUE, ISZERO
  2249	VX     HL, PL, T1L
  2250	VX     HH, PH, T1H
  2251	VO     T1L, T1H, T1H
  2252	VCEQGS ZER, T1H, T1H
  2253	MOVDEQ TRUE, ISZERO
  2254	MOVD   ISZERO, ret+24(FP)
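	// That is, ret becomes 1 iff H ≡ 0 (mod P): the reduced value is either 0
	// or exactly P, hence the two tests. A plain Go sketch of the predicate on
	// four little-endian limbs (illustrative; unlike the branch-free
	// VCEQGS/MOVDEQ sequence above, it uses a branch):
	//
	//	func isZeroModP(h, p [4]uint64) int {
	//		var orH, orHxorP uint64
	//		for i := range h {
	//			orH |= h[i]            // h == 0 iff all limbs are zero
	//			orHxorP |= h[i] ^ p[i] // h == p iff all limbs match P
	//		}
	//		if orH == 0 || orHxorP == 0 {
	//			return 1
	//		}
	//		return 0
	//	}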
  2255
  2256	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2257	VL   80(P1ptr), X1       // Z1H
  2258	VPDI $0x4, X1, X1, X1
  2259	VL   64(P1ptr), X0       // Z1L
  2260	VPDI $0x4, X0, X0, X0
  2261	VL   80(P2ptr), Y1       // Z2H
  2262	VPDI $0x4, Y1, Y1, Y1
  2263	VL   64(P2ptr), Y0       // Z2L
  2264	VPDI $0x4, Y0, Y0, Y0
  2265	CALL p256MulInternal<>(SB)
  2266
  2267	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2268	VLR  T0, X0
  2269	VLR  T1, X1
  2270	VLR  HL, Y0
  2271	VLR  HH, Y1
  2272	CALL p256MulInternal<>(SB)
  2273	VPDI $0x4, T1, T1, TT1
  2274	VST  TT1, 80(P3ptr)
  2275	VPDI $0x4, T0, T0, TT0
  2276	VST  TT0, 64(P3ptr)
  2277
  2278	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2279	VL   48(P1ptr), X1
  2280	VPDI $0x4, X1, X1, X1
  2281	VL   32(P1ptr), X0
  2282	VPDI $0x4, X0, X0, X0
  2283	VLR  S1L, Y0
  2284	VLR  S1H, Y1
  2285	CALL p256MulInternal<>(SB)
  2286	VLR  T0, S1L
  2287	VLR  T1, S1H
  2288
  2289	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2290	VL   48(P2ptr), X1
  2291	VPDI $0x4, X1, X1, X1
  2292	VL   32(P2ptr), X0
  2293	VPDI $0x4, X0, X0, X0
  2294	VLR  RL, Y0
  2295	VLR  RH, Y1
  2296	CALL p256MulInternal<>(SB)
  2297
  2298	// SUB(R<T-S1)           // R  = T-S1
  2299	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2300
  2301	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2302	// clobbers T1H and T1L
  2303	MOVD   $0, ISZERO
  2304	MOVD   $1, TRUE
  2305	VZERO  ZER
  2306	VO     RL, RH, T1H
  2307	VCEQGS ZER, T1H, T1H
  2308	MOVDEQ TRUE, ISZERO
  2309	VX     RL, PL, T1L
  2310	VX     RH, PH, T1H
  2311	VO     T1L, T1H, T1H
  2312	VCEQGS ZER, T1H, T1H
  2313	MOVDEQ TRUE, ISZERO
  2314	AND    ret+24(FP), ISZERO
  2315	MOVD   ISZERO, ret+24(FP)
  2316
  2317	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2318	VLR  HL, X0
  2319	VLR  HH, X1
  2320	VLR  HL, Y0
  2321	VLR  HH, Y1
  2322	CALL p256SqrInternal<>(SB)
  2323
  2324	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2325	VLR  T0, Y0
  2326	VLR  T1, Y1
  2327	CALL p256MulInternal<>(SB)
  2328	VLR  T0, T2L
  2329	VLR  T1, T2H
  2330
  2331	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2332	VLR  U1L, X0
  2333	VLR  U1H, X1
  2334	CALL p256MulInternal<>(SB)
  2335	VLR  T0, U1L
  2336	VLR  T1, U1H
  2337
  2338	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2339	VLR  RL, X0
  2340	VLR  RH, X1
  2341	VLR  RL, Y0
  2342	VLR  RH, Y1
  2343	CALL p256SqrInternal<>(SB)
  2344
  2345	// SUB(T<T-T2)           // X3 = X3-T2
  2346	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2347
  2348	// ADD(X<U1+U1)          // T1 = 2*U1
  2349	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2350
  2351	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2352	p256SubInternal(T1,T0,T1,T0,X1,X0)
  2353	VPDI $0x4, T1, T1, TT1
  2354	VST  TT1, 16(P3ptr)
  2355	VPDI $0x4, T0, T0, TT0
  2356	VST  TT0, 0(P3ptr)
  2357
  2358	// SUB(Y<U1-T)           // Y3 = U1-X3
  2359	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2360
  2361	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2362	VLR  RL, X0
  2363	VLR  RH, X1
  2364	CALL p256MulInternal<>(SB)
  2365	VLR  T0, U1L
  2366	VLR  T1, U1H
  2367
  2368	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2369	VLR  S1L, X0
  2370	VLR  S1H, X1
  2371	VLR  T2L, Y0
  2372	VLR  T2H, Y1
  2373	CALL p256MulInternal<>(SB)
  2374
  2375	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2376	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2377	VPDI $0x4, T1, T1, T1
  2378	VST  T1, 48(P3ptr)
  2379	VPDI $0x4, T0, T0, T0
  2380	VST  T0, 32(P3ptr)
  2381
  2382	RET
