
Text file src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s

Documentation: vendor/golang.org/x/crypto/chacha20

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are
// due to the calling conventions and initialization of constants.
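
// For reference, the scalar ChaCha20 quarter round that the vector code
// below applies four ways at once (each vector register holds one state
// word from four consecutive 64-byte blocks):
//
//	a += b; d ^= a; d <<<= 16
//	c += d; b ^= c; b <<<= 12
//	a += b; d ^= a; d <<<= 8
//	c += d; b ^= c; b <<<= 7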

//go:build gc && !purego && (ppc64 || ppc64le)

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

// for VPERMXOR
#define MASK  R18

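// Layout of the consts table: the ChaCha "expand 32-byte k" sigma constants
// (at 0x00 in natural order, and at 0x50-0x8c splatted one word per vector
// for the 4-way state), counter increment and lane-offset vectors, rotate
// permutation masks, and at 0xa0/0xb0 the two masks that let VPERMXOR
// rotate every 32-bit word left by 8 and by 16, respectively.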
DATA consts<>+0x00(SB)/4, $0x61707865
DATA consts<>+0x04(SB)/4, $0x3320646e
DATA consts<>+0x08(SB)/4, $0x79622d32
DATA consts<>+0x0c(SB)/4, $0x6b206574
DATA consts<>+0x10(SB)/4, $0x00000001
DATA consts<>+0x14(SB)/4, $0x00000000
DATA consts<>+0x18(SB)/4, $0x00000000
DATA consts<>+0x1c(SB)/4, $0x00000000
DATA consts<>+0x20(SB)/4, $0x00000004
DATA consts<>+0x24(SB)/4, $0x00000000
DATA consts<>+0x28(SB)/4, $0x00000000
DATA consts<>+0x2c(SB)/4, $0x00000000
DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
DATA consts<>+0x34(SB)/4, $0x0a0b0809
DATA consts<>+0x38(SB)/4, $0x06070405
DATA consts<>+0x3c(SB)/4, $0x02030001
DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
DATA consts<>+0x44(SB)/4, $0x090a0b08
DATA consts<>+0x48(SB)/4, $0x05060704
DATA consts<>+0x4c(SB)/4, $0x01020300
DATA consts<>+0x50(SB)/4, $0x61707865
DATA consts<>+0x54(SB)/4, $0x61707865
DATA consts<>+0x58(SB)/4, $0x61707865
DATA consts<>+0x5c(SB)/4, $0x61707865
DATA consts<>+0x60(SB)/4, $0x3320646e
DATA consts<>+0x64(SB)/4, $0x3320646e
DATA consts<>+0x68(SB)/4, $0x3320646e
DATA consts<>+0x6c(SB)/4, $0x3320646e
DATA consts<>+0x70(SB)/4, $0x79622d32
DATA consts<>+0x74(SB)/4, $0x79622d32
DATA consts<>+0x78(SB)/4, $0x79622d32
DATA consts<>+0x7c(SB)/4, $0x79622d32
DATA consts<>+0x80(SB)/4, $0x6b206574
DATA consts<>+0x84(SB)/4, $0x6b206574
DATA consts<>+0x88(SB)/4, $0x6b206574
DATA consts<>+0x8c(SB)/4, $0x6b206574
DATA consts<>+0x90(SB)/4, $0x00000000
DATA consts<>+0x94(SB)/4, $0x00000001
DATA consts<>+0x98(SB)/4, $0x00000002
DATA consts<>+0x9c(SB)/4, $0x00000003
DATA consts<>+0xa0(SB)/4, $0x11223300
DATA consts<>+0xa4(SB)/4, $0x55667744
DATA consts<>+0xa8(SB)/4, $0x99aabb88
DATA consts<>+0xac(SB)/4, $0xddeeffcc
DATA consts<>+0xb0(SB)/4, $0x22330011
DATA consts<>+0xb4(SB)/4, $0x66774455
DATA consts<>+0xb8(SB)/4, $0xaabb8899
DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0

#ifdef GOARCH_ppc64
#define BE_XXBRW_INIT() \
		LVSL (R0)(R0), V24 \
		VSPLTISB $3, V25   \
		VXOR V24, V25, V24 \

#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
#else
#define BE_XXBRW_INIT()
#define BE_XXBRW(vr)
#endif

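// On big-endian ppc64, BE_XXBRW_INIT builds a byte-reverse-words
// permutation in V24: LVSL with a zero index yields the identity
// permutation 0..15, and XORing each byte index with 3 swaps the four
// bytes within every 32-bit word. BE_XXBRW applies it with VPERM. On
// little-endian targets both macros expand to nothing.

// For reference, a sketch of the corresponding Go declaration (the
// signature is taken from the comment below; the //go:noescape directive
// is the usual pairing for a function with an assembly-only body):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
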
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// Load sigma constants into V16 (VS48)
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	BE_XXBRW_INIT()

	// Load the lane offsets {0,1,2,3} into V28 (VS60)
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// Splat the 32-bit counter (word 0 of V19) across V26, then shift the
	// nonce words down so V19 holds {0, nonce0, nonce1, nonce2}, and add
	// the lane offsets to give each lane its own block counter.
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

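	// 10 iterations of the doubled round below = 20 ChaCha rounds per block.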
	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// Load the splatted sigma constants into V0-V3 (VS32-VS35)
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-block counters V26 into V12 (VOR with itself is a move)
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat rotation constants
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
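	// V28 = 12 and V30 = 7 feed the VRLW rotates below; the 16-bit and
	// 8-bit rotates are done instead with VPERMXOR byte permutations
	// (masks V21 and V20), so V27 and V29 appear to be unused leftovers
	// from a VRLW-only variant.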
	PCALIGN $16
loop_vsx:
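	// Column round: quarter rounds on columns (0,4,8,12), (1,5,9,13),
	// (2,6,10,14), (3,7,11,15), four blocks at a time.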
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

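	// Diagonal round: quarter rounds on diagonals (0,5,10,15), (1,6,11,12),
	// (2,7,8,13), (3,4,9,14).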
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ   loop_vsx

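	// Add the per-block counters into word 12 now, while each lane still
	// belongs to one block; the rest of the input state (V16-V19) is added
	// after the transpose. The VMRGEW/VMRGOW and XXPERMDI shuffles below
	// transpose the 16 state vectors so that V0,V4,V8,V12 (then V1,V5,V9,V13,
	// and so on) each hold 16 contiguous bytes of a single 64-byte block.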
	VADDUWM V12, V26, V12

	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

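	// Advance the per-block counters by 4 for the next outer iteration.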
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

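	// First block: add the input state (sigma, key, counter/nonce), then
	// byte-swap the result on big-endian targets.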
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

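	// CR0 still holds the result of CMPU LEN, $64 above: if LEN was
	// exactly 64, everything has been written and we fall through to
	// done_vsx; otherwise more input remains, so reset the round counter
	// and run the outer loop again.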
	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	MOVWZ (CNT), R14
	ADD  BLOCKS, R14
	MOVWZ R14, (CNT)
	RET

tail_vsx:
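	// Fewer than 64 bytes remain: spill the current keystream block
	// (V0,V4,V8,V12 = VS32,VS36,VS40,VS44) to the local stack frame and
	// XOR it with the input one byte at a time.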
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
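	// Step back one byte so the pre-incrementing MOVBZU/MOVBU below start
	// at the first byte.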
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// XOR the keystream with the input and copy the result to OUT,
	// byte by byte.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BDNZ   looptail_vsx

	// Clear the keystream copy from the stack by overwriting it with
	// V16 (VS48), which holds only the public ChaCha constants.
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx
