...

Text file src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s

Documentation: vendor/golang.org/x/crypto/chacha20

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The Perl script that generates the ppc64 assembly can be found in
// the CRYPTOGAMS repository at the link below. It is based on the
// original from OpenSSL.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are
// due to Go's calling conventions and the initialization of constants.

//go:build gc && !purego

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

// for VPERMXOR
#define MASK  R18

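// Layout of the consts<> table below (an interpretation from how the
// values are used in this file, not documented in the original):
//   0x00: the ChaCha sigma constant "expand 32-byte k"
//   0x10-0x48: counter increments and byte-rotate masks, apparently
//              unused in this VPERMXOR variant
//   0x50-0x88: the four sigma words, each splatted across a vector
//   0x90: block counter lane offsets {0, 1, 2, 3}
//   0xa0-0xb8: VPERMXOR masks fusing XOR with the 8- and 16-bit rotates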
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
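// The corresponding Go declaration (assumed to live in the package's Go
// source, following the usual x/crypto layout) would look like:
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)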
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
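	// Frame: $64-40 = 64 bytes of local scratch (used by tail_vsx to
	// stage a final partial block) plus 40 bytes of arguments.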
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// V16: the sigma constants
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28: counter lane offsets {0, 1, 2, 3}
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// Splat the counter word (slot 0) of V19 into V26
	VSPLTW $0, V19, V26

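	// Shift the counter word out of V19 so only the nonce words remain
	// (the per-lane counters are rebuilt in V26 below).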
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

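	// V26 = {ctr, ctr+1, ctr+2, ctr+3}: one block counter per lane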
	VADDUWM V26, V28, V26

	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// V0-V3: the splatted sigma words
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the counter lanes into V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat the rotate amounts. Only V28 (12) and V30 (7) are used by
	// VRLW below; the 16- and 8-bit rotates go through VPERMXOR instead.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
	PCALIGN $16
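	// Each iteration runs one column round and one diagonal round, so
	// CTR = 10 gives ChaCha20's 20 rounds. V0-V15 each hold one state
	// word with one block per lane, computing four 64-byte blocks in
	// parallel.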
loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

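	// VPERMXOR with the V21 mask computes d = (d ^ a) <<< 16 in a
	// single instruction.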
	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

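	// The V20 mask likewise fuses the XOR with the 8-bit rotate:
	// d = (d ^ a) <<< 8.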
	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

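	// Diagonal round: the same quarter-round applied to (V0,V5,V10,V15),
	// (V1,V6,V11,V12), (V2,V7,V8,V13) and (V3,V4,V9,V14).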
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ   loop_vsx

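	// Add the counter lanes into the block counter word, then transpose
	// the lane-sliced state with VMRGEW/VMRGOW/XXPERMDI so that block j
	// ends up in (Vj, Vj+4, Vj+8, Vj+12) for j = 0..3.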
	VADDUWM V12, V26, V12

	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

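	// Advance the per-lane block counters by 4 for the next outer loop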
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

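	// Feed-forward: add the original state rows (sigma, key, counter/nonce)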
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop: XOR each 64-byte keystream block with the input
	// and store it; the same pattern is unrolled for all four blocks.
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment the counter by the number of 64-byte blocks processed
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
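	// Pre-decrement the pointers: the MOVBZU/MOVBU below update each
	// address before the access, so the first iteration touches byte 0.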
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// XOR keystream bytes from the stack with the input and store the
	// result to OUT, one byte at a time.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BDNZ   looptail_vsx

	// Clear the stack values by overwriting the keystream scratch with
	// the non-secret sigma constants still held in VS48 (V16)
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx
