Text file src/crypto/internal/fips140/aes/aes_ppc64x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl

// Some function names were changed to be consistent with Go function
// names. For instance, the functions aes_p8_set_{en,de}crypt_key became
// set{En,De}cryptKeyAsm. setEncryptKeyAsm was also split in two, and a
// new function (doEncryptKeyAsm) was created, to avoid overwriting
// arguments when setDecryptKeyAsm calls setEncryptKeyAsm. There were
// other modifications as well, but the functionality is unchanged.

#include "textflag.h"

// For expandKeyAsm
#define INP     R3
#define BITS    R4
#define OUTENC  R5 // Pointer to next expanded encrypt key
#define PTR     R6
#define CNT     R7
#define ROUNDS  R8
#define OUTDEC  R9  // Pointer to next expanded decrypt key
#define TEMP    R19
#define ZERO    V0
#define IN0     V1
#define IN1     V2
#define KEY     V3
#define RCON    V4
#define MASK    V5
#define TMP     V6
#define STAGE   V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11

// For P9 instruction emulation
#define ESPERM  V21  // Endian swapping permute into BE
#define TMP2    V22  // Temporary for P8_STXVB16X

// For {en,de}cryptBlockAsm
#define BLK_INP    R3
#define BLK_OUT    R4
#define BLK_KEY    R5
#define BLK_ROUNDS R6
#define BLK_IDX    R7

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
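
// The first 16 bytes of ·rcon hold the byte-swapping permute loaded
// into ESPERM; code that only needs the round constants skips past
// them (·rcon+0x10), as SETUP_ESPERM and expandKeyAsm below show.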

#ifdef GOARCH_ppc64le
#  ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X	VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT)    XXBRD	VA, VT
#define SETUP_ESPERM(rtmp)
#  else
// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned
// doublewords and byte-swapping each doubleword to emulate BE load/stores.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X	(RA+RB), VT \
	VPERM	VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM	VS, VS, ESPERM, TMP2 \
	STXVD2X	TMP2, (RA+RB)

#define XXBRD_ON_LE(VA,VT) \
	VPERM	VA, VA, ESPERM, VT

// Set up the byte-swapping permute value in ESPERM for the POWER9
// instruction emulation macros.
#define SETUP_ESPERM(rtmp) \
	MOVD	$·rcon(SB), rtmp \
	LVX	(rtmp), ESPERM
#  endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT)  LXVD2X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X	VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#define SETUP_ESPERM(rtmp)
#endif // defined(GOARCH_ppc64le)
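
// For illustration: with memory bytes b0..b15, LXVB16X on POWER9 (or
// LXVD2X on a big-endian host) yields vector elements b0..b15. On
// POWER8/LE, LXVD2X loads each doubleword byte-reversed (b7..b0,
// b15..b8), and the VPERM with ESPERM swaps the bytes back within
// each doubleword, restoring the b0..b15 element order.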

// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments into registers.
	MOVD	nr+0(FP), ROUNDS
	MOVD	key+8(FP), INP
	MOVD	enc+16(FP), OUTENC
	MOVD	dec+24(FP), OUTDEC

#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
	LVX	(PTR), ESPERM
	ADD	$0x10, PTR
#else
	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	// Get key from memory and write aligned into VR
	P8_LXVB16X(INP, R0, IN0)
	ADD	$0x10, INP, INP
	MOVD	$0x20, TEMP

	CMPW	ROUNDS, $12
	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
	LVX	(PTR)(TEMP), MASK
	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
	MOVD	$8, CNT            // li    7,8        CNT = 8
	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)

	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
	// Move OUTDEC to the last key location, and store in descending order.
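	// For example, with 10 rounds the 11 round keys occupy offsets
	// 0, 16, ..., 160. OUTENC walks up from 0 while OUTDEC starts at
	// 160 and walks down, so dec[0] ends up holding the last round key.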
	ADD	$160, OUTDEC, OUTDEC
	BLT	loop128 // ROUNDS < 12: AES-128 (10 rounds)
	ADD	$32, OUTDEC, OUTDEC
	BEQ	l192    // ROUNDS == 12: AES-192
	ADD	$32, OUTDEC, OUTDEC
	JMP	l256    // ROUNDS == 14: AES-256

loop128:
	// Key schedule (Round 1 to 8)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3
	BDNZ	loop128

	LVX	(PTR)(R0), RCON // lvx 4,0,6     Last two round keys

	// Key schedule (Round 9)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 10)
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3

	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 11)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)

	RET

l192:
	LXSDX	(INP+R0), IN1                    // Load next 8 bytes into upper half of VSR.
	XXBRD_ON_LE(IN1, IN1)                    // and convert to BE ordering on LE hosts.
	MOVD	$4, CNT                          // li 7,4
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	VSPLTISB	$8, KEY                  // vspltisb 3,8
	MOVD	CNT, CTR                         // mtctr 7
	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3

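	// AES-192 expands 6 key words at a time. Each loop192 iteration
	// runs that expansion twice (12 words, i.e. 3 round keys), using
	// STAGE to assemble the round keys that straddle the IN0/IN1
	// register halves before storing them.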
loop192:
	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6

	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
	VXOR	TMP, IN1, TMP         // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1         // vxor 2,2,6
	VXOR	IN0, KEY, IN0         // vxor 1,1,3
	VXOR	IN1, KEY, IN1         // vxor 2,2,3
	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDNZ	loop192

	RET

l256:
	P8_LXVB16X(INP, R0, IN1)
	MOVD	$7, CNT                          // li 7,7
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	MOVD	CNT, CTR                         // mtctr 7

loop256:
	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN1, (R0+OUTENC)
	STXVD2X	IN1, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDZ	done

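	// Odd step of the AES-256 schedule: per FIPS 197, the second
	// 4-word group in each 8-word block applies SubWord (VSBOX) only,
	// with no RotWord and no round constant.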
	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
	VSBOX	KEY, KEY            // vsbox 3,3

	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6

	VXOR	IN1, KEY, IN1 // vxor 2,2,3
	JMP	loop256       // b .Loop256

done:
	RET

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Lenc_tail // Key size 10?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Lenc_tail // Key size 12?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Lenc_tail:
	// Cipher with the last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER		V0, V1, V1
	VCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault. This should never happen; only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the text.
	// Load xk[0:3] and xor with ciphertext
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Ldec_tail // Key size 10?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Ldec_tail // Key size 12?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Ldec_tail:
	// Cipher with the last two keys such that key information is
	// cleared from V1 and V2.
	VNCIPHER	V0, V1, V1
	VNCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault. This should never happen; only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// Remove the defines above so the names can be redefined below.
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP

#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9

#define INOUT V2
#define TMP V3
#define IVEC V4

// Load the crypt key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12,R14-R21 are scratch registers.
// For a key size of 10 rounds, V6, V11-V20 hold the expanded key.
// For a key size of 12 rounds, V6, V9-V20 hold the expanded key.
// For a key size of 14 rounds, V6, V7-V20 hold the expanded key.
#define LOAD_KEY(Rkeyp) \
	MOVD	$16, R12 \
	MOVD	$32, R14 \
	MOVD	$48, R15 \
	MOVD	$64, R16 \
	MOVD	$80, R17 \
	MOVD	$96, R18 \
	MOVD	$112, R19 \
	MOVD	$128, R20 \
	MOVD	$144, R21 \
	LXVD2X	(R0+Rkeyp), V6 \
	ADD	$16, Rkeyp \
	BEQ	CR1, L_start10 \
	BEQ	CR2, L_start12 \
	LXVD2X	(R0+Rkeyp), V7 \
	LXVD2X	(R12+Rkeyp), V8 \
	ADD	$32, Rkeyp \
	L_start12: \
	LXVD2X	(R0+Rkeyp), V9 \
	LXVD2X	(R12+Rkeyp), V10 \
	ADD	$32, Rkeyp \
	L_start10: \
	LXVD2X	(R0+Rkeyp), V11 \
	LXVD2X	(R12+Rkeyp), V12 \
	LXVD2X	(R14+Rkeyp), V13 \
	LXVD2X	(R15+Rkeyp), V14 \
	LXVD2X	(R16+Rkeyp), V15 \
	LXVD2X	(R17+Rkeyp), V16 \
	LXVD2X	(R18+Rkeyp), V17 \
	LXVD2X	(R19+Rkeyp), V18 \
	LXVD2X	(R20+Rkeyp), V19 \
	LXVD2X	(R21+Rkeyp), V20

// Perform the AES cipher operation for key sizes 10/12/14 using the
// keys loaded by LOAD_KEY, and the key size information held in
// CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly better encryption
// performance V6 and IVEC can be swapped (XOR is both associative and
// commutative) during encryption:
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
//	into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR	Vin, Vxor, Vout \
	BEQ	CR1, label10 \
	BEQ	CR2, label12 \
	vcipher	Vout, V7, Vout \
	vcipher	Vout, V8, Vout \
	label12: \
	vcipher	Vout, V9, Vout \
	vcipher	Vout, V10, Vout \
	label10: \
	vcipher	Vout, V11, Vout \
	vcipher	Vout, V12, Vout \
	vcipher	Vout, V13, Vout \
	vcipher	Vout, V14, Vout \
	vcipher	Vout, V15, Vout \
	vcipher	Vout, V16, Vout \
	vcipher	Vout, V17, Vout \
	vcipher	Vout, V18, Vout \
	vcipher	Vout, V19, Vout \
	vciphel	Vout, V20, Vout

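// Zero V6-V20 so no expanded key material is left behind in vector
// registers once the bulk operation completes.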
#define CLEAR_KEYS() \
	VXOR	V6, V6, V6 \
	VXOR	V7, V7, V7 \
	VXOR	V8, V8, V8 \
	VXOR	V9, V9, V9 \
	VXOR	V10, V10, V10 \
	VXOR	V11, V11, V11 \
	VXOR	V12, V12, V12 \
	VXOR	V13, V13, V13 \
	VXOR	V14, V14, V14 \
	VXOR	V15, V15, V15 \
	VXOR	V16, V16, V16 \
	VXOR	V17, V17, V17 \
	VXOR	V18, V18, V18 \
	VXOR	V19, V19, V19 \
	VXOR	V20, V20, V20

//func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
	MOVD	src+0(FP), INP
	MOVD	dst+8(FP), OUTP
	MOVD	length+16(FP), LEN
	MOVD	key+24(FP), KEYP
	MOVD	iv+32(FP), IVP
	MOVD	enc+40(FP), ENC
	MOVD	nr+48(FP), ROUNDS

	SETUP_ESPERM(R11)

	// Assume len > 0 && len % blockSize == 0.
	CMPW	ENC, $0
	P8_LXVB16X(IVP, R0, IVEC)
	CMPU	ROUNDS, $10, CR1
	CMPU	ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.

	// Set up key in VSRs, and set loop count in CTR.
	LOAD_KEY(KEYP)
	SRD	$4, LEN
	MOVD	LEN, CTR

	BEQ	Lcbc_dec

	PCALIGN $16
Lcbc_enc:
	P8_LXVB16X(INP, R0, INOUT)
	ADD	$16, INP
	VXOR	INOUT, V6, INOUT
	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
	VOR	INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_enc

	P8_STXVB16X(INOUT, IVP, R0)
	CLEAR_KEYS()
	RET

	PCALIGN $16
Lcbc_dec:
	P8_LXVB16X(INP, R0, TMP)
	ADD	$16, INP
	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
	VXOR	INOUT, IVEC, INOUT
	VOR	TMP, TMP, IVEC // TMP is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_dec

	P8_STXVB16X(IVEC, IVP, R0)
	CLEAR_KEYS()
	RET

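// The DO{1,2,4,8}_CIPHER macros load the next round key into keyv,
// advance the key pointer by 16, and apply the cipher op to 1, 2, 4,
// or 8 counter blocks in parallel. XOR_STORE xors a src block into
// its encrypted counter (iv) and stores the result in BE order.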
#define DO1_CIPHER(iv0, keyv, key, op) \
	LXVD2X	(key), keyv   \
	ADD	$16, key      \
	op	iv0, keyv, iv0

#define DO2_CIPHER(iv0, iv1, keyv, key, op) \
	DO1_CIPHER(iv0, keyv, key, op) \
	op	iv1, keyv, iv1

#define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	DO2_CIPHER(iv0, iv1, keyv, key, op) \
	op	iv2, keyv, iv2              \
	op	iv3, keyv, iv3

#define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
	DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	op	iv4, keyv, iv4                        \
	op	iv5, keyv, iv5                        \
	op	iv6, keyv, iv6                        \
	op	iv7, keyv, iv7

#define XOR_STORE(src, iv, dstp, dstpoff) \
	XXLXOR    src, iv, V8 \
	P8_STXVB16X(V8,dstp,dstpoff)

//func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0

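// CTRBLOCK_PROLOGUE loads the common ctrBlocks arguments, compares nr
// against 12 into CR1 (selecting the key_10/key_12/key_14 paths), and
// assembles the 128-bit counter ivhi:ivlo in V0 via MTVSRD/XXPERMDI.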
#define CTRBLOCK_PROLOGUE \
	MOVD	nr+0(FP), R3     \
	MOVD	xk+8(FP), R4     \
	MOVD	dst+16(FP), R5   \
	MOVD	src+24(FP), R6   \
	MOVD	ivlo+32(FP), R8  \
	MOVD	ivhi+40(FP), R9  \
	CMP	R3, $12, CR1     \
	MTVSRD	R8, V0           \
	MTVSRD	R9, V1           \
	XXPERMDI V1, V0, $0, V0  \
	SETUP_ESPERM(R8)

	CTRBLOCK_PROLOGUE

	DO1_CIPHER(V0,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_12:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	RET

//func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))

	DO2_CIPHER(V0,V1,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_12:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)

	RET

//func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
	VSUBUQM V1, V8, V2
	VSUBUQM V2, V8, V3

	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_12:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)

	RET

//func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
	VADDUQM V8, V8, V9	// V9 is -2

	VSUBUQM V0, V9, V2
	VSUBUQM V1, V9, V3
	VSUBUQM V2, V9, V4
	VSUBUQM V3, V9, V5
	VSUBUQM V4, V9, V6
	VSUBUQM V5, V9, V7

	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_12:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$64, R11
	P8_LXVB16X(R6,R11,V13)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$80, R12
	P8_LXVB16X(R6,R12,V14)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$96, R14
	P8_LXVB16X(R6,R14,V15)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$112, R15
	P8_LXVB16X(R6,R15,V16)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)
	XOR_STORE(V13,V4,R5,R11)
	XOR_STORE(V14,V5,R5,R12)
	XOR_STORE(V15,V6,R5,R14)
	XOR_STORE(V16,V7,R5,R15)

	RET
