...

Text file src/crypto/aes/gcm_arm64.s

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7#include "textflag.h"
     8
     9#define B0 V0
    10#define B1 V1
    11#define B2 V2
    12#define B3 V3
    13#define B4 V4
    14#define B5 V5
    15#define B6 V6
    16#define B7 V7
    17
    18#define ACC0 V8
    19#define ACC1 V9
    20#define ACCM V10
    21
    22#define T0 V11
    23#define T1 V12
    24#define T2 V13
    25#define T3 V14
    26
    27#define POLY V15
    28#define ZERO V16
    29#define INC V17
    30#define CTR V18
    31
    32#define K0 V19
    33#define K1 V20
    34#define K2 V21
    35#define K3 V22
    36#define K4 V23
    37#define K5 V24
    38#define K6 V25
    39#define K7 V26
    40#define K8 V27
    41#define K9 V28
    42#define K10 V29
    43#define K11 V30
    44#define KLAST V31
    45
    46#define reduce() \
    47	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
    48	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
    49	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
    50	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
    51	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
    52	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
    53	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
    54	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
    55	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
    56	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
    57	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
    58	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
    59	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \
    60
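       // reduce() folds the 256-bit carry-less product held in ACC0/ACC1,
       // with the Karatsuba middle term in ACCM, back to 128 bits modulo
       // the GHASH polynomial, using two VPMULLs by POLY. A rough Go
       // sketch of the recombination it starts from, assuming a
       // hypothetical 64x64->128 carry-less multiply clmul:
       //
       //	hh := clmul(aH, bH)                 // term at x^128
       //	ll := clmul(aL, bL)                 // term at x^0
       //	mm := clmul(aH^aL, bH^bL) ^ hh ^ ll // true middle term, at x^64
       //
       // so that product = hh<<128 ^ mm<<64 ^ ll before reduction.
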
    61// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
    62TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    63#define pTbl R0
    64#define tMsk R1
    65#define tPtr R2
    66#define plen R3
    67#define dlen R4
    68
    69	MOVD	$0xC2, R1
    70	LSL	$56, R1
    71	MOVD	$1, R0
    72	VMOV	R1, POLY.D[0]
    73	VMOV	R0, POLY.D[1]
    74	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
    75
    76	MOVD	productTable+0(FP), pTbl
    77	MOVD	tagMask+8(FP), tMsk
    78	MOVD	T+16(FP), tPtr
    79	MOVD	pLen+24(FP), plen
    80	MOVD	dLen+32(FP), dlen
    81
    82	VLD1	(tPtr), [ACC0.B16]
    83	VLD1	(tMsk), [B1.B16]
    84
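       	// The four instructions below build GHASH's final length block:
       	// the AAD and plaintext lengths in bits. A sketch in Go (names
       	// illustrative):
       	//
       	//	var lenBlock [16]byte
       	//	binary.BigEndian.PutUint64(lenBlock[0:8], dLen*8)  // AAD bits
       	//	binary.BigEndian.PutUint64(lenBlock[8:16], pLen*8) // plaintext bits
       	//
       	// except that here the block is assembled directly in the
       	// byte-swapped form the GHASH state is kept in.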
    85	LSL	$3, plen
    86	LSL	$3, dlen
    87
    88	VMOV	dlen, B0.D[0]
    89	VMOV	plen, B0.D[1]
    90
    91	ADD	$14*16, pTbl
    92	VLD1.P	(pTbl), [T1.B16, T2.B16]
    93
    94	VEOR	ACC0.B16, B0.B16, B0.B16
    95
    96	VEXT	$8, B0.B16, B0.B16, T0.B16
    97	VEOR	B0.B16, T0.B16, T0.B16
    98	VPMULL	B0.D1, T1.D1, ACC1.Q1
    99	VPMULL2	B0.D2, T1.D2, ACC0.Q1
   100	VPMULL	T0.D1, T2.D1, ACCM.Q1
   101
   102	reduce()
   103
   104	VREV64	ACC0.B16, ACC0.B16
   105	VEOR	B1.B16, ACC0.B16, ACC0.B16
   106
   107	VST1	[ACC0.B16], (tPtr)
   108	RET
   109#undef pTbl
   110#undef tMsk
   111#undef tPtr
   112#undef plen
   113#undef dlen
   114
   115// func gcmAesInit(productTable *[256]byte, ks []uint32)
   116TEXT ·gcmAesInit(SB),NOSPLIT,$0
   117#define pTbl R0
   118#define KS R1
   119#define NR R2
   120#define I R3
   121	MOVD	productTable+0(FP), pTbl
   122	MOVD	ks_base+8(FP), KS
   123	MOVD	ks_len+16(FP), NR
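       	// NR now holds len(ks): 44, 52 or 60 uint32 words for
       	// AES-128/192/256. The TBZ tests below derive the key size from
       	// its bits; an equivalent Go sketch (hypothetical helper):
       	//
       	//	func rounds(nr int) int {
       	//		if nr&16 == 0 {
       	//			return 10 // 44 words: AES-128
       	//		}
       	//		if nr&8 == 0 {
       	//			return 12 // 52 words: AES-192
       	//		}
       	//		return 14 // 60 words: AES-256
       	//	}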
   124
   125	MOVD	$0xC2, I
   126	LSL	$56, I
   127	VMOV	I, POLY.D[0]
   128	MOVD	$1, I
   129	VMOV	I, POLY.D[1]
   130	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   131
   132	// Encrypt block 0 with the AES key to generate the hash key H
   133	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
   134	VEOR	B0.B16, B0.B16, B0.B16
   135	AESE	T0.B16, B0.B16
   136	AESMC	B0.B16, B0.B16
   137	AESE	T1.B16, B0.B16
   138	AESMC	B0.B16, B0.B16
   139	AESE	T2.B16, B0.B16
   140	AESMC	B0.B16, B0.B16
   141	AESE	T3.B16, B0.B16
   142	AESMC	B0.B16, B0.B16
   143	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
   144	AESE	T0.B16, B0.B16
   145	AESMC	B0.B16, B0.B16
   146	AESE	T1.B16, B0.B16
   147	AESMC	B0.B16, B0.B16
   148	AESE	T2.B16, B0.B16
   149	AESMC	B0.B16, B0.B16
   150	AESE	T3.B16, B0.B16
   151	AESMC	B0.B16, B0.B16
   152	TBZ	$4, NR, initEncFinish
   153	VLD1.P	32(KS), [T0.B16, T1.B16]
   154	AESE	T0.B16, B0.B16
   155	AESMC	B0.B16, B0.B16
   156	AESE	T1.B16, B0.B16
   157	AESMC	B0.B16, B0.B16
   158	TBZ	$3, NR, initEncFinish
   159	VLD1.P	32(KS), [T0.B16, T1.B16]
   160	AESE	T0.B16, B0.B16
   161	AESMC	B0.B16, B0.B16
   162	AESE	T1.B16, B0.B16
   163	AESMC	B0.B16, B0.B16
   164initEncFinish:
   165	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
   166	AESE	T0.B16, B0.B16
   167	AESMC	B0.B16, B0.B16
   168	AESE	T1.B16, B0.B16
   169	VEOR	T2.B16, B0.B16, B0.B16
   170
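       	// B0 now holds the hash key H = AES-K(0^128), as in this Go
       	// sketch (block is an assumed cipher.Block for the same key):
       	//
       	//	var zero, h [16]byte
       	//	block.Encrypt(h[:], zero[:])
       	//
       	// VREV64 below moves it into the byte-swapped GHASH domain.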
   171	VREV64	B0.B16, B0.B16
   172
   173	// Multiply by 2 modulo P
   174	VMOV	B0.D[0], I
   175	ASR	$63, I
   176	VMOV	I, T1.D[0]
   177	VMOV	I, T1.D[1]
   178	VAND	POLY.B16, T1.B16, T1.B16
   179	VUSHR	$63, B0.D2, T2.D2
   180	VEXT	$8, ZERO.B16, T2.B16, T2.B16
   181	VSHL	$1, B0.D2, B0.D2
   182	VEOR	T1.B16, B0.B16, B0.B16
   183	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
   184
   185	// Karatsuba pre-computation
   186	VEXT	$8, B0.B16, B0.B16, B1.B16
   187	VEOR	B0.B16, B1.B16, B1.B16
   188
   189	ADD	$14*16, pTbl
   190	VST1	[B0.B16, B1.B16], (pTbl)
   191	SUB	$2*16, pTbl
   192
   193	VMOV	B0.B16, B2.B16
   194	VMOV	B1.B16, B3.B16
   195
   196	MOVD	$7, I
   197
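       	// initLoop fills productTable top-down with powers of H: H^1 was
       	// stored at offset 14*16 above, H^2 goes to 12*16, ..., H^8 ends
       	// at offset 0. Each power takes 32 bytes, the value followed by
       	// its Karatsuba precomputation (high half ^ low half), so the
       	// bulk loops can walk the table forward, fetching both with one
       	// load and multiplying block i by H^(8-i).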
   198initLoop:
   199	// Compute powers of H
   200	SUBS	$1, I
   201
   202	VPMULL	B0.D1, B2.D1, T1.Q1
   203	VPMULL2	B0.D2, B2.D2, T0.Q1
   204	VPMULL	B1.D1, B3.D1, T2.Q1
   205	VEOR	T0.B16, T2.B16, T2.B16
   206	VEOR	T1.B16, T2.B16, T2.B16
   207	VEXT	$8, ZERO.B16, T2.B16, T3.B16
   208	VEXT	$8, T2.B16, ZERO.B16, T2.B16
   209	VEOR	T2.B16, T0.B16, T0.B16
   210	VEOR	T3.B16, T1.B16, T1.B16
   211	VPMULL	POLY.D1, T0.D1, T2.Q1
   212	VEXT	$8, T0.B16, T0.B16, T0.B16
   213	VEOR	T2.B16, T0.B16, T0.B16
   214	VPMULL	POLY.D1, T0.D1, T2.Q1
   215	VEXT	$8, T0.B16, T0.B16, T0.B16
   216	VEOR	T2.B16, T0.B16, T0.B16
   217	VEOR	T1.B16, T0.B16, B2.B16
   218	VMOV	B2.B16, B3.B16
   219	VEXT	$8, B2.B16, B2.B16, B2.B16
   220	VEOR	B2.B16, B3.B16, B3.B16
   221
   222	VST1	[B2.B16, B3.B16], (pTbl)
   223	SUB	$2*16, pTbl
   224
   225	BNE	initLoop
   226	RET
   227#undef I
   228#undef NR
   229#undef KS
   230#undef pTbl
   231
   232// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
   233TEXT ·gcmAesData(SB),NOSPLIT,$0
   234#define pTbl R0
   235#define aut R1
   236#define tPtr R2
   237#define autLen R3
   238#define H0 R4
   239#define pTblSave R5
   240
   241#define mulRound(X) \
   242	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
   243	VREV64	X.B16, X.B16               \
   244	VEXT	$8, X.B16, X.B16, T0.B16   \
   245	VEOR	X.B16, T0.B16, T0.B16      \
   246	VPMULL	X.D1, T1.D1, T3.Q1         \
   247	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
   248	VPMULL2	X.D2, T1.D2, T3.Q1         \
   249	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
   250	VPMULL	T0.D1, T2.D1, T3.Q1        \
   251	VEOR	T3.B16, ACCM.B16, ACCM.B16
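
       // Each mulRound folds one input block into the deferred GHASH
       // accumulators: conceptually acc ^= block[i] * H^(8-i), with the
       // modular reduction of all eight products postponed to a single
       // reduce() per 128 bytes. Deferring is valid because the reduction
       // is linear over GF(2).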
   252
   253	MOVD	productTable+0(FP), pTbl
   254	MOVD	data_base+8(FP), aut
   255	MOVD	data_len+16(FP), autLen
   256	MOVD	T+32(FP), tPtr
   257
   258	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
   259	CBZ	autLen, dataBail
   260
   261	MOVD	$0xC2, H0
   262	LSL	$56, H0
   263	VMOV	H0, POLY.D[0]
   264	MOVD	$1, H0
   265	VMOV	H0, POLY.D[1]
   266	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   267	MOVD	pTbl, pTblSave
   268
   269	CMP	$13, autLen
   270	BEQ	dataTLS
   271	CMP	$128, autLen
   272	BLT	startSinglesLoop
   273	B	octetsLoop
   274
   275dataTLS:
   276	ADD	$14*16, pTbl
   277	VLD1.P	(pTbl), [T1.B16, T2.B16]
   278	VEOR	B0.B16, B0.B16, B0.B16
   279
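       	// The three partial loads below assemble the 13-byte TLS 1.2
       	// additional data (8+4+1 bytes) into a zero-padded block, i.e.:
       	//
       	//	var block [16]byte
       	//	copy(block[:13], aad)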
   280	MOVD	(aut), H0
   281	VMOV	H0, B0.D[0]
   282	MOVW	8(aut), H0
   283	VMOV	H0, B0.S[2]
   284	MOVB	12(aut), H0
   285	VMOV	H0, B0.B[12]
   286
   287	MOVD	$0, autLen
   288	B	dataMul
   289
   290octetsLoop:
   291		CMP	$128, autLen
   292		BLT	startSinglesLoop
   293		SUB	$128, autLen
   294
   295		VLD1.P	32(aut), [B0.B16, B1.B16]
   296
   297		VLD1.P	32(pTbl), [T1.B16, T2.B16]
   298		VREV64	B0.B16, B0.B16
   299		VEOR	ACC0.B16, B0.B16, B0.B16
   300		VEXT	$8, B0.B16, B0.B16, T0.B16
   301		VEOR	B0.B16, T0.B16, T0.B16
   302		VPMULL	B0.D1, T1.D1, ACC1.Q1
   303		VPMULL2	B0.D2, T1.D2, ACC0.Q1
   304		VPMULL	T0.D1, T2.D1, ACCM.Q1
   305
   306		mulRound(B1)
   307		VLD1.P  32(aut), [B2.B16, B3.B16]
   308		mulRound(B2)
   309		mulRound(B3)
   310		VLD1.P  32(aut), [B4.B16, B5.B16]
   311		mulRound(B4)
   312		mulRound(B5)
   313		VLD1.P  32(aut), [B6.B16, B7.B16]
   314		mulRound(B6)
   315		mulRound(B7)
   316
   317		MOVD	pTblSave, pTbl
   318		reduce()
   319	B	octetsLoop
   320
   321startSinglesLoop:
   322
   323	ADD	$14*16, pTbl
   324	VLD1.P	(pTbl), [T1.B16, T2.B16]
   325
   326singlesLoop:
   327
   328		CMP	$16, autLen
   329		BLT	dataEnd
   330		SUB	$16, autLen
   331
   332		VLD1.P	16(aut), [B0.B16]
   333dataMul:
   334		VREV64	B0.B16, B0.B16
   335		VEOR	ACC0.B16, B0.B16, B0.B16
   336
   337		VEXT	$8, B0.B16, B0.B16, T0.B16
   338		VEOR	B0.B16, T0.B16, T0.B16
   339		VPMULL	B0.D1, T1.D1, ACC1.Q1
   340		VPMULL2	B0.D2, T1.D2, ACC0.Q1
   341		VPMULL	T0.D1, T2.D1, ACCM.Q1
   342
   343		reduce()
   344
   345	B	singlesLoop
   346
   347dataEnd:
   348
   349	CBZ	autLen, dataBail
   350	VEOR	B0.B16, B0.B16, B0.B16
   351	ADD	autLen, aut
   352
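       	// dataLoadLoop walks backward from the end of the data (aut was
       	// advanced past it above), shifting each byte into B0 so that B0
       	// ends up holding the tail zero-padded to a full block:
       	//
       	//	var block [16]byte
       	//	copy(block[:], tail) // tail: the trailing len(data)%16 bytes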
   353dataLoadLoop:
   354		MOVB.W	-1(aut), H0
   355		VEXT	$15, B0.B16, ZERO.B16, B0.B16
   356		VMOV	H0, B0.B[0]
   357		SUBS	$1, autLen
   358		BNE	dataLoadLoop
   359	B	dataMul
   360
   361dataBail:
   362	VST1	[ACC0.B16], (tPtr)
   363	RET
   364
   365#undef pTbl
   366#undef aut
   367#undef tPtr
   368#undef autLen
   369#undef H0
   370#undef pTblSave
   371
   372// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   373TEXT ·gcmAesEnc(SB),NOSPLIT,$0
   374#define pTbl R0
   375#define dstPtr R1
   376#define ctrPtr R2
   377#define srcPtr R3
   378#define ks R4
   379#define tPtr R5
   380#define srcPtrLen R6
   381#define aluCTR R7
   382#define aluTMP R8
   383#define aluK R9
   384#define NR R10
   385#define H0 R11
   386#define H1 R12
   387#define curK R13
   388#define pTblSave R14
   389
   390#define aesrndx8(K) \
   391	AESE	K.B16, B0.B16    \
   392	AESMC	B0.B16, B0.B16   \
   393	AESE	K.B16, B1.B16    \
   394	AESMC	B1.B16, B1.B16   \
   395	AESE	K.B16, B2.B16    \
   396	AESMC	B2.B16, B2.B16   \
   397	AESE	K.B16, B3.B16    \
   398	AESMC	B3.B16, B3.B16   \
   399	AESE	K.B16, B4.B16    \
   400	AESMC	B4.B16, B4.B16   \
   401	AESE	K.B16, B5.B16    \
   402	AESMC	B5.B16, B5.B16   \
   403	AESE	K.B16, B6.B16    \
   404	AESMC	B6.B16, B6.B16   \
   405	AESE	K.B16, B7.B16    \
   406	AESMC	B7.B16, B7.B16
   407
   408#define aesrndlastx8(K) \
   409	AESE	K.B16, B0.B16    \
   410	AESE	K.B16, B1.B16    \
   411	AESE	K.B16, B2.B16    \
   412	AESE	K.B16, B3.B16    \
   413	AESE	K.B16, B4.B16    \
   414	AESE	K.B16, B5.B16    \
   415	AESE	K.B16, B6.B16    \
   416	AESE	K.B16, B7.B16
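
       // aesrndx8/aesrndlastx8 apply one AES round to eight blocks at a
       // time; interleaving eight independent AESE/AESMC chains hides the
       // latency of the AES instructions.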
   417
   418	MOVD	productTable+0(FP), pTbl
    419	MOVD	dst_base+8(FP), dstPtr
   420	MOVD	src_base+32(FP), srcPtr
   421	MOVD	src_len+40(FP), srcPtrLen
   422	MOVD	ctr+56(FP), ctrPtr
   423	MOVD	T+64(FP), tPtr
   424	MOVD	ks_base+72(FP), ks
   425	MOVD	ks_len+80(FP), NR
   426
   427	MOVD	$0xC2, H1
   428	LSL	$56, H1
   429	MOVD	$1, H0
   430	VMOV	H1, POLY.D[0]
   431	VMOV	H0, POLY.D[1]
   432	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
    433	// NR holds len(ks): 44, 52 or 60 words for AES-128/192/256
   434	MOVD	pTbl, pTblSave
   435	// Current tag, after AAD
   436	VLD1	(tPtr), [ACC0.B16]
   437	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
   438	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
   439	// Prepare initial counter, and the increment vector
   440	VLD1	(ctrPtr), [CTR.B16]
   441	VEOR	INC.B16, INC.B16, INC.B16
   442	MOVD	$1, H0
   443	VMOV	H0, INC.S[3]
   444	VREV32	CTR.B16, CTR.B16
   445	VADD	CTR.S4, INC.S4, CTR.S4
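       	// The counter is kept byte-swapped (VREV32) so it can be bumped
       	// with a single vector add in its last 32-bit lane; in Go terms:
       	//
       	//	n := binary.BigEndian.Uint32(ctr[12:])
       	//	binary.BigEndian.PutUint32(ctr[12:], n+1)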
   446	// Skip to <8 blocks loop
   447	CMP	$128, srcPtrLen
   448
   449	MOVD	ks, H0
    450	// For AES-128, round keys are stored in: K0 .. K10, KLAST
   451	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
   452	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
   453	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
   454	VMOV	K10.B16, KLAST.B16
   455
   456	BLT	startSingles
   457	// There are at least 8 blocks to encrypt
   458	TBZ	$4, NR, octetsLoop
   459
   460	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
   461	VMOV	K8.B16, K10.B16
   462	VMOV	K9.B16, K11.B16
   463	VMOV	KLAST.B16, K8.B16
   464	VLD1.P	16(H0), [K9.B16]
   465	VLD1.P  16(H0), [KLAST.B16]
   466	TBZ	$3, NR, octetsLoop
   467	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
   468	VMOV	KLAST.B16, K8.B16
   469	VLD1.P	16(H0), [K9.B16]
   470	VLD1.P  16(H0), [KLAST.B16]
   471	ADD	$10*16, ks, H0
   472	MOVD	H0, curK
   473
   474octetsLoop:
   475		SUB	$128, srcPtrLen
   476
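       		// Materialize counter blocks ctr+0 .. ctr+7 in B0 .. B7
       		// (VREV32 restores big-endian byte order for the cipher)
       		// and leave CTR advanced by 8. Roughly:
       		//
       		//	for i := range blocks { // blocks[0..7]
       		//		blocks[i] = ctr.bytes() // hypothetical helper
       		//		ctr++
       		//	}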
   477		VMOV	CTR.B16, B0.B16
   478		VADD	B0.S4, INC.S4, B1.S4
   479		VREV32	B0.B16, B0.B16
   480		VADD	B1.S4, INC.S4, B2.S4
   481		VREV32	B1.B16, B1.B16
   482		VADD	B2.S4, INC.S4, B3.S4
   483		VREV32	B2.B16, B2.B16
   484		VADD	B3.S4, INC.S4, B4.S4
   485		VREV32	B3.B16, B3.B16
   486		VADD	B4.S4, INC.S4, B5.S4
   487		VREV32	B4.B16, B4.B16
   488		VADD	B5.S4, INC.S4, B6.S4
   489		VREV32	B5.B16, B5.B16
   490		VADD	B6.S4, INC.S4, B7.S4
   491		VREV32	B6.B16, B6.B16
   492		VADD	B7.S4, INC.S4, CTR.S4
   493		VREV32	B7.B16, B7.B16
   494
   495		aesrndx8(K0)
   496		aesrndx8(K1)
   497		aesrndx8(K2)
   498		aesrndx8(K3)
   499		aesrndx8(K4)
   500		aesrndx8(K5)
   501		aesrndx8(K6)
   502		aesrndx8(K7)
   503		TBZ	$4, NR, octetsFinish
   504		aesrndx8(K10)
   505		aesrndx8(K11)
   506		TBZ	$3, NR, octetsFinish
   507		VLD1.P	32(curK), [T1.B16, T2.B16]
   508		aesrndx8(T1)
   509		aesrndx8(T2)
   510		MOVD	H0, curK
   511octetsFinish:
   512		aesrndx8(K8)
   513		aesrndlastx8(K9)
   514
   515		VEOR	KLAST.B16, B0.B16, B0.B16
   516		VEOR	KLAST.B16, B1.B16, B1.B16
   517		VEOR	KLAST.B16, B2.B16, B2.B16
   518		VEOR	KLAST.B16, B3.B16, B3.B16
   519		VEOR	KLAST.B16, B4.B16, B4.B16
   520		VEOR	KLAST.B16, B5.B16, B5.B16
   521		VEOR	KLAST.B16, B6.B16, B6.B16
   522		VEOR	KLAST.B16, B7.B16, B7.B16
   523
   524		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   525		VEOR	B0.B16, T1.B16, B0.B16
   526		VEOR	B1.B16, T2.B16, B1.B16
   527		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
   528		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   529		VEOR	B2.B16, T1.B16, B2.B16
   530		VEOR	B3.B16, T2.B16, B3.B16
   531		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
   532		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   533		VEOR	B4.B16, T1.B16, B4.B16
   534		VEOR	B5.B16, T2.B16, B5.B16
   535		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
   536		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   537		VEOR	B6.B16, T1.B16, B6.B16
   538		VEOR	B7.B16, T2.B16, B7.B16
   539		VST1.P  [B6.B16, B7.B16], 32(dstPtr)
   540
   541		VLD1.P	32(pTbl), [T1.B16, T2.B16]
   542		VREV64	B0.B16, B0.B16
   543		VEOR	ACC0.B16, B0.B16, B0.B16
   544		VEXT	$8, B0.B16, B0.B16, T0.B16
   545		VEOR	B0.B16, T0.B16, T0.B16
   546		VPMULL	B0.D1, T1.D1, ACC1.Q1
   547		VPMULL2	B0.D2, T1.D2, ACC0.Q1
   548		VPMULL	T0.D1, T2.D1, ACCM.Q1
   549
   550		mulRound(B1)
   551		mulRound(B2)
   552		mulRound(B3)
   553		mulRound(B4)
   554		mulRound(B5)
   555		mulRound(B6)
   556		mulRound(B7)
   557		MOVD	pTblSave, pTbl
   558		reduce()
   559
   560		CMP	$128, srcPtrLen
   561		BGE	octetsLoop
   562
   563startSingles:
   564	CBZ	srcPtrLen, done
   565	ADD	$14*16, pTbl
   566	// Preload H and its Karatsuba precomp
   567	VLD1.P	(pTbl), [T1.B16, T2.B16]
   568	// Preload AES round keys
   569	ADD	$128, ks
   570	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
   571	VMOV	K10.B16, KLAST.B16
   572	TBZ	$4, NR, singlesLoop
   573	VLD1.P	32(ks), [B1.B16, B2.B16]
   574	VMOV	B2.B16, KLAST.B16
   575	TBZ	$3, NR, singlesLoop
   576	VLD1.P	32(ks), [B3.B16, B4.B16]
   577	VMOV	B4.B16, KLAST.B16
   578
   579singlesLoop:
   580		CMP	$16, srcPtrLen
   581		BLT	tail
   582		SUB	$16, srcPtrLen
   583
   584		VLD1.P	16(srcPtr), [T0.B16]
   585		VEOR	KLAST.B16, T0.B16, T0.B16
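       		// Folding the last round key into the plaintext up front lets
       		// the AES chain below end without a trailing AddRoundKey:
       		// E(ctr) ^ pt == aesRoundsNoFinalXor(ctr) ^ (pt ^ kLast).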
   586
   587		VREV32	CTR.B16, B0.B16
   588		VADD	CTR.S4, INC.S4, CTR.S4
   589
   590		AESE	K0.B16, B0.B16
   591		AESMC	B0.B16, B0.B16
   592		AESE	K1.B16, B0.B16
   593		AESMC	B0.B16, B0.B16
   594		AESE	K2.B16, B0.B16
   595		AESMC	B0.B16, B0.B16
   596		AESE	K3.B16, B0.B16
   597		AESMC	B0.B16, B0.B16
   598		AESE	K4.B16, B0.B16
   599		AESMC	B0.B16, B0.B16
   600		AESE	K5.B16, B0.B16
   601		AESMC	B0.B16, B0.B16
   602		AESE	K6.B16, B0.B16
   603		AESMC	B0.B16, B0.B16
   604		AESE	K7.B16, B0.B16
   605		AESMC	B0.B16, B0.B16
   606		AESE	K8.B16, B0.B16
   607		AESMC	B0.B16, B0.B16
   608		AESE	K9.B16, B0.B16
   609		TBZ	$4, NR, singlesLast
   610		AESMC	B0.B16, B0.B16
   611		AESE	K10.B16, B0.B16
   612		AESMC	B0.B16, B0.B16
   613		AESE	B1.B16, B0.B16
   614		TBZ	$3, NR, singlesLast
   615		AESMC	B0.B16, B0.B16
   616		AESE	B2.B16, B0.B16
   617		AESMC	B0.B16, B0.B16
   618		AESE	B3.B16, B0.B16
   619singlesLast:
   620		VEOR	T0.B16, B0.B16, B0.B16
   621encReduce:
   622		VST1.P	[B0.B16], 16(dstPtr)
   623
   624		VREV64	B0.B16, B0.B16
   625		VEOR	ACC0.B16, B0.B16, B0.B16
   626
   627		VEXT	$8, B0.B16, B0.B16, T0.B16
   628		VEOR	B0.B16, T0.B16, T0.B16
   629		VPMULL	B0.D1, T1.D1, ACC1.Q1
   630		VPMULL2	B0.D2, T1.D2, ACC0.Q1
   631		VPMULL	T0.D1, T2.D1, ACCM.Q1
   632
   633		reduce()
   634
   635	B	singlesLoop
   636tail:
   637	CBZ	srcPtrLen, done
   638
   639	VEOR	T0.B16, T0.B16, T0.B16
   640	VEOR	T3.B16, T3.B16, T3.B16
   641	MOVD	$0, H1
   642	SUB	$1, H1
   643	ADD	srcPtrLen, srcPtr
   644
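       	// The TBZ ladder below gathers the trailing <16 source bytes into
       	// T0 (zero-padded) and builds a matching byte mask of 0xff in T3,
       	// so the encrypted tail can be masked before entering GHASH.
       	// A sketch (names illustrative):
       	//
       	//	var buf, mask [16]byte
       	//	copy(buf[:], src[len(src)-rem:]) // rem = len(src) % 16
       	//	for i := 0; i < rem; i++ {
       	//		mask[i] = 0xff
       	//	}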
   645	TBZ	$3, srcPtrLen, ld4
   646	MOVD.W	-8(srcPtr), H0
   647	VMOV	H0, T0.D[0]
   648	VMOV	H1, T3.D[0]
   649ld4:
   650	TBZ	$2, srcPtrLen, ld2
   651	MOVW.W	-4(srcPtr), H0
   652	VEXT	$12, T0.B16, ZERO.B16, T0.B16
   653	VEXT	$12, T3.B16, ZERO.B16, T3.B16
   654	VMOV	H0, T0.S[0]
   655	VMOV	H1, T3.S[0]
   656ld2:
   657	TBZ	$1, srcPtrLen, ld1
   658	MOVH.W	-2(srcPtr), H0
   659	VEXT	$14, T0.B16, ZERO.B16, T0.B16
   660	VEXT	$14, T3.B16, ZERO.B16, T3.B16
   661	VMOV	H0, T0.H[0]
   662	VMOV	H1, T3.H[0]
   663ld1:
   664	TBZ	$0, srcPtrLen, ld0
   665	MOVB.W	-1(srcPtr), H0
   666	VEXT	$15, T0.B16, ZERO.B16, T0.B16
   667	VEXT	$15, T3.B16, ZERO.B16, T3.B16
   668	VMOV	H0, T0.B[0]
   669	VMOV	H1, T3.B[0]
   670ld0:
   671
   672	MOVD	ZR, srcPtrLen
   673	VEOR	KLAST.B16, T0.B16, T0.B16
   674	VREV32	CTR.B16, B0.B16
   675
   676	AESE	K0.B16, B0.B16
   677	AESMC	B0.B16, B0.B16
   678	AESE	K1.B16, B0.B16
   679	AESMC	B0.B16, B0.B16
   680	AESE	K2.B16, B0.B16
   681	AESMC	B0.B16, B0.B16
   682	AESE	K3.B16, B0.B16
   683	AESMC	B0.B16, B0.B16
   684	AESE	K4.B16, B0.B16
   685	AESMC	B0.B16, B0.B16
   686	AESE	K5.B16, B0.B16
   687	AESMC	B0.B16, B0.B16
   688	AESE	K6.B16, B0.B16
   689	AESMC	B0.B16, B0.B16
   690	AESE	K7.B16, B0.B16
   691	AESMC	B0.B16, B0.B16
   692	AESE	K8.B16, B0.B16
   693	AESMC	B0.B16, B0.B16
   694	AESE	K9.B16, B0.B16
   695	TBZ	$4, NR, tailLast
   696	AESMC	B0.B16, B0.B16
   697	AESE	K10.B16, B0.B16
   698	AESMC	B0.B16, B0.B16
   699	AESE	B1.B16, B0.B16
   700	TBZ	$3, NR, tailLast
   701	AESMC	B0.B16, B0.B16
   702	AESE	B2.B16, B0.B16
   703	AESMC	B0.B16, B0.B16
   704	AESE	B3.B16, B0.B16
   705
   706tailLast:
   707	VEOR	T0.B16, B0.B16, B0.B16
   708	VAND	T3.B16, B0.B16, B0.B16
   709	B	encReduce
   710
   711done:
   712	VST1	[ACC0.B16], (tPtr)
   713	RET
   714
   715// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
   716TEXT ·gcmAesDec(SB),NOSPLIT,$0
   717	MOVD	productTable+0(FP), pTbl
    718	MOVD	dst_base+8(FP), dstPtr
   719	MOVD	src_base+32(FP), srcPtr
   720	MOVD	src_len+40(FP), srcPtrLen
   721	MOVD	ctr+56(FP), ctrPtr
   722	MOVD	T+64(FP), tPtr
   723	MOVD	ks_base+72(FP), ks
   724	MOVD	ks_len+80(FP), NR
   725
   726	MOVD	$0xC2, H1
   727	LSL	$56, H1
   728	MOVD	$1, H0
   729	VMOV	H1, POLY.D[0]
   730	VMOV	H0, POLY.D[1]
   731	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
    732	// NR holds len(ks): 44, 52 or 60 words for AES-128/192/256
   733	MOVD	pTbl, pTblSave
   734	// Current tag, after AAD
   735	VLD1	(tPtr), [ACC0.B16]
   736	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
   737	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
   738	// Prepare initial counter, and the increment vector
   739	VLD1	(ctrPtr), [CTR.B16]
   740	VEOR	INC.B16, INC.B16, INC.B16
   741	MOVD	$1, H0
   742	VMOV	H0, INC.S[3]
   743	VREV32	CTR.B16, CTR.B16
   744	VADD	CTR.S4, INC.S4, CTR.S4
   745
   746	MOVD	ks, H0
    747	// For AES-128, round keys are stored in: K0 .. K10, KLAST
   748	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
   749	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
   750	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
   751	VMOV	K10.B16, KLAST.B16
   752
   753	// Skip to <8 blocks loop
   754	CMP	$128, srcPtrLen
   755	BLT	startSingles
    756	// There are at least 8 blocks to decrypt
   757	TBZ	$4, NR, octetsLoop
   758
   759	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
   760	VMOV	K8.B16, K10.B16
   761	VMOV	K9.B16, K11.B16
   762	VMOV	KLAST.B16, K8.B16
   763	VLD1.P	16(H0), [K9.B16]
   764	VLD1.P  16(H0), [KLAST.B16]
   765	TBZ	$3, NR, octetsLoop
   766	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
   767	VMOV	KLAST.B16, K8.B16
   768	VLD1.P	16(H0), [K9.B16]
   769	VLD1.P  16(H0), [KLAST.B16]
   770	ADD	$10*16, ks, H0
   771	MOVD	H0, curK
   772
   773octetsLoop:
   774		SUB	$128, srcPtrLen
   775
   776		VMOV	CTR.B16, B0.B16
   777		VADD	B0.S4, INC.S4, B1.S4
   778		VREV32	B0.B16, B0.B16
   779		VADD	B1.S4, INC.S4, B2.S4
   780		VREV32	B1.B16, B1.B16
   781		VADD	B2.S4, INC.S4, B3.S4
   782		VREV32	B2.B16, B2.B16
   783		VADD	B3.S4, INC.S4, B4.S4
   784		VREV32	B3.B16, B3.B16
   785		VADD	B4.S4, INC.S4, B5.S4
   786		VREV32	B4.B16, B4.B16
   787		VADD	B5.S4, INC.S4, B6.S4
   788		VREV32	B5.B16, B5.B16
   789		VADD	B6.S4, INC.S4, B7.S4
   790		VREV32	B6.B16, B6.B16
   791		VADD	B7.S4, INC.S4, CTR.S4
   792		VREV32	B7.B16, B7.B16
   793
   794		aesrndx8(K0)
   795		aesrndx8(K1)
   796		aesrndx8(K2)
   797		aesrndx8(K3)
   798		aesrndx8(K4)
   799		aesrndx8(K5)
   800		aesrndx8(K6)
   801		aesrndx8(K7)
   802		TBZ	$4, NR, octetsFinish
   803		aesrndx8(K10)
   804		aesrndx8(K11)
   805		TBZ	$3, NR, octetsFinish
   806		VLD1.P	32(curK), [T1.B16, T2.B16]
   807		aesrndx8(T1)
   808		aesrndx8(T2)
   809		MOVD	H0, curK
   810octetsFinish:
   811		aesrndx8(K8)
   812		aesrndlastx8(K9)
   813
   814		VEOR	KLAST.B16, B0.B16, T1.B16
   815		VEOR	KLAST.B16, B1.B16, T2.B16
   816		VEOR	KLAST.B16, B2.B16, B2.B16
   817		VEOR	KLAST.B16, B3.B16, B3.B16
   818		VEOR	KLAST.B16, B4.B16, B4.B16
   819		VEOR	KLAST.B16, B5.B16, B5.B16
   820		VEOR	KLAST.B16, B6.B16, B6.B16
   821		VEOR	KLAST.B16, B7.B16, B7.B16
   822
   823		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   824		VEOR	B0.B16, T1.B16, T1.B16
   825		VEOR	B1.B16, T2.B16, T2.B16
   826		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   827
   828		VLD1.P	32(pTbl), [T1.B16, T2.B16]
   829		VREV64	B0.B16, B0.B16
   830		VEOR	ACC0.B16, B0.B16, B0.B16
   831		VEXT	$8, B0.B16, B0.B16, T0.B16
   832		VEOR	B0.B16, T0.B16, T0.B16
   833		VPMULL	B0.D1, T1.D1, ACC1.Q1
   834		VPMULL2	B0.D2, T1.D2, ACC0.Q1
   835		VPMULL	T0.D1, T2.D1, ACCM.Q1
   836		mulRound(B1)
   837
   838		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   839		VEOR	B2.B16, B0.B16, T1.B16
   840		VEOR	B3.B16, B1.B16, T2.B16
   841		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   842		mulRound(B0)
   843		mulRound(B1)
   844
   845		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   846		VEOR	B4.B16, B0.B16, T1.B16
   847		VEOR	B5.B16, B1.B16, T2.B16
   848		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   849		mulRound(B0)
   850		mulRound(B1)
   851
   852		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   853		VEOR	B6.B16, B0.B16, T1.B16
   854		VEOR	B7.B16, B1.B16, T2.B16
   855		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   856		mulRound(B0)
   857		mulRound(B1)
   858
   859		MOVD	pTblSave, pTbl
   860		reduce()
   861
   862		CMP	$128, srcPtrLen
   863		BGE	octetsLoop
   864
   865startSingles:
   866	CBZ	srcPtrLen, done
   867	ADD	$14*16, pTbl
   868	// Preload H and its Karatsuba precomp
   869	VLD1.P	(pTbl), [T1.B16, T2.B16]
   870	// Preload AES round keys
   871	ADD	$128, ks
   872	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
   873	VMOV	K10.B16, KLAST.B16
   874	TBZ	$4, NR, singlesLoop
   875	VLD1.P	32(ks), [B1.B16, B2.B16]
   876	VMOV	B2.B16, KLAST.B16
   877	TBZ	$3, NR, singlesLoop
   878	VLD1.P	32(ks), [B3.B16, B4.B16]
   879	VMOV	B4.B16, KLAST.B16
   880
   881singlesLoop:
   882		CMP	$16, srcPtrLen
   883		BLT	tail
   884		SUB	$16, srcPtrLen
   885
   886		VLD1.P	16(srcPtr), [T0.B16]
   887		VREV64	T0.B16, B5.B16
   888		VEOR	KLAST.B16, T0.B16, T0.B16
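       		// Decryption hashes the ciphertext, so the input block is
       		// saved byte-reversed in B5 for GHASH before being decrypted;
       		// KLAST is folded into T0 as in the encrypt path.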
   889
   890		VREV32	CTR.B16, B0.B16
   891		VADD	CTR.S4, INC.S4, CTR.S4
   892
   893		AESE	K0.B16, B0.B16
   894		AESMC	B0.B16, B0.B16
   895		AESE	K1.B16, B0.B16
   896		AESMC	B0.B16, B0.B16
   897		AESE	K2.B16, B0.B16
   898		AESMC	B0.B16, B0.B16
   899		AESE	K3.B16, B0.B16
   900		AESMC	B0.B16, B0.B16
   901		AESE	K4.B16, B0.B16
   902		AESMC	B0.B16, B0.B16
   903		AESE	K5.B16, B0.B16
   904		AESMC	B0.B16, B0.B16
   905		AESE	K6.B16, B0.B16
   906		AESMC	B0.B16, B0.B16
   907		AESE	K7.B16, B0.B16
   908		AESMC	B0.B16, B0.B16
   909		AESE	K8.B16, B0.B16
   910		AESMC	B0.B16, B0.B16
   911		AESE	K9.B16, B0.B16
   912		TBZ	$4, NR, singlesLast
   913		AESMC	B0.B16, B0.B16
   914		AESE	K10.B16, B0.B16
   915		AESMC	B0.B16, B0.B16
   916		AESE	B1.B16, B0.B16
   917		TBZ	$3, NR, singlesLast
   918		AESMC	B0.B16, B0.B16
   919		AESE	B2.B16, B0.B16
   920		AESMC	B0.B16, B0.B16
   921		AESE	B3.B16, B0.B16
   922singlesLast:
   923		VEOR	T0.B16, B0.B16, B0.B16
   924
   925		VST1.P	[B0.B16], 16(dstPtr)
   926
   927		VEOR	ACC0.B16, B5.B16, B5.B16
   928		VEXT	$8, B5.B16, B5.B16, T0.B16
   929		VEOR	B5.B16, T0.B16, T0.B16
   930		VPMULL	B5.D1, T1.D1, ACC1.Q1
   931		VPMULL2	B5.D2, T1.D2, ACC0.Q1
   932		VPMULL	T0.D1, T2.D1, ACCM.Q1
   933		reduce()
   934
   935	B	singlesLoop
   936tail:
   937	CBZ	srcPtrLen, done
   938
   939	VREV32	CTR.B16, B0.B16
   940	VADD	CTR.S4, INC.S4, CTR.S4
   941
   942	AESE	K0.B16, B0.B16
   943	AESMC	B0.B16, B0.B16
   944	AESE	K1.B16, B0.B16
   945	AESMC	B0.B16, B0.B16
   946	AESE	K2.B16, B0.B16
   947	AESMC	B0.B16, B0.B16
   948	AESE	K3.B16, B0.B16
   949	AESMC	B0.B16, B0.B16
   950	AESE	K4.B16, B0.B16
   951	AESMC	B0.B16, B0.B16
   952	AESE	K5.B16, B0.B16
   953	AESMC	B0.B16, B0.B16
   954	AESE	K6.B16, B0.B16
   955	AESMC	B0.B16, B0.B16
   956	AESE	K7.B16, B0.B16
   957	AESMC	B0.B16, B0.B16
   958	AESE	K8.B16, B0.B16
   959	AESMC	B0.B16, B0.B16
   960	AESE	K9.B16, B0.B16
   961	TBZ	$4, NR, tailLast
   962	AESMC	B0.B16, B0.B16
   963	AESE	K10.B16, B0.B16
   964	AESMC	B0.B16, B0.B16
   965	AESE	B1.B16, B0.B16
   966	TBZ	$3, NR, tailLast
   967	AESMC	B0.B16, B0.B16
   968	AESE	B2.B16, B0.B16
   969	AESMC	B0.B16, B0.B16
   970	AESE	B3.B16, B0.B16
   971tailLast:
   972	VEOR	KLAST.B16, B0.B16, B0.B16
   973
    974	// Assuming it is safe to load past the end of src due to the presence of the tag
   975	VLD1	(srcPtr), [B5.B16]
   976
   977	VEOR	B5.B16, B0.B16, B0.B16
   978
   979	VEOR	T3.B16, T3.B16, T3.B16
   980	MOVD	$0, H1
   981	SUB	$1, H1
   982
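       	// Write the decrypted tail out in 8/4/2/1-byte chunks according
       	// to the bits of srcPtrLen, building a byte mask in T3 as we go;
       	// the loaded ciphertext (B5) is masked with it before GHASH so
       	// only bytes inside the message are hashed. A sketch (names
       	// illustrative):
       	//
       	//	copy(dst[:rem], pt[:rem]) // rem = len(src) % 16
       	//	for i := rem; i < 16; i++ {
       	//		ct[i] = 0
       	//	}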
   983	TBZ	$3, srcPtrLen, ld4
   984	VMOV	B0.D[0], H0
   985	MOVD.P	H0, 8(dstPtr)
   986	VMOV	H1, T3.D[0]
   987	VEXT	$8, ZERO.B16, B0.B16, B0.B16
   988ld4:
   989	TBZ	$2, srcPtrLen, ld2
   990	VMOV	B0.S[0], H0
   991	MOVW.P	H0, 4(dstPtr)
   992	VEXT	$12, T3.B16, ZERO.B16, T3.B16
   993	VMOV	H1, T3.S[0]
   994	VEXT	$4, ZERO.B16, B0.B16, B0.B16
   995ld2:
   996	TBZ	$1, srcPtrLen, ld1
   997	VMOV	B0.H[0], H0
   998	MOVH.P	H0, 2(dstPtr)
   999	VEXT	$14, T3.B16, ZERO.B16, T3.B16
  1000	VMOV	H1, T3.H[0]
  1001	VEXT	$2, ZERO.B16, B0.B16, B0.B16
  1002ld1:
  1003	TBZ	$0, srcPtrLen, ld0
  1004	VMOV	B0.B[0], H0
  1005	MOVB.P	H0, 1(dstPtr)
  1006	VEXT	$15, T3.B16, ZERO.B16, T3.B16
  1007	VMOV	H1, T3.B[0]
  1008ld0:
  1009
  1010	VAND	T3.B16, B5.B16, B5.B16
  1011	VREV64	B5.B16, B5.B16
  1012
  1013	VEOR	ACC0.B16, B5.B16, B5.B16
  1014	VEXT	$8, B5.B16, B5.B16, T0.B16
  1015	VEOR	B5.B16, T0.B16, T0.B16
  1016	VPMULL	B5.D1, T1.D1, ACC1.Q1
  1017	VPMULL2	B5.D2, T1.D2, ACC0.Q1
  1018	VPMULL	T0.D1, T2.D1, ACCM.Q1
  1019	reduce()
  1020done:
  1021	VST1	[ACC0.B16], (tPtr)
  1022
  1023	RET
