...

Text file src/crypto/aes/gcm_amd64.s

Documentation: crypto/aes

     1// Copyright 2015 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     8// The implementation uses some of the optimizations described in:
     9// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
    10//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
    11// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    12//     Hardware
    13
    14#include "textflag.h"
    15
      // Register aliases used by every routine in this file.
      // B0-B7 hold up to eight 128-bit blocks that are processed together.
    16#define B0 X0
    17#define B1 X1
    18#define B2 X2
    19#define B3 X3
    20#define B4 X4
    21#define B5 X5
    22#define B6 X6
    23#define B7 X7
    24
      // Accumulators for the carry-less multiplications: ACC0 receives the
      // low*low products, ACC1 the high*high products and ACCM the Karatsuba
      // middle products.
    25#define ACC0 X8
    26#define ACC1 X9
    27#define ACCM X10
    28
      // T0-T2 are temporaries. POLY and BSWAP are loaded from gcmPoly and
      // bswapMask at the start of each routine.
    29#define T0 X11
    30#define T1 X12
    31#define T2 X13
    32#define POLY X14
    33#define BSWAP X15
    34
      // bswapMask is a PSHUFB control that reverses the 16 bytes of a register.
    35DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    36DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    37
      // gcmPoly is the constant the reduction steps multiply by. gcmAesInit
      // also XORs it in conditionally when it doubles the hash key.
    38DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    39DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    40
      // andMask holds 15 masks of 16 bytes each. The mask at offset 16*(k-1)
      // has its k lowest bytes set to 0xff and the remaining bytes zero, for
      // k = 1..15. The tail code of gcmAesEnc and gcmAesDec indexes it with the
      // number of leftover bytes.
    41DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    42DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    43DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    44DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    45DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    46DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    47DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    48DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    49DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    50DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    51DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    52DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    53DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    54DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    55DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    56DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    57DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    58DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    59DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    60DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    61DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    62DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    63DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    64DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    65DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    66DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    67DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    68DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    69DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    70DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    71
      // All three tables are read-only and pointer-free. Sizes are in bytes.
    72GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    73GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    74GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    75
    76// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
      //
      // gcmAesFinish completes the tag. It builds a block from the two lengths
      // (converted from bytes to bits), XORs it into the running hash read from T,
      // multiplies the result by the productTable entries at offsets 16*14 and
      // 16*15 (the entries gcmAesInit writes first, before it starts computing
      // higher powers), reduces the product, byte-swaps it, XORs in tagMask and
      // stores the 16-byte result back to T.
    77TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    78#define pTbl DI
    79#define tMsk SI
    80#define tPtr DX
    81#define plen AX
    82#define dlen CX
    83
    84	MOVQ productTable+0(FP), pTbl
    85	MOVQ tagMask+8(FP), tMsk
    86	MOVQ T+16(FP), tPtr
    87	MOVQ pLen+24(FP), plen
    88	MOVQ dLen+32(FP), dlen
    89
      	// ACC0 = running hash from T, T2 = tagMask (kept until the final XOR).
    90	MOVOU (tPtr), ACC0
    91	MOVOU (tMsk), T2
    92
    93	MOVOU bswapMask<>(SB), BSWAP
    94	MOVOU gcmPoly<>(SB), POLY
    95
      	// Convert both byte counts to bit counts.
    96	SHLQ $3, plen
    97	SHLQ $3, dlen
    98
      	// B0 = plen bits in the low quadword, dlen bits in the high quadword.
    99	MOVQ plen, B0
   100	PINSRQ $1, dlen, B0
   101
      	// Fold the length block into the running hash.
   102	PXOR ACC0, B0
   103
      	// Karatsuba multiply of B0 by the table entry at 16*14, using the
      	// pre-folded operand stored at 16*15 for the middle product.
   104	MOVOU (16*14)(pTbl), ACC0
   105	MOVOU (16*15)(pTbl), ACCM
   106	MOVOU ACC0, ACC1
   107
   108	PCLMULQDQ $0x00, B0, ACC0
   109	PCLMULQDQ $0x11, B0, ACC1
      	// PSHUFD $78 swaps the two quadwords, so T0 = low XOR high of B0 in each half.
   110	PSHUFD $78, B0, T0
   111	PXOR B0, T0
   112	PCLMULQDQ $0x00, T0, ACCM
   113
      	// Recover the middle term and split it across the low (ACC0) and
      	// high (ACC1) halves of the 256-bit product.
   114	PXOR ACC0, ACCM
   115	PXOR ACC1, ACCM
   116	MOVOU ACCM, T0
   117	PSRLDQ $8, ACCM
   118	PSLLDQ $8, T0
   119	PXOR ACCM, ACC1
   120	PXOR T0, ACC0
   121
      	// First reduction step: multiply by POLY, swap halves, XOR.
   122	MOVOU POLY, T0
   123	PCLMULQDQ $0x01, ACC0, T0
   124	PSHUFD $78, ACC0, ACC0
   125	PXOR T0, ACC0
   126
      	// Second reduction step, identical to the first.
   127	MOVOU POLY, T0
   128	PCLMULQDQ $0x01, ACC0, T0
   129	PSHUFD $78, ACC0, ACC0
   130	PXOR T0, ACC0
   131
      	// Combine with the high half to obtain the reduced 128-bit result.
   132	PXOR ACC1, ACC0
   133
      	// Reverse the bytes, apply the tag mask and write the tag to T.
   134	PSHUFB BSWAP, ACC0
   135	PXOR T2, ACC0
   136	MOVOU ACC0, (tPtr)
   137
   138	RET
   139#undef pTbl
   140#undef tMsk
   141#undef tPtr
   142#undef plen
   143#undef dlen
   144
   145// func gcmAesInit(productTable *[256]byte, ks []uint32)
      //
      // gcmAesInit derives the hash key from the expanded AES key schedule ks and
      // fills productTable with eight 32-byte entries. Each entry is a value
      // followed by the XOR of its two quadword halves (broadcast to both halves),
      // which the multiply routines use as the pre-folded Karatsuba operand.
      // The entry at offset 16*14 is built from the byte-swapped, doubled hash key.
      // Each loop iteration multiplies the previous entry by that first one and
      // stores the result 32 bytes lower, so the highest power ends up at offset 0.
   146TEXT ·gcmAesInit(SB),NOSPLIT,$0
   147#define dst DI
   148#define KS SI
   149#define NR DX
   150
   151	MOVQ productTable+0(FP), dst
   152	MOVQ ks_base+8(FP), KS
   153	MOVQ ks_len+16(FP), NR
   154
      	// ks_len counts uint32 words. Dividing by 4 gives the number of 16-byte
      	// round keys, and subtracting 1 gives the number of AES rounds.
   155	SHRQ $2, NR
   156	DECQ NR
   157
   158	MOVOU bswapMask<>(SB), BSWAP
   159	MOVOU gcmPoly<>(SB), POLY
   160
   161	// Encrypt block 0, with the AES key to generate the hash key H
      	// The block is all zero, so after the initial key XOR the state is
      	// simply round key 0.
   162	MOVOU (16*0)(KS), B0
   163	MOVOU (16*1)(KS), T0
   164	AESENC T0, B0
   165	MOVOU (16*2)(KS), T0
   166	AESENC T0, B0
   167	MOVOU (16*3)(KS), T0
   168	AESENC T0, B0
   169	MOVOU (16*4)(KS), T0
   170	AESENC T0, B0
   171	MOVOU (16*5)(KS), T0
   172	AESENC T0, B0
   173	MOVOU (16*6)(KS), T0
   174	AESENC T0, B0
   175	MOVOU (16*7)(KS), T0
   176	AESENC T0, B0
   177	MOVOU (16*8)(KS), T0
   178	AESENC T0, B0
   179	MOVOU (16*9)(KS), T0
   180	AESENC T0, B0
   181	MOVOU (16*10)(KS), T0
      	// NR < 12: round key 10 is the last one. The flags from this CMPQ are
      	// also consumed by the JE below. The instructions in between do not
      	// modify them.
   182	CMPQ NR, $12
   183	JB initEncLast
   184	AESENC T0, B0
   185	MOVOU (16*11)(KS), T0
   186	AESENC T0, B0
   187	MOVOU (16*12)(KS), T0
      	// NR == 12: round key 12 is the last one. Otherwise continue to key 14.
   188	JE initEncLast
   189	AESENC T0, B0
   190	MOVOU (16*13)(KS), T0
   191	AESENC T0, B0
   192	MOVOU (16*14)(KS), T0
   193initEncLast:
   194	AESENCLAST T0, B0
   195
   196	PSHUFB BSWAP, B0
   197	// H * 2
      	// T0 = all ones in every dword if the top bit of B0 is set, then masked
      	// with POLY, so POLY is XORed in only when the shift carries out.
      	// T1 carries the top bit of each dword into the dword above it, turning
      	// the per-dword PSLLL into a full 128-bit shift left by one.
   198	PSHUFD $0xff, B0, T0
   199	MOVOU B0, T1
   200	PSRAL $31, T0
   201	PAND POLY, T0
   202	PSRLL $31, T1
   203	PSLLDQ $4, T1
   204	PSLLL $1, B0
   205	PXOR T0, B0
   206	PXOR T1, B0
   207	// Karatsuba pre-computations
      	// Entry at 16*14 is the value itself. Entry at 16*15 is the XOR of its
      	// two quadword halves.
   208	MOVOU B0, (16*14)(dst)
   209	PSHUFD $78, B0, B1
   210	PXOR B0, B1
   211	MOVOU B1, (16*15)(dst)
   212
      	// B2/B3 track the most recent power and its folded form. B0/B1 stay
      	// fixed as the multiplier.
   213	MOVOU B0, B2
   214	MOVOU B1, B3
   215	// Now prepare powers of H and pre-computations for them
   216	MOVQ $7, AX
   217
   218initLoop:
      		// Karatsuba multiply of B2 by B0: low, high and middle products.
   219		MOVOU B2, T0
   220		MOVOU B2, T1
   221		MOVOU B3, T2
   222		PCLMULQDQ $0x00, B0, T0
   223		PCLMULQDQ $0x11, B0, T1
   224		PCLMULQDQ $0x00, B1, T2
   225
      		// Fold the middle term into the low (T0) and high (T1) halves.
   226		PXOR T0, T2
   227		PXOR T1, T2
   228		MOVOU T2, B4
   229		PSLLDQ $8, B4
   230		PSRLDQ $8, T2
   231		PXOR B4, T0
   232		PXOR T2, T1
   233
      		// Two reduction steps. The reduced product ends up in B2.
   234		MOVOU POLY, B2
   235		PCLMULQDQ $0x01, T0, B2
   236		PSHUFD $78, T0, T0
   237		PXOR B2, T0
   238		MOVOU POLY, B2
   239		PCLMULQDQ $0x01, T0, B2
   240		PSHUFD $78, T0, T0
   241		PXOR T0, B2
   242		PXOR T1, B2
   243
      		// Store the new power and its folded form one entry below the
      		// previous one.
   244		MOVOU B2, (16*12)(dst)
   245		PSHUFD $78, B2, B3
   246		PXOR B2, B3
   247		MOVOU B3, (16*13)(dst)
   248
      		// LEAQ does not modify flags, so JNE still tests the DECQ result.
   249		DECQ AX
   250		LEAQ (-16*2)(dst), dst
   251	JNE initLoop
   252
   253	RET
   254#undef NR
   255#undef KS
   256#undef dst
   257
   258// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
      //
      // gcmAesData hashes data and stores the 16-byte result in T. The
      // accumulator starts at zero. The previous contents of T are not read.
      // While at least 128 bytes remain they are consumed eight blocks at a time
      // using all productTable entries. What is left is consumed one block at a
      // time using the entries at 16*14 and 16*15. A trailing partial block is
      // zero-padded. A 13-byte input takes a dedicated load path (dataTLS).
   259TEXT ·gcmAesData(SB),NOSPLIT,$0
   260#define pTbl DI
   261#define aut SI
   262#define tPtr CX
   263#define autLen DX
   264
      // reduceRound(a) performs one reduction step on a and clobbers T0. Every
      // call site in this file applies it twice. The macro is not undefined at
      // the end of this routine and is reused by gcmAesEnc and gcmAesDec.
   265#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
      // mulRoundAAD(X, i) multiplies block X by productTable entry i (offsets
      // 32*i and 32*i+16) and accumulates the three Karatsuba products into
      // ACC0, ACC1 and ACCM. It clobbers T1, T2 and X.
   266#define mulRoundAAD(X ,i) \
   267	MOVOU (16*(i*2))(pTbl), T1;\
   268	MOVOU T1, T2;\
   269	PCLMULQDQ $0x00, X, T1;\
   270	PXOR T1, ACC0;\
   271	PCLMULQDQ $0x11, X, T2;\
   272	PXOR T2, ACC1;\
   273	PSHUFD $78, X, T1;\
   274	PXOR T1, X;\
   275	MOVOU (16*(i*2+1))(pTbl), T1;\
   276	PCLMULQDQ $0x00, X, T1;\
   277	PXOR T1, ACCM
   278
   279	MOVQ productTable+0(FP), pTbl
   280	MOVQ data_base+8(FP), aut
   281	MOVQ data_len+16(FP), autLen
   282	MOVQ T+32(FP), tPtr
   283
      	// Start from a zero accumulator.
   284	PXOR ACC0, ACC0
   285	MOVOU bswapMask<>(SB), BSWAP
   286	MOVOU gcmPoly<>(SB), POLY
   287
      	// Empty input: store the zero accumulator and return.
   288	TESTQ autLen, autLen
   289	JEQ dataBail
   290
   291	CMPQ autLen, $13	// optimize the TLS case
   292	JE dataTLS
   293	CMPQ autLen, $128
   294	JB startSinglesLoop
   295	JMP dataOctaLoop
   296
   297dataTLS:
      	// Load exactly 13 bytes into a zeroed B0 with three inserts (8 + 4 + 1),
      	// then join the single-block multiply with nothing left to process.
   298	MOVOU (16*14)(pTbl), T1
   299	MOVOU (16*15)(pTbl), T2
   300	PXOR B0, B0
   301	MOVQ (aut), B0
   302	PINSRD $2, 8(aut), B0
   303	PINSRB $12, 12(aut), B0
   304	XORQ autLen, autLen
   305	JMP dataMul
   306
   307dataOctaLoop:
   308		CMPQ autLen, $128
   309		JB startSinglesLoop
   310		SUBQ $128, autLen
   311
      		// Load and byte-swap eight blocks.
   312		MOVOU (16*0)(aut), X0
   313		MOVOU (16*1)(aut), X1
   314		MOVOU (16*2)(aut), X2
   315		MOVOU (16*3)(aut), X3
   316		MOVOU (16*4)(aut), X4
   317		MOVOU (16*5)(aut), X5
   318		MOVOU (16*6)(aut), X6
   319		MOVOU (16*7)(aut), X7
   320		LEAQ (16*8)(aut), aut
   321		PSHUFB BSWAP, X0
   322		PSHUFB BSWAP, X1
   323		PSHUFB BSWAP, X2
   324		PSHUFB BSWAP, X3
   325		PSHUFB BSWAP, X4
   326		PSHUFB BSWAP, X5
   327		PSHUFB BSWAP, X6
   328		PSHUFB BSWAP, X7
      		// The running hash is folded into the first block only.
   329		PXOR ACC0, X0
   330
      		// First block times table entry 0. This initialises the three
      		// accumulators that the following rounds add to.
   331		MOVOU (16*0)(pTbl), ACC0
   332		MOVOU (16*1)(pTbl), ACCM
   333		MOVOU ACC0, ACC1
   334		PSHUFD $78, X0, T1
   335		PXOR X0, T1
   336		PCLMULQDQ $0x00, X0, ACC0
   337		PCLMULQDQ $0x11, X0, ACC1
   338		PCLMULQDQ $0x00, T1, ACCM
   339
   340		mulRoundAAD(X1, 1)
   341		mulRoundAAD(X2, 2)
   342		mulRoundAAD(X3, 3)
   343		mulRoundAAD(X4, 4)
   344		mulRoundAAD(X5, 5)
   345		mulRoundAAD(X6, 6)
   346		mulRoundAAD(X7, 7)
   347
      		// Fold the middle term into the low and high halves, then reduce
      		// once for all eight blocks.
   348		PXOR ACC0, ACCM
   349		PXOR ACC1, ACCM
   350		MOVOU ACCM, T0
   351		PSRLDQ $8, ACCM
   352		PSLLDQ $8, T0
   353		PXOR ACCM, ACC1
   354		PXOR T0, ACC0
   355		reduceRound(ACC0)
   356		reduceRound(ACC0)
   357		PXOR ACC1, ACC0
   358	JMP dataOctaLoop
   359
   360startSinglesLoop:
      	// T1/T2 keep the table entry at 16*14 and its folded form at 16*15 for
      	// the single-block multiply.
   361	MOVOU (16*14)(pTbl), T1
   362	MOVOU (16*15)(pTbl), T2
   363
   364dataSinglesLoop:
   365
   366		CMPQ autLen, $16
   367		JB dataEnd
   368		SUBQ $16, autLen
   369
   370		MOVOU (aut), B0
      // dataMul is also entered from dataTLS and from the tail code below, in
      // both cases with autLen already zero, so the next loop test exits.
   371dataMul:
   372		PSHUFB BSWAP, B0
   373		PXOR ACC0, B0
   374
   375		MOVOU T1, ACC0
   376		MOVOU T2, ACCM
   377		MOVOU T1, ACC1
   378
   379		PSHUFD $78, B0, T0
   380		PXOR B0, T0
   381		PCLMULQDQ $0x00, B0, ACC0
   382		PCLMULQDQ $0x11, B0, ACC1
   383		PCLMULQDQ $0x00, T0, ACCM
   384
   385		PXOR ACC0, ACCM
   386		PXOR ACC1, ACCM
   387		MOVOU ACCM, T0
   388		PSRLDQ $8, ACCM
   389		PSLLDQ $8, T0
   390		PXOR ACCM, ACC1
   391		PXOR T0, ACC0
   392
      		// Two reduction steps, written out instead of using reduceRound.
   393		MOVOU POLY, T0
   394		PCLMULQDQ $0x01, ACC0, T0
   395		PSHUFD $78, ACC0, ACC0
   396		PXOR T0, ACC0
   397
   398		MOVOU POLY, T0
   399		PCLMULQDQ $0x01, ACC0, T0
   400		PSHUFD $78, ACC0, ACC0
   401		PXOR T0, ACC0
   402		PXOR ACC1, ACC0
   403
   404		LEAQ 16(aut), aut
   405
   406	JMP dataSinglesLoop
   407
   408dataEnd:
   409
   410	TESTQ autLen, autLen
   411	JEQ dataBail
   412
      	// Fewer than 16 bytes remain. Point aut at the last byte and build the
      	// block back to front so that no byte past the end of data is read.
   413	PXOR B0, B0
   414	LEAQ -1(aut)(autLen*1), aut
   415
   416dataLoadLoop:
   417
      		// Shift what has been collected up by one byte and insert the next
      		// lower-addressed byte at position 0. Unfilled bytes stay zero.
   418		PSLLDQ $1, B0
   419		PINSRB $0, (aut), B0
   420
   421		LEAQ -1(aut), aut
   422		DECQ autLen
   423		JNE dataLoadLoop
   424
   425	JMP dataMul
   426
   427dataBail:
   428	MOVOU ACC0, (tPtr)
   429	RET
   430#undef pTbl
   431#undef aut
   432#undef tPtr
   433#undef autLen
   434
   435// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      //
      // gcmAesEnc encrypts src into dst in counter mode and folds the produced
      // ciphertext into the hash held in T (read at entry, written at exit).
      // While at least 128 bytes remain, eight blocks are processed together and
      // the hashing of the previous eight ciphertext blocks is interleaved with
      // the AES rounds of the current eight. The rest is handled one block at a
      // time, and a final partial block is masked with andMask.
      //
      // Stack frame (256 bytes):
      //   0..127    byte-swapped ciphertext of the last eight blocks, waiting
      //             to be hashed (slot 0 also has the running hash XORed in)
      //   128..255  eight counter blocks, already XORed with round key 0
      //
      // The counter is incremented before its first use. Nothing is stored
      // through ctr, so the caller's counter block is left unchanged.
   436TEXT ·gcmAesEnc(SB),0,$256-96
      // These register aliases are not undefined at the end of this routine and
      // are reused by gcmAesDec.
   437#define pTbl DI
   438#define ctx DX
   439#define ctrPtr CX
   440#define ptx SI
   441#define ks AX
   442#define tPtr R8
   443#define ptxLen R9
   444#define aluCTR R10
   445#define aluTMP R11
   446#define aluK R12
   447#define NR R13
   448
      // increment(i) adds one to the 32-bit counter in aluCTR and stores it, XORed
      // with aluK and byte-swapped, into the last dword of counter slot i.
   449#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
      // aesRnd(k) applies one AES round with the key in register k to B0-B7.
   450#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
      // aesRound(i) loads round key i into T0 and applies it to B0-B7.
   451#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
      // aesRndLast(k) applies the final AES round with the key in k to B0-B7.
   452#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
      // combinedRound(i) applies AES round i to B0-B7 and, interleaved with it,
      // multiplies the saved ciphertext block in stack slot i by productTable
      // entry i, accumulating into ACC0, ACC1 and ACCM. It clobbers T0, T1, T2.
   453#define combinedRound(i) \
   454	MOVOU (16*i)(ks), T0;\
   455	AESENC T0, B0;\
   456	AESENC T0, B1;\
   457	AESENC T0, B2;\
   458	AESENC T0, B3;\
   459	 MOVOU (16*(i*2))(pTbl), T1;\
   460	 MOVOU T1, T2;\
   461	AESENC T0, B4;\
   462	AESENC T0, B5;\
   463	AESENC T0, B6;\
   464	AESENC T0, B7;\
   465	 MOVOU (16*i)(SP), T0;\
   466	 PCLMULQDQ $0x00, T0, T1;\
   467	 PXOR T1, ACC0;\
   468	 PSHUFD $78, T0, T1;\
   469	 PCLMULQDQ $0x11, T0, T2;\
   470	 PXOR T1, T0;\
   471	 PXOR T2, ACC1;\
   472	 MOVOU (16*(i*2+1))(pTbl), T2;\
   473	 PCLMULQDQ $0x00, T2, T0;\
   474	 PXOR T0, ACCM
      // mulRound(i) is the hashing half of combinedRound(i) on its own.
   475#define mulRound(i) \
   476	MOVOU (16*i)(SP), T0;\
   477	MOVOU (16*(i*2))(pTbl), T1;\
   478	MOVOU T1, T2;\
   479	PCLMULQDQ $0x00, T0, T1;\
   480	PXOR T1, ACC0;\
   481	PCLMULQDQ $0x11, T0, T2;\
   482	PXOR T2, ACC1;\
   483	PSHUFD $78, T0, T1;\
   484	PXOR T1, T0;\
   485	MOVOU (16*(i*2+1))(pTbl), T1;\
   486	PCLMULQDQ $0x00, T0, T1;\
   487	PXOR T1, ACCM
   488
   489	MOVQ productTable+0(FP), pTbl
   490	MOVQ dst+8(FP), ctx
   491	MOVQ src_base+32(FP), ptx
   492	MOVQ src_len+40(FP), ptxLen
   493	MOVQ ctr+56(FP), ctrPtr
   494	MOVQ T+64(FP), tPtr
   495	MOVQ ks_base+72(FP), ks
   496	MOVQ ks_len+80(FP), NR
   497
      	// NR = number of AES rounds (ks_len / 4 round keys, minus one).
   498	SHRQ $2, NR
   499	DECQ NR
   500
   501	MOVOU bswapMask<>(SB), BSWAP
   502	MOVOU gcmPoly<>(SB), POLY
   503
      	// ACC0 = running hash from T. aluCTR and aluK hold the byte-swapped last
      	// dwords of the counter block and of round key 0.
   504	MOVOU (tPtr), ACC0
   505	PXOR ACC1, ACC1
   506	PXOR ACCM, ACCM
   507	MOVOU (ctrPtr), B0
   508	MOVL (3*4)(ctrPtr), aluCTR
   509	MOVOU (ks), T0
   510	MOVL (3*4)(ks), aluK
   511	BSWAPL aluCTR
   512	BSWAPL aluK
   513
      	// T0 = counter block XOR round key 0. It is the template for every
      	// counter slot, and increment() then overwrites only the last dword.
   514	PXOR B0, T0
   515	MOVOU T0, (8*16 + 0*16)(SP)
   516	increment(0)
   517
   518	CMPQ ptxLen, $128
   519	JB gcmAesEncSingles
   520	SUBQ $128, ptxLen
   521
   522	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   523	MOVOU T0, (8*16 + 1*16)(SP)
   524	increment(1)
   525	MOVOU T0, (8*16 + 2*16)(SP)
   526	increment(2)
   527	MOVOU T0, (8*16 + 3*16)(SP)
   528	increment(3)
   529	MOVOU T0, (8*16 + 4*16)(SP)
   530	increment(4)
   531	MOVOU T0, (8*16 + 5*16)(SP)
   532	increment(5)
   533	MOVOU T0, (8*16 + 6*16)(SP)
   534	increment(6)
   535	MOVOU T0, (8*16 + 7*16)(SP)
   536	increment(7)
   537
   538	MOVOU (8*16 + 0*16)(SP), B0
   539	MOVOU (8*16 + 1*16)(SP), B1
   540	MOVOU (8*16 + 2*16)(SP), B2
   541	MOVOU (8*16 + 3*16)(SP), B3
   542	MOVOU (8*16 + 4*16)(SP), B4
   543	MOVOU (8*16 + 5*16)(SP), B5
   544	MOVOU (8*16 + 6*16)(SP), B6
   545	MOVOU (8*16 + 7*16)(SP), B7
   546
      	// First group of eight: there is no earlier ciphertext to hash yet, so
      	// only AES rounds run here. The increments prepare the next eight
      	// counters in the slots that were just loaded.
   547	aesRound(1)
   548	increment(0)
   549	aesRound(2)
   550	increment(1)
   551	aesRound(3)
   552	increment(2)
   553	aesRound(4)
   554	increment(3)
   555	aesRound(5)
   556	increment(4)
   557	aesRound(6)
   558	increment(5)
   559	aesRound(7)
   560	increment(6)
   561	aesRound(8)
   562	increment(7)
   563	aesRound(9)
   564	MOVOU (16*10)(ks), T0
      	// NR < 12: key 10 is the last. NR == 12 (tested by the JE below using
      	// the flags of this CMPQ): key 12 is the last. Otherwise key 14.
   565	CMPQ NR, $12
   566	JB encLast1
   567	aesRnd(T0)
   568	aesRound(11)
   569	MOVOU (16*12)(ks), T0
   570	JE encLast1
   571	aesRnd(T0)
   572	aesRound(13)
   573	MOVOU (16*14)(ks), T0
   574encLast1:
   575	aesRndLast(T0)
   576
      	// XOR the keystream with eight plaintext blocks.
   577	MOVOU (16*0)(ptx), T0
   578	PXOR T0, B0
   579	MOVOU (16*1)(ptx), T0
   580	PXOR T0, B1
   581	MOVOU (16*2)(ptx), T0
   582	PXOR T0, B2
   583	MOVOU (16*3)(ptx), T0
   584	PXOR T0, B3
   585	MOVOU (16*4)(ptx), T0
   586	PXOR T0, B4
   587	MOVOU (16*5)(ptx), T0
   588	PXOR T0, B5
   589	MOVOU (16*6)(ptx), T0
   590	PXOR T0, B6
   591	MOVOU (16*7)(ptx), T0
   592	PXOR T0, B7
   593
      	// Write the ciphertext, then byte-swap each block for hashing. The
      	// running hash is folded into the first block only.
   594	MOVOU B0, (16*0)(ctx)
   595	PSHUFB BSWAP, B0
   596	PXOR ACC0, B0
   597	MOVOU B1, (16*1)(ctx)
   598	PSHUFB BSWAP, B1
   599	MOVOU B2, (16*2)(ctx)
   600	PSHUFB BSWAP, B2
   601	MOVOU B3, (16*3)(ctx)
   602	PSHUFB BSWAP, B3
   603	MOVOU B4, (16*4)(ctx)
   604	PSHUFB BSWAP, B4
   605	MOVOU B5, (16*5)(ctx)
   606	PSHUFB BSWAP, B5
   607	MOVOU B6, (16*6)(ctx)
   608	PSHUFB BSWAP, B6
   609	MOVOU B7, (16*7)(ctx)
   610	PSHUFB BSWAP, B7
   611
      	// Save the eight blocks in stack slots 0..7. They are hashed during the
      	// next group or at gcmAesEncOctetsEnd.
   612	MOVOU B0, (16*0)(SP)
   613	MOVOU B1, (16*1)(SP)
   614	MOVOU B2, (16*2)(SP)
   615	MOVOU B3, (16*3)(SP)
   616	MOVOU B4, (16*4)(SP)
   617	MOVOU B5, (16*5)(SP)
   618	MOVOU B6, (16*6)(SP)
   619	MOVOU B7, (16*7)(SP)
   620
   621	LEAQ 128(ptx), ptx
   622	LEAQ 128(ctx), ctx
   623
   624gcmAesEncOctetsLoop:
   625
   626		CMPQ ptxLen, $128
   627		JB gcmAesEncOctetsEnd
   628		SUBQ $128, ptxLen
   629
   630		MOVOU (8*16 + 0*16)(SP), B0
   631		MOVOU (8*16 + 1*16)(SP), B1
   632		MOVOU (8*16 + 2*16)(SP), B2
   633		MOVOU (8*16 + 3*16)(SP), B3
   634		MOVOU (8*16 + 4*16)(SP), B4
   635		MOVOU (8*16 + 5*16)(SP), B5
   636		MOVOU (8*16 + 6*16)(SP), B6
   637		MOVOU (8*16 + 7*16)(SP), B7
   638
      		// Start hashing the previous group: saved block 0 times table
      		// entry 0 initialises the three accumulators.
   639		MOVOU (16*0)(SP), T0
   640		PSHUFD $78, T0, T1
   641		PXOR T0, T1
   642
   643		MOVOU (16*0)(pTbl), ACC0
   644		MOVOU (16*1)(pTbl), ACCM
   645		MOVOU ACC0, ACC1
   646
   647		PCLMULQDQ $0x00, T1, ACCM
   648		PCLMULQDQ $0x00, T0, ACC0
   649		PCLMULQDQ $0x11, T0, ACC1
   650
      		// AES rounds 1..7 for this group, hashing of saved blocks 1..7
      		// from the previous group, and counters for the next group.
   651		combinedRound(1)
   652		increment(0)
   653		combinedRound(2)
   654		increment(1)
   655		combinedRound(3)
   656		increment(2)
   657		combinedRound(4)
   658		increment(3)
   659		combinedRound(5)
   660		increment(4)
   661		combinedRound(6)
   662		increment(5)
   663		combinedRound(7)
   664		increment(6)
   665
   666		aesRound(8)
   667		increment(7)
   668
      		// Fold the Karatsuba middle term into the low and high halves.
   669		PXOR ACC0, ACCM
   670		PXOR ACC1, ACCM
   671		MOVOU ACCM, T0
   672		PSRLDQ $8, ACCM
   673		PSLLDQ $8, T0
   674		PXOR ACCM, ACC1
   675		PXOR T0, ACC0
   676
      		// The two reduction steps are separated by AES round 9.
   677		reduceRound(ACC0)
   678		aesRound(9)
   679
   680		reduceRound(ACC0)
   681		PXOR ACC1, ACC0
   682
   683		MOVOU (16*10)(ks), T0
   684		CMPQ NR, $12
   685		JB encLast2
   686		aesRnd(T0)
   687		aesRound(11)
   688		MOVOU (16*12)(ks), T0
   689		JE encLast2
   690		aesRnd(T0)
   691		aesRound(13)
   692		MOVOU (16*14)(ks), T0
   693encLast2:
   694		aesRndLast(T0)
   695
   696		MOVOU (16*0)(ptx), T0
   697		PXOR T0, B0
   698		MOVOU (16*1)(ptx), T0
   699		PXOR T0, B1
   700		MOVOU (16*2)(ptx), T0
   701		PXOR T0, B2
   702		MOVOU (16*3)(ptx), T0
   703		PXOR T0, B3
   704		MOVOU (16*4)(ptx), T0
   705		PXOR T0, B4
   706		MOVOU (16*5)(ptx), T0
   707		PXOR T0, B5
   708		MOVOU (16*6)(ptx), T0
   709		PXOR T0, B6
   710		MOVOU (16*7)(ptx), T0
   711		PXOR T0, B7
   712
   713		MOVOU B0, (16*0)(ctx)
   714		PSHUFB BSWAP, B0
   715		PXOR ACC0, B0
   716		MOVOU B1, (16*1)(ctx)
   717		PSHUFB BSWAP, B1
   718		MOVOU B2, (16*2)(ctx)
   719		PSHUFB BSWAP, B2
   720		MOVOU B3, (16*3)(ctx)
   721		PSHUFB BSWAP, B3
   722		MOVOU B4, (16*4)(ctx)
   723		PSHUFB BSWAP, B4
   724		MOVOU B5, (16*5)(ctx)
   725		PSHUFB BSWAP, B5
   726		MOVOU B6, (16*6)(ctx)
   727		PSHUFB BSWAP, B6
   728		MOVOU B7, (16*7)(ctx)
   729		PSHUFB BSWAP, B7
   730
   731		MOVOU B0, (16*0)(SP)
   732		MOVOU B1, (16*1)(SP)
   733		MOVOU B2, (16*2)(SP)
   734		MOVOU B3, (16*3)(SP)
   735		MOVOU B4, (16*4)(SP)
   736		MOVOU B5, (16*5)(SP)
   737		MOVOU B6, (16*6)(SP)
   738		MOVOU B7, (16*7)(SP)
   739
   740		LEAQ 128(ptx), ptx
   741		LEAQ 128(ctx), ctx
   742
   743		JMP gcmAesEncOctetsLoop
   744
   745gcmAesEncOctetsEnd:
   746
      	// Hash the last group of eight saved ciphertext blocks.
   747	MOVOU (16*0)(SP), T0
   748	MOVOU (16*0)(pTbl), ACC0
   749	MOVOU (16*1)(pTbl), ACCM
   750	MOVOU ACC0, ACC1
   751	PSHUFD $78, T0, T1
   752	PXOR T0, T1
   753	PCLMULQDQ $0x00, T0, ACC0
   754	PCLMULQDQ $0x11, T0, ACC1
   755	PCLMULQDQ $0x00, T1, ACCM
   756
   757	mulRound(1)
   758	mulRound(2)
   759	mulRound(3)
   760	mulRound(4)
   761	mulRound(5)
   762	mulRound(6)
   763	mulRound(7)
   764
   765	PXOR ACC0, ACCM
   766	PXOR ACC1, ACCM
   767	MOVOU ACCM, T0
   768	PSRLDQ $8, ACCM
   769	PSLLDQ $8, T0
   770	PXOR ACCM, ACC1
   771	PXOR T0, ACC0
   772
   773	reduceRound(ACC0)
   774	reduceRound(ACC0)
   775	PXOR ACC1, ACC0
   776
   777	TESTQ ptxLen, ptxLen
   778	JE gcmAesEncDone
   779
      	// Eight counters were prepared ahead, but the single-block code only
      	// consumes slot 0. Rewind aluCTR by 7 so it matches the value in slot 0.
   780	SUBQ $7, aluCTR
   781
   782gcmAesEncSingles:
   783
      	// From here on B1-B7 hold round keys 1..7 and T2 holds the table entry
      	// at 16*14. Only B0 carries data.
   784	MOVOU (16*1)(ks), B1
   785	MOVOU (16*2)(ks), B2
   786	MOVOU (16*3)(ks), B3
   787	MOVOU (16*4)(ks), B4
   788	MOVOU (16*5)(ks), B5
   789	MOVOU (16*6)(ks), B6
   790	MOVOU (16*7)(ks), B7
   791
   792	MOVOU (16*14)(pTbl), T2
   793
   794gcmAesEncSinglesLoop:
   795
   796		CMPQ ptxLen, $16
   797		JB gcmAesEncTail
   798		SUBQ $16, ptxLen
   799
      		// Take the counter block from slot 0 and prepare the next one.
   800		MOVOU (8*16 + 0*16)(SP), B0
   801		increment(0)
   802
   803		AESENC B1, B0
   804		AESENC B2, B0
   805		AESENC B3, B0
   806		AESENC B4, B0
   807		AESENC B5, B0
   808		AESENC B6, B0
   809		AESENC B7, B0
   810		MOVOU (16*8)(ks), T0
   811		AESENC T0, B0
   812		MOVOU (16*9)(ks), T0
   813		AESENC T0, B0
   814		MOVOU (16*10)(ks), T0
   815		CMPQ NR, $12
   816		JB encLast3
   817		AESENC T0, B0
   818		MOVOU (16*11)(ks), T0
   819		AESENC T0, B0
   820		MOVOU (16*12)(ks), T0
   821		JE encLast3
   822		AESENC T0, B0
   823		MOVOU (16*13)(ks), T0
   824		AESENC T0, B0
   825		MOVOU (16*14)(ks), T0
   826encLast3:
   827		AESENCLAST T0, B0
   828
   829		MOVOU (ptx), T0
   830		PXOR T0, B0
   831		MOVOU B0, (ctx)
   832
      		// Hash the ciphertext block just written.
   833		PSHUFB BSWAP, B0
   834		PXOR ACC0, B0
   835
   836		MOVOU T2, ACC0
   837		MOVOU T2, ACC1
   838		MOVOU (16*15)(pTbl), ACCM
   839
   840		PSHUFD $78, B0, T0
   841		PXOR B0, T0
   842		PCLMULQDQ $0x00, B0, ACC0
   843		PCLMULQDQ $0x11, B0, ACC1
   844		PCLMULQDQ $0x00, T0, ACCM
   845
   846		PXOR ACC0, ACCM
   847		PXOR ACC1, ACCM
   848		MOVOU ACCM, T0
   849		PSRLDQ $8, ACCM
   850		PSLLDQ $8, T0
   851		PXOR ACCM, ACC1
   852		PXOR T0, ACC0
   853
   854		reduceRound(ACC0)
   855		reduceRound(ACC0)
   856		PXOR ACC1, ACC0
   857
   858		LEAQ (16*1)(ptx), ptx
   859		LEAQ (16*1)(ctx), ctx
   860
   861	JMP gcmAesEncSinglesLoop
   862
   863gcmAesEncTail:
   864	TESTQ ptxLen, ptxLen
   865	JE gcmAesEncDone
   866
      	// 1..15 bytes remain. Produce one more keystream block from slot 0.
   867	MOVOU (8*16 + 0*16)(SP), B0
   868	AESENC B1, B0
   869	AESENC B2, B0
   870	AESENC B3, B0
   871	AESENC B4, B0
   872	AESENC B5, B0
   873	AESENC B6, B0
   874	AESENC B7, B0
   875	MOVOU (16*8)(ks), T0
   876	AESENC T0, B0
   877	MOVOU (16*9)(ks), T0
   878	AESENC T0, B0
   879	MOVOU (16*10)(ks), T0
   880	CMPQ NR, $12
   881	JB encLast4
   882	AESENC T0, B0
   883	MOVOU (16*11)(ks), T0
   884	AESENC T0, B0
   885	MOVOU (16*12)(ks), T0
   886	JE encLast4
   887	AESENC T0, B0
   888	MOVOU (16*13)(ks), T0
   889	AESENC T0, B0
   890	MOVOU (16*14)(ks), T0
   891encLast4:
   892	AESENCLAST T0, B0
      	// Keep the keystream in T0. B0 is reused to collect the plaintext.
   893	MOVOU B0, T0
   894
      	// ptx = address of the last leftover plaintext byte.
   895	LEAQ -1(ptx)(ptxLen*1), ptx
   896
      	// T1 = andMask entry that keeps the low ptxLen bytes. The counter is no
      	// longer needed, so aluCTR is reused as the table base address.
   897	MOVQ ptxLen, aluTMP
   898	SHLQ $4, aluTMP
   899
   900	LEAQ andMask<>(SB), aluCTR
   901	MOVOU -16(aluCTR)(aluTMP*1), T1
   902
      	// Collect the leftover bytes back to front, one byte at a time, so that
      	// nothing past the end of src is read.
   903	PXOR B0, B0
   904ptxLoadLoop:
   905		PSLLDQ $1, B0
   906		PINSRB $0, (ptx), B0
   907		LEAQ -1(ptx), ptx
   908		DECQ ptxLen
   909	JNE ptxLoadLoop
   910
      	// Encrypt and clear the keystream bytes beyond the message, so the
      	// hashed block is zero-padded. A full 16 bytes are written to dst.
   911	PXOR T0, B0
   912	PAND T1, B0
   913	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   914
   915	PSHUFB BSWAP, B0
   916	PXOR ACC0, B0
   917
   918	MOVOU T2, ACC0
   919	MOVOU T2, ACC1
   920	MOVOU (16*15)(pTbl), ACCM
   921
   922	PSHUFD $78, B0, T0
   923	PXOR B0, T0
   924	PCLMULQDQ $0x00, B0, ACC0
   925	PCLMULQDQ $0x11, B0, ACC1
   926	PCLMULQDQ $0x00, T0, ACCM
   927
   928	PXOR ACC0, ACCM
   929	PXOR ACC1, ACCM
   930	MOVOU ACCM, T0
   931	PSRLDQ $8, ACCM
   932	PSLLDQ $8, T0
   933	PXOR ACCM, ACC1
   934	PXOR T0, ACC0
   935
   936	reduceRound(ACC0)
   937	reduceRound(ACC0)
   938	PXOR ACC1, ACC0
   939
   940gcmAesEncDone:
      	// Write the updated hash back to T.
   941	MOVOU ACC0, (tPtr)
   942	RET
   943#undef increment
   944
   945// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      //
      // gcmAesDec folds the ciphertext in src into the hash held in T (read at
      // entry, written at exit) and decrypts it into dst in counter mode.
      // While at least 128 bytes remain, eight blocks are hashed and decrypted
      // together. The rest is handled one block at a time, and a final partial
      // block is masked with andMask and stored byte by byte.
      //
      // The ciphertext is hashed straight from src, so the 128-byte frame holds
      // only the eight counter blocks (already XORed with round key 0).
      // The register aliases and the aesRnd, aesRound, aesRndLast and reduceRound
      // macros come from the routines above. Here ptx is loaded from dst and ctx
      // from src. Nothing is stored through ctr.
   946TEXT ·gcmAesDec(SB),0,$128-96
      // increment(i) is redefined for this frame layout: counter slot i starts
      // at offset i*16 of the stack frame.
   947#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
      // combinedDecRound(i) applies AES round i to B0-B7 and, interleaved with
      // it, loads ciphertext block i from ctx, byte-swaps it and multiplies it
      // by productTable entry i, accumulating into ACC0, ACC1 and ACCM.
      // It clobbers T0, T1 and T2.
   948#define combinedDecRound(i) \
   949	MOVOU (16*i)(ks), T0;\
   950	AESENC T0, B0;\
   951	AESENC T0, B1;\
   952	AESENC T0, B2;\
   953	AESENC T0, B3;\
   954	MOVOU (16*(i*2))(pTbl), T1;\
   955	MOVOU T1, T2;\
   956	AESENC T0, B4;\
   957	AESENC T0, B5;\
   958	AESENC T0, B6;\
   959	AESENC T0, B7;\
   960	MOVOU (16*i)(ctx), T0;\
   961	PSHUFB BSWAP, T0;\
   962	PCLMULQDQ $0x00, T0, T1;\
   963	PXOR T1, ACC0;\
   964	PSHUFD $78, T0, T1;\
   965	PCLMULQDQ $0x11, T0, T2;\
   966	PXOR T1, T0;\
   967	PXOR T2, ACC1;\
   968	MOVOU (16*(i*2+1))(pTbl), T2;\
   969	PCLMULQDQ $0x00, T2, T0;\
   970	PXOR T0, ACCM
   971
   972	MOVQ productTable+0(FP), pTbl
   973	MOVQ dst+8(FP), ptx
   974	MOVQ src_base+32(FP), ctx
   975	MOVQ src_len+40(FP), ptxLen
   976	MOVQ ctr+56(FP), ctrPtr
   977	MOVQ T+64(FP), tPtr
   978	MOVQ ks_base+72(FP), ks
   979	MOVQ ks_len+80(FP), NR
   980
      	// NR = number of AES rounds (ks_len / 4 round keys, minus one).
   981	SHRQ $2, NR
   982	DECQ NR
   983
   984	MOVOU bswapMask<>(SB), BSWAP
   985	MOVOU gcmPoly<>(SB), POLY
   986
      	// ACC0 = running hash from T. aluCTR and aluK hold the byte-swapped last
      	// dwords of the counter block and of round key 0.
   987	MOVOU (tPtr), ACC0
   988	PXOR ACC1, ACC1
   989	PXOR ACCM, ACCM
   990	MOVOU (ctrPtr), B0
   991	MOVL (3*4)(ctrPtr), aluCTR
   992	MOVOU (ks), T0
   993	MOVL (3*4)(ks), aluK
   994	BSWAPL aluCTR
   995	BSWAPL aluK
   996
      	// T0 = counter block XOR round key 0, the template for every counter
      	// slot. The counter is incremented before its first use.
   997	PXOR B0, T0
   998	MOVOU T0, (0*16)(SP)
   999	increment(0)
  1000
  1001	CMPQ ptxLen, $128
  1002	JB gcmAesDecSingles
  1003
      	// At least eight blocks: prepare counter slots 1..7 as well.
  1004	MOVOU T0, (1*16)(SP)
  1005	increment(1)
  1006	MOVOU T0, (2*16)(SP)
  1007	increment(2)
  1008	MOVOU T0, (3*16)(SP)
  1009	increment(3)
  1010	MOVOU T0, (4*16)(SP)
  1011	increment(4)
  1012	MOVOU T0, (5*16)(SP)
  1013	increment(5)
  1014	MOVOU T0, (6*16)(SP)
  1015	increment(6)
  1016	MOVOU T0, (7*16)(SP)
  1017	increment(7)
  1018
  1019gcmAesDecOctetsLoop:
  1020
  1021		CMPQ ptxLen, $128
  1022		JB gcmAesDecEndOctets
  1023		SUBQ $128, ptxLen
  1024
  1025		MOVOU (0*16)(SP), B0
  1026		MOVOU (1*16)(SP), B1
  1027		MOVOU (2*16)(SP), B2
  1028		MOVOU (3*16)(SP), B3
  1029		MOVOU (4*16)(SP), B4
  1030		MOVOU (5*16)(SP), B5
  1031		MOVOU (6*16)(SP), B6
  1032		MOVOU (7*16)(SP), B7
  1033
      		// Ciphertext block 0, with the running hash folded in, times table
      		// entry 0 initialises the three accumulators.
  1034		MOVOU (16*0)(ctx), T0
  1035		PSHUFB BSWAP, T0
  1036		PXOR ACC0, T0
  1037		PSHUFD $78, T0, T1
  1038		PXOR T0, T1
  1039
  1040		MOVOU (16*0)(pTbl), ACC0
  1041		MOVOU (16*1)(pTbl), ACCM
  1042		MOVOU ACC0, ACC1
  1043
  1044		PCLMULQDQ $0x00, T1, ACCM
  1045		PCLMULQDQ $0x00, T0, ACC0
  1046		PCLMULQDQ $0x11, T0, ACC1
  1047
      		// AES rounds 1..7, hashing of ciphertext blocks 1..7 of this same
      		// group, and counters for the next group.
  1048		combinedDecRound(1)
  1049		increment(0)
  1050		combinedDecRound(2)
  1051		increment(1)
  1052		combinedDecRound(3)
  1053		increment(2)
  1054		combinedDecRound(4)
  1055		increment(3)
  1056		combinedDecRound(5)
  1057		increment(4)
  1058		combinedDecRound(6)
  1059		increment(5)
  1060		combinedDecRound(7)
  1061		increment(6)
  1062
  1063		aesRound(8)
  1064		increment(7)
  1065
      		// Fold the Karatsuba middle term into the low and high halves.
  1066		PXOR ACC0, ACCM
  1067		PXOR ACC1, ACCM
  1068		MOVOU ACCM, T0
  1069		PSRLDQ $8, ACCM
  1070		PSLLDQ $8, T0
  1071		PXOR ACCM, ACC1
  1072		PXOR T0, ACC0
  1073
      		// The two reduction steps are separated by AES round 9.
  1074		reduceRound(ACC0)
  1075		aesRound(9)
  1076
  1077		reduceRound(ACC0)
  1078		PXOR ACC1, ACC0
  1079
  1080		MOVOU (16*10)(ks), T0
      		// NR < 12: key 10 is the last. NR == 12 (tested by the JE below
      		// using the flags of this CMPQ): key 12 is the last. Otherwise 14.
  1081		CMPQ NR, $12
  1082		JB decLast1
  1083		aesRnd(T0)
  1084		aesRound(11)
  1085		MOVOU (16*12)(ks), T0
  1086		JE decLast1
  1087		aesRnd(T0)
  1088		aesRound(13)
  1089		MOVOU (16*14)(ks), T0
  1090decLast1:
  1091		aesRndLast(T0)
  1092
      		// XOR the keystream with the ciphertext and store the plaintext.
  1093		MOVOU (16*0)(ctx), T0
  1094		PXOR T0, B0
  1095		MOVOU (16*1)(ctx), T0
  1096		PXOR T0, B1
  1097		MOVOU (16*2)(ctx), T0
  1098		PXOR T0, B2
  1099		MOVOU (16*3)(ctx), T0
  1100		PXOR T0, B3
  1101		MOVOU (16*4)(ctx), T0
  1102		PXOR T0, B4
  1103		MOVOU (16*5)(ctx), T0
  1104		PXOR T0, B5
  1105		MOVOU (16*6)(ctx), T0
  1106		PXOR T0, B6
  1107		MOVOU (16*7)(ctx), T0
  1108		PXOR T0, B7
  1109
  1110		MOVOU B0, (16*0)(ptx)
  1111		MOVOU B1, (16*1)(ptx)
  1112		MOVOU B2, (16*2)(ptx)
  1113		MOVOU B3, (16*3)(ptx)
  1114		MOVOU B4, (16*4)(ptx)
  1115		MOVOU B5, (16*5)(ptx)
  1116		MOVOU B6, (16*6)(ptx)
  1117		MOVOU B7, (16*7)(ptx)
  1118
  1119		LEAQ 128(ptx), ptx
  1120		LEAQ 128(ctx), ctx
  1121
  1122		JMP gcmAesDecOctetsLoop
  1123
  1124gcmAesDecEndOctets:
  1125
      	// Eight counters were prepared ahead, but the single-block code only
      	// consumes slot 0. Rewind aluCTR by 7 so it matches the value in slot 0.
  1126	SUBQ $7, aluCTR
  1127
  1128gcmAesDecSingles:
  1129
      	// From here on B1-B7 hold round keys 1..7 and T2 holds the table entry
      	// at 16*14. Only B0 carries data.
  1130	MOVOU (16*1)(ks), B1
  1131	MOVOU (16*2)(ks), B2
  1132	MOVOU (16*3)(ks), B3
  1133	MOVOU (16*4)(ks), B4
  1134	MOVOU (16*5)(ks), B5
  1135	MOVOU (16*6)(ks), B6
  1136	MOVOU (16*7)(ks), B7
  1137
  1138	MOVOU (16*14)(pTbl), T2
  1139
  1140gcmAesDecSinglesLoop:
  1141
  1142		CMPQ ptxLen, $16
  1143		JB gcmAesDecTail
  1144		SUBQ $16, ptxLen
  1145
      		// Keep the raw ciphertext in T1 for decryption. B0 is hashed first.
  1146		MOVOU (ctx), B0
  1147		MOVOU B0, T1
  1148		PSHUFB BSWAP, B0
  1149		PXOR ACC0, B0
  1150
  1151		MOVOU T2, ACC0
  1152		MOVOU T2, ACC1
  1153		MOVOU (16*15)(pTbl), ACCM
  1154
  1155		PCLMULQDQ $0x00, B0, ACC0
  1156		PCLMULQDQ $0x11, B0, ACC1
  1157		PSHUFD $78, B0, T0
  1158		PXOR B0, T0
  1159		PCLMULQDQ $0x00, T0, ACCM
  1160
  1161		PXOR ACC0, ACCM
  1162		PXOR ACC1, ACCM
  1163		MOVOU ACCM, T0
  1164		PSRLDQ $8, ACCM
  1165		PSLLDQ $8, T0
  1166		PXOR ACCM, ACC1
  1167		PXOR T0, ACC0
  1168
  1169		reduceRound(ACC0)
  1170		reduceRound(ACC0)
  1171		PXOR ACC1, ACC0
  1172
      		// Take the counter block from slot 0 and prepare the next one.
  1173		MOVOU (0*16)(SP), B0
  1174		increment(0)
  1175		AESENC B1, B0
  1176		AESENC B2, B0
  1177		AESENC B3, B0
  1178		AESENC B4, B0
  1179		AESENC B5, B0
  1180		AESENC B6, B0
  1181		AESENC B7, B0
  1182		MOVOU (16*8)(ks), T0
  1183		AESENC T0, B0
  1184		MOVOU (16*9)(ks), T0
  1185		AESENC T0, B0
  1186		MOVOU (16*10)(ks), T0
  1187		CMPQ NR, $12
  1188		JB decLast2
  1189		AESENC T0, B0
  1190		MOVOU (16*11)(ks), T0
  1191		AESENC T0, B0
  1192		MOVOU (16*12)(ks), T0
  1193		JE decLast2
  1194		AESENC T0, B0
  1195		MOVOU (16*13)(ks), T0
  1196		AESENC T0, B0
  1197		MOVOU (16*14)(ks), T0
  1198decLast2:
  1199		AESENCLAST T0, B0
  1200
      		// Plaintext = keystream XOR saved ciphertext.
  1201		PXOR T1, B0
  1202		MOVOU B0, (ptx)
  1203
  1204		LEAQ (16*1)(ptx), ptx
  1205		LEAQ (16*1)(ctx), ctx
  1206
  1207	JMP gcmAesDecSinglesLoop
  1208
  1209gcmAesDecTail:
  1210
  1211	TESTQ ptxLen, ptxLen
  1212	JE gcmAesDecDone
  1213
      	// 1..15 bytes remain. T1 = andMask entry that keeps the low ptxLen
      	// bytes. The counter register aluCTR is reused as the table base address.
  1214	MOVQ ptxLen, aluTMP
  1215	SHLQ $4, aluTMP
  1216	LEAQ andMask<>(SB), aluCTR
  1217	MOVOU -16(aluCTR)(aluTMP*1), T1
  1218
      	// A full 16 bytes are read from src. The mask then clears the bytes that
      	// lie beyond the message.
  1219	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1220	PAND T1, B0
  1221
      	// Keep the masked ciphertext in T1 for decryption and hash B0.
  1222	MOVOU B0, T1
  1223	PSHUFB BSWAP, B0
  1224	PXOR ACC0, B0
  1225
  1226	MOVOU (16*14)(pTbl), ACC0
  1227	MOVOU (16*15)(pTbl), ACCM
  1228	MOVOU ACC0, ACC1
  1229
  1230	PCLMULQDQ $0x00, B0, ACC0
  1231	PCLMULQDQ $0x11, B0, ACC1
  1232	PSHUFD $78, B0, T0
  1233	PXOR B0, T0
  1234	PCLMULQDQ $0x00, T0, ACCM
  1235
  1236	PXOR ACC0, ACCM
  1237	PXOR ACC1, ACCM
  1238	MOVOU ACCM, T0
  1239	PSRLDQ $8, ACCM
  1240	PSLLDQ $8, T0
  1241	PXOR ACCM, ACC1
  1242	PXOR T0, ACC0
  1243
  1244	reduceRound(ACC0)
  1245	reduceRound(ACC0)
  1246	PXOR ACC1, ACC0
  1247
      	// Produce the keystream block. The increment() here writes the stack
      	// slot using aluCTR, which now holds the andMask address. The slot is
      	// not read again before RET.
  1248	MOVOU (0*16)(SP), B0
  1249	increment(0)
  1250	AESENC B1, B0
  1251	AESENC B2, B0
  1252	AESENC B3, B0
  1253	AESENC B4, B0
  1254	AESENC B5, B0
  1255	AESENC B6, B0
  1256	AESENC B7, B0
  1257	MOVOU (16*8)(ks), T0
  1258	AESENC T0, B0
  1259	MOVOU (16*9)(ks), T0
  1260	AESENC T0, B0
  1261	MOVOU (16*10)(ks), T0
  1262	CMPQ NR, $12
  1263	JB decLast3
  1264	AESENC T0, B0
  1265	MOVOU (16*11)(ks), T0
  1266	AESENC T0, B0
  1267	MOVOU (16*12)(ks), T0
  1268	JE decLast3
  1269	AESENC T0, B0
  1270	MOVOU (16*13)(ks), T0
  1271	AESENC T0, B0
  1272	MOVOU (16*14)(ks), T0
  1273decLast3:
  1274	AESENCLAST T0, B0
  1275	PXOR T1, B0
  1276
      	// Store exactly ptxLen plaintext bytes, lowest byte first, so nothing
      	// past the end of the message is written to dst.
  1277ptxStoreLoop:
  1278		PEXTRB $0, B0, (ptx)
  1279		PSRLDQ $1, B0
  1280		LEAQ 1(ptx), ptx
  1281		DECQ ptxLen
  1282
  1283	JNE ptxStoreLoop
  1284
  1285gcmAesDecDone:
  1286
      	// Write the updated hash back to T.
  1287	MOVOU ACC0, (tPtr)
  1288	RET

View as plain text