...

Text file src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s

Documentation: vendor/golang.org/x/crypto/chacha20poly1305

     1// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
     2
     3//go:build gc && !purego
     4
     5#include "textflag.h"
     6
     7// func polyHashADInternal<>()
      //
      // polyHashADInternal absorbs the additional data (AD) into the Poly1305
      // accumulator. It takes no Go-level arguments; its operands are passed in
      // registers by the assembly callers in this file:
      //
      //   In:   CX      = address of the first AD byte
      //         R9      = AD length in bytes
      //         0(BP)   = low 64 bits of the clamped Poly1305 multiplier r (r0)
      //         8(BP)   = high 64 bits of the clamped Poly1305 multiplier r (r1)
      //   Out:  R10:R11:R12 = accumulator h (R10 lowest word, R12 highest),
      //         partially reduced modulo 2^130 - 5
      //   Uses: AX, DX, R8, R13, R14, R15 and the flags as scratch. On every
      //         path except the 13-byte fast path, CX and R9 are consumed
      //         (CX is moved, R9 is counted down).
      //
      // The accumulator is zeroed on entry. Each 16-byte block is added to h
      // together with a 2^128 bit, then h is multiplied by r and reduced. A final
      // partial block of 1..15 bytes is zero-extended to 16 bytes and absorbed the
      // same way. A length of exactly 13 bytes is handled by a dedicated
      // straight-line path.
      // NOTE(review): 13 is presumably the TLS record AAD length - confirm.
     8TEXT polyHashADInternal<>(SB), NOSPLIT, $0
     9	// Hack: Must declare #define macros inside of a function due to Avo constraints
      	// The ROL* macros below are not expanded anywhere in this routine; they
      	// are used by the functions that follow it in this file.
    10	// ROL rotates the uint32s in register R left by N bits, using temporary T.
    11	#define ROL(N, R, T) \
    12		MOVO R, T; \
    13		PSLLL $(N), T; \
    14		PSRLL $(32-(N)), R; \
    15		PXOR T, R
    16
    17	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
    18	#ifdef GOAMD64_v2
    19		#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
    20	#else
    21		#define ROL8(R, T) ROL(8, R, T)
    22	#endif
    23
    24	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
    25	#ifdef GOAMD64_v2
    26		#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
    27	#else
    28		#define ROL16(R, T) ROL(16, R, T)
    29	#endif
      	// h = 0
    30	XORQ  R10, R10
    31	XORQ  R11, R11
    32	XORQ  R12, R12
      	// Any length other than 13 goes through the generic block loop.
    33	CMPQ  R9, $0x0d
    34	JNE   hashADLoop
      	// 13-byte fast path: build the zero-padded block with two overlapping
      	// 8-byte loads, so exactly bytes 0..12 are read and nothing beyond them.
    35	MOVQ  (CX), R10  // bytes 0..7
    36	MOVQ  5(CX), R11  // bytes 5..12
    37	SHRQ  $0x18, R11  // drop bytes 5..7, leaving bytes 8..12 in the low 40 bits
    38	MOVQ  $0x00000001, R12  // the 2^128 bit added to every block
      	// (t3:t2:t1:t0) = h * r, with t0=R13, t1=R14, t2=R15, t3=R8
    39	MOVQ  (BP), AX  // AX = r0
    40	MOVQ  AX, R15  // keep r0 for the r0*h2 term
    41	MULQ  R10  // DX:AX = r0*h0
    42	MOVQ  AX, R13  // t0
    43	MOVQ  DX, R14  // t1 (partial)
    44	MOVQ  (BP), AX
    45	MULQ  R11  // DX:AX = r0*h1
    46	IMULQ R12, R15  // R15 = r0*h2
    47	ADDQ  AX, R14
    48	ADCQ  DX, R15  // t2 (partial)
    49	MOVQ  8(BP), AX  // AX = r1
    50	MOVQ  AX, R8  // keep r1 for the r1*h2 term
    51	MULQ  R10  // DX:AX = r1*h0
    52	ADDQ  AX, R14  // t1 complete
    53	ADCQ  $0x00, DX
    54	MOVQ  DX, R10  // h0 is no longer needed; park the high half (plus carry) in R10
    55	MOVQ  8(BP), AX
    56	MULQ  R11  // DX:AX = r1*h1
    57	ADDQ  AX, R15
    58	ADCQ  $0x00, DX
    59	IMULQ R12, R8  // R8 = r1*h2
    60	ADDQ  R10, R15  // t2 complete
    61	ADCQ  DX, R8  // t3
      	// Reduce: with c = (t3:t2) >> 2 (the part of the product at and above
      	// bit 130), h = (product mod 2^130) + 4*c + c.
    62	MOVQ  R13, R10  // h0 = t0
    63	MOVQ  R14, R11  // h1 = t1
    64	MOVQ  R15, R12
    65	ANDQ  $0x03, R12  // h2 = low two bits of t2
    66	MOVQ  R15, R13
    67	ANDQ  $-4, R13  // R13 = t2 with its low two bits cleared
    68	MOVQ  R8, R14  // (R14:R13) = 4*c
    69	SHRQ  $0x02, R8, R15  // double-register shift: R15 = low word of (t3:t2) >> 2
    70	SHRQ  $0x02, R8  // (R8:R15) = c
    71	ADDQ  R13, R10
    72	ADCQ  R14, R11
    73	ADCQ  $0x00, R12  // h += 4*c
    74	ADDQ  R15, R10
    75	ADCQ  R8, R11
    76	ADCQ  $0x00, R12  // h += c
    77	RET
    78
    79hashADLoop:
    80	// Hash in 16 byte chunks
    81	CMPQ  R9, $0x10
    82	JB    hashADTail  // fewer than 16 bytes left
    83	ADDQ  (CX), R10
    84	ADCQ  8(CX), R11
    85	ADCQ  $0x01, R12  // h += block + 2^128
    86	LEAQ  16(CX), CX  // advance the input pointer
    87	SUBQ  $0x10, R9  // 16 fewer bytes remaining
      	// (t3:t2:t1:t0) = h * r, with t0=R13, t1=R14, t2=R15, t3=R8
    88	MOVQ  (BP), AX  // AX = r0
    89	MOVQ  AX, R15  // keep r0 for the r0*h2 term
    90	MULQ  R10  // DX:AX = r0*h0
    91	MOVQ  AX, R13  // t0
    92	MOVQ  DX, R14  // t1 (partial)
    93	MOVQ  (BP), AX
    94	MULQ  R11  // DX:AX = r0*h1
    95	IMULQ R12, R15  // R15 = r0*h2
    96	ADDQ  AX, R14
    97	ADCQ  DX, R15  // t2 (partial)
    98	MOVQ  8(BP), AX  // AX = r1
    99	MOVQ  AX, R8  // keep r1 for the r1*h2 term
   100	MULQ  R10  // DX:AX = r1*h0
   101	ADDQ  AX, R14  // t1 complete
   102	ADCQ  $0x00, DX
   103	MOVQ  DX, R10  // h0 is no longer needed; park the high half (plus carry) in R10
   104	MOVQ  8(BP), AX
   105	MULQ  R11  // DX:AX = r1*h1
   106	ADDQ  AX, R15
   107	ADCQ  $0x00, DX
   108	IMULQ R12, R8  // R8 = r1*h2
   109	ADDQ  R10, R15  // t2 complete
   110	ADCQ  DX, R8  // t3
      	// Reduce: h = (product mod 2^130) + 4*c + c, c = (t3:t2) >> 2
   111	MOVQ  R13, R10  // h0 = t0
   112	MOVQ  R14, R11  // h1 = t1
   113	MOVQ  R15, R12
   114	ANDQ  $0x03, R12  // h2 = low two bits of t2
   115	MOVQ  R15, R13
   116	ANDQ  $-4, R13  // R13 = t2 with its low two bits cleared
   117	MOVQ  R8, R14  // (R14:R13) = 4*c
   118	SHRQ  $0x02, R8, R15  // double-register shift: R15 = low word of (t3:t2) >> 2
   119	SHRQ  $0x02, R8  // (R8:R15) = c
   120	ADDQ  R13, R10
   121	ADCQ  R14, R11
   122	ADCQ  $0x00, R12  // h += 4*c
   123	ADDQ  R15, R10
   124	ADCQ  R8, R11
   125	ADCQ  $0x00, R12  // h += c
   126	JMP   hashADLoop
   127
   128hashADTail:
   129	CMPQ R9, $0x00
   130	JE   hashADDone  // nothing left: the AD length was a multiple of 16
   131
   132	// Hash last < 16 byte tail
      	// The tail is gathered into R14:R13 one byte at a time so that no byte
      	// past the end of the AD is read.
   133	XORQ R13, R13
   134	XORQ R14, R14
   135	XORQ R15, R15  // only the low byte of R15 is written in the loop below
   136	ADDQ R9, CX  // CX = one past the last AD byte
   137
   138hashADTailLoop:
      	// Walk backwards from the last byte; each step shifts the 128-bit value
      	// left by one byte and inserts the next byte at the bottom, so the
      	// bytes end up in little-endian order with zero padding above them.
   139	SHLQ  $0x08, R13, R14  // double-register shift: R14 takes the top byte of R13
   140	SHLQ  $0x08, R13
   141	MOVB  -1(CX), R15
   142	XORQ  R15, R13  // low byte of R13 is zero after the shift, so this inserts the byte
   143	DECQ  CX
   144	DECQ  R9
   145	JNE   hashADTailLoop  // loop until the remaining count reaches zero
   146	ADDQ  R13, R10
   147	ADCQ  R14, R11
   148	ADCQ  $0x01, R12  // h += zero-padded tail + 2^128
      	// (t3:t2:t1:t0) = h * r, with t0=R13, t1=R14, t2=R15, t3=R8
   149	MOVQ  (BP), AX  // AX = r0
   150	MOVQ  AX, R15  // keep r0 for the r0*h2 term
   151	MULQ  R10  // DX:AX = r0*h0
   152	MOVQ  AX, R13  // t0
   153	MOVQ  DX, R14  // t1 (partial)
   154	MOVQ  (BP), AX
   155	MULQ  R11  // DX:AX = r0*h1
   156	IMULQ R12, R15  // R15 = r0*h2
   157	ADDQ  AX, R14
   158	ADCQ  DX, R15  // t2 (partial)
   159	MOVQ  8(BP), AX  // AX = r1
   160	MOVQ  AX, R8  // keep r1 for the r1*h2 term
   161	MULQ  R10  // DX:AX = r1*h0
   162	ADDQ  AX, R14  // t1 complete
   163	ADCQ  $0x00, DX
   164	MOVQ  DX, R10  // h0 is no longer needed; park the high half (plus carry) in R10
   165	MOVQ  8(BP), AX
   166	MULQ  R11  // DX:AX = r1*h1
   167	ADDQ  AX, R15
   168	ADCQ  $0x00, DX
   169	IMULQ R12, R8  // R8 = r1*h2
   170	ADDQ  R10, R15  // t2 complete
   171	ADCQ  DX, R8  // t3
      	// Reduce: h = (product mod 2^130) + 4*c + c, c = (t3:t2) >> 2
   172	MOVQ  R13, R10  // h0 = t0
   173	MOVQ  R14, R11  // h1 = t1
   174	MOVQ  R15, R12
   175	ANDQ  $0x03, R12  // h2 = low two bits of t2
   176	MOVQ  R15, R13
   177	ANDQ  $-4, R13  // R13 = t2 with its low two bits cleared
   178	MOVQ  R8, R14  // (R14:R13) = 4*c
   179	SHRQ  $0x02, R8, R15  // double-register shift: R15 = low word of (t3:t2) >> 2
   180	SHRQ  $0x02, R8  // (R8:R15) = c
   181	ADDQ  R13, R10
   182	ADCQ  R14, R11
   183	ADCQ  $0x00, R12  // h += 4*c
   184	ADDQ  R15, R10
   185	ADCQ  R8, R11
   186	ADCQ  $0x00, R12  // h += c
   187
   188hashADDone:
      	// Result is left in R10:R11:R12 for the caller.
   189	RET
   190
   191// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
   192// Requires: AVX, AVX2, BMI2, CMOV, SSE2
   193TEXT ·chacha20Poly1305Open(SB), $288-97
   194	// For aligned stack access
   195	MOVQ SP, BP
   196	ADDQ $0x20, BP
   197	ANDQ $-32, BP
   198	MOVQ dst_base+0(FP), DI
   199	MOVQ key_base+24(FP), R8
   200	MOVQ src_base+48(FP), SI
   201	MOVQ src_len+56(FP), BX
   202	MOVQ ad_base+72(FP), CX
   203
   204	// Check for AVX2 support
   205	CMPB ·useAVX2+0(SB), $0x01
   206	JE   chacha20Poly1305Open_AVX2
   207
   208	// Special optimization, for very short buffers
   209	CMPQ BX, $0x80
   210	JBE  openSSE128
   211
   212	// For long buffers, prepare the poly key first
   213	MOVOU ·chacha20Constants<>+0(SB), X0
   214	MOVOU 16(R8), X3
   215	MOVOU 32(R8), X6
   216	MOVOU 48(R8), X9
   217	MOVO  X9, X13
   218
   219	// Store state on stack for future use
   220	MOVO X3, 32(BP)
   221	MOVO X6, 48(BP)
   222	MOVO X9, 128(BP)
   223	MOVQ $0x0000000a, R9
   224
   225openSSEPreparePolyKey:
   226	PADDD X3, X0
   227	PXOR  X0, X9
   228	ROL16(X9, X12)
   229	PADDD X9, X6
   230	PXOR  X6, X3
   231	MOVO  X3, X12
   232	PSLLL $0x0c, X12
   233	PSRLL $0x14, X3
   234	PXOR  X12, X3
   235	PADDD X3, X0
   236	PXOR  X0, X9
   237	ROL8(X9, X12)
   238	PADDD X9, X6
   239	PXOR  X6, X3
   240	MOVO  X3, X12
   241	PSLLL $0x07, X12
   242	PSRLL $0x19, X3
   243	PXOR  X12, X3
   244	BYTE  $0x66
   245	BYTE  $0x0f
   246	BYTE  $0x3a
   247	BYTE  $0x0f
   248	BYTE  $0xdb
   249	BYTE  $0x04
   250	BYTE  $0x66
   251	BYTE  $0x0f
   252	BYTE  $0x3a
   253	BYTE  $0x0f
   254	BYTE  $0xf6
   255	BYTE  $0x08
   256	BYTE  $0x66
   257	BYTE  $0x45
   258	BYTE  $0x0f
   259	BYTE  $0x3a
   260	BYTE  $0x0f
   261	BYTE  $0xc9
   262	BYTE  $0x0c
   263	PADDD X3, X0
   264	PXOR  X0, X9
   265	ROL16(X9, X12)
   266	PADDD X9, X6
   267	PXOR  X6, X3
   268	MOVO  X3, X12
   269	PSLLL $0x0c, X12
   270	PSRLL $0x14, X3
   271	PXOR  X12, X3
   272	PADDD X3, X0
   273	PXOR  X0, X9
   274	ROL8(X9, X12)
   275	PADDD X9, X6
   276	PXOR  X6, X3
   277	MOVO  X3, X12
   278	PSLLL $0x07, X12
   279	PSRLL $0x19, X3
   280	PXOR  X12, X3
   281	BYTE  $0x66
   282	BYTE  $0x0f
   283	BYTE  $0x3a
   284	BYTE  $0x0f
   285	BYTE  $0xdb
   286	BYTE  $0x0c
   287	BYTE  $0x66
   288	BYTE  $0x0f
   289	BYTE  $0x3a
   290	BYTE  $0x0f
   291	BYTE  $0xf6
   292	BYTE  $0x08
   293	BYTE  $0x66
   294	BYTE  $0x45
   295	BYTE  $0x0f
   296	BYTE  $0x3a
   297	BYTE  $0x0f
   298	BYTE  $0xc9
   299	BYTE  $0x04
   300	DECQ  R9
   301	JNE   openSSEPreparePolyKey
   302
   303	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   304	PADDL ·chacha20Constants<>+0(SB), X0
   305	PADDL 32(BP), X3
   306
   307	// Clamp and store the key
   308	PAND ·polyClampMask<>+0(SB), X0
   309	MOVO X0, (BP)
   310	MOVO X3, 16(BP)
   311
   312	// Hash AAD
   313	MOVQ ad_len+80(FP), R9
   314	CALL polyHashADInternal<>(SB)
   315
   316openSSEMainLoop:
   317	CMPQ BX, $0x00000100
   318	JB   openSSEMainLoopDone
   319
   320	// Load state, increment counter blocks
   321	MOVO  ·chacha20Constants<>+0(SB), X0
   322	MOVO  32(BP), X3
   323	MOVO  48(BP), X6
   324	MOVO  128(BP), X9
   325	PADDL ·sseIncMask<>+0(SB), X9
   326	MOVO  X0, X1
   327	MOVO  X3, X4
   328	MOVO  X6, X7
   329	MOVO  X9, X10
   330	PADDL ·sseIncMask<>+0(SB), X10
   331	MOVO  X1, X2
   332	MOVO  X4, X5
   333	MOVO  X7, X8
   334	MOVO  X10, X11
   335	PADDL ·sseIncMask<>+0(SB), X11
   336	MOVO  X2, X12
   337	MOVO  X5, X13
   338	MOVO  X8, X14
   339	MOVO  X11, X15
   340	PADDL ·sseIncMask<>+0(SB), X15
   341
   342	// Store counters
   343	MOVO X9, 80(BP)
   344	MOVO X10, 96(BP)
   345	MOVO X11, 112(BP)
   346	MOVO X15, 128(BP)
   347
   348	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
   349	// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   350	MOVQ $0x00000004, CX
   351	MOVQ SI, R9
   352
   353openSSEInternalLoop:
   354	MOVO  X14, 64(BP)
   355	PADDD X3, X0
   356	PXOR  X0, X9
   357	ROL16(X9, X14)
   358	PADDD X9, X6
   359	PXOR  X6, X3
   360	MOVO  X3, X14
   361	PSLLL $0x0c, X14
   362	PSRLL $0x14, X3
   363	PXOR  X14, X3
   364	PADDD X3, X0
   365	PXOR  X0, X9
   366	ROL8(X9, X14)
   367	PADDD X9, X6
   368	PXOR  X6, X3
   369	MOVO  X3, X14
   370	PSLLL $0x07, X14
   371	PSRLL $0x19, X3
   372	PXOR  X14, X3
   373	PADDD X4, X1
   374	PXOR  X1, X10
   375	ROL16(X10, X14)
   376	PADDD X10, X7
   377	PXOR  X7, X4
   378	MOVO  X4, X14
   379	PSLLL $0x0c, X14
   380	PSRLL $0x14, X4
   381	PXOR  X14, X4
   382	PADDD X4, X1
   383	PXOR  X1, X10
   384	ROL8(X10, X14)
   385	PADDD X10, X7
   386	PXOR  X7, X4
   387	MOVO  X4, X14
   388	PSLLL $0x07, X14
   389	PSRLL $0x19, X4
   390	PXOR  X14, X4
   391	PADDD X5, X2
   392	PXOR  X2, X11
   393	ROL16(X11, X14)
   394	PADDD X11, X8
   395	PXOR  X8, X5
   396	MOVO  X5, X14
   397	PSLLL $0x0c, X14
   398	PSRLL $0x14, X5
   399	PXOR  X14, X5
   400	PADDD X5, X2
   401	PXOR  X2, X11
   402	ROL8(X11, X14)
   403	PADDD X11, X8
   404	PXOR  X8, X5
   405	MOVO  X5, X14
   406	PSLLL $0x07, X14
   407	PSRLL $0x19, X5
   408	PXOR  X14, X5
   409	MOVO  64(BP), X14
   410	MOVO  X7, 64(BP)
   411	PADDD X13, X12
   412	PXOR  X12, X15
   413	ROL16(X15, X7)
   414	PADDD X15, X14
   415	PXOR  X14, X13
   416	MOVO  X13, X7
   417	PSLLL $0x0c, X7
   418	PSRLL $0x14, X13
   419	PXOR  X7, X13
   420	PADDD X13, X12
   421	PXOR  X12, X15
   422	ROL8(X15, X7)
   423	PADDD X15, X14
   424	PXOR  X14, X13
   425	MOVO  X13, X7
   426	PSLLL $0x07, X7
   427	PSRLL $0x19, X13
   428	PXOR  X7, X13
   429	MOVO  64(BP), X7
   430	ADDQ  (R9), R10
   431	ADCQ  8(R9), R11
   432	ADCQ  $0x01, R12
   433	BYTE  $0x66
   434	BYTE  $0x0f
   435	BYTE  $0x3a
   436	BYTE  $0x0f
   437	BYTE  $0xdb
   438	BYTE  $0x04
   439	BYTE  $0x66
   440	BYTE  $0x0f
   441	BYTE  $0x3a
   442	BYTE  $0x0f
   443	BYTE  $0xe4
   444	BYTE  $0x04
   445	BYTE  $0x66
   446	BYTE  $0x0f
   447	BYTE  $0x3a
   448	BYTE  $0x0f
   449	BYTE  $0xed
   450	BYTE  $0x04
   451	BYTE  $0x66
   452	BYTE  $0x45
   453	BYTE  $0x0f
   454	BYTE  $0x3a
   455	BYTE  $0x0f
   456	BYTE  $0xed
   457	BYTE  $0x04
   458	BYTE  $0x66
   459	BYTE  $0x0f
   460	BYTE  $0x3a
   461	BYTE  $0x0f
   462	BYTE  $0xf6
   463	BYTE  $0x08
   464	BYTE  $0x66
   465	BYTE  $0x0f
   466	BYTE  $0x3a
   467	BYTE  $0x0f
   468	BYTE  $0xff
   469	BYTE  $0x08
   470	BYTE  $0x66
   471	BYTE  $0x45
   472	BYTE  $0x0f
   473	BYTE  $0x3a
   474	BYTE  $0x0f
   475	BYTE  $0xc0
   476	BYTE  $0x08
   477	BYTE  $0x66
   478	BYTE  $0x45
   479	BYTE  $0x0f
   480	BYTE  $0x3a
   481	BYTE  $0x0f
   482	BYTE  $0xf6
   483	BYTE  $0x08
   484	BYTE  $0x66
   485	BYTE  $0x45
   486	BYTE  $0x0f
   487	BYTE  $0x3a
   488	BYTE  $0x0f
   489	BYTE  $0xc9
   490	BYTE  $0x0c
   491	BYTE  $0x66
   492	BYTE  $0x45
   493	BYTE  $0x0f
   494	BYTE  $0x3a
   495	BYTE  $0x0f
   496	BYTE  $0xd2
   497	BYTE  $0x0c
   498	BYTE  $0x66
   499	BYTE  $0x45
   500	BYTE  $0x0f
   501	BYTE  $0x3a
   502	BYTE  $0x0f
   503	BYTE  $0xdb
   504	BYTE  $0x0c
   505	BYTE  $0x66
   506	BYTE  $0x45
   507	BYTE  $0x0f
   508	BYTE  $0x3a
   509	BYTE  $0x0f
   510	BYTE  $0xff
   511	BYTE  $0x0c
   512	MOVQ  (BP), AX
   513	MOVQ  AX, R15
   514	MULQ  R10
   515	MOVQ  AX, R13
   516	MOVQ  DX, R14
   517	MOVQ  (BP), AX
   518	MULQ  R11
   519	IMULQ R12, R15
   520	ADDQ  AX, R14
   521	ADCQ  DX, R15
   522	MOVQ  8(BP), AX
   523	MOVQ  AX, R8
   524	MULQ  R10
   525	ADDQ  AX, R14
   526	ADCQ  $0x00, DX
   527	MOVQ  DX, R10
   528	MOVQ  8(BP), AX
   529	MULQ  R11
   530	ADDQ  AX, R15
   531	ADCQ  $0x00, DX
   532	LEAQ  16(R9), R9
   533	MOVO  X14, 64(BP)
   534	PADDD X3, X0
   535	PXOR  X0, X9
   536	ROL16(X9, X14)
   537	PADDD X9, X6
   538	PXOR  X6, X3
   539	MOVO  X3, X14
   540	PSLLL $0x0c, X14
   541	PSRLL $0x14, X3
   542	PXOR  X14, X3
   543	PADDD X3, X0
   544	PXOR  X0, X9
   545	ROL8(X9, X14)
   546	PADDD X9, X6
   547	PXOR  X6, X3
   548	MOVO  X3, X14
   549	PSLLL $0x07, X14
   550	PSRLL $0x19, X3
   551	PXOR  X14, X3
   552	PADDD X4, X1
   553	PXOR  X1, X10
   554	ROL16(X10, X14)
   555	PADDD X10, X7
   556	PXOR  X7, X4
   557	MOVO  X4, X14
   558	PSLLL $0x0c, X14
   559	PSRLL $0x14, X4
   560	PXOR  X14, X4
   561	PADDD X4, X1
   562	PXOR  X1, X10
   563	ROL8(X10, X14)
   564	PADDD X10, X7
   565	PXOR  X7, X4
   566	MOVO  X4, X14
   567	PSLLL $0x07, X14
   568	PSRLL $0x19, X4
   569	PXOR  X14, X4
   570	PADDD X5, X2
   571	PXOR  X2, X11
   572	ROL16(X11, X14)
   573	PADDD X11, X8
   574	PXOR  X8, X5
   575	MOVO  X5, X14
   576	PSLLL $0x0c, X14
   577	PSRLL $0x14, X5
   578	PXOR  X14, X5
   579	PADDD X5, X2
   580	PXOR  X2, X11
   581	ROL8(X11, X14)
   582	PADDD X11, X8
   583	PXOR  X8, X5
   584	MOVO  X5, X14
   585	PSLLL $0x07, X14
   586	PSRLL $0x19, X5
   587	PXOR  X14, X5
   588	MOVO  64(BP), X14
   589	MOVO  X7, 64(BP)
   590	IMULQ R12, R8
   591	ADDQ  R10, R15
   592	ADCQ  DX, R8
   593	PADDD X13, X12
   594	PXOR  X12, X15
   595	ROL16(X15, X7)
   596	PADDD X15, X14
   597	PXOR  X14, X13
   598	MOVO  X13, X7
   599	PSLLL $0x0c, X7
   600	PSRLL $0x14, X13
   601	PXOR  X7, X13
   602	PADDD X13, X12
   603	PXOR  X12, X15
   604	ROL8(X15, X7)
   605	PADDD X15, X14
   606	PXOR  X14, X13
   607	MOVO  X13, X7
   608	PSLLL $0x07, X7
   609	PSRLL $0x19, X13
   610	PXOR  X7, X13
   611	MOVO  64(BP), X7
   612	MOVQ  R13, R10
   613	MOVQ  R14, R11
   614	MOVQ  R15, R12
   615	ANDQ  $0x03, R12
   616	MOVQ  R15, R13
   617	ANDQ  $-4, R13
   618	MOVQ  R8, R14
   619	SHRQ  $0x02, R8, R15
   620	SHRQ  $0x02, R8
   621	ADDQ  R13, R10
   622	ADCQ  R14, R11
   623	ADCQ  $0x00, R12
   624	ADDQ  R15, R10
   625	ADCQ  R8, R11
   626	ADCQ  $0x00, R12
   627	BYTE  $0x66
   628	BYTE  $0x0f
   629	BYTE  $0x3a
   630	BYTE  $0x0f
   631	BYTE  $0xdb
   632	BYTE  $0x0c
   633	BYTE  $0x66
   634	BYTE  $0x0f
   635	BYTE  $0x3a
   636	BYTE  $0x0f
   637	BYTE  $0xe4
   638	BYTE  $0x0c
   639	BYTE  $0x66
   640	BYTE  $0x0f
   641	BYTE  $0x3a
   642	BYTE  $0x0f
   643	BYTE  $0xed
   644	BYTE  $0x0c
   645	BYTE  $0x66
   646	BYTE  $0x45
   647	BYTE  $0x0f
   648	BYTE  $0x3a
   649	BYTE  $0x0f
   650	BYTE  $0xed
   651	BYTE  $0x0c
   652	BYTE  $0x66
   653	BYTE  $0x0f
   654	BYTE  $0x3a
   655	BYTE  $0x0f
   656	BYTE  $0xf6
   657	BYTE  $0x08
   658	BYTE  $0x66
   659	BYTE  $0x0f
   660	BYTE  $0x3a
   661	BYTE  $0x0f
   662	BYTE  $0xff
   663	BYTE  $0x08
   664	BYTE  $0x66
   665	BYTE  $0x45
   666	BYTE  $0x0f
   667	BYTE  $0x3a
   668	BYTE  $0x0f
   669	BYTE  $0xc0
   670	BYTE  $0x08
   671	BYTE  $0x66
   672	BYTE  $0x45
   673	BYTE  $0x0f
   674	BYTE  $0x3a
   675	BYTE  $0x0f
   676	BYTE  $0xf6
   677	BYTE  $0x08
   678	BYTE  $0x66
   679	BYTE  $0x45
   680	BYTE  $0x0f
   681	BYTE  $0x3a
   682	BYTE  $0x0f
   683	BYTE  $0xc9
   684	BYTE  $0x04
   685	BYTE  $0x66
   686	BYTE  $0x45
   687	BYTE  $0x0f
   688	BYTE  $0x3a
   689	BYTE  $0x0f
   690	BYTE  $0xd2
   691	BYTE  $0x04
   692	BYTE  $0x66
   693	BYTE  $0x45
   694	BYTE  $0x0f
   695	BYTE  $0x3a
   696	BYTE  $0x0f
   697	BYTE  $0xdb
   698	BYTE  $0x04
   699	BYTE  $0x66
   700	BYTE  $0x45
   701	BYTE  $0x0f
   702	BYTE  $0x3a
   703	BYTE  $0x0f
   704	BYTE  $0xff
   705	BYTE  $0x04
   706	DECQ  CX
   707	JGE   openSSEInternalLoop
   708	ADDQ  (R9), R10
   709	ADCQ  8(R9), R11
   710	ADCQ  $0x01, R12
   711	MOVQ  (BP), AX
   712	MOVQ  AX, R15
   713	MULQ  R10
   714	MOVQ  AX, R13
   715	MOVQ  DX, R14
   716	MOVQ  (BP), AX
   717	MULQ  R11
   718	IMULQ R12, R15
   719	ADDQ  AX, R14
   720	ADCQ  DX, R15
   721	MOVQ  8(BP), AX
   722	MOVQ  AX, R8
   723	MULQ  R10
   724	ADDQ  AX, R14
   725	ADCQ  $0x00, DX
   726	MOVQ  DX, R10
   727	MOVQ  8(BP), AX
   728	MULQ  R11
   729	ADDQ  AX, R15
   730	ADCQ  $0x00, DX
   731	IMULQ R12, R8
   732	ADDQ  R10, R15
   733	ADCQ  DX, R8
   734	MOVQ  R13, R10
   735	MOVQ  R14, R11
   736	MOVQ  R15, R12
   737	ANDQ  $0x03, R12
   738	MOVQ  R15, R13
   739	ANDQ  $-4, R13
   740	MOVQ  R8, R14
   741	SHRQ  $0x02, R8, R15
   742	SHRQ  $0x02, R8
   743	ADDQ  R13, R10
   744	ADCQ  R14, R11
   745	ADCQ  $0x00, R12
   746	ADDQ  R15, R10
   747	ADCQ  R8, R11
   748	ADCQ  $0x00, R12
   749	LEAQ  16(R9), R9
   750	CMPQ  CX, $-6
   751	JG    openSSEInternalLoop
   752
   753	// Add in the state
   754	PADDD ·chacha20Constants<>+0(SB), X0
   755	PADDD ·chacha20Constants<>+0(SB), X1
   756	PADDD ·chacha20Constants<>+0(SB), X2
   757	PADDD ·chacha20Constants<>+0(SB), X12
   758	PADDD 32(BP), X3
   759	PADDD 32(BP), X4
   760	PADDD 32(BP), X5
   761	PADDD 32(BP), X13
   762	PADDD 48(BP), X6
   763	PADDD 48(BP), X7
   764	PADDD 48(BP), X8
   765	PADDD 48(BP), X14
   766	PADDD 80(BP), X9
   767	PADDD 96(BP), X10
   768	PADDD 112(BP), X11
   769	PADDD 128(BP), X15
   770
   771	// Load - xor - store
   772	MOVO  X15, 64(BP)
   773	MOVOU (SI), X15
   774	PXOR  X15, X0
   775	MOVOU X0, (DI)
   776	MOVOU 16(SI), X15
   777	PXOR  X15, X3
   778	MOVOU X3, 16(DI)
   779	MOVOU 32(SI), X15
   780	PXOR  X15, X6
   781	MOVOU X6, 32(DI)
   782	MOVOU 48(SI), X15
   783	PXOR  X15, X9
   784	MOVOU X9, 48(DI)
   785	MOVOU 64(SI), X9
   786	PXOR  X9, X1
   787	MOVOU X1, 64(DI)
   788	MOVOU 80(SI), X9
   789	PXOR  X9, X4
   790	MOVOU X4, 80(DI)
   791	MOVOU 96(SI), X9
   792	PXOR  X9, X7
   793	MOVOU X7, 96(DI)
   794	MOVOU 112(SI), X9
   795	PXOR  X9, X10
   796	MOVOU X10, 112(DI)
   797	MOVOU 128(SI), X9
   798	PXOR  X9, X2
   799	MOVOU X2, 128(DI)
   800	MOVOU 144(SI), X9
   801	PXOR  X9, X5
   802	MOVOU X5, 144(DI)
   803	MOVOU 160(SI), X9
   804	PXOR  X9, X8
   805	MOVOU X8, 160(DI)
   806	MOVOU 176(SI), X9
   807	PXOR  X9, X11
   808	MOVOU X11, 176(DI)
   809	MOVOU 192(SI), X9
   810	PXOR  X9, X12
   811	MOVOU X12, 192(DI)
   812	MOVOU 208(SI), X9
   813	PXOR  X9, X13
   814	MOVOU X13, 208(DI)
   815	MOVOU 224(SI), X9
   816	PXOR  X9, X14
   817	MOVOU X14, 224(DI)
   818	MOVOU 240(SI), X9
   819	PXOR  64(BP), X9
   820	MOVOU X9, 240(DI)
   821	LEAQ  256(SI), SI
   822	LEAQ  256(DI), DI
   823	SUBQ  $0x00000100, BX
   824	JMP   openSSEMainLoop
   825
   826openSSEMainLoopDone:
   827	// Handle the various tail sizes efficiently
   828	TESTQ BX, BX
   829	JE    openSSEFinalize
   830	CMPQ  BX, $0x40
   831	JBE   openSSETail64
   832	CMPQ  BX, $0x80
   833	JBE   openSSETail128
   834	CMPQ  BX, $0xc0
   835	JBE   openSSETail192
   836	JMP   openSSETail256
   837
   838openSSEFinalize:
   839	// Hash in the PT, AAD lengths
   840	ADDQ  ad_len+80(FP), R10
   841	ADCQ  src_len+56(FP), R11
   842	ADCQ  $0x01, R12
   843	MOVQ  (BP), AX
   844	MOVQ  AX, R15
   845	MULQ  R10
   846	MOVQ  AX, R13
   847	MOVQ  DX, R14
   848	MOVQ  (BP), AX
   849	MULQ  R11
   850	IMULQ R12, R15
   851	ADDQ  AX, R14
   852	ADCQ  DX, R15
   853	MOVQ  8(BP), AX
   854	MOVQ  AX, R8
   855	MULQ  R10
   856	ADDQ  AX, R14
   857	ADCQ  $0x00, DX
   858	MOVQ  DX, R10
   859	MOVQ  8(BP), AX
   860	MULQ  R11
   861	ADDQ  AX, R15
   862	ADCQ  $0x00, DX
   863	IMULQ R12, R8
   864	ADDQ  R10, R15
   865	ADCQ  DX, R8
   866	MOVQ  R13, R10
   867	MOVQ  R14, R11
   868	MOVQ  R15, R12
   869	ANDQ  $0x03, R12
   870	MOVQ  R15, R13
   871	ANDQ  $-4, R13
   872	MOVQ  R8, R14
   873	SHRQ  $0x02, R8, R15
   874	SHRQ  $0x02, R8
   875	ADDQ  R13, R10
   876	ADCQ  R14, R11
   877	ADCQ  $0x00, R12
   878	ADDQ  R15, R10
   879	ADCQ  R8, R11
   880	ADCQ  $0x00, R12
   881
   882	// Final reduce
   883	MOVQ    R10, R13
   884	MOVQ    R11, R14
   885	MOVQ    R12, R15
   886	SUBQ    $-5, R10
   887	SBBQ    $-1, R11
   888	SBBQ    $0x03, R12
   889	CMOVQCS R13, R10
   890	CMOVQCS R14, R11
   891	CMOVQCS R15, R12
   892
   893	// Add in the "s" part of the key
   894	ADDQ 16(BP), R10
   895	ADCQ 24(BP), R11
   896
   897	// Finally, constant time compare to the tag at the end of the message
   898	XORQ    AX, AX
   899	MOVQ    $0x00000001, DX
   900	XORQ    (SI), R10
   901	XORQ    8(SI), R11
   902	ORQ     R11, R10
   903	CMOVQEQ DX, AX
   904
   905	// Return true iff tags are equal
   906	MOVB AX, ret+96(FP)
   907	RET
   908
   909openSSE128:
   910	MOVOU ·chacha20Constants<>+0(SB), X0
   911	MOVOU 16(R8), X3
   912	MOVOU 32(R8), X6
   913	MOVOU 48(R8), X9
   914	MOVO  X0, X1
   915	MOVO  X3, X4
   916	MOVO  X6, X7
   917	MOVO  X9, X10
   918	PADDL ·sseIncMask<>+0(SB), X10
   919	MOVO  X1, X2
   920	MOVO  X4, X5
   921	MOVO  X7, X8
   922	MOVO  X10, X11
   923	PADDL ·sseIncMask<>+0(SB), X11
   924	MOVO  X3, X13
   925	MOVO  X6, X14
   926	MOVO  X10, X15
   927	MOVQ  $0x0000000a, R9
   928
   929openSSE128InnerCipherLoop:
   930	PADDD X3, X0
   931	PXOR  X0, X9
   932	ROL16(X9, X12)
   933	PADDD X9, X6
   934	PXOR  X6, X3
   935	MOVO  X3, X12
   936	PSLLL $0x0c, X12
   937	PSRLL $0x14, X3
   938	PXOR  X12, X3
   939	PADDD X3, X0
   940	PXOR  X0, X9
   941	ROL8(X9, X12)
   942	PADDD X9, X6
   943	PXOR  X6, X3
   944	MOVO  X3, X12
   945	PSLLL $0x07, X12
   946	PSRLL $0x19, X3
   947	PXOR  X12, X3
   948	PADDD X4, X1
   949	PXOR  X1, X10
   950	ROL16(X10, X12)
   951	PADDD X10, X7
   952	PXOR  X7, X4
   953	MOVO  X4, X12
   954	PSLLL $0x0c, X12
   955	PSRLL $0x14, X4
   956	PXOR  X12, X4
   957	PADDD X4, X1
   958	PXOR  X1, X10
   959	ROL8(X10, X12)
   960	PADDD X10, X7
   961	PXOR  X7, X4
   962	MOVO  X4, X12
   963	PSLLL $0x07, X12
   964	PSRLL $0x19, X4
   965	PXOR  X12, X4
   966	PADDD X5, X2
   967	PXOR  X2, X11
   968	ROL16(X11, X12)
   969	PADDD X11, X8
   970	PXOR  X8, X5
   971	MOVO  X5, X12
   972	PSLLL $0x0c, X12
   973	PSRLL $0x14, X5
   974	PXOR  X12, X5
   975	PADDD X5, X2
   976	PXOR  X2, X11
   977	ROL8(X11, X12)
   978	PADDD X11, X8
   979	PXOR  X8, X5
   980	MOVO  X5, X12
   981	PSLLL $0x07, X12
   982	PSRLL $0x19, X5
   983	PXOR  X12, X5
   984	BYTE  $0x66
   985	BYTE  $0x0f
   986	BYTE  $0x3a
   987	BYTE  $0x0f
   988	BYTE  $0xdb
   989	BYTE  $0x04
   990	BYTE  $0x66
   991	BYTE  $0x0f
   992	BYTE  $0x3a
   993	BYTE  $0x0f
   994	BYTE  $0xe4
   995	BYTE  $0x04
   996	BYTE  $0x66
   997	BYTE  $0x0f
   998	BYTE  $0x3a
   999	BYTE  $0x0f
  1000	BYTE  $0xed
  1001	BYTE  $0x04
  1002	BYTE  $0x66
  1003	BYTE  $0x0f
  1004	BYTE  $0x3a
  1005	BYTE  $0x0f
  1006	BYTE  $0xf6
  1007	BYTE  $0x08
  1008	BYTE  $0x66
  1009	BYTE  $0x0f
  1010	BYTE  $0x3a
  1011	BYTE  $0x0f
  1012	BYTE  $0xff
  1013	BYTE  $0x08
  1014	BYTE  $0x66
  1015	BYTE  $0x45
  1016	BYTE  $0x0f
  1017	BYTE  $0x3a
  1018	BYTE  $0x0f
  1019	BYTE  $0xc0
  1020	BYTE  $0x08
  1021	BYTE  $0x66
  1022	BYTE  $0x45
  1023	BYTE  $0x0f
  1024	BYTE  $0x3a
  1025	BYTE  $0x0f
  1026	BYTE  $0xc9
  1027	BYTE  $0x0c
  1028	BYTE  $0x66
  1029	BYTE  $0x45
  1030	BYTE  $0x0f
  1031	BYTE  $0x3a
  1032	BYTE  $0x0f
  1033	BYTE  $0xd2
  1034	BYTE  $0x0c
  1035	BYTE  $0x66
  1036	BYTE  $0x45
  1037	BYTE  $0x0f
  1038	BYTE  $0x3a
  1039	BYTE  $0x0f
  1040	BYTE  $0xdb
  1041	BYTE  $0x0c
  1042	PADDD X3, X0
  1043	PXOR  X0, X9
  1044	ROL16(X9, X12)
  1045	PADDD X9, X6
  1046	PXOR  X6, X3
  1047	MOVO  X3, X12
  1048	PSLLL $0x0c, X12
  1049	PSRLL $0x14, X3
  1050	PXOR  X12, X3
  1051	PADDD X3, X0
  1052	PXOR  X0, X9
  1053	ROL8(X9, X12)
  1054	PADDD X9, X6
  1055	PXOR  X6, X3
  1056	MOVO  X3, X12
  1057	PSLLL $0x07, X12
  1058	PSRLL $0x19, X3
  1059	PXOR  X12, X3
  1060	PADDD X4, X1
  1061	PXOR  X1, X10
  1062	ROL16(X10, X12)
  1063	PADDD X10, X7
  1064	PXOR  X7, X4
  1065	MOVO  X4, X12
  1066	PSLLL $0x0c, X12
  1067	PSRLL $0x14, X4
  1068	PXOR  X12, X4
  1069	PADDD X4, X1
  1070	PXOR  X1, X10
  1071	ROL8(X10, X12)
  1072	PADDD X10, X7
  1073	PXOR  X7, X4
  1074	MOVO  X4, X12
  1075	PSLLL $0x07, X12
  1076	PSRLL $0x19, X4
  1077	PXOR  X12, X4
  1078	PADDD X5, X2
  1079	PXOR  X2, X11
  1080	ROL16(X11, X12)
  1081	PADDD X11, X8
  1082	PXOR  X8, X5
  1083	MOVO  X5, X12
  1084	PSLLL $0x0c, X12
  1085	PSRLL $0x14, X5
  1086	PXOR  X12, X5
  1087	PADDD X5, X2
  1088	PXOR  X2, X11
  1089	ROL8(X11, X12)
  1090	PADDD X11, X8
  1091	PXOR  X8, X5
  1092	MOVO  X5, X12
  1093	PSLLL $0x07, X12
  1094	PSRLL $0x19, X5
  1095	PXOR  X12, X5
  1096	BYTE  $0x66
  1097	BYTE  $0x0f
  1098	BYTE  $0x3a
  1099	BYTE  $0x0f
  1100	BYTE  $0xdb
  1101	BYTE  $0x0c
  1102	BYTE  $0x66
  1103	BYTE  $0x0f
  1104	BYTE  $0x3a
  1105	BYTE  $0x0f
  1106	BYTE  $0xe4
  1107	BYTE  $0x0c
  1108	BYTE  $0x66
  1109	BYTE  $0x0f
  1110	BYTE  $0x3a
  1111	BYTE  $0x0f
  1112	BYTE  $0xed
  1113	BYTE  $0x0c
  1114	BYTE  $0x66
  1115	BYTE  $0x0f
  1116	BYTE  $0x3a
  1117	BYTE  $0x0f
  1118	BYTE  $0xf6
  1119	BYTE  $0x08
  1120	BYTE  $0x66
  1121	BYTE  $0x0f
  1122	BYTE  $0x3a
  1123	BYTE  $0x0f
  1124	BYTE  $0xff
  1125	BYTE  $0x08
  1126	BYTE  $0x66
  1127	BYTE  $0x45
  1128	BYTE  $0x0f
  1129	BYTE  $0x3a
  1130	BYTE  $0x0f
  1131	BYTE  $0xc0
  1132	BYTE  $0x08
  1133	BYTE  $0x66
  1134	BYTE  $0x45
  1135	BYTE  $0x0f
  1136	BYTE  $0x3a
  1137	BYTE  $0x0f
  1138	BYTE  $0xc9
  1139	BYTE  $0x04
  1140	BYTE  $0x66
  1141	BYTE  $0x45
  1142	BYTE  $0x0f
  1143	BYTE  $0x3a
  1144	BYTE  $0x0f
  1145	BYTE  $0xd2
  1146	BYTE  $0x04
  1147	BYTE  $0x66
  1148	BYTE  $0x45
  1149	BYTE  $0x0f
  1150	BYTE  $0x3a
  1151	BYTE  $0x0f
  1152	BYTE  $0xdb
  1153	BYTE  $0x04
  1154	DECQ  R9
  1155	JNE   openSSE128InnerCipherLoop
  1156
  1157	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1158	PADDL ·chacha20Constants<>+0(SB), X0
  1159	PADDL ·chacha20Constants<>+0(SB), X1
  1160	PADDL ·chacha20Constants<>+0(SB), X2
  1161	PADDL X13, X3
  1162	PADDL X13, X4
  1163	PADDL X13, X5
  1164	PADDL X14, X7
  1165	PADDL X14, X8
  1166	PADDL X15, X10
  1167	PADDL ·sseIncMask<>+0(SB), X15
  1168	PADDL X15, X11
  1169
  1170	// Clamp and store the key
  1171	PAND  ·polyClampMask<>+0(SB), X0
  1172	MOVOU X0, (BP)
  1173	MOVOU X3, 16(BP)
  1174
  1175	// Hash
  1176	MOVQ ad_len+80(FP), R9
  1177	CALL polyHashADInternal<>(SB)
  1178
  1179openSSE128Open:
  1180	CMPQ BX, $0x10
  1181	JB   openSSETail16
  1182	SUBQ $0x10, BX
  1183
  1184	// Load for hashing
  1185	ADDQ (SI), R10
  1186	ADCQ 8(SI), R11
  1187	ADCQ $0x01, R12
  1188
  1189	// Load for decryption
  1190	MOVOU (SI), X12
  1191	PXOR  X12, X1
  1192	MOVOU X1, (DI)
  1193	LEAQ  16(SI), SI
  1194	LEAQ  16(DI), DI
  1195	MOVQ  (BP), AX
  1196	MOVQ  AX, R15
  1197	MULQ  R10
  1198	MOVQ  AX, R13
  1199	MOVQ  DX, R14
  1200	MOVQ  (BP), AX
  1201	MULQ  R11
  1202	IMULQ R12, R15
  1203	ADDQ  AX, R14
  1204	ADCQ  DX, R15
  1205	MOVQ  8(BP), AX
  1206	MOVQ  AX, R8
  1207	MULQ  R10
  1208	ADDQ  AX, R14
  1209	ADCQ  $0x00, DX
  1210	MOVQ  DX, R10
  1211	MOVQ  8(BP), AX
  1212	MULQ  R11
  1213	ADDQ  AX, R15
  1214	ADCQ  $0x00, DX
  1215	IMULQ R12, R8
  1216	ADDQ  R10, R15
  1217	ADCQ  DX, R8
  1218	MOVQ  R13, R10
  1219	MOVQ  R14, R11
  1220	MOVQ  R15, R12
  1221	ANDQ  $0x03, R12
  1222	MOVQ  R15, R13
  1223	ANDQ  $-4, R13
  1224	MOVQ  R8, R14
  1225	SHRQ  $0x02, R8, R15
  1226	SHRQ  $0x02, R8
  1227	ADDQ  R13, R10
  1228	ADCQ  R14, R11
  1229	ADCQ  $0x00, R12
  1230	ADDQ  R15, R10
  1231	ADCQ  R8, R11
  1232	ADCQ  $0x00, R12
  1233
  1234	// Shift the stream "left"
  1235	MOVO X4, X1
  1236	MOVO X7, X4
  1237	MOVO X10, X7
  1238	MOVO X2, X10
  1239	MOVO X5, X2
  1240	MOVO X8, X5
  1241	MOVO X11, X8
  1242	JMP  openSSE128Open
  1243
  1244openSSETail16:
  1245	TESTQ BX, BX
  1246	JE    openSSEFinalize
  1247
  1248	// We can safely load the CT from the end, because it is padded with the MAC
  1249	MOVQ  BX, R9
  1250	SHLQ  $0x04, R9
  1251	LEAQ  ·andMask<>+0(SB), R13
  1252	MOVOU (SI), X12
  1253	ADDQ  BX, SI
  1254	PAND  -16(R13)(R9*1), X12
  1255	MOVO  X12, 64(BP)
  1256	MOVQ  X12, R13
  1257	MOVQ  72(BP), R14
  1258	PXOR  X1, X12
  1259
  1260	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
  1261openSSETail16Store:
  1262	MOVQ   X12, R8
  1263	MOVB   R8, (DI)
  1264	PSRLDQ $0x01, X12
  1265	INCQ   DI
  1266	DECQ   BX
  1267	JNE    openSSETail16Store
  1268	ADDQ   R13, R10
  1269	ADCQ   R14, R11
  1270	ADCQ   $0x01, R12
  1271	MOVQ   (BP), AX
  1272	MOVQ   AX, R15
  1273	MULQ   R10
  1274	MOVQ   AX, R13
  1275	MOVQ   DX, R14
  1276	MOVQ   (BP), AX
  1277	MULQ   R11
  1278	IMULQ  R12, R15
  1279	ADDQ   AX, R14
  1280	ADCQ   DX, R15
  1281	MOVQ   8(BP), AX
  1282	MOVQ   AX, R8
  1283	MULQ   R10
  1284	ADDQ   AX, R14
  1285	ADCQ   $0x00, DX
  1286	MOVQ   DX, R10
  1287	MOVQ   8(BP), AX
  1288	MULQ   R11
  1289	ADDQ   AX, R15
  1290	ADCQ   $0x00, DX
  1291	IMULQ  R12, R8
  1292	ADDQ   R10, R15
  1293	ADCQ   DX, R8
  1294	MOVQ   R13, R10
  1295	MOVQ   R14, R11
  1296	MOVQ   R15, R12
  1297	ANDQ   $0x03, R12
  1298	MOVQ   R15, R13
  1299	ANDQ   $-4, R13
  1300	MOVQ   R8, R14
  1301	SHRQ   $0x02, R8, R15
  1302	SHRQ   $0x02, R8
  1303	ADDQ   R13, R10
  1304	ADCQ   R14, R11
  1305	ADCQ   $0x00, R12
  1306	ADDQ   R15, R10
  1307	ADCQ   R8, R11
  1308	ADCQ   $0x00, R12
  1309	JMP    openSSEFinalize
  1310
  1311openSSETail64:
  1312	MOVO  ·chacha20Constants<>+0(SB), X0
  1313	MOVO  32(BP), X3
  1314	MOVO  48(BP), X6
  1315	MOVO  128(BP), X9
  1316	PADDL ·sseIncMask<>+0(SB), X9
  1317	MOVO  X9, 80(BP)
  1318	XORQ  R9, R9
  1319	MOVQ  BX, CX
  1320	CMPQ  CX, $0x10
  1321	JB    openSSETail64LoopB
  1322
  1323openSSETail64LoopA:
  1324	ADDQ  (SI)(R9*1), R10
  1325	ADCQ  8(SI)(R9*1), R11
  1326	ADCQ  $0x01, R12
  1327	MOVQ  (BP), AX
  1328	MOVQ  AX, R15
  1329	MULQ  R10
  1330	MOVQ  AX, R13
  1331	MOVQ  DX, R14
  1332	MOVQ  (BP), AX
  1333	MULQ  R11
  1334	IMULQ R12, R15
  1335	ADDQ  AX, R14
  1336	ADCQ  DX, R15
  1337	MOVQ  8(BP), AX
  1338	MOVQ  AX, R8
  1339	MULQ  R10
  1340	ADDQ  AX, R14
  1341	ADCQ  $0x00, DX
  1342	MOVQ  DX, R10
  1343	MOVQ  8(BP), AX
  1344	MULQ  R11
  1345	ADDQ  AX, R15
  1346	ADCQ  $0x00, DX
  1347	IMULQ R12, R8
  1348	ADDQ  R10, R15
  1349	ADCQ  DX, R8
  1350	MOVQ  R13, R10
  1351	MOVQ  R14, R11
  1352	MOVQ  R15, R12
  1353	ANDQ  $0x03, R12
  1354	MOVQ  R15, R13
  1355	ANDQ  $-4, R13
  1356	MOVQ  R8, R14
  1357	SHRQ  $0x02, R8, R15
  1358	SHRQ  $0x02, R8
  1359	ADDQ  R13, R10
  1360	ADCQ  R14, R11
  1361	ADCQ  $0x00, R12
  1362	ADDQ  R15, R10
  1363	ADCQ  R8, R11
  1364	ADCQ  $0x00, R12
  1365	SUBQ  $0x10, CX
  1366
  1367openSSETail64LoopB: // One ChaCha20 double round (column round, then diagonal round) on the single block held in X0/X3/X6/X9.
  1368	ADDQ  $0x10, R9 // R9 advances by 16 per pass; the exit test below stops at 0xa0, i.e. after 10 passes
  1369	PADDD X3, X0 // column quarter round: a += b
  1370	PXOR  X0, X9 // d ^= a
  1371	ROL16(X9, X12) // d <<<= 16 (X12 is scratch)
  1372	PADDD X9, X6 // c += d
  1373	PXOR  X6, X3 // b ^= c
  1374	MOVO  X3, X12 // b <<<= 12, computed as (b << 12) ^ (b >> 20)
  1375	PSLLL $0x0c, X12
  1376	PSRLL $0x14, X3
  1377	PXOR  X12, X3
  1378	PADDD X3, X0 // a += b
  1379	PXOR  X0, X9 // d ^= a
  1380	ROL8(X9, X12) // d <<<= 8
  1381	PADDD X9, X6 // c += d
  1382	PXOR  X6, X3 // b ^= c
  1383	MOVO  X3, X12 // b <<<= 7, computed as (b << 7) ^ (b >> 25)
  1384	PSLLL $0x07, X12
  1385	PSRLL $0x19, X3
  1386	PXOR  X12, X3
  1387	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3: rotate row b by one 32-bit lane
  1388	BYTE  $0x0f
  1389	BYTE  $0x3a
  1390	BYTE  $0x0f
  1391	BYTE  $0xdb
  1392	BYTE  $0x04
  1393	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6: rotate row c by two lanes
  1394	BYTE  $0x0f
  1395	BYTE  $0x3a
  1396	BYTE  $0x0f
  1397	BYTE  $0xf6
  1398	BYTE  $0x08
  1399	BYTE  $0x66 // hand-encoded PALIGNR $12, X9, X9: rotate row d by three lanes
  1400	BYTE  $0x45
  1401	BYTE  $0x0f
  1402	BYTE  $0x3a
  1403	BYTE  $0x0f
  1404	BYTE  $0xc9
  1405	BYTE  $0x0c
  1406	PADDD X3, X0 // same quarter round again, now acting on the diagonals because of the lane rotations above
  1407	PXOR  X0, X9
  1408	ROL16(X9, X12)
  1409	PADDD X9, X6
  1410	PXOR  X6, X3
  1411	MOVO  X3, X12
  1412	PSLLL $0x0c, X12
  1413	PSRLL $0x14, X3
  1414	PXOR  X12, X3
  1415	PADDD X3, X0
  1416	PXOR  X0, X9
  1417	ROL8(X9, X12)
  1418	PADDD X9, X6
  1419	PXOR  X6, X3
  1420	MOVO  X3, X12
  1421	PSLLL $0x07, X12
  1422	PSRLL $0x19, X3
  1423	PXOR  X12, X3
  1424	BYTE  $0x66 // hand-encoded PALIGNR $12, X3, X3: undo the row b rotation
  1425	BYTE  $0x0f
  1426	BYTE  $0x3a
  1427	BYTE  $0x0f
  1428	BYTE  $0xdb
  1429	BYTE  $0x0c
  1430	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6: undo the row c rotation
  1431	BYTE  $0x0f
  1432	BYTE  $0x3a
  1433	BYTE  $0x0f
  1434	BYTE  $0xf6
  1435	BYTE  $0x08
  1436	BYTE  $0x66 // hand-encoded PALIGNR $4, X9, X9: undo the row d rotation
  1437	BYTE  $0x45
  1438	BYTE  $0x0f
  1439	BYTE  $0x3a
  1440	BYTE  $0x0f
  1441	BYTE  $0xc9
  1442	BYTE  $0x04
  1443	CMPQ  CX, $0x10 // while CX >= 16 (unsigned), take the LoopA path first
  1444	JAE   openSSETail64LoopA // label is defined above this point; the code just before LoopB ends with SUBQ $0x10, CX — presumably the hash-one-block path
  1445	CMPQ  R9, $0xa0 // otherwise keep doing rounds until R9 == 160
  1446	JNE   openSSETail64LoopB
  1447	PADDL ·chacha20Constants<>+0(SB), X0 // add the starting state back into the permuted rows
  1448	PADDL 32(BP), X3 // NOTE(review): 32(BP)/48(BP) presumably hold the saved key rows — confirm at the setup site
  1449	PADDL 48(BP), X6
  1450	PADDL 80(BP), X9 // NOTE(review): 80(BP) presumably holds this block's saved counter row — confirm at the setup site
  1451
  1452openSSETail64DecLoop: // XOR the input with the rows in X0, X3, X6, X9 (in that order), 16 bytes per pass, while BX >= 16.
  1453	CMPQ  BX, $0x10
  1454	JB    openSSETail64DecLoopDone // fewer than 16 bytes left (unsigned compare)
  1455	SUBQ  $0x10, BX
  1456	MOVOU (SI), X12 // load 16 input bytes (unaligned)
  1457	PXOR  X12, X0
  1458	MOVOU X0, (DI) // store 16 output bytes
  1459	LEAQ  16(SI), SI // advance both pointers without touching flags
  1460	LEAQ  16(DI), DI
  1461	MOVO  X3, X0 // shift the remaining rows down so the next one to use is always in X0
  1462	MOVO  X6, X3
  1463	MOVO  X9, X6
  1464	JMP   openSSETail64DecLoop
  1465
  1466openSSETail64DecLoopDone: // Fewer than 16 bytes remain: hand the next unused row to the final-partial-block code.
  1467	MOVO X0, X1 // openSSETail16 (defined elsewhere in this function) presumably expects that row in X1 — confirm there
  1468	JMP  openSSETail16
  1469
  1470openSSETail128: // Set up two ChaCha20 blocks: X1/X4/X7/X10 and X0/X3/X6/X9, with consecutive counter rows.
  1471	MOVO  ·chacha20Constants<>+0(SB), X1
  1472	MOVO  32(BP), X4 // NOTE(review): 32(BP)/48(BP) presumably hold the saved key rows — confirm at the setup site
  1473	MOVO  48(BP), X7
  1474	MOVO  128(BP), X10 // counter row saved at 128(BP)
  1475	PADDL ·sseIncMask<>+0(SB), X10 // step the counter row by sseIncMask
  1476	MOVO  X10, 80(BP) // remember the first block's counter row for the final state add
  1477	MOVO  X1, X0 // second block starts as a copy of the first
  1478	MOVO  X4, X3
  1479	MOVO  X7, X6
  1480	MOVO  X10, X9
  1481	PADDL ·sseIncMask<>+0(SB), X9 // second block's counter row is one step further
  1482	MOVO  X9, 96(BP) // remember the second block's counter row
  1483	XORQ  R9, R9 // R9 = 0: input offset for hashing and round counter (step 16)
  1484	MOVQ  BX, CX
  1485	ANDQ  $-16, CX // CX = BX rounded down to a multiple of 16: number of bytes to hash in whole blocks
  1486
  1487openSSETail128LoopA: // Absorb one 16-byte input block at SI+R9 into the accumulator R10:R11:R12, multiply by the key words at 0(BP)/8(BP), and reduce.
  1488	ADDQ  (SI)(R9*1), R10 // acc += 16-byte block ...
  1489	ADCQ  8(SI)(R9*1), R11
  1490	ADCQ  $0x01, R12 // ... plus 1 in the third limb (the 2^128 bit), with carry
  1491	MOVQ  (BP), AX // AX = r0, the word at 0(BP)
  1492	MOVQ  AX, R15
  1493	MULQ  R10 // DX:AX = r0 * h0
  1494	MOVQ  AX, R13 // t0
  1495	MOVQ  DX, R14 // t1 (partial)
  1496	MOVQ  (BP), AX
  1497	MULQ  R11 // DX:AX = r0 * h1
  1498	IMULQ R12, R15 // R15 = r0 * h2 (low 64 bits)
  1499	ADDQ  AX, R14
  1500	ADCQ  DX, R15 // t2 (partial)
  1501	MOVQ  8(BP), AX // AX = r1, the word at 8(BP)
  1502	MOVQ  AX, R8
  1503	MULQ  R10 // DX:AX = r1 * h0
  1504	ADDQ  AX, R14
  1505	ADCQ  $0x00, DX
  1506	MOVQ  DX, R10 // R10 is free now; reuse it to carry the high half forward
  1507	MOVQ  8(BP), AX
  1508	MULQ  R11 // DX:AX = r1 * h1
  1509	ADDQ  AX, R15
  1510	ADCQ  $0x00, DX
  1511	IMULQ R12, R8 // R8 = r1 * h2 (low 64 bits)
  1512	ADDQ  R10, R15
  1513	ADCQ  DX, R8 // product is now t0..t3 = R13, R14, R15, R8
  1514	MOVQ  R13, R10 // reduction: keep the low 130 bits in R10:R11:R12 ...
  1515	MOVQ  R14, R11
  1516	MOVQ  R15, R12
  1517	ANDQ  $0x03, R12
  1518	MOVQ  R15, R13 // ... R14:R13 = the bits above 2^130, still scaled by 4
  1519	ANDQ  $-4, R13
  1520	MOVQ  R8, R14
  1521	SHRQ  $0x02, R8, R15 // R8:R15 = the same high bits shifted right by 2 (double-register shift)
  1522	SHRQ  $0x02, R8
  1523	ADDQ  R13, R10 // add 4*high ...
  1524	ADCQ  R14, R11
  1525	ADCQ  $0x00, R12
  1526	ADDQ  R15, R10 // ... and 1*high, i.e. 5*high in total
  1527	ADCQ  R8, R11
  1528	ADCQ  $0x00, R12
  1529
  1530openSSETail128LoopB: // One ChaCha20 double round on both blocks (X0/X3/X6/X9 and X1/X4/X7/X10), then loop control, state add and output of the first 64 bytes.
  1531	ADDQ  $0x10, R9 // R9 doubles as hash offset and round counter; 16 per double round
  1532	PADDD X3, X0 // column quarter round, block X0/X3/X6/X9: a += b
  1533	PXOR  X0, X9 // d ^= a
  1534	ROL16(X9, X12) // d <<<= 16 (X12 is scratch)
  1535	PADDD X9, X6 // c += d
  1536	PXOR  X6, X3 // b ^= c
  1537	MOVO  X3, X12 // b <<<= 12
  1538	PSLLL $0x0c, X12
  1539	PSRLL $0x14, X3
  1540	PXOR  X12, X3
  1541	PADDD X3, X0
  1542	PXOR  X0, X9
  1543	ROL8(X9, X12) // d <<<= 8
  1544	PADDD X9, X6
  1545	PXOR  X6, X3
  1546	MOVO  X3, X12 // b <<<= 7
  1547	PSLLL $0x07, X12
  1548	PSRLL $0x19, X3
  1549	PXOR  X12, X3
  1550	PADDD X4, X1 // column quarter round, block X1/X4/X7/X10
  1551	PXOR  X1, X10
  1552	ROL16(X10, X12)
  1553	PADDD X10, X7
  1554	PXOR  X7, X4
  1555	MOVO  X4, X12
  1556	PSLLL $0x0c, X12
  1557	PSRLL $0x14, X4
  1558	PXOR  X12, X4
  1559	PADDD X4, X1
  1560	PXOR  X1, X10
  1561	ROL8(X10, X12)
  1562	PADDD X10, X7
  1563	PXOR  X7, X4
  1564	MOVO  X4, X12
  1565	PSLLL $0x07, X12
  1566	PSRLL $0x19, X4
  1567	PXOR  X12, X4
  1568	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3 (rotate row b by one lane)
  1569	BYTE  $0x0f
  1570	BYTE  $0x3a
  1571	BYTE  $0x0f
  1572	BYTE  $0xdb
  1573	BYTE  $0x04
  1574	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6 (rotate row c by two lanes)
  1575	BYTE  $0x0f
  1576	BYTE  $0x3a
  1577	BYTE  $0x0f
  1578	BYTE  $0xf6
  1579	BYTE  $0x08
  1580	BYTE  $0x66 // hand-encoded PALIGNR $12, X9, X9 (rotate row d by three lanes)
  1581	BYTE  $0x45
  1582	BYTE  $0x0f
  1583	BYTE  $0x3a
  1584	BYTE  $0x0f
  1585	BYTE  $0xc9
  1586	BYTE  $0x0c
  1587	BYTE  $0x66 // hand-encoded PALIGNR $4, X4, X4
  1588	BYTE  $0x0f
  1589	BYTE  $0x3a
  1590	BYTE  $0x0f
  1591	BYTE  $0xe4
  1592	BYTE  $0x04
  1593	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  1594	BYTE  $0x0f
  1595	BYTE  $0x3a
  1596	BYTE  $0x0f
  1597	BYTE  $0xff
  1598	BYTE  $0x08
  1599	BYTE  $0x66 // hand-encoded PALIGNR $12, X10, X10
  1600	BYTE  $0x45
  1601	BYTE  $0x0f
  1602	BYTE  $0x3a
  1603	BYTE  $0x0f
  1604	BYTE  $0xd2
  1605	BYTE  $0x0c
  1606	PADDD X3, X0 // diagonal quarter round, block X0/X3/X6/X9
  1607	PXOR  X0, X9
  1608	ROL16(X9, X12)
  1609	PADDD X9, X6
  1610	PXOR  X6, X3
  1611	MOVO  X3, X12
  1612	PSLLL $0x0c, X12
  1613	PSRLL $0x14, X3
  1614	PXOR  X12, X3
  1615	PADDD X3, X0
  1616	PXOR  X0, X9
  1617	ROL8(X9, X12)
  1618	PADDD X9, X6
  1619	PXOR  X6, X3
  1620	MOVO  X3, X12
  1621	PSLLL $0x07, X12
  1622	PSRLL $0x19, X3
  1623	PXOR  X12, X3
  1624	PADDD X4, X1 // diagonal quarter round, block X1/X4/X7/X10
  1625	PXOR  X1, X10
  1626	ROL16(X10, X12)
  1627	PADDD X10, X7
  1628	PXOR  X7, X4
  1629	MOVO  X4, X12
  1630	PSLLL $0x0c, X12
  1631	PSRLL $0x14, X4
  1632	PXOR  X12, X4
  1633	PADDD X4, X1
  1634	PXOR  X1, X10
  1635	ROL8(X10, X12)
  1636	PADDD X10, X7
  1637	PXOR  X7, X4
  1638	MOVO  X4, X12
  1639	PSLLL $0x07, X12
  1640	PSRLL $0x19, X4
  1641	PXOR  X12, X4
  1642	BYTE  $0x66 // hand-encoded PALIGNR $12, X3, X3 (undo the lane rotations)
  1643	BYTE  $0x0f
  1644	BYTE  $0x3a
  1645	BYTE  $0x0f
  1646	BYTE  $0xdb
  1647	BYTE  $0x0c
  1648	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6
  1649	BYTE  $0x0f
  1650	BYTE  $0x3a
  1651	BYTE  $0x0f
  1652	BYTE  $0xf6
  1653	BYTE  $0x08
  1654	BYTE  $0x66 // hand-encoded PALIGNR $4, X9, X9
  1655	BYTE  $0x45
  1656	BYTE  $0x0f
  1657	BYTE  $0x3a
  1658	BYTE  $0x0f
  1659	BYTE  $0xc9
  1660	BYTE  $0x04
  1661	BYTE  $0x66 // hand-encoded PALIGNR $12, X4, X4
  1662	BYTE  $0x0f
  1663	BYTE  $0x3a
  1664	BYTE  $0x0f
  1665	BYTE  $0xe4
  1666	BYTE  $0x0c
  1667	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  1668	BYTE  $0x0f
  1669	BYTE  $0x3a
  1670	BYTE  $0x0f
  1671	BYTE  $0xff
  1672	BYTE  $0x08
  1673	BYTE  $0x66 // hand-encoded PALIGNR $4, X10, X10
  1674	BYTE  $0x45
  1675	BYTE  $0x0f
  1676	BYTE  $0x3a
  1677	BYTE  $0x0f
  1678	BYTE  $0xd2
  1679	BYTE  $0x04
  1680	CMPQ  R9, CX // still whole input blocks to hash? then hash one and do another double round
  1681	JB    openSSETail128LoopA
  1682	CMPQ  R9, $0xa0 // hashing done: keep doing rounds only, until 10 double rounds (R9 == 160)
  1683	JNE   openSSETail128LoopB
  1684	PADDL ·chacha20Constants<>+0(SB), X0 // add the starting state back into both blocks
  1685	PADDL ·chacha20Constants<>+0(SB), X1
  1686	PADDL 32(BP), X3
  1687	PADDL 32(BP), X4
  1688	PADDL 48(BP), X6
  1689	PADDL 48(BP), X7
  1690	PADDL 96(BP), X9 // counter row stored for block X0/X3/X6/X9 by the setup code
  1691	PADDL 80(BP), X10 // counter row stored for block X1/X4/X7/X10 by the setup code
  1692	MOVOU (SI), X12 // XOR 64 input bytes with block X1/X4/X7/X10 (the lower counter) and store
  1693	MOVOU 16(SI), X13
  1694	MOVOU 32(SI), X14
  1695	MOVOU 48(SI), X15
  1696	PXOR  X12, X1
  1697	PXOR  X13, X4
  1698	PXOR  X14, X7
  1699	PXOR  X15, X10
  1700	MOVOU X1, (DI)
  1701	MOVOU X4, 16(DI)
  1702	MOVOU X7, 32(DI)
  1703	MOVOU X10, 48(DI)
  1704	SUBQ  $0x40, BX // 64 bytes consumed
  1705	LEAQ  64(SI), SI
  1706	LEAQ  64(DI), DI
  1707	JMP   openSSETail64DecLoop // the remaining block is already in X0/X3/X6/X9, as that loop expects
  1708
  1709openSSETail192: // Set up three ChaCha20 blocks (X2/X5/X8/X11, X1/X4/X7/X10, X0/X3/X6/X9) with consecutive counter rows, and the hash bound in CX.
  1710	MOVO    ·chacha20Constants<>+0(SB), X2
  1711	MOVO    32(BP), X5 // NOTE(review): 32(BP)/48(BP) presumably hold the saved key rows — confirm at the setup site
  1712	MOVO    48(BP), X8
  1713	MOVO    128(BP), X11 // counter row saved at 128(BP)
  1714	PADDL   ·sseIncMask<>+0(SB), X11
  1715	MOVO    X11, 80(BP) // counter row of block X2/X5/X8/X11
  1716	MOVO    X2, X1
  1717	MOVO    X5, X4
  1718	MOVO    X8, X7
  1719	MOVO    X11, X10
  1720	PADDL   ·sseIncMask<>+0(SB), X10
  1721	MOVO    X10, 96(BP) // counter row of block X1/X4/X7/X10
  1722	MOVO    X1, X0
  1723	MOVO    X4, X3
  1724	MOVO    X7, X6
  1725	MOVO    X10, X9
  1726	PADDL   ·sseIncMask<>+0(SB), X9
  1727	MOVO    X9, 112(BP) // counter row of block X0/X3/X6/X9
  1728	MOVQ    BX, CX
  1729	MOVQ    $0x000000a0, R9
  1730	CMPQ    CX, $0xa0
  1731	CMOVQGT R9, CX // CX = min(BX, 160); NOTE(review): signed condition on a length — harmless only if BX is small and non-negative here, confirm at the dispatch site
  1732	ANDQ    $-16, CX // round down to a multiple of 16: bytes to hash inside the round loop
  1733	XORQ    R9, R9 // R9 = 0: hash offset and round counter
  1734
  1735openSSLTail192LoopA: // Absorb one 16-byte input block at SI+R9 into the accumulator R10:R11:R12, multiply by the key words at 0(BP)/8(BP), and reduce.
  1736	ADDQ  (SI)(R9*1), R10 // acc += 16-byte block ...
  1737	ADCQ  8(SI)(R9*1), R11
  1738	ADCQ  $0x01, R12 // ... plus 1 in the third limb (the 2^128 bit), with carry
  1739	MOVQ  (BP), AX // AX = r0, the word at 0(BP)
  1740	MOVQ  AX, R15
  1741	MULQ  R10 // DX:AX = r0 * h0
  1742	MOVQ  AX, R13 // t0
  1743	MOVQ  DX, R14 // t1 (partial)
  1744	MOVQ  (BP), AX
  1745	MULQ  R11 // DX:AX = r0 * h1
  1746	IMULQ R12, R15 // R15 = r0 * h2 (low 64 bits)
  1747	ADDQ  AX, R14
  1748	ADCQ  DX, R15 // t2 (partial)
  1749	MOVQ  8(BP), AX // AX = r1, the word at 8(BP)
  1750	MOVQ  AX, R8
  1751	MULQ  R10 // DX:AX = r1 * h0
  1752	ADDQ  AX, R14
  1753	ADCQ  $0x00, DX
  1754	MOVQ  DX, R10 // R10 is free now; reuse it to carry the high half forward
  1755	MOVQ  8(BP), AX
  1756	MULQ  R11 // DX:AX = r1 * h1
  1757	ADDQ  AX, R15
  1758	ADCQ  $0x00, DX
  1759	IMULQ R12, R8 // R8 = r1 * h2 (low 64 bits)
  1760	ADDQ  R10, R15
  1761	ADCQ  DX, R8 // product is now t0..t3 = R13, R14, R15, R8
  1762	MOVQ  R13, R10 // reduction: keep the low 130 bits in R10:R11:R12 ...
  1763	MOVQ  R14, R11
  1764	MOVQ  R15, R12
  1765	ANDQ  $0x03, R12
  1766	MOVQ  R15, R13 // ... R14:R13 = the bits above 2^130, still scaled by 4
  1767	ANDQ  $-4, R13
  1768	MOVQ  R8, R14
  1769	SHRQ  $0x02, R8, R15 // R8:R15 = the same high bits shifted right by 2 (double-register shift)
  1770	SHRQ  $0x02, R8
  1771	ADDQ  R13, R10 // add 4*high ...
  1772	ADCQ  R14, R11
  1773	ADCQ  $0x00, R12
  1774	ADDQ  R15, R10 // ... and 1*high, i.e. 5*high in total
  1775	ADCQ  R8, R11
  1776	ADCQ  $0x00, R12
  1777
  1778openSSLTail192LoopB: // One ChaCha20 double round on three blocks (X0/X3/X6/X9, X1/X4/X7/X10, X2/X5/X8/X11), loop control, then up to two extra hash blocks at offsets 160 and 176.
  1779	ADDQ  $0x10, R9 // R9 doubles as hash offset and round counter; 16 per double round
  1780	PADDD X3, X0 // column quarter round, block X0/X3/X6/X9: a += b
  1781	PXOR  X0, X9 // d ^= a
  1782	ROL16(X9, X12) // d <<<= 16 (X12 is scratch)
  1783	PADDD X9, X6 // c += d
  1784	PXOR  X6, X3 // b ^= c
  1785	MOVO  X3, X12 // b <<<= 12
  1786	PSLLL $0x0c, X12
  1787	PSRLL $0x14, X3
  1788	PXOR  X12, X3
  1789	PADDD X3, X0
  1790	PXOR  X0, X9
  1791	ROL8(X9, X12) // d <<<= 8
  1792	PADDD X9, X6
  1793	PXOR  X6, X3
  1794	MOVO  X3, X12 // b <<<= 7
  1795	PSLLL $0x07, X12
  1796	PSRLL $0x19, X3
  1797	PXOR  X12, X3
  1798	PADDD X4, X1 // column quarter round, block X1/X4/X7/X10
  1799	PXOR  X1, X10
  1800	ROL16(X10, X12)
  1801	PADDD X10, X7
  1802	PXOR  X7, X4
  1803	MOVO  X4, X12
  1804	PSLLL $0x0c, X12
  1805	PSRLL $0x14, X4
  1806	PXOR  X12, X4
  1807	PADDD X4, X1
  1808	PXOR  X1, X10
  1809	ROL8(X10, X12)
  1810	PADDD X10, X7
  1811	PXOR  X7, X4
  1812	MOVO  X4, X12
  1813	PSLLL $0x07, X12
  1814	PSRLL $0x19, X4
  1815	PXOR  X12, X4
  1816	PADDD X5, X2 // column quarter round, block X2/X5/X8/X11
  1817	PXOR  X2, X11
  1818	ROL16(X11, X12)
  1819	PADDD X11, X8
  1820	PXOR  X8, X5
  1821	MOVO  X5, X12
  1822	PSLLL $0x0c, X12
  1823	PSRLL $0x14, X5
  1824	PXOR  X12, X5
  1825	PADDD X5, X2
  1826	PXOR  X2, X11
  1827	ROL8(X11, X12)
  1828	PADDD X11, X8
  1829	PXOR  X8, X5
  1830	MOVO  X5, X12
  1831	PSLLL $0x07, X12
  1832	PSRLL $0x19, X5
  1833	PXOR  X12, X5
  1834	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3 (rotate row b by one lane)
  1835	BYTE  $0x0f
  1836	BYTE  $0x3a
  1837	BYTE  $0x0f
  1838	BYTE  $0xdb
  1839	BYTE  $0x04
  1840	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6 (rotate row c by two lanes)
  1841	BYTE  $0x0f
  1842	BYTE  $0x3a
  1843	BYTE  $0x0f
  1844	BYTE  $0xf6
  1845	BYTE  $0x08
  1846	BYTE  $0x66 // hand-encoded PALIGNR $12, X9, X9 (rotate row d by three lanes)
  1847	BYTE  $0x45
  1848	BYTE  $0x0f
  1849	BYTE  $0x3a
  1850	BYTE  $0x0f
  1851	BYTE  $0xc9
  1852	BYTE  $0x0c
  1853	BYTE  $0x66 // hand-encoded PALIGNR $4, X4, X4
  1854	BYTE  $0x0f
  1855	BYTE  $0x3a
  1856	BYTE  $0x0f
  1857	BYTE  $0xe4
  1858	BYTE  $0x04
  1859	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  1860	BYTE  $0x0f
  1861	BYTE  $0x3a
  1862	BYTE  $0x0f
  1863	BYTE  $0xff
  1864	BYTE  $0x08
  1865	BYTE  $0x66 // hand-encoded PALIGNR $12, X10, X10
  1866	BYTE  $0x45
  1867	BYTE  $0x0f
  1868	BYTE  $0x3a
  1869	BYTE  $0x0f
  1870	BYTE  $0xd2
  1871	BYTE  $0x0c
  1872	BYTE  $0x66 // hand-encoded PALIGNR $4, X5, X5
  1873	BYTE  $0x0f
  1874	BYTE  $0x3a
  1875	BYTE  $0x0f
  1876	BYTE  $0xed
  1877	BYTE  $0x04
  1878	BYTE  $0x66 // hand-encoded PALIGNR $8, X8, X8
  1879	BYTE  $0x45
  1880	BYTE  $0x0f
  1881	BYTE  $0x3a
  1882	BYTE  $0x0f
  1883	BYTE  $0xc0
  1884	BYTE  $0x08
  1885	BYTE  $0x66 // hand-encoded PALIGNR $12, X11, X11
  1886	BYTE  $0x45
  1887	BYTE  $0x0f
  1888	BYTE  $0x3a
  1889	BYTE  $0x0f
  1890	BYTE  $0xdb
  1891	BYTE  $0x0c
  1892	PADDD X3, X0 // diagonal quarter round, block X0/X3/X6/X9
  1893	PXOR  X0, X9
  1894	ROL16(X9, X12)
  1895	PADDD X9, X6
  1896	PXOR  X6, X3
  1897	MOVO  X3, X12
  1898	PSLLL $0x0c, X12
  1899	PSRLL $0x14, X3
  1900	PXOR  X12, X3
  1901	PADDD X3, X0
  1902	PXOR  X0, X9
  1903	ROL8(X9, X12)
  1904	PADDD X9, X6
  1905	PXOR  X6, X3
  1906	MOVO  X3, X12
  1907	PSLLL $0x07, X12
  1908	PSRLL $0x19, X3
  1909	PXOR  X12, X3
  1910	PADDD X4, X1 // diagonal quarter round, block X1/X4/X7/X10
  1911	PXOR  X1, X10
  1912	ROL16(X10, X12)
  1913	PADDD X10, X7
  1914	PXOR  X7, X4
  1915	MOVO  X4, X12
  1916	PSLLL $0x0c, X12
  1917	PSRLL $0x14, X4
  1918	PXOR  X12, X4
  1919	PADDD X4, X1
  1920	PXOR  X1, X10
  1921	ROL8(X10, X12)
  1922	PADDD X10, X7
  1923	PXOR  X7, X4
  1924	MOVO  X4, X12
  1925	PSLLL $0x07, X12
  1926	PSRLL $0x19, X4
  1927	PXOR  X12, X4
  1928	PADDD X5, X2 // diagonal quarter round, block X2/X5/X8/X11
  1929	PXOR  X2, X11
  1930	ROL16(X11, X12)
  1931	PADDD X11, X8
  1932	PXOR  X8, X5
  1933	MOVO  X5, X12
  1934	PSLLL $0x0c, X12
  1935	PSRLL $0x14, X5
  1936	PXOR  X12, X5
  1937	PADDD X5, X2
  1938	PXOR  X2, X11
  1939	ROL8(X11, X12)
  1940	PADDD X11, X8
  1941	PXOR  X8, X5
  1942	MOVO  X5, X12
  1943	PSLLL $0x07, X12
  1944	PSRLL $0x19, X5
  1945	PXOR  X12, X5
  1946	BYTE  $0x66 // hand-encoded PALIGNR $12, X3, X3 (undo the lane rotations)
  1947	BYTE  $0x0f
  1948	BYTE  $0x3a
  1949	BYTE  $0x0f
  1950	BYTE  $0xdb
  1951	BYTE  $0x0c
  1952	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6
  1953	BYTE  $0x0f
  1954	BYTE  $0x3a
  1955	BYTE  $0x0f
  1956	BYTE  $0xf6
  1957	BYTE  $0x08
  1958	BYTE  $0x66 // hand-encoded PALIGNR $4, X9, X9
  1959	BYTE  $0x45
  1960	BYTE  $0x0f
  1961	BYTE  $0x3a
  1962	BYTE  $0x0f
  1963	BYTE  $0xc9
  1964	BYTE  $0x04
  1965	BYTE  $0x66 // hand-encoded PALIGNR $12, X4, X4
  1966	BYTE  $0x0f
  1967	BYTE  $0x3a
  1968	BYTE  $0x0f
  1969	BYTE  $0xe4
  1970	BYTE  $0x0c
  1971	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  1972	BYTE  $0x0f
  1973	BYTE  $0x3a
  1974	BYTE  $0x0f
  1975	BYTE  $0xff
  1976	BYTE  $0x08
  1977	BYTE  $0x66 // hand-encoded PALIGNR $4, X10, X10
  1978	BYTE  $0x45
  1979	BYTE  $0x0f
  1980	BYTE  $0x3a
  1981	BYTE  $0x0f
  1982	BYTE  $0xd2
  1983	BYTE  $0x04
  1984	BYTE  $0x66 // hand-encoded PALIGNR $12, X5, X5
  1985	BYTE  $0x0f
  1986	BYTE  $0x3a
  1987	BYTE  $0x0f
  1988	BYTE  $0xed
  1989	BYTE  $0x0c
  1990	BYTE  $0x66 // hand-encoded PALIGNR $8, X8, X8
  1991	BYTE  $0x45
  1992	BYTE  $0x0f
  1993	BYTE  $0x3a
  1994	BYTE  $0x0f
  1995	BYTE  $0xc0
  1996	BYTE  $0x08
  1997	BYTE  $0x66 // hand-encoded PALIGNR $4, X11, X11
  1998	BYTE  $0x45
  1999	BYTE  $0x0f
  2000	BYTE  $0x3a
  2001	BYTE  $0x0f
  2002	BYTE  $0xdb
  2003	BYTE  $0x04
  2004	CMPQ  R9, CX // still whole input blocks (up to the CX bound) to hash? then hash one and do another double round
  2005	JB    openSSLTail192LoopA
  2006	CMPQ  R9, $0xa0 // otherwise rounds only, until 10 double rounds (R9 == 160)
  2007	JNE   openSSLTail192LoopB
  2008	CMPQ  BX, $0xb0 // the round loop hashed at most 160 bytes; if BX >= 176, hash the block at offset 160 as well
  2009	JB    openSSLTail192Store
  2010	ADDQ  160(SI), R10 // acc += 16-byte block at SI+160, plus the 2^128 bit
  2011	ADCQ  168(SI), R11
  2012	ADCQ  $0x01, R12
  2013	MOVQ  (BP), AX // multiply the accumulator by the key words at 0(BP)/8(BP)
  2014	MOVQ  AX, R15
  2015	MULQ  R10
  2016	MOVQ  AX, R13
  2017	MOVQ  DX, R14
  2018	MOVQ  (BP), AX
  2019	MULQ  R11
  2020	IMULQ R12, R15
  2021	ADDQ  AX, R14
  2022	ADCQ  DX, R15
  2023	MOVQ  8(BP), AX
  2024	MOVQ  AX, R8
  2025	MULQ  R10
  2026	ADDQ  AX, R14
  2027	ADCQ  $0x00, DX
  2028	MOVQ  DX, R10
  2029	MOVQ  8(BP), AX
  2030	MULQ  R11
  2031	ADDQ  AX, R15
  2032	ADCQ  $0x00, DX
  2033	IMULQ R12, R8
  2034	ADDQ  R10, R15
  2035	ADCQ  DX, R8
  2036	MOVQ  R13, R10 // reduce: low 130 bits stay, the bits above are added back as 4*high + high
  2037	MOVQ  R14, R11
  2038	MOVQ  R15, R12
  2039	ANDQ  $0x03, R12
  2040	MOVQ  R15, R13
  2041	ANDQ  $-4, R13
  2042	MOVQ  R8, R14
  2043	SHRQ  $0x02, R8, R15
  2044	SHRQ  $0x02, R8
  2045	ADDQ  R13, R10
  2046	ADCQ  R14, R11
  2047	ADCQ  $0x00, R12
  2048	ADDQ  R15, R10
  2049	ADCQ  R8, R11
  2050	ADCQ  $0x00, R12
  2051	CMPQ  BX, $0xc0 // if BX >= 192, hash the block at offset 176 too
  2052	JB    openSSLTail192Store
  2053	ADDQ  176(SI), R10 // acc += 16-byte block at SI+176, plus the 2^128 bit
  2054	ADCQ  184(SI), R11
  2055	ADCQ  $0x01, R12
  2056	MOVQ  (BP), AX // same multiply-and-reduce as above
  2057	MOVQ  AX, R15
  2058	MULQ  R10
  2059	MOVQ  AX, R13
  2060	MOVQ  DX, R14
  2061	MOVQ  (BP), AX
  2062	MULQ  R11
  2063	IMULQ R12, R15
  2064	ADDQ  AX, R14
  2065	ADCQ  DX, R15
  2066	MOVQ  8(BP), AX
  2067	MOVQ  AX, R8
  2068	MULQ  R10
  2069	ADDQ  AX, R14
  2070	ADCQ  $0x00, DX
  2071	MOVQ  DX, R10
  2072	MOVQ  8(BP), AX
  2073	MULQ  R11
  2074	ADDQ  AX, R15
  2075	ADCQ  $0x00, DX
  2076	IMULQ R12, R8
  2077	ADDQ  R10, R15
  2078	ADCQ  DX, R8
  2079	MOVQ  R13, R10
  2080	MOVQ  R14, R11
  2081	MOVQ  R15, R12
  2082	ANDQ  $0x03, R12
  2083	MOVQ  R15, R13
  2084	ANDQ  $-4, R13
  2085	MOVQ  R8, R14
  2086	SHRQ  $0x02, R8, R15
  2087	SHRQ  $0x02, R8
  2088	ADDQ  R13, R10
  2089	ADCQ  R14, R11
  2090	ADCQ  $0x00, R12
  2091	ADDQ  R15, R10
  2092	ADCQ  R8, R11
  2093	ADCQ  $0x00, R12
  2094
  2095openSSLTail192Store: // Add the starting state to all three blocks, XOR and store the first 128 bytes, and leave the third block for the 16-byte loop.
  2096	PADDL ·chacha20Constants<>+0(SB), X0
  2097	PADDL ·chacha20Constants<>+0(SB), X1
  2098	PADDL ·chacha20Constants<>+0(SB), X2
  2099	PADDL 32(BP), X3
  2100	PADDL 32(BP), X4
  2101	PADDL 32(BP), X5
  2102	PADDL 48(BP), X6
  2103	PADDL 48(BP), X7
  2104	PADDL 48(BP), X8
  2105	PADDL 112(BP), X9 // each block gets back the counter row that the setup code stored for it
  2106	PADDL 96(BP), X10
  2107	PADDL 80(BP), X11
  2108	MOVOU (SI), X12 // bytes 0..63: XOR with block X2/X5/X8/X11 (the lowest counter)
  2109	MOVOU 16(SI), X13
  2110	MOVOU 32(SI), X14
  2111	MOVOU 48(SI), X15
  2112	PXOR  X12, X2
  2113	PXOR  X13, X5
  2114	PXOR  X14, X8
  2115	PXOR  X15, X11
  2116	MOVOU X2, (DI)
  2117	MOVOU X5, 16(DI)
  2118	MOVOU X8, 32(DI)
  2119	MOVOU X11, 48(DI)
  2120	MOVOU 64(SI), X12 // bytes 64..127: XOR with block X1/X4/X7/X10
  2121	MOVOU 80(SI), X13
  2122	MOVOU 96(SI), X14
  2123	MOVOU 112(SI), X15
  2124	PXOR  X12, X1
  2125	PXOR  X13, X4
  2126	PXOR  X14, X7
  2127	PXOR  X15, X10
  2128	MOVOU X1, 64(DI)
  2129	MOVOU X4, 80(DI)
  2130	MOVOU X7, 96(DI)
  2131	MOVOU X10, 112(DI)
  2132	SUBQ  $0x80, BX // 128 bytes consumed
  2133	LEAQ  128(SI), SI
  2134	LEAQ  128(DI), DI
  2135	JMP   openSSETail64DecLoop // block X0/X3/X6/X9 covers whatever is left
  2136
  2137openSSETail256: // Set up four ChaCha20 blocks (X0/X3/X6/X9, X1/X4/X7/X10, X2/X5/X8/X11, X12/X13/X14/X15) with consecutive counter rows.
  2138	MOVO  ·chacha20Constants<>+0(SB), X0
  2139	MOVO  32(BP), X3 // NOTE(review): 32(BP)/48(BP) presumably hold the saved key rows — confirm at the setup site
  2140	MOVO  48(BP), X6
  2141	MOVO  128(BP), X9 // counter row saved at 128(BP)
  2142	PADDL ·sseIncMask<>+0(SB), X9 // first block's counter row
  2143	MOVO  X0, X1
  2144	MOVO  X3, X4
  2145	MOVO  X6, X7
  2146	MOVO  X9, X10
  2147	PADDL ·sseIncMask<>+0(SB), X10 // second block's counter row
  2148	MOVO  X1, X2
  2149	MOVO  X4, X5
  2150	MOVO  X7, X8
  2151	MOVO  X10, X11
  2152	PADDL ·sseIncMask<>+0(SB), X11 // third block's counter row
  2153	MOVO  X2, X12
  2154	MOVO  X5, X13
  2155	MOVO  X8, X14
  2156	MOVO  X11, X15
  2157	PADDL ·sseIncMask<>+0(SB), X15 // fourth block's counter row
  2158
  2159	// Store counters
  2160	MOVO X9, 80(BP)
  2161	MOVO X10, 96(BP)
  2162	MOVO X11, 112(BP)
  2163	MOVO X15, 128(BP) // 128(BP) is overwritten with the highest counter row
  2164	XORQ R9, R9 // R9 = 0: hash offset and round counter
  2165
  2166openSSETail256Loop: // Per pass: absorb one 16-byte input block at SI+R9 into the hash accumulator, interleaved with one ChaCha20 double round on all four blocks.
  2167	ADDQ  (SI)(R9*1), R10 // acc (R10:R11:R12) += 16-byte block, plus the 2^128 bit
  2168	ADCQ  8(SI)(R9*1), R11
  2169	ADCQ  $0x01, R12
  2170	MOVO  X14, 64(BP) // all 16 XMM registers hold state: spill X14 to 64(BP) so it can serve as rotate scratch
  2171	PADDD X3, X0 // column quarter round, block X0/X3/X6/X9
  2172	PXOR  X0, X9
  2173	ROL16(X9, X14)
  2174	PADDD X9, X6
  2175	PXOR  X6, X3
  2176	MOVO  X3, X14 // b <<<= 12
  2177	PSLLL $0x0c, X14
  2178	PSRLL $0x14, X3
  2179	PXOR  X14, X3
  2180	PADDD X3, X0
  2181	PXOR  X0, X9
  2182	ROL8(X9, X14)
  2183	PADDD X9, X6
  2184	PXOR  X6, X3
  2185	MOVO  X3, X14 // b <<<= 7
  2186	PSLLL $0x07, X14
  2187	PSRLL $0x19, X3
  2188	PXOR  X14, X3
  2189	PADDD X4, X1 // column quarter round, block X1/X4/X7/X10
  2190	PXOR  X1, X10
  2191	ROL16(X10, X14)
  2192	PADDD X10, X7
  2193	PXOR  X7, X4
  2194	MOVO  X4, X14
  2195	PSLLL $0x0c, X14
  2196	PSRLL $0x14, X4
  2197	PXOR  X14, X4
  2198	PADDD X4, X1
  2199	PXOR  X1, X10
  2200	ROL8(X10, X14)
  2201	PADDD X10, X7
  2202	PXOR  X7, X4
  2203	MOVO  X4, X14
  2204	PSLLL $0x07, X14
  2205	PSRLL $0x19, X4
  2206	PXOR  X14, X4
  2207	PADDD X5, X2 // column quarter round, block X2/X5/X8/X11
  2208	PXOR  X2, X11
  2209	ROL16(X11, X14)
  2210	PADDD X11, X8
  2211	PXOR  X8, X5
  2212	MOVO  X5, X14
  2213	PSLLL $0x0c, X14
  2214	PSRLL $0x14, X5
  2215	PXOR  X14, X5
  2216	PADDD X5, X2
  2217	PXOR  X2, X11
  2218	ROL8(X11, X14)
  2219	PADDD X11, X8
  2220	PXOR  X8, X5
  2221	MOVO  X5, X14
  2222	PSLLL $0x07, X14
  2223	PSRLL $0x19, X5
  2224	PXOR  X14, X5
  2225	MOVO  64(BP), X14 // restore X14 (row c of the fourth block) ...
  2226	MOVO  X7, 64(BP) // ... and spill X7 instead, to use it as scratch for the fourth block
  2227	PADDD X13, X12 // column quarter round, block X12/X13/X14/X15
  2228	PXOR  X12, X15
  2229	ROL16(X15, X7)
  2230	PADDD X15, X14
  2231	PXOR  X14, X13
  2232	MOVO  X13, X7
  2233	PSLLL $0x0c, X7
  2234	PSRLL $0x14, X13
  2235	PXOR  X7, X13
  2236	PADDD X13, X12
  2237	PXOR  X12, X15
  2238	ROL8(X15, X7)
  2239	PADDD X15, X14
  2240	PXOR  X14, X13
  2241	MOVO  X13, X7
  2242	PSLLL $0x07, X7
  2243	PSRLL $0x19, X13
  2244	PXOR  X7, X13
  2245	MOVO  64(BP), X7 // restore X7
  2246	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3 (rows b of all four blocks rotate by one lane)
  2247	BYTE  $0x0f
  2248	BYTE  $0x3a
  2249	BYTE  $0x0f
  2250	BYTE  $0xdb
  2251	BYTE  $0x04
  2252	BYTE  $0x66 // hand-encoded PALIGNR $4, X4, X4
  2253	BYTE  $0x0f
  2254	BYTE  $0x3a
  2255	BYTE  $0x0f
  2256	BYTE  $0xe4
  2257	BYTE  $0x04
  2258	BYTE  $0x66 // hand-encoded PALIGNR $4, X5, X5
  2259	BYTE  $0x0f
  2260	BYTE  $0x3a
  2261	BYTE  $0x0f
  2262	BYTE  $0xed
  2263	BYTE  $0x04
  2264	BYTE  $0x66 // hand-encoded PALIGNR $4, X13, X13
  2265	BYTE  $0x45
  2266	BYTE  $0x0f
  2267	BYTE  $0x3a
  2268	BYTE  $0x0f
  2269	BYTE  $0xed
  2270	BYTE  $0x04
  2271	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6 (rows c rotate by two lanes)
  2272	BYTE  $0x0f
  2273	BYTE  $0x3a
  2274	BYTE  $0x0f
  2275	BYTE  $0xf6
  2276	BYTE  $0x08
  2277	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  2278	BYTE  $0x0f
  2279	BYTE  $0x3a
  2280	BYTE  $0x0f
  2281	BYTE  $0xff
  2282	BYTE  $0x08
  2283	BYTE  $0x66 // hand-encoded PALIGNR $8, X8, X8
  2284	BYTE  $0x45
  2285	BYTE  $0x0f
  2286	BYTE  $0x3a
  2287	BYTE  $0x0f
  2288	BYTE  $0xc0
  2289	BYTE  $0x08
  2290	BYTE  $0x66 // hand-encoded PALIGNR $8, X14, X14
  2291	BYTE  $0x45
  2292	BYTE  $0x0f
  2293	BYTE  $0x3a
  2294	BYTE  $0x0f
  2295	BYTE  $0xf6
  2296	BYTE  $0x08
  2297	BYTE  $0x66 // hand-encoded PALIGNR $12, X9, X9 (rows d rotate by three lanes)
  2298	BYTE  $0x45
  2299	BYTE  $0x0f
  2300	BYTE  $0x3a
  2301	BYTE  $0x0f
  2302	BYTE  $0xc9
  2303	BYTE  $0x0c
  2304	BYTE  $0x66 // hand-encoded PALIGNR $12, X10, X10
  2305	BYTE  $0x45
  2306	BYTE  $0x0f
  2307	BYTE  $0x3a
  2308	BYTE  $0x0f
  2309	BYTE  $0xd2
  2310	BYTE  $0x0c
  2311	BYTE  $0x66 // hand-encoded PALIGNR $12, X11, X11
  2312	BYTE  $0x45
  2313	BYTE  $0x0f
  2314	BYTE  $0x3a
  2315	BYTE  $0x0f
  2316	BYTE  $0xdb
  2317	BYTE  $0x0c
  2318	BYTE  $0x66 // hand-encoded PALIGNR $12, X15, X15
  2319	BYTE  $0x45
  2320	BYTE  $0x0f
  2321	BYTE  $0x3a
  2322	BYTE  $0x0f
  2323	BYTE  $0xff
  2324	BYTE  $0x0c
  2325	MOVQ  (BP), AX // hash multiply, first part: AX = r0, the word at 0(BP)
  2326	MOVQ  AX, R15
  2327	MULQ  R10 // DX:AX = r0 * h0
  2328	MOVQ  AX, R13
  2329	MOVQ  DX, R14
  2330	MOVQ  (BP), AX
  2331	MULQ  R11 // DX:AX = r0 * h1
  2332	IMULQ R12, R15 // r0 * h2 (low 64 bits)
  2333	ADDQ  AX, R14
  2334	ADCQ  DX, R15
  2335	MOVQ  8(BP), AX // AX = r1, the word at 8(BP)
  2336	MOVQ  AX, R8
  2337	MULQ  R10 // DX:AX = r1 * h0
  2338	ADDQ  AX, R14
  2339	ADCQ  $0x00, DX
  2340	MOVQ  DX, R10
  2341	MOVQ  8(BP), AX
  2342	MULQ  R11 // DX:AX = r1 * h1
  2343	ADDQ  AX, R15
  2344	ADCQ  $0x00, DX // DX and the carry-free state stay live across the SSE block below (SSE ops do not touch GPRs or flags)
  2345	MOVO  X14, 64(BP) // spill X14 again for use as scratch
  2346	PADDD X3, X0 // diagonal quarter round, block X0/X3/X6/X9
  2347	PXOR  X0, X9
  2348	ROL16(X9, X14)
  2349	PADDD X9, X6
  2350	PXOR  X6, X3
  2351	MOVO  X3, X14
  2352	PSLLL $0x0c, X14
  2353	PSRLL $0x14, X3
  2354	PXOR  X14, X3
  2355	PADDD X3, X0
  2356	PXOR  X0, X9
  2357	ROL8(X9, X14)
  2358	PADDD X9, X6
  2359	PXOR  X6, X3
  2360	MOVO  X3, X14
  2361	PSLLL $0x07, X14
  2362	PSRLL $0x19, X3
  2363	PXOR  X14, X3
  2364	PADDD X4, X1 // diagonal quarter round, block X1/X4/X7/X10
  2365	PXOR  X1, X10
  2366	ROL16(X10, X14)
  2367	PADDD X10, X7
  2368	PXOR  X7, X4
  2369	MOVO  X4, X14
  2370	PSLLL $0x0c, X14
  2371	PSRLL $0x14, X4
  2372	PXOR  X14, X4
  2373	PADDD X4, X1
  2374	PXOR  X1, X10
  2375	ROL8(X10, X14)
  2376	PADDD X10, X7
  2377	PXOR  X7, X4
  2378	MOVO  X4, X14
  2379	PSLLL $0x07, X14
  2380	PSRLL $0x19, X4
  2381	PXOR  X14, X4
  2382	PADDD X5, X2 // diagonal quarter round, block X2/X5/X8/X11
  2383	PXOR  X2, X11
  2384	ROL16(X11, X14)
  2385	PADDD X11, X8
  2386	PXOR  X8, X5
  2387	MOVO  X5, X14
  2388	PSLLL $0x0c, X14
  2389	PSRLL $0x14, X5
  2390	PXOR  X14, X5
  2391	PADDD X5, X2
  2392	PXOR  X2, X11
  2393	ROL8(X11, X14)
  2394	PADDD X11, X8
  2395	PXOR  X8, X5
  2396	MOVO  X5, X14
  2397	PSLLL $0x07, X14
  2398	PSRLL $0x19, X5
  2399	PXOR  X14, X5
  2400	MOVO  64(BP), X14 // restore X14 ...
  2401	MOVO  X7, 64(BP) // ... and spill X7 for the fourth block
  2402	PADDD X13, X12 // diagonal quarter round, block X12/X13/X14/X15
  2403	PXOR  X12, X15
  2404	ROL16(X15, X7)
  2405	PADDD X15, X14
  2406	PXOR  X14, X13
  2407	MOVO  X13, X7
  2408	PSLLL $0x0c, X7
  2409	PSRLL $0x14, X13
  2410	PXOR  X7, X13
  2411	PADDD X13, X12
  2412	PXOR  X12, X15
  2413	ROL8(X15, X7)
  2414	PADDD X15, X14
  2415	PXOR  X14, X13
  2416	MOVO  X13, X7
  2417	PSLLL $0x07, X7
  2418	PSRLL $0x19, X13
  2419	PXOR  X7, X13
  2420	MOVO  64(BP), X7 // restore X7
  2421	IMULQ R12, R8 // hash multiply, second part: R8 = r1 * h2 (low 64 bits)
  2422	ADDQ  R10, R15
  2423	ADCQ  DX, R8 // product is now t0..t3 = R13, R14, R15, R8
  2424	MOVQ  R13, R10 // reduce: low 130 bits stay, the bits above are added back as 4*high + high
  2425	MOVQ  R14, R11
  2426	MOVQ  R15, R12
  2427	ANDQ  $0x03, R12
  2428	MOVQ  R15, R13
  2429	ANDQ  $-4, R13
  2430	MOVQ  R8, R14
  2431	SHRQ  $0x02, R8, R15
  2432	SHRQ  $0x02, R8
  2433	ADDQ  R13, R10
  2434	ADCQ  R14, R11
  2435	ADCQ  $0x00, R12
  2436	ADDQ  R15, R10
  2437	ADCQ  R8, R11
  2438	ADCQ  $0x00, R12
  2439	BYTE  $0x66 // hand-encoded PALIGNR $12, X3, X3 (undo the lane rotations: rows b)
  2440	BYTE  $0x0f
  2441	BYTE  $0x3a
  2442	BYTE  $0x0f
  2443	BYTE  $0xdb
  2444	BYTE  $0x0c
  2445	BYTE  $0x66 // hand-encoded PALIGNR $12, X4, X4
  2446	BYTE  $0x0f
  2447	BYTE  $0x3a
  2448	BYTE  $0x0f
  2449	BYTE  $0xe4
  2450	BYTE  $0x0c
  2451	BYTE  $0x66 // hand-encoded PALIGNR $12, X5, X5
  2452	BYTE  $0x0f
  2453	BYTE  $0x3a
  2454	BYTE  $0x0f
  2455	BYTE  $0xed
  2456	BYTE  $0x0c
  2457	BYTE  $0x66 // hand-encoded PALIGNR $12, X13, X13
  2458	BYTE  $0x45
  2459	BYTE  $0x0f
  2460	BYTE  $0x3a
  2461	BYTE  $0x0f
  2462	BYTE  $0xed
  2463	BYTE  $0x0c
  2464	BYTE  $0x66 // hand-encoded PALIGNR $8, X6, X6 (rows c)
  2465	BYTE  $0x0f
  2466	BYTE  $0x3a
  2467	BYTE  $0x0f
  2468	BYTE  $0xf6
  2469	BYTE  $0x08
  2470	BYTE  $0x66 // hand-encoded PALIGNR $8, X7, X7
  2471	BYTE  $0x0f
  2472	BYTE  $0x3a
  2473	BYTE  $0x0f
  2474	BYTE  $0xff
  2475	BYTE  $0x08
  2476	BYTE  $0x66 // hand-encoded PALIGNR $8, X8, X8
  2477	BYTE  $0x45
  2478	BYTE  $0x0f
  2479	BYTE  $0x3a
  2480	BYTE  $0x0f
  2481	BYTE  $0xc0
  2482	BYTE  $0x08
  2483	BYTE  $0x66 // hand-encoded PALIGNR $8, X14, X14
  2484	BYTE  $0x45
  2485	BYTE  $0x0f
  2486	BYTE  $0x3a
  2487	BYTE  $0x0f
  2488	BYTE  $0xf6
  2489	BYTE  $0x08
  2490	BYTE  $0x66 // hand-encoded PALIGNR $4, X9, X9 (rows d)
  2491	BYTE  $0x45
  2492	BYTE  $0x0f
  2493	BYTE  $0x3a
  2494	BYTE  $0x0f
  2495	BYTE  $0xc9
  2496	BYTE  $0x04
  2497	BYTE  $0x66 // hand-encoded PALIGNR $4, X10, X10
  2498	BYTE  $0x45
  2499	BYTE  $0x0f
  2500	BYTE  $0x3a
  2501	BYTE  $0x0f
  2502	BYTE  $0xd2
  2503	BYTE  $0x04
  2504	BYTE  $0x66 // hand-encoded PALIGNR $4, X11, X11
  2505	BYTE  $0x45
  2506	BYTE  $0x0f
  2507	BYTE  $0x3a
  2508	BYTE  $0x0f
  2509	BYTE  $0xdb
  2510	BYTE  $0x04
  2511	BYTE  $0x66 // hand-encoded PALIGNR $4, X15, X15
  2512	BYTE  $0x45
  2513	BYTE  $0x0f
  2514	BYTE  $0x3a
  2515	BYTE  $0x0f
  2516	BYTE  $0xff
  2517	BYTE  $0x04
  2518	ADDQ  $0x10, R9
  2519	CMPQ  R9, $0xa0 // 10 passes: 160 bytes hashed and 10 double rounds done
  2520	JB    openSSETail256Loop
  2521	MOVQ  BX, CX
  2522	ANDQ  $-16, CX // CX = BX rounded down to a multiple of 16: end offset for the hash-only loop that follows
  2523
  2524openSSETail256HashLoop: // Hash the remaining whole 16-byte blocks from offset R9 up to CX, then finish the four ChaCha20 blocks and emit 192 bytes.
  2525	ADDQ  (SI)(R9*1), R10 // NOTE(review): the body runs before the bound check, so at least one block at offset R9 is always hashed — confirm the dispatch guarantees CX > R9 on entry
  2526	ADCQ  8(SI)(R9*1), R11
  2527	ADCQ  $0x01, R12 // add the 2^128 bit, with carry
  2528	MOVQ  (BP), AX // multiply the accumulator R10:R11:R12 by the key words at 0(BP)/8(BP)
  2529	MOVQ  AX, R15
  2530	MULQ  R10
  2531	MOVQ  AX, R13
  2532	MOVQ  DX, R14
  2533	MOVQ  (BP), AX
  2534	MULQ  R11
  2535	IMULQ R12, R15
  2536	ADDQ  AX, R14
  2537	ADCQ  DX, R15
  2538	MOVQ  8(BP), AX
  2539	MOVQ  AX, R8
  2540	MULQ  R10
  2541	ADDQ  AX, R14
  2542	ADCQ  $0x00, DX
  2543	MOVQ  DX, R10
  2544	MOVQ  8(BP), AX
  2545	MULQ  R11
  2546	ADDQ  AX, R15
  2547	ADCQ  $0x00, DX
  2548	IMULQ R12, R8
  2549	ADDQ  R10, R15
  2550	ADCQ  DX, R8
  2551	MOVQ  R13, R10 // reduce: low 130 bits stay, the bits above are added back as 4*high + high
  2552	MOVQ  R14, R11
  2553	MOVQ  R15, R12
  2554	ANDQ  $0x03, R12
  2555	MOVQ  R15, R13
  2556	ANDQ  $-4, R13
  2557	MOVQ  R8, R14
  2558	SHRQ  $0x02, R8, R15
  2559	SHRQ  $0x02, R8
  2560	ADDQ  R13, R10
  2561	ADCQ  R14, R11
  2562	ADCQ  $0x00, R12
  2563	ADDQ  R15, R10
  2564	ADCQ  R8, R11
  2565	ADCQ  $0x00, R12
  2566	ADDQ  $0x10, R9
  2567	CMPQ  R9, CX
  2568	JB    openSSETail256HashLoop
  2569
  2570	// Add in the state
  2571	PADDD ·chacha20Constants<>+0(SB), X0
  2572	PADDD ·chacha20Constants<>+0(SB), X1
  2573	PADDD ·chacha20Constants<>+0(SB), X2
  2574	PADDD ·chacha20Constants<>+0(SB), X12
  2575	PADDD 32(BP), X3
  2576	PADDD 32(BP), X4
  2577	PADDD 32(BP), X5
  2578	PADDD 32(BP), X13
  2579	PADDD 48(BP), X6
  2580	PADDD 48(BP), X7
  2581	PADDD 48(BP), X8
  2582	PADDD 48(BP), X14
  2583	PADDD 80(BP), X9 // counter rows as stored by the setup code, one per block
  2584	PADDD 96(BP), X10
  2585	PADDD 112(BP), X11
  2586	PADDD 128(BP), X15
  2587	MOVO  X15, 64(BP) // park the fourth block's row d so X15 can be used as a load temporary
  2588
  2589	// Load - xor - store
  2590	MOVOU (SI), X15 // bytes 0..63 use block X0/X3/X6/X9
  2591	PXOR  X15, X0
  2592	MOVOU 16(SI), X15
  2593	PXOR  X15, X3
  2594	MOVOU 32(SI), X15
  2595	PXOR  X15, X6
  2596	MOVOU 48(SI), X15
  2597	PXOR  X15, X9
  2598	MOVOU X0, (DI)
  2599	MOVOU X3, 16(DI)
  2600	MOVOU X6, 32(DI)
  2601	MOVOU X9, 48(DI)
  2602	MOVOU 64(SI), X0 // bytes 64..127 use block X1/X4/X7/X10; X0/X3/X6/X9 are free now and serve as load temporaries
  2603	MOVOU 80(SI), X3
  2604	MOVOU 96(SI), X6
  2605	MOVOU 112(SI), X9
  2606	PXOR  X0, X1
  2607	PXOR  X3, X4
  2608	PXOR  X6, X7
  2609	PXOR  X9, X10
  2610	MOVOU X1, 64(DI)
  2611	MOVOU X4, 80(DI)
  2612	MOVOU X7, 96(DI)
  2613	MOVOU X10, 112(DI)
  2614	MOVOU 128(SI), X0 // bytes 128..191 use block X2/X5/X8/X11
  2615	MOVOU 144(SI), X3
  2616	MOVOU 160(SI), X6
  2617	MOVOU 176(SI), X9
  2618	PXOR  X0, X2
  2619	PXOR  X3, X5
  2620	PXOR  X6, X8
  2621	PXOR  X9, X11
  2622	MOVOU X2, 128(DI)
  2623	MOVOU X5, 144(DI)
  2624	MOVOU X8, 160(DI)
  2625	MOVOU X11, 176(DI)
  2626	LEAQ  192(SI), SI
  2627	LEAQ  192(DI), DI
  2628	SUBQ  $0xc0, BX // 192 bytes consumed
  2629	MOVO  X12, X0 // move the fourth block into X0/X3/X6/X9, where the 16-byte loop expects it
  2630	MOVO  X13, X3
  2631	MOVO  X14, X6
  2632	MOVO  64(BP), X9 // row d was parked at 64(BP) above
  2633	JMP   openSSETail64DecLoop
  2634
  2635chacha20Poly1305Open_AVX2: // AVX2 entry: build the starting state in YMM registers, dispatch short inputs, otherwise save the state and prepare 10 double rounds.
  2636	VZEROUPPER
  2637	VMOVDQU ·chacha20Constants<>+0(SB), Y0
  2638	BYTE    $0xc4 // hand-encoded VBROADCASTI128 16(R8), Y14; NOTE(review): R8 presumably points at the key/state words — confirm at the function entry
  2639	BYTE    $0x42
  2640	BYTE    $0x7d
  2641	BYTE    $0x5a
  2642	BYTE    $0x70
  2643	BYTE    $0x10
  2644	BYTE    $0xc4 // hand-encoded VBROADCASTI128 32(R8), Y12
  2645	BYTE    $0x42
  2646	BYTE    $0x7d
  2647	BYTE    $0x5a
  2648	BYTE    $0x60
  2649	BYTE    $0x20
  2650	BYTE    $0xc4 // hand-encoded VBROADCASTI128 48(R8), Y4
  2651	BYTE    $0xc2
  2652	BYTE    $0x7d
  2653	BYTE    $0x5a
  2654	BYTE    $0x60
  2655	BYTE    $0x30
  2656	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4 // adjust the broadcast row by avx2InitMask; presumably gives the two 128-bit lanes different block counters — confirm against the mask's definition
  2657
  2658	// Special optimization, for very short buffers
  2659	CMPQ BX, $0xc0 // BX <= 192 bytes (unsigned)
  2660	JBE  openAVX2192
  2661	CMPQ BX, $0x00000140 // BX <= 320 bytes (unsigned)
  2662	JBE  openAVX2320
  2663
  2664	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
  2665	VMOVDQA Y14, 32(BP) // save the three non-constant starting rows for the later state add
  2666	VMOVDQA Y12, 64(BP)
  2667	VMOVDQA Y4, 192(BP)
  2668	MOVQ    $0x0000000a, R9 // R9 = 10: double-round count for the loop that follows
  2669
  2670openAVX2PreparePolyKey: // R9 double rounds on the state in Y0/Y14/Y12/Y4 (one ChaCha20 block per 128-bit lane), then derive the hash key and hash the additional data.
  2671	VPADDD     Y14, Y0, Y0 // column quarter round: a += b
  2672	VPXOR      Y0, Y4, Y4 // d ^= a
  2673	VPSHUFB    ·rol16<>+0(SB), Y4, Y4 // d <<<= 16 via byte shuffle
  2674	VPADDD     Y4, Y12, Y12 // c += d
  2675	VPXOR      Y12, Y14, Y14 // b ^= c
  2676	VPSLLD     $0x0c, Y14, Y3 // b <<<= 12 (Y3 is scratch)
  2677	VPSRLD     $0x14, Y14, Y14
  2678	VPXOR      Y3, Y14, Y14
  2679	VPADDD     Y14, Y0, Y0
  2680	VPXOR      Y0, Y4, Y4
  2681	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // d <<<= 8 via byte shuffle
  2682	VPADDD     Y4, Y12, Y12
  2683	VPXOR      Y12, Y14, Y14
  2684	VPSLLD     $0x07, Y14, Y3 // b <<<= 7
  2685	VPSRLD     $0x19, Y14, Y14
  2686	VPXOR      Y3, Y14, Y14
  2687	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows b/c/d by 1/2/3 lanes within each 128-bit half, so the next quarter round acts on diagonals
  2688	VPALIGNR   $0x08, Y12, Y12, Y12
  2689	VPALIGNR   $0x0c, Y4, Y4, Y4
  2690	VPADDD     Y14, Y0, Y0 // diagonal quarter round
  2691	VPXOR      Y0, Y4, Y4
  2692	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  2693	VPADDD     Y4, Y12, Y12
  2694	VPXOR      Y12, Y14, Y14
  2695	VPSLLD     $0x0c, Y14, Y3
  2696	VPSRLD     $0x14, Y14, Y14
  2697	VPXOR      Y3, Y14, Y14
  2698	VPADDD     Y14, Y0, Y0
  2699	VPXOR      Y0, Y4, Y4
  2700	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  2701	VPADDD     Y4, Y12, Y12
  2702	VPXOR      Y12, Y14, Y14
  2703	VPSLLD     $0x07, Y14, Y3
  2704	VPSRLD     $0x19, Y14, Y14
  2705	VPXOR      Y3, Y14, Y14
  2706	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotations
  2707	VPALIGNR   $0x08, Y12, Y12, Y12
  2708	VPALIGNR   $0x04, Y4, Y4, Y4
  2709	DECQ       R9
  2710	JNE        openAVX2PreparePolyKey // loop until the counter in R9 reaches zero
  2711	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // add the starting state back in
  2712	VPADDD     32(BP), Y14, Y14
  2713	VPADDD     64(BP), Y12, Y12
  2714	VPADDD     192(BP), Y4, Y4
  2715	VPERM2I128 $0x02, Y0, Y14, Y3 // Y3 = the low 128-bit lanes of Y0 and Y14
  2716
  2717	// Clamp and store poly key
  2718	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
  2719	VMOVDQA Y3, (BP) // 32 bytes at 0(BP); the hash code reads its multiplier words from 0(BP) and 8(BP)
  2720
  2721	// Stream for the first 64 bytes
  2722	VPERM2I128 $0x13, Y0, Y14, Y0 // Y0 = the high 128-bit lanes of Y0 and Y14
  2723	VPERM2I128 $0x13, Y12, Y4, Y14 // Y14 = the high 128-bit lanes of Y12 and Y4
  2724
  2725	// Hash AD + first 64 bytes
  2726	MOVQ ad_len+80(FP), R9 // polyHashADInternal compares R9 against 13 first, so R9 carries the AD length
  2727	CALL polyHashADInternal<>(SB)
  2728	XORQ CX, CX // CX = 0: byte offset for the hashing loop that follows
  2729
  2730openAVX2InitialHash64: // Hash the first 64 input bytes (four 16-byte blocks, offset CX) using MULX, then decrypt those 64 bytes.
  2731	ADDQ  (SI)(CX*1), R10 // acc (R10:R11:R12) += 16-byte block ...
  2732	ADCQ  8(SI)(CX*1), R11
  2733	ADCQ  $0x01, R12 // ... plus 1 in the third limb (the 2^128 bit), with carry
  2734	MOVQ  (BP), DX // DX = r0, the word at 0(BP); MULX multiplies by DX implicitly
  2735	MOVQ  DX, R15
  2736	MULXQ R10, R13, R14 // R14:R13 = r0 * h0
  2737	IMULQ R12, R15 // R15 = r0 * h2 (low 64 bits)
  2738	MULXQ R11, AX, DX // DX:AX = r0 * h1
  2739	ADDQ  AX, R14
  2740	ADCQ  DX, R15
  2741	MOVQ  8(BP), DX // DX = r1, the word at 8(BP)
  2742	MULXQ R10, R10, AX // AX:R10 = r1 * h0
  2743	ADDQ  R10, R14
  2744	MULXQ R11, R11, R8 // R8:R11 = r1 * h1 (MULX leaves the carry from the ADDQ above intact)
  2745	ADCQ  R11, R15
  2746	ADCQ  $0x00, R8
  2747	IMULQ R12, DX // DX = r1 * h2 (low 64 bits)
  2748	ADDQ  AX, R15
  2749	ADCQ  DX, R8 // product is now t0..t3 = R13, R14, R15, R8
  2750	MOVQ  R13, R10 // reduce: low 130 bits stay, the bits above are added back as 4*high + high
  2751	MOVQ  R14, R11
  2752	MOVQ  R15, R12
  2753	ANDQ  $0x03, R12
  2754	MOVQ  R15, R13
  2755	ANDQ  $-4, R13
  2756	MOVQ  R8, R14
  2757	SHRQ  $0x02, R8, R15
  2758	SHRQ  $0x02, R8
  2759	ADDQ  R13, R10
  2760	ADCQ  R14, R11
  2761	ADCQ  $0x00, R12
  2762	ADDQ  R15, R10
  2763	ADCQ  R8, R11
  2764	ADCQ  $0x00, R12
  2765	ADDQ  $0x10, CX
  2766	CMPQ  CX, $0x40 // four blocks = 64 bytes
  2767	JNE   openAVX2InitialHash64
  2768
  2769	// Decrypt the first 64 bytes
  2770	VPXOR   (SI), Y0, Y0 // Y0 and Y14 hold the 64 stream bytes assembled before the hash call
  2771	VPXOR   32(SI), Y14, Y14
  2772	VMOVDQU Y0, (DI)
  2773	VMOVDQU Y14, 32(DI)
  2774	LEAQ    64(SI), SI
  2775	LEAQ    64(DI), DI
  2776	SUBQ    $0x40, BX // 64 bytes consumed
  2777
  2778openAVX2MainLoop: // While at least 512 bytes remain, set up four YMM state sets (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3) for the inner loop.
  2779	CMPQ BX, $0x00000200
  2780	JB   openAVX2MainLoopDone // fewer than 512 bytes left (unsigned compare)
  2781
  2782	// Load state, increment counter blocks, store the incremented counters
  2783	VMOVDQU ·chacha20Constants<>+0(SB), Y0 // constants row, copied to all four sets
  2784	VMOVDQA Y0, Y5
  2785	VMOVDQA Y0, Y6
  2786	VMOVDQA Y0, Y7
  2787	VMOVDQA 32(BP), Y14 // row saved at 32(BP), copied to all four sets
  2788	VMOVDQA Y14, Y9
  2789	VMOVDQA Y14, Y10
  2790	VMOVDQA Y14, Y11
  2791	VMOVDQA 64(BP), Y12 // row saved at 64(BP), copied to all four sets
  2792	VMOVDQA Y12, Y13
  2793	VMOVDQA Y12, Y8
  2794	VMOVDQA Y12, Y15
  2795	VMOVDQA 192(BP), Y4 // last counter row used so far
  2796	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // four successive counter rows: Y4, Y1, Y2, Y3
  2797	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  2798	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  2799	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
  2800	VMOVDQA Y4, 96(BP) // keep each set's counter row for the state add after the rounds
  2801	VMOVDQA Y1, 128(BP)
  2802	VMOVDQA Y2, 160(BP)
  2803	VMOVDQA Y3, 192(BP) // 192(BP) now holds the highest counter row, the base for the next iteration
  2804	XORQ    CX, CX // CX = 0: byte offset used by the inner loop's hash loads
  2805
  2806openAVX2InternalLoop:
	// One iteration runs one ChaCha20 double round over four 2-block states
	// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3), interleaved
	// with three Poly1305 block updates over the 48 ciphertext bytes at SI+CX.
	// CX grows by 48 per iteration and the loop exits at CX == 0x1e0 (480),
	// i.e. after 10 iterations.
	// Poly1305 state: accumulator in R10:R11:R12, key limbs read from (BP) and
	// 8(BP), 4-limb product assembled in R13:R14:R15:R8.
	// Y15 is also the scratch register for the shift-based rotates, so its live
	// value is parked at 224(BP) around each of them.
	// Hash block 1: accumulator += 16 bytes at SI+CX, plus 1 in the third limb.
  2807	ADDQ     (SI)(CX*1), R10
  2808	ADCQ     8(SI)(CX*1), R11
  2809	ADCQ     $0x01, R12
  2810	VPADDD   Y14, Y0, Y0
  2811	VPADDD   Y9, Y5, Y5
  2812	VPADDD   Y10, Y6, Y6
  2813	VPADDD   Y11, Y7, Y7
	// Multiply the accumulator by the key limb at (BP).
  2814	MOVQ     (BP), DX
  2815	MOVQ     DX, R15
  2816	MULXQ    R10, R13, R14
  2817	IMULQ    R12, R15
  2818	MULXQ    R11, AX, DX
  2819	ADDQ     AX, R14
  2820	ADCQ     DX, R15
  2821	VPXOR    Y0, Y4, Y4
  2822	VPXOR    Y5, Y1, Y1
  2823	VPXOR    Y6, Y2, Y2
  2824	VPXOR    Y7, Y3, Y3
  2825	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  2826	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  2827	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  2828	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	// Multiply the accumulator by the key limb at 8(BP) and add into the product.
  2829	MOVQ     8(BP), DX
  2830	MULXQ    R10, R10, AX
  2831	ADDQ     R10, R14
  2832	MULXQ    R11, R11, R8
  2833	ADCQ     R11, R15
  2834	ADCQ     $0x00, R8
  2835	VPADDD   Y4, Y12, Y12
  2836	VPADDD   Y1, Y13, Y13
  2837	VPADDD   Y2, Y8, Y8
  2838	VPADDD   Y3, Y15, Y15
  2839	VPXOR    Y12, Y14, Y14
  2840	VPXOR    Y13, Y9, Y9
  2841	VPXOR    Y8, Y10, Y10
  2842	VPXOR    Y15, Y11, Y11
  2843	IMULQ    R12, DX
  2844	ADDQ     AX, R15
  2845	ADCQ     DX, R8
	// Rotate left by 12 (shift left 12, shift right 20, XOR); Y15 is the scratch.
  2846	VMOVDQA  Y15, 224(BP)
  2847	VPSLLD   $0x0c, Y14, Y15
  2848	VPSRLD   $0x14, Y14, Y14
  2849	VPXOR    Y15, Y14, Y14
  2850	VPSLLD   $0x0c, Y9, Y15
  2851	VPSRLD   $0x14, Y9, Y9
  2852	VPXOR    Y15, Y9, Y9
  2853	VPSLLD   $0x0c, Y10, Y15
  2854	VPSRLD   $0x14, Y10, Y10
  2855	VPXOR    Y15, Y10, Y10
  2856	VPSLLD   $0x0c, Y11, Y15
  2857	VPSRLD   $0x14, Y11, Y11
  2858	VPXOR    Y15, Y11, Y11
  2859	VMOVDQA  224(BP), Y15
	// Reduce the product: keep the low 130 bits (R12 = R15 & 3) and add
	// 4*c + c back in, where c is the part of the product above bit 130.
  2860	MOVQ     R13, R10
  2861	MOVQ     R14, R11
  2862	MOVQ     R15, R12
  2863	ANDQ     $0x03, R12
  2864	MOVQ     R15, R13
  2865	ANDQ     $-4, R13
  2866	MOVQ     R8, R14
  2867	SHRQ     $0x02, R8, R15
  2868	SHRQ     $0x02, R8
  2869	ADDQ     R13, R10
  2870	ADCQ     R14, R11
  2871	ADCQ     $0x00, R12
  2872	ADDQ     R15, R10
  2873	ADCQ     R8, R11
  2874	ADCQ     $0x00, R12
  2875	VPADDD   Y14, Y0, Y0
  2876	VPADDD   Y9, Y5, Y5
  2877	VPADDD   Y10, Y6, Y6
  2878	VPADDD   Y11, Y7, Y7
  2879	VPXOR    Y0, Y4, Y4
  2880	VPXOR    Y5, Y1, Y1
  2881	VPXOR    Y6, Y2, Y2
  2882	VPXOR    Y7, Y3, Y3
  2883	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  2884	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  2885	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  2886	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	// Hash block 2: the 16 bytes at SI+CX+16.
  2887	ADDQ     16(SI)(CX*1), R10
  2888	ADCQ     24(SI)(CX*1), R11
  2889	ADCQ     $0x01, R12
  2890	VPADDD   Y4, Y12, Y12
  2891	VPADDD   Y1, Y13, Y13
  2892	VPADDD   Y2, Y8, Y8
  2893	VPADDD   Y3, Y15, Y15
  2894	MOVQ     (BP), DX
  2895	MOVQ     DX, R15
  2896	MULXQ    R10, R13, R14
  2897	IMULQ    R12, R15
  2898	MULXQ    R11, AX, DX
  2899	ADDQ     AX, R14
  2900	ADCQ     DX, R15
  2901	VPXOR    Y12, Y14, Y14
  2902	VPXOR    Y13, Y9, Y9
  2903	VPXOR    Y8, Y10, Y10
  2904	VPXOR    Y15, Y11, Y11
	// Rotate left by 7 (shift left 7, shift right 25, XOR); Y15 is the scratch.
  2905	VMOVDQA  Y15, 224(BP)
  2906	VPSLLD   $0x07, Y14, Y15
  2907	VPSRLD   $0x19, Y14, Y14
  2908	VPXOR    Y15, Y14, Y14
  2909	VPSLLD   $0x07, Y9, Y15
  2910	VPSRLD   $0x19, Y9, Y9
  2911	VPXOR    Y15, Y9, Y9
  2912	VPSLLD   $0x07, Y10, Y15
  2913	VPSRLD   $0x19, Y10, Y10
  2914	VPXOR    Y15, Y10, Y10
  2915	VPSLLD   $0x07, Y11, Y15
  2916	VPSRLD   $0x19, Y11, Y11
  2917	VPXOR    Y15, Y11, Y11
  2918	VMOVDQA  224(BP), Y15
  2919	MOVQ     8(BP), DX
  2920	MULXQ    R10, R10, AX
  2921	ADDQ     R10, R14
  2922	MULXQ    R11, R11, R8
  2923	ADCQ     R11, R15
  2924	ADCQ     $0x00, R8
	// Rotate the second, third and fourth rows by 4, 8 and 12 bytes within each
	// 128-bit lane so the next pass operates on the diagonals.
  2925	VPALIGNR $0x04, Y14, Y14, Y14
  2926	VPALIGNR $0x04, Y9, Y9, Y9
  2927	VPALIGNR $0x04, Y10, Y10, Y10
  2928	VPALIGNR $0x04, Y11, Y11, Y11
  2929	VPALIGNR $0x08, Y12, Y12, Y12
  2930	VPALIGNR $0x08, Y13, Y13, Y13
  2931	VPALIGNR $0x08, Y8, Y8, Y8
  2932	VPALIGNR $0x08, Y15, Y15, Y15
  2933	VPALIGNR $0x0c, Y4, Y4, Y4
  2934	VPALIGNR $0x0c, Y1, Y1, Y1
  2935	VPALIGNR $0x0c, Y2, Y2, Y2
  2936	VPALIGNR $0x0c, Y3, Y3, Y3
  2937	VPADDD   Y14, Y0, Y0
  2938	VPADDD   Y9, Y5, Y5
  2939	VPADDD   Y10, Y6, Y6
  2940	VPADDD   Y11, Y7, Y7
  2941	IMULQ    R12, DX
  2942	ADDQ     AX, R15
  2943	ADCQ     DX, R8
  2944	VPXOR    Y0, Y4, Y4
  2945	VPXOR    Y5, Y1, Y1
  2946	VPXOR    Y6, Y2, Y2
  2947	VPXOR    Y7, Y3, Y3
  2948	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  2949	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  2950	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  2951	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	// Reduce the product of hash block 2 (same sequence as above).
  2952	MOVQ     R13, R10
  2953	MOVQ     R14, R11
  2954	MOVQ     R15, R12
  2955	ANDQ     $0x03, R12
  2956	MOVQ     R15, R13
  2957	ANDQ     $-4, R13
  2958	MOVQ     R8, R14
  2959	SHRQ     $0x02, R8, R15
  2960	SHRQ     $0x02, R8
  2961	ADDQ     R13, R10
  2962	ADCQ     R14, R11
  2963	ADCQ     $0x00, R12
  2964	ADDQ     R15, R10
  2965	ADCQ     R8, R11
  2966	ADCQ     $0x00, R12
  2967	VPADDD   Y4, Y12, Y12
  2968	VPADDD   Y1, Y13, Y13
  2969	VPADDD   Y2, Y8, Y8
  2970	VPADDD   Y3, Y15, Y15
  2971	VPXOR    Y12, Y14, Y14
  2972	VPXOR    Y13, Y9, Y9
  2973	VPXOR    Y8, Y10, Y10
  2974	VPXOR    Y15, Y11, Y11
	// Hash block 3: the 16 bytes at SI+CX+32, then step CX to the next 48 bytes.
  2975	ADDQ     32(SI)(CX*1), R10
  2976	ADCQ     40(SI)(CX*1), R11
  2977	ADCQ     $0x01, R12
  2978	LEAQ     48(CX), CX
  2979	VMOVDQA  Y15, 224(BP)
  2980	VPSLLD   $0x0c, Y14, Y15
  2981	VPSRLD   $0x14, Y14, Y14
  2982	VPXOR    Y15, Y14, Y14
  2983	VPSLLD   $0x0c, Y9, Y15
  2984	VPSRLD   $0x14, Y9, Y9
  2985	VPXOR    Y15, Y9, Y9
  2986	VPSLLD   $0x0c, Y10, Y15
  2987	VPSRLD   $0x14, Y10, Y10
  2988	VPXOR    Y15, Y10, Y10
  2989	VPSLLD   $0x0c, Y11, Y15
  2990	VPSRLD   $0x14, Y11, Y11
  2991	VPXOR    Y15, Y11, Y11
  2992	VMOVDQA  224(BP), Y15
  2993	MOVQ     (BP), DX
  2994	MOVQ     DX, R15
  2995	MULXQ    R10, R13, R14
  2996	IMULQ    R12, R15
  2997	MULXQ    R11, AX, DX
  2998	ADDQ     AX, R14
  2999	ADCQ     DX, R15
  3000	VPADDD   Y14, Y0, Y0
  3001	VPADDD   Y9, Y5, Y5
  3002	VPADDD   Y10, Y6, Y6
  3003	VPADDD   Y11, Y7, Y7
  3004	VPXOR    Y0, Y4, Y4
  3005	VPXOR    Y5, Y1, Y1
  3006	VPXOR    Y6, Y2, Y2
  3007	VPXOR    Y7, Y3, Y3
  3008	MOVQ     8(BP), DX
  3009	MULXQ    R10, R10, AX
  3010	ADDQ     R10, R14
  3011	MULXQ    R11, R11, R8
  3012	ADCQ     R11, R15
  3013	ADCQ     $0x00, R8
  3014	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  3015	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  3016	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  3017	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  3018	VPADDD   Y4, Y12, Y12
  3019	VPADDD   Y1, Y13, Y13
  3020	VPADDD   Y2, Y8, Y8
  3021	VPADDD   Y3, Y15, Y15
  3022	IMULQ    R12, DX
  3023	ADDQ     AX, R15
  3024	ADCQ     DX, R8
  3025	VPXOR    Y12, Y14, Y14
  3026	VPXOR    Y13, Y9, Y9
  3027	VPXOR    Y8, Y10, Y10
  3028	VPXOR    Y15, Y11, Y11
  3029	VMOVDQA  Y15, 224(BP)
  3030	VPSLLD   $0x07, Y14, Y15
  3031	VPSRLD   $0x19, Y14, Y14
  3032	VPXOR    Y15, Y14, Y14
  3033	VPSLLD   $0x07, Y9, Y15
  3034	VPSRLD   $0x19, Y9, Y9
  3035	VPXOR    Y15, Y9, Y9
  3036	VPSLLD   $0x07, Y10, Y15
  3037	VPSRLD   $0x19, Y10, Y10
  3038	VPXOR    Y15, Y10, Y10
  3039	VPSLLD   $0x07, Y11, Y15
  3040	VPSRLD   $0x19, Y11, Y11
  3041	VPXOR    Y15, Y11, Y11
  3042	VMOVDQA  224(BP), Y15
  3043	MOVQ     R13, R10
  3044	MOVQ     R14, R11
  3045	MOVQ     R15, R12
  3046	ANDQ     $0x03, R12
  3047	MOVQ     R15, R13
  3048	ANDQ     $-4, R13
  3049	MOVQ     R8, R14
  3050	SHRQ     $0x02, R8, R15
  3051	SHRQ     $0x02, R8
  3052	ADDQ     R13, R10
  3053	ADCQ     R14, R11
  3054	ADCQ     $0x00, R12
  3055	ADDQ     R15, R10
  3056	ADCQ     R8, R11
  3057	ADCQ     $0x00, R12
	// Undo the row rotation (12, 8 and 4 bytes) to return to column order.
  3058	VPALIGNR $0x0c, Y14, Y14, Y14
  3059	VPALIGNR $0x0c, Y9, Y9, Y9
  3060	VPALIGNR $0x0c, Y10, Y10, Y10
  3061	VPALIGNR $0x0c, Y11, Y11, Y11
  3062	VPALIGNR $0x08, Y12, Y12, Y12
  3063	VPALIGNR $0x08, Y13, Y13, Y13
  3064	VPALIGNR $0x08, Y8, Y8, Y8
  3065	VPALIGNR $0x08, Y15, Y15, Y15
  3066	VPALIGNR $0x04, Y4, Y4, Y4
  3067	VPALIGNR $0x04, Y1, Y1, Y1
  3068	VPALIGNR $0x04, Y2, Y2, Y2
  3069	VPALIGNR $0x04, Y3, Y3, Y3
  3070	CMPQ     CX, $0x000001e0
  3071	JNE      openAVX2InternalLoop
	// Rounds done: add the starting state back in. The constants come from
	// chacha20Constants, the next two rows from 32(BP) and 64(BP), and each
	// state's own counter row from 96(BP)..192(BP).
  3072	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
  3073	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
  3074	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
  3075	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
  3076	VPADDD   32(BP), Y14, Y14
  3077	VPADDD   32(BP), Y9, Y9
  3078	VPADDD   32(BP), Y10, Y10
  3079	VPADDD   32(BP), Y11, Y11
  3080	VPADDD   64(BP), Y12, Y12
  3081	VPADDD   64(BP), Y13, Y13
  3082	VPADDD   64(BP), Y8, Y8
  3083	VPADDD   64(BP), Y15, Y15
  3084	VPADDD   96(BP), Y4, Y4
  3085	VPADDD   128(BP), Y1, Y1
  3086	VPADDD   160(BP), Y2, Y2
  3087	VPADDD   192(BP), Y3, Y3
	// Park Y15 at 224(BP): the register is reused as an output temporary below
	// and the saved row is read back from memory for the last 128 bytes.
  3088	VMOVDQA  Y15, 224(BP)
  3089
  3090	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  3091	ADDQ       480(SI), R10
  3092	ADCQ       488(SI), R11
  3093	ADCQ       $0x01, R12
  3094	MOVQ       (BP), DX
  3095	MOVQ       DX, R15
  3096	MULXQ      R10, R13, R14
  3097	IMULQ      R12, R15
  3098	MULXQ      R11, AX, DX
  3099	ADDQ       AX, R14
  3100	ADCQ       DX, R15
  3101	MOVQ       8(BP), DX
  3102	MULXQ      R10, R10, AX
  3103	ADDQ       R10, R14
  3104	MULXQ      R11, R11, R8
  3105	ADCQ       R11, R15
  3106	ADCQ       $0x00, R8
  3107	IMULQ      R12, DX
  3108	ADDQ       AX, R15
  3109	ADCQ       DX, R8
  3110	MOVQ       R13, R10
  3111	MOVQ       R14, R11
  3112	MOVQ       R15, R12
  3113	ANDQ       $0x03, R12
  3114	MOVQ       R15, R13
  3115	ANDQ       $-4, R13
  3116	MOVQ       R8, R14
  3117	SHRQ       $0x02, R8, R15
  3118	SHRQ       $0x02, R8
  3119	ADDQ       R13, R10
  3120	ADCQ       R14, R11
  3121	ADCQ       $0x00, R12
  3122	ADDQ       R15, R10
  3123	ADCQ       R8, R11
  3124	ADCQ       $0x00, R12
	// Regroup 128-bit lanes into whole 64-byte blocks ($0x02 picks the low lanes,
	// $0x13 the high lanes), XOR with the input at SI and store to DI.
  3125	VPERM2I128 $0x02, Y0, Y14, Y15
  3126	VPERM2I128 $0x13, Y0, Y14, Y14
  3127	VPERM2I128 $0x02, Y12, Y4, Y0
  3128	VPERM2I128 $0x13, Y12, Y4, Y12
  3129	VPXOR      (SI), Y15, Y15
  3130	VPXOR      32(SI), Y0, Y0
  3131	VPXOR      64(SI), Y14, Y14
  3132	VPXOR      96(SI), Y12, Y12
  3133	VMOVDQU    Y15, (DI)
  3134	VMOVDQU    Y0, 32(DI)
  3135	VMOVDQU    Y14, 64(DI)
  3136	VMOVDQU    Y12, 96(DI)
  3137	VPERM2I128 $0x02, Y5, Y9, Y0
  3138	VPERM2I128 $0x02, Y13, Y1, Y14
  3139	VPERM2I128 $0x13, Y5, Y9, Y12
  3140	VPERM2I128 $0x13, Y13, Y1, Y4
  3141	VPXOR      128(SI), Y0, Y0
  3142	VPXOR      160(SI), Y14, Y14
  3143	VPXOR      192(SI), Y12, Y12
  3144	VPXOR      224(SI), Y4, Y4
  3145	VMOVDQU    Y0, 128(DI)
  3146	VMOVDQU    Y14, 160(DI)
  3147	VMOVDQU    Y12, 192(DI)
  3148	VMOVDQU    Y4, 224(DI)
  3149
  3150	// and here
  3151	ADDQ       496(SI), R10
  3152	ADCQ       504(SI), R11
  3153	ADCQ       $0x01, R12
  3154	MOVQ       (BP), DX
  3155	MOVQ       DX, R15
  3156	MULXQ      R10, R13, R14
  3157	IMULQ      R12, R15
  3158	MULXQ      R11, AX, DX
  3159	ADDQ       AX, R14
  3160	ADCQ       DX, R15
  3161	MOVQ       8(BP), DX
  3162	MULXQ      R10, R10, AX
  3163	ADDQ       R10, R14
  3164	MULXQ      R11, R11, R8
  3165	ADCQ       R11, R15
  3166	ADCQ       $0x00, R8
  3167	IMULQ      R12, DX
  3168	ADDQ       AX, R15
  3169	ADCQ       DX, R8
  3170	MOVQ       R13, R10
  3171	MOVQ       R14, R11
  3172	MOVQ       R15, R12
  3173	ANDQ       $0x03, R12
  3174	MOVQ       R15, R13
  3175	ANDQ       $-4, R13
  3176	MOVQ       R8, R14
  3177	SHRQ       $0x02, R8, R15
  3178	SHRQ       $0x02, R8
  3179	ADDQ       R13, R10
  3180	ADCQ       R14, R11
  3181	ADCQ       $0x00, R12
  3182	ADDQ       R15, R10
  3183	ADCQ       R8, R11
  3184	ADCQ       $0x00, R12
  3185	VPERM2I128 $0x02, Y6, Y10, Y0
  3186	VPERM2I128 $0x02, Y8, Y2, Y14
  3187	VPERM2I128 $0x13, Y6, Y10, Y12
  3188	VPERM2I128 $0x13, Y8, Y2, Y4
  3189	VPXOR      256(SI), Y0, Y0
  3190	VPXOR      288(SI), Y14, Y14
  3191	VPXOR      320(SI), Y12, Y12
  3192	VPXOR      352(SI), Y4, Y4
  3193	VMOVDQU    Y0, 256(DI)
  3194	VMOVDQU    Y14, 288(DI)
  3195	VMOVDQU    Y12, 320(DI)
  3196	VMOVDQU    Y4, 352(DI)
	// The third row of the fourth state was parked at 224(BP) above, so it is
	// used here as a memory operand.
  3197	VPERM2I128 $0x02, Y7, Y11, Y0
  3198	VPERM2I128 $0x02, 224(BP), Y3, Y14
  3199	VPERM2I128 $0x13, Y7, Y11, Y12
  3200	VPERM2I128 $0x13, 224(BP), Y3, Y4
  3201	VPXOR      384(SI), Y0, Y0
  3202	VPXOR      416(SI), Y14, Y14
  3203	VPXOR      448(SI), Y12, Y12
  3204	VPXOR      480(SI), Y4, Y4
  3205	VMOVDQU    Y0, 384(DI)
  3206	VMOVDQU    Y14, 416(DI)
  3207	VMOVDQU    Y12, 448(DI)
  3208	VMOVDQU    Y4, 480(DI)
	// 512 bytes consumed: advance input and output pointers, shrink the
	// remaining length in BX, and go back for the next 512-byte chunk.
  3209	LEAQ       512(SI), SI
  3210	LEAQ       512(DI), DI
  3211	SUBQ       $0x00000200, BX
  3212	JMP        openAVX2MainLoop
  3213
  3214openAVX2MainLoopDone:
	// BX holds the number of input bytes still to process. Nothing left goes
	// straight to openSSEFinalize; otherwise pick the smallest tail routine
	// whose capacity (128, 256, 384 or 512 bytes) covers BX. The comparisons
	// are unsigned (JBE).
  3215	// Handle the various tail sizes efficiently
  3216	TESTQ BX, BX
  3217	JE    openSSEFinalize
  3218	CMPQ  BX, $0x80
  3219	JBE   openAVX2Tail128
  3220	CMPQ  BX, $0x00000100
  3221	JBE   openAVX2Tail256
  3222	CMPQ  BX, $0x00000180
  3223	JBE   openAVX2Tail384
  3224	JMP   openAVX2Tail512
  3225
  3226openAVX2192:
	// Set up two 2-block ChaCha20 states for the short path that yields up to
	// 192 bytes of keystream.
	// NOTE(review): Y0/Y14/Y12/Y4 are expected to hold the starting state on
	// entry; that setup is outside this chunk - confirm against the caller.
	// State 2 (Y5/Y9/Y13/Y1) is a copy of state 1 with avx2IncMask added to the
	// counter row. Y6, Y10, Y8, Y2 and Y15 keep the starting rows for the add
	// after the rounds. R9 counts the 10 loop iterations.
  3227	VMOVDQA Y0, Y5
  3228	VMOVDQA Y14, Y9
  3229	VMOVDQA Y12, Y13
  3230	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  3231	VMOVDQA Y0, Y6
  3232	VMOVDQA Y14, Y10
  3233	VMOVDQA Y12, Y8
  3234	VMOVDQA Y4, Y2
  3235	VMOVDQA Y1, Y15
  3236	MOVQ    $0x0000000a, R9
  3237
  3238openAVX2192InnerCipherLoop:
	// One ChaCha20 double round per iteration over the two states
	// Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1; R9 counts down from 10. Y3 is the scratch
	// register for the shift-based rotates (12 and 7 bits); the 16- and 8-bit
	// rotates use the rol16/rol8 byte-shuffle tables.
	// First half: operate on the rows as laid out.
  3239	VPADDD     Y14, Y0, Y0
  3240	VPXOR      Y0, Y4, Y4
  3241	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  3242	VPADDD     Y4, Y12, Y12
  3243	VPXOR      Y12, Y14, Y14
  3244	VPSLLD     $0x0c, Y14, Y3
  3245	VPSRLD     $0x14, Y14, Y14
  3246	VPXOR      Y3, Y14, Y14
  3247	VPADDD     Y14, Y0, Y0
  3248	VPXOR      Y0, Y4, Y4
  3249	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  3250	VPADDD     Y4, Y12, Y12
  3251	VPXOR      Y12, Y14, Y14
  3252	VPSLLD     $0x07, Y14, Y3
  3253	VPSRLD     $0x19, Y14, Y14
  3254	VPXOR      Y3, Y14, Y14
  3255	VPADDD     Y9, Y5, Y5
  3256	VPXOR      Y5, Y1, Y1
  3257	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  3258	VPADDD     Y1, Y13, Y13
  3259	VPXOR      Y13, Y9, Y9
  3260	VPSLLD     $0x0c, Y9, Y3
  3261	VPSRLD     $0x14, Y9, Y9
  3262	VPXOR      Y3, Y9, Y9
  3263	VPADDD     Y9, Y5, Y5
  3264	VPXOR      Y5, Y1, Y1
  3265	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  3266	VPADDD     Y1, Y13, Y13
  3267	VPXOR      Y13, Y9, Y9
  3268	VPSLLD     $0x07, Y9, Y3
  3269	VPSRLD     $0x19, Y9, Y9
  3270	VPXOR      Y3, Y9, Y9
	// Rotate rows 2-4 by 4, 8 and 12 bytes per lane so the second half works on
	// the diagonals.
  3271	VPALIGNR   $0x04, Y14, Y14, Y14
  3272	VPALIGNR   $0x04, Y9, Y9, Y9
  3273	VPALIGNR   $0x08, Y12, Y12, Y12
  3274	VPALIGNR   $0x08, Y13, Y13, Y13
  3275	VPALIGNR   $0x0c, Y4, Y4, Y4
  3276	VPALIGNR   $0x0c, Y1, Y1, Y1
  3277	VPADDD     Y14, Y0, Y0
  3278	VPXOR      Y0, Y4, Y4
  3279	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  3280	VPADDD     Y4, Y12, Y12
  3281	VPXOR      Y12, Y14, Y14
  3282	VPSLLD     $0x0c, Y14, Y3
  3283	VPSRLD     $0x14, Y14, Y14
  3284	VPXOR      Y3, Y14, Y14
  3285	VPADDD     Y14, Y0, Y0
  3286	VPXOR      Y0, Y4, Y4
  3287	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  3288	VPADDD     Y4, Y12, Y12
  3289	VPXOR      Y12, Y14, Y14
  3290	VPSLLD     $0x07, Y14, Y3
  3291	VPSRLD     $0x19, Y14, Y14
  3292	VPXOR      Y3, Y14, Y14
  3293	VPADDD     Y9, Y5, Y5
  3294	VPXOR      Y5, Y1, Y1
  3295	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  3296	VPADDD     Y1, Y13, Y13
  3297	VPXOR      Y13, Y9, Y9
  3298	VPSLLD     $0x0c, Y9, Y3
  3299	VPSRLD     $0x14, Y9, Y9
  3300	VPXOR      Y3, Y9, Y9
  3301	VPADDD     Y9, Y5, Y5
  3302	VPXOR      Y5, Y1, Y1
  3303	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  3304	VPADDD     Y1, Y13, Y13
  3305	VPXOR      Y13, Y9, Y9
  3306	VPSLLD     $0x07, Y9, Y3
  3307	VPSRLD     $0x19, Y9, Y9
  3308	VPXOR      Y3, Y9, Y9
	// Rotate the rows back to their original positions.
  3309	VPALIGNR   $0x0c, Y14, Y14, Y14
  3310	VPALIGNR   $0x0c, Y9, Y9, Y9
  3311	VPALIGNR   $0x08, Y12, Y12, Y12
  3312	VPALIGNR   $0x08, Y13, Y13, Y13
  3313	VPALIGNR   $0x04, Y4, Y4, Y4
  3314	VPALIGNR   $0x04, Y1, Y1, Y1
  3315	DECQ       R9
  3316	JNE        openAVX2192InnerCipherLoop
	// Add the saved starting rows back in (Y6/Y10/Y8 are shared by both states;
	// the counter rows are Y2 for state 1 and Y15 for state 2).
  3317	VPADDD     Y6, Y0, Y0
  3318	VPADDD     Y6, Y5, Y5
  3319	VPADDD     Y10, Y14, Y14
  3320	VPADDD     Y10, Y9, Y9
  3321	VPADDD     Y8, Y12, Y12
  3322	VPADDD     Y8, Y13, Y13
  3323	VPADDD     Y2, Y4, Y4
  3324	VPADDD     Y15, Y1, Y1
	// Y3 = low lanes of Y0 and Y14, i.e. the first 32 bytes of the first block.
  3325	VPERM2I128 $0x02, Y0, Y14, Y3
  3326
  3327	// Clamp and store poly key
  3328	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
  3329	VMOVDQA Y3, (BP)
  3330
	// The six registers below end up holding 6 x 32 bytes of keystream in the
	// order openAVX2ShortOpenLoop consumes them: Y0, Y14, Y12, Y4, Y5, Y9.
	// Execution then falls through into openAVX2ShortOpen.
  3331	// Stream for up to 192 bytes
  3332	VPERM2I128 $0x13, Y0, Y14, Y0
  3333	VPERM2I128 $0x13, Y12, Y4, Y14
  3334	VPERM2I128 $0x02, Y5, Y9, Y12
  3335	VPERM2I128 $0x02, Y13, Y1, Y4
  3336	VPERM2I128 $0x13, Y5, Y9, Y5
  3337	VPERM2I128 $0x13, Y13, Y1, Y9
  3338
  3339openAVX2ShortOpen:
	// Shared entry for the short-input paths once the poly key is stored at
	// (BP) and the keystream is queued in registers. Loads the ad_len argument
	// into R9 and calls polyHashADInternal, which zeroes the accumulator
	// R10:R11:R12 before hashing.
	// NOTE(review): polyHashADInternal reads its data through CX; CX is set up
	// outside this chunk - confirm against the function prologue.
  3340	// Hash
  3341	MOVQ ad_len+80(FP), R9
  3342	CALL polyHashADInternal<>(SB)
  3343
  3344openAVX2ShortOpenLoop:
	// While at least 32 bytes remain (BX): hash two 16-byte ciphertext blocks
	// at SI into the Poly1305 accumulator R10:R11:R12, XOR those 32 bytes with
	// the keystream in Y0, store the result at DI, then shift the queued
	// keystream registers down by one slot.
  3345	CMPQ BX, $0x20
  3346	JB   openAVX2ShortTail32
  3347	SUBQ $0x20, BX
  3348
  3349	// Load for hashing
  3350	ADDQ  (SI), R10
  3351	ADCQ  8(SI), R11
  3352	ADCQ  $0x01, R12
	// Multiply by the key limbs at (BP) and 8(BP); product lands in R13:R14:R15:R8.
  3353	MOVQ  (BP), DX
  3354	MOVQ  DX, R15
  3355	MULXQ R10, R13, R14
  3356	IMULQ R12, R15
  3357	MULXQ R11, AX, DX
  3358	ADDQ  AX, R14
  3359	ADCQ  DX, R15
  3360	MOVQ  8(BP), DX
  3361	MULXQ R10, R10, AX
  3362	ADDQ  R10, R14
  3363	MULXQ R11, R11, R8
  3364	ADCQ  R11, R15
  3365	ADCQ  $0x00, R8
  3366	IMULQ R12, DX
  3367	ADDQ  AX, R15
  3368	ADCQ  DX, R8
	// Reduce: keep the low 130 bits and add 4*c + c for the part c above bit 130.
  3369	MOVQ  R13, R10
  3370	MOVQ  R14, R11
  3371	MOVQ  R15, R12
  3372	ANDQ  $0x03, R12
  3373	MOVQ  R15, R13
  3374	ANDQ  $-4, R13
  3375	MOVQ  R8, R14
  3376	SHRQ  $0x02, R8, R15
  3377	SHRQ  $0x02, R8
  3378	ADDQ  R13, R10
  3379	ADCQ  R14, R11
  3380	ADCQ  $0x00, R12
  3381	ADDQ  R15, R10
  3382	ADCQ  R8, R11
  3383	ADCQ  $0x00, R12
	// Second 16-byte block of this 32-byte step.
  3384	ADDQ  16(SI), R10
  3385	ADCQ  24(SI), R11
  3386	ADCQ  $0x01, R12
  3387	MOVQ  (BP), DX
  3388	MOVQ  DX, R15
  3389	MULXQ R10, R13, R14
  3390	IMULQ R12, R15
  3391	MULXQ R11, AX, DX
  3392	ADDQ  AX, R14
  3393	ADCQ  DX, R15
  3394	MOVQ  8(BP), DX
  3395	MULXQ R10, R10, AX
  3396	ADDQ  R10, R14
  3397	MULXQ R11, R11, R8
  3398	ADCQ  R11, R15
  3399	ADCQ  $0x00, R8
  3400	IMULQ R12, DX
  3401	ADDQ  AX, R15
  3402	ADCQ  DX, R8
  3403	MOVQ  R13, R10
  3404	MOVQ  R14, R11
  3405	MOVQ  R15, R12
  3406	ANDQ  $0x03, R12
  3407	MOVQ  R15, R13
  3408	ANDQ  $-4, R13
  3409	MOVQ  R8, R14
  3410	SHRQ  $0x02, R8, R15
  3411	SHRQ  $0x02, R8
  3412	ADDQ  R13, R10
  3413	ADCQ  R14, R11
  3414	ADCQ  $0x00, R12
  3415	ADDQ  R15, R10
  3416	ADCQ  R8, R11
  3417	ADCQ  $0x00, R12
  3418
  3419	// Load for decryption
  3420	VPXOR   (SI), Y0, Y0
  3421	VMOVDQU Y0, (DI)
  3422	LEAQ    32(SI), SI
  3423	LEAQ    32(DI), DI
  3424
	// Queue order is Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <- Y13 <- Y1 <- Y6 <- Y10.
  3425	// Shift stream left
  3426	VMOVDQA Y14, Y0
  3427	VMOVDQA Y12, Y14
  3428	VMOVDQA Y4, Y12
  3429	VMOVDQA Y5, Y4
  3430	VMOVDQA Y9, Y5
  3431	VMOVDQA Y13, Y9
  3432	VMOVDQA Y1, Y13
  3433	VMOVDQA Y6, Y1
  3434	VMOVDQA Y10, Y6
  3435	JMP     openAVX2ShortOpenLoop
  3436
  3437openAVX2ShortTail32:
	// Fewer than 32 bytes remain. If at least 16 remain, hash and decrypt one
	// more 16-byte block using the low lane of Y0, then move the high lane of
	// Y0 down. On both paths X1 receives a copy of X0 before reaching
	// openAVX2ShortDone. The VMOVDQA between CMPQ and JB does not modify the
	// flags, so JB still tests the CMPQ result.
  3438	CMPQ    BX, $0x10
  3439	VMOVDQA X0, X1
  3440	JB      openAVX2ShortDone
  3441	SUBQ    $0x10, BX
  3442
  3443	// Load for hashing
  3444	ADDQ  (SI), R10
  3445	ADCQ  8(SI), R11
  3446	ADCQ  $0x01, R12
  3447	MOVQ  (BP), DX
  3448	MOVQ  DX, R15
  3449	MULXQ R10, R13, R14
  3450	IMULQ R12, R15
  3451	MULXQ R11, AX, DX
  3452	ADDQ  AX, R14
  3453	ADCQ  DX, R15
  3454	MOVQ  8(BP), DX
  3455	MULXQ R10, R10, AX
  3456	ADDQ  R10, R14
  3457	MULXQ R11, R11, R8
  3458	ADCQ  R11, R15
  3459	ADCQ  $0x00, R8
  3460	IMULQ R12, DX
  3461	ADDQ  AX, R15
  3462	ADCQ  DX, R8
  3463	MOVQ  R13, R10
  3464	MOVQ  R14, R11
  3465	MOVQ  R15, R12
  3466	ANDQ  $0x03, R12
  3467	MOVQ  R15, R13
  3468	ANDQ  $-4, R13
  3469	MOVQ  R8, R14
  3470	SHRQ  $0x02, R8, R15
  3471	SHRQ  $0x02, R8
  3472	ADDQ  R13, R10
  3473	ADCQ  R14, R11
  3474	ADCQ  $0x00, R12
  3475	ADDQ  R15, R10
  3476	ADCQ  R8, R11
  3477	ADCQ  $0x00, R12
  3478
  3479	// Load for decryption
  3480	VPXOR      (SI), X0, X12
  3481	VMOVDQU    X12, (DI)
  3482	LEAQ       16(SI), SI
  3483	LEAQ       16(DI), DI
	// $0x11 copies the high 128-bit lane of Y0 into both lanes, so X0 (and then
	// X1) holds the next 16 bytes of keystream.
  3484	VPERM2I128 $0x11, Y0, Y0, Y0
  3485	VMOVDQA    X0, X1
  3486
  3487openAVX2ShortDone:
	// Clear the upper halves of the YMM registers before continuing in the SSE
	// tail code at openSSETail16 (defined outside this chunk).
  3488	VZEROUPPER
  3489	JMP openSSETail16
  3490
  3491openAVX2320:
	// Set up three 2-block ChaCha20 states for the short path that yields up to
	// 320 bytes of keystream.
	// NOTE(review): Y0/Y14/Y12/Y4 are expected to hold the starting state on
	// entry; that setup is outside this chunk - confirm against the caller.
	// States 2 (Y5/Y9/Y13/Y1) and 3 (Y6/Y10/Y8/Y2) copy state 1 with the counter
	// row advanced by one and two avx2IncMask steps. Y7, Y11 and Y15 keep the
	// starting second, third and counter rows for the add after the rounds.
	// R9 counts the 10 loop iterations.
  3492	VMOVDQA Y0, Y5
  3493	VMOVDQA Y14, Y9
  3494	VMOVDQA Y12, Y13
  3495	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  3496	VMOVDQA Y0, Y6
  3497	VMOVDQA Y14, Y10
  3498	VMOVDQA Y12, Y8
  3499	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  3500	VMOVDQA Y14, Y7
  3501	VMOVDQA Y12, Y11
  3502	VMOVDQA Y4, Y15
  3503	MOVQ    $0x0000000a, R9
  3504
  3505openAVX2320InnerCipherLoop:
	// One ChaCha20 double round per iteration over the three states
	// Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1 and Y6/Y10/Y8/Y2; R9 counts down from 10.
	// Y3 is the scratch register for the shift-based rotates (12 and 7 bits).
	// First half: operate on the rows as laid out.
  3506	VPADDD   Y14, Y0, Y0
  3507	VPXOR    Y0, Y4, Y4
  3508	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  3509	VPADDD   Y4, Y12, Y12
  3510	VPXOR    Y12, Y14, Y14
  3511	VPSLLD   $0x0c, Y14, Y3
  3512	VPSRLD   $0x14, Y14, Y14
  3513	VPXOR    Y3, Y14, Y14
  3514	VPADDD   Y14, Y0, Y0
  3515	VPXOR    Y0, Y4, Y4
  3516	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  3517	VPADDD   Y4, Y12, Y12
  3518	VPXOR    Y12, Y14, Y14
  3519	VPSLLD   $0x07, Y14, Y3
  3520	VPSRLD   $0x19, Y14, Y14
  3521	VPXOR    Y3, Y14, Y14
  3522	VPADDD   Y9, Y5, Y5
  3523	VPXOR    Y5, Y1, Y1
  3524	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  3525	VPADDD   Y1, Y13, Y13
  3526	VPXOR    Y13, Y9, Y9
  3527	VPSLLD   $0x0c, Y9, Y3
  3528	VPSRLD   $0x14, Y9, Y9
  3529	VPXOR    Y3, Y9, Y9
  3530	VPADDD   Y9, Y5, Y5
  3531	VPXOR    Y5, Y1, Y1
  3532	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  3533	VPADDD   Y1, Y13, Y13
  3534	VPXOR    Y13, Y9, Y9
  3535	VPSLLD   $0x07, Y9, Y3
  3536	VPSRLD   $0x19, Y9, Y9
  3537	VPXOR    Y3, Y9, Y9
  3538	VPADDD   Y10, Y6, Y6
  3539	VPXOR    Y6, Y2, Y2
  3540	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  3541	VPADDD   Y2, Y8, Y8
  3542	VPXOR    Y8, Y10, Y10
  3543	VPSLLD   $0x0c, Y10, Y3
  3544	VPSRLD   $0x14, Y10, Y10
  3545	VPXOR    Y3, Y10, Y10
  3546	VPADDD   Y10, Y6, Y6
  3547	VPXOR    Y6, Y2, Y2
  3548	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  3549	VPADDD   Y2, Y8, Y8
  3550	VPXOR    Y8, Y10, Y10
  3551	VPSLLD   $0x07, Y10, Y3
  3552	VPSRLD   $0x19, Y10, Y10
  3553	VPXOR    Y3, Y10, Y10
	// Rotate rows 2-4 by 4, 8 and 12 bytes per lane so the second half works on
	// the diagonals.
  3554	VPALIGNR $0x04, Y14, Y14, Y14
  3555	VPALIGNR $0x04, Y9, Y9, Y9
  3556	VPALIGNR $0x04, Y10, Y10, Y10
  3557	VPALIGNR $0x08, Y12, Y12, Y12
  3558	VPALIGNR $0x08, Y13, Y13, Y13
  3559	VPALIGNR $0x08, Y8, Y8, Y8
  3560	VPALIGNR $0x0c, Y4, Y4, Y4
  3561	VPALIGNR $0x0c, Y1, Y1, Y1
  3562	VPALIGNR $0x0c, Y2, Y2, Y2
  3563	VPADDD   Y14, Y0, Y0
  3564	VPXOR    Y0, Y4, Y4
  3565	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  3566	VPADDD   Y4, Y12, Y12
  3567	VPXOR    Y12, Y14, Y14
  3568	VPSLLD   $0x0c, Y14, Y3
  3569	VPSRLD   $0x14, Y14, Y14
  3570	VPXOR    Y3, Y14, Y14
  3571	VPADDD   Y14, Y0, Y0
  3572	VPXOR    Y0, Y4, Y4
  3573	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  3574	VPADDD   Y4, Y12, Y12
  3575	VPXOR    Y12, Y14, Y14
  3576	VPSLLD   $0x07, Y14, Y3
  3577	VPSRLD   $0x19, Y14, Y14
  3578	VPXOR    Y3, Y14, Y14
  3579	VPADDD   Y9, Y5, Y5
  3580	VPXOR    Y5, Y1, Y1
  3581	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  3582	VPADDD   Y1, Y13, Y13
  3583	VPXOR    Y13, Y9, Y9
  3584	VPSLLD   $0x0c, Y9, Y3
  3585	VPSRLD   $0x14, Y9, Y9
  3586	VPXOR    Y3, Y9, Y9
  3587	VPADDD   Y9, Y5, Y5
  3588	VPXOR    Y5, Y1, Y1
  3589	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  3590	VPADDD   Y1, Y13, Y13
  3591	VPXOR    Y13, Y9, Y9
  3592	VPSLLD   $0x07, Y9, Y3
  3593	VPSRLD   $0x19, Y9, Y9
  3594	VPXOR    Y3, Y9, Y9
  3595	VPADDD   Y10, Y6, Y6
  3596	VPXOR    Y6, Y2, Y2
  3597	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  3598	VPADDD   Y2, Y8, Y8
  3599	VPXOR    Y8, Y10, Y10
  3600	VPSLLD   $0x0c, Y10, Y3
  3601	VPSRLD   $0x14, Y10, Y10
  3602	VPXOR    Y3, Y10, Y10
  3603	VPADDD   Y10, Y6, Y6
  3604	VPXOR    Y6, Y2, Y2
  3605	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  3606	VPADDD   Y2, Y8, Y8
  3607	VPXOR    Y8, Y10, Y10
  3608	VPSLLD   $0x07, Y10, Y3
  3609	VPSRLD   $0x19, Y10, Y10
  3610	VPXOR    Y3, Y10, Y10
	// Rotate the rows back to their original positions.
  3611	VPALIGNR $0x0c, Y14, Y14, Y14
  3612	VPALIGNR $0x0c, Y9, Y9, Y9
  3613	VPALIGNR $0x0c, Y10, Y10, Y10
  3614	VPALIGNR $0x08, Y12, Y12, Y12
  3615	VPALIGNR $0x08, Y13, Y13, Y13
  3616	VPALIGNR $0x08, Y8, Y8, Y8
  3617	VPALIGNR $0x04, Y4, Y4, Y4
  3618	VPALIGNR $0x04, Y1, Y1, Y1
  3619	VPALIGNR $0x04, Y2, Y2, Y2
  3620	DECQ     R9
  3621	JNE      openAVX2320InnerCipherLoop
	// Add the starting rows back in: the constants (reloaded into Y3), the
	// saved rows Y7 and Y11, and the counter rows Y15, Y15+inc, Y15+2*inc.
  3622	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
  3623	VPADDD   Y3, Y0, Y0
  3624	VPADDD   Y3, Y5, Y5
  3625	VPADDD   Y3, Y6, Y6
  3626	VPADDD   Y7, Y14, Y14
  3627	VPADDD   Y7, Y9, Y9
  3628	VPADDD   Y7, Y10, Y10
  3629	VPADDD   Y11, Y12, Y12
  3630	VPADDD   Y11, Y13, Y13
  3631	VPADDD   Y11, Y8, Y8
  3632	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
  3633	VPADDD   Y15, Y4, Y4
  3634	VPADDD   Y3, Y15, Y15
  3635	VPADDD   Y15, Y1, Y1
  3636	VPADDD   Y3, Y15, Y15
  3637	VPADDD   Y15, Y2, Y2
  3638
  3639	// Clamp and store poly key
  3640	VPERM2I128 $0x02, Y0, Y14, Y3
  3641	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
  3642	VMOVDQA    Y3, (BP)
  3643
	// The ten registers below end up holding 10 x 32 bytes of keystream in the
	// order openAVX2ShortOpenLoop consumes them:
	// Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10.
  3644	// Stream for up to 320 bytes
  3645	VPERM2I128 $0x13, Y0, Y14, Y0
  3646	VPERM2I128 $0x13, Y12, Y4, Y14
  3647	VPERM2I128 $0x02, Y5, Y9, Y12
  3648	VPERM2I128 $0x02, Y13, Y1, Y4
  3649	VPERM2I128 $0x13, Y5, Y9, Y5
  3650	VPERM2I128 $0x13, Y13, Y1, Y9
  3651	VPERM2I128 $0x02, Y6, Y10, Y13
  3652	VPERM2I128 $0x02, Y8, Y2, Y1
  3653	VPERM2I128 $0x13, Y6, Y10, Y6
  3654	VPERM2I128 $0x13, Y8, Y2, Y10
  3655	JMP        openAVX2ShortOpen
  3656
  3657openAVX2Tail128:
	// Tail of at most 128 bytes (BX). Builds one 2-block state in Y5/Y9/Y13/Y1
	// from the constants, the rows saved at 32(BP) and 64(BP), and the counter
	// row at 192(BP) advanced by avx2IncMask; Y4 keeps the counter row for the
	// add after the rounds. R9 is the running byte offset / round counter, and
	// CX is BX rounded down to a multiple of 16 (the bytes to hash inside the
	// round loop). With no full 16-byte block the hash step is skipped.
  3658	// Need to decrypt up to 128 bytes - prepare two blocks
  3659	VMOVDQA ·chacha20Constants<>+0(SB), Y5
  3660	VMOVDQA 32(BP), Y9
  3661	VMOVDQA 64(BP), Y13
  3662	VMOVDQA 192(BP), Y1
  3663	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y1
  3664	VMOVDQA Y1, Y4
  3665	XORQ    R9, R9
  3666	MOVQ    BX, CX
  3667	ANDQ    $-16, CX
  3668	TESTQ   CX, CX
  3669	JE      openAVX2Tail128LoopB
  3670
  3671openAVX2Tail128LoopA:
	// Hash the 16 ciphertext bytes at SI+R9 into the Poly1305 accumulator
	// R10:R11:R12 (multiply by the key limbs at (BP)/8(BP), then reduce), and
	// fall through into openAVX2Tail128LoopB for the next double round.
  3672	ADDQ  (SI)(R9*1), R10
  3673	ADCQ  8(SI)(R9*1), R11
  3674	ADCQ  $0x01, R12
  3675	MOVQ  (BP), DX
  3676	MOVQ  DX, R15
  3677	MULXQ R10, R13, R14
  3678	IMULQ R12, R15
  3679	MULXQ R11, AX, DX
  3680	ADDQ  AX, R14
  3681	ADCQ  DX, R15
  3682	MOVQ  8(BP), DX
  3683	MULXQ R10, R10, AX
  3684	ADDQ  R10, R14
  3685	MULXQ R11, R11, R8
  3686	ADCQ  R11, R15
  3687	ADCQ  $0x00, R8
  3688	IMULQ R12, DX
  3689	ADDQ  AX, R15
  3690	ADCQ  DX, R8
	// Reduce: keep the low 130 bits and add 4*c + c for the part c above bit 130.
  3691	MOVQ  R13, R10
  3692	MOVQ  R14, R11
  3693	MOVQ  R15, R12
  3694	ANDQ  $0x03, R12
  3695	MOVQ  R15, R13
  3696	ANDQ  $-4, R13
  3697	MOVQ  R8, R14
  3698	SHRQ  $0x02, R8, R15
  3699	SHRQ  $0x02, R8
  3700	ADDQ  R13, R10
  3701	ADCQ  R14, R11
  3702	ADCQ  $0x00, R12
  3703	ADDQ  R15, R10
  3704	ADCQ  R8, R11
  3705	ADCQ  $0x00, R12
  3706
  3707openAVX2Tail128LoopB:
	// One ChaCha20 double round on Y5/Y9/Y13/Y1 (Y3 is scratch). R9 advances by
	// 16 per iteration: while R9 < CX the loop goes back through LoopA to hash
	// another block; after that it repeats without hashing until R9 reaches
	// 0xa0 (160), i.e. 10 double rounds in total.
  3708	ADDQ       $0x10, R9
  3709	VPADDD     Y9, Y5, Y5
  3710	VPXOR      Y5, Y1, Y1
  3711	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  3712	VPADDD     Y1, Y13, Y13
  3713	VPXOR      Y13, Y9, Y9
  3714	VPSLLD     $0x0c, Y9, Y3
  3715	VPSRLD     $0x14, Y9, Y9
  3716	VPXOR      Y3, Y9, Y9
  3717	VPADDD     Y9, Y5, Y5
  3718	VPXOR      Y5, Y1, Y1
  3719	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  3720	VPADDD     Y1, Y13, Y13
  3721	VPXOR      Y13, Y9, Y9
  3722	VPSLLD     $0x07, Y9, Y3
  3723	VPSRLD     $0x19, Y9, Y9
  3724	VPXOR      Y3, Y9, Y9
  3725	VPALIGNR   $0x04, Y9, Y9, Y9
  3726	VPALIGNR   $0x08, Y13, Y13, Y13
  3727	VPALIGNR   $0x0c, Y1, Y1, Y1
  3728	VPADDD     Y9, Y5, Y5
  3729	VPXOR      Y5, Y1, Y1
  3730	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  3731	VPADDD     Y1, Y13, Y13
  3732	VPXOR      Y13, Y9, Y9
  3733	VPSLLD     $0x0c, Y9, Y3
  3734	VPSRLD     $0x14, Y9, Y9
  3735	VPXOR      Y3, Y9, Y9
  3736	VPADDD     Y9, Y5, Y5
  3737	VPXOR      Y5, Y1, Y1
  3738	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  3739	VPADDD     Y1, Y13, Y13
  3740	VPXOR      Y13, Y9, Y9
  3741	VPSLLD     $0x07, Y9, Y3
  3742	VPSRLD     $0x19, Y9, Y9
  3743	VPXOR      Y3, Y9, Y9
  3744	VPALIGNR   $0x0c, Y9, Y9, Y9
  3745	VPALIGNR   $0x08, Y13, Y13, Y13
  3746	VPALIGNR   $0x04, Y1, Y1, Y1
  3747	CMPQ       R9, CX
  3748	JB         openAVX2Tail128LoopA
  3749	CMPQ       R9, $0xa0
  3750	JNE        openAVX2Tail128LoopB
	// Add the starting rows back in (Y4 holds the saved counter row), then
	// arrange the 128 bytes of keystream in consumption order Y0, Y14, Y12, Y4
	// and fall through into openAVX2TailLoop.
  3751	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  3752	VPADDD     32(BP), Y9, Y9
  3753	VPADDD     64(BP), Y13, Y13
  3754	VPADDD     Y4, Y1, Y1
  3755	VPERM2I128 $0x02, Y5, Y9, Y0
  3756	VPERM2I128 $0x02, Y13, Y1, Y14
  3757	VPERM2I128 $0x13, Y5, Y9, Y12
  3758	VPERM2I128 $0x13, Y13, Y1, Y4
  3759
  3760openAVX2TailLoop:
	// While at least 32 bytes remain (BX): XOR 32 input bytes at SI with the
	// keystream in Y0, store at DI, advance both pointers, and shift the queued
	// keystream down (Y0 <- Y14 <- Y12 <- Y4). This loop does no hashing.
  3761	CMPQ BX, $0x20
  3762	JB   openAVX2Tail
  3763	SUBQ $0x20, BX
  3764
  3765	// Load for decryption
  3766	VPXOR   (SI), Y0, Y0
  3767	VMOVDQU Y0, (DI)
  3768	LEAQ    32(SI), SI
  3769	LEAQ    32(DI), DI
  3770	VMOVDQA Y14, Y0
  3771	VMOVDQA Y12, Y14
  3772	VMOVDQA Y4, Y12
  3773	JMP     openAVX2TailLoop
  3774
  3775openAVX2Tail:
	// Fewer than 32 bytes remain. If at least 16 remain, decrypt one 16-byte
	// block with the low lane of Y0 and move the high lane down. On both paths
	// X1 receives a copy of X0 before reaching openAVX2TailDone. The VMOVDQA
	// between CMPQ and JB does not modify the flags.
  3776	CMPQ    BX, $0x10
  3777	VMOVDQA X0, X1
  3778	JB      openAVX2TailDone
  3779	SUBQ    $0x10, BX
  3780
  3781	// Load for decryption
  3782	VPXOR      (SI), X0, X12
  3783	VMOVDQU    X12, (DI)
  3784	LEAQ       16(SI), SI
  3785	LEAQ       16(DI), DI
	// $0x11 copies the high 128-bit lane of Y0 into both lanes.
  3786	VPERM2I128 $0x11, Y0, Y0, Y0
  3787	VMOVDQA    X0, X1
  3788
  3789openAVX2TailDone:
	// Clear the upper halves of the YMM registers before continuing in the SSE
	// tail code at openSSETail16 (defined outside this chunk).
  3790	VZEROUPPER
  3791	JMP openSSETail16
  3792
  3793openAVX2Tail256:
	// Tail of at most 256 bytes (BX). Builds two 2-block states
	// (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1) from the constants, the rows saved at
	// 32(BP)/64(BP), and the counter row at 192(BP) advanced by one and two
	// avx2IncMask steps; Y7 and Y11 keep the counter rows for the add after the
	// rounds.
  3794	VMOVDQA ·chacha20Constants<>+0(SB), Y0
  3795	VMOVDQA Y0, Y5
  3796	VMOVDQA 32(BP), Y14
  3797	VMOVDQA Y14, Y9
  3798	VMOVDQA 64(BP), Y12
  3799	VMOVDQA Y12, Y13
  3800	VMOVDQA 192(BP), Y4
  3801	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
  3802	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  3803	VMOVDQA Y4, Y7
  3804	VMOVDQA Y1, Y11
  3805
	// The length is saved at 224(BP) because BX is reused below as the running
	// hash pointer (starting at SI). CX = min((BX - 128) / 16, 10) using a
	// signed compare; R9 is the double-round counter, starting at 0.
  3806	// Compute the number of iterations that will hash data
  3807	MOVQ    BX, 224(BP)
  3808	MOVQ    BX, CX
  3809	SUBQ    $0x80, CX
  3810	SHRQ    $0x04, CX
  3811	MOVQ    $0x0000000a, R9
  3812	CMPQ    CX, $0x0a
  3813	CMOVQGT R9, CX
  3814	MOVQ    SI, BX
  3815	XORQ    R9, R9
  3816
  3817openAVX2Tail256LoopA:
	// Hash the 16 ciphertext bytes at the pointer in BX into the Poly1305
	// accumulator R10:R11:R12 (multiply by the key limbs at (BP)/8(BP), then
	// reduce), advance BX by 16, and fall through into openAVX2Tail256LoopB.
  3818	ADDQ  (BX), R10
  3819	ADCQ  8(BX), R11
  3820	ADCQ  $0x01, R12
  3821	MOVQ  (BP), DX
  3822	MOVQ  DX, R15
  3823	MULXQ R10, R13, R14
  3824	IMULQ R12, R15
  3825	MULXQ R11, AX, DX
  3826	ADDQ  AX, R14
  3827	ADCQ  DX, R15
  3828	MOVQ  8(BP), DX
  3829	MULXQ R10, R10, AX
  3830	ADDQ  R10, R14
  3831	MULXQ R11, R11, R8
  3832	ADCQ  R11, R15
  3833	ADCQ  $0x00, R8
  3834	IMULQ R12, DX
  3835	ADDQ  AX, R15
  3836	ADCQ  DX, R8
	// Reduce: keep the low 130 bits and add 4*c + c for the part c above bit 130.
  3837	MOVQ  R13, R10
  3838	MOVQ  R14, R11
  3839	MOVQ  R15, R12
  3840	ANDQ  $0x03, R12
  3841	MOVQ  R15, R13
  3842	ANDQ  $-4, R13
  3843	MOVQ  R8, R14
  3844	SHRQ  $0x02, R8, R15
  3845	SHRQ  $0x02, R8
  3846	ADDQ  R13, R10
  3847	ADCQ  R14, R11
  3848	ADCQ  $0x00, R12
  3849	ADDQ  R15, R10
  3850	ADCQ  R8, R11
  3851	ADCQ  $0x00, R12
  3852	LEAQ  16(BX), BX
  3853
  3854openAVX2Tail256LoopB:
	// One ChaCha20 double round on both states (Y3 is scratch); R9 is
	// incremented once per iteration. While R9 < CX the loop goes back through
	// LoopA to hash another block; after that it repeats without hashing until
	// R9 reaches 10.
  3855	VPADDD   Y14, Y0, Y0
  3856	VPXOR    Y0, Y4, Y4
  3857	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  3858	VPADDD   Y4, Y12, Y12
  3859	VPXOR    Y12, Y14, Y14
  3860	VPSLLD   $0x0c, Y14, Y3
  3861	VPSRLD   $0x14, Y14, Y14
  3862	VPXOR    Y3, Y14, Y14
  3863	VPADDD   Y14, Y0, Y0
  3864	VPXOR    Y0, Y4, Y4
  3865	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  3866	VPADDD   Y4, Y12, Y12
  3867	VPXOR    Y12, Y14, Y14
  3868	VPSLLD   $0x07, Y14, Y3
  3869	VPSRLD   $0x19, Y14, Y14
  3870	VPXOR    Y3, Y14, Y14
  3871	VPADDD   Y9, Y5, Y5
  3872	VPXOR    Y5, Y1, Y1
  3873	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  3874	VPADDD   Y1, Y13, Y13
  3875	VPXOR    Y13, Y9, Y9
  3876	VPSLLD   $0x0c, Y9, Y3
  3877	VPSRLD   $0x14, Y9, Y9
  3878	VPXOR    Y3, Y9, Y9
  3879	VPADDD   Y9, Y5, Y5
  3880	VPXOR    Y5, Y1, Y1
  3881	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  3882	VPADDD   Y1, Y13, Y13
  3883	VPXOR    Y13, Y9, Y9
  3884	VPSLLD   $0x07, Y9, Y3
  3885	VPSRLD   $0x19, Y9, Y9
  3886	VPXOR    Y3, Y9, Y9
	// Rotate rows 2-4 by 4, 8 and 12 bytes per lane for the diagonal half.
  3887	VPALIGNR $0x04, Y14, Y14, Y14
  3888	VPALIGNR $0x04, Y9, Y9, Y9
  3889	VPALIGNR $0x08, Y12, Y12, Y12
  3890	VPALIGNR $0x08, Y13, Y13, Y13
  3891	VPALIGNR $0x0c, Y4, Y4, Y4
  3892	VPALIGNR $0x0c, Y1, Y1, Y1
  3893	INCQ     R9
  3894	VPADDD   Y14, Y0, Y0
  3895	VPXOR    Y0, Y4, Y4
  3896	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  3897	VPADDD   Y4, Y12, Y12
  3898	VPXOR    Y12, Y14, Y14
  3899	VPSLLD   $0x0c, Y14, Y3
  3900	VPSRLD   $0x14, Y14, Y14
  3901	VPXOR    Y3, Y14, Y14
  3902	VPADDD   Y14, Y0, Y0
  3903	VPXOR    Y0, Y4, Y4
  3904	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  3905	VPADDD   Y4, Y12, Y12
  3906	VPXOR    Y12, Y14, Y14
  3907	VPSLLD   $0x07, Y14, Y3
  3908	VPSRLD   $0x19, Y14, Y14
  3909	VPXOR    Y3, Y14, Y14
  3910	VPADDD   Y9, Y5, Y5
  3911	VPXOR    Y5, Y1, Y1
  3912	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  3913	VPADDD   Y1, Y13, Y13
  3914	VPXOR    Y13, Y9, Y9
  3915	VPSLLD   $0x0c, Y9, Y3
  3916	VPSRLD   $0x14, Y9, Y9
  3917	VPXOR    Y3, Y9, Y9
  3918	VPADDD   Y9, Y5, Y5
  3919	VPXOR    Y5, Y1, Y1
  3920	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  3921	VPADDD   Y1, Y13, Y13
  3922	VPXOR    Y13, Y9, Y9
  3923	VPSLLD   $0x07, Y9, Y3
  3924	VPSRLD   $0x19, Y9, Y9
  3925	VPXOR    Y3, Y9, Y9
	// Rotate the rows back to their original positions.
  3926	VPALIGNR $0x0c, Y14, Y14, Y14
  3927	VPALIGNR $0x0c, Y9, Y9, Y9
  3928	VPALIGNR $0x08, Y12, Y12, Y12
  3929	VPALIGNR $0x08, Y13, Y13, Y13
  3930	VPALIGNR $0x04, Y4, Y4, Y4
  3931	VPALIGNR $0x04, Y1, Y1, Y1
  3932	CMPQ     R9, CX
  3933	JB       openAVX2Tail256LoopA
  3934	CMPQ     R9, $0x0a
  3935	JNE      openAVX2Tail256LoopB
	// Rounds done. R9 = pointer to the next unhashed byte, CX = number of bytes
	// hashed so far (pointer minus SI), BX = remaining length restored from
	// 224(BP).
  3936	MOVQ     BX, R9
  3937	SUBQ     SI, BX
  3938	MOVQ     BX, CX
  3939	MOVQ     224(BP), BX
  3940
  3941openAVX2Tail256Hash:
	// Hash the full 16-byte blocks that the round loop did not cover. CX is the
	// count of bytes hashed so far and is bumped by 16 before the check; the
	// loop stops once CX exceeds the length in BX (signed compare). R9 is the
	// read pointer and advances by 16 per block.
  3942	ADDQ  $0x10, CX
  3943	CMPQ  CX, BX
  3944	JGT   openAVX2Tail256HashEnd
  3945	ADDQ  (R9), R10
  3946	ADCQ  8(R9), R11
  3947	ADCQ  $0x01, R12
  3948	MOVQ  (BP), DX
  3949	MOVQ  DX, R15
  3950	MULXQ R10, R13, R14
  3951	IMULQ R12, R15
  3952	MULXQ R11, AX, DX
  3953	ADDQ  AX, R14
  3954	ADCQ  DX, R15
  3955	MOVQ  8(BP), DX
  3956	MULXQ R10, R10, AX
  3957	ADDQ  R10, R14
  3958	MULXQ R11, R11, R8
  3959	ADCQ  R11, R15
  3960	ADCQ  $0x00, R8
  3961	IMULQ R12, DX
  3962	ADDQ  AX, R15
  3963	ADCQ  DX, R8
	// Reduce: keep the low 130 bits and add 4*c + c for the part c above bit 130.
  3964	MOVQ  R13, R10
  3965	MOVQ  R14, R11
  3966	MOVQ  R15, R12
  3967	ANDQ  $0x03, R12
  3968	MOVQ  R15, R13
  3969	ANDQ  $-4, R13
  3970	MOVQ  R8, R14
  3971	SHRQ  $0x02, R8, R15
  3972	SHRQ  $0x02, R8
  3973	ADDQ  R13, R10
  3974	ADCQ  R14, R11
  3975	ADCQ  $0x00, R12
  3976	ADDQ  R15, R10
  3977	ADCQ  R8, R11
  3978	ADCQ  $0x00, R12
  3979	LEAQ  16(R9), R9
  3980	JMP   openAVX2Tail256Hash
  3981
  3982openAVX2Tail256HashEnd:
	// Add the starting rows back in (constants, 32(BP), 64(BP), and the counter
	// rows saved in Y7 and Y11). The first state's 128 bytes of keystream are
	// gathered into Y6, Y10, Y8, Y2, XORed with the input at SI and stored to
	// DI. The second state's 128 bytes are left in Y0, Y14, Y12, Y4 for
	// openAVX2TailLoop, which handles whatever remains after BX -= 128.
  3983	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
  3984	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  3985	VPADDD     32(BP), Y14, Y14
  3986	VPADDD     32(BP), Y9, Y9
  3987	VPADDD     64(BP), Y12, Y12
  3988	VPADDD     64(BP), Y13, Y13
  3989	VPADDD     Y7, Y4, Y4
  3990	VPADDD     Y11, Y1, Y1
  3991	VPERM2I128 $0x02, Y0, Y14, Y6
  3992	VPERM2I128 $0x02, Y12, Y4, Y10
  3993	VPERM2I128 $0x13, Y0, Y14, Y8
  3994	VPERM2I128 $0x13, Y12, Y4, Y2
  3995	VPERM2I128 $0x02, Y5, Y9, Y0
  3996	VPERM2I128 $0x02, Y13, Y1, Y14
  3997	VPERM2I128 $0x13, Y5, Y9, Y12
  3998	VPERM2I128 $0x13, Y13, Y1, Y4
  3999	VPXOR      (SI), Y6, Y6
  4000	VPXOR      32(SI), Y10, Y10
  4001	VPXOR      64(SI), Y8, Y8
  4002	VPXOR      96(SI), Y2, Y2
  4003	VMOVDQU    Y6, (DI)
  4004	VMOVDQU    Y10, 32(DI)
  4005	VMOVDQU    Y8, 64(DI)
  4006	VMOVDQU    Y2, 96(DI)
  4007	LEAQ       128(SI), SI
  4008	LEAQ       128(DI), DI
  4009	SUBQ       $0x80, BX
  4010	JMP        openAVX2TailLoop
  4011
  4012openAVX2Tail384:
  4013	// Need to decrypt up to 384 bytes - prepare six blocks
  4014	VMOVDQA ·chacha20Constants<>+0(SB), Y0
  4015	VMOVDQA Y0, Y5
  4016	VMOVDQA Y0, Y6
  4017	VMOVDQA 32(BP), Y14
  4018	VMOVDQA Y14, Y9
  4019	VMOVDQA Y14, Y10
  4020	VMOVDQA 64(BP), Y12
  4021	VMOVDQA Y12, Y13
  4022	VMOVDQA Y12, Y8
  4023	VMOVDQA 192(BP), Y4
  4024	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
  4025	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  4026	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  4027	VMOVDQA Y4, 96(BP)
  4028	VMOVDQA Y1, 128(BP)
  4029	VMOVDQA Y2, 160(BP)
  4030
  4031	// Compute the number of iterations that will hash two blocks of data
  4032	MOVQ    BX, 224(BP)
  4033	MOVQ    BX, CX
  4034	SUBQ    $0x00000100, CX
  4035	SHRQ    $0x04, CX
  4036	ADDQ    $0x06, CX
  4037	MOVQ    $0x0000000a, R9
  4038	CMPQ    CX, $0x0a
  4039	CMOVQGT R9, CX
  4040	MOVQ    SI, BX
  4041	XORQ    R9, R9
  4042
  4043openAVX2Tail384LoopB:
  4044	ADDQ  (BX), R10
  4045	ADCQ  8(BX), R11
  4046	ADCQ  $0x01, R12
  4047	MOVQ  (BP), DX
  4048	MOVQ  DX, R15
  4049	MULXQ R10, R13, R14
  4050	IMULQ R12, R15
  4051	MULXQ R11, AX, DX
  4052	ADDQ  AX, R14
  4053	ADCQ  DX, R15
  4054	MOVQ  8(BP), DX
  4055	MULXQ R10, R10, AX
  4056	ADDQ  R10, R14
  4057	MULXQ R11, R11, R8
  4058	ADCQ  R11, R15
  4059	ADCQ  $0x00, R8
  4060	IMULQ R12, DX
  4061	ADDQ  AX, R15
  4062	ADCQ  DX, R8
  4063	MOVQ  R13, R10
  4064	MOVQ  R14, R11
  4065	MOVQ  R15, R12
  4066	ANDQ  $0x03, R12
  4067	MOVQ  R15, R13
  4068	ANDQ  $-4, R13
  4069	MOVQ  R8, R14
  4070	SHRQ  $0x02, R8, R15
  4071	SHRQ  $0x02, R8
  4072	ADDQ  R13, R10
  4073	ADCQ  R14, R11
  4074	ADCQ  $0x00, R12
  4075	ADDQ  R15, R10
  4076	ADCQ  R8, R11
  4077	ADCQ  $0x00, R12
  4078	LEAQ  16(BX), BX
  4079
  4080openAVX2Tail384LoopA:
  4081	VPADDD   Y14, Y0, Y0
  4082	VPXOR    Y0, Y4, Y4
  4083	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  4084	VPADDD   Y4, Y12, Y12
  4085	VPXOR    Y12, Y14, Y14
  4086	VPSLLD   $0x0c, Y14, Y3
  4087	VPSRLD   $0x14, Y14, Y14
  4088	VPXOR    Y3, Y14, Y14
  4089	VPADDD   Y14, Y0, Y0
  4090	VPXOR    Y0, Y4, Y4
  4091	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  4092	VPADDD   Y4, Y12, Y12
  4093	VPXOR    Y12, Y14, Y14
  4094	VPSLLD   $0x07, Y14, Y3
  4095	VPSRLD   $0x19, Y14, Y14
  4096	VPXOR    Y3, Y14, Y14
  4097	VPADDD   Y9, Y5, Y5
  4098	VPXOR    Y5, Y1, Y1
  4099	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  4100	VPADDD   Y1, Y13, Y13
  4101	VPXOR    Y13, Y9, Y9
  4102	VPSLLD   $0x0c, Y9, Y3
  4103	VPSRLD   $0x14, Y9, Y9
  4104	VPXOR    Y3, Y9, Y9
  4105	VPADDD   Y9, Y5, Y5
  4106	VPXOR    Y5, Y1, Y1
  4107	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  4108	VPADDD   Y1, Y13, Y13
  4109	VPXOR    Y13, Y9, Y9
  4110	VPSLLD   $0x07, Y9, Y3
  4111	VPSRLD   $0x19, Y9, Y9
  4112	VPXOR    Y3, Y9, Y9
  4113	VPADDD   Y10, Y6, Y6
  4114	VPXOR    Y6, Y2, Y2
  4115	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  4116	VPADDD   Y2, Y8, Y8
  4117	VPXOR    Y8, Y10, Y10
  4118	VPSLLD   $0x0c, Y10, Y3
  4119	VPSRLD   $0x14, Y10, Y10
  4120	VPXOR    Y3, Y10, Y10
  4121	VPADDD   Y10, Y6, Y6
  4122	VPXOR    Y6, Y2, Y2
  4123	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  4124	VPADDD   Y2, Y8, Y8
  4125	VPXOR    Y8, Y10, Y10
  4126	VPSLLD   $0x07, Y10, Y3
  4127	VPSRLD   $0x19, Y10, Y10
  4128	VPXOR    Y3, Y10, Y10
  4129	VPALIGNR $0x04, Y14, Y14, Y14
  4130	VPALIGNR $0x04, Y9, Y9, Y9
  4131	VPALIGNR $0x04, Y10, Y10, Y10
  4132	VPALIGNR $0x08, Y12, Y12, Y12
  4133	VPALIGNR $0x08, Y13, Y13, Y13
  4134	VPALIGNR $0x08, Y8, Y8, Y8
  4135	VPALIGNR $0x0c, Y4, Y4, Y4
  4136	VPALIGNR $0x0c, Y1, Y1, Y1
  4137	VPALIGNR $0x0c, Y2, Y2, Y2
  4138	ADDQ     (BX), R10
  4139	ADCQ     8(BX), R11
  4140	ADCQ     $0x01, R12
  4141	MOVQ     (BP), DX
  4142	MOVQ     DX, R15
  4143	MULXQ    R10, R13, R14
  4144	IMULQ    R12, R15
  4145	MULXQ    R11, AX, DX
  4146	ADDQ     AX, R14
  4147	ADCQ     DX, R15
  4148	MOVQ     8(BP), DX
  4149	MULXQ    R10, R10, AX
  4150	ADDQ     R10, R14
  4151	MULXQ    R11, R11, R8
  4152	ADCQ     R11, R15
  4153	ADCQ     $0x00, R8
  4154	IMULQ    R12, DX
  4155	ADDQ     AX, R15
  4156	ADCQ     DX, R8
  4157	MOVQ     R13, R10
  4158	MOVQ     R14, R11
  4159	MOVQ     R15, R12
  4160	ANDQ     $0x03, R12
  4161	MOVQ     R15, R13
  4162	ANDQ     $-4, R13
  4163	MOVQ     R8, R14
  4164	SHRQ     $0x02, R8, R15
  4165	SHRQ     $0x02, R8
  4166	ADDQ     R13, R10
  4167	ADCQ     R14, R11
  4168	ADCQ     $0x00, R12
  4169	ADDQ     R15, R10
  4170	ADCQ     R8, R11
  4171	ADCQ     $0x00, R12
  4172	LEAQ     16(BX), BX
  4173	INCQ     R9
  4174	VPADDD   Y14, Y0, Y0
  4175	VPXOR    Y0, Y4, Y4
  4176	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  4177	VPADDD   Y4, Y12, Y12
  4178	VPXOR    Y12, Y14, Y14
  4179	VPSLLD   $0x0c, Y14, Y3
  4180	VPSRLD   $0x14, Y14, Y14
  4181	VPXOR    Y3, Y14, Y14
  4182	VPADDD   Y14, Y0, Y0
  4183	VPXOR    Y0, Y4, Y4
  4184	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  4185	VPADDD   Y4, Y12, Y12
  4186	VPXOR    Y12, Y14, Y14
  4187	VPSLLD   $0x07, Y14, Y3
  4188	VPSRLD   $0x19, Y14, Y14
  4189	VPXOR    Y3, Y14, Y14
  4190	VPADDD   Y9, Y5, Y5
  4191	VPXOR    Y5, Y1, Y1
  4192	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  4193	VPADDD   Y1, Y13, Y13
  4194	VPXOR    Y13, Y9, Y9
  4195	VPSLLD   $0x0c, Y9, Y3
  4196	VPSRLD   $0x14, Y9, Y9
  4197	VPXOR    Y3, Y9, Y9
  4198	VPADDD   Y9, Y5, Y5
  4199	VPXOR    Y5, Y1, Y1
  4200	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  4201	VPADDD   Y1, Y13, Y13
  4202	VPXOR    Y13, Y9, Y9
  4203	VPSLLD   $0x07, Y9, Y3
  4204	VPSRLD   $0x19, Y9, Y9
  4205	VPXOR    Y3, Y9, Y9
  4206	VPADDD   Y10, Y6, Y6
  4207	VPXOR    Y6, Y2, Y2
  4208	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  4209	VPADDD   Y2, Y8, Y8
  4210	VPXOR    Y8, Y10, Y10
  4211	VPSLLD   $0x0c, Y10, Y3
  4212	VPSRLD   $0x14, Y10, Y10
  4213	VPXOR    Y3, Y10, Y10
  4214	VPADDD   Y10, Y6, Y6
  4215	VPXOR    Y6, Y2, Y2
  4216	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  4217	VPADDD   Y2, Y8, Y8
  4218	VPXOR    Y8, Y10, Y10
  4219	VPSLLD   $0x07, Y10, Y3
  4220	VPSRLD   $0x19, Y10, Y10
  4221	VPXOR    Y3, Y10, Y10
  4222	VPALIGNR $0x0c, Y14, Y14, Y14
  4223	VPALIGNR $0x0c, Y9, Y9, Y9
  4224	VPALIGNR $0x0c, Y10, Y10, Y10
  4225	VPALIGNR $0x08, Y12, Y12, Y12
  4226	VPALIGNR $0x08, Y13, Y13, Y13
  4227	VPALIGNR $0x08, Y8, Y8, Y8
  4228	VPALIGNR $0x04, Y4, Y4, Y4
  4229	VPALIGNR $0x04, Y1, Y1, Y1
  4230	VPALIGNR $0x04, Y2, Y2, Y2
  4231	CMPQ     R9, CX
  4232	JB       openAVX2Tail384LoopB
  4233	CMPQ     R9, $0x0a
  4234	JNE      openAVX2Tail384LoopA
  4235	MOVQ     BX, R9
  4236	SUBQ     SI, BX
  4237	MOVQ     BX, CX
  4238	MOVQ     224(BP), BX
  4239
  4240openAVX2Tail384Hash:
  4241	ADDQ  $0x10, CX
  4242	CMPQ  CX, BX
  4243	JGT   openAVX2Tail384HashEnd
  4244	ADDQ  (R9), R10
  4245	ADCQ  8(R9), R11
  4246	ADCQ  $0x01, R12
  4247	MOVQ  (BP), DX
  4248	MOVQ  DX, R15
  4249	MULXQ R10, R13, R14
  4250	IMULQ R12, R15
  4251	MULXQ R11, AX, DX
  4252	ADDQ  AX, R14
  4253	ADCQ  DX, R15
  4254	MOVQ  8(BP), DX
  4255	MULXQ R10, R10, AX
  4256	ADDQ  R10, R14
  4257	MULXQ R11, R11, R8
  4258	ADCQ  R11, R15
  4259	ADCQ  $0x00, R8
  4260	IMULQ R12, DX
  4261	ADDQ  AX, R15
  4262	ADCQ  DX, R8
  4263	MOVQ  R13, R10
  4264	MOVQ  R14, R11
  4265	MOVQ  R15, R12
  4266	ANDQ  $0x03, R12
  4267	MOVQ  R15, R13
  4268	ANDQ  $-4, R13
  4269	MOVQ  R8, R14
  4270	SHRQ  $0x02, R8, R15
  4271	SHRQ  $0x02, R8
  4272	ADDQ  R13, R10
  4273	ADCQ  R14, R11
  4274	ADCQ  $0x00, R12
  4275	ADDQ  R15, R10
  4276	ADCQ  R8, R11
  4277	ADCQ  $0x00, R12
  4278	LEAQ  16(R9), R9
  4279	JMP   openAVX2Tail384Hash
  4280
  4281openAVX2Tail384HashEnd:
  4282	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
  4283	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  4284	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
  4285	VPADDD     32(BP), Y14, Y14
  4286	VPADDD     32(BP), Y9, Y9
  4287	VPADDD     32(BP), Y10, Y10
  4288	VPADDD     64(BP), Y12, Y12
  4289	VPADDD     64(BP), Y13, Y13
  4290	VPADDD     64(BP), Y8, Y8
  4291	VPADDD     96(BP), Y4, Y4
  4292	VPADDD     128(BP), Y1, Y1
  4293	VPADDD     160(BP), Y2, Y2
  4294	VPERM2I128 $0x02, Y0, Y14, Y3
  4295	VPERM2I128 $0x02, Y12, Y4, Y7
  4296	VPERM2I128 $0x13, Y0, Y14, Y11
  4297	VPERM2I128 $0x13, Y12, Y4, Y15
  4298	VPXOR      (SI), Y3, Y3
  4299	VPXOR      32(SI), Y7, Y7
  4300	VPXOR      64(SI), Y11, Y11
  4301	VPXOR      96(SI), Y15, Y15
  4302	VMOVDQU    Y3, (DI)
  4303	VMOVDQU    Y7, 32(DI)
  4304	VMOVDQU    Y11, 64(DI)
  4305	VMOVDQU    Y15, 96(DI)
  4306	VPERM2I128 $0x02, Y5, Y9, Y3
  4307	VPERM2I128 $0x02, Y13, Y1, Y7
  4308	VPERM2I128 $0x13, Y5, Y9, Y11
  4309	VPERM2I128 $0x13, Y13, Y1, Y15
  4310	VPXOR      128(SI), Y3, Y3
  4311	VPXOR      160(SI), Y7, Y7
  4312	VPXOR      192(SI), Y11, Y11
  4313	VPXOR      224(SI), Y15, Y15
  4314	VMOVDQU    Y3, 128(DI)
  4315	VMOVDQU    Y7, 160(DI)
  4316	VMOVDQU    Y11, 192(DI)
  4317	VMOVDQU    Y15, 224(DI)
  4318	VPERM2I128 $0x02, Y6, Y10, Y0
  4319	VPERM2I128 $0x02, Y8, Y2, Y14
  4320	VPERM2I128 $0x13, Y6, Y10, Y12
  4321	VPERM2I128 $0x13, Y8, Y2, Y4
  4322	LEAQ       256(SI), SI
  4323	LEAQ       256(DI), DI
  4324	SUBQ       $0x00000100, BX
  4325	JMP        openAVX2TailLoop
  4326
  4327openAVX2Tail512:
  4328	VMOVDQU ·chacha20Constants<>+0(SB), Y0
  4329	VMOVDQA Y0, Y5
  4330	VMOVDQA Y0, Y6
  4331	VMOVDQA Y0, Y7
  4332	VMOVDQA 32(BP), Y14
  4333	VMOVDQA Y14, Y9
  4334	VMOVDQA Y14, Y10
  4335	VMOVDQA Y14, Y11
  4336	VMOVDQA 64(BP), Y12
  4337	VMOVDQA Y12, Y13
  4338	VMOVDQA Y12, Y8
  4339	VMOVDQA Y12, Y15
  4340	VMOVDQA 192(BP), Y4
  4341	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
  4342	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  4343	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  4344	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
  4345	VMOVDQA Y4, 96(BP)
  4346	VMOVDQA Y1, 128(BP)
  4347	VMOVDQA Y2, 160(BP)
  4348	VMOVDQA Y3, 192(BP)
  4349	XORQ    CX, CX
  4350	MOVQ    SI, R9
  4351
  4352openAVX2Tail512LoopB:
  4353	ADDQ  (R9), R10
  4354	ADCQ  8(R9), R11
  4355	ADCQ  $0x01, R12
  4356	MOVQ  (BP), DX
  4357	MOVQ  DX, R15
  4358	MULXQ R10, R13, R14
  4359	IMULQ R12, R15
  4360	MULXQ R11, AX, DX
  4361	ADDQ  AX, R14
  4362	ADCQ  DX, R15
  4363	MOVQ  8(BP), DX
  4364	MULXQ R10, R10, AX
  4365	ADDQ  R10, R14
  4366	MULXQ R11, R11, R8
  4367	ADCQ  R11, R15
  4368	ADCQ  $0x00, R8
  4369	IMULQ R12, DX
  4370	ADDQ  AX, R15
  4371	ADCQ  DX, R8
  4372	MOVQ  R13, R10
  4373	MOVQ  R14, R11
  4374	MOVQ  R15, R12
  4375	ANDQ  $0x03, R12
  4376	MOVQ  R15, R13
  4377	ANDQ  $-4, R13
  4378	MOVQ  R8, R14
  4379	SHRQ  $0x02, R8, R15
  4380	SHRQ  $0x02, R8
  4381	ADDQ  R13, R10
  4382	ADCQ  R14, R11
  4383	ADCQ  $0x00, R12
  4384	ADDQ  R15, R10
  4385	ADCQ  R8, R11
  4386	ADCQ  $0x00, R12
  4387	LEAQ  16(R9), R9
  4388
  4389openAVX2Tail512LoopA:
  4390	VPADDD   Y14, Y0, Y0
  4391	VPADDD   Y9, Y5, Y5
  4392	VPADDD   Y10, Y6, Y6
  4393	VPADDD   Y11, Y7, Y7
  4394	VPXOR    Y0, Y4, Y4
  4395	VPXOR    Y5, Y1, Y1
  4396	VPXOR    Y6, Y2, Y2
  4397	VPXOR    Y7, Y3, Y3
  4398	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  4399	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  4400	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  4401	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  4402	VPADDD   Y4, Y12, Y12
  4403	VPADDD   Y1, Y13, Y13
  4404	VPADDD   Y2, Y8, Y8
  4405	VPADDD   Y3, Y15, Y15
  4406	VPXOR    Y12, Y14, Y14
  4407	VPXOR    Y13, Y9, Y9
  4408	VPXOR    Y8, Y10, Y10
  4409	VPXOR    Y15, Y11, Y11
  4410	VMOVDQA  Y15, 224(BP)
  4411	VPSLLD   $0x0c, Y14, Y15
  4412	VPSRLD   $0x14, Y14, Y14
  4413	VPXOR    Y15, Y14, Y14
  4414	VPSLLD   $0x0c, Y9, Y15
  4415	VPSRLD   $0x14, Y9, Y9
  4416	VPXOR    Y15, Y9, Y9
  4417	VPSLLD   $0x0c, Y10, Y15
  4418	VPSRLD   $0x14, Y10, Y10
  4419	VPXOR    Y15, Y10, Y10
  4420	VPSLLD   $0x0c, Y11, Y15
  4421	VPSRLD   $0x14, Y11, Y11
  4422	VPXOR    Y15, Y11, Y11
  4423	VMOVDQA  224(BP), Y15
  4424	ADDQ     (R9), R10
  4425	ADCQ     8(R9), R11
  4426	ADCQ     $0x01, R12
  4427	MOVQ     (BP), DX
  4428	MOVQ     DX, R15
  4429	MULXQ    R10, R13, R14
  4430	IMULQ    R12, R15
  4431	MULXQ    R11, AX, DX
  4432	ADDQ     AX, R14
  4433	ADCQ     DX, R15
  4434	MOVQ     8(BP), DX
  4435	MULXQ    R10, R10, AX
  4436	ADDQ     R10, R14
  4437	MULXQ    R11, R11, R8
  4438	ADCQ     R11, R15
  4439	ADCQ     $0x00, R8
  4440	IMULQ    R12, DX
  4441	ADDQ     AX, R15
  4442	ADCQ     DX, R8
  4443	MOVQ     R13, R10
  4444	MOVQ     R14, R11
  4445	MOVQ     R15, R12
  4446	ANDQ     $0x03, R12
  4447	MOVQ     R15, R13
  4448	ANDQ     $-4, R13
  4449	MOVQ     R8, R14
  4450	SHRQ     $0x02, R8, R15
  4451	SHRQ     $0x02, R8
  4452	ADDQ     R13, R10
  4453	ADCQ     R14, R11
  4454	ADCQ     $0x00, R12
  4455	ADDQ     R15, R10
  4456	ADCQ     R8, R11
  4457	ADCQ     $0x00, R12
  4458	VPADDD   Y14, Y0, Y0
  4459	VPADDD   Y9, Y5, Y5
  4460	VPADDD   Y10, Y6, Y6
  4461	VPADDD   Y11, Y7, Y7
  4462	VPXOR    Y0, Y4, Y4
  4463	VPXOR    Y5, Y1, Y1
  4464	VPXOR    Y6, Y2, Y2
  4465	VPXOR    Y7, Y3, Y3
  4466	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  4467	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  4468	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  4469	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  4470	VPADDD   Y4, Y12, Y12
  4471	VPADDD   Y1, Y13, Y13
  4472	VPADDD   Y2, Y8, Y8
  4473	VPADDD   Y3, Y15, Y15
  4474	VPXOR    Y12, Y14, Y14
  4475	VPXOR    Y13, Y9, Y9
  4476	VPXOR    Y8, Y10, Y10
  4477	VPXOR    Y15, Y11, Y11
  4478	VMOVDQA  Y15, 224(BP)
  4479	VPSLLD   $0x07, Y14, Y15
  4480	VPSRLD   $0x19, Y14, Y14
  4481	VPXOR    Y15, Y14, Y14
  4482	VPSLLD   $0x07, Y9, Y15
  4483	VPSRLD   $0x19, Y9, Y9
  4484	VPXOR    Y15, Y9, Y9
  4485	VPSLLD   $0x07, Y10, Y15
  4486	VPSRLD   $0x19, Y10, Y10
  4487	VPXOR    Y15, Y10, Y10
  4488	VPSLLD   $0x07, Y11, Y15
  4489	VPSRLD   $0x19, Y11, Y11
  4490	VPXOR    Y15, Y11, Y11
  4491	VMOVDQA  224(BP), Y15
  4492	VPALIGNR $0x04, Y14, Y14, Y14
  4493	VPALIGNR $0x04, Y9, Y9, Y9
  4494	VPALIGNR $0x04, Y10, Y10, Y10
  4495	VPALIGNR $0x04, Y11, Y11, Y11
  4496	VPALIGNR $0x08, Y12, Y12, Y12
  4497	VPALIGNR $0x08, Y13, Y13, Y13
  4498	VPALIGNR $0x08, Y8, Y8, Y8
  4499	VPALIGNR $0x08, Y15, Y15, Y15
  4500	VPALIGNR $0x0c, Y4, Y4, Y4
  4501	VPALIGNR $0x0c, Y1, Y1, Y1
  4502	VPALIGNR $0x0c, Y2, Y2, Y2
  4503	VPALIGNR $0x0c, Y3, Y3, Y3
  4504	VPADDD   Y14, Y0, Y0
  4505	VPADDD   Y9, Y5, Y5
  4506	VPADDD   Y10, Y6, Y6
  4507	VPADDD   Y11, Y7, Y7
  4508	VPXOR    Y0, Y4, Y4
  4509	VPXOR    Y5, Y1, Y1
  4510	VPXOR    Y6, Y2, Y2
  4511	VPXOR    Y7, Y3, Y3
  4512	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  4513	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  4514	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  4515	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  4516	VPADDD   Y4, Y12, Y12
  4517	VPADDD   Y1, Y13, Y13
  4518	VPADDD   Y2, Y8, Y8
  4519	VPADDD   Y3, Y15, Y15
  4520	VPXOR    Y12, Y14, Y14
  4521	VPXOR    Y13, Y9, Y9
  4522	VPXOR    Y8, Y10, Y10
  4523	VPXOR    Y15, Y11, Y11
  4524	ADDQ     16(R9), R10
  4525	ADCQ     24(R9), R11
  4526	ADCQ     $0x01, R12
  4527	MOVQ     (BP), DX
  4528	MOVQ     DX, R15
  4529	MULXQ    R10, R13, R14
  4530	IMULQ    R12, R15
  4531	MULXQ    R11, AX, DX
  4532	ADDQ     AX, R14
  4533	ADCQ     DX, R15
  4534	MOVQ     8(BP), DX
  4535	MULXQ    R10, R10, AX
  4536	ADDQ     R10, R14
  4537	MULXQ    R11, R11, R8
  4538	ADCQ     R11, R15
  4539	ADCQ     $0x00, R8
  4540	IMULQ    R12, DX
  4541	ADDQ     AX, R15
  4542	ADCQ     DX, R8
  4543	MOVQ     R13, R10
  4544	MOVQ     R14, R11
  4545	MOVQ     R15, R12
  4546	ANDQ     $0x03, R12
  4547	MOVQ     R15, R13
  4548	ANDQ     $-4, R13
  4549	MOVQ     R8, R14
  4550	SHRQ     $0x02, R8, R15
  4551	SHRQ     $0x02, R8
  4552	ADDQ     R13, R10
  4553	ADCQ     R14, R11
  4554	ADCQ     $0x00, R12
  4555	ADDQ     R15, R10
  4556	ADCQ     R8, R11
  4557	ADCQ     $0x00, R12
  4558	LEAQ     32(R9), R9
  4559	VMOVDQA  Y15, 224(BP)
  4560	VPSLLD   $0x0c, Y14, Y15
  4561	VPSRLD   $0x14, Y14, Y14
  4562	VPXOR    Y15, Y14, Y14
  4563	VPSLLD   $0x0c, Y9, Y15
  4564	VPSRLD   $0x14, Y9, Y9
  4565	VPXOR    Y15, Y9, Y9
  4566	VPSLLD   $0x0c, Y10, Y15
  4567	VPSRLD   $0x14, Y10, Y10
  4568	VPXOR    Y15, Y10, Y10
  4569	VPSLLD   $0x0c, Y11, Y15
  4570	VPSRLD   $0x14, Y11, Y11
  4571	VPXOR    Y15, Y11, Y11
  4572	VMOVDQA  224(BP), Y15
  4573	VPADDD   Y14, Y0, Y0
  4574	VPADDD   Y9, Y5, Y5
  4575	VPADDD   Y10, Y6, Y6
  4576	VPADDD   Y11, Y7, Y7
  4577	VPXOR    Y0, Y4, Y4
  4578	VPXOR    Y5, Y1, Y1
  4579	VPXOR    Y6, Y2, Y2
  4580	VPXOR    Y7, Y3, Y3
  4581	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  4582	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  4583	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  4584	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  4585	VPADDD   Y4, Y12, Y12
  4586	VPADDD   Y1, Y13, Y13
  4587	VPADDD   Y2, Y8, Y8
  4588	VPADDD   Y3, Y15, Y15
  4589	VPXOR    Y12, Y14, Y14
  4590	VPXOR    Y13, Y9, Y9
  4591	VPXOR    Y8, Y10, Y10
  4592	VPXOR    Y15, Y11, Y11
  4593	VMOVDQA  Y15, 224(BP)
  4594	VPSLLD   $0x07, Y14, Y15
  4595	VPSRLD   $0x19, Y14, Y14
  4596	VPXOR    Y15, Y14, Y14
  4597	VPSLLD   $0x07, Y9, Y15
  4598	VPSRLD   $0x19, Y9, Y9
  4599	VPXOR    Y15, Y9, Y9
  4600	VPSLLD   $0x07, Y10, Y15
  4601	VPSRLD   $0x19, Y10, Y10
  4602	VPXOR    Y15, Y10, Y10
  4603	VPSLLD   $0x07, Y11, Y15
  4604	VPSRLD   $0x19, Y11, Y11
  4605	VPXOR    Y15, Y11, Y11
  4606	VMOVDQA  224(BP), Y15
  4607	VPALIGNR $0x0c, Y14, Y14, Y14
  4608	VPALIGNR $0x0c, Y9, Y9, Y9
  4609	VPALIGNR $0x0c, Y10, Y10, Y10
  4610	VPALIGNR $0x0c, Y11, Y11, Y11
  4611	VPALIGNR $0x08, Y12, Y12, Y12
  4612	VPALIGNR $0x08, Y13, Y13, Y13
  4613	VPALIGNR $0x08, Y8, Y8, Y8
  4614	VPALIGNR $0x08, Y15, Y15, Y15
  4615	VPALIGNR $0x04, Y4, Y4, Y4
  4616	VPALIGNR $0x04, Y1, Y1, Y1
  4617	VPALIGNR $0x04, Y2, Y2, Y2
  4618	VPALIGNR $0x04, Y3, Y3, Y3
  4619	INCQ     CX
  4620	CMPQ     CX, $0x04
  4621	JLT      openAVX2Tail512LoopB
  4622	CMPQ     CX, $0x0a
  4623	JNE      openAVX2Tail512LoopA
  4624	MOVQ     BX, CX
  4625	SUBQ     $0x00000180, CX
  4626	ANDQ     $-16, CX
  4627
  4628openAVX2Tail512HashLoop:
  4629	TESTQ CX, CX
  4630	JE    openAVX2Tail512HashEnd
  4631	ADDQ  (R9), R10
  4632	ADCQ  8(R9), R11
  4633	ADCQ  $0x01, R12
  4634	MOVQ  (BP), DX
  4635	MOVQ  DX, R15
  4636	MULXQ R10, R13, R14
  4637	IMULQ R12, R15
  4638	MULXQ R11, AX, DX
  4639	ADDQ  AX, R14
  4640	ADCQ  DX, R15
  4641	MOVQ  8(BP), DX
  4642	MULXQ R10, R10, AX
  4643	ADDQ  R10, R14
  4644	MULXQ R11, R11, R8
  4645	ADCQ  R11, R15
  4646	ADCQ  $0x00, R8
  4647	IMULQ R12, DX
  4648	ADDQ  AX, R15
  4649	ADCQ  DX, R8
  4650	MOVQ  R13, R10
  4651	MOVQ  R14, R11
  4652	MOVQ  R15, R12
  4653	ANDQ  $0x03, R12
  4654	MOVQ  R15, R13
  4655	ANDQ  $-4, R13
  4656	MOVQ  R8, R14
  4657	SHRQ  $0x02, R8, R15
  4658	SHRQ  $0x02, R8
  4659	ADDQ  R13, R10
  4660	ADCQ  R14, R11
  4661	ADCQ  $0x00, R12
  4662	ADDQ  R15, R10
  4663	ADCQ  R8, R11
  4664	ADCQ  $0x00, R12
  4665	LEAQ  16(R9), R9
  4666	SUBQ  $0x10, CX
  4667	JMP   openAVX2Tail512HashLoop
  4668
  4669openAVX2Tail512HashEnd:
  4670	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
  4671	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  4672	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
  4673	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
  4674	VPADDD     32(BP), Y14, Y14
  4675	VPADDD     32(BP), Y9, Y9
  4676	VPADDD     32(BP), Y10, Y10
  4677	VPADDD     32(BP), Y11, Y11
  4678	VPADDD     64(BP), Y12, Y12
  4679	VPADDD     64(BP), Y13, Y13
  4680	VPADDD     64(BP), Y8, Y8
  4681	VPADDD     64(BP), Y15, Y15
  4682	VPADDD     96(BP), Y4, Y4
  4683	VPADDD     128(BP), Y1, Y1
  4684	VPADDD     160(BP), Y2, Y2
  4685	VPADDD     192(BP), Y3, Y3
  4686	VMOVDQA    Y15, 224(BP)
  4687	VPERM2I128 $0x02, Y0, Y14, Y15
  4688	VPERM2I128 $0x13, Y0, Y14, Y14
  4689	VPERM2I128 $0x02, Y12, Y4, Y0
  4690	VPERM2I128 $0x13, Y12, Y4, Y12
  4691	VPXOR      (SI), Y15, Y15
  4692	VPXOR      32(SI), Y0, Y0
  4693	VPXOR      64(SI), Y14, Y14
  4694	VPXOR      96(SI), Y12, Y12
  4695	VMOVDQU    Y15, (DI)
  4696	VMOVDQU    Y0, 32(DI)
  4697	VMOVDQU    Y14, 64(DI)
  4698	VMOVDQU    Y12, 96(DI)
  4699	VPERM2I128 $0x02, Y5, Y9, Y0
  4700	VPERM2I128 $0x02, Y13, Y1, Y14
  4701	VPERM2I128 $0x13, Y5, Y9, Y12
  4702	VPERM2I128 $0x13, Y13, Y1, Y4
  4703	VPXOR      128(SI), Y0, Y0
  4704	VPXOR      160(SI), Y14, Y14
  4705	VPXOR      192(SI), Y12, Y12
  4706	VPXOR      224(SI), Y4, Y4
  4707	VMOVDQU    Y0, 128(DI)
  4708	VMOVDQU    Y14, 160(DI)
  4709	VMOVDQU    Y12, 192(DI)
  4710	VMOVDQU    Y4, 224(DI)
  4711	VPERM2I128 $0x02, Y6, Y10, Y0
  4712	VPERM2I128 $0x02, Y8, Y2, Y14
  4713	VPERM2I128 $0x13, Y6, Y10, Y12
  4714	VPERM2I128 $0x13, Y8, Y2, Y4
  4715	VPXOR      256(SI), Y0, Y0
  4716	VPXOR      288(SI), Y14, Y14
  4717	VPXOR      320(SI), Y12, Y12
  4718	VPXOR      352(SI), Y4, Y4
  4719	VMOVDQU    Y0, 256(DI)
  4720	VMOVDQU    Y14, 288(DI)
  4721	VMOVDQU    Y12, 320(DI)
  4722	VMOVDQU    Y4, 352(DI)
  4723	VPERM2I128 $0x02, Y7, Y11, Y0
  4724	VPERM2I128 $0x02, 224(BP), Y3, Y14
  4725	VPERM2I128 $0x13, Y7, Y11, Y12
  4726	VPERM2I128 $0x13, 224(BP), Y3, Y4
  4727	LEAQ       384(SI), SI
  4728	LEAQ       384(DI), DI
  4729	SUBQ       $0x00000180, BX
  4730	JMP        openAVX2TailLoop
  4731
  // chacha20Constants holds the four 32-bit ChaCha20 constant words; their
  // little-endian bytes spell the ASCII string "expand 32-byte k". The 16-byte
  // row is stored twice, so a 16-byte load (MOVOU/PADDD into an X register)
  // sees one copy and a 32-byte load (VMOVDQA/VPADDD into a Y register) sees
  // the same row in both 128-bit halves.
  4732DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 // "expa"
  4733DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e // "nd 3"
  4734DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 // "2-by"
  4735DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 // "te k"
  4736DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 // second copy of the same row
  4737DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
  4738DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
  4739DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
  4740GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
  4741
  // polyClampMask is the 32-byte AND mask applied when the Poly1305 key is
  // clamped (see "Clamp and store the key" in chacha20Poly1305Seal, which ANDs
  // the low 16 bytes into X0).
  // Low 16 bytes: clear the top 4 bits of every 32-bit word, and additionally
  // the bottom 2 bits of words 1..3.
  // High 16 bytes: all ones, so ANDing a full 32-byte value leaves its upper
  // half unchanged.
  // NOTE(review): the 32-byte use is presumably in the AVX2 paths, which are
  // outside this view - confirm.
  4742DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
  4743DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
  4744DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
  4745DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
  4746GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
  4747
  // sseIncMask is a 16-byte constant whose only non-zero bit is bit 0.
  // chacha20Poly1305Seal adds it with PADDL to a copy of the counter row, which
  // increments the lowest 32-bit lane by one and leaves the other three lanes
  // untouched; repeated adds produce the counter rows for successive blocks.
  4748DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
  4749DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
  4750GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
  4751
  // andMask is a table of fifteen 16-byte masks (240 bytes total). The entry at
  // byte offset 16*k, for k = 0..14, has its lowest k+1 bytes set to 0xff and
  // the remaining bytes zero, so ANDing a 16-byte value with it keeps only the
  // first k+1 bytes.
  // NOTE(review): presumably used to mask a partial (1..15 byte) final block;
  // the code that indexes this table is outside this view - confirm.
  4752DATA ·andMask<>+0(SB)/8, $0x00000000000000ff // keep 1 byte
  4753DATA ·andMask<>+8(SB)/8, $0x0000000000000000
  4754DATA ·andMask<>+16(SB)/8, $0x000000000000ffff // keep 2 bytes
  4755DATA ·andMask<>+24(SB)/8, $0x0000000000000000
  4756DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff // keep 3 bytes
  4757DATA ·andMask<>+40(SB)/8, $0x0000000000000000
  4758DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff // keep 4 bytes
  4759DATA ·andMask<>+56(SB)/8, $0x0000000000000000
  4760DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff // keep 5 bytes
  4761DATA ·andMask<>+72(SB)/8, $0x0000000000000000
  4762DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff // keep 6 bytes
  4763DATA ·andMask<>+88(SB)/8, $0x0000000000000000
  4764DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff // keep 7 bytes
  4765DATA ·andMask<>+104(SB)/8, $0x0000000000000000
  4766DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff // keep 8 bytes
  4767DATA ·andMask<>+120(SB)/8, $0x0000000000000000
  4768DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff // keep 9 bytes
  4769DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
  4770DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff // keep 10 bytes
  4771DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
  4772DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff // keep 11 bytes
  4773DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
  4774DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff // keep 12 bytes
  4775DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
  4776DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff // keep 13 bytes
  4777DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
  4778DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff // keep 14 bytes
  4779DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
  4780DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff // keep 15 bytes
  4781DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
  4782GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
  4783
  // avx2InitMask is a 32-byte constant: the lower 128-bit half is zero and the
  // upper 128-bit half has the value 1 in its lowest 64-bit word.
  // NOTE(review): presumably added to a counter row held in a Y register so
  // that the upper lane's block counter is one greater than the lower lane's;
  // the code that uses it is outside this view - confirm.
  4784DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
  4785DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
  4786DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
  4787DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
  4788GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
  4789
  // rol16 is the byte-shuffle control used with PSHUFB/VPSHUFB (including the
  // ROL16 macro when GOAMD64_v2 is defined) to rotate every 32-bit word left by
  // 16 bits. Within each 4-byte group the source byte indices are 2,3,0,1.
  // The 16-byte pattern is repeated so a 32-byte VPSHUFB operand applies the
  // same shuffle to both 128-bit lanes.
  4790DATA ·rol16<>+0(SB)/8, $0x0504070601000302
  4791DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
  4792DATA ·rol16<>+16(SB)/8, $0x0504070601000302
  4793DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
  4794GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
  4795
  // rol8 is the byte-shuffle control used with PSHUFB/VPSHUFB (including the
  // ROL8 macro when GOAMD64_v2 is defined) to rotate every 32-bit word left by
  // 8 bits. Within each 4-byte group the source byte indices are 3,0,1,2.
  // The 16-byte pattern is repeated so a 32-byte VPSHUFB operand applies the
  // same shuffle to both 128-bit lanes.
  4796DATA ·rol8<>+0(SB)/8, $0x0605040702010003
  4797DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
  4798DATA ·rol8<>+16(SB)/8, $0x0605040702010003
  4799DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
  4800GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
  4801
  // avx2IncMask is a 32-byte constant with the value 2 in the lowest 64-bit
  // word of each 128-bit half. Added with VPADDD to a counter row held in a Y
  // register, it advances the lowest 32-bit lane of both halves by 2; the AVX2
  // tail code chains such adds to derive the counter rows for successive
  // register sets.
  // NOTE(review): the step is presumably 2 because each Y register carries two
  // blocks, one per 128-bit lane - confirm.
  4802DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
  4803DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
  4804DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
  4805DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
  4806GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
  4807
  4808// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
  4809// Requires: AVX, AVX2, BMI2, CMOV, SSE2
  4810TEXT ·chacha20Poly1305Seal(SB), $288-96
  4811	MOVQ SP, BP
  4812	ADDQ $0x20, BP
  4813	ANDQ $-32, BP
  4814	MOVQ dst_base+0(FP), DI
  4815	MOVQ key_base+24(FP), R8
  4816	MOVQ src_base+48(FP), SI
  4817	MOVQ src_len+56(FP), BX
  4818	MOVQ ad_base+72(FP), CX
  4819	CMPB ·useAVX2+0(SB), $0x01
  4820	JE   chacha20Poly1305Seal_AVX2
  4821
  4822	// Special optimization, for very short buffers
  4823	CMPQ BX, $0x80
  4824	JBE  sealSSE128
  4825
  4826	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  4827	MOVOU ·chacha20Constants<>+0(SB), X0
  4828	MOVOU 16(R8), X3
  4829	MOVOU 32(R8), X6
  4830	MOVOU 48(R8), X9
  4831
  4832	// Store state on stack for future use
  4833	MOVO X3, 32(BP)
  4834	MOVO X6, 48(BP)
  4835
  4836	// Load state, increment counter blocks
  4837	MOVO  X0, X1
  4838	MOVO  X3, X4
  4839	MOVO  X6, X7
  4840	MOVO  X9, X10
  4841	PADDL ·sseIncMask<>+0(SB), X10
  4842	MOVO  X1, X2
  4843	MOVO  X4, X5
  4844	MOVO  X7, X8
  4845	MOVO  X10, X11
  4846	PADDL ·sseIncMask<>+0(SB), X11
  4847	MOVO  X2, X12
  4848	MOVO  X5, X13
  4849	MOVO  X8, X14
  4850	MOVO  X11, X15
  4851	PADDL ·sseIncMask<>+0(SB), X15
  4852
  4853	// Store counters
  4854	MOVO X9, 80(BP)
  4855	MOVO X10, 96(BP)
  4856	MOVO X11, 112(BP)
  4857	MOVO X15, 128(BP)
  4858	MOVQ $0x0000000a, R9
  4859
  4860sealSSEIntroLoop:
  4861	MOVO  X14, 64(BP)
  4862	PADDD X3, X0
  4863	PXOR  X0, X9
  4864	ROL16(X9, X14)
  4865	PADDD X9, X6
  4866	PXOR  X6, X3
  4867	MOVO  X3, X14
  4868	PSLLL $0x0c, X14
  4869	PSRLL $0x14, X3
  4870	PXOR  X14, X3
  4871	PADDD X3, X0
  4872	PXOR  X0, X9
  4873	ROL8(X9, X14)
  4874	PADDD X9, X6
  4875	PXOR  X6, X3
  4876	MOVO  X3, X14
  4877	PSLLL $0x07, X14
  4878	PSRLL $0x19, X3
  4879	PXOR  X14, X3
  4880	PADDD X4, X1
  4881	PXOR  X1, X10
  4882	ROL16(X10, X14)
  4883	PADDD X10, X7
  4884	PXOR  X7, X4
  4885	MOVO  X4, X14
  4886	PSLLL $0x0c, X14
  4887	PSRLL $0x14, X4
  4888	PXOR  X14, X4
  4889	PADDD X4, X1
  4890	PXOR  X1, X10
  4891	ROL8(X10, X14)
  4892	PADDD X10, X7
  4893	PXOR  X7, X4
  4894	MOVO  X4, X14
  4895	PSLLL $0x07, X14
  4896	PSRLL $0x19, X4
  4897	PXOR  X14, X4
  4898	PADDD X5, X2
  4899	PXOR  X2, X11
  4900	ROL16(X11, X14)
  4901	PADDD X11, X8
  4902	PXOR  X8, X5
  4903	MOVO  X5, X14
  4904	PSLLL $0x0c, X14
  4905	PSRLL $0x14, X5
  4906	PXOR  X14, X5
  4907	PADDD X5, X2
  4908	PXOR  X2, X11
  4909	ROL8(X11, X14)
  4910	PADDD X11, X8
  4911	PXOR  X8, X5
  4912	MOVO  X5, X14
  4913	PSLLL $0x07, X14
  4914	PSRLL $0x19, X5
  4915	PXOR  X14, X5
  4916	MOVO  64(BP), X14
  4917	MOVO  X7, 64(BP)
  4918	PADDD X13, X12
  4919	PXOR  X12, X15
  4920	ROL16(X15, X7)
  4921	PADDD X15, X14
  4922	PXOR  X14, X13
  4923	MOVO  X13, X7
  4924	PSLLL $0x0c, X7
  4925	PSRLL $0x14, X13
  4926	PXOR  X7, X13
  4927	PADDD X13, X12
  4928	PXOR  X12, X15
  4929	ROL8(X15, X7)
  4930	PADDD X15, X14
  4931	PXOR  X14, X13
  4932	MOVO  X13, X7
  4933	PSLLL $0x07, X7
  4934	PSRLL $0x19, X13
  4935	PXOR  X7, X13
  4936	MOVO  64(BP), X7
  4937	BYTE  $0x66
  4938	BYTE  $0x0f
  4939	BYTE  $0x3a
  4940	BYTE  $0x0f
  4941	BYTE  $0xdb
  4942	BYTE  $0x04
  4943	BYTE  $0x66
  4944	BYTE  $0x0f
  4945	BYTE  $0x3a
  4946	BYTE  $0x0f
  4947	BYTE  $0xe4
  4948	BYTE  $0x04
  4949	BYTE  $0x66
  4950	BYTE  $0x0f
  4951	BYTE  $0x3a
  4952	BYTE  $0x0f
  4953	BYTE  $0xed
  4954	BYTE  $0x04
  4955	BYTE  $0x66
  4956	BYTE  $0x45
  4957	BYTE  $0x0f
  4958	BYTE  $0x3a
  4959	BYTE  $0x0f
  4960	BYTE  $0xed
  4961	BYTE  $0x04
  4962	BYTE  $0x66
  4963	BYTE  $0x0f
  4964	BYTE  $0x3a
  4965	BYTE  $0x0f
  4966	BYTE  $0xf6
  4967	BYTE  $0x08
  4968	BYTE  $0x66
  4969	BYTE  $0x0f
  4970	BYTE  $0x3a
  4971	BYTE  $0x0f
  4972	BYTE  $0xff
  4973	BYTE  $0x08
  4974	BYTE  $0x66
  4975	BYTE  $0x45
  4976	BYTE  $0x0f
  4977	BYTE  $0x3a
  4978	BYTE  $0x0f
  4979	BYTE  $0xc0
  4980	BYTE  $0x08
  4981	BYTE  $0x66
  4982	BYTE  $0x45
  4983	BYTE  $0x0f
  4984	BYTE  $0x3a
  4985	BYTE  $0x0f
  4986	BYTE  $0xf6
  4987	BYTE  $0x08
  4988	BYTE  $0x66
  4989	BYTE  $0x45
  4990	BYTE  $0x0f
  4991	BYTE  $0x3a
  4992	BYTE  $0x0f
  4993	BYTE  $0xc9
  4994	BYTE  $0x0c
  4995	BYTE  $0x66
  4996	BYTE  $0x45
  4997	BYTE  $0x0f
  4998	BYTE  $0x3a
  4999	BYTE  $0x0f
  5000	BYTE  $0xd2
  5001	BYTE  $0x0c
  5002	BYTE  $0x66
  5003	BYTE  $0x45
  5004	BYTE  $0x0f
  5005	BYTE  $0x3a
  5006	BYTE  $0x0f
  5007	BYTE  $0xdb
  5008	BYTE  $0x0c
  5009	BYTE  $0x66
  5010	BYTE  $0x45
  5011	BYTE  $0x0f
  5012	BYTE  $0x3a
  5013	BYTE  $0x0f
  5014	BYTE  $0xff
  5015	BYTE  $0x0c
  5016	MOVO  X14, 64(BP)
  5017	PADDD X3, X0
  5018	PXOR  X0, X9
  5019	ROL16(X9, X14)
  5020	PADDD X9, X6
  5021	PXOR  X6, X3
  5022	MOVO  X3, X14
  5023	PSLLL $0x0c, X14
  5024	PSRLL $0x14, X3
  5025	PXOR  X14, X3
  5026	PADDD X3, X0
  5027	PXOR  X0, X9
  5028	ROL8(X9, X14)
  5029	PADDD X9, X6
  5030	PXOR  X6, X3
  5031	MOVO  X3, X14
  5032	PSLLL $0x07, X14
  5033	PSRLL $0x19, X3
  5034	PXOR  X14, X3
  5035	PADDD X4, X1
  5036	PXOR  X1, X10
  5037	ROL16(X10, X14)
  5038	PADDD X10, X7
  5039	PXOR  X7, X4
  5040	MOVO  X4, X14
  5041	PSLLL $0x0c, X14
  5042	PSRLL $0x14, X4
  5043	PXOR  X14, X4
  5044	PADDD X4, X1
  5045	PXOR  X1, X10
  5046	ROL8(X10, X14)
  5047	PADDD X10, X7
  5048	PXOR  X7, X4
  5049	MOVO  X4, X14
  5050	PSLLL $0x07, X14
  5051	PSRLL $0x19, X4
  5052	PXOR  X14, X4
  5053	PADDD X5, X2
  5054	PXOR  X2, X11
  5055	ROL16(X11, X14)
  5056	PADDD X11, X8
  5057	PXOR  X8, X5
  5058	MOVO  X5, X14
  5059	PSLLL $0x0c, X14
  5060	PSRLL $0x14, X5
  5061	PXOR  X14, X5
  5062	PADDD X5, X2
  5063	PXOR  X2, X11
  5064	ROL8(X11, X14)
  5065	PADDD X11, X8
  5066	PXOR  X8, X5
  5067	MOVO  X5, X14
  5068	PSLLL $0x07, X14
  5069	PSRLL $0x19, X5
  5070	PXOR  X14, X5
  5071	MOVO  64(BP), X14
  5072	MOVO  X7, 64(BP)
  5073	PADDD X13, X12
  5074	PXOR  X12, X15
  5075	ROL16(X15, X7)
  5076	PADDD X15, X14
  5077	PXOR  X14, X13
  5078	MOVO  X13, X7
  5079	PSLLL $0x0c, X7
  5080	PSRLL $0x14, X13
  5081	PXOR  X7, X13
  5082	PADDD X13, X12
  5083	PXOR  X12, X15
  5084	ROL8(X15, X7)
  5085	PADDD X15, X14
  5086	PXOR  X14, X13
  5087	MOVO  X13, X7
  5088	PSLLL $0x07, X7
  5089	PSRLL $0x19, X13
  5090	PXOR  X7, X13
  5091	MOVO  64(BP), X7
  5092	BYTE  $0x66
  5093	BYTE  $0x0f
  5094	BYTE  $0x3a
  5095	BYTE  $0x0f
  5096	BYTE  $0xdb
  5097	BYTE  $0x0c
  5098	BYTE  $0x66
  5099	BYTE  $0x0f
  5100	BYTE  $0x3a
  5101	BYTE  $0x0f
  5102	BYTE  $0xe4
  5103	BYTE  $0x0c
  5104	BYTE  $0x66
  5105	BYTE  $0x0f
  5106	BYTE  $0x3a
  5107	BYTE  $0x0f
  5108	BYTE  $0xed
  5109	BYTE  $0x0c
  5110	BYTE  $0x66
  5111	BYTE  $0x45
  5112	BYTE  $0x0f
  5113	BYTE  $0x3a
  5114	BYTE  $0x0f
  5115	BYTE  $0xed
  5116	BYTE  $0x0c
  5117	BYTE  $0x66
  5118	BYTE  $0x0f
  5119	BYTE  $0x3a
  5120	BYTE  $0x0f
  5121	BYTE  $0xf6
  5122	BYTE  $0x08
  5123	BYTE  $0x66
  5124	BYTE  $0x0f
  5125	BYTE  $0x3a
  5126	BYTE  $0x0f
  5127	BYTE  $0xff
  5128	BYTE  $0x08
  5129	BYTE  $0x66
  5130	BYTE  $0x45
  5131	BYTE  $0x0f
  5132	BYTE  $0x3a
  5133	BYTE  $0x0f
  5134	BYTE  $0xc0
  5135	BYTE  $0x08
  5136	BYTE  $0x66
  5137	BYTE  $0x45
  5138	BYTE  $0x0f
  5139	BYTE  $0x3a
  5140	BYTE  $0x0f
  5141	BYTE  $0xf6
  5142	BYTE  $0x08
  5143	BYTE  $0x66
  5144	BYTE  $0x45
  5145	BYTE  $0x0f
  5146	BYTE  $0x3a
  5147	BYTE  $0x0f
  5148	BYTE  $0xc9
  5149	BYTE  $0x04
  5150	BYTE  $0x66
  5151	BYTE  $0x45
  5152	BYTE  $0x0f
  5153	BYTE  $0x3a
  5154	BYTE  $0x0f
  5155	BYTE  $0xd2
  5156	BYTE  $0x04
  5157	BYTE  $0x66
  5158	BYTE  $0x45
  5159	BYTE  $0x0f
  5160	BYTE  $0x3a
  5161	BYTE  $0x0f
  5162	BYTE  $0xdb
  5163	BYTE  $0x04
  5164	BYTE  $0x66
  5165	BYTE  $0x45
  5166	BYTE  $0x0f
  5167	BYTE  $0x3a
  5168	BYTE  $0x0f
  5169	BYTE  $0xff
  5170	BYTE  $0x04
  5171	DECQ  R9
  5172	JNE   sealSSEIntroLoop
  5173
  5174	// Add in the state
  5175	PADDD ·chacha20Constants<>+0(SB), X0
  5176	PADDD ·chacha20Constants<>+0(SB), X1
  5177	PADDD ·chacha20Constants<>+0(SB), X2
  5178	PADDD ·chacha20Constants<>+0(SB), X12
  5179	PADDD 32(BP), X3
  5180	PADDD 32(BP), X4
  5181	PADDD 32(BP), X5
  5182	PADDD 32(BP), X13
  5183	PADDD 48(BP), X7
  5184	PADDD 48(BP), X8
  5185	PADDD 48(BP), X14
  5186	PADDD 96(BP), X10
  5187	PADDD 112(BP), X11
  5188	PADDD 128(BP), X15
  5189
  5190	// Clamp and store the key
  5191	PAND ·polyClampMask<>+0(SB), X0
  5192	MOVO X0, (BP)
  5193	MOVO X3, 16(BP)
  5194
  5195	// Hash AAD
  5196	MOVQ  ad_len+80(FP), R9
  5197	CALL  polyHashADInternal<>(SB)
  5198	MOVOU (SI), X0
  5199	MOVOU 16(SI), X3
  5200	MOVOU 32(SI), X6
  5201	MOVOU 48(SI), X9
  5202	PXOR  X0, X1
  5203	PXOR  X3, X4
  5204	PXOR  X6, X7
  5205	PXOR  X9, X10
  5206	MOVOU X1, (DI)
  5207	MOVOU X4, 16(DI)
  5208	MOVOU X7, 32(DI)
  5209	MOVOU X10, 48(DI)
  5210	MOVOU 64(SI), X0
  5211	MOVOU 80(SI), X3
  5212	MOVOU 96(SI), X6
  5213	MOVOU 112(SI), X9
  5214	PXOR  X0, X2
  5215	PXOR  X3, X5
  5216	PXOR  X6, X8
  5217	PXOR  X9, X11
  5218	MOVOU X2, 64(DI)
  5219	MOVOU X5, 80(DI)
  5220	MOVOU X8, 96(DI)
  5221	MOVOU X11, 112(DI)
  5222	MOVQ  $0x00000080, CX
  5223	SUBQ  $0x80, BX
  5224	LEAQ  128(SI), SI
  5225	MOVO  X12, X1
  5226	MOVO  X13, X4
  5227	MOVO  X14, X7
  5228	MOVO  X15, X10
  5229	CMPQ  BX, $0x40
  5230	JBE   sealSSE128SealHash
  5231	MOVOU (SI), X0
  5232	MOVOU 16(SI), X3
  5233	MOVOU 32(SI), X6
  5234	MOVOU 48(SI), X9
  5235	PXOR  X0, X12
  5236	PXOR  X3, X13
  5237	PXOR  X6, X14
  5238	PXOR  X9, X15
  5239	MOVOU X12, 128(DI)
  5240	MOVOU X13, 144(DI)
  5241	MOVOU X14, 160(DI)
  5242	MOVOU X15, 176(DI)
  5243	ADDQ  $0x40, CX
  5244	SUBQ  $0x40, BX
  5245	LEAQ  64(SI), SI
  5246	MOVQ  $0x00000002, CX
  5247	MOVQ  $0x00000008, R9
  5248	CMPQ  BX, $0x40
  5249	JBE   sealSSETail64
  5250	CMPQ  BX, $0x80
  5251	JBE   sealSSETail128
  5252	CMPQ  BX, $0xc0
  5253	JBE   sealSSETail192
  5254
  5255sealSSEMainLoop: // Set up four ChaCha20 states in X0-X15; their counter rows are successive sseIncMask increments of 128(BP).
  5256	// Load state, increment counter blocks
  5257	MOVO  ·chacha20Constants<>+0(SB), X0
  5258	MOVO  32(BP), X3
  5259	MOVO  48(BP), X6
  5260	MOVO  128(BP), X9 // most recently stored counter row
  5261	PADDL ·sseIncMask<>+0(SB), X9
  5262	MOVO  X0, X1 // second state: copy of the first with the counter row bumped once more
  5263	MOVO  X3, X4
  5264	MOVO  X6, X7
  5265	MOVO  X9, X10
  5266	PADDL ·sseIncMask<>+0(SB), X10
  5267	MOVO  X1, X2 // third state
  5268	MOVO  X4, X5
  5269	MOVO  X7, X8
  5270	MOVO  X10, X11
  5271	PADDL ·sseIncMask<>+0(SB), X11
  5272	MOVO  X2, X12 // fourth state
  5273	MOVO  X5, X13
  5274	MOVO  X8, X14
  5275	MOVO  X11, X15
  5276	PADDL ·sseIncMask<>+0(SB), X15
  5277
  5278	// Store counters
  5279	MOVO X9, 80(BP) // the four counter rows are added back from these slots after the rounds
  5280	MOVO X10, 96(BP)
  5281	MOVO X11, 112(BP)
  5282	MOVO X15, 128(BP) // also the row the next pass through sealSSEMainLoop starts from
  5283
  5284sealSSEInnerLoop: // One ChaCha20 double round over the four states in X0-X15, interleaved with a Poly1305 block over the 16 bytes at (DI).
  5285	MOVO  X14, 64(BP) // spill X14; it is the rotate temporary for the first three states
  5286	PADDD X3, X0 // quarter round on (X0, X3, X6, X9)
  5287	PXOR  X0, X9
  5288	ROL16(X9, X14)
  5289	PADDD X9, X6
  5290	PXOR  X6, X3
  5291	MOVO  X3, X14 // next three instructions: rotate each 32-bit lane of X3 left by 12
  5292	PSLLL $0x0c, X14
  5293	PSRLL $0x14, X3
  5294	PXOR  X14, X3
  5295	PADDD X3, X0
  5296	PXOR  X0, X9
  5297	ROL8(X9, X14)
  5298	PADDD X9, X6
  5299	PXOR  X6, X3
  5300	MOVO  X3, X14 // next three instructions: rotate each 32-bit lane of X3 left by 7
  5301	PSLLL $0x07, X14
  5302	PSRLL $0x19, X3
  5303	PXOR  X14, X3
  5304	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  5305	PXOR  X1, X10
  5306	ROL16(X10, X14)
  5307	PADDD X10, X7
  5308	PXOR  X7, X4
  5309	MOVO  X4, X14
  5310	PSLLL $0x0c, X14
  5311	PSRLL $0x14, X4
  5312	PXOR  X14, X4
  5313	PADDD X4, X1
  5314	PXOR  X1, X10
  5315	ROL8(X10, X14)
  5316	PADDD X10, X7
  5317	PXOR  X7, X4
  5318	MOVO  X4, X14
  5319	PSLLL $0x07, X14
  5320	PSRLL $0x19, X4
  5321	PXOR  X14, X4
  5322	PADDD X5, X2 // quarter round on (X2, X5, X8, X11)
  5323	PXOR  X2, X11
  5324	ROL16(X11, X14)
  5325	PADDD X11, X8
  5326	PXOR  X8, X5
  5327	MOVO  X5, X14
  5328	PSLLL $0x0c, X14
  5329	PSRLL $0x14, X5
  5330	PXOR  X14, X5
  5331	PADDD X5, X2
  5332	PXOR  X2, X11
  5333	ROL8(X11, X14)
  5334	PADDD X11, X8
  5335	PXOR  X8, X5
  5336	MOVO  X5, X14
  5337	PSLLL $0x07, X14
  5338	PSRLL $0x19, X5
  5339	PXOR  X14, X5
  5340	MOVO  64(BP), X14 // restore X14 (a row of the fourth state)
  5341	MOVO  X7, 64(BP) // spill X7; it becomes the rotate temporary for the fourth state
  5342	PADDD X13, X12 // quarter round on (X12, X13, X14, X15)
  5343	PXOR  X12, X15
  5344	ROL16(X15, X7)
  5345	PADDD X15, X14
  5346	PXOR  X14, X13
  5347	MOVO  X13, X7
  5348	PSLLL $0x0c, X7
  5349	PSRLL $0x14, X13
  5350	PXOR  X7, X13
  5351	PADDD X13, X12
  5352	PXOR  X12, X15
  5353	ROL8(X15, X7)
  5354	PADDD X15, X14
  5355	PXOR  X14, X13
  5356	MOVO  X13, X7
  5357	PSLLL $0x07, X7
  5358	PSRLL $0x19, X13
  5359	PXOR  X7, X13
  5360	MOVO  64(BP), X7 // restore X7
  5361	ADDQ  (DI), R10 // Poly1305: accumulator (R10, R11, R12) += 16 bytes at (DI)
  5362	ADCQ  8(DI), R11
  5363	ADCQ  $0x01, R12 // carry plus 1 into the third limb, i.e. +2^128
  5364	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3: with both operands the same register this rotates the register by 4 bytes
  5365	BYTE  $0x0f
  5366	BYTE  $0x3a
  5367	BYTE  $0x0f
  5368	BYTE  $0xdb
  5369	BYTE  $0x04
  5370	BYTE  $0x66 // PALIGNR $4, X4, X4
  5371	BYTE  $0x0f
  5372	BYTE  $0x3a
  5373	BYTE  $0x0f
  5374	BYTE  $0xe4
  5375	BYTE  $0x04
  5376	BYTE  $0x66 // PALIGNR $4, X5, X5
  5377	BYTE  $0x0f
  5378	BYTE  $0x3a
  5379	BYTE  $0x0f
  5380	BYTE  $0xed
  5381	BYTE  $0x04
  5382	BYTE  $0x66 // PALIGNR $4, X13, X13 (0x45 is the REX prefix selecting X8-X15)
  5383	BYTE  $0x45
  5384	BYTE  $0x0f
  5385	BYTE  $0x3a
  5386	BYTE  $0x0f
  5387	BYTE  $0xed
  5388	BYTE  $0x04
  5389	BYTE  $0x66 // PALIGNR $8, X6, X6
  5390	BYTE  $0x0f
  5391	BYTE  $0x3a
  5392	BYTE  $0x0f
  5393	BYTE  $0xf6
  5394	BYTE  $0x08
  5395	BYTE  $0x66 // PALIGNR $8, X7, X7
  5396	BYTE  $0x0f
  5397	BYTE  $0x3a
  5398	BYTE  $0x0f
  5399	BYTE  $0xff
  5400	BYTE  $0x08
  5401	BYTE  $0x66 // PALIGNR $8, X8, X8
  5402	BYTE  $0x45
  5403	BYTE  $0x0f
  5404	BYTE  $0x3a
  5405	BYTE  $0x0f
  5406	BYTE  $0xc0
  5407	BYTE  $0x08
  5408	BYTE  $0x66 // PALIGNR $8, X14, X14
  5409	BYTE  $0x45
  5410	BYTE  $0x0f
  5411	BYTE  $0x3a
  5412	BYTE  $0x0f
  5413	BYTE  $0xf6
  5414	BYTE  $0x08
  5415	BYTE  $0x66 // PALIGNR $12, X9, X9
  5416	BYTE  $0x45
  5417	BYTE  $0x0f
  5418	BYTE  $0x3a
  5419	BYTE  $0x0f
  5420	BYTE  $0xc9
  5421	BYTE  $0x0c
  5422	BYTE  $0x66 // PALIGNR $12, X10, X10
  5423	BYTE  $0x45
  5424	BYTE  $0x0f
  5425	BYTE  $0x3a
  5426	BYTE  $0x0f
  5427	BYTE  $0xd2
  5428	BYTE  $0x0c
  5429	BYTE  $0x66 // PALIGNR $12, X11, X11
  5430	BYTE  $0x45
  5431	BYTE  $0x0f
  5432	BYTE  $0x3a
  5433	BYTE  $0x0f
  5434	BYTE  $0xdb
  5435	BYTE  $0x0c
  5436	BYTE  $0x66 // PALIGNR $12, X15, X15
  5437	BYTE  $0x45
  5438	BYTE  $0x0f
  5439	BYTE  $0x3a
  5440	BYTE  $0x0f
  5441	BYTE  $0xff
  5442	BYTE  $0x0c
  5443	MOVQ  (BP), AX // Poly1305: start accumulator * (128-bit value at (BP), 8(BP)); product limbs build up in R13, R14, R15, R8
  5444	MOVQ  AX, R15
  5445	MULQ  R10
  5446	MOVQ  AX, R13
  5447	MOVQ  DX, R14
  5448	MOVQ  (BP), AX
  5449	MULQ  R11
  5450	IMULQ R12, R15
  5451	ADDQ  AX, R14
  5452	ADCQ  DX, R15
  5453	MOVQ  8(BP), AX
  5454	MOVQ  AX, R8
  5455	MULQ  R10
  5456	ADDQ  AX, R14
  5457	ADCQ  $0x00, DX
  5458	MOVQ  DX, R10
  5459	MOVQ  8(BP), AX
  5460	MULQ  R11
  5461	ADDQ  AX, R15
  5462	ADCQ  $0x00, DX
  5463	LEAQ  16(DI), DI // step past the 16 bytes just absorbed
  5464	MOVO  X14, 64(BP) // second half of the double round: same spill pattern as above
  5465	PADDD X3, X0 // quarter round on (X0, X3, X6, X9)
  5466	PXOR  X0, X9
  5467	ROL16(X9, X14)
  5468	PADDD X9, X6
  5469	PXOR  X6, X3
  5470	MOVO  X3, X14
  5471	PSLLL $0x0c, X14
  5472	PSRLL $0x14, X3
  5473	PXOR  X14, X3
  5474	PADDD X3, X0
  5475	PXOR  X0, X9
  5476	ROL8(X9, X14)
  5477	PADDD X9, X6
  5478	PXOR  X6, X3
  5479	MOVO  X3, X14
  5480	PSLLL $0x07, X14
  5481	PSRLL $0x19, X3
  5482	PXOR  X14, X3
  5483	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  5484	PXOR  X1, X10
  5485	ROL16(X10, X14)
  5486	PADDD X10, X7
  5487	PXOR  X7, X4
  5488	MOVO  X4, X14
  5489	PSLLL $0x0c, X14
  5490	PSRLL $0x14, X4
  5491	PXOR  X14, X4
  5492	PADDD X4, X1
  5493	PXOR  X1, X10
  5494	ROL8(X10, X14)
  5495	PADDD X10, X7
  5496	PXOR  X7, X4
  5497	MOVO  X4, X14
  5498	PSLLL $0x07, X14
  5499	PSRLL $0x19, X4
  5500	PXOR  X14, X4
  5501	PADDD X5, X2 // quarter round on (X2, X5, X8, X11)
  5502	PXOR  X2, X11
  5503	ROL16(X11, X14)
  5504	PADDD X11, X8
  5505	PXOR  X8, X5
  5506	MOVO  X5, X14
  5507	PSLLL $0x0c, X14
  5508	PSRLL $0x14, X5
  5509	PXOR  X14, X5
  5510	PADDD X5, X2
  5511	PXOR  X2, X11
  5512	ROL8(X11, X14)
  5513	PADDD X11, X8
  5514	PXOR  X8, X5
  5515	MOVO  X5, X14
  5516	PSLLL $0x07, X14
  5517	PSRLL $0x19, X5
  5518	PXOR  X14, X5
  5519	MOVO  64(BP), X14
  5520	MOVO  X7, 64(BP)
  5521	IMULQ R12, R8 // Poly1305: finish the multiply started above
  5522	ADDQ  R10, R15
  5523	ADCQ  DX, R8
  5524	PADDD X13, X12 // quarter round on (X12, X13, X14, X15)
  5525	PXOR  X12, X15
  5526	ROL16(X15, X7)
  5527	PADDD X15, X14
  5528	PXOR  X14, X13
  5529	MOVO  X13, X7
  5530	PSLLL $0x0c, X7
  5531	PSRLL $0x14, X13
  5532	PXOR  X7, X13
  5533	PADDD X13, X12
  5534	PXOR  X12, X15
  5535	ROL8(X15, X7)
  5536	PADDD X15, X14
  5537	PXOR  X14, X13
  5538	MOVO  X13, X7
  5539	PSLLL $0x07, X7
  5540	PSRLL $0x19, X13
  5541	PXOR  X7, X13
  5542	MOVO  64(BP), X7
  5543	MOVQ  R13, R10 // Poly1305 reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  5544	MOVQ  R14, R11
  5545	MOVQ  R15, R12
  5546	ANDQ  $0x03, R12 // keep only bits 128-129 in the third limb
  5547	MOVQ  R15, R13
  5548	ANDQ  $-4, R13 // R13 (low), R14 (high) = 4 * (product >> 130)
  5549	MOVQ  R8, R14
  5550	SHRQ  $0x02, R8, R15 // R15 (low), R8 (high) = product >> 130
  5551	SHRQ  $0x02, R8
  5552	ADDQ  R13, R10
  5553	ADCQ  R14, R11
  5554	ADCQ  $0x00, R12
  5555	ADDQ  R15, R10
  5556	ADCQ  R8, R11
  5557	ADCQ  $0x00, R12
  5558	BYTE  $0x66 // PALIGNR $12, X3, X3 (this group undoes the lane rotations applied after the first half)
  5559	BYTE  $0x0f
  5560	BYTE  $0x3a
  5561	BYTE  $0x0f
  5562	BYTE  $0xdb
  5563	BYTE  $0x0c
  5564	BYTE  $0x66 // PALIGNR $12, X4, X4
  5565	BYTE  $0x0f
  5566	BYTE  $0x3a
  5567	BYTE  $0x0f
  5568	BYTE  $0xe4
  5569	BYTE  $0x0c
  5570	BYTE  $0x66 // PALIGNR $12, X5, X5
  5571	BYTE  $0x0f
  5572	BYTE  $0x3a
  5573	BYTE  $0x0f
  5574	BYTE  $0xed
  5575	BYTE  $0x0c
  5576	BYTE  $0x66 // PALIGNR $12, X13, X13
  5577	BYTE  $0x45
  5578	BYTE  $0x0f
  5579	BYTE  $0x3a
  5580	BYTE  $0x0f
  5581	BYTE  $0xed
  5582	BYTE  $0x0c
  5583	BYTE  $0x66 // PALIGNR $8, X6, X6
  5584	BYTE  $0x0f
  5585	BYTE  $0x3a
  5586	BYTE  $0x0f
  5587	BYTE  $0xf6
  5588	BYTE  $0x08
  5589	BYTE  $0x66 // PALIGNR $8, X7, X7
  5590	BYTE  $0x0f
  5591	BYTE  $0x3a
  5592	BYTE  $0x0f
  5593	BYTE  $0xff
  5594	BYTE  $0x08
  5595	BYTE  $0x66 // PALIGNR $8, X8, X8
  5596	BYTE  $0x45
  5597	BYTE  $0x0f
  5598	BYTE  $0x3a
  5599	BYTE  $0x0f
  5600	BYTE  $0xc0
  5601	BYTE  $0x08
  5602	BYTE  $0x66 // PALIGNR $8, X14, X14
  5603	BYTE  $0x45
  5604	BYTE  $0x0f
  5605	BYTE  $0x3a
  5606	BYTE  $0x0f
  5607	BYTE  $0xf6
  5608	BYTE  $0x08
  5609	BYTE  $0x66 // PALIGNR $4, X9, X9
  5610	BYTE  $0x45
  5611	BYTE  $0x0f
  5612	BYTE  $0x3a
  5613	BYTE  $0x0f
  5614	BYTE  $0xc9
  5615	BYTE  $0x04
  5616	BYTE  $0x66 // PALIGNR $4, X10, X10
  5617	BYTE  $0x45
  5618	BYTE  $0x0f
  5619	BYTE  $0x3a
  5620	BYTE  $0x0f
  5621	BYTE  $0xd2
  5622	BYTE  $0x04
  5623	BYTE  $0x66 // PALIGNR $4, X11, X11
  5624	BYTE  $0x45
  5625	BYTE  $0x0f
  5626	BYTE  $0x3a
  5627	BYTE  $0x0f
  5628	BYTE  $0xdb
  5629	BYTE  $0x04
  5630	BYTE  $0x66 // PALIGNR $4, X15, X15
  5631	BYTE  $0x45
  5632	BYTE  $0x0f
  5633	BYTE  $0x3a
  5634	BYTE  $0x0f
  5635	BYTE  $0xff
  5636	BYTE  $0x04
  5637	DECQ  R9
  5638	JGE   sealSSEInnerLoop // repeat while R9 is still >= 0 after the decrement
  5639	ADDQ  (DI), R10 // extra Poly1305 block (add, multiply, reduce) on the next 16 bytes at (DI), not interleaved with cipher work
  5640	ADCQ  8(DI), R11
  5641	ADCQ  $0x01, R12
  5642	MOVQ  (BP), AX
  5643	MOVQ  AX, R15
  5644	MULQ  R10
  5645	MOVQ  AX, R13
  5646	MOVQ  DX, R14
  5647	MOVQ  (BP), AX
  5648	MULQ  R11
  5649	IMULQ R12, R15
  5650	ADDQ  AX, R14
  5651	ADCQ  DX, R15
  5652	MOVQ  8(BP), AX
  5653	MOVQ  AX, R8
  5654	MULQ  R10
  5655	ADDQ  AX, R14
  5656	ADCQ  $0x00, DX
  5657	MOVQ  DX, R10
  5658	MOVQ  8(BP), AX
  5659	MULQ  R11
  5660	ADDQ  AX, R15
  5661	ADCQ  $0x00, DX
  5662	IMULQ R12, R8
  5663	ADDQ  R10, R15
  5664	ADCQ  DX, R8
  5665	MOVQ  R13, R10
  5666	MOVQ  R14, R11
  5667	MOVQ  R15, R12
  5668	ANDQ  $0x03, R12
  5669	MOVQ  R15, R13
  5670	ANDQ  $-4, R13
  5671	MOVQ  R8, R14
  5672	SHRQ  $0x02, R8, R15
  5673	SHRQ  $0x02, R8
  5674	ADDQ  R13, R10
  5675	ADCQ  R14, R11
  5676	ADCQ  $0x00, R12
  5677	ADDQ  R15, R10
  5678	ADCQ  R8, R11
  5679	ADCQ  $0x00, R12
  5680	LEAQ  16(DI), DI
  5681	DECQ  CX
  5682	JG    sealSSEInnerLoop // with the (CX, R9) pairs set in view, (2, 8) and (6, 4), the loop body runs 10 times in total
  5683
  5684	// Add in the state
  5685	PADDD ·chacha20Constants<>+0(SB), X0
  5686	PADDD ·chacha20Constants<>+0(SB), X1
  5687	PADDD ·chacha20Constants<>+0(SB), X2
  5688	PADDD ·chacha20Constants<>+0(SB), X12
  5689	PADDD 32(BP), X3
  5690	PADDD 32(BP), X4
  5691	PADDD 32(BP), X5
  5692	PADDD 32(BP), X13
  5693	PADDD 48(BP), X6
  5694	PADDD 48(BP), X7
  5695	PADDD 48(BP), X8
  5696	PADDD 48(BP), X14
  5697	PADDD 80(BP), X9 // each block gets back the counter row saved for it before the rounds
  5698	PADDD 96(BP), X10
  5699	PADDD 112(BP), X11
  5700	PADDD 128(BP), X15
  5701	MOVO  X15, 64(BP) // spill X15 so it can be the load temporary for the first 64 bytes
  5702
  5703	// Load - xor - store
  5704	MOVOU (SI), X15 // output bytes 0-63 at (DI) = input at (SI) XOR block in X0, X3, X6, X9
  5705	PXOR  X15, X0
  5706	MOVOU 16(SI), X15
  5707	PXOR  X15, X3
  5708	MOVOU 32(SI), X15
  5709	PXOR  X15, X6
  5710	MOVOU 48(SI), X15
  5711	PXOR  X15, X9
  5712	MOVOU X0, (DI)
  5713	MOVOU X3, 16(DI)
  5714	MOVOU X6, 32(DI)
  5715	MOVOU X9, 48(DI)
  5716	MOVO  64(BP), X15 // restore X15
  5717	MOVOU 64(SI), X0 // bytes 64-127: input XOR block in X1, X4, X7, X10 (X0, X3, X6, X9 are free to hold input now)
  5718	MOVOU 80(SI), X3
  5719	MOVOU 96(SI), X6
  5720	MOVOU 112(SI), X9
  5721	PXOR  X0, X1
  5722	PXOR  X3, X4
  5723	PXOR  X6, X7
  5724	PXOR  X9, X10
  5725	MOVOU X1, 64(DI)
  5726	MOVOU X4, 80(DI)
  5727	MOVOU X7, 96(DI)
  5728	MOVOU X10, 112(DI)
  5729	MOVOU 128(SI), X0 // bytes 128-191: input XOR block in X2, X5, X8, X11
  5730	MOVOU 144(SI), X3
  5731	MOVOU 160(SI), X6
  5732	MOVOU 176(SI), X9
  5733	PXOR  X0, X2
  5734	PXOR  X3, X5
  5735	PXOR  X6, X8
  5736	PXOR  X9, X11
  5737	MOVOU X2, 128(DI)
  5738	MOVOU X5, 144(DI)
  5739	MOVOU X8, 160(DI)
  5740	MOVOU X11, 176(DI)
  5741	ADDQ  $0xc0, SI // input pointer past the 192 bytes just consumed
  5742	MOVQ  $0x000000c0, CX // CX = 192, the byte count just stored at (DI); NOTE(review): presumably read at sealSSE128SealHash as the amount still to hash - confirm
  5743	SUBQ  $0xc0, BX // BX is decremented in step with SI
  5744	MOVO  X12, X1 // copy the fourth block into X1, X4, X7, X10; NOTE(review): presumably where sealSSE128SealHash expects unused keystream - confirm
  5745	MOVO  X13, X4
  5746	MOVO  X14, X7
  5747	MOVO  X15, X10
  5748	CMPQ  BX, $0x40
  5749	JBE   sealSSE128SealHash // BX <= 64 (unsigned): leave the loop without consuming the fourth block here
  5750	MOVOU (SI), X0 // otherwise the fourth block (X12-X15) encrypts 64 more bytes, stored at 192(DI)
  5751	MOVOU 16(SI), X3
  5752	MOVOU 32(SI), X6
  5753	MOVOU 48(SI), X9
  5754	PXOR  X0, X12
  5755	PXOR  X3, X13
  5756	PXOR  X6, X14
  5757	PXOR  X9, X15
  5758	MOVOU X12, 192(DI)
  5759	MOVOU X13, 208(DI)
  5760	MOVOU X14, 224(DI)
  5761	MOVOU X15, 240(DI)
  5762	LEAQ  64(SI), SI
  5763	SUBQ  $0x40, BX
  5764	MOVQ  $0x00000006, CX // loop counters for the next pass (see sealSSEInnerLoop)
  5765	MOVQ  $0x00000004, R9
  5766	CMPQ  BX, $0xc0
  5767	JG    sealSSEMainLoop // more than 192 left: another full pass; NOTE(review): signed compare, while the neighbouring length checks use unsigned JBE
  5768	MOVQ  BX, CX
  5769	TESTQ BX, BX
  5770	JE    sealSSE128SealHash // NOTE(review): BX was > 0x40 before the SUBQ $0x40 above, so BX is nonzero here and this branch looks unreachable
  5771	MOVQ  $0x00000006, CX
  5772	CMPQ  BX, $0x40
  5773	JBE   sealSSETail64 // BX <= 64
  5774	CMPQ  BX, $0x80
  5775	JBE   sealSSETail128 // BX <= 128
  5776	JMP   sealSSETail192 // otherwise BX <= 192, since the JG above was not taken
  5777
  5778sealSSETail64: // Reached when BX <= 0x40: set up one ChaCha20 state in X1, X4, X7, X10.
  5779	MOVO  ·chacha20Constants<>+0(SB), X1
  5780	MOVO  32(BP), X4
  5781	MOVO  48(BP), X7
  5782	MOVO  128(BP), X10 // most recently stored counter row
  5783	PADDL ·sseIncMask<>+0(SB), X10
  5784	MOVO  X10, 80(BP) // saved so it can be added back after the rounds
  5785
  5786sealSSETail64LoopA: // One Poly1305 block over the 16 bytes at (DI), then falls through into sealSSETail64LoopB.
  5787	ADDQ  (DI), R10 // accumulator (R10, R11, R12) += 16 bytes at (DI)
  5788	ADCQ  8(DI), R11
  5789	ADCQ  $0x01, R12 // carry plus 1 into the third limb, i.e. +2^128
  5790	MOVQ  (BP), AX // accumulator * (128-bit value at (BP), 8(BP)); product limbs end up in R13, R14, R15, R8
  5791	MOVQ  AX, R15
  5792	MULQ  R10
  5793	MOVQ  AX, R13
  5794	MOVQ  DX, R14
  5795	MOVQ  (BP), AX
  5796	MULQ  R11
  5797	IMULQ R12, R15
  5798	ADDQ  AX, R14
  5799	ADCQ  DX, R15
  5800	MOVQ  8(BP), AX
  5801	MOVQ  AX, R8
  5802	MULQ  R10
  5803	ADDQ  AX, R14
  5804	ADCQ  $0x00, DX
  5805	MOVQ  DX, R10
  5806	MOVQ  8(BP), AX
  5807	MULQ  R11
  5808	ADDQ  AX, R15
  5809	ADCQ  $0x00, DX
  5810	IMULQ R12, R8
  5811	ADDQ  R10, R15
  5812	ADCQ  DX, R8
  5813	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  5814	MOVQ  R14, R11
  5815	MOVQ  R15, R12
  5816	ANDQ  $0x03, R12 // keep only bits 128-129 in the third limb
  5817	MOVQ  R15, R13
  5818	ANDQ  $-4, R13 // R13 (low), R14 (high) = 4 * (product >> 130)
  5819	MOVQ  R8, R14
  5820	SHRQ  $0x02, R8, R15 // R15 (low), R8 (high) = product >> 130
  5821	SHRQ  $0x02, R8
  5822	ADDQ  R13, R10
  5823	ADCQ  R14, R11
  5824	ADCQ  $0x00, R12
  5825	ADDQ  R15, R10
  5826	ADCQ  R8, R11
  5827	ADCQ  $0x00, R12
  5828	LEAQ  16(DI), DI // step past the 16 bytes just absorbed
  5829
  5830sealSSETail64LoopB: // One ChaCha20 double round on (X1, X4, X7, X10) with X13 as the rotate temporary, followed by one Poly1305 block.
  5831	PADDD X4, X1 // quarter round, first half of the double round
  5832	PXOR  X1, X10
  5833	ROL16(X10, X13)
  5834	PADDD X10, X7
  5835	PXOR  X7, X4
  5836	MOVO  X4, X13 // next three instructions: rotate each 32-bit lane of X4 left by 12
  5837	PSLLL $0x0c, X13
  5838	PSRLL $0x14, X4
  5839	PXOR  X13, X4
  5840	PADDD X4, X1
  5841	PXOR  X1, X10
  5842	ROL8(X10, X13)
  5843	PADDD X10, X7
  5844	PXOR  X7, X4
  5845	MOVO  X4, X13 // next three instructions: rotate each 32-bit lane of X4 left by 7
  5846	PSLLL $0x07, X13
  5847	PSRLL $0x19, X4
  5848	PXOR  X13, X4
  5849	BYTE  $0x66 // hand-encoded PALIGNR $4, X4, X4: with both operands the same register this rotates the register by 4 bytes
  5850	BYTE  $0x0f
  5851	BYTE  $0x3a
  5852	BYTE  $0x0f
  5853	BYTE  $0xe4
  5854	BYTE  $0x04
  5855	BYTE  $0x66 // PALIGNR $8, X7, X7
  5856	BYTE  $0x0f
  5857	BYTE  $0x3a
  5858	BYTE  $0x0f
  5859	BYTE  $0xff
  5860	BYTE  $0x08
  5861	BYTE  $0x66 // PALIGNR $12, X10, X10 (0x45 is the REX prefix selecting X8-X15)
  5862	BYTE  $0x45
  5863	BYTE  $0x0f
  5864	BYTE  $0x3a
  5865	BYTE  $0x0f
  5866	BYTE  $0xd2
  5867	BYTE  $0x0c
  5868	PADDD X4, X1 // quarter round, second half of the double round
  5869	PXOR  X1, X10
  5870	ROL16(X10, X13)
  5871	PADDD X10, X7
  5872	PXOR  X7, X4
  5873	MOVO  X4, X13
  5874	PSLLL $0x0c, X13
  5875	PSRLL $0x14, X4
  5876	PXOR  X13, X4
  5877	PADDD X4, X1
  5878	PXOR  X1, X10
  5879	ROL8(X10, X13)
  5880	PADDD X10, X7
  5881	PXOR  X7, X4
  5882	MOVO  X4, X13
  5883	PSLLL $0x07, X13
  5884	PSRLL $0x19, X4
  5885	PXOR  X13, X4
  5886	BYTE  $0x66 // PALIGNR $12, X4, X4 (this group undoes the lane rotations above)
  5887	BYTE  $0x0f
  5888	BYTE  $0x3a
  5889	BYTE  $0x0f
  5890	BYTE  $0xe4
  5891	BYTE  $0x0c
  5892	BYTE  $0x66 // PALIGNR $8, X7, X7
  5893	BYTE  $0x0f
  5894	BYTE  $0x3a
  5895	BYTE  $0x0f
  5896	BYTE  $0xff
  5897	BYTE  $0x08
  5898	BYTE  $0x66 // PALIGNR $4, X10, X10
  5899	BYTE  $0x45
  5900	BYTE  $0x0f
  5901	BYTE  $0x3a
  5902	BYTE  $0x0f
  5903	BYTE  $0xd2
  5904	BYTE  $0x04
  5905	ADDQ  (DI), R10 // Poly1305 block (add, multiply, reduce) over the 16 bytes at (DI); R13 is reused as a scalar limb here, unrelated to X13
  5906	ADCQ  8(DI), R11
  5907	ADCQ  $0x01, R12
  5908	MOVQ  (BP), AX
  5909	MOVQ  AX, R15
  5910	MULQ  R10
  5911	MOVQ  AX, R13
  5912	MOVQ  DX, R14
  5913	MOVQ  (BP), AX
  5914	MULQ  R11
  5915	IMULQ R12, R15
  5916	ADDQ  AX, R14
  5917	ADCQ  DX, R15
  5918	MOVQ  8(BP), AX
  5919	MOVQ  AX, R8
  5920	MULQ  R10
  5921	ADDQ  AX, R14
  5922	ADCQ  $0x00, DX
  5923	MOVQ  DX, R10
  5924	MOVQ  8(BP), AX
  5925	MULQ  R11
  5926	ADDQ  AX, R15
  5927	ADCQ  $0x00, DX
  5928	IMULQ R12, R8
  5929	ADDQ  R10, R15
  5930	ADCQ  DX, R8
  5931	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  5932	MOVQ  R14, R11
  5933	MOVQ  R15, R12
  5934	ANDQ  $0x03, R12
  5935	MOVQ  R15, R13
  5936	ANDQ  $-4, R13
  5937	MOVQ  R8, R14
  5938	SHRQ  $0x02, R8, R15
  5939	SHRQ  $0x02, R8
  5940	ADDQ  R13, R10
  5941	ADCQ  R14, R11
  5942	ADCQ  $0x00, R12
  5943	ADDQ  R15, R10
  5944	ADCQ  R8, R11
  5945	ADCQ  $0x00, R12
  5946	LEAQ  16(DI), DI
  5947	DECQ  CX
  5948	JG    sealSSETail64LoopA // while CX stays positive, do an extra hash block before the next double round
  5949	DECQ  R9
  5950	JGE   sealSSETail64LoopB // then keep going, without the extra block, while R9 stays >= 0
  5951	PADDL ·chacha20Constants<>+0(SB), X1 // add the pre-round state back into the block
  5952	PADDL 32(BP), X4
  5953	PADDL 48(BP), X7
  5954	PADDL 80(BP), X10 // counter row saved at sealSSETail64
  5955	JMP   sealSSE128Seal
  5956
  5957sealSSETail128: // Reached when 0x40 < BX <= 0x80: set up two ChaCha20 states, (X0, X3, X6, X9) and (X1, X4, X7, X10).
  5958	MOVO  ·chacha20Constants<>+0(SB), X0
  5959	MOVO  32(BP), X3
  5960	MOVO  48(BP), X6
  5961	MOVO  128(BP), X9 // most recently stored counter row
  5962	PADDL ·sseIncMask<>+0(SB), X9
  5963	MOVO  X9, 80(BP) // saved so it can be added back after the rounds
  5964	MOVO  X0, X1 // second state: copy of the first with the counter row bumped once more
  5965	MOVO  X3, X4
  5966	MOVO  X6, X7
  5967	MOVO  X9, X10
  5968	PADDL ·sseIncMask<>+0(SB), X10
  5969	MOVO  X10, 96(BP)
  5970
  5971sealSSETail128LoopA: // One Poly1305 block over the 16 bytes at (DI), then falls through into sealSSETail128LoopB.
  5972	ADDQ  (DI), R10 // accumulator (R10, R11, R12) += 16 bytes at (DI)
  5973	ADCQ  8(DI), R11
  5974	ADCQ  $0x01, R12 // carry plus 1 into the third limb, i.e. +2^128
  5975	MOVQ  (BP), AX // accumulator * (128-bit value at (BP), 8(BP)); product limbs end up in R13, R14, R15, R8
  5976	MOVQ  AX, R15
  5977	MULQ  R10
  5978	MOVQ  AX, R13
  5979	MOVQ  DX, R14
  5980	MOVQ  (BP), AX
  5981	MULQ  R11
  5982	IMULQ R12, R15
  5983	ADDQ  AX, R14
  5984	ADCQ  DX, R15
  5985	MOVQ  8(BP), AX
  5986	MOVQ  AX, R8
  5987	MULQ  R10
  5988	ADDQ  AX, R14
  5989	ADCQ  $0x00, DX
  5990	MOVQ  DX, R10
  5991	MOVQ  8(BP), AX
  5992	MULQ  R11
  5993	ADDQ  AX, R15
  5994	ADCQ  $0x00, DX
  5995	IMULQ R12, R8
  5996	ADDQ  R10, R15
  5997	ADCQ  DX, R8
  5998	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  5999	MOVQ  R14, R11
  6000	MOVQ  R15, R12
  6001	ANDQ  $0x03, R12 // keep only bits 128-129 in the third limb
  6002	MOVQ  R15, R13
  6003	ANDQ  $-4, R13 // R13 (low), R14 (high) = 4 * (product >> 130)
  6004	MOVQ  R8, R14
  6005	SHRQ  $0x02, R8, R15 // R15 (low), R8 (high) = product >> 130
  6006	SHRQ  $0x02, R8
  6007	ADDQ  R13, R10
  6008	ADCQ  R14, R11
  6009	ADCQ  $0x00, R12
  6010	ADDQ  R15, R10
  6011	ADCQ  R8, R11
  6012	ADCQ  $0x00, R12
  6013	LEAQ  16(DI), DI // step past the 16 bytes just absorbed
  6014
  6015sealSSETail128LoopB: // One ChaCha20 double round on two states with X12 as the rotate temporary, with a Poly1305 block between the two halves.
  6016	PADDD X3, X0 // quarter round on (X0, X3, X6, X9)
  6017	PXOR  X0, X9
  6018	ROL16(X9, X12)
  6019	PADDD X9, X6
  6020	PXOR  X6, X3
  6021	MOVO  X3, X12 // next three instructions: rotate each 32-bit lane of X3 left by 12
  6022	PSLLL $0x0c, X12
  6023	PSRLL $0x14, X3
  6024	PXOR  X12, X3
  6025	PADDD X3, X0
  6026	PXOR  X0, X9
  6027	ROL8(X9, X12)
  6028	PADDD X9, X6
  6029	PXOR  X6, X3
  6030	MOVO  X3, X12 // next three instructions: rotate each 32-bit lane of X3 left by 7
  6031	PSLLL $0x07, X12
  6032	PSRLL $0x19, X3
  6033	PXOR  X12, X3
  6034	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  6035	PXOR  X1, X10
  6036	ROL16(X10, X12)
  6037	PADDD X10, X7
  6038	PXOR  X7, X4
  6039	MOVO  X4, X12
  6040	PSLLL $0x0c, X12
  6041	PSRLL $0x14, X4
  6042	PXOR  X12, X4
  6043	PADDD X4, X1
  6044	PXOR  X1, X10
  6045	ROL8(X10, X12)
  6046	PADDD X10, X7
  6047	PXOR  X7, X4
  6048	MOVO  X4, X12
  6049	PSLLL $0x07, X12
  6050	PSRLL $0x19, X4
  6051	PXOR  X12, X4
  6052	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3: with both operands the same register this rotates the register by 4 bytes
  6053	BYTE  $0x0f
  6054	BYTE  $0x3a
  6055	BYTE  $0x0f
  6056	BYTE  $0xdb
  6057	BYTE  $0x04
  6058	BYTE  $0x66 // PALIGNR $8, X6, X6
  6059	BYTE  $0x0f
  6060	BYTE  $0x3a
  6061	BYTE  $0x0f
  6062	BYTE  $0xf6
  6063	BYTE  $0x08
  6064	BYTE  $0x66 // PALIGNR $12, X9, X9 (0x45 is the REX prefix selecting X8-X15)
  6065	BYTE  $0x45
  6066	BYTE  $0x0f
  6067	BYTE  $0x3a
  6068	BYTE  $0x0f
  6069	BYTE  $0xc9
  6070	BYTE  $0x0c
  6071	BYTE  $0x66 // PALIGNR $4, X4, X4
  6072	BYTE  $0x0f
  6073	BYTE  $0x3a
  6074	BYTE  $0x0f
  6075	BYTE  $0xe4
  6076	BYTE  $0x04
  6077	BYTE  $0x66 // PALIGNR $8, X7, X7
  6078	BYTE  $0x0f
  6079	BYTE  $0x3a
  6080	BYTE  $0x0f
  6081	BYTE  $0xff
  6082	BYTE  $0x08
  6083	BYTE  $0x66 // PALIGNR $12, X10, X10
  6084	BYTE  $0x45
  6085	BYTE  $0x0f
  6086	BYTE  $0x3a
  6087	BYTE  $0x0f
  6088	BYTE  $0xd2
  6089	BYTE  $0x0c
  6090	ADDQ  (DI), R10 // Poly1305 block (add, multiply, reduce) over the 16 bytes at (DI)
  6091	ADCQ  8(DI), R11
  6092	ADCQ  $0x01, R12
  6093	MOVQ  (BP), AX
  6094	MOVQ  AX, R15
  6095	MULQ  R10
  6096	MOVQ  AX, R13
  6097	MOVQ  DX, R14
  6098	MOVQ  (BP), AX
  6099	MULQ  R11
  6100	IMULQ R12, R15
  6101	ADDQ  AX, R14
  6102	ADCQ  DX, R15
  6103	MOVQ  8(BP), AX
  6104	MOVQ  AX, R8
  6105	MULQ  R10
  6106	ADDQ  AX, R14
  6107	ADCQ  $0x00, DX
  6108	MOVQ  DX, R10
  6109	MOVQ  8(BP), AX
  6110	MULQ  R11
  6111	ADDQ  AX, R15
  6112	ADCQ  $0x00, DX
  6113	IMULQ R12, R8
  6114	ADDQ  R10, R15
  6115	ADCQ  DX, R8
  6116	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  6117	MOVQ  R14, R11
  6118	MOVQ  R15, R12
  6119	ANDQ  $0x03, R12
  6120	MOVQ  R15, R13
  6121	ANDQ  $-4, R13
  6122	MOVQ  R8, R14
  6123	SHRQ  $0x02, R8, R15
  6124	SHRQ  $0x02, R8
  6125	ADDQ  R13, R10
  6126	ADCQ  R14, R11
  6127	ADCQ  $0x00, R12
  6128	ADDQ  R15, R10
  6129	ADCQ  R8, R11
  6130	ADCQ  $0x00, R12
  6131	LEAQ  16(DI), DI
  6132	PADDD X3, X0 // second half of the double round: quarter round on (X0, X3, X6, X9)
  6133	PXOR  X0, X9
  6134	ROL16(X9, X12)
  6135	PADDD X9, X6
  6136	PXOR  X6, X3
  6137	MOVO  X3, X12
  6138	PSLLL $0x0c, X12
  6139	PSRLL $0x14, X3
  6140	PXOR  X12, X3
  6141	PADDD X3, X0
  6142	PXOR  X0, X9
  6143	ROL8(X9, X12)
  6144	PADDD X9, X6
  6145	PXOR  X6, X3
  6146	MOVO  X3, X12
  6147	PSLLL $0x07, X12
  6148	PSRLL $0x19, X3
  6149	PXOR  X12, X3
  6150	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  6151	PXOR  X1, X10
  6152	ROL16(X10, X12)
  6153	PADDD X10, X7
  6154	PXOR  X7, X4
  6155	MOVO  X4, X12
  6156	PSLLL $0x0c, X12
  6157	PSRLL $0x14, X4
  6158	PXOR  X12, X4
  6159	PADDD X4, X1
  6160	PXOR  X1, X10
  6161	ROL8(X10, X12)
  6162	PADDD X10, X7
  6163	PXOR  X7, X4
  6164	MOVO  X4, X12
  6165	PSLLL $0x07, X12
  6166	PSRLL $0x19, X4
  6167	PXOR  X12, X4
  6168	BYTE  $0x66 // PALIGNR $12, X3, X3 (this group undoes the lane rotations applied after the first half)
  6169	BYTE  $0x0f
  6170	BYTE  $0x3a
  6171	BYTE  $0x0f
  6172	BYTE  $0xdb
  6173	BYTE  $0x0c
  6174	BYTE  $0x66 // PALIGNR $8, X6, X6
  6175	BYTE  $0x0f
  6176	BYTE  $0x3a
  6177	BYTE  $0x0f
  6178	BYTE  $0xf6
  6179	BYTE  $0x08
  6180	BYTE  $0x66 // PALIGNR $4, X9, X9
  6181	BYTE  $0x45
  6182	BYTE  $0x0f
  6183	BYTE  $0x3a
  6184	BYTE  $0x0f
  6185	BYTE  $0xc9
  6186	BYTE  $0x04
  6187	BYTE  $0x66 // PALIGNR $12, X4, X4
  6188	BYTE  $0x0f
  6189	BYTE  $0x3a
  6190	BYTE  $0x0f
  6191	BYTE  $0xe4
  6192	BYTE  $0x0c
  6193	BYTE  $0x66 // PALIGNR $8, X7, X7
  6194	BYTE  $0x0f
  6195	BYTE  $0x3a
  6196	BYTE  $0x0f
  6197	BYTE  $0xff
  6198	BYTE  $0x08
  6199	BYTE  $0x66 // PALIGNR $4, X10, X10
  6200	BYTE  $0x45
  6201	BYTE  $0x0f
  6202	BYTE  $0x3a
  6203	BYTE  $0x0f
  6204	BYTE  $0xd2
  6205	BYTE  $0x04
  6206	DECQ  CX
  6207	JG    sealSSETail128LoopA // while CX stays positive, do an extra hash block before the next double round
  6208	DECQ  R9
  6209	JGE   sealSSETail128LoopB // then keep going, without the extra block, while R9 stays >= 0
  6210	PADDL ·chacha20Constants<>+0(SB), X0 // add the pre-round state back into both blocks
  6211	PADDL ·chacha20Constants<>+0(SB), X1
  6212	PADDL 32(BP), X3
  6213	PADDL 32(BP), X4
  6214	PADDL 48(BP), X6
  6215	PADDL 48(BP), X7
  6216	PADDL 80(BP), X9 // counter rows saved at sealSSETail128
  6217	PADDL 96(BP), X10
  6218	MOVOU (SI), X12 // 64 bytes of output at (DI) = input at (SI) XOR first block
  6219	MOVOU 16(SI), X13
  6220	MOVOU 32(SI), X14
  6221	MOVOU 48(SI), X15
  6222	PXOR  X12, X0
  6223	PXOR  X13, X3
  6224	PXOR  X14, X6
  6225	PXOR  X15, X9
  6226	MOVOU X0, (DI)
  6227	MOVOU X3, 16(DI)
  6228	MOVOU X6, 32(DI)
  6229	MOVOU X9, 48(DI)
  6230	MOVQ  $0x00000040, CX // CX = 64, the byte count just stored at (DI); NOTE(review): presumably read at sealSSE128SealHash as the amount still to hash - confirm
  6231	LEAQ  64(SI), SI
  6232	SUBQ  $0x40, BX
  6233	JMP   sealSSE128SealHash // second block stays in X1, X4, X7, X10
  6234
  6235sealSSETail192: // Reached when BX > 0x80 (and at most 0xc0 on the paths in view): set up three ChaCha20 states in X0-X11.
  6236	MOVO  ·chacha20Constants<>+0(SB), X0
  6237	MOVO  32(BP), X3
  6238	MOVO  48(BP), X6
  6239	MOVO  128(BP), X9 // most recently stored counter row
  6240	PADDL ·sseIncMask<>+0(SB), X9
  6241	MOVO  X9, 80(BP) // each counter row is saved so it can be added back after the rounds
  6242	MOVO  X0, X1 // second state: copy of the first with the counter row bumped once more
  6243	MOVO  X3, X4
  6244	MOVO  X6, X7
  6245	MOVO  X9, X10
  6246	PADDL ·sseIncMask<>+0(SB), X10
  6247	MOVO  X10, 96(BP)
  6248	MOVO  X1, X2 // third state
  6249	MOVO  X4, X5
  6250	MOVO  X7, X8
  6251	MOVO  X10, X11
  6252	PADDL ·sseIncMask<>+0(SB), X11
  6253	MOVO  X11, 112(BP)
  6254
  6255sealSSETail192LoopA: // One Poly1305 block over the 16 bytes at (DI), then falls through into sealSSETail192LoopB.
  6256	ADDQ  (DI), R10 // accumulator (R10, R11, R12) += 16 bytes at (DI)
  6257	ADCQ  8(DI), R11
  6258	ADCQ  $0x01, R12 // carry plus 1 into the third limb, i.e. +2^128
  6259	MOVQ  (BP), AX // accumulator * (128-bit value at (BP), 8(BP)); product limbs end up in R13, R14, R15, R8
  6260	MOVQ  AX, R15
  6261	MULQ  R10
  6262	MOVQ  AX, R13
  6263	MOVQ  DX, R14
  6264	MOVQ  (BP), AX
  6265	MULQ  R11
  6266	IMULQ R12, R15
  6267	ADDQ  AX, R14
  6268	ADCQ  DX, R15
  6269	MOVQ  8(BP), AX
  6270	MOVQ  AX, R8
  6271	MULQ  R10
  6272	ADDQ  AX, R14
  6273	ADCQ  $0x00, DX
  6274	MOVQ  DX, R10
  6275	MOVQ  8(BP), AX
  6276	MULQ  R11
  6277	ADDQ  AX, R15
  6278	ADCQ  $0x00, DX
  6279	IMULQ R12, R8
  6280	ADDQ  R10, R15
  6281	ADCQ  DX, R8
  6282	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  6283	MOVQ  R14, R11
  6284	MOVQ  R15, R12
  6285	ANDQ  $0x03, R12 // keep only bits 128-129 in the third limb
  6286	MOVQ  R15, R13
  6287	ANDQ  $-4, R13 // R13 (low), R14 (high) = 4 * (product >> 130)
  6288	MOVQ  R8, R14
  6289	SHRQ  $0x02, R8, R15 // R15 (low), R8 (high) = product >> 130
  6290	SHRQ  $0x02, R8
  6291	ADDQ  R13, R10
  6292	ADCQ  R14, R11
  6293	ADCQ  $0x00, R12
  6294	ADDQ  R15, R10
  6295	ADCQ  R8, R11
  6296	ADCQ  $0x00, R12
  6297	LEAQ  16(DI), DI // step past the 16 bytes just absorbed
  6298
  6299sealSSETail192LoopB: // One ChaCha20 double round on three states with X12 as the rotate temporary, with a Poly1305 block between the two halves.
  6300	PADDD X3, X0 // quarter round on (X0, X3, X6, X9)
  6301	PXOR  X0, X9
  6302	ROL16(X9, X12)
  6303	PADDD X9, X6
  6304	PXOR  X6, X3
  6305	MOVO  X3, X12 // next three instructions: rotate each 32-bit lane of X3 left by 12
  6306	PSLLL $0x0c, X12
  6307	PSRLL $0x14, X3
  6308	PXOR  X12, X3
  6309	PADDD X3, X0
  6310	PXOR  X0, X9
  6311	ROL8(X9, X12)
  6312	PADDD X9, X6
  6313	PXOR  X6, X3
  6314	MOVO  X3, X12 // next three instructions: rotate each 32-bit lane of X3 left by 7
  6315	PSLLL $0x07, X12
  6316	PSRLL $0x19, X3
  6317	PXOR  X12, X3
  6318	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  6319	PXOR  X1, X10
  6320	ROL16(X10, X12)
  6321	PADDD X10, X7
  6322	PXOR  X7, X4
  6323	MOVO  X4, X12
  6324	PSLLL $0x0c, X12
  6325	PSRLL $0x14, X4
  6326	PXOR  X12, X4
  6327	PADDD X4, X1
  6328	PXOR  X1, X10
  6329	ROL8(X10, X12)
  6330	PADDD X10, X7
  6331	PXOR  X7, X4
  6332	MOVO  X4, X12
  6333	PSLLL $0x07, X12
  6334	PSRLL $0x19, X4
  6335	PXOR  X12, X4
  6336	PADDD X5, X2 // quarter round on (X2, X5, X8, X11)
  6337	PXOR  X2, X11
  6338	ROL16(X11, X12)
  6339	PADDD X11, X8
  6340	PXOR  X8, X5
  6341	MOVO  X5, X12
  6342	PSLLL $0x0c, X12
  6343	PSRLL $0x14, X5
  6344	PXOR  X12, X5
  6345	PADDD X5, X2
  6346	PXOR  X2, X11
  6347	ROL8(X11, X12)
  6348	PADDD X11, X8
  6349	PXOR  X8, X5
  6350	MOVO  X5, X12
  6351	PSLLL $0x07, X12
  6352	PSRLL $0x19, X5
  6353	PXOR  X12, X5
  6354	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3: with both operands the same register this rotates the register by 4 bytes
  6355	BYTE  $0x0f
  6356	BYTE  $0x3a
  6357	BYTE  $0x0f
  6358	BYTE  $0xdb
  6359	BYTE  $0x04
  6360	BYTE  $0x66 // PALIGNR $8, X6, X6
  6361	BYTE  $0x0f
  6362	BYTE  $0x3a
  6363	BYTE  $0x0f
  6364	BYTE  $0xf6
  6365	BYTE  $0x08
  6366	BYTE  $0x66 // PALIGNR $12, X9, X9 (0x45 is the REX prefix selecting X8-X15)
  6367	BYTE  $0x45
  6368	BYTE  $0x0f
  6369	BYTE  $0x3a
  6370	BYTE  $0x0f
  6371	BYTE  $0xc9
  6372	BYTE  $0x0c
  6373	BYTE  $0x66 // PALIGNR $4, X4, X4
  6374	BYTE  $0x0f
  6375	BYTE  $0x3a
  6376	BYTE  $0x0f
  6377	BYTE  $0xe4
  6378	BYTE  $0x04
  6379	BYTE  $0x66 // PALIGNR $8, X7, X7
  6380	BYTE  $0x0f
  6381	BYTE  $0x3a
  6382	BYTE  $0x0f
  6383	BYTE  $0xff
  6384	BYTE  $0x08
  6385	BYTE  $0x66 // PALIGNR $12, X10, X10
  6386	BYTE  $0x45
  6387	BYTE  $0x0f
  6388	BYTE  $0x3a
  6389	BYTE  $0x0f
  6390	BYTE  $0xd2
  6391	BYTE  $0x0c
  6392	BYTE  $0x66 // PALIGNR $4, X5, X5
  6393	BYTE  $0x0f
  6394	BYTE  $0x3a
  6395	BYTE  $0x0f
  6396	BYTE  $0xed
  6397	BYTE  $0x04
  6398	BYTE  $0x66 // PALIGNR $8, X8, X8
  6399	BYTE  $0x45
  6400	BYTE  $0x0f
  6401	BYTE  $0x3a
  6402	BYTE  $0x0f
  6403	BYTE  $0xc0
  6404	BYTE  $0x08
  6405	BYTE  $0x66 // PALIGNR $12, X11, X11
  6406	BYTE  $0x45
  6407	BYTE  $0x0f
  6408	BYTE  $0x3a
  6409	BYTE  $0x0f
  6410	BYTE  $0xdb
  6411	BYTE  $0x0c
  6412	ADDQ  (DI), R10 // Poly1305 block (add, multiply, reduce) over the 16 bytes at (DI)
  6413	ADCQ  8(DI), R11
  6414	ADCQ  $0x01, R12
  6415	MOVQ  (BP), AX
  6416	MOVQ  AX, R15
  6417	MULQ  R10
  6418	MOVQ  AX, R13
  6419	MOVQ  DX, R14
  6420	MOVQ  (BP), AX
  6421	MULQ  R11
  6422	IMULQ R12, R15
  6423	ADDQ  AX, R14
  6424	ADCQ  DX, R15
  6425	MOVQ  8(BP), AX
  6426	MOVQ  AX, R8
  6427	MULQ  R10
  6428	ADDQ  AX, R14
  6429	ADCQ  $0x00, DX
  6430	MOVQ  DX, R10
  6431	MOVQ  8(BP), AX
  6432	MULQ  R11
  6433	ADDQ  AX, R15
  6434	ADCQ  $0x00, DX
  6435	IMULQ R12, R8
  6436	ADDQ  R10, R15
  6437	ADCQ  DX, R8
  6438	MOVQ  R13, R10 // reduce: accumulator = (product mod 2^130) + 5 * (product >> 130)
  6439	MOVQ  R14, R11
  6440	MOVQ  R15, R12
  6441	ANDQ  $0x03, R12
  6442	MOVQ  R15, R13
  6443	ANDQ  $-4, R13
  6444	MOVQ  R8, R14
  6445	SHRQ  $0x02, R8, R15
  6446	SHRQ  $0x02, R8
  6447	ADDQ  R13, R10
  6448	ADCQ  R14, R11
  6449	ADCQ  $0x00, R12
  6450	ADDQ  R15, R10
  6451	ADCQ  R8, R11
  6452	ADCQ  $0x00, R12
  6453	LEAQ  16(DI), DI
  6454	PADDD X3, X0 // second half of the double round: quarter round on (X0, X3, X6, X9)
  6455	PXOR  X0, X9
  6456	ROL16(X9, X12)
  6457	PADDD X9, X6
  6458	PXOR  X6, X3
  6459	MOVO  X3, X12
  6460	PSLLL $0x0c, X12
  6461	PSRLL $0x14, X3
  6462	PXOR  X12, X3
  6463	PADDD X3, X0
  6464	PXOR  X0, X9
  6465	ROL8(X9, X12)
  6466	PADDD X9, X6
  6467	PXOR  X6, X3
  6468	MOVO  X3, X12
  6469	PSLLL $0x07, X12
  6470	PSRLL $0x19, X3
  6471	PXOR  X12, X3
  6472	PADDD X4, X1 // quarter round on (X1, X4, X7, X10)
  6473	PXOR  X1, X10
  6474	ROL16(X10, X12)
  6475	PADDD X10, X7
  6476	PXOR  X7, X4
  6477	MOVO  X4, X12
  6478	PSLLL $0x0c, X12
  6479	PSRLL $0x14, X4
  6480	PXOR  X12, X4
  6481	PADDD X4, X1
  6482	PXOR  X1, X10
  6483	ROL8(X10, X12)
  6484	PADDD X10, X7
  6485	PXOR  X7, X4
  6486	MOVO  X4, X12
  6487	PSLLL $0x07, X12
  6488	PSRLL $0x19, X4
  6489	PXOR  X12, X4
  6490	PADDD X5, X2 // quarter round on (X2, X5, X8, X11)
  6491	PXOR  X2, X11
  6492	ROL16(X11, X12)
  6493	PADDD X11, X8
  6494	PXOR  X8, X5
  6495	MOVO  X5, X12
  6496	PSLLL $0x0c, X12
  6497	PSRLL $0x14, X5
  6498	PXOR  X12, X5
  6499	PADDD X5, X2
  6500	PXOR  X2, X11
  6501	ROL8(X11, X12)
  6502	PADDD X11, X8
  6503	PXOR  X8, X5
  6504	MOVO  X5, X12
  6505	PSLLL $0x07, X12
  6506	PSRLL $0x19, X5
  6507	PXOR  X12, X5
  6508	BYTE  $0x66 // PALIGNR $12, X3, X3 (this group undoes the lane rotations applied after the first half)
  6509	BYTE  $0x0f
  6510	BYTE  $0x3a
  6511	BYTE  $0x0f
  6512	BYTE  $0xdb
  6513	BYTE  $0x0c
  6514	BYTE  $0x66 // PALIGNR $8, X6, X6
  6515	BYTE  $0x0f
  6516	BYTE  $0x3a
  6517	BYTE  $0x0f
  6518	BYTE  $0xf6
  6519	BYTE  $0x08
  6520	BYTE  $0x66 // PALIGNR $4, X9, X9
  6521	BYTE  $0x45
  6522	BYTE  $0x0f
  6523	BYTE  $0x3a
  6524	BYTE  $0x0f
  6525	BYTE  $0xc9
  6526	BYTE  $0x04
  6527	BYTE  $0x66 // PALIGNR $12, X4, X4
  6528	BYTE  $0x0f
  6529	BYTE  $0x3a
  6530	BYTE  $0x0f
  6531	BYTE  $0xe4
  6532	BYTE  $0x0c
  6533	BYTE  $0x66 // PALIGNR $8, X7, X7
  6534	BYTE  $0x0f
  6535	BYTE  $0x3a
  6536	BYTE  $0x0f
  6537	BYTE  $0xff
  6538	BYTE  $0x08
  6539	BYTE  $0x66 // PALIGNR $4, X10, X10
  6540	BYTE  $0x45
  6541	BYTE  $0x0f
  6542	BYTE  $0x3a
  6543	BYTE  $0x0f
  6544	BYTE  $0xd2
  6545	BYTE  $0x04
  6546	BYTE  $0x66 // PALIGNR $12, X5, X5
  6547	BYTE  $0x0f
  6548	BYTE  $0x3a
  6549	BYTE  $0x0f
  6550	BYTE  $0xed
  6551	BYTE  $0x0c
  6552	BYTE  $0x66 // PALIGNR $8, X8, X8
  6553	BYTE  $0x45
  6554	BYTE  $0x0f
  6555	BYTE  $0x3a
  6556	BYTE  $0x0f
  6557	BYTE  $0xc0
  6558	BYTE  $0x08
  6559	BYTE  $0x66 // PALIGNR $4, X11, X11
  6560	BYTE  $0x45
  6561	BYTE  $0x0f
  6562	BYTE  $0x3a
  6563	BYTE  $0x0f
  6564	BYTE  $0xdb
  6565	BYTE  $0x04
  6566	DECQ  CX
  6567	JG    sealSSETail192LoopA // while CX stays positive, do an extra hash block before the next double round
  6568	DECQ  R9
  6569	JGE   sealSSETail192LoopB // then keep going, without the extra block, while R9 stays >= 0
  6570	PADDL ·chacha20Constants<>+0(SB), X0 // add the pre-round state back into all three blocks
  6571	PADDL ·chacha20Constants<>+0(SB), X1
  6572	PADDL ·chacha20Constants<>+0(SB), X2
  6573	PADDL 32(BP), X3
  6574	PADDL 32(BP), X4
  6575	PADDL 32(BP), X5
  6576	PADDL 48(BP), X6
  6577	PADDL 48(BP), X7
  6578	PADDL 48(BP), X8
  6579	PADDL 80(BP), X9 // counter rows saved at sealSSETail192
  6580	PADDL 96(BP), X10
  6581	PADDL 112(BP), X11
  6582	MOVOU (SI), X12 // output bytes 0-63 at (DI) = input at (SI) XOR first block
  6583	MOVOU 16(SI), X13
  6584	MOVOU 32(SI), X14
  6585	MOVOU 48(SI), X15
  6586	PXOR  X12, X0
  6587	PXOR  X13, X3
  6588	PXOR  X14, X6
  6589	PXOR  X15, X9
  6590	MOVOU X0, (DI)
  6591	MOVOU X3, 16(DI)
  6592	MOVOU X6, 32(DI)
  6593	MOVOU X9, 48(DI)
  6594	MOVOU 64(SI), X12 // output bytes 64-127 = input XOR second block
  6595	MOVOU 80(SI), X13
  6596	MOVOU 96(SI), X14
  6597	MOVOU 112(SI), X15
  6598	PXOR  X12, X1
  6599	PXOR  X13, X4
  6600	PXOR  X14, X7
  6601	PXOR  X15, X10
  6602	MOVOU X1, 64(DI)
  6603	MOVOU X4, 80(DI)
  6604	MOVOU X7, 96(DI)
  6605	MOVOU X10, 112(DI)
  6606	MOVO  X2, X1 // move the third block into X1, X4, X7, X10; NOTE(review): presumably where sealSSE128SealHash expects unused keystream - confirm
  6607	MOVO  X5, X4
  6608	MOVO  X8, X7
  6609	MOVO  X11, X10
  6610	MOVQ  $0x00000080, CX // CX = 128, the byte count just stored at (DI); NOTE(review): presumably read at sealSSE128SealHash as the amount still to hash - confirm
  6611	LEAQ  128(SI), SI
  6612	SUBQ  $0x80, BX
  6613	JMP   sealSSE128SealHash
  6614
  6615sealSSE128:
      	// Build three ChaCha20 states side by side, one 16-byte row per register:
      	//   A rows X0/X1/X2, B rows X3/X4/X5, C rows X6/X7/X8, D rows X9/X10/X11.
      	// The second and third states differ from the first only in the D row,
      	// which is advanced by sseIncMask once and twice respectively.
      	// NOTE(review): R8 presumably points at the 64-byte ChaCha20 input state set up
      	// by the caller; that code is outside this view - confirm.
  6616	MOVOU ·chacha20Constants<>+0(SB), X0
  6617	MOVOU 16(R8), X3
  6618	MOVOU 32(R8), X6
  6619	MOVOU 48(R8), X9
  6620	MOVO  X0, X1
  6621	MOVO  X3, X4
  6622	MOVO  X6, X7
  6623	MOVO  X9, X10
  6624	PADDL ·sseIncMask<>+0(SB), X10
  6625	MOVO  X1, X2
  6626	MOVO  X4, X5
  6627	MOVO  X7, X8
  6628	MOVO  X10, X11
  6629	PADDL ·sseIncMask<>+0(SB), X11
      	// Keep copies of the input B row, C row and second-state D row; they are
      	// added back to the working rows once the rounds are done.
  6630	MOVO  X3, X13
  6631	MOVO  X6, X14
  6632	MOVO  X10, X15
      	// R9 counts the 10 passes of sealSSE128InnerCipherLoop.
  6633	MOVQ  $0x0000000a, R9
  6634
  6635sealSSE128InnerCipherLoop:
      	// One pass = one ChaCha20 double round applied to all three states.
      	// Each quarter-round group below computes, on rows (a, b, c, d):
      	//   a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
      	//   a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7
      	// X12 is the scratch register for the rotations. The BYTE runs are
      	// hand-encoded PALIGNR instructions (66 [45] 0F 3A 0F /r ib) that rotate a
      	// row by 4, 8 or 12 bytes so the second half works on the diagonals.

      	// State 0, first half: a=X0, b=X3, c=X6, d=X9.
  6636	PADDD X3, X0
  6637	PXOR  X0, X9
  6638	ROL16(X9, X12)
  6639	PADDD X9, X6
  6640	PXOR  X6, X3
  6641	MOVO  X3, X12
  6642	PSLLL $0x0c, X12
  6643	PSRLL $0x14, X3
  6644	PXOR  X12, X3
  6645	PADDD X3, X0
  6646	PXOR  X0, X9
  6647	ROL8(X9, X12)
  6648	PADDD X9, X6
  6649	PXOR  X6, X3
  6650	MOVO  X3, X12
  6651	PSLLL $0x07, X12
  6652	PSRLL $0x19, X3
  6653	PXOR  X12, X3
      	// State 1, first half: a=X1, b=X4, c=X7, d=X10.
  6654	PADDD X4, X1
  6655	PXOR  X1, X10
  6656	ROL16(X10, X12)
  6657	PADDD X10, X7
  6658	PXOR  X7, X4
  6659	MOVO  X4, X12
  6660	PSLLL $0x0c, X12
  6661	PSRLL $0x14, X4
  6662	PXOR  X12, X4
  6663	PADDD X4, X1
  6664	PXOR  X1, X10
  6665	ROL8(X10, X12)
  6666	PADDD X10, X7
  6667	PXOR  X7, X4
  6668	MOVO  X4, X12
  6669	PSLLL $0x07, X12
  6670	PSRLL $0x19, X4
  6671	PXOR  X12, X4
      	// State 2, first half: a=X2, b=X5, c=X8, d=X11.
  6672	PADDD X5, X2
  6673	PXOR  X2, X11
  6674	ROL16(X11, X12)
  6675	PADDD X11, X8
  6676	PXOR  X8, X5
  6677	MOVO  X5, X12
  6678	PSLLL $0x0c, X12
  6679	PSRLL $0x14, X5
  6680	PXOR  X12, X5
  6681	PADDD X5, X2
  6682	PXOR  X2, X11
  6683	ROL8(X11, X12)
  6684	PADDD X11, X8
  6685	PXOR  X8, X5
  6686	MOVO  X5, X12
  6687	PSLLL $0x07, X12
  6688	PSRLL $0x19, X5
  6689	PXOR  X12, X5
      	// Diagonalize: B rows rotate by 4 bytes, C rows by 8, D rows by 12.
      	// PALIGNR $4, X3, X3
  6690	BYTE  $0x66
  6691	BYTE  $0x0f
  6692	BYTE  $0x3a
  6693	BYTE  $0x0f
  6694	BYTE  $0xdb
  6695	BYTE  $0x04
      	// PALIGNR $4, X4, X4
  6696	BYTE  $0x66
  6697	BYTE  $0x0f
  6698	BYTE  $0x3a
  6699	BYTE  $0x0f
  6700	BYTE  $0xe4
  6701	BYTE  $0x04
      	// PALIGNR $4, X5, X5
  6702	BYTE  $0x66
  6703	BYTE  $0x0f
  6704	BYTE  $0x3a
  6705	BYTE  $0x0f
  6706	BYTE  $0xed
  6707	BYTE  $0x04
      	// PALIGNR $8, X6, X6
  6708	BYTE  $0x66
  6709	BYTE  $0x0f
  6710	BYTE  $0x3a
  6711	BYTE  $0x0f
  6712	BYTE  $0xf6
  6713	BYTE  $0x08
      	// PALIGNR $8, X7, X7
  6714	BYTE  $0x66
  6715	BYTE  $0x0f
  6716	BYTE  $0x3a
  6717	BYTE  $0x0f
  6718	BYTE  $0xff
  6719	BYTE  $0x08
      	// PALIGNR $8, X8, X8 (the 0x45 REX prefix selects registers X8-X15)
  6720	BYTE  $0x66
  6721	BYTE  $0x45
  6722	BYTE  $0x0f
  6723	BYTE  $0x3a
  6724	BYTE  $0x0f
  6725	BYTE  $0xc0
  6726	BYTE  $0x08
      	// PALIGNR $12, X9, X9
  6727	BYTE  $0x66
  6728	BYTE  $0x45
  6729	BYTE  $0x0f
  6730	BYTE  $0x3a
  6731	BYTE  $0x0f
  6732	BYTE  $0xc9
  6733	BYTE  $0x0c
      	// PALIGNR $12, X10, X10
  6734	BYTE  $0x66
  6735	BYTE  $0x45
  6736	BYTE  $0x0f
  6737	BYTE  $0x3a
  6738	BYTE  $0x0f
  6739	BYTE  $0xd2
  6740	BYTE  $0x0c
      	// PALIGNR $12, X11, X11
  6741	BYTE  $0x66
  6742	BYTE  $0x45
  6743	BYTE  $0x0f
  6744	BYTE  $0x3a
  6745	BYTE  $0x0f
  6746	BYTE  $0xdb
  6747	BYTE  $0x0c
      	// State 0, second half (same quarter-round on the rotated rows).
  6748	PADDD X3, X0
  6749	PXOR  X0, X9
  6750	ROL16(X9, X12)
  6751	PADDD X9, X6
  6752	PXOR  X6, X3
  6753	MOVO  X3, X12
  6754	PSLLL $0x0c, X12
  6755	PSRLL $0x14, X3
  6756	PXOR  X12, X3
  6757	PADDD X3, X0
  6758	PXOR  X0, X9
  6759	ROL8(X9, X12)
  6760	PADDD X9, X6
  6761	PXOR  X6, X3
  6762	MOVO  X3, X12
  6763	PSLLL $0x07, X12
  6764	PSRLL $0x19, X3
  6765	PXOR  X12, X3
      	// State 1, second half.
  6766	PADDD X4, X1
  6767	PXOR  X1, X10
  6768	ROL16(X10, X12)
  6769	PADDD X10, X7
  6770	PXOR  X7, X4
  6771	MOVO  X4, X12
  6772	PSLLL $0x0c, X12
  6773	PSRLL $0x14, X4
  6774	PXOR  X12, X4
  6775	PADDD X4, X1
  6776	PXOR  X1, X10
  6777	ROL8(X10, X12)
  6778	PADDD X10, X7
  6779	PXOR  X7, X4
  6780	MOVO  X4, X12
  6781	PSLLL $0x07, X12
  6782	PSRLL $0x19, X4
  6783	PXOR  X12, X4
      	// State 2, second half.
  6784	PADDD X5, X2
  6785	PXOR  X2, X11
  6786	ROL16(X11, X12)
  6787	PADDD X11, X8
  6788	PXOR  X8, X5
  6789	MOVO  X5, X12
  6790	PSLLL $0x0c, X12
  6791	PSRLL $0x14, X5
  6792	PXOR  X12, X5
  6793	PADDD X5, X2
  6794	PXOR  X2, X11
  6795	ROL8(X11, X12)
  6796	PADDD X11, X8
  6797	PXOR  X8, X5
  6798	MOVO  X5, X12
  6799	PSLLL $0x07, X12
  6800	PSRLL $0x19, X5
  6801	PXOR  X12, X5
      	// Undo the diagonalization: B rows rotate by 12 bytes, C rows by 8, D rows by 4.
      	// PALIGNR $12, X3, X3
  6802	BYTE  $0x66
  6803	BYTE  $0x0f
  6804	BYTE  $0x3a
  6805	BYTE  $0x0f
  6806	BYTE  $0xdb
  6807	BYTE  $0x0c
      	// PALIGNR $12, X4, X4
  6808	BYTE  $0x66
  6809	BYTE  $0x0f
  6810	BYTE  $0x3a
  6811	BYTE  $0x0f
  6812	BYTE  $0xe4
  6813	BYTE  $0x0c
      	// PALIGNR $12, X5, X5
  6814	BYTE  $0x66
  6815	BYTE  $0x0f
  6816	BYTE  $0x3a
  6817	BYTE  $0x0f
  6818	BYTE  $0xed
  6819	BYTE  $0x0c
      	// PALIGNR $8, X6, X6
  6820	BYTE  $0x66
  6821	BYTE  $0x0f
  6822	BYTE  $0x3a
  6823	BYTE  $0x0f
  6824	BYTE  $0xf6
  6825	BYTE  $0x08
      	// PALIGNR $8, X7, X7
  6826	BYTE  $0x66
  6827	BYTE  $0x0f
  6828	BYTE  $0x3a
  6829	BYTE  $0x0f
  6830	BYTE  $0xff
  6831	BYTE  $0x08
      	// PALIGNR $8, X8, X8
  6832	BYTE  $0x66
  6833	BYTE  $0x45
  6834	BYTE  $0x0f
  6835	BYTE  $0x3a
  6836	BYTE  $0x0f
  6837	BYTE  $0xc0
  6838	BYTE  $0x08
      	// PALIGNR $4, X9, X9
  6839	BYTE  $0x66
  6840	BYTE  $0x45
  6841	BYTE  $0x0f
  6842	BYTE  $0x3a
  6843	BYTE  $0x0f
  6844	BYTE  $0xc9
  6845	BYTE  $0x04
      	// PALIGNR $4, X10, X10
  6846	BYTE  $0x66
  6847	BYTE  $0x45
  6848	BYTE  $0x0f
  6849	BYTE  $0x3a
  6850	BYTE  $0x0f
  6851	BYTE  $0xd2
  6852	BYTE  $0x04
      	// PALIGNR $4, X11, X11
  6853	BYTE  $0x66
  6854	BYTE  $0x45
  6855	BYTE  $0x0f
  6856	BYTE  $0x3a
  6857	BYTE  $0x0f
  6858	BYTE  $0xdb
  6859	BYTE  $0x04
      	// R9 was loaded with 10 before the loop; repeat until it reaches zero.
  6860	DECQ  R9
  6861	JNE   sealSSE128InnerCipherLoop
  6862
  6863	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
      	// Add the input state back into the working rows. X13/X14 hold the saved
      	// B and C input rows; X15 holds the D row of the second state and is bumped
      	// by sseIncMask once more before being added to the third state's D row.
      	// The first state's C and D rows (X6, X9) are not needed and are left as-is.
  6864	PADDL ·chacha20Constants<>+0(SB), X0
  6865	PADDL ·chacha20Constants<>+0(SB), X1
  6866	PADDL ·chacha20Constants<>+0(SB), X2
  6867	PADDL X13, X3
  6868	PADDL X13, X4
  6869	PADDL X13, X5
  6870	PADDL X14, X7
  6871	PADDL X14, X8
  6872	PADDL X15, X10
  6873	PADDL ·sseIncMask<>+0(SB), X15
  6874	PADDL X15, X11
      	// First half of the key is masked with polyClampMask and stored at (BP);
      	// the second half (added to the tag at the very end) goes to 16(BP).
  6875	PAND  ·polyClampMask<>+0(SB), X0
  6876	MOVOU X0, (BP)
  6877	MOVOU X3, 16(BP)
  6878
  6879	// Hash
      	// polyHashADInternal takes the additional-data length in R9.
      	// CX = 0 afterwards: no output has been written yet, so the
      	// sealSSE128SealHash loop that follows exits on its first compare.
  6880	MOVQ ad_len+80(FP), R9
  6881	CALL polyHashADInternal<>(SB)
  6882	XORQ CX, CX
  6883
  6884sealSSE128SealHash:
      	// Absorb CX bytes of already-written output, starting at DI, into the
      	// Poly1305 accumulator R10:R11:R12, 16 bytes per pass. DI advances with the
      	// hashing. Entered with CX = 0x80 from the 192-byte tail path above and with
      	// CX = 0 from the sealSSE128 path.
  6885	CMPQ  CX, $0x10
  6886	JB    sealSSE128Seal
      	// acc += block + 2^128 (the constant 1 added into the top limb).
  6887	ADDQ  (DI), R10
  6888	ADCQ  8(DI), R11
  6889	ADCQ  $0x01, R12
      	// 3-limb by 2-limb multiply: R13:R14:R15:R8 = acc * r, where r is the
      	// 128-bit value stored at (BP) and 8(BP).
  6890	MOVQ  (BP), AX
  6891	MOVQ  AX, R15
  6892	MULQ  R10
  6893	MOVQ  AX, R13
  6894	MOVQ  DX, R14
  6895	MOVQ  (BP), AX
  6896	MULQ  R11
  6897	IMULQ R12, R15
  6898	ADDQ  AX, R14
  6899	ADCQ  DX, R15
  6900	MOVQ  8(BP), AX
  6901	MOVQ  AX, R8
  6902	MULQ  R10
  6903	ADDQ  AX, R14
  6904	ADCQ  $0x00, DX
  6905	MOVQ  DX, R10
  6906	MOVQ  8(BP), AX
  6907	MULQ  R11
  6908	ADDQ  AX, R15
  6909	ADCQ  $0x00, DX
  6910	IMULQ R12, R8
  6911	ADDQ  R10, R15
  6912	ADCQ  DX, R8
      	// Partial reduction modulo 2^130 - 5: keep the low 130 bits as the new
      	// accumulator, then add 4*c and c, where c is the product shifted right
      	// by 130 bits (R13:R14 = 4*c, R15:R8 after the shifts = c).
  6913	MOVQ  R13, R10
  6914	MOVQ  R14, R11
  6915	MOVQ  R15, R12
  6916	ANDQ  $0x03, R12
  6917	MOVQ  R15, R13
  6918	ANDQ  $-4, R13
  6919	MOVQ  R8, R14
  6920	SHRQ  $0x02, R8, R15
  6921	SHRQ  $0x02, R8
  6922	ADDQ  R13, R10
  6923	ADCQ  R14, R11
  6924	ADCQ  $0x00, R12
  6925	ADDQ  R15, R10
  6926	ADCQ  R8, R11
  6927	ADCQ  $0x00, R12
      	// 16 bytes consumed: fewer left to hash, output pointer moves on.
  6928	SUBQ  $0x10, CX
  6929	ADDQ  $0x10, DI
  6930	JMP   sealSSE128SealHash
  6931
  6932sealSSE128Seal:
      	// While at least 16 input bytes remain (BX), encrypt one 16-byte block with
      	// the key stream in X1, hash the result, then shift the queued key-stream
      	// registers down by one so X1 holds the next 16 bytes.
  6933	CMPQ BX, $0x10
  6934	JB   sealSSETail
  6935	SUBQ $0x10, BX
  6936
  6937	// Load the input block, XOR it with the key stream and store the result
  6938	MOVOU (SI), X12
  6939	PXOR  X12, X1
  6940	MOVOU X1, (DI)
  6941	LEAQ  16(SI), SI
  6942	LEAQ  16(DI), DI
  6943
  6944	// Extract for hashing
      	// R13:R14 = the 16 bytes just written; acc += block + 2^128.
  6945	MOVQ   X1, R13
  6946	PSRLDQ $0x08, X1
  6947	MOVQ   X1, R14
  6948	ADDQ   R13, R10
  6949	ADCQ   R14, R11
  6950	ADCQ   $0x01, R12
      	// R13:R14:R15:R8 = acc * r, with r read from (BP) and 8(BP).
  6951	MOVQ   (BP), AX
  6952	MOVQ   AX, R15
  6953	MULQ   R10
  6954	MOVQ   AX, R13
  6955	MOVQ   DX, R14
  6956	MOVQ   (BP), AX
  6957	MULQ   R11
  6958	IMULQ  R12, R15
  6959	ADDQ   AX, R14
  6960	ADCQ   DX, R15
  6961	MOVQ   8(BP), AX
  6962	MOVQ   AX, R8
  6963	MULQ   R10
  6964	ADDQ   AX, R14
  6965	ADCQ   $0x00, DX
  6966	MOVQ   DX, R10
  6967	MOVQ   8(BP), AX
  6968	MULQ   R11
  6969	ADDQ   AX, R15
  6970	ADCQ   $0x00, DX
  6971	IMULQ  R12, R8
  6972	ADDQ   R10, R15
  6973	ADCQ   DX, R8
      	// Partial reduction modulo 2^130 - 5: low 130 bits plus 5 times the bits
      	// above them (added as 4*c in R13:R14 and c in R15:R8).
  6974	MOVQ   R13, R10
  6975	MOVQ   R14, R11
  6976	MOVQ   R15, R12
  6977	ANDQ   $0x03, R12
  6978	MOVQ   R15, R13
  6979	ANDQ   $-4, R13
  6980	MOVQ   R8, R14
  6981	SHRQ   $0x02, R8, R15
  6982	SHRQ   $0x02, R8
  6983	ADDQ   R13, R10
  6984	ADCQ   R14, R11
  6985	ADCQ   $0x00, R12
  6986	ADDQ   R15, R10
  6987	ADCQ   R8, R11
  6988	ADCQ   $0x00, R12
  6989
  6990	// Shift the stream "left"
      	// Key-stream queue order: X1, X4, X7, X10, X2, X5, X8, X11.
  6991	MOVO X4, X1
  6992	MOVO X7, X4
  6993	MOVO X10, X7
  6994	MOVO X2, X10
  6995	MOVO X5, X2
  6996	MOVO X8, X5
  6997	MOVO X11, X8
  6998	JMP  sealSSE128Seal
  6999
  7000sealSSETail:
      	// Handle the final partial block (BX < 16 bytes); nothing to do when BX == 0.
  7001	TESTQ BX, BX
  7002	JE    sealSSEFinalize
  7003
  7004	// The input tail is loaded one byte at a time so that no read goes past the end of the buffer
      	// R9  = 16*BX, used below as a byte offset into andMask
      	//       (NOTE(review): andMask presumably holds 16-byte masks keeping the
      	//       low 1..15 bytes; the table itself is outside this view)
      	// CX  = byte counter for the load loop
      	// SI  = address of the last input byte (the loop walks backwards)
      	// R8:R15 = 128-bit assembly buffer, AX = byte scratch; all start at zero
  7005	MOVQ BX, R9
  7006	SHLQ $0x04, R9
  7007	LEAQ ·andMask<>+0(SB), R13
  7008	MOVQ BX, CX
  7009	LEAQ -1(SI)(BX*1), SI
  7010	XORQ R15, R15
  7011	XORQ R8, R8
  7012	XORQ AX, AX
  7013
  7014sealSSETailLoadLoop:
      	// Shift the 128-bit value R8:R15 left by one byte and insert the next input
      	// byte (read from the end towards the start) at the bottom. After CX passes
      	// the tail bytes sit in the low BX bytes in memory order, zero-extended.
      	// MOVB only writes the low byte of AX; the rest of AX was zeroed before the loop.
  7015	SHLQ   $0x08, R15, R8
  7016	SHLQ   $0x08, R15
  7017	MOVB   (SI), AX
  7018	XORQ   AX, R15
  7019	LEAQ   -1(SI), SI
  7020	DECQ   CX
  7021	JNE    sealSSETailLoadLoop
      	// Spill the assembled block to the scratch slot at 64(BP) and XOR it into
      	// the key stream in X1.
  7022	MOVQ   R15, 64(BP)
  7023	MOVQ   R8, 72(BP)
  7024	PXOR   64(BP), X1
      	// A full 16 bytes are stored even though only BX are valid; DI is advanced
      	// by BX at the end of this section and the 16-byte tag is then written at
      	// DI, which overwrites the surplus bytes.
  7025	MOVOU  X1, (DI)
      	// Mask X1 with the andMask entry at offset 16*BX - 16 before hashing it.
  7026	MOVOU  -16(R13)(R9*1), X12
  7027	PAND   X12, X1
  7028	MOVQ   X1, R13
  7029	PSRLDQ $0x08, X1
  7030	MOVQ   X1, R14
      	// acc += masked block + 2^128.
  7031	ADDQ   R13, R10
  7032	ADCQ   R14, R11
  7033	ADCQ   $0x01, R12
      	// R13:R14:R15:R8 = acc * r, with r read from (BP) and 8(BP).
  7034	MOVQ   (BP), AX
  7035	MOVQ   AX, R15
  7036	MULQ   R10
  7037	MOVQ   AX, R13
  7038	MOVQ   DX, R14
  7039	MOVQ   (BP), AX
  7040	MULQ   R11
  7041	IMULQ  R12, R15
  7042	ADDQ   AX, R14
  7043	ADCQ   DX, R15
  7044	MOVQ   8(BP), AX
  7045	MOVQ   AX, R8
  7046	MULQ   R10
  7047	ADDQ   AX, R14
  7048	ADCQ   $0x00, DX
  7049	MOVQ   DX, R10
  7050	MOVQ   8(BP), AX
  7051	MULQ   R11
  7052	ADDQ   AX, R15
  7053	ADCQ   $0x00, DX
  7054	IMULQ  R12, R8
  7055	ADDQ   R10, R15
  7056	ADCQ   DX, R8
      	// Partial reduction modulo 2^130 - 5 (low 130 bits + 4*c + c).
  7057	MOVQ   R13, R10
  7058	MOVQ   R14, R11
  7059	MOVQ   R15, R12
  7060	ANDQ   $0x03, R12
  7061	MOVQ   R15, R13
  7062	ANDQ   $-4, R13
  7063	MOVQ   R8, R14
  7064	SHRQ   $0x02, R8, R15
  7065	SHRQ   $0x02, R8
  7066	ADDQ   R13, R10
  7067	ADCQ   R14, R11
  7068	ADCQ   $0x00, R12
  7069	ADDQ   R15, R10
  7070	ADCQ   R8, R11
  7071	ADCQ   $0x00, R12
      	// Step the output pointer past the BX valid bytes only.
  7072	ADDQ   BX, DI
  7073
  7074sealSSEFinalize:
  7075	// Hash in the buffer lengths
      	// The last block is ad_len in the low 64 bits and src_len in the high 64
      	// bits, plus the usual 2^128 term.
  7076	ADDQ  ad_len+80(FP), R10
  7077	ADCQ  src_len+56(FP), R11
  7078	ADCQ  $0x01, R12
      	// R13:R14:R15:R8 = acc * r, with r read from (BP) and 8(BP).
  7079	MOVQ  (BP), AX
  7080	MOVQ  AX, R15
  7081	MULQ  R10
  7082	MOVQ  AX, R13
  7083	MOVQ  DX, R14
  7084	MOVQ  (BP), AX
  7085	MULQ  R11
  7086	IMULQ R12, R15
  7087	ADDQ  AX, R14
  7088	ADCQ  DX, R15
  7089	MOVQ  8(BP), AX
  7090	MOVQ  AX, R8
  7091	MULQ  R10
  7092	ADDQ  AX, R14
  7093	ADCQ  $0x00, DX
  7094	MOVQ  DX, R10
  7095	MOVQ  8(BP), AX
  7096	MULQ  R11
  7097	ADDQ  AX, R15
  7098	ADCQ  $0x00, DX
  7099	IMULQ R12, R8
  7100	ADDQ  R10, R15
  7101	ADCQ  DX, R8
      	// Partial reduction modulo 2^130 - 5 (low 130 bits + 4*c + c).
  7102	MOVQ  R13, R10
  7103	MOVQ  R14, R11
  7104	MOVQ  R15, R12
  7105	ANDQ  $0x03, R12
  7106	MOVQ  R15, R13
  7107	ANDQ  $-4, R13
  7108	MOVQ  R8, R14
  7109	SHRQ  $0x02, R8, R15
  7110	SHRQ  $0x02, R8
  7111	ADDQ  R13, R10
  7112	ADCQ  R14, R11
  7113	ADCQ  $0x00, R12
  7114	ADDQ  R15, R10
  7115	ADCQ  R8, R11
  7116	ADCQ  $0x00, R12
  7117
  7118	// Final reduce
      	// Subtract 2^130 - 5 (limbs -5, -1, 3) from a copy of the accumulator; if
      	// the subtraction borrows (carry set) the saved original is restored, so
      	// the choice is made with conditional moves rather than a branch.
  7119	MOVQ    R10, R13
  7120	MOVQ    R11, R14
  7121	MOVQ    R12, R15
  7122	SUBQ    $-5, R10
  7123	SBBQ    $-1, R11
  7124	SBBQ    $0x03, R12
  7125	CMOVQCS R13, R10
  7126	CMOVQCS R14, R11
  7127	CMOVQCS R15, R12
  7128
  7129	// Add in the "s" part of the key
      	// Only the low 128 bits are kept; the carry out of R11 is dropped.
  7130	ADDQ 16(BP), R10
  7131	ADCQ 24(BP), R11
  7132
  7133	// Finally store the tag at the end of the message
  7134	MOVQ R10, (DI)
  7135	MOVQ R11, 8(DI)
  7136	RET
  7137
  7138chacha20Poly1305Seal_AVX2:
      	// AVX2 entry: each YMM register holds the same row of two ChaCha20 states,
      	// one per 128-bit lane. The BYTE runs are hand-encoded VEX instructions.
  7139	VZEROUPPER
  7140	VMOVDQU ·chacha20Constants<>+0(SB), Y0
      	// VBROADCASTI128 16(R8), Y14
  7141	BYTE    $0xc4
  7142	BYTE    $0x42
  7143	BYTE    $0x7d
  7144	BYTE    $0x5a
  7145	BYTE    $0x70
  7146	BYTE    $0x10
      	// VBROADCASTI128 32(R8), Y12
  7147	BYTE    $0xc4
  7148	BYTE    $0x42
  7149	BYTE    $0x7d
  7150	BYTE    $0x5a
  7151	BYTE    $0x60
  7152	BYTE    $0x20
      	// VBROADCASTI128 48(R8), Y4
  7153	BYTE    $0xc4
  7154	BYTE    $0xc2
  7155	BYTE    $0x7d
  7156	BYTE    $0x5a
  7157	BYTE    $0x60
  7158	BYTE    $0x30
      	// avx2InitMask is added to the broadcast D row.
      	// NOTE(review): presumably this makes the two lanes differ in their block
      	// counter; the mask's value is defined outside this view.
  7159	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
  7160
  7161	// Special optimizations, for very short buffers
      	// Inputs of at most 192 (0xc0) or 320 (0x140) bytes take dedicated paths.
  7162	CMPQ BX, $0x000000c0
  7163	JBE  seal192AVX2
  7164	CMPQ BX, $0x00000140
  7165	JBE  seal320AVX2
  7166
  7167	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
      	// Four register sets are prepared (rows A, B, C, D):
      	//   (Y0, Y14, Y12, Y4), (Y5, Y9, Y13, Y1), (Y6, Y10, Y8, Y2), (Y7, Y11, Y15, Y3).
      	// The input B and C rows are saved at 32(BP) and 64(BP); the four D rows,
      	// each avx2IncMask beyond the previous one, are saved at 96/128/160/192(BP).
  7168	VMOVDQA Y0, Y5
  7169	VMOVDQA Y0, Y6
  7170	VMOVDQA Y0, Y7
  7171	VMOVDQA Y14, Y9
  7172	VMOVDQA Y14, Y10
  7173	VMOVDQA Y14, Y11
  7174	VMOVDQA Y14, 32(BP)
  7175	VMOVDQA Y12, Y13
  7176	VMOVDQA Y12, Y8
  7177	VMOVDQA Y12, Y15
  7178	VMOVDQA Y12, 64(BP)
  7179	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  7180	VMOVDQA Y4, 96(BP)
  7181	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  7182	VMOVDQA Y1, 128(BP)
  7183	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
  7184	VMOVDQA Y2, 160(BP)
  7185	VMOVDQA Y3, 192(BP)
      	// R9 counts the 10 passes of sealAVX2IntroLoop.
  7186	MOVQ    $0x0000000a, R9
  7187
  7188sealAVX2IntroLoop:
      	// One pass = one ChaCha20 double round over all four register sets.
      	// All sixteen YMM registers hold state, so the rotations need a borrowed
      	// scratch register: Y15 is spilled to 224(BP) while the first three sets
      	// are processed, then restored and Y13 is spilled while the fourth set
      	// (which uses Y15 as its C row) is processed.
  7189	VMOVDQA    Y15, 224(BP)
      	// Set 0: a=Y0, b=Y14, c=Y12, d=Y4.
  7190	VPADDD     Y14, Y0, Y0
  7191	VPXOR      Y0, Y4, Y4
  7192	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  7193	VPADDD     Y4, Y12, Y12
  7194	VPXOR      Y12, Y14, Y14
  7195	VPSLLD     $0x0c, Y14, Y15
  7196	VPSRLD     $0x14, Y14, Y14
  7197	VPXOR      Y15, Y14, Y14
  7198	VPADDD     Y14, Y0, Y0
  7199	VPXOR      Y0, Y4, Y4
  7200	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  7201	VPADDD     Y4, Y12, Y12
  7202	VPXOR      Y12, Y14, Y14
  7203	VPSLLD     $0x07, Y14, Y15
  7204	VPSRLD     $0x19, Y14, Y14
  7205	VPXOR      Y15, Y14, Y14
      	// Set 1: a=Y5, b=Y9, c=Y13, d=Y1.
  7206	VPADDD     Y9, Y5, Y5
  7207	VPXOR      Y5, Y1, Y1
  7208	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  7209	VPADDD     Y1, Y13, Y13
  7210	VPXOR      Y13, Y9, Y9
  7211	VPSLLD     $0x0c, Y9, Y15
  7212	VPSRLD     $0x14, Y9, Y9
  7213	VPXOR      Y15, Y9, Y9
  7214	VPADDD     Y9, Y5, Y5
  7215	VPXOR      Y5, Y1, Y1
  7216	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  7217	VPADDD     Y1, Y13, Y13
  7218	VPXOR      Y13, Y9, Y9
  7219	VPSLLD     $0x07, Y9, Y15
  7220	VPSRLD     $0x19, Y9, Y9
  7221	VPXOR      Y15, Y9, Y9
      	// Set 2: a=Y6, b=Y10, c=Y8, d=Y2.
  7222	VPADDD     Y10, Y6, Y6
  7223	VPXOR      Y6, Y2, Y2
  7224	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  7225	VPADDD     Y2, Y8, Y8
  7226	VPXOR      Y8, Y10, Y10
  7227	VPSLLD     $0x0c, Y10, Y15
  7228	VPSRLD     $0x14, Y10, Y10
  7229	VPXOR      Y15, Y10, Y10
  7230	VPADDD     Y10, Y6, Y6
  7231	VPXOR      Y6, Y2, Y2
  7232	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  7233	VPADDD     Y2, Y8, Y8
  7234	VPXOR      Y8, Y10, Y10
  7235	VPSLLD     $0x07, Y10, Y15
  7236	VPSRLD     $0x19, Y10, Y10
  7237	VPXOR      Y15, Y10, Y10
      	// Swap the borrowed register: restore Y15, spill Y13 and use it as scratch.
  7238	VMOVDQA    224(BP), Y15
  7239	VMOVDQA    Y13, 224(BP)
      	// Set 3: a=Y7, b=Y11, c=Y15, d=Y3.
  7240	VPADDD     Y11, Y7, Y7
  7241	VPXOR      Y7, Y3, Y3
  7242	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
  7243	VPADDD     Y3, Y15, Y15
  7244	VPXOR      Y15, Y11, Y11
  7245	VPSLLD     $0x0c, Y11, Y13
  7246	VPSRLD     $0x14, Y11, Y11
  7247	VPXOR      Y13, Y11, Y11
  7248	VPADDD     Y11, Y7, Y7
  7249	VPXOR      Y7, Y3, Y3
  7250	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
  7251	VPADDD     Y3, Y15, Y15
  7252	VPXOR      Y15, Y11, Y11
  7253	VPSLLD     $0x07, Y11, Y13
  7254	VPSRLD     $0x19, Y11, Y11
  7255	VPXOR      Y13, Y11, Y11
  7256	VMOVDQA    224(BP), Y13
      	// Diagonalize: within each 128-bit lane rotate B rows by 4 bytes, C rows
      	// by 8 and D rows by 12.
  7257	VPALIGNR   $0x04, Y14, Y14, Y14
  7258	VPALIGNR   $0x08, Y12, Y12, Y12
  7259	VPALIGNR   $0x0c, Y4, Y4, Y4
  7260	VPALIGNR   $0x04, Y9, Y9, Y9
  7261	VPALIGNR   $0x08, Y13, Y13, Y13
  7262	VPALIGNR   $0x0c, Y1, Y1, Y1
  7263	VPALIGNR   $0x04, Y10, Y10, Y10
  7264	VPALIGNR   $0x08, Y8, Y8, Y8
  7265	VPALIGNR   $0x0c, Y2, Y2, Y2
  7266	VPALIGNR   $0x04, Y11, Y11, Y11
  7267	VPALIGNR   $0x08, Y15, Y15, Y15
  7268	VPALIGNR   $0x0c, Y3, Y3, Y3
      	// Second half: the same quarter-rounds on the rotated rows.
  7269	VMOVDQA    Y15, 224(BP)
  7270	VPADDD     Y14, Y0, Y0
  7271	VPXOR      Y0, Y4, Y4
  7272	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  7273	VPADDD     Y4, Y12, Y12
  7274	VPXOR      Y12, Y14, Y14
  7275	VPSLLD     $0x0c, Y14, Y15
  7276	VPSRLD     $0x14, Y14, Y14
  7277	VPXOR      Y15, Y14, Y14
  7278	VPADDD     Y14, Y0, Y0
  7279	VPXOR      Y0, Y4, Y4
  7280	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  7281	VPADDD     Y4, Y12, Y12
  7282	VPXOR      Y12, Y14, Y14
  7283	VPSLLD     $0x07, Y14, Y15
  7284	VPSRLD     $0x19, Y14, Y14
  7285	VPXOR      Y15, Y14, Y14
  7286	VPADDD     Y9, Y5, Y5
  7287	VPXOR      Y5, Y1, Y1
  7288	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  7289	VPADDD     Y1, Y13, Y13
  7290	VPXOR      Y13, Y9, Y9
  7291	VPSLLD     $0x0c, Y9, Y15
  7292	VPSRLD     $0x14, Y9, Y9
  7293	VPXOR      Y15, Y9, Y9
  7294	VPADDD     Y9, Y5, Y5
  7295	VPXOR      Y5, Y1, Y1
  7296	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  7297	VPADDD     Y1, Y13, Y13
  7298	VPXOR      Y13, Y9, Y9
  7299	VPSLLD     $0x07, Y9, Y15
  7300	VPSRLD     $0x19, Y9, Y9
  7301	VPXOR      Y15, Y9, Y9
  7302	VPADDD     Y10, Y6, Y6
  7303	VPXOR      Y6, Y2, Y2
  7304	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  7305	VPADDD     Y2, Y8, Y8
  7306	VPXOR      Y8, Y10, Y10
  7307	VPSLLD     $0x0c, Y10, Y15
  7308	VPSRLD     $0x14, Y10, Y10
  7309	VPXOR      Y15, Y10, Y10
  7310	VPADDD     Y10, Y6, Y6
  7311	VPXOR      Y6, Y2, Y2
  7312	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  7313	VPADDD     Y2, Y8, Y8
  7314	VPXOR      Y8, Y10, Y10
  7315	VPSLLD     $0x07, Y10, Y15
  7316	VPSRLD     $0x19, Y10, Y10
  7317	VPXOR      Y15, Y10, Y10
  7318	VMOVDQA    224(BP), Y15
  7319	VMOVDQA    Y13, 224(BP)
  7320	VPADDD     Y11, Y7, Y7
  7321	VPXOR      Y7, Y3, Y3
  7322	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
  7323	VPADDD     Y3, Y15, Y15
  7324	VPXOR      Y15, Y11, Y11
  7325	VPSLLD     $0x0c, Y11, Y13
  7326	VPSRLD     $0x14, Y11, Y11
  7327	VPXOR      Y13, Y11, Y11
  7328	VPADDD     Y11, Y7, Y7
  7329	VPXOR      Y7, Y3, Y3
  7330	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
  7331	VPADDD     Y3, Y15, Y15
  7332	VPXOR      Y15, Y11, Y11
  7333	VPSLLD     $0x07, Y11, Y13
  7334	VPSRLD     $0x19, Y11, Y11
  7335	VPXOR      Y13, Y11, Y11
  7336	VMOVDQA    224(BP), Y13
      	// Undo the diagonalization: B rows by 12 bytes, C rows by 8, D rows by 4.
  7337	VPALIGNR   $0x0c, Y14, Y14, Y14
  7338	VPALIGNR   $0x08, Y12, Y12, Y12
  7339	VPALIGNR   $0x04, Y4, Y4, Y4
  7340	VPALIGNR   $0x0c, Y9, Y9, Y9
  7341	VPALIGNR   $0x08, Y13, Y13, Y13
  7342	VPALIGNR   $0x04, Y1, Y1, Y1
  7343	VPALIGNR   $0x0c, Y10, Y10, Y10
  7344	VPALIGNR   $0x08, Y8, Y8, Y8
  7345	VPALIGNR   $0x04, Y2, Y2, Y2
  7346	VPALIGNR   $0x0c, Y11, Y11, Y11
  7347	VPALIGNR   $0x08, Y15, Y15, Y15
  7348	VPALIGNR   $0x04, Y3, Y3, Y3
      	// R9 was loaded with 10 before the loop; repeat until it reaches zero.
  7349	DECQ       R9
  7350	JNE        sealAVX2IntroLoop
  7351	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // add the input state back into every working row
  7352	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  7353	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
  7354	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
  7355	VPADDD     32(BP), Y14, Y14
  7356	VPADDD     32(BP), Y9, Y9
  7357	VPADDD     32(BP), Y10, Y10
  7358	VPADDD     32(BP), Y11, Y11
  7359	VPADDD     64(BP), Y12, Y12
  7360	VPADDD     64(BP), Y13, Y13
  7361	VPADDD     64(BP), Y8, Y8
  7362	VPADDD     64(BP), Y15, Y15
  7363	VPADDD     96(BP), Y4, Y4
  7364	VPADDD     128(BP), Y1, Y1
  7365	VPADDD     160(BP), Y2, Y2
  7366	VPADDD     192(BP), Y3, Y3
      	// Regroup lanes into contiguous key stream: immediate 0x02 combines the low
      	// 128-bit lanes of the two sources, 0x13 combines their high lanes.
      	// Y4 receives the low lanes of the A and B rows of set 0 (the poly key);
      	// Y0 and Y12 receive the high-lane block, used as the first 64 bytes of key stream.
  7367	VPERM2I128 $0x13, Y12, Y4, Y12
  7368	VPERM2I128 $0x02, Y0, Y14, Y4
  7369	VPERM2I128 $0x13, Y0, Y14, Y0
  7370
  7371	// Clamp and store poly key
  7372	VPAND   ·polyClampMask<>+0(SB), Y4, Y4
  7373	VMOVDQA Y4, (BP)
  7374
  7375	// Hash AD
      	// polyHashADInternal takes the additional-data length in R9.
  7376	MOVQ ad_len+80(FP), R9
  7377	CALL polyHashADInternal<>(SB)
  7378
  7379	// Can store at least 320 bytes
      	// (inputs of 320 bytes or fewer were dispatched to other paths at the AVX2 entry)
  7380	VPXOR      (SI), Y0, Y0
  7381	VPXOR      32(SI), Y12, Y12
  7382	VMOVDQU    Y0, (DI)
  7383	VMOVDQU    Y12, 32(DI)
  7384	VPERM2I128 $0x02, Y5, Y9, Y0
  7385	VPERM2I128 $0x02, Y13, Y1, Y14
  7386	VPERM2I128 $0x13, Y5, Y9, Y12
  7387	VPERM2I128 $0x13, Y13, Y1, Y4
  7388	VPXOR      64(SI), Y0, Y0
  7389	VPXOR      96(SI), Y14, Y14
  7390	VPXOR      128(SI), Y12, Y12
  7391	VPXOR      160(SI), Y4, Y4
  7392	VMOVDQU    Y0, 64(DI)
  7393	VMOVDQU    Y14, 96(DI)
  7394	VMOVDQU    Y12, 128(DI)
  7395	VMOVDQU    Y4, 160(DI)
  7396	VPERM2I128 $0x02, Y6, Y10, Y0
  7397	VPERM2I128 $0x02, Y8, Y2, Y14
  7398	VPERM2I128 $0x13, Y6, Y10, Y12
  7399	VPERM2I128 $0x13, Y8, Y2, Y4
  7400	VPXOR      192(SI), Y0, Y0
  7401	VPXOR      224(SI), Y14, Y14
  7402	VPXOR      256(SI), Y12, Y12
  7403	VPXOR      288(SI), Y4, Y4
  7404	VMOVDQU    Y0, 192(DI)
  7405	VMOVDQU    Y14, 224(DI)
  7406	VMOVDQU    Y12, 256(DI)
  7407	VMOVDQU    Y4, 288(DI)
      	// 320 bytes written: CX = 0x140, BX and SI account for the consumed input.
      	// DI is left unchanged here. NOTE(review): CX is presumably the count of
      	// output bytes still to be hashed by sealAVX2SealHash - confirm there.
  7408	MOVQ       $0x00000140, CX
  7409	SUBQ       $0x00000140, BX
  7410	LEAQ       320(SI), SI
      	// Key stream from the fourth register set is staged in Y0, Y14, Y12, Y4.
  7411	VPERM2I128 $0x02, Y7, Y11, Y0
  7412	VPERM2I128 $0x02, Y15, Y3, Y14
  7413	VPERM2I128 $0x13, Y7, Y11, Y12
  7414	VPERM2I128 $0x13, Y15, Y3, Y4
      	// With 128 bytes or fewer left, hand over to sealAVX2SealHash with that
      	// key stream still in registers.
  7415	CMPQ       BX, $0x80
  7416	JBE        sealAVX2SealHash
      	// Otherwise use it now for the next 128 bytes (output offsets 320..447).
  7417	VPXOR      (SI), Y0, Y0
  7418	VPXOR      32(SI), Y14, Y14
  7419	VPXOR      64(SI), Y12, Y12
  7420	VPXOR      96(SI), Y4, Y4
  7421	VMOVDQU    Y0, 320(DI)
  7422	VMOVDQU    Y14, 352(DI)
  7423	VMOVDQU    Y12, 384(DI)
  7424	VMOVDQU    Y4, 416(DI)
  7425	SUBQ       $0x80, BX
  7426	LEAQ       128(SI), SI
      	// NOTE(review): CX = 8 and R9 = 2 are presumably iteration counts consumed
      	// by the tail routines below; those routines are outside this view.
  7427	MOVQ       $0x00000008, CX
  7428	MOVQ       $0x00000002, R9
      	// Pick a tail routine by remaining length (<=128, <=256, <=384, <=512);
      	// more than 512 bytes falls through.
  7429	CMPQ       BX, $0x80
  7430	JBE        sealAVX2Tail128
  7431	CMPQ       BX, $0x00000100
  7432	JBE        sealAVX2Tail256
  7433	CMPQ       BX, $0x00000180
  7434	JBE        sealAVX2Tail384
  7435	CMPQ       BX, $0x00000200
  7436	JBE        sealAVX2Tail512
  7437
  7438	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
      	// Rebuild four fresh register sets from the saved rows (B at 32(BP), C at
      	// 64(BP)) with D rows continuing from the last counter saved at 192(BP),
      	// and store the new D rows back at 96/128/160/192(BP).
  7439	VMOVDQA  ·chacha20Constants<>+0(SB), Y0
  7440	VMOVDQA  Y0, Y5
  7441	VMOVDQA  Y0, Y6
  7442	VMOVDQA  Y0, Y7
  7443	VMOVDQA  32(BP), Y14
  7444	VMOVDQA  Y14, Y9
  7445	VMOVDQA  Y14, Y10
  7446	VMOVDQA  Y14, Y11
  7447	VMOVDQA  64(BP), Y12
  7448	VMOVDQA  Y12, Y13
  7449	VMOVDQA  Y12, Y8
  7450	VMOVDQA  Y12, Y15
  7451	VMOVDQA  192(BP), Y4
  7452	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y4
  7453	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y1
  7454	VPADDD   ·avx2IncMask<>+0(SB), Y1, Y2
  7455	VPADDD   ·avx2IncMask<>+0(SB), Y2, Y3
  7456	VMOVDQA  Y4, 96(BP)
  7457	VMOVDQA  Y1, 128(BP)
  7458	VMOVDQA  Y2, 160(BP)
  7459	VMOVDQA  Y3, 192(BP)
      	// One full double round with no hashing interleaved. 224(BP) is the spill
      	// slot for whichever register (Y15, then Y13) is borrowed as scratch.
  7460	VMOVDQA  Y15, 224(BP)
  7461	VPADDD   Y14, Y0, Y0
  7462	VPXOR    Y0, Y4, Y4
  7463	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  7464	VPADDD   Y4, Y12, Y12
  7465	VPXOR    Y12, Y14, Y14
  7466	VPSLLD   $0x0c, Y14, Y15
  7467	VPSRLD   $0x14, Y14, Y14
  7468	VPXOR    Y15, Y14, Y14
  7469	VPADDD   Y14, Y0, Y0
  7470	VPXOR    Y0, Y4, Y4
  7471	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  7472	VPADDD   Y4, Y12, Y12
  7473	VPXOR    Y12, Y14, Y14
  7474	VPSLLD   $0x07, Y14, Y15
  7475	VPSRLD   $0x19, Y14, Y14
  7476	VPXOR    Y15, Y14, Y14
  7477	VPADDD   Y9, Y5, Y5
  7478	VPXOR    Y5, Y1, Y1
  7479	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  7480	VPADDD   Y1, Y13, Y13
  7481	VPXOR    Y13, Y9, Y9
  7482	VPSLLD   $0x0c, Y9, Y15
  7483	VPSRLD   $0x14, Y9, Y9
  7484	VPXOR    Y15, Y9, Y9
  7485	VPADDD   Y9, Y5, Y5
  7486	VPXOR    Y5, Y1, Y1
  7487	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  7488	VPADDD   Y1, Y13, Y13
  7489	VPXOR    Y13, Y9, Y9
  7490	VPSLLD   $0x07, Y9, Y15
  7491	VPSRLD   $0x19, Y9, Y9
  7492	VPXOR    Y15, Y9, Y9
  7493	VPADDD   Y10, Y6, Y6
  7494	VPXOR    Y6, Y2, Y2
  7495	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  7496	VPADDD   Y2, Y8, Y8
  7497	VPXOR    Y8, Y10, Y10
  7498	VPSLLD   $0x0c, Y10, Y15
  7499	VPSRLD   $0x14, Y10, Y10
  7500	VPXOR    Y15, Y10, Y10
  7501	VPADDD   Y10, Y6, Y6
  7502	VPXOR    Y6, Y2, Y2
  7503	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  7504	VPADDD   Y2, Y8, Y8
  7505	VPXOR    Y8, Y10, Y10
  7506	VPSLLD   $0x07, Y10, Y15
  7507	VPSRLD   $0x19, Y10, Y10
  7508	VPXOR    Y15, Y10, Y10
  7509	VMOVDQA  224(BP), Y15
  7510	VMOVDQA  Y13, 224(BP)
  7511	VPADDD   Y11, Y7, Y7
  7512	VPXOR    Y7, Y3, Y3
  7513	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  7514	VPADDD   Y3, Y15, Y15
  7515	VPXOR    Y15, Y11, Y11
  7516	VPSLLD   $0x0c, Y11, Y13
  7517	VPSRLD   $0x14, Y11, Y11
  7518	VPXOR    Y13, Y11, Y11
  7519	VPADDD   Y11, Y7, Y7
  7520	VPXOR    Y7, Y3, Y3
  7521	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  7522	VPADDD   Y3, Y15, Y15
  7523	VPXOR    Y15, Y11, Y11
  7524	VPSLLD   $0x07, Y11, Y13
  7525	VPSRLD   $0x19, Y11, Y11
  7526	VPXOR    Y13, Y11, Y11
  7527	VMOVDQA  224(BP), Y13
      	// Diagonalize (B rows by 4 bytes, C rows by 8, D rows by 12 within each lane).
  7528	VPALIGNR $0x04, Y14, Y14, Y14
  7529	VPALIGNR $0x08, Y12, Y12, Y12
  7530	VPALIGNR $0x0c, Y4, Y4, Y4
  7531	VPALIGNR $0x04, Y9, Y9, Y9
  7532	VPALIGNR $0x08, Y13, Y13, Y13
  7533	VPALIGNR $0x0c, Y1, Y1, Y1
  7534	VPALIGNR $0x04, Y10, Y10, Y10
  7535	VPALIGNR $0x08, Y8, Y8, Y8
  7536	VPALIGNR $0x0c, Y2, Y2, Y2
  7537	VPALIGNR $0x04, Y11, Y11, Y11
  7538	VPALIGNR $0x08, Y15, Y15, Y15
  7539	VPALIGNR $0x0c, Y3, Y3, Y3
  7540	VMOVDQA  Y15, 224(BP)
  7541	VPADDD   Y14, Y0, Y0
  7542	VPXOR    Y0, Y4, Y4
  7543	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  7544	VPADDD   Y4, Y12, Y12
  7545	VPXOR    Y12, Y14, Y14
  7546	VPSLLD   $0x0c, Y14, Y15
  7547	VPSRLD   $0x14, Y14, Y14
  7548	VPXOR    Y15, Y14, Y14
  7549	VPADDD   Y14, Y0, Y0
  7550	VPXOR    Y0, Y4, Y4
  7551	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  7552	VPADDD   Y4, Y12, Y12
  7553	VPXOR    Y12, Y14, Y14
  7554	VPSLLD   $0x07, Y14, Y15
  7555	VPSRLD   $0x19, Y14, Y14
  7556	VPXOR    Y15, Y14, Y14
  7557	VPADDD   Y9, Y5, Y5
  7558	VPXOR    Y5, Y1, Y1
  7559	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  7560	VPADDD   Y1, Y13, Y13
  7561	VPXOR    Y13, Y9, Y9
  7562	VPSLLD   $0x0c, Y9, Y15
  7563	VPSRLD   $0x14, Y9, Y9
  7564	VPXOR    Y15, Y9, Y9
  7565	VPADDD   Y9, Y5, Y5
  7566	VPXOR    Y5, Y1, Y1
  7567	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  7568	VPADDD   Y1, Y13, Y13
  7569	VPXOR    Y13, Y9, Y9
  7570	VPSLLD   $0x07, Y9, Y15
  7571	VPSRLD   $0x19, Y9, Y9
  7572	VPXOR    Y15, Y9, Y9
  7573	VPADDD   Y10, Y6, Y6
  7574	VPXOR    Y6, Y2, Y2
  7575	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  7576	VPADDD   Y2, Y8, Y8
  7577	VPXOR    Y8, Y10, Y10
  7578	VPSLLD   $0x0c, Y10, Y15
  7579	VPSRLD   $0x14, Y10, Y10
  7580	VPXOR    Y15, Y10, Y10
  7581	VPADDD   Y10, Y6, Y6
  7582	VPXOR    Y6, Y2, Y2
  7583	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  7584	VPADDD   Y2, Y8, Y8
  7585	VPXOR    Y8, Y10, Y10
  7586	VPSLLD   $0x07, Y10, Y15
  7587	VPSRLD   $0x19, Y10, Y10
  7588	VPXOR    Y15, Y10, Y10
  7589	VMOVDQA  224(BP), Y15
  7590	VMOVDQA  Y13, 224(BP)
  7591	VPADDD   Y11, Y7, Y7
  7592	VPXOR    Y7, Y3, Y3
  7593	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  7594	VPADDD   Y3, Y15, Y15
  7595	VPXOR    Y15, Y11, Y11
  7596	VPSLLD   $0x0c, Y11, Y13
  7597	VPSRLD   $0x14, Y11, Y11
  7598	VPXOR    Y13, Y11, Y11
  7599	VPADDD   Y11, Y7, Y7
  7600	VPXOR    Y7, Y3, Y3
  7601	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  7602	VPADDD   Y3, Y15, Y15
  7603	VPXOR    Y15, Y11, Y11
  7604	VPSLLD   $0x07, Y11, Y13
  7605	VPSRLD   $0x19, Y11, Y11
  7606	VPXOR    Y13, Y11, Y11
  7607	VMOVDQA  224(BP), Y13
      	// Undo the diagonalization (B rows by 12 bytes, C rows by 8, D rows by 4).
  7608	VPALIGNR $0x0c, Y14, Y14, Y14
  7609	VPALIGNR $0x08, Y12, Y12, Y12
  7610	VPALIGNR $0x04, Y4, Y4, Y4
  7611	VPALIGNR $0x0c, Y9, Y9, Y9
  7612	VPALIGNR $0x08, Y13, Y13, Y13
  7613	VPALIGNR $0x04, Y1, Y1, Y1
  7614	VPALIGNR $0x0c, Y10, Y10, Y10
  7615	VPALIGNR $0x08, Y8, Y8, Y8
  7616	VPALIGNR $0x04, Y2, Y2, Y2
  7617	VPALIGNR $0x0c, Y11, Y11, Y11
  7618	VPALIGNR $0x08, Y15, Y15, Y15
  7619	VPALIGNR $0x04, Y3, Y3, Y3
      	// First half of the next quarter-round for all four sets at once
      	// (a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12). This is the same
      	// vector work sealAVX2InternalLoop does before falling into
      	// sealAVX2InternalLoopStart, but without that block's hashing.
  7620	VPADDD   Y14, Y0, Y0
  7621	VPADDD   Y9, Y5, Y5
  7622	VPADDD   Y10, Y6, Y6
  7623	VPADDD   Y11, Y7, Y7
  7624	VPXOR    Y0, Y4, Y4
  7625	VPXOR    Y5, Y1, Y1
  7626	VPXOR    Y6, Y2, Y2
  7627	VPXOR    Y7, Y3, Y3
  7628	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  7629	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  7630	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  7631	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  7632	VPADDD   Y4, Y12, Y12
  7633	VPADDD   Y1, Y13, Y13
  7634	VPADDD   Y2, Y8, Y8
  7635	VPADDD   Y3, Y15, Y15
  7636	VPXOR    Y12, Y14, Y14
  7637	VPXOR    Y13, Y9, Y9
  7638	VPXOR    Y8, Y10, Y10
  7639	VPXOR    Y15, Y11, Y11
  7640	VMOVDQA  Y15, 224(BP)
  7641	VPSLLD   $0x0c, Y14, Y15
  7642	VPSRLD   $0x14, Y14, Y14
  7643	VPXOR    Y15, Y14, Y14
  7644	VPSLLD   $0x0c, Y9, Y15
  7645	VPSRLD   $0x14, Y9, Y9
  7646	VPXOR    Y15, Y9, Y9
  7647	VPSLLD   $0x0c, Y10, Y15
  7648	VPSRLD   $0x14, Y10, Y10
  7649	VPXOR    Y15, Y10, Y10
  7650	VPSLLD   $0x0c, Y11, Y15
  7651	VPSRLD   $0x14, Y11, Y11
  7652	VPXOR    Y15, Y11, Y11
  7653	VMOVDQA  224(BP), Y15
      	// Enter the main loop part-way through an iteration. That entry point reads
      	// its hash input at 16(DI) and 32(DI) and then advances DI by 48, so DI is
      	// backed up by 16 here to make the first hashed block the one at the
      	// current DI. CX = 9 is the loop counter for this entry.
  7654	SUBQ     $0x10, DI
  7655	MOVQ     $0x00000009, CX
  7656	JMP      sealAVX2InternalLoopStart
  7657
  7658sealAVX2MainLoop:
      	// Start a new batch: rebuild the four register sets from the constants and
      	// the B/C rows saved at 32(BP)/64(BP). The D rows continue from the last
      	// counter row saved at 192(BP), each avx2IncMask beyond the previous one,
      	// and are written back to 96/128/160/192(BP) for the final add-back.
  7659	VMOVDQU ·chacha20Constants<>+0(SB), Y0
  7660	VMOVDQA Y0, Y5
  7661	VMOVDQA Y0, Y6
  7662	VMOVDQA Y0, Y7
  7663	VMOVDQA 32(BP), Y14
  7664	VMOVDQA Y14, Y9
  7665	VMOVDQA Y14, Y10
  7666	VMOVDQA Y14, Y11
  7667	VMOVDQA 64(BP), Y12
  7668	VMOVDQA Y12, Y13
  7669	VMOVDQA Y12, Y8
  7670	VMOVDQA Y12, Y15
  7671	VMOVDQA 192(BP), Y4
  7672	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
  7673	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  7674	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  7675	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
  7676	VMOVDQA Y4, 96(BP)
  7677	VMOVDQA Y1, 128(BP)
  7678	VMOVDQA Y2, 160(BP)
  7679	VMOVDQA Y3, 192(BP)
      	// CX is the iteration counter for sealAVX2InternalLoop.
  7680	MOVQ    $0x0000000a, CX
  7681
  7682sealAVX2InternalLoop:
      	// Interleaves two independent streams of work so they can overlap:
      	//   * scalar: absorb the 16 bytes at (DI) into the Poly1305 accumulator
      	//     R10:R11:R12 using MULX (BMI2) multiplies;
      	//   * vector: the first half of a ChaCha20 quarter-round on all four
      	//     register sets (a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12).
      	// MULX and the vector instructions do not modify the flags, which is why
      	// they can sit between an ADDQ and the ADCQ that consumes its carry.
      	// Falls through into sealAVX2InternalLoopStart.

      	// acc += block + 2^128.
  7683	ADDQ    (DI), R10
  7684	ADCQ    8(DI), R11
  7685	ADCQ    $0x01, R12
  7686	VPADDD  Y14, Y0, Y0
  7687	VPADDD  Y9, Y5, Y5
  7688	VPADDD  Y10, Y6, Y6
  7689	VPADDD  Y11, Y7, Y7
      	// Products with the low key limb at (BP): R13 = low word, R14/R15 accumulate
      	// the next two words (R15 also takes the key limb times the top limb R12).
  7690	MOVQ    (BP), DX
  7691	MOVQ    DX, R15
  7692	MULXQ   R10, R13, R14
  7693	IMULQ   R12, R15
  7694	MULXQ   R11, AX, DX
  7695	ADDQ    AX, R14
  7696	ADCQ    DX, R15
  7697	VPXOR   Y0, Y4, Y4
  7698	VPXOR   Y5, Y1, Y1
  7699	VPXOR   Y6, Y2, Y2
  7700	VPXOR   Y7, Y3, Y3
  7701	VPSHUFB ·rol16<>+0(SB), Y4, Y4
  7702	VPSHUFB ·rol16<>+0(SB), Y1, Y1
  7703	VPSHUFB ·rol16<>+0(SB), Y2, Y2
  7704	VPSHUFB ·rol16<>+0(SB), Y3, Y3
      	// Products with the high key limb at 8(BP), added one word higher; R8
      	// becomes the fourth word of the product.
  7705	MOVQ    8(BP), DX
  7706	MULXQ   R10, R10, AX
  7707	ADDQ    R10, R14
  7708	MULXQ   R11, R11, R8
  7709	ADCQ    R11, R15
  7710	ADCQ    $0x00, R8
  7711	VPADDD  Y4, Y12, Y12
  7712	VPADDD  Y1, Y13, Y13
  7713	VPADDD  Y2, Y8, Y8
  7714	VPADDD  Y3, Y15, Y15
  7715	VPXOR   Y12, Y14, Y14
  7716	VPXOR   Y13, Y9, Y9
  7717	VPXOR   Y8, Y10, Y10
  7718	VPXOR   Y15, Y11, Y11
      	// DX still holds 8(BP); multiply it by the top accumulator limb.
  7719	IMULQ   R12, DX
  7720	ADDQ    AX, R15
  7721	ADCQ    DX, R8
      	// Y15 is spilled to 224(BP) so it can serve as scratch for the rotate-by-12.
  7722	VMOVDQA Y15, 224(BP)
  7723	VPSLLD  $0x0c, Y14, Y15
  7724	VPSRLD  $0x14, Y14, Y14
  7725	VPXOR   Y15, Y14, Y14
  7726	VPSLLD  $0x0c, Y9, Y15
  7727	VPSRLD  $0x14, Y9, Y9
  7728	VPXOR   Y15, Y9, Y9
  7729	VPSLLD  $0x0c, Y10, Y15
  7730	VPSRLD  $0x14, Y10, Y10
  7731	VPXOR   Y15, Y10, Y10
  7732	VPSLLD  $0x0c, Y11, Y15
  7733	VPSRLD  $0x14, Y11, Y11
  7734	VPXOR   Y15, Y11, Y11
  7735	VMOVDQA 224(BP), Y15
      	// Partial reduction modulo 2^130 - 5: keep the low 130 bits of the product
      	// R13:R14:R15:R8 and add 5 times the bits above them (as 4*c plus c).
  7736	MOVQ    R13, R10
  7737	MOVQ    R14, R11
  7738	MOVQ    R15, R12
  7739	ANDQ    $0x03, R12
  7740	MOVQ    R15, R13
  7741	ANDQ    $-4, R13
  7742	MOVQ    R8, R14
  7743	SHRQ    $0x02, R8, R15
  7744	SHRQ    $0x02, R8
  7745	ADDQ    R13, R10
  7746	ADCQ    R14, R11
  7747	ADCQ    $0x00, R12
  7748	ADDQ    R15, R10
  7749	ADCQ    R8, R11
  7750	ADCQ    $0x00, R12
  7751
  7752sealAVX2InternalLoopStart:
  7753	VPADDD   Y14, Y0, Y0
  7754	VPADDD   Y9, Y5, Y5
  7755	VPADDD   Y10, Y6, Y6
  7756	VPADDD   Y11, Y7, Y7
  7757	VPXOR    Y0, Y4, Y4
  7758	VPXOR    Y5, Y1, Y1
  7759	VPXOR    Y6, Y2, Y2
  7760	VPXOR    Y7, Y3, Y3
  7761	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  7762	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  7763	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  7764	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  7765	ADDQ     16(DI), R10
  7766	ADCQ     24(DI), R11
  7767	ADCQ     $0x01, R12
  7768	VPADDD   Y4, Y12, Y12
  7769	VPADDD   Y1, Y13, Y13
  7770	VPADDD   Y2, Y8, Y8
  7771	VPADDD   Y3, Y15, Y15
  7772	MOVQ     (BP), DX
  7773	MOVQ     DX, R15
  7774	MULXQ    R10, R13, R14
  7775	IMULQ    R12, R15
  7776	MULXQ    R11, AX, DX
  7777	ADDQ     AX, R14
  7778	ADCQ     DX, R15
  7779	VPXOR    Y12, Y14, Y14
  7780	VPXOR    Y13, Y9, Y9
  7781	VPXOR    Y8, Y10, Y10
  7782	VPXOR    Y15, Y11, Y11
  7783	VMOVDQA  Y15, 224(BP)
  7784	VPSLLD   $0x07, Y14, Y15
  7785	VPSRLD   $0x19, Y14, Y14
  7786	VPXOR    Y15, Y14, Y14
  7787	VPSLLD   $0x07, Y9, Y15
  7788	VPSRLD   $0x19, Y9, Y9
  7789	VPXOR    Y15, Y9, Y9
  7790	VPSLLD   $0x07, Y10, Y15
  7791	VPSRLD   $0x19, Y10, Y10
  7792	VPXOR    Y15, Y10, Y10
  7793	VPSLLD   $0x07, Y11, Y15
  7794	VPSRLD   $0x19, Y11, Y11
  7795	VPXOR    Y15, Y11, Y11
  7796	VMOVDQA  224(BP), Y15
  7797	MOVQ     8(BP), DX
  7798	MULXQ    R10, R10, AX
  7799	ADDQ     R10, R14
  7800	MULXQ    R11, R11, R8
  7801	ADCQ     R11, R15
  7802	ADCQ     $0x00, R8
  7803	VPALIGNR $0x04, Y14, Y14, Y14
  7804	VPALIGNR $0x04, Y9, Y9, Y9
  7805	VPALIGNR $0x04, Y10, Y10, Y10
  7806	VPALIGNR $0x04, Y11, Y11, Y11
  7807	VPALIGNR $0x08, Y12, Y12, Y12
  7808	VPALIGNR $0x08, Y13, Y13, Y13
  7809	VPALIGNR $0x08, Y8, Y8, Y8
  7810	VPALIGNR $0x08, Y15, Y15, Y15
  7811	VPALIGNR $0x0c, Y4, Y4, Y4
  7812	VPALIGNR $0x0c, Y1, Y1, Y1
  7813	VPALIGNR $0x0c, Y2, Y2, Y2
  7814	VPALIGNR $0x0c, Y3, Y3, Y3
  7815	VPADDD   Y14, Y0, Y0
  7816	VPADDD   Y9, Y5, Y5
  7817	VPADDD   Y10, Y6, Y6
  7818	VPADDD   Y11, Y7, Y7
  7819	IMULQ    R12, DX
  7820	ADDQ     AX, R15
  7821	ADCQ     DX, R8
  7822	VPXOR    Y0, Y4, Y4
  7823	VPXOR    Y5, Y1, Y1
  7824	VPXOR    Y6, Y2, Y2
  7825	VPXOR    Y7, Y3, Y3
  7826	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  7827	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  7828	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  7829	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
  7830	MOVQ     R13, R10
  7831	MOVQ     R14, R11
  7832	MOVQ     R15, R12
  7833	ANDQ     $0x03, R12
  7834	MOVQ     R15, R13
  7835	ANDQ     $-4, R13
  7836	MOVQ     R8, R14
  7837	SHRQ     $0x02, R8, R15
  7838	SHRQ     $0x02, R8
  7839	ADDQ     R13, R10
  7840	ADCQ     R14, R11
  7841	ADCQ     $0x00, R12
  7842	ADDQ     R15, R10
  7843	ADCQ     R8, R11
  7844	ADCQ     $0x00, R12
  7845	VPADDD   Y4, Y12, Y12
  7846	VPADDD   Y1, Y13, Y13
  7847	VPADDD   Y2, Y8, Y8
  7848	VPADDD   Y3, Y15, Y15
  7849	VPXOR    Y12, Y14, Y14
  7850	VPXOR    Y13, Y9, Y9
  7851	VPXOR    Y8, Y10, Y10
  7852	VPXOR    Y15, Y11, Y11
  7853	ADDQ     32(DI), R10
  7854	ADCQ     40(DI), R11
  7855	ADCQ     $0x01, R12
  7856	LEAQ     48(DI), DI
  7857	VMOVDQA  Y15, 224(BP)
  7858	VPSLLD   $0x0c, Y14, Y15
  7859	VPSRLD   $0x14, Y14, Y14
  7860	VPXOR    Y15, Y14, Y14
  7861	VPSLLD   $0x0c, Y9, Y15
  7862	VPSRLD   $0x14, Y9, Y9
  7863	VPXOR    Y15, Y9, Y9
  7864	VPSLLD   $0x0c, Y10, Y15
  7865	VPSRLD   $0x14, Y10, Y10
  7866	VPXOR    Y15, Y10, Y10
  7867	VPSLLD   $0x0c, Y11, Y15
  7868	VPSRLD   $0x14, Y11, Y11
  7869	VPXOR    Y15, Y11, Y11
  7870	VMOVDQA  224(BP), Y15
  7871	MOVQ     (BP), DX
  7872	MOVQ     DX, R15
  7873	MULXQ    R10, R13, R14
  7874	IMULQ    R12, R15
  7875	MULXQ    R11, AX, DX
  7876	ADDQ     AX, R14
  7877	ADCQ     DX, R15
  7878	VPADDD   Y14, Y0, Y0
  7879	VPADDD   Y9, Y5, Y5
  7880	VPADDD   Y10, Y6, Y6
  7881	VPADDD   Y11, Y7, Y7
  7882	VPXOR    Y0, Y4, Y4
  7883	VPXOR    Y5, Y1, Y1
  7884	VPXOR    Y6, Y2, Y2
  7885	VPXOR    Y7, Y3, Y3
  7886	MOVQ     8(BP), DX
  7887	MULXQ    R10, R10, AX
  7888	ADDQ     R10, R14
  7889	MULXQ    R11, R11, R8
  7890	ADCQ     R11, R15
  7891	ADCQ     $0x00, R8
  7892	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  7893	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  7894	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  7895	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
  7896	VPADDD   Y4, Y12, Y12
  7897	VPADDD   Y1, Y13, Y13
  7898	VPADDD   Y2, Y8, Y8
  7899	VPADDD   Y3, Y15, Y15
  7900	IMULQ    R12, DX
  7901	ADDQ     AX, R15
  7902	ADCQ     DX, R8
  7903	VPXOR    Y12, Y14, Y14
  7904	VPXOR    Y13, Y9, Y9
  7905	VPXOR    Y8, Y10, Y10
  7906	VPXOR    Y15, Y11, Y11
  7907	VMOVDQA  Y15, 224(BP)
  7908	VPSLLD   $0x07, Y14, Y15
  7909	VPSRLD   $0x19, Y14, Y14
  7910	VPXOR    Y15, Y14, Y14
  7911	VPSLLD   $0x07, Y9, Y15
  7912	VPSRLD   $0x19, Y9, Y9
  7913	VPXOR    Y15, Y9, Y9
  7914	VPSLLD   $0x07, Y10, Y15
  7915	VPSRLD   $0x19, Y10, Y10
  7916	VPXOR    Y15, Y10, Y10
  7917	VPSLLD   $0x07, Y11, Y15
  7918	VPSRLD   $0x19, Y11, Y11
  7919	VPXOR    Y15, Y11, Y11
  7920	VMOVDQA  224(BP), Y15
  7921	MOVQ     R13, R10
  7922	MOVQ     R14, R11
  7923	MOVQ     R15, R12
  7924	ANDQ     $0x03, R12
  7925	MOVQ     R15, R13
  7926	ANDQ     $-4, R13
  7927	MOVQ     R8, R14
  7928	SHRQ     $0x02, R8, R15
  7929	SHRQ     $0x02, R8
  7930	ADDQ     R13, R10
  7931	ADCQ     R14, R11
  7932	ADCQ     $0x00, R12
  7933	ADDQ     R15, R10
  7934	ADCQ     R8, R11
  7935	ADCQ     $0x00, R12
  7936	VPALIGNR $0x0c, Y14, Y14, Y14
  7937	VPALIGNR $0x0c, Y9, Y9, Y9
  7938	VPALIGNR $0x0c, Y10, Y10, Y10
  7939	VPALIGNR $0x0c, Y11, Y11, Y11
  7940	VPALIGNR $0x08, Y12, Y12, Y12
  7941	VPALIGNR $0x08, Y13, Y13, Y13
  7942	VPALIGNR $0x08, Y8, Y8, Y8
  7943	VPALIGNR $0x08, Y15, Y15, Y15
  7944	VPALIGNR $0x04, Y4, Y4, Y4
  7945	VPALIGNR $0x04, Y1, Y1, Y1
  7946	VPALIGNR $0x04, Y2, Y2, Y2
  7947	VPALIGNR $0x04, Y3, Y3, Y3
  7948	DECQ     CX
  7949	JNE      sealAVX2InternalLoop
  7950	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
  7951	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
  7952	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
  7953	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
  7954	VPADDD   32(BP), Y14, Y14
  7955	VPADDD   32(BP), Y9, Y9
  7956	VPADDD   32(BP), Y10, Y10
  7957	VPADDD   32(BP), Y11, Y11
  7958	VPADDD   64(BP), Y12, Y12
  7959	VPADDD   64(BP), Y13, Y13
  7960	VPADDD   64(BP), Y8, Y8
  7961	VPADDD   64(BP), Y15, Y15
  7962	VPADDD   96(BP), Y4, Y4
  7963	VPADDD   128(BP), Y1, Y1
  7964	VPADDD   160(BP), Y2, Y2
  7965	VPADDD   192(BP), Y3, Y3
  7966	VMOVDQA  Y15, 224(BP)
  7967
  7968	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  7969	ADDQ       (DI), R10
  7970	ADCQ       8(DI), R11
  7971	ADCQ       $0x01, R12
  7972	MOVQ       (BP), DX
  7973	MOVQ       DX, R15
  7974	MULXQ      R10, R13, R14
  7975	IMULQ      R12, R15
  7976	MULXQ      R11, AX, DX
  7977	ADDQ       AX, R14
  7978	ADCQ       DX, R15
  7979	MOVQ       8(BP), DX
  7980	MULXQ      R10, R10, AX
  7981	ADDQ       R10, R14
  7982	MULXQ      R11, R11, R8
  7983	ADCQ       R11, R15
  7984	ADCQ       $0x00, R8
  7985	IMULQ      R12, DX
  7986	ADDQ       AX, R15
  7987	ADCQ       DX, R8
  7988	MOVQ       R13, R10
  7989	MOVQ       R14, R11
  7990	MOVQ       R15, R12
  7991	ANDQ       $0x03, R12
  7992	MOVQ       R15, R13
  7993	ANDQ       $-4, R13
  7994	MOVQ       R8, R14
  7995	SHRQ       $0x02, R8, R15
  7996	SHRQ       $0x02, R8
  7997	ADDQ       R13, R10
  7998	ADCQ       R14, R11
  7999	ADCQ       $0x00, R12
  8000	ADDQ       R15, R10
  8001	ADCQ       R8, R11
  8002	ADCQ       $0x00, R12
  8003	LEAQ       32(DI), DI
  8004	VPERM2I128 $0x02, Y0, Y14, Y15
  8005	VPERM2I128 $0x13, Y0, Y14, Y14
  8006	VPERM2I128 $0x02, Y12, Y4, Y0
  8007	VPERM2I128 $0x13, Y12, Y4, Y12
  8008	VPXOR      (SI), Y15, Y15
  8009	VPXOR      32(SI), Y0, Y0
  8010	VPXOR      64(SI), Y14, Y14
  8011	VPXOR      96(SI), Y12, Y12
  8012	VMOVDQU    Y15, (DI)
  8013	VMOVDQU    Y0, 32(DI)
  8014	VMOVDQU    Y14, 64(DI)
  8015	VMOVDQU    Y12, 96(DI)
  8016	VPERM2I128 $0x02, Y5, Y9, Y0
  8017	VPERM2I128 $0x02, Y13, Y1, Y14
  8018	VPERM2I128 $0x13, Y5, Y9, Y12
  8019	VPERM2I128 $0x13, Y13, Y1, Y4
  8020	VPXOR      128(SI), Y0, Y0
  8021	VPXOR      160(SI), Y14, Y14
  8022	VPXOR      192(SI), Y12, Y12
  8023	VPXOR      224(SI), Y4, Y4
  8024	VMOVDQU    Y0, 128(DI)
  8025	VMOVDQU    Y14, 160(DI)
  8026	VMOVDQU    Y12, 192(DI)
  8027	VMOVDQU    Y4, 224(DI)
  8028
  8029	// and here
  8030	ADDQ       -16(DI), R10
  8031	ADCQ       -8(DI), R11
  8032	ADCQ       $0x01, R12
  8033	MOVQ       (BP), DX
  8034	MOVQ       DX, R15
  8035	MULXQ      R10, R13, R14
  8036	IMULQ      R12, R15
  8037	MULXQ      R11, AX, DX
  8038	ADDQ       AX, R14
  8039	ADCQ       DX, R15
  8040	MOVQ       8(BP), DX
  8041	MULXQ      R10, R10, AX
  8042	ADDQ       R10, R14
  8043	MULXQ      R11, R11, R8
  8044	ADCQ       R11, R15
  8045	ADCQ       $0x00, R8
  8046	IMULQ      R12, DX
  8047	ADDQ       AX, R15
  8048	ADCQ       DX, R8
  8049	MOVQ       R13, R10
  8050	MOVQ       R14, R11
  8051	MOVQ       R15, R12
  8052	ANDQ       $0x03, R12
  8053	MOVQ       R15, R13
  8054	ANDQ       $-4, R13
  8055	MOVQ       R8, R14
  8056	SHRQ       $0x02, R8, R15
  8057	SHRQ       $0x02, R8
  8058	ADDQ       R13, R10
  8059	ADCQ       R14, R11
  8060	ADCQ       $0x00, R12
  8061	ADDQ       R15, R10
  8062	ADCQ       R8, R11
  8063	ADCQ       $0x00, R12
  8064	VPERM2I128 $0x02, Y6, Y10, Y0
  8065	VPERM2I128 $0x02, Y8, Y2, Y14
  8066	VPERM2I128 $0x13, Y6, Y10, Y12
  8067	VPERM2I128 $0x13, Y8, Y2, Y4
  8068	VPXOR      256(SI), Y0, Y0
  8069	VPXOR      288(SI), Y14, Y14
  8070	VPXOR      320(SI), Y12, Y12
  8071	VPXOR      352(SI), Y4, Y4
  8072	VMOVDQU    Y0, 256(DI)
  8073	VMOVDQU    Y14, 288(DI)
  8074	VMOVDQU    Y12, 320(DI)
  8075	VMOVDQU    Y4, 352(DI)
  8076	VPERM2I128 $0x02, Y7, Y11, Y0
  8077	VPERM2I128 $0x02, 224(BP), Y3, Y14
  8078	VPERM2I128 $0x13, Y7, Y11, Y12
  8079	VPERM2I128 $0x13, 224(BP), Y3, Y4
  8080	VPXOR      384(SI), Y0, Y0
  8081	VPXOR      416(SI), Y14, Y14
  8082	VPXOR      448(SI), Y12, Y12
  8083	VPXOR      480(SI), Y4, Y4
  8084	VMOVDQU    Y0, 384(DI)
  8085	VMOVDQU    Y14, 416(DI)
  8086	VMOVDQU    Y12, 448(DI)
  8087	VMOVDQU    Y4, 480(DI)
  8088	LEAQ       512(SI), SI
  8089	SUBQ       $0x00000200, BX
  8090	CMPQ       BX, $0x00000200
  8091	JG         sealAVX2MainLoop
  8092
  8093	// The tail paths can only hash 480 bytes, so hash the 32 bytes at DI (two 16-byte blocks) here, then dispatch on BX.
  8094	ADDQ  (DI), R10 // accumulator R12:R11:R10 += the 16 bytes at DI ...
  8095	ADCQ  8(DI), R11
  8096	ADCQ  $0x01, R12 // ... and the third limb also gets +1 (plus carry)
  8097	MOVQ  (BP), DX // DX = quadword at (BP); the clamped poly key is stored at (BP)
  8098	MOVQ  DX, R15
  8099	MULXQ R10, R13, R14 // R14:R13 = DX * R10
  8100	IMULQ R12, R15 // R15 = (BP) quadword * R12
  8101	MULXQ R11, AX, DX // DX:AX = (BP) quadword * R11
  8102	ADDQ  AX, R14
  8103	ADCQ  DX, R15
  8104	MOVQ  8(BP), DX // DX = quadword at 8(BP)
  8105	MULXQ R10, R10, AX // AX:R10 = DX * R10
  8106	ADDQ  R10, R14
  8107	MULXQ R11, R11, R8 // R8:R11 = DX * R11
  8108	ADCQ  R11, R15
  8109	ADCQ  $0x00, R8
  8110	IMULQ R12, DX // DX = 8(BP) quadword * R12
  8111	ADDQ  AX, R15
  8112	ADCQ  DX, R8 // full product is now in R8:R15:R14:R13
  8113	MOVQ  R13, R10 // keep the low 130 bits of the product in R12:R11:R10
  8114	MOVQ  R14, R11
  8115	MOVQ  R15, R12
  8116	ANDQ  $0x03, R12
  8117	MOVQ  R15, R13 // R14:R13 = product bits from 130 upward, times 4 (low two bits cleared)
  8118	ANDQ  $-4, R13
  8119	MOVQ  R8, R14
  8120	SHRQ  $0x02, R8, R15 // R8:R15 = the same upper bits shifted right by 2 (times 1)
  8121	SHRQ  $0x02, R8
  8122	ADDQ  R13, R10 // fold the upper part back in: 4x ...
  8123	ADCQ  R14, R11
  8124	ADCQ  $0x00, R12
  8125	ADDQ  R15, R10 // ... plus 1x, i.e. 5x in total
  8126	ADCQ  R8, R11
  8127	ADCQ  $0x00, R12
  8128	ADDQ  16(DI), R10 // second block: same add / multiply / fold sequence on the 16 bytes at 16(DI)
  8129	ADCQ  24(DI), R11
  8130	ADCQ  $0x01, R12
  8131	MOVQ  (BP), DX
  8132	MOVQ  DX, R15
  8133	MULXQ R10, R13, R14
  8134	IMULQ R12, R15
  8135	MULXQ R11, AX, DX
  8136	ADDQ  AX, R14
  8137	ADCQ  DX, R15
  8138	MOVQ  8(BP), DX
  8139	MULXQ R10, R10, AX
  8140	ADDQ  R10, R14
  8141	MULXQ R11, R11, R8
  8142	ADCQ  R11, R15
  8143	ADCQ  $0x00, R8
  8144	IMULQ R12, DX
  8145	ADDQ  AX, R15
  8146	ADCQ  DX, R8
  8147	MOVQ  R13, R10
  8148	MOVQ  R14, R11
  8149	MOVQ  R15, R12
  8150	ANDQ  $0x03, R12
  8151	MOVQ  R15, R13
  8152	ANDQ  $-4, R13
  8153	MOVQ  R8, R14
  8154	SHRQ  $0x02, R8, R15
  8155	SHRQ  $0x02, R8
  8156	ADDQ  R13, R10
  8157	ADCQ  R14, R11
  8158	ADCQ  $0x00, R12
  8159	ADDQ  R15, R10
  8160	ADCQ  R8, R11
  8161	ADCQ  $0x00, R12
  8162	LEAQ  32(DI), DI // step DI past the two blocks just hashed
  8163	MOVQ  $0x0000000a, CX // CX = 10 and R9 = 0: counters decremented by the tail loops (DECQ CX / DECQ R9)
  8164	MOVQ  $0x00000000, R9
  8165	CMPQ  BX, $0x80 // unsigned dispatch on BX: <= 128 bytes
  8166	JBE   sealAVX2Tail128
  8167	CMPQ  BX, $0x00000100 // <= 256 bytes
  8168	JBE   sealAVX2Tail256
  8169	CMPQ  BX, $0x00000180 // <= 384 bytes
  8170	JBE   sealAVX2Tail384
  8171	JMP   sealAVX2Tail512 // anything larger
  8172
  8173seal192AVX2: // set up two register sets (Y0,Y14,Y12,Y4) and (Y5,Y9,Y13,Y1) plus saved copies for the add-back after the rounds
  8174	VMOVDQA Y0, Y5 // second set starts as a copy of the first ...
  8175	VMOVDQA Y14, Y9
  8176	VMOVDQA Y12, Y13
  8177	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... except its fourth row is Y4 + avx2IncMask
  8178	VMOVDQA Y0, Y6 // Y6/Y10/Y8/Y2/Y15 keep the starting rows; they are added back after the loop below
  8179	VMOVDQA Y14, Y10
  8180	VMOVDQA Y12, Y8
  8181	VMOVDQA Y4, Y2
  8182	VMOVDQA Y1, Y15
  8183	MOVQ    $0x0000000a, R9 // R9 = 10: iteration count for sealAVX2192InnerCipherLoop (DECQ R9 / JNE)
  8184
  8185sealAVX2192InnerCipherLoop: // one pass = column rounds + diagonal rounds on both register sets; Y3 is scratch for the shift-based rotates
  8186	VPADDD     Y14, Y0, Y0 // set 1 (Y0,Y14,Y12,Y4): add, xor, rotate by 16 (byte shuffle)
  8187	VPXOR      Y0, Y4, Y4
  8188	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  8189	VPADDD     Y4, Y12, Y12
  8190	VPXOR      Y12, Y14, Y14
  8191	VPSLLD     $0x0c, Y14, Y3 // rotate left by 12: (x << 12) ^ (x >> 20)
  8192	VPSRLD     $0x14, Y14, Y14
  8193	VPXOR      Y3, Y14, Y14
  8194	VPADDD     Y14, Y0, Y0
  8195	VPXOR      Y0, Y4, Y4
  8196	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // rotate by 8 (byte shuffle)
  8197	VPADDD     Y4, Y12, Y12
  8198	VPXOR      Y12, Y14, Y14
  8199	VPSLLD     $0x07, Y14, Y3 // rotate left by 7: (x << 7) ^ (x >> 25)
  8200	VPSRLD     $0x19, Y14, Y14
  8201	VPXOR      Y3, Y14, Y14
  8202	VPADDD     Y9, Y5, Y5 // set 2 (Y5,Y9,Y13,Y1): same sequence
  8203	VPXOR      Y5, Y1, Y1
  8204	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  8205	VPADDD     Y1, Y13, Y13
  8206	VPXOR      Y13, Y9, Y9
  8207	VPSLLD     $0x0c, Y9, Y3
  8208	VPSRLD     $0x14, Y9, Y9
  8209	VPXOR      Y3, Y9, Y9
  8210	VPADDD     Y9, Y5, Y5
  8211	VPXOR      Y5, Y1, Y1
  8212	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  8213	VPADDD     Y1, Y13, Y13
  8214	VPXOR      Y13, Y9, Y9
  8215	VPSLLD     $0x07, Y9, Y3
  8216	VPSRLD     $0x19, Y9, Y9
  8217	VPXOR      Y3, Y9, Y9
  8218	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows 2/3/4 by 4/8/12 bytes within each 128-bit lane
  8219	VPALIGNR   $0x04, Y9, Y9, Y9
  8220	VPALIGNR   $0x08, Y12, Y12, Y12
  8221	VPALIGNR   $0x08, Y13, Y13, Y13
  8222	VPALIGNR   $0x0c, Y4, Y4, Y4
  8223	VPALIGNR   $0x0c, Y1, Y1, Y1
  8224	VPADDD     Y14, Y0, Y0 // second half: same add/xor/rotate sequence on the rotated rows, set 1
  8225	VPXOR      Y0, Y4, Y4
  8226	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  8227	VPADDD     Y4, Y12, Y12
  8228	VPXOR      Y12, Y14, Y14
  8229	VPSLLD     $0x0c, Y14, Y3
  8230	VPSRLD     $0x14, Y14, Y14
  8231	VPXOR      Y3, Y14, Y14
  8232	VPADDD     Y14, Y0, Y0
  8233	VPXOR      Y0, Y4, Y4
  8234	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  8235	VPADDD     Y4, Y12, Y12
  8236	VPXOR      Y12, Y14, Y14
  8237	VPSLLD     $0x07, Y14, Y3
  8238	VPSRLD     $0x19, Y14, Y14
  8239	VPXOR      Y3, Y14, Y14
  8240	VPADDD     Y9, Y5, Y5 // set 2
  8241	VPXOR      Y5, Y1, Y1
  8242	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  8243	VPADDD     Y1, Y13, Y13
  8244	VPXOR      Y13, Y9, Y9
  8245	VPSLLD     $0x0c, Y9, Y3
  8246	VPSRLD     $0x14, Y9, Y9
  8247	VPXOR      Y3, Y9, Y9
  8248	VPADDD     Y9, Y5, Y5
  8249	VPXOR      Y5, Y1, Y1
  8250	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  8251	VPADDD     Y1, Y13, Y13
  8252	VPXOR      Y13, Y9, Y9
  8253	VPSLLD     $0x07, Y9, Y3
  8254	VPSRLD     $0x19, Y9, Y9
  8255	VPXOR      Y3, Y9, Y9
  8256	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotation (12/8/4 bytes)
  8257	VPALIGNR   $0x0c, Y9, Y9, Y9
  8258	VPALIGNR   $0x08, Y12, Y12, Y12
  8259	VPALIGNR   $0x08, Y13, Y13, Y13
  8260	VPALIGNR   $0x04, Y4, Y4, Y4
  8261	VPALIGNR   $0x04, Y1, Y1, Y1
  8262	DECQ       R9 // R9 was set to 10 in seal192AVX2
  8263	JNE        sealAVX2192InnerCipherLoop
  8264	VPADDD     Y6, Y0, Y0 // add the starting rows saved in Y6/Y10/Y8/Y2/Y15 back into both sets
  8265	VPADDD     Y6, Y5, Y5
  8266	VPADDD     Y10, Y14, Y14
  8267	VPADDD     Y10, Y9, Y9
  8268	VPADDD     Y8, Y12, Y12
  8269	VPADDD     Y8, Y13, Y13
  8270	VPADDD     Y2, Y4, Y4
  8271	VPADDD     Y15, Y1, Y1
  8272	VPERM2I128 $0x02, Y0, Y14, Y3 // Y3 = the low 128-bit lanes of Y0 and Y14
  8273
  8274	// Clamp and store poly key
  8275	VPAND   ·polyClampMask<>+0(SB), Y3, Y3 // mask Y3 with polyClampMask
  8276	VMOVDQA Y3, (BP) // aligned 32-byte store; the hash code reads it back as (BP) and 8(BP)
  8277
  8278	// Stream for up to 192 bytes
  8279	VPERM2I128 $0x13, Y0, Y14, Y0 // selector 0x13 gathers high lanes, 0x02 gathers low lanes
  8280	VPERM2I128 $0x13, Y12, Y4, Y14
  8281	VPERM2I128 $0x02, Y5, Y9, Y12
  8282	VPERM2I128 $0x02, Y13, Y1, Y4
  8283	VPERM2I128 $0x13, Y5, Y9, Y5
  8284	VPERM2I128 $0x13, Y13, Y1, Y9 // stream now sits in Y0, Y14, Y12, Y4, Y5, Y9 - the order sealAVX2ShortSealLoop shifts through
  8285
  8286sealAVX2ShortSeal:
  8287	// Hash the additional data (AAD)
  8288	MOVQ ad_len+80(FP), R9 // R9 = ad_len argument; polyHashADInternal compares R9 against 0x0d first
  8289	CALL polyHashADInternal<>(SB) // zeroes R10/R11/R12 on entry and reads its input through CX
  8290	XORQ CX, CX // CX = 0: no bytes are encrypted-but-unhashed when sealAVX2SealHash is entered from here
  8291
  8292sealAVX2SealHash: // loop: while CX >= 16, hash one 16-byte block at DI (MULQ-based multiply) and advance
  8293	// itr1 (CX) holds the number of bytes encrypted but not yet hashed
  8294	CMPQ  CX, $0x10
  8295	JB    sealAVX2ShortSealLoop // fewer than 16 pending bytes: go on to the encrypt-and-hash loop
  8296	ADDQ  (DI), R10 // accumulator R12:R11:R10 += the 16 bytes at DI ...
  8297	ADCQ  8(DI), R11
  8298	ADCQ  $0x01, R12 // ... and the third limb also gets +1 (plus carry)
  8299	MOVQ  (BP), AX // AX = quadword at (BP); the clamped poly key is stored at (BP)
  8300	MOVQ  AX, R15
  8301	MULQ  R10 // DX:AX = (BP) quadword * R10
  8302	MOVQ  AX, R13
  8303	MOVQ  DX, R14
  8304	MOVQ  (BP), AX
  8305	MULQ  R11 // DX:AX = (BP) quadword * R11
  8306	IMULQ R12, R15 // R15 = (BP) quadword * R12
  8307	ADDQ  AX, R14
  8308	ADCQ  DX, R15
  8309	MOVQ  8(BP), AX // AX = quadword at 8(BP)
  8310	MOVQ  AX, R8
  8311	MULQ  R10 // DX:AX = 8(BP) quadword * R10
  8312	ADDQ  AX, R14
  8313	ADCQ  $0x00, DX
  8314	MOVQ  DX, R10 // R10 is free now; park the high half in it
  8315	MOVQ  8(BP), AX
  8316	MULQ  R11 // DX:AX = 8(BP) quadword * R11
  8317	ADDQ  AX, R15
  8318	ADCQ  $0x00, DX
  8319	IMULQ R12, R8 // R8 = 8(BP) quadword * R12
  8320	ADDQ  R10, R15
  8321	ADCQ  DX, R8 // full product is now in R8:R15:R14:R13
  8322	MOVQ  R13, R10 // keep the low 130 bits of the product in R12:R11:R10
  8323	MOVQ  R14, R11
  8324	MOVQ  R15, R12
  8325	ANDQ  $0x03, R12
  8326	MOVQ  R15, R13 // R14:R13 = product bits from 130 upward, times 4 (low two bits cleared)
  8327	ANDQ  $-4, R13
  8328	MOVQ  R8, R14
  8329	SHRQ  $0x02, R8, R15 // R8:R15 = the same upper bits shifted right by 2 (times 1)
  8330	SHRQ  $0x02, R8
  8331	ADDQ  R13, R10 // fold the upper part back in: 4x ...
  8332	ADCQ  R14, R11
  8333	ADCQ  $0x00, R12
  8334	ADDQ  R15, R10 // ... plus 1x, i.e. 5x in total
  8335	ADCQ  R8, R11
  8336	ADCQ  $0x00, R12
  8337	SUBQ  $0x10, CX // 16 fewer bytes pending
  8338	ADDQ  $0x10, DI // step DI past the block just hashed
  8339	JMP   sealAVX2SealHash
  8340
  8341sealAVX2ShortSealLoop: // loop: while BX >= 32, XOR 32 bytes from SI with Y0 into DI, hash them, then shift the stream registers
  8342	CMPQ BX, $0x20
  8343	JB   sealAVX2ShortTail32 // fewer than 32 bytes left
  8344	SUBQ $0x20, BX
  8345
  8346	// Load for encryption
  8347	VPXOR   (SI), Y0, Y0 // Y0 = 32 bytes at SI XOR Y0
  8348	VMOVDQU Y0, (DI) // unaligned store of the result to DI
  8349	LEAQ    32(SI), SI
  8350
  8351	// Now can hash
  8352	ADDQ  (DI), R10 // accumulator R12:R11:R10 += the 16 bytes just written at DI ...
  8353	ADCQ  8(DI), R11
  8354	ADCQ  $0x01, R12 // ... and the third limb also gets +1 (plus carry)
  8355	MOVQ  (BP), DX // DX = quadword at (BP); the clamped poly key is stored at (BP)
  8356	MOVQ  DX, R15
  8357	MULXQ R10, R13, R14 // R14:R13 = DX * R10
  8358	IMULQ R12, R15 // R15 = (BP) quadword * R12
  8359	MULXQ R11, AX, DX // DX:AX = (BP) quadword * R11
  8360	ADDQ  AX, R14
  8361	ADCQ  DX, R15
  8362	MOVQ  8(BP), DX // DX = quadword at 8(BP)
  8363	MULXQ R10, R10, AX // AX:R10 = DX * R10
  8364	ADDQ  R10, R14
  8365	MULXQ R11, R11, R8 // R8:R11 = DX * R11
  8366	ADCQ  R11, R15
  8367	ADCQ  $0x00, R8
  8368	IMULQ R12, DX // DX = 8(BP) quadword * R12
  8369	ADDQ  AX, R15
  8370	ADCQ  DX, R8 // full product is now in R8:R15:R14:R13
  8371	MOVQ  R13, R10 // keep the low 130 bits of the product in R12:R11:R10
  8372	MOVQ  R14, R11
  8373	MOVQ  R15, R12
  8374	ANDQ  $0x03, R12
  8375	MOVQ  R15, R13 // R14:R13 = product bits from 130 upward, times 4 (low two bits cleared)
  8376	ANDQ  $-4, R13
  8377	MOVQ  R8, R14
  8378	SHRQ  $0x02, R8, R15 // R8:R15 = the same upper bits shifted right by 2 (times 1)
  8379	SHRQ  $0x02, R8
  8380	ADDQ  R13, R10 // fold the upper part back in: 4x ...
  8381	ADCQ  R14, R11
  8382	ADCQ  $0x00, R12
  8383	ADDQ  R15, R10 // ... plus 1x, i.e. 5x in total
  8384	ADCQ  R8, R11
  8385	ADCQ  $0x00, R12
  8386	ADDQ  16(DI), R10 // second block: same add / multiply / fold sequence on the 16 bytes at 16(DI)
  8387	ADCQ  24(DI), R11
  8388	ADCQ  $0x01, R12
  8389	MOVQ  (BP), DX
  8390	MOVQ  DX, R15
  8391	MULXQ R10, R13, R14
  8392	IMULQ R12, R15
  8393	MULXQ R11, AX, DX
  8394	ADDQ  AX, R14
  8395	ADCQ  DX, R15
  8396	MOVQ  8(BP), DX
  8397	MULXQ R10, R10, AX
  8398	ADDQ  R10, R14
  8399	MULXQ R11, R11, R8
  8400	ADCQ  R11, R15
  8401	ADCQ  $0x00, R8
  8402	IMULQ R12, DX
  8403	ADDQ  AX, R15
  8404	ADCQ  DX, R8
  8405	MOVQ  R13, R10
  8406	MOVQ  R14, R11
  8407	MOVQ  R15, R12
  8408	ANDQ  $0x03, R12
  8409	MOVQ  R15, R13
  8410	ANDQ  $-4, R13
  8411	MOVQ  R8, R14
  8412	SHRQ  $0x02, R8, R15
  8413	SHRQ  $0x02, R8
  8414	ADDQ  R13, R10
  8415	ADCQ  R14, R11
  8416	ADCQ  $0x00, R12
  8417	ADDQ  R15, R10
  8418	ADCQ  R8, R11
  8419	ADCQ  $0x00, R12
  8420	LEAQ  32(DI), DI // step DI past the 32 bytes just hashed
  8421
  8422	// Shift stream left: Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <- Y13 <- Y1 <- Y6 <- Y10
  8423	VMOVDQA Y14, Y0
  8424	VMOVDQA Y12, Y14
  8425	VMOVDQA Y4, Y12
  8426	VMOVDQA Y5, Y4
  8427	VMOVDQA Y9, Y5
  8428	VMOVDQA Y13, Y9
  8429	VMOVDQA Y1, Y13
  8430	VMOVDQA Y6, Y1
  8431	VMOVDQA Y10, Y6
  8432	JMP     sealAVX2ShortSealLoop
  8433
  8434sealAVX2ShortTail32: // fewer than 32 bytes left: handle one more 16-byte block if BX >= 16, then fall into sealAVX2ShortDone
  8435	CMPQ    BX, $0x10
  8436	VMOVDQA X0, X1 // X1 = low 128 bits of Y0; sits between CMPQ and JB without touching the flags
  8437	JB      sealAVX2ShortDone // fewer than 16 bytes left
  8438	SUBQ    $0x10, BX
  8439
  8440	// Load for encryption
  8441	VPXOR   (SI), X0, X12 // X12 = 16 bytes at SI XOR X0
  8442	VMOVDQU X12, (DI) // unaligned store of the result to DI
  8443	LEAQ    16(SI), SI
  8444
  8445	// Hash
  8446	ADDQ       (DI), R10 // accumulator R12:R11:R10 += the 16 bytes just written at DI ...
  8447	ADCQ       8(DI), R11
  8448	ADCQ       $0x01, R12 // ... and the third limb also gets +1 (plus carry)
  8449	MOVQ       (BP), DX // DX = quadword at (BP); the clamped poly key is stored at (BP)
  8450	MOVQ       DX, R15
  8451	MULXQ      R10, R13, R14 // R14:R13 = DX * R10
  8452	IMULQ      R12, R15 // R15 = (BP) quadword * R12
  8453	MULXQ      R11, AX, DX // DX:AX = (BP) quadword * R11
  8454	ADDQ       AX, R14
  8455	ADCQ       DX, R15
  8456	MOVQ       8(BP), DX // DX = quadword at 8(BP)
  8457	MULXQ      R10, R10, AX // AX:R10 = DX * R10
  8458	ADDQ       R10, R14
  8459	MULXQ      R11, R11, R8 // R8:R11 = DX * R11
  8460	ADCQ       R11, R15
  8461	ADCQ       $0x00, R8
  8462	IMULQ      R12, DX // DX = 8(BP) quadword * R12
  8463	ADDQ       AX, R15
  8464	ADCQ       DX, R8 // full product is now in R8:R15:R14:R13
  8465	MOVQ       R13, R10 // keep the low 130 bits of the product in R12:R11:R10
  8466	MOVQ       R14, R11
  8467	MOVQ       R15, R12
  8468	ANDQ       $0x03, R12
  8469	MOVQ       R15, R13 // R14:R13 = product bits from 130 upward, times 4 (low two bits cleared)
  8470	ANDQ       $-4, R13
  8471	MOVQ       R8, R14
  8472	SHRQ       $0x02, R8, R15 // R8:R15 = the same upper bits shifted right by 2 (times 1)
  8473	SHRQ       $0x02, R8
  8474	ADDQ       R13, R10 // fold the upper part back in: 4x ...
  8475	ADCQ       R14, R11
  8476	ADCQ       $0x00, R12
  8477	ADDQ       R15, R10 // ... plus 1x, i.e. 5x in total
  8478	ADCQ       R8, R11
  8479	ADCQ       $0x00, R12
  8480	LEAQ       16(DI), DI // step DI past the block just hashed
  8481	VPERM2I128 $0x11, Y0, Y0, Y0 // copy the high 128-bit lane of Y0 into both lanes
  8482	VMOVDQA    X0, X1 // X1 = what was the high lane of Y0 (presumably the next stream block for sealSSETail - defined outside this view)
  8483
  8484sealAVX2ShortDone: // leave the AVX2 path
  8485	VZEROUPPER // zero the upper halves of all YMM registers before leaving the AVX2 code
  8486	JMP sealSSETail // continue at sealSSETail (defined outside this view)
  8487
  8488seal320AVX2: // set up three register sets (Y0,Y14,Y12,Y4), (Y5,Y9,Y13,Y1), (Y6,Y10,Y8,Y2) plus saved rows for the add-back
  8489	VMOVDQA Y0, Y5 // second set starts as a copy of the first ...
  8490	VMOVDQA Y14, Y9
  8491	VMOVDQA Y12, Y13
  8492	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... except its fourth row is Y4 + avx2IncMask
  8493	VMOVDQA Y0, Y6 // third set: another copy ...
  8494	VMOVDQA Y14, Y10
  8495	VMOVDQA Y12, Y8
  8496	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2 // ... with fourth row Y1 + avx2IncMask
  8497	VMOVDQA Y14, Y7 // Y7/Y11/Y15 keep the starting Y14/Y12/Y4 rows for the add-back after the loop
  8498	VMOVDQA Y12, Y11 // (no copy of Y0 is kept: the post-loop code adds ·chacha20Constants to Y0/Y5/Y6 instead)
  8499	VMOVDQA Y4, Y15
  8500	MOVQ    $0x0000000a, R9 // R9 = 10: iteration count for sealAVX2320InnerCipherLoop (DECQ R9 / JNE)
  8501
  8502sealAVX2320InnerCipherLoop: // one pass = column rounds + diagonal rounds on all three register sets; Y3 is scratch for the shift-based rotates
  8503	VPADDD   Y14, Y0, Y0 // set 1 (Y0,Y14,Y12,Y4): add, xor, rotate by 16 (byte shuffle)
  8504	VPXOR    Y0, Y4, Y4
  8505	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  8506	VPADDD   Y4, Y12, Y12
  8507	VPXOR    Y12, Y14, Y14
  8508	VPSLLD   $0x0c, Y14, Y3 // rotate left by 12: (x << 12) ^ (x >> 20)
  8509	VPSRLD   $0x14, Y14, Y14
  8510	VPXOR    Y3, Y14, Y14
  8511	VPADDD   Y14, Y0, Y0
  8512	VPXOR    Y0, Y4, Y4
  8513	VPSHUFB  ·rol8<>+0(SB), Y4, Y4 // rotate by 8 (byte shuffle)
  8514	VPADDD   Y4, Y12, Y12
  8515	VPXOR    Y12, Y14, Y14
  8516	VPSLLD   $0x07, Y14, Y3 // rotate left by 7: (x << 7) ^ (x >> 25)
  8517	VPSRLD   $0x19, Y14, Y14
  8518	VPXOR    Y3, Y14, Y14
  8519	VPADDD   Y9, Y5, Y5 // set 2 (Y5,Y9,Y13,Y1): same sequence
  8520	VPXOR    Y5, Y1, Y1
  8521	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  8522	VPADDD   Y1, Y13, Y13
  8523	VPXOR    Y13, Y9, Y9
  8524	VPSLLD   $0x0c, Y9, Y3
  8525	VPSRLD   $0x14, Y9, Y9
  8526	VPXOR    Y3, Y9, Y9
  8527	VPADDD   Y9, Y5, Y5
  8528	VPXOR    Y5, Y1, Y1
  8529	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  8530	VPADDD   Y1, Y13, Y13
  8531	VPXOR    Y13, Y9, Y9
  8532	VPSLLD   $0x07, Y9, Y3
  8533	VPSRLD   $0x19, Y9, Y9
  8534	VPXOR    Y3, Y9, Y9
  8535	VPADDD   Y10, Y6, Y6 // set 3 (Y6,Y10,Y8,Y2): same sequence
  8536	VPXOR    Y6, Y2, Y2
  8537	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  8538	VPADDD   Y2, Y8, Y8
  8539	VPXOR    Y8, Y10, Y10
  8540	VPSLLD   $0x0c, Y10, Y3
  8541	VPSRLD   $0x14, Y10, Y10
  8542	VPXOR    Y3, Y10, Y10
  8543	VPADDD   Y10, Y6, Y6
  8544	VPXOR    Y6, Y2, Y2
  8545	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  8546	VPADDD   Y2, Y8, Y8
  8547	VPXOR    Y8, Y10, Y10
  8548	VPSLLD   $0x07, Y10, Y3
  8549	VPSRLD   $0x19, Y10, Y10
  8550	VPXOR    Y3, Y10, Y10
  8551	VPALIGNR $0x04, Y14, Y14, Y14 // rotate rows 2/3/4 by 4/8/12 bytes within each 128-bit lane
  8552	VPALIGNR $0x04, Y9, Y9, Y9
  8553	VPALIGNR $0x04, Y10, Y10, Y10
  8554	VPALIGNR $0x08, Y12, Y12, Y12
  8555	VPALIGNR $0x08, Y13, Y13, Y13
  8556	VPALIGNR $0x08, Y8, Y8, Y8
  8557	VPALIGNR $0x0c, Y4, Y4, Y4
  8558	VPALIGNR $0x0c, Y1, Y1, Y1
  8559	VPALIGNR $0x0c, Y2, Y2, Y2
  8560	VPADDD   Y14, Y0, Y0 // second half: same add/xor/rotate sequence on the rotated rows, set 1
  8561	VPXOR    Y0, Y4, Y4
  8562	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
  8563	VPADDD   Y4, Y12, Y12
  8564	VPXOR    Y12, Y14, Y14
  8565	VPSLLD   $0x0c, Y14, Y3
  8566	VPSRLD   $0x14, Y14, Y14
  8567	VPXOR    Y3, Y14, Y14
  8568	VPADDD   Y14, Y0, Y0
  8569	VPXOR    Y0, Y4, Y4
  8570	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
  8571	VPADDD   Y4, Y12, Y12
  8572	VPXOR    Y12, Y14, Y14
  8573	VPSLLD   $0x07, Y14, Y3
  8574	VPSRLD   $0x19, Y14, Y14
  8575	VPXOR    Y3, Y14, Y14
  8576	VPADDD   Y9, Y5, Y5 // set 2
  8577	VPXOR    Y5, Y1, Y1
  8578	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
  8579	VPADDD   Y1, Y13, Y13
  8580	VPXOR    Y13, Y9, Y9
  8581	VPSLLD   $0x0c, Y9, Y3
  8582	VPSRLD   $0x14, Y9, Y9
  8583	VPXOR    Y3, Y9, Y9
  8584	VPADDD   Y9, Y5, Y5
  8585	VPXOR    Y5, Y1, Y1
  8586	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
  8587	VPADDD   Y1, Y13, Y13
  8588	VPXOR    Y13, Y9, Y9
  8589	VPSLLD   $0x07, Y9, Y3
  8590	VPSRLD   $0x19, Y9, Y9
  8591	VPXOR    Y3, Y9, Y9
  8592	VPADDD   Y10, Y6, Y6 // set 3
  8593	VPXOR    Y6, Y2, Y2
  8594	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
  8595	VPADDD   Y2, Y8, Y8
  8596	VPXOR    Y8, Y10, Y10
  8597	VPSLLD   $0x0c, Y10, Y3
  8598	VPSRLD   $0x14, Y10, Y10
  8599	VPXOR    Y3, Y10, Y10
  8600	VPADDD   Y10, Y6, Y6
  8601	VPXOR    Y6, Y2, Y2
  8602	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
  8603	VPADDD   Y2, Y8, Y8
  8604	VPXOR    Y8, Y10, Y10
  8605	VPSLLD   $0x07, Y10, Y3
  8606	VPSRLD   $0x19, Y10, Y10
  8607	VPXOR    Y3, Y10, Y10
  8608	VPALIGNR $0x0c, Y14, Y14, Y14 // undo the lane rotation (12/8/4 bytes)
  8609	VPALIGNR $0x0c, Y9, Y9, Y9
  8610	VPALIGNR $0x0c, Y10, Y10, Y10
  8611	VPALIGNR $0x08, Y12, Y12, Y12
  8612	VPALIGNR $0x08, Y13, Y13, Y13
  8613	VPALIGNR $0x08, Y8, Y8, Y8
  8614	VPALIGNR $0x04, Y4, Y4, Y4
  8615	VPALIGNR $0x04, Y1, Y1, Y1
  8616	VPALIGNR $0x04, Y2, Y2, Y2
  8617	DECQ     R9 // R9 was set to 10 in seal320AVX2
  8618	JNE      sealAVX2320InnerCipherLoop
  8619	VMOVDQA  ·chacha20Constants<>+0(SB), Y3 // add-back: first rows get ·chacha20Constants ...
  8620	VPADDD   Y3, Y0, Y0
  8621	VPADDD   Y3, Y5, Y5
  8622	VPADDD   Y3, Y6, Y6
  8623	VPADDD   Y7, Y14, Y14 // ... second and third rows get the copies saved in Y7 and Y11 ...
  8624	VPADDD   Y7, Y9, Y9
  8625	VPADDD   Y7, Y10, Y10
  8626	VPADDD   Y11, Y12, Y12
  8627	VPADDD   Y11, Y13, Y13
  8628	VPADDD   Y11, Y8, Y8
  8629	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
  8630	VPADDD   Y15, Y4, Y4 // ... fourth rows get Y15, Y15+inc, Y15+2*inc, matching how Y1 and Y2 were derived from Y4
  8631	VPADDD   Y3, Y15, Y15
  8632	VPADDD   Y15, Y1, Y1
  8633	VPADDD   Y3, Y15, Y15
  8634	VPADDD   Y15, Y2, Y2
  8635
  8636	// Clamp and store poly key
  8637	VPERM2I128 $0x02, Y0, Y14, Y3 // Y3 = the low 128-bit lanes of Y0 and Y14
  8638	VPAND      ·polyClampMask<>+0(SB), Y3, Y3 // mask Y3 with polyClampMask
  8639	VMOVDQA    Y3, (BP) // aligned 32-byte store; the hash code reads it back as (BP) and 8(BP)
  8640
  8641	// Stream for up to 320 bytes
  8642	VPERM2I128 $0x13, Y0, Y14, Y0 // selector 0x13 gathers high lanes, 0x02 gathers low lanes
  8643	VPERM2I128 $0x13, Y12, Y4, Y14
  8644	VPERM2I128 $0x02, Y5, Y9, Y12
  8645	VPERM2I128 $0x02, Y13, Y1, Y4
  8646	VPERM2I128 $0x13, Y5, Y9, Y5
  8647	VPERM2I128 $0x13, Y13, Y1, Y9
  8648	VPERM2I128 $0x02, Y6, Y10, Y13
  8649	VPERM2I128 $0x02, Y8, Y2, Y1
  8650	VPERM2I128 $0x13, Y6, Y10, Y6
  8651	VPERM2I128 $0x13, Y8, Y2, Y10 // stream now sits in Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10 - the order sealAVX2ShortSealLoop shifts through
  8652	JMP        sealAVX2ShortSeal
  8653
  8654sealAVX2Tail128: // reached when BX <= 0x80; load one register set (Y0,Y14,Y12,Y4) for the last stream blocks
  8655	VMOVDQA ·chacha20Constants<>+0(SB), Y0 // first row = ·chacha20Constants
  8656	VMOVDQA 32(BP), Y14 // second and third rows come from the scratch area at 32(BP) and 64(BP)
  8657	VMOVDQA 64(BP), Y12
  8658	VMOVDQA 192(BP), Y4 // fourth row = the row saved at 192(BP) ...
  8659	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // ... advanced by avx2IncMask
  8660	VMOVDQA Y4, Y1 // keep a copy of the starting fourth row; it is added back after the rounds (VPADDD Y1, Y4, Y1)
  8661
  8662sealAVX2Tail128LoopA: // hash one 16-byte block at DI (MULQ-based multiply), advance DI by 16, then fall through to LoopB
  8663	ADDQ  (DI), R10 // accumulator R12:R11:R10 += the 16 bytes at DI ...
  8664	ADCQ  8(DI), R11
  8665	ADCQ  $0x01, R12 // ... and the third limb also gets +1 (plus carry)
  8666	MOVQ  (BP), AX // AX = quadword at (BP); the clamped poly key is stored at (BP)
  8667	MOVQ  AX, R15
  8668	MULQ  R10 // DX:AX = (BP) quadword * R10
  8669	MOVQ  AX, R13
  8670	MOVQ  DX, R14
  8671	MOVQ  (BP), AX
  8672	MULQ  R11 // DX:AX = (BP) quadword * R11
  8673	IMULQ R12, R15 // R15 = (BP) quadword * R12
  8674	ADDQ  AX, R14
  8675	ADCQ  DX, R15
  8676	MOVQ  8(BP), AX // AX = quadword at 8(BP)
  8677	MOVQ  AX, R8
  8678	MULQ  R10 // DX:AX = 8(BP) quadword * R10
  8679	ADDQ  AX, R14
  8680	ADCQ  $0x00, DX
  8681	MOVQ  DX, R10 // R10 is free now; park the high half in it
  8682	MOVQ  8(BP), AX
  8683	MULQ  R11 // DX:AX = 8(BP) quadword * R11
  8684	ADDQ  AX, R15
  8685	ADCQ  $0x00, DX
  8686	IMULQ R12, R8 // R8 = 8(BP) quadword * R12
  8687	ADDQ  R10, R15
  8688	ADCQ  DX, R8 // full product is now in R8:R15:R14:R13
  8689	MOVQ  R13, R10 // keep the low 130 bits of the product in R12:R11:R10
  8690	MOVQ  R14, R11
  8691	MOVQ  R15, R12
  8692	ANDQ  $0x03, R12
  8693	MOVQ  R15, R13 // R14:R13 = product bits from 130 upward, times 4 (low two bits cleared)
  8694	ANDQ  $-4, R13
  8695	MOVQ  R8, R14
  8696	SHRQ  $0x02, R8, R15 // R8:R15 = the same upper bits shifted right by 2 (times 1)
  8697	SHRQ  $0x02, R8
  8698	ADDQ  R13, R10 // fold the upper part back in: 4x ...
  8699	ADCQ  R14, R11
  8700	ADCQ  $0x00, R12
  8701	ADDQ  R15, R10 // ... plus 1x, i.e. 5x in total
  8702	ADCQ  R8, R11
  8703	ADCQ  $0x00, R12
  8704	LEAQ  16(DI), DI // step DI past the block just hashed
  8705
  8706sealAVX2Tail128LoopB: // one pass = column + diagonal rounds on (Y0,Y14,Y12,Y4), with two 16-byte hash blocks interleaved; Y3 is scratch
  8707	VPADDD     Y14, Y0, Y0 // add, xor, rotate by 16 (byte shuffle)
  8708	VPXOR      Y0, Y4, Y4
  8709	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  8710	VPADDD     Y4, Y12, Y12
  8711	VPXOR      Y12, Y14, Y14
  8712	VPSLLD     $0x0c, Y14, Y3 // rotate left by 12: (x << 12) ^ (x >> 20)
  8713	VPSRLD     $0x14, Y14, Y14
  8714	VPXOR      Y3, Y14, Y14
  8715	VPADDD     Y14, Y0, Y0
  8716	VPXOR      Y0, Y4, Y4
  8717	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // rotate by 8 (byte shuffle)
  8718	VPADDD     Y4, Y12, Y12
  8719	VPXOR      Y12, Y14, Y14
  8720	VPSLLD     $0x07, Y14, Y3 // rotate left by 7: (x << 7) ^ (x >> 25)
  8721	VPSRLD     $0x19, Y14, Y14
  8722	VPXOR      Y3, Y14, Y14
  8723	ADDQ       (DI), R10 // hash block 1: accumulator R12:R11:R10 += the 16 bytes at DI, third limb also +1 (plus carry)
  8724	ADCQ       8(DI), R11
  8725	ADCQ       $0x01, R12
  8726	MOVQ       (BP), AX // multiply by the two quadwords at (BP) / 8(BP), where the clamped poly key is stored
  8727	MOVQ       AX, R15
  8728	MULQ       R10
  8729	MOVQ       AX, R13
  8730	MOVQ       DX, R14
  8731	MOVQ       (BP), AX
  8732	MULQ       R11
  8733	IMULQ      R12, R15
  8734	ADDQ       AX, R14
  8735	ADCQ       DX, R15
  8736	MOVQ       8(BP), AX
  8737	MOVQ       AX, R8
  8738	MULQ       R10
  8739	ADDQ       AX, R14
  8740	ADCQ       $0x00, DX
  8741	MOVQ       DX, R10
  8742	MOVQ       8(BP), AX
  8743	MULQ       R11
  8744	ADDQ       AX, R15
  8745	ADCQ       $0x00, DX
  8746	IMULQ      R12, R8
  8747	ADDQ       R10, R15
  8748	ADCQ       DX, R8 // full product is now in R8:R15:R14:R13
  8749	MOVQ       R13, R10 // keep the low 130 bits, then fold the upper bits back in as 4x + 1x
  8750	MOVQ       R14, R11
  8751	MOVQ       R15, R12
  8752	ANDQ       $0x03, R12
  8753	MOVQ       R15, R13
  8754	ANDQ       $-4, R13
  8755	MOVQ       R8, R14
  8756	SHRQ       $0x02, R8, R15
  8757	SHRQ       $0x02, R8
  8758	ADDQ       R13, R10
  8759	ADCQ       R14, R11
  8760	ADCQ       $0x00, R12
  8761	ADDQ       R15, R10
  8762	ADCQ       R8, R11
  8763	ADCQ       $0x00, R12
  8764	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows 2/3/4 by 4/8/12 bytes within each 128-bit lane
  8765	VPALIGNR   $0x08, Y12, Y12, Y12
  8766	VPALIGNR   $0x0c, Y4, Y4, Y4
  8767	VPADDD     Y14, Y0, Y0 // second half: same add/xor/rotate sequence on the rotated rows
  8768	VPXOR      Y0, Y4, Y4
  8769	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  8770	VPADDD     Y4, Y12, Y12
  8771	VPXOR      Y12, Y14, Y14
  8772	VPSLLD     $0x0c, Y14, Y3
  8773	VPSRLD     $0x14, Y14, Y14
  8774	VPXOR      Y3, Y14, Y14
  8775	VPADDD     Y14, Y0, Y0
  8776	VPXOR      Y0, Y4, Y4
  8777	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  8778	VPADDD     Y4, Y12, Y12
  8779	VPXOR      Y12, Y14, Y14
  8780	VPSLLD     $0x07, Y14, Y3
  8781	VPSRLD     $0x19, Y14, Y14
  8782	VPXOR      Y3, Y14, Y14
  8783	ADDQ       16(DI), R10 // hash block 2: same sequence on the 16 bytes at 16(DI)
  8784	ADCQ       24(DI), R11
  8785	ADCQ       $0x01, R12
  8786	MOVQ       (BP), AX
  8787	MOVQ       AX, R15
  8788	MULQ       R10
  8789	MOVQ       AX, R13
  8790	MOVQ       DX, R14
  8791	MOVQ       (BP), AX
  8792	MULQ       R11
  8793	IMULQ      R12, R15
  8794	ADDQ       AX, R14
  8795	ADCQ       DX, R15
  8796	MOVQ       8(BP), AX
  8797	MOVQ       AX, R8
  8798	MULQ       R10
  8799	ADDQ       AX, R14
  8800	ADCQ       $0x00, DX
  8801	MOVQ       DX, R10
  8802	MOVQ       8(BP), AX
  8803	MULQ       R11
  8804	ADDQ       AX, R15
  8805	ADCQ       $0x00, DX
  8806	IMULQ      R12, R8
  8807	ADDQ       R10, R15
  8808	ADCQ       DX, R8
  8809	MOVQ       R13, R10
  8810	MOVQ       R14, R11
  8811	MOVQ       R15, R12
  8812	ANDQ       $0x03, R12
  8813	MOVQ       R15, R13
  8814	ANDQ       $-4, R13
  8815	MOVQ       R8, R14
  8816	SHRQ       $0x02, R8, R15
  8817	SHRQ       $0x02, R8
  8818	ADDQ       R13, R10
  8819	ADCQ       R14, R11
  8820	ADCQ       $0x00, R12
  8821	ADDQ       R15, R10
  8822	ADCQ       R8, R11
  8823	ADCQ       $0x00, R12
  8824	LEAQ       32(DI), DI // step DI past the two blocks hashed in this pass
  8825	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotation (12/8/4 bytes)
  8826	VPALIGNR   $0x08, Y12, Y12, Y12
  8827	VPALIGNR   $0x04, Y4, Y4, Y4
  8828	DECQ       CX // while CX stays positive, go back through LoopA (one extra hash block per pass)
  8829	JG         sealAVX2Tail128LoopA
  8830	DECQ       R9 // once CX is used up, R9 counts further passes that skip LoopA
  8831	JGE        sealAVX2Tail128LoopB
  8832	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y5 // add-back: the starting rows are re-read from where sealAVX2Tail128 loaded them
  8833	VPADDD     32(BP), Y14, Y9
  8834	VPADDD     64(BP), Y12, Y13
  8835	VPADDD     Y1, Y4, Y1 // Y1 held the starting fourth row
  8836	VPERM2I128 $0x02, Y5, Y9, Y0 // selector 0x02 gathers low lanes, 0x13 gathers high lanes
  8837	VPERM2I128 $0x02, Y13, Y1, Y14
  8838	VPERM2I128 $0x13, Y5, Y9, Y12
  8839	VPERM2I128 $0x13, Y13, Y1, Y4 // stream now sits in Y0, Y14, Y12, Y4 - the order sealAVX2ShortSealLoop shifts through
  8840	JMP        sealAVX2ShortSealLoop
  8841
  8842sealAVX2Tail256: // reached when 0x80 < BX <= 0x100; load two register sets (Y0,Y14,Y12,Y4) and (Y5,Y9,Y13,Y1)
  8843	VMOVDQA ·chacha20Constants<>+0(SB), Y0 // first rows = ·chacha20Constants
  8844	VMOVDQA ·chacha20Constants<>+0(SB), Y5
  8845	VMOVDQA 32(BP), Y14 // second and third rows come from the scratch area at 32(BP) and 64(BP)
  8846	VMOVDQA 32(BP), Y9
  8847	VMOVDQA 64(BP), Y12
  8848	VMOVDQA 64(BP), Y13
  8849	VMOVDQA 192(BP), Y4 // fourth rows: the row saved at 192(BP) ...
  8850	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // ... advanced once by avx2IncMask for set 1 ...
  8851	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... and once more for set 2
  8852	VMOVDQA Y4, Y7 // Y7/Y11 keep copies of the two starting fourth rows
  8853	VMOVDQA Y1, Y11
  8854
  8855sealAVX2Tail256LoopA:
  8856	ADDQ  (DI), R10
  8857	ADCQ  8(DI), R11
  8858	ADCQ  $0x01, R12
  8859	MOVQ  (BP), AX
  8860	MOVQ  AX, R15
  8861	MULQ  R10
  8862	MOVQ  AX, R13
  8863	MOVQ  DX, R14
  8864	MOVQ  (BP), AX
  8865	MULQ  R11
  8866	IMULQ R12, R15
  8867	ADDQ  AX, R14
  8868	ADCQ  DX, R15
  8869	MOVQ  8(BP), AX
  8870	MOVQ  AX, R8
  8871	MULQ  R10
  8872	ADDQ  AX, R14
  8873	ADCQ  $0x00, DX
  8874	MOVQ  DX, R10
  8875	MOVQ  8(BP), AX
  8876	MULQ  R11
  8877	ADDQ  AX, R15
  8878	ADCQ  $0x00, DX
  8879	IMULQ R12, R8
  8880	ADDQ  R10, R15
  8881	ADCQ  DX, R8
  8882	MOVQ  R13, R10
  8883	MOVQ  R14, R11
  8884	MOVQ  R15, R12
  8885	ANDQ  $0x03, R12
  8886	MOVQ  R15, R13
  8887	ANDQ  $-4, R13
  8888	MOVQ  R8, R14
  8889	SHRQ  $0x02, R8, R15
  8890	SHRQ  $0x02, R8
  8891	ADDQ  R13, R10
  8892	ADCQ  R14, R11
  8893	ADCQ  $0x00, R12
  8894	ADDQ  R15, R10
  8895	ADCQ  R8, R11
  8896	ADCQ  $0x00, R12
  8897	LEAQ  16(DI), DI
  8898
  8899sealAVX2Tail256LoopB: // One pass = a full round pair on two 2-block states (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1), interleaved with hashing 32 bytes at (DI).
  8900	VPADDD     Y14, Y0, Y0 // quarter-round on rows (Y0, Y14, Y12, Y4); Y3 is scratch for the shift-based rotates
  8901	VPXOR      Y0, Y4, Y4
  8902	VPSHUFB    ·rol16<>+0(SB), Y4, Y4 // rotate each dword left by 16 via byte shuffle
  8903	VPADDD     Y4, Y12, Y12
  8904	VPXOR      Y12, Y14, Y14
  8905	VPSLLD     $0x0c, Y14, Y3 // rotate left by 12: (x << 12) ^ (x >> 20)
  8906	VPSRLD     $0x14, Y14, Y14
  8907	VPXOR      Y3, Y14, Y14
  8908	VPADDD     Y14, Y0, Y0
  8909	VPXOR      Y0, Y4, Y4
  8910	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // rotate each dword left by 8 via byte shuffle
  8911	VPADDD     Y4, Y12, Y12
  8912	VPXOR      Y12, Y14, Y14
  8913	VPSLLD     $0x07, Y14, Y3 // rotate left by 7: (x << 7) ^ (x >> 25)
  8914	VPSRLD     $0x19, Y14, Y14
  8915	VPXOR      Y3, Y14, Y14
  8916	VPADDD     Y9, Y5, Y5 // same quarter-round on rows (Y5, Y9, Y13, Y1)
  8917	VPXOR      Y5, Y1, Y1
  8918	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  8919	VPADDD     Y1, Y13, Y13
  8920	VPXOR      Y13, Y9, Y9
  8921	VPSLLD     $0x0c, Y9, Y3
  8922	VPSRLD     $0x14, Y9, Y9
  8923	VPXOR      Y3, Y9, Y9
  8924	VPADDD     Y9, Y5, Y5
  8925	VPXOR      Y5, Y1, Y1
  8926	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  8927	VPADDD     Y1, Y13, Y13
  8928	VPXOR      Y13, Y9, Y9
  8929	VPSLLD     $0x07, Y9, Y3
  8930	VPSRLD     $0x19, Y9, Y9
  8931	VPXOR      Y3, Y9, Y9
  8932	ADDQ       (DI), R10 // absorb 16 bytes at (DI) into the accumulator R12:R11:R10
  8933	ADCQ       8(DI), R11
  8934	ADCQ       $0x01, R12 // the added 1 in the top limb is 2^128 on top of the 16 bytes
  8935	MOVQ       (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product lands in R8:R15:R14:R13
  8936	MOVQ       AX, R15
  8937	MULQ       R10
  8938	MOVQ       AX, R13
  8939	MOVQ       DX, R14
  8940	MOVQ       (BP), AX
  8941	MULQ       R11
  8942	IMULQ      R12, R15
  8943	ADDQ       AX, R14
  8944	ADCQ       DX, R15
  8945	MOVQ       8(BP), AX
  8946	MOVQ       AX, R8
  8947	MULQ       R10
  8948	ADDQ       AX, R14
  8949	ADCQ       $0x00, DX
  8950	MOVQ       DX, R10
  8951	MOVQ       8(BP), AX
  8952	MULQ       R11
  8953	ADDQ       AX, R15
  8954	ADCQ       $0x00, DX
  8955	IMULQ      R12, R8
  8956	ADDQ       R10, R15
  8957	ADCQ       DX, R8
  8958	MOVQ       R13, R10 // reduce: low two limbs are kept as-is
  8959	MOVQ       R14, R11
  8960	MOVQ       R15, R12
  8961	ANDQ       $0x03, R12 // top limb keeps only bits 128..129
  8962	MOVQ       R15, R13
  8963	ANDQ       $-4, R13 // R14:R13 = c & ~3, where c = R8:R15 (everything from bit 128 up)
  8964	MOVQ       R8, R14
  8965	SHRQ       $0x02, R8, R15 // R8:R15 = c >> 2
  8966	SHRQ       $0x02, R8
  8967	ADDQ       R13, R10 // acc += (c & ~3)
  8968	ADCQ       R14, R11
  8969	ADCQ       $0x00, R12
  8970	ADDQ       R15, R10 // acc += (c >> 2); the two adds together contribute 5*(c >> 2), i.e. folding modulo 2^130 - 5
  8971	ADCQ       R8, R11
  8972	ADCQ       $0x00, R12
  8973	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows within each 128-bit lane (4, 8, 12 bytes) so the next quarter-rounds act on diagonals
  8974	VPALIGNR   $0x04, Y9, Y9, Y9
  8975	VPALIGNR   $0x08, Y12, Y12, Y12
  8976	VPALIGNR   $0x08, Y13, Y13, Y13
  8977	VPALIGNR   $0x0c, Y4, Y4, Y4
  8978	VPALIGNR   $0x0c, Y1, Y1, Y1
  8979	VPADDD     Y14, Y0, Y0 // second half: the same quarter-rounds on the rotated rows
  8980	VPXOR      Y0, Y4, Y4
  8981	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  8982	VPADDD     Y4, Y12, Y12
  8983	VPXOR      Y12, Y14, Y14
  8984	VPSLLD     $0x0c, Y14, Y3
  8985	VPSRLD     $0x14, Y14, Y14
  8986	VPXOR      Y3, Y14, Y14
  8987	VPADDD     Y14, Y0, Y0
  8988	VPXOR      Y0, Y4, Y4
  8989	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  8990	VPADDD     Y4, Y12, Y12
  8991	VPXOR      Y12, Y14, Y14
  8992	VPSLLD     $0x07, Y14, Y3
  8993	VPSRLD     $0x19, Y14, Y14
  8994	VPXOR      Y3, Y14, Y14
  8995	VPADDD     Y9, Y5, Y5
  8996	VPXOR      Y5, Y1, Y1
  8997	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  8998	VPADDD     Y1, Y13, Y13
  8999	VPXOR      Y13, Y9, Y9
  9000	VPSLLD     $0x0c, Y9, Y3
  9001	VPSRLD     $0x14, Y9, Y9
  9002	VPXOR      Y3, Y9, Y9
  9003	VPADDD     Y9, Y5, Y5
  9004	VPXOR      Y5, Y1, Y1
  9005	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  9006	VPADDD     Y1, Y13, Y13
  9007	VPXOR      Y13, Y9, Y9
  9008	VPSLLD     $0x07, Y9, Y3
  9009	VPSRLD     $0x19, Y9, Y9
  9010	VPXOR      Y3, Y9, Y9
  9011	ADDQ       16(DI), R10 // absorb the next 16 bytes, at 16(DI); same multiply and reduce as above
  9012	ADCQ       24(DI), R11
  9013	ADCQ       $0x01, R12
  9014	MOVQ       (BP), AX
  9015	MOVQ       AX, R15
  9016	MULQ       R10
  9017	MOVQ       AX, R13
  9018	MOVQ       DX, R14
  9019	MOVQ       (BP), AX
  9020	MULQ       R11
  9021	IMULQ      R12, R15
  9022	ADDQ       AX, R14
  9023	ADCQ       DX, R15
  9024	MOVQ       8(BP), AX
  9025	MOVQ       AX, R8
  9026	MULQ       R10
  9027	ADDQ       AX, R14
  9028	ADCQ       $0x00, DX
  9029	MOVQ       DX, R10
  9030	MOVQ       8(BP), AX
  9031	MULQ       R11
  9032	ADDQ       AX, R15
  9033	ADCQ       $0x00, DX
  9034	IMULQ      R12, R8
  9035	ADDQ       R10, R15
  9036	ADCQ       DX, R8
  9037	MOVQ       R13, R10
  9038	MOVQ       R14, R11
  9039	MOVQ       R15, R12
  9040	ANDQ       $0x03, R12
  9041	MOVQ       R15, R13
  9042	ANDQ       $-4, R13
  9043	MOVQ       R8, R14
  9044	SHRQ       $0x02, R8, R15
  9045	SHRQ       $0x02, R8
  9046	ADDQ       R13, R10
  9047	ADCQ       R14, R11
  9048	ADCQ       $0x00, R12
  9049	ADDQ       R15, R10
  9050	ADCQ       R8, R11
  9051	ADCQ       $0x00, R12
  9052	LEAQ       32(DI), DI // step past the 32 bytes hashed in this pass
  9053	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotation (12, 8, 4 bytes)
  9054	VPALIGNR   $0x0c, Y9, Y9, Y9
  9055	VPALIGNR   $0x08, Y12, Y12, Y12
  9056	VPALIGNR   $0x08, Y13, Y13, Y13
  9057	VPALIGNR   $0x04, Y4, Y4, Y4
  9058	VPALIGNR   $0x04, Y1, Y1, Y1
  9059	DECQ       CX // CX and R9 are loop counters initialised before this block (not visible here)
  9060	JG         sealAVX2Tail256LoopA // while CX stays positive, re-enter through sealAVX2Tail256LoopA
  9061	DECQ       R9
  9062	JGE        sealAVX2Tail256LoopB // afterwards repeat this pass until R9 goes negative
  9063	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // add the starting state back in: constants row ...
  9064	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  9065	VPADDD     32(BP), Y14, Y14 // ... the rows stored at 32(BP) and 64(BP) ...
  9066	VPADDD     32(BP), Y9, Y9
  9067	VPADDD     64(BP), Y12, Y12
  9068	VPADDD     64(BP), Y13, Y13
  9069	VPADDD     Y7, Y4, Y4 // ... and Y7/Y11, set before this block; presumably the saved starting counter rows (TODO confirm at sealAVX2Tail256)
  9070	VPADDD     Y11, Y1, Y1
  9071	VPERM2I128 $0x02, Y0, Y14, Y3 // de-interleave the first state: $0x02 pairs the low 128-bit lanes, $0x13 pairs the high lanes
  9072	VPERM2I128 $0x02, Y12, Y4, Y7
  9073	VPERM2I128 $0x13, Y0, Y14, Y11
  9074	VPERM2I128 $0x13, Y12, Y4, Y15
  9075	VPXOR      (SI), Y3, Y3 // XOR 128 bytes read from (SI) with the keystream ...
  9076	VPXOR      32(SI), Y7, Y7
  9077	VPXOR      64(SI), Y11, Y11
  9078	VPXOR      96(SI), Y15, Y15
  9079	VMOVDQU    Y3, (DI) // ... and write the result to (DI); DI itself is not advanced here
  9080	VMOVDQU    Y7, 32(DI)
  9081	VMOVDQU    Y11, 64(DI)
  9082	VMOVDQU    Y15, 96(DI)
  9083	MOVQ       $0x00000080, CX // CX = 128, the number of bytes just written; presumably what sealAVX2SealHash still has to hash (TODO confirm)
  9084	LEAQ       128(SI), SI // input pointer moves past the 128 bytes consumed
  9085	SUBQ       $0x80, BX // BX reduced by the same 128; presumably the remaining length
  9086	VPERM2I128 $0x02, Y5, Y9, Y0 // leave the second state de-interleaved in Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash
  9087	VPERM2I128 $0x02, Y13, Y1, Y14
  9088	VPERM2I128 $0x13, Y5, Y9, Y12
  9089	VPERM2I128 $0x13, Y13, Y1, Y4
  9090	JMP        sealAVX2SealHash
  9091
  9092sealAVX2Tail384: // Set up three 2-block states (3 x 128 = 384 bytes of keystream), then fall through into the loops below.
  9093	VMOVDQA ·chacha20Constants<>+0(SB), Y0 // constants row, replicated into Y0, Y5, Y6
  9094	VMOVDQA Y0, Y5
  9095	VMOVDQA Y0, Y6
  9096	VMOVDQA 32(BP), Y14 // row stored at 32(BP), replicated into Y14, Y9, Y10
  9097	VMOVDQA Y14, Y9
  9098	VMOVDQA Y14, Y10
  9099	VMOVDQA 64(BP), Y12 // row stored at 64(BP), replicated into Y12, Y13, Y8
  9100	VMOVDQA Y12, Y13
  9101	VMOVDQA Y12, Y8
  9102	VMOVDQA 192(BP), Y4 // last counter row stored at 192(BP)
  9103	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // successive increments give three distinct counter rows: Y4, Y1, Y2
  9104	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  9105	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  9106	VMOVDQA Y4, Y7 // keep copies of the counter rows in Y7, Y11, Y15; they are added back after the rounds
  9107	VMOVDQA Y1, Y11
  9108	VMOVDQA Y2, Y15
  9109
  9110sealAVX2Tail384LoopA: // Hash one extra 16-byte block at (DI), advance DI by 16, then fall through into sealAVX2Tail384LoopB.
  9111	ADDQ  (DI), R10 // absorb 16 bytes at (DI) into the accumulator R12:R11:R10
  9112	ADCQ  8(DI), R11
  9113	ADCQ  $0x01, R12 // the added 1 in the top limb is 2^128 on top of the 16 bytes
  9114	MOVQ  (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product lands in R8:R15:R14:R13
  9115	MOVQ  AX, R15
  9116	MULQ  R10
  9117	MOVQ  AX, R13
  9118	MOVQ  DX, R14
  9119	MOVQ  (BP), AX
  9120	MULQ  R11
  9121	IMULQ R12, R15
  9122	ADDQ  AX, R14
  9123	ADCQ  DX, R15
  9124	MOVQ  8(BP), AX
  9125	MOVQ  AX, R8
  9126	MULQ  R10
  9127	ADDQ  AX, R14
  9128	ADCQ  $0x00, DX
  9129	MOVQ  DX, R10
  9130	MOVQ  8(BP), AX
  9131	MULQ  R11
  9132	ADDQ  AX, R15
  9133	ADCQ  $0x00, DX
  9134	IMULQ R12, R8
  9135	ADDQ  R10, R15
  9136	ADCQ  DX, R8
  9137	MOVQ  R13, R10 // reduce: low two limbs are kept as-is
  9138	MOVQ  R14, R11
  9139	MOVQ  R15, R12
  9140	ANDQ  $0x03, R12 // top limb keeps only bits 128..129
  9141	MOVQ  R15, R13
  9142	ANDQ  $-4, R13 // R14:R13 = c & ~3, where c = R8:R15 (everything from bit 128 up)
  9143	MOVQ  R8, R14
  9144	SHRQ  $0x02, R8, R15 // R8:R15 = c >> 2
  9145	SHRQ  $0x02, R8
  9146	ADDQ  R13, R10 // acc += (c & ~3)
  9147	ADCQ  R14, R11
  9148	ADCQ  $0x00, R12
  9149	ADDQ  R15, R10 // acc += (c >> 2); the two adds together contribute 5*(c >> 2), i.e. folding modulo 2^130 - 5
  9150	ADCQ  R8, R11
  9151	ADCQ  $0x00, R12
  9152	LEAQ  16(DI), DI // step past the 16 bytes just hashed
  9153
  9154sealAVX2Tail384LoopB: // One pass = a full round pair on three 2-block states, interleaved with hashing 32 bytes at (DI).
  9155	VPADDD     Y14, Y0, Y0 // quarter-round on rows (Y0, Y14, Y12, Y4); Y3 is scratch for the shift-based rotates
  9156	VPXOR      Y0, Y4, Y4
  9157	VPSHUFB    ·rol16<>+0(SB), Y4, Y4 // rotate each dword left by 16 via byte shuffle
  9158	VPADDD     Y4, Y12, Y12
  9159	VPXOR      Y12, Y14, Y14
  9160	VPSLLD     $0x0c, Y14, Y3 // rotate left by 12: (x << 12) ^ (x >> 20)
  9161	VPSRLD     $0x14, Y14, Y14
  9162	VPXOR      Y3, Y14, Y14
  9163	VPADDD     Y14, Y0, Y0
  9164	VPXOR      Y0, Y4, Y4
  9165	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // rotate each dword left by 8 via byte shuffle
  9166	VPADDD     Y4, Y12, Y12
  9167	VPXOR      Y12, Y14, Y14
  9168	VPSLLD     $0x07, Y14, Y3 // rotate left by 7: (x << 7) ^ (x >> 25)
  9169	VPSRLD     $0x19, Y14, Y14
  9170	VPXOR      Y3, Y14, Y14
  9171	VPADDD     Y9, Y5, Y5 // same quarter-round on rows (Y5, Y9, Y13, Y1)
  9172	VPXOR      Y5, Y1, Y1
  9173	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  9174	VPADDD     Y1, Y13, Y13
  9175	VPXOR      Y13, Y9, Y9
  9176	VPSLLD     $0x0c, Y9, Y3
  9177	VPSRLD     $0x14, Y9, Y9
  9178	VPXOR      Y3, Y9, Y9
  9179	VPADDD     Y9, Y5, Y5
  9180	VPXOR      Y5, Y1, Y1
  9181	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  9182	VPADDD     Y1, Y13, Y13
  9183	VPXOR      Y13, Y9, Y9
  9184	VPSLLD     $0x07, Y9, Y3
  9185	VPSRLD     $0x19, Y9, Y9
  9186	VPXOR      Y3, Y9, Y9
  9187	VPADDD     Y10, Y6, Y6 // same quarter-round on rows (Y6, Y10, Y8, Y2)
  9188	VPXOR      Y6, Y2, Y2
  9189	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  9190	VPADDD     Y2, Y8, Y8
  9191	VPXOR      Y8, Y10, Y10
  9192	VPSLLD     $0x0c, Y10, Y3
  9193	VPSRLD     $0x14, Y10, Y10
  9194	VPXOR      Y3, Y10, Y10
  9195	VPADDD     Y10, Y6, Y6
  9196	VPXOR      Y6, Y2, Y2
  9197	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  9198	VPADDD     Y2, Y8, Y8
  9199	VPXOR      Y8, Y10, Y10
  9200	VPSLLD     $0x07, Y10, Y3
  9201	VPSRLD     $0x19, Y10, Y10
  9202	VPXOR      Y3, Y10, Y10
  9203	ADDQ       (DI), R10 // absorb 16 bytes at (DI) into the accumulator R12:R11:R10
  9204	ADCQ       8(DI), R11
  9205	ADCQ       $0x01, R12 // the added 1 in the top limb is 2^128 on top of the 16 bytes
  9206	MOVQ       (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product lands in R8:R15:R14:R13
  9207	MOVQ       AX, R15
  9208	MULQ       R10
  9209	MOVQ       AX, R13
  9210	MOVQ       DX, R14
  9211	MOVQ       (BP), AX
  9212	MULQ       R11
  9213	IMULQ      R12, R15
  9214	ADDQ       AX, R14
  9215	ADCQ       DX, R15
  9216	MOVQ       8(BP), AX
  9217	MOVQ       AX, R8
  9218	MULQ       R10
  9219	ADDQ       AX, R14
  9220	ADCQ       $0x00, DX
  9221	MOVQ       DX, R10
  9222	MOVQ       8(BP), AX
  9223	MULQ       R11
  9224	ADDQ       AX, R15
  9225	ADCQ       $0x00, DX
  9226	IMULQ      R12, R8
  9227	ADDQ       R10, R15
  9228	ADCQ       DX, R8
  9229	MOVQ       R13, R10 // reduce: low two limbs are kept as-is
  9230	MOVQ       R14, R11
  9231	MOVQ       R15, R12
  9232	ANDQ       $0x03, R12 // top limb keeps only bits 128..129
  9233	MOVQ       R15, R13
  9234	ANDQ       $-4, R13 // R14:R13 = c & ~3, where c = R8:R15 (everything from bit 128 up)
  9235	MOVQ       R8, R14
  9236	SHRQ       $0x02, R8, R15 // R8:R15 = c >> 2
  9237	SHRQ       $0x02, R8
  9238	ADDQ       R13, R10 // acc += (c & ~3)
  9239	ADCQ       R14, R11
  9240	ADCQ       $0x00, R12
  9241	ADDQ       R15, R10 // acc += (c >> 2); the two adds together contribute 5*(c >> 2), i.e. folding modulo 2^130 - 5
  9242	ADCQ       R8, R11
  9243	ADCQ       $0x00, R12
  9244	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows within each 128-bit lane (4, 8, 12 bytes) so the next quarter-rounds act on diagonals
  9245	VPALIGNR   $0x04, Y9, Y9, Y9
  9246	VPALIGNR   $0x04, Y10, Y10, Y10
  9247	VPALIGNR   $0x08, Y12, Y12, Y12
  9248	VPALIGNR   $0x08, Y13, Y13, Y13
  9249	VPALIGNR   $0x08, Y8, Y8, Y8
  9250	VPALIGNR   $0x0c, Y4, Y4, Y4
  9251	VPALIGNR   $0x0c, Y1, Y1, Y1
  9252	VPALIGNR   $0x0c, Y2, Y2, Y2
  9253	VPADDD     Y14, Y0, Y0 // second half: the same three quarter-rounds on the rotated rows
  9254	VPXOR      Y0, Y4, Y4
  9255	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  9256	VPADDD     Y4, Y12, Y12
  9257	VPXOR      Y12, Y14, Y14
  9258	VPSLLD     $0x0c, Y14, Y3
  9259	VPSRLD     $0x14, Y14, Y14
  9260	VPXOR      Y3, Y14, Y14
  9261	VPADDD     Y14, Y0, Y0
  9262	VPXOR      Y0, Y4, Y4
  9263	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  9264	VPADDD     Y4, Y12, Y12
  9265	VPXOR      Y12, Y14, Y14
  9266	VPSLLD     $0x07, Y14, Y3
  9267	VPSRLD     $0x19, Y14, Y14
  9268	VPXOR      Y3, Y14, Y14
  9269	VPADDD     Y9, Y5, Y5
  9270	VPXOR      Y5, Y1, Y1
  9271	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  9272	VPADDD     Y1, Y13, Y13
  9273	VPXOR      Y13, Y9, Y9
  9274	VPSLLD     $0x0c, Y9, Y3
  9275	VPSRLD     $0x14, Y9, Y9
  9276	VPXOR      Y3, Y9, Y9
  9277	VPADDD     Y9, Y5, Y5
  9278	VPXOR      Y5, Y1, Y1
  9279	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  9280	VPADDD     Y1, Y13, Y13
  9281	VPXOR      Y13, Y9, Y9
  9282	VPSLLD     $0x07, Y9, Y3
  9283	VPSRLD     $0x19, Y9, Y9
  9284	VPXOR      Y3, Y9, Y9
  9285	VPADDD     Y10, Y6, Y6
  9286	VPXOR      Y6, Y2, Y2
  9287	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  9288	VPADDD     Y2, Y8, Y8
  9289	VPXOR      Y8, Y10, Y10
  9290	VPSLLD     $0x0c, Y10, Y3
  9291	VPSRLD     $0x14, Y10, Y10
  9292	VPXOR      Y3, Y10, Y10
  9293	VPADDD     Y10, Y6, Y6
  9294	VPXOR      Y6, Y2, Y2
  9295	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  9296	VPADDD     Y2, Y8, Y8
  9297	VPXOR      Y8, Y10, Y10
  9298	VPSLLD     $0x07, Y10, Y3
  9299	VPSRLD     $0x19, Y10, Y10
  9300	VPXOR      Y3, Y10, Y10
  9301	ADDQ       16(DI), R10 // absorb the next 16 bytes, at 16(DI); same multiply and reduce as above
  9302	ADCQ       24(DI), R11
  9303	ADCQ       $0x01, R12
  9304	MOVQ       (BP), AX
  9305	MOVQ       AX, R15
  9306	MULQ       R10
  9307	MOVQ       AX, R13
  9308	MOVQ       DX, R14
  9309	MOVQ       (BP), AX
  9310	MULQ       R11
  9311	IMULQ      R12, R15
  9312	ADDQ       AX, R14
  9313	ADCQ       DX, R15
  9314	MOVQ       8(BP), AX
  9315	MOVQ       AX, R8
  9316	MULQ       R10
  9317	ADDQ       AX, R14
  9318	ADCQ       $0x00, DX
  9319	MOVQ       DX, R10
  9320	MOVQ       8(BP), AX
  9321	MULQ       R11
  9322	ADDQ       AX, R15
  9323	ADCQ       $0x00, DX
  9324	IMULQ      R12, R8
  9325	ADDQ       R10, R15
  9326	ADCQ       DX, R8
  9327	MOVQ       R13, R10
  9328	MOVQ       R14, R11
  9329	MOVQ       R15, R12
  9330	ANDQ       $0x03, R12
  9331	MOVQ       R15, R13
  9332	ANDQ       $-4, R13
  9333	MOVQ       R8, R14
  9334	SHRQ       $0x02, R8, R15
  9335	SHRQ       $0x02, R8
  9336	ADDQ       R13, R10
  9337	ADCQ       R14, R11
  9338	ADCQ       $0x00, R12
  9339	ADDQ       R15, R10
  9340	ADCQ       R8, R11
  9341	ADCQ       $0x00, R12
  9342	LEAQ       32(DI), DI // step past the 32 bytes hashed in this pass
  9343	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotation (12, 8, 4 bytes)
  9344	VPALIGNR   $0x0c, Y9, Y9, Y9
  9345	VPALIGNR   $0x0c, Y10, Y10, Y10
  9346	VPALIGNR   $0x08, Y12, Y12, Y12
  9347	VPALIGNR   $0x08, Y13, Y13, Y13
  9348	VPALIGNR   $0x08, Y8, Y8, Y8
  9349	VPALIGNR   $0x04, Y4, Y4, Y4
  9350	VPALIGNR   $0x04, Y1, Y1, Y1
  9351	VPALIGNR   $0x04, Y2, Y2, Y2
  9352	DECQ       CX // CX and R9 are loop counters initialised before sealAVX2Tail384 is reached (not visible here)
  9353	JG         sealAVX2Tail384LoopA // while CX stays positive, take the path that hashes an extra 16 bytes first
  9354	DECQ       R9
  9355	JGE        sealAVX2Tail384LoopB // afterwards repeat this pass until R9 goes negative
  9356	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // add the starting state back in: constants row ...
  9357	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  9358	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
  9359	VPADDD     32(BP), Y14, Y14 // ... the rows stored at 32(BP) and 64(BP) ...
  9360	VPADDD     32(BP), Y9, Y9
  9361	VPADDD     32(BP), Y10, Y10
  9362	VPADDD     64(BP), Y12, Y12
  9363	VPADDD     64(BP), Y13, Y13
  9364	VPADDD     64(BP), Y8, Y8
  9365	VPADDD     Y7, Y4, Y4 // ... and the counter-row copies kept in Y7, Y11, Y15 by sealAVX2Tail384
  9366	VPADDD     Y11, Y1, Y1
  9367	VPADDD     Y15, Y2, Y2
  9368	VPERM2I128 $0x02, Y0, Y14, Y3 // de-interleave the first state: $0x02 pairs the low 128-bit lanes, $0x13 pairs the high lanes
  9369	VPERM2I128 $0x02, Y12, Y4, Y7
  9370	VPERM2I128 $0x13, Y0, Y14, Y11
  9371	VPERM2I128 $0x13, Y12, Y4, Y15
  9372	VPXOR      (SI), Y3, Y3 // XOR input bytes 0..127 with the keystream and write them to (DI)
  9373	VPXOR      32(SI), Y7, Y7
  9374	VPXOR      64(SI), Y11, Y11
  9375	VPXOR      96(SI), Y15, Y15
  9376	VMOVDQU    Y3, (DI)
  9377	VMOVDQU    Y7, 32(DI)
  9378	VMOVDQU    Y11, 64(DI)
  9379	VMOVDQU    Y15, 96(DI)
  9380	VPERM2I128 $0x02, Y5, Y9, Y3 // second state: input bytes 128..255
  9381	VPERM2I128 $0x02, Y13, Y1, Y7
  9382	VPERM2I128 $0x13, Y5, Y9, Y11
  9383	VPERM2I128 $0x13, Y13, Y1, Y15
  9384	VPXOR      128(SI), Y3, Y3
  9385	VPXOR      160(SI), Y7, Y7
  9386	VPXOR      192(SI), Y11, Y11
  9387	VPXOR      224(SI), Y15, Y15
  9388	VMOVDQU    Y3, 128(DI)
  9389	VMOVDQU    Y7, 160(DI)
  9390	VMOVDQU    Y11, 192(DI)
  9391	VMOVDQU    Y15, 224(DI)
  9392	MOVQ       $0x00000100, CX // CX = 256, the number of bytes just written; presumably what sealAVX2SealHash still has to hash (TODO confirm)
  9393	LEAQ       256(SI), SI // input pointer moves past the 256 bytes consumed; DI is not advanced here
  9394	SUBQ       $0x00000100, BX // BX reduced by the same 256; presumably the remaining length
  9395	VPERM2I128 $0x02, Y6, Y10, Y0 // leave the third state de-interleaved in Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash
  9396	VPERM2I128 $0x02, Y8, Y2, Y14
  9397	VPERM2I128 $0x13, Y6, Y10, Y12
  9398	VPERM2I128 $0x13, Y8, Y2, Y4
  9399	JMP        sealAVX2SealHash
  9400
  9401sealAVX2Tail512: // Set up four 2-block states (4 x 128 = 512 bytes of keystream), then fall through into the loops below.
  9402	VMOVDQA ·chacha20Constants<>+0(SB), Y0 // constants row, replicated into Y0, Y5, Y6, Y7
  9403	VMOVDQA Y0, Y5
  9404	VMOVDQA Y0, Y6
  9405	VMOVDQA Y0, Y7
  9406	VMOVDQA 32(BP), Y14 // row stored at 32(BP), replicated into Y14, Y9, Y10, Y11
  9407	VMOVDQA Y14, Y9
  9408	VMOVDQA Y14, Y10
  9409	VMOVDQA Y14, Y11
  9410	VMOVDQA 64(BP), Y12 // row stored at 64(BP), replicated into Y12, Y13, Y8, Y15
  9411	VMOVDQA Y12, Y13
  9412	VMOVDQA Y12, Y8
  9413	VMOVDQA Y12, Y15
  9414	VMOVDQA 192(BP), Y4 // last counter row stored at 192(BP)
  9415	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // successive increments give four distinct counter rows: Y4, Y1, Y2, Y3
  9416	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
  9417	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
  9418	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
  9419	VMOVDQA Y4, 96(BP) // all 16 YMM registers hold state, so the starting counter rows are saved to 96..192(BP) instead of registers
  9420	VMOVDQA Y1, 128(BP)
  9421	VMOVDQA Y2, 160(BP)
  9422	VMOVDQA Y3, 192(BP) // overwrites the slot the counter was loaded from with the newest counter row
  9423
  9424sealAVX2Tail512LoopA: // Hash one extra 16-byte block at (DI), advance DI by 16, then fall through into sealAVX2Tail512LoopB.
  9425	ADDQ  (DI), R10 // absorb 16 bytes at (DI) into the accumulator R12:R11:R10
  9426	ADCQ  8(DI), R11
  9427	ADCQ  $0x01, R12 // the added 1 in the top limb is 2^128 on top of the 16 bytes
  9428	MOVQ  (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product lands in R8:R15:R14:R13
  9429	MOVQ  AX, R15
  9430	MULQ  R10
  9431	MOVQ  AX, R13
  9432	MOVQ  DX, R14
  9433	MOVQ  (BP), AX
  9434	MULQ  R11
  9435	IMULQ R12, R15
  9436	ADDQ  AX, R14
  9437	ADCQ  DX, R15
  9438	MOVQ  8(BP), AX
  9439	MOVQ  AX, R8
  9440	MULQ  R10
  9441	ADDQ  AX, R14
  9442	ADCQ  $0x00, DX
  9443	MOVQ  DX, R10
  9444	MOVQ  8(BP), AX
  9445	MULQ  R11
  9446	ADDQ  AX, R15
  9447	ADCQ  $0x00, DX
  9448	IMULQ R12, R8
  9449	ADDQ  R10, R15
  9450	ADCQ  DX, R8
  9451	MOVQ  R13, R10 // reduce: low two limbs are kept as-is
  9452	MOVQ  R14, R11
  9453	MOVQ  R15, R12
  9454	ANDQ  $0x03, R12 // top limb keeps only bits 128..129
  9455	MOVQ  R15, R13
  9456	ANDQ  $-4, R13 // R14:R13 = c & ~3, where c = R8:R15 (everything from bit 128 up)
  9457	MOVQ  R8, R14
  9458	SHRQ  $0x02, R8, R15 // R8:R15 = c >> 2
  9459	SHRQ  $0x02, R8
  9460	ADDQ  R13, R10 // acc += (c & ~3)
  9461	ADCQ  R14, R11
  9462	ADCQ  $0x00, R12
  9463	ADDQ  R15, R10 // acc += (c >> 2); the two adds together contribute 5*(c >> 2), i.e. folding modulo 2^130 - 5
  9464	ADCQ  R8, R11
  9465	ADCQ  $0x00, R12
  9466	LEAQ  16(DI), DI // step past the 16 bytes just hashed
  9467
  9468sealAVX2Tail512LoopB: // One pass = a full round pair on four 2-block states (steps interleaved across the four), plus hashing 32 bytes at (DI).
  9469	VPADDD     Y14, Y0, Y0 // quarter-round step a += b for all four states: (Y0,Y14,Y12,Y4) (Y5,Y9,Y13,Y1) (Y6,Y10,Y8,Y2) (Y7,Y11,Y15,Y3)
  9470	VPADDD     Y9, Y5, Y5
  9471	VPADDD     Y10, Y6, Y6
  9472	VPADDD     Y11, Y7, Y7
  9473	VPXOR      Y0, Y4, Y4 // d ^= a
  9474	VPXOR      Y5, Y1, Y1
  9475	VPXOR      Y6, Y2, Y2
  9476	VPXOR      Y7, Y3, Y3
  9477	VPSHUFB    ·rol16<>+0(SB), Y4, Y4 // d = rotl(d, 16) via byte shuffle
  9478	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  9479	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  9480	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
  9481	VPADDD     Y4, Y12, Y12 // c += d
  9482	VPADDD     Y1, Y13, Y13
  9483	VPADDD     Y2, Y8, Y8
  9484	VPADDD     Y3, Y15, Y15
  9485	VPXOR      Y12, Y14, Y14 // b ^= c
  9486	VPXOR      Y13, Y9, Y9
  9487	VPXOR      Y8, Y10, Y10
  9488	VPXOR      Y15, Y11, Y11
  9489	VMOVDQA    Y15, 224(BP) // no free YMM register: spill Y15 to 224(BP) and borrow it as rotate scratch
  9490	VPSLLD     $0x0c, Y14, Y15 // b = rotl(b, 12): (x << 12) ^ (x >> 20)
  9491	VPSRLD     $0x14, Y14, Y14
  9492	VPXOR      Y15, Y14, Y14
  9493	VPSLLD     $0x0c, Y9, Y15
  9494	VPSRLD     $0x14, Y9, Y9
  9495	VPXOR      Y15, Y9, Y9
  9496	VPSLLD     $0x0c, Y10, Y15
  9497	VPSRLD     $0x14, Y10, Y10
  9498	VPXOR      Y15, Y10, Y10
  9499	VPSLLD     $0x0c, Y11, Y15
  9500	VPSRLD     $0x14, Y11, Y11
  9501	VPXOR      Y15, Y11, Y11
  9502	VMOVDQA    224(BP), Y15 // restore Y15
  9503	ADDQ       (DI), R10 // absorb 16 bytes at (DI) into the accumulator R12:R11:R10
  9504	ADCQ       8(DI), R11
  9505	ADCQ       $0x01, R12 // the added 1 in the top limb is 2^128 on top of the 16 bytes
  9506	MOVQ       (BP), DX // MULX-based multiply by the 128-bit value at (BP)/8(BP); DX is MULX's implicit operand
  9507	MOVQ       DX, R15
  9508	MULXQ      R10, R13, R14 // R14:R13 = DX * R10
  9509	IMULQ      R12, R15
  9510	MULXQ      R11, AX, DX // DX:AX = DX * R11
  9511	ADDQ       AX, R14
  9512	ADCQ       DX, R15
  9513	MOVQ       8(BP), DX
  9514	MULXQ      R10, R10, AX // AX:R10 = DX * R10
  9515	ADDQ       R10, R14
  9516	MULXQ      R11, R11, R8 // R8:R11 = DX * R11; MULX leaves flags alone, so the carry from the ADDQ above reaches the ADCQ below
  9517	ADCQ       R11, R15
  9518	ADCQ       $0x00, R8
  9519	IMULQ      R12, DX
  9520	ADDQ       AX, R15
  9521	ADCQ       DX, R8
  9522	MOVQ       R13, R10 // reduce: low two limbs are kept as-is
  9523	MOVQ       R14, R11
  9524	MOVQ       R15, R12
  9525	ANDQ       $0x03, R12 // top limb keeps only bits 128..129
  9526	MOVQ       R15, R13
  9527	ANDQ       $-4, R13 // R14:R13 = c & ~3, where c = R8:R15 (everything from bit 128 up)
  9528	MOVQ       R8, R14
  9529	SHRQ       $0x02, R8, R15 // R8:R15 = c >> 2
  9530	SHRQ       $0x02, R8
  9531	ADDQ       R13, R10 // acc += (c & ~3)
  9532	ADCQ       R14, R11
  9533	ADCQ       $0x00, R12
  9534	ADDQ       R15, R10 // acc += (c >> 2); the two adds together contribute 5*(c >> 2), i.e. folding modulo 2^130 - 5
  9535	ADCQ       R8, R11
  9536	ADCQ       $0x00, R12
  9537	VPADDD     Y14, Y0, Y0 // second half of the quarter-round: a += b
  9538	VPADDD     Y9, Y5, Y5
  9539	VPADDD     Y10, Y6, Y6
  9540	VPADDD     Y11, Y7, Y7
  9541	VPXOR      Y0, Y4, Y4
  9542	VPXOR      Y5, Y1, Y1
  9543	VPXOR      Y6, Y2, Y2
  9544	VPXOR      Y7, Y3, Y3
  9545	VPSHUFB    ·rol8<>+0(SB), Y4, Y4 // d = rotl(d, 8) via byte shuffle
  9546	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  9547	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  9548	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
  9549	VPADDD     Y4, Y12, Y12
  9550	VPADDD     Y1, Y13, Y13
  9551	VPADDD     Y2, Y8, Y8
  9552	VPADDD     Y3, Y15, Y15
  9553	VPXOR      Y12, Y14, Y14
  9554	VPXOR      Y13, Y9, Y9
  9555	VPXOR      Y8, Y10, Y10
  9556	VPXOR      Y15, Y11, Y11
  9557	VMOVDQA    Y15, 224(BP) // spill Y15 again for the rotate-by-7 scratch
  9558	VPSLLD     $0x07, Y14, Y15 // b = rotl(b, 7): (x << 7) ^ (x >> 25)
  9559	VPSRLD     $0x19, Y14, Y14
  9560	VPXOR      Y15, Y14, Y14
  9561	VPSLLD     $0x07, Y9, Y15
  9562	VPSRLD     $0x19, Y9, Y9
  9563	VPXOR      Y15, Y9, Y9
  9564	VPSLLD     $0x07, Y10, Y15
  9565	VPSRLD     $0x19, Y10, Y10
  9566	VPXOR      Y15, Y10, Y10
  9567	VPSLLD     $0x07, Y11, Y15
  9568	VPSRLD     $0x19, Y11, Y11
  9569	VPXOR      Y15, Y11, Y11
  9570	VMOVDQA    224(BP), Y15
  9571	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows within each 128-bit lane (4, 8, 12 bytes) so the next quarter-rounds act on diagonals
  9572	VPALIGNR   $0x04, Y9, Y9, Y9
  9573	VPALIGNR   $0x04, Y10, Y10, Y10
  9574	VPALIGNR   $0x04, Y11, Y11, Y11
  9575	VPALIGNR   $0x08, Y12, Y12, Y12
  9576	VPALIGNR   $0x08, Y13, Y13, Y13
  9577	VPALIGNR   $0x08, Y8, Y8, Y8
  9578	VPALIGNR   $0x08, Y15, Y15, Y15
  9579	VPALIGNR   $0x0c, Y4, Y4, Y4
  9580	VPALIGNR   $0x0c, Y1, Y1, Y1
  9581	VPALIGNR   $0x0c, Y2, Y2, Y2
  9582	VPALIGNR   $0x0c, Y3, Y3, Y3
  9583	VPADDD     Y14, Y0, Y0 // same quarter-round sequence again, now on the rotated rows
  9584	VPADDD     Y9, Y5, Y5
  9585	VPADDD     Y10, Y6, Y6
  9586	VPADDD     Y11, Y7, Y7
  9587	VPXOR      Y0, Y4, Y4
  9588	VPXOR      Y5, Y1, Y1
  9589	VPXOR      Y6, Y2, Y2
  9590	VPXOR      Y7, Y3, Y3
  9591	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
  9592	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
  9593	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
  9594	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
  9595	VPADDD     Y4, Y12, Y12
  9596	VPADDD     Y1, Y13, Y13
  9597	VPADDD     Y2, Y8, Y8
  9598	VPADDD     Y3, Y15, Y15
  9599	VPXOR      Y12, Y14, Y14
  9600	VPXOR      Y13, Y9, Y9
  9601	VPXOR      Y8, Y10, Y10
  9602	VPXOR      Y15, Y11, Y11
  9603	ADDQ       16(DI), R10 // absorb the next 16 bytes, at 16(DI); same MULX multiply and reduce as above, placed before the rotate-by-12 step
  9604	ADCQ       24(DI), R11
  9605	ADCQ       $0x01, R12
  9606	MOVQ       (BP), DX
  9607	MOVQ       DX, R15
  9608	MULXQ      R10, R13, R14
  9609	IMULQ      R12, R15
  9610	MULXQ      R11, AX, DX
  9611	ADDQ       AX, R14
  9612	ADCQ       DX, R15
  9613	MOVQ       8(BP), DX
  9614	MULXQ      R10, R10, AX
  9615	ADDQ       R10, R14
  9616	MULXQ      R11, R11, R8
  9617	ADCQ       R11, R15
  9618	ADCQ       $0x00, R8
  9619	IMULQ      R12, DX
  9620	ADDQ       AX, R15
  9621	ADCQ       DX, R8
  9622	MOVQ       R13, R10
  9623	MOVQ       R14, R11
  9624	MOVQ       R15, R12
  9625	ANDQ       $0x03, R12
  9626	MOVQ       R15, R13
  9627	ANDQ       $-4, R13
  9628	MOVQ       R8, R14
  9629	SHRQ       $0x02, R8, R15
  9630	SHRQ       $0x02, R8
  9631	ADDQ       R13, R10
  9632	ADCQ       R14, R11
  9633	ADCQ       $0x00, R12
  9634	ADDQ       R15, R10
  9635	ADCQ       R8, R11
  9636	ADCQ       $0x00, R12
  9637	LEAQ       32(DI), DI // step past the 32 bytes hashed in this pass
  9638	VMOVDQA    Y15, 224(BP) // spill Y15; rotate-by-12 step of the diagonal quarter-round
  9639	VPSLLD     $0x0c, Y14, Y15
  9640	VPSRLD     $0x14, Y14, Y14
  9641	VPXOR      Y15, Y14, Y14
  9642	VPSLLD     $0x0c, Y9, Y15
  9643	VPSRLD     $0x14, Y9, Y9
  9644	VPXOR      Y15, Y9, Y9
  9645	VPSLLD     $0x0c, Y10, Y15
  9646	VPSRLD     $0x14, Y10, Y10
  9647	VPXOR      Y15, Y10, Y10
  9648	VPSLLD     $0x0c, Y11, Y15
  9649	VPSRLD     $0x14, Y11, Y11
  9650	VPXOR      Y15, Y11, Y11
  9651	VMOVDQA    224(BP), Y15
  9652	VPADDD     Y14, Y0, Y0
  9653	VPADDD     Y9, Y5, Y5
  9654	VPADDD     Y10, Y6, Y6
  9655	VPADDD     Y11, Y7, Y7
  9656	VPXOR      Y0, Y4, Y4
  9657	VPXOR      Y5, Y1, Y1
  9658	VPXOR      Y6, Y2, Y2
  9659	VPXOR      Y7, Y3, Y3
  9660	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
  9661	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
  9662	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
  9663	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
  9664	VPADDD     Y4, Y12, Y12
  9665	VPADDD     Y1, Y13, Y13
  9666	VPADDD     Y2, Y8, Y8
  9667	VPADDD     Y3, Y15, Y15
  9668	VPXOR      Y12, Y14, Y14
  9669	VPXOR      Y13, Y9, Y9
  9670	VPXOR      Y8, Y10, Y10
  9671	VPXOR      Y15, Y11, Y11
  9672	VMOVDQA    Y15, 224(BP) // spill Y15; rotate-by-7 step
  9673	VPSLLD     $0x07, Y14, Y15
  9674	VPSRLD     $0x19, Y14, Y14
  9675	VPXOR      Y15, Y14, Y14
  9676	VPSLLD     $0x07, Y9, Y15
  9677	VPSRLD     $0x19, Y9, Y9
  9678	VPXOR      Y15, Y9, Y9
  9679	VPSLLD     $0x07, Y10, Y15
  9680	VPSRLD     $0x19, Y10, Y10
  9681	VPXOR      Y15, Y10, Y10
  9682	VPSLLD     $0x07, Y11, Y15
  9683	VPSRLD     $0x19, Y11, Y11
  9684	VPXOR      Y15, Y11, Y11
  9685	VMOVDQA    224(BP), Y15
  9686	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the lane rotation (12, 8, 4 bytes)
  9687	VPALIGNR   $0x0c, Y9, Y9, Y9
  9688	VPALIGNR   $0x0c, Y10, Y10, Y10
  9689	VPALIGNR   $0x0c, Y11, Y11, Y11
  9690	VPALIGNR   $0x08, Y12, Y12, Y12
  9691	VPALIGNR   $0x08, Y13, Y13, Y13
  9692	VPALIGNR   $0x08, Y8, Y8, Y8
  9693	VPALIGNR   $0x08, Y15, Y15, Y15
  9694	VPALIGNR   $0x04, Y4, Y4, Y4
  9695	VPALIGNR   $0x04, Y1, Y1, Y1
  9696	VPALIGNR   $0x04, Y2, Y2, Y2
  9697	VPALIGNR   $0x04, Y3, Y3, Y3
  9698	DECQ       CX // CX and R9 are loop counters initialised before sealAVX2Tail512 is reached (not visible here)
  9699	JG         sealAVX2Tail512LoopA // while CX stays positive, take the path that hashes an extra 16 bytes first
  9700	DECQ       R9
  9701	JGE        sealAVX2Tail512LoopB // afterwards repeat this pass until R9 goes negative
  9702	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // add the starting state back in: constants row ...
  9703	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
  9704	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
  9705	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
  9706	VPADDD     32(BP), Y14, Y14 // ... the rows stored at 32(BP) and 64(BP) ...
  9707	VPADDD     32(BP), Y9, Y9
  9708	VPADDD     32(BP), Y10, Y10
  9709	VPADDD     32(BP), Y11, Y11
  9710	VPADDD     64(BP), Y12, Y12
  9711	VPADDD     64(BP), Y13, Y13
  9712	VPADDD     64(BP), Y8, Y8
  9713	VPADDD     64(BP), Y15, Y15
  9714	VPADDD     96(BP), Y4, Y4 // ... and the four counter rows saved at 96..192(BP) by sealAVX2Tail512
  9715	VPADDD     128(BP), Y1, Y1
  9716	VPADDD     160(BP), Y2, Y2
  9717	VPADDD     192(BP), Y3, Y3
  9718	VMOVDQA    Y15, 224(BP) // park the fourth state's Y15 row at 224(BP); Y15 becomes the output scratch register
  9719	VPERM2I128 $0x02, Y0, Y14, Y15 // first state, one 32-byte piece at a time: $0x02 pairs the low 128-bit lanes, $0x13 the high lanes
  9720	VPXOR      (SI), Y15, Y15
  9721	VMOVDQU    Y15, (DI)
  9722	VPERM2I128 $0x02, Y12, Y4, Y15
  9723	VPXOR      32(SI), Y15, Y15
  9724	VMOVDQU    Y15, 32(DI)
  9725	VPERM2I128 $0x13, Y0, Y14, Y15
  9726	VPXOR      64(SI), Y15, Y15
  9727	VMOVDQU    Y15, 64(DI)
  9728	VPERM2I128 $0x13, Y12, Y4, Y15
  9729	VPXOR      96(SI), Y15, Y15
  9730	VMOVDQU    Y15, 96(DI)
  9731	VPERM2I128 $0x02, Y5, Y9, Y0 // second state: input bytes 128..255 (Y0, Y14, Y12, Y4 are free to reuse now)
  9732	VPERM2I128 $0x02, Y13, Y1, Y14
  9733	VPERM2I128 $0x13, Y5, Y9, Y12
  9734	VPERM2I128 $0x13, Y13, Y1, Y4
  9735	VPXOR      128(SI), Y0, Y0
  9736	VPXOR      160(SI), Y14, Y14
  9737	VPXOR      192(SI), Y12, Y12
  9738	VPXOR      224(SI), Y4, Y4
  9739	VMOVDQU    Y0, 128(DI)
  9740	VMOVDQU    Y14, 160(DI)
  9741	VMOVDQU    Y12, 192(DI)
  9742	VMOVDQU    Y4, 224(DI)
  9743	VPERM2I128 $0x02, Y6, Y10, Y0 // third state: input bytes 256..383
  9744	VPERM2I128 $0x02, Y8, Y2, Y14
  9745	VPERM2I128 $0x13, Y6, Y10, Y12
  9746	VPERM2I128 $0x13, Y8, Y2, Y4
  9747	VPXOR      256(SI), Y0, Y0
  9748	VPXOR      288(SI), Y14, Y14
  9749	VPXOR      320(SI), Y12, Y12
  9750	VPXOR      352(SI), Y4, Y4
  9751	VMOVDQU    Y0, 256(DI)
  9752	VMOVDQU    Y14, 288(DI)
  9753	VMOVDQU    Y12, 320(DI)
  9754	VMOVDQU    Y4, 352(DI)
  9755	MOVQ       $0x00000180, CX // CX = 384, the number of bytes just written; presumably what sealAVX2SealHash still has to hash (TODO confirm)
  9756	LEAQ       384(SI), SI // input pointer moves past the 384 bytes consumed; DI is not advanced here
  9757	SUBQ       $0x00000180, BX // BX reduced by the same 384; presumably the remaining length
  9758	VPERM2I128 $0x02, Y7, Y11, Y0 // leave the fourth state de-interleaved in Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash
  9759	VPERM2I128 $0x02, 224(BP), Y3, Y14 // its third row is read back from the 224(BP) slot it was parked in
  9760	VPERM2I128 $0x13, Y7, Y11, Y12
  9761	VPERM2I128 $0x13, 224(BP), Y3, Y4
  9762	JMP        sealAVX2SealHash

View as plain text