1// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
2
3//go:build gc && !purego
4
5#include "textflag.h"
6
// func polyHashADInternal<>()
//
// Poly1305-hash the additional data (AAD) into the running accumulator.
//
// Register contract (shared with the callers in this file):
//   CX          = pointer to the AAD bytes
//   R9          = AAD length in bytes
//   (BP), 8(BP) = r0, r1: the clamped "r" half of the Poly1305 key
//   R10:R11:R12 = 130-bit accumulator limbs h0:h1:h2 (updated in place)
// Clobbers: AX, DX, R8, R13, R14, R15, flags.
//
// Per RFC 8439 the AAD is zero-padded to a 16-byte boundary, so every
// block (full or short tail) gets the 2^128 bit added before the
// multiply — hence the ADCQ $1 into h2 on each block below.
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// Hack: Must declare #define macros inside of a function due to Avo constraints
	// ROL rotates the uint32s in register R left by N bits, using temporary T.
	#define ROL(N, R, T) \
	MOVO R, T; \
	PSLLL $(N), T; \
	PSRLL $(32-(N)), R; \
	PXOR T, R

	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
	#ifdef GOAMD64_v2
	#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
	#else
	#define ROL8(R, T) ROL(8, R, T)
	#endif

	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
	#ifdef GOAMD64_v2
	#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
	#else
	#define ROL16(R, T) ROL(16, R, T)
	#endif

	// h = 0
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

	// Fast path: AAD of exactly 13 bytes (a common AEAD header size);
	// any other length takes the generic loop below.
	CMPQ R9, $0x0d
	JNE hashADLoop

	// h0 = ad[0:8]; h1 = ad[5:13] >> 24, i.e. ad[8:13] (the overlapping
	// load avoids reading past the 13 bytes); h2 = the 2^128 pad bit.
	MOVQ (CX), R10
	MOVQ 5(CX), R11
	SHRQ $0x18, R11
	MOVQ $0x00000001, R12

	// h = (h * r) mod 2^130 - 5.
	// Schoolbook multiply of h0:h1:h2 (R10:R11:R12) by r0:r1 ((BP):8(BP))
	// into the product limbs t0:t1:t2:t3 = R13:R14:R15:R8.
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10                // t1:t0 = r0*h0
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11                // += r0*h1 << 64
	IMULQ R12, R15          // r0*h2 (h2 is tiny; no high half needed)
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10                // += r1*h0 << 64
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11                // += r1*h1 << 128
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8           // r1*h2
	ADDQ R10, R15
	ADCQ DX, R8
	// Reduce: fold everything at and above bit 130 back in times 5
	// (2^130 ≡ 5 mod 2^130-5), computed as 4c + c with c = (t3:t2) >> 2.
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12         // h2 = bits 128..129 of the product
	MOVQ R15, R13
	ANDQ $-4, R13           // 4c, low limb (t2 with its low 2 bits cleared)
	MOVQ R8, R14            // 4c, high limb (= t3)
	SHRQ $0x02, R8, R15     // c, low limb
	SHRQ $0x02, R8          // c, high limb
	ADDQ R13, R10           // h += 4c
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10           // h += c
	ADCQ R8, R11
	ADCQ $0x00, R12
	RET

hashADLoop:
	// Hash in 16 byte chunks
	CMPQ R9, $0x10
	JB hashADTail
	// h += block || 2^128 pad bit
	ADDQ (CX), R10
	ADCQ 8(CX), R11
	ADCQ $0x01, R12
	LEAQ 16(CX), CX
	SUBQ $0x10, R9
	// h = (h * r) mod 2^130 - 5 — same multiply-and-reduce as above.
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	JMP hashADLoop

hashADTail:
	CMPQ R9, $0x00
	JE hashADDone

	// Hash last < 16 byte tail
	// Assemble the 1..15 remaining bytes into R14:R13 by walking the tail
	// backwards, shifting the 128-bit value left 8 bits per byte; the
	// implicit zero padding comes from starting the limbs at zero.
	XORQ R13, R13
	XORQ R14, R14
	XORQ R15, R15
	ADDQ R9, CX             // CX = one past the last AAD byte

hashADTailLoop:
	SHLQ $0x08, R13, R14    // (R14:R13) <<= 8
	SHLQ $0x08, R13
	MOVB -1(CX), R15        // next byte, reading backwards
	XORQ R15, R13
	DECQ CX
	DECQ R9
	JNE hashADTailLoop
	// h += padded tail block || 2^128 pad bit, then h *= r (mod 2^130-5).
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12

hashADDone:
	RET
190
191// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
192// Requires: AVX, AVX2, BMI2, CMOV, SSE2
193TEXT ·chacha20Poly1305Open(SB), $288-97
194 // For aligned stack access
195 MOVQ SP, BP
196 ADDQ $0x20, BP
197 ANDQ $-32, BP
198 MOVQ dst_base+0(FP), DI
199 MOVQ key_base+24(FP), R8
200 MOVQ src_base+48(FP), SI
201 MOVQ src_len+56(FP), BX
202 MOVQ ad_base+72(FP), CX
203
204 // Check for AVX2 support
205 CMPB ·useAVX2+0(SB), $0x01
206 JE chacha20Poly1305Open_AVX2
207
208 // Special optimization, for very short buffers
209 CMPQ BX, $0x80
210 JBE openSSE128
211
212 // For long buffers, prepare the poly key first
213 MOVOU ·chacha20Constants<>+0(SB), X0
214 MOVOU 16(R8), X3
215 MOVOU 32(R8), X6
216 MOVOU 48(R8), X9
217 MOVO X9, X13
218
219 // Store state on stack for future use
220 MOVO X3, 32(BP)
221 MOVO X6, 48(BP)
222 MOVO X9, 128(BP)
223 MOVQ $0x0000000a, R9
224
225openSSEPreparePolyKey:
226 PADDD X3, X0
227 PXOR X0, X9
228 ROL16(X9, X12)
229 PADDD X9, X6
230 PXOR X6, X3
231 MOVO X3, X12
232 PSLLL $0x0c, X12
233 PSRLL $0x14, X3
234 PXOR X12, X3
235 PADDD X3, X0
236 PXOR X0, X9
237 ROL8(X9, X12)
238 PADDD X9, X6
239 PXOR X6, X3
240 MOVO X3, X12
241 PSLLL $0x07, X12
242 PSRLL $0x19, X3
243 PXOR X12, X3
244 BYTE $0x66
245 BYTE $0x0f
246 BYTE $0x3a
247 BYTE $0x0f
248 BYTE $0xdb
249 BYTE $0x04
250 BYTE $0x66
251 BYTE $0x0f
252 BYTE $0x3a
253 BYTE $0x0f
254 BYTE $0xf6
255 BYTE $0x08
256 BYTE $0x66
257 BYTE $0x45
258 BYTE $0x0f
259 BYTE $0x3a
260 BYTE $0x0f
261 BYTE $0xc9
262 BYTE $0x0c
263 PADDD X3, X0
264 PXOR X0, X9
265 ROL16(X9, X12)
266 PADDD X9, X6
267 PXOR X6, X3
268 MOVO X3, X12
269 PSLLL $0x0c, X12
270 PSRLL $0x14, X3
271 PXOR X12, X3
272 PADDD X3, X0
273 PXOR X0, X9
274 ROL8(X9, X12)
275 PADDD X9, X6
276 PXOR X6, X3
277 MOVO X3, X12
278 PSLLL $0x07, X12
279 PSRLL $0x19, X3
280 PXOR X12, X3
281 BYTE $0x66
282 BYTE $0x0f
283 BYTE $0x3a
284 BYTE $0x0f
285 BYTE $0xdb
286 BYTE $0x0c
287 BYTE $0x66
288 BYTE $0x0f
289 BYTE $0x3a
290 BYTE $0x0f
291 BYTE $0xf6
292 BYTE $0x08
293 BYTE $0x66
294 BYTE $0x45
295 BYTE $0x0f
296 BYTE $0x3a
297 BYTE $0x0f
298 BYTE $0xc9
299 BYTE $0x04
300 DECQ R9
301 JNE openSSEPreparePolyKey
302
303 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
304 PADDL ·chacha20Constants<>+0(SB), X0
305 PADDL 32(BP), X3
306
307 // Clamp and store the key
308 PAND ·polyClampMask<>+0(SB), X0
309 MOVO X0, (BP)
310 MOVO X3, 16(BP)
311
312 // Hash AAD
313 MOVQ ad_len+80(FP), R9
314 CALL polyHashADInternal<>(SB)
315
316openSSEMainLoop:
317 CMPQ BX, $0x00000100
318 JB openSSEMainLoopDone
319
320 // Load state, increment counter blocks
321 MOVO ·chacha20Constants<>+0(SB), X0
322 MOVO 32(BP), X3
323 MOVO 48(BP), X6
324 MOVO 128(BP), X9
325 PADDL ·sseIncMask<>+0(SB), X9
326 MOVO X0, X1
327 MOVO X3, X4
328 MOVO X6, X7
329 MOVO X9, X10
330 PADDL ·sseIncMask<>+0(SB), X10
331 MOVO X1, X2
332 MOVO X4, X5
333 MOVO X7, X8
334 MOVO X10, X11
335 PADDL ·sseIncMask<>+0(SB), X11
336 MOVO X2, X12
337 MOVO X5, X13
338 MOVO X8, X14
339 MOVO X11, X15
340 PADDL ·sseIncMask<>+0(SB), X15
341
342 // Store counters
343 MOVO X9, 80(BP)
344 MOVO X10, 96(BP)
345 MOVO X11, 112(BP)
346 MOVO X15, 128(BP)
347
348 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
349 // 2 blocks, and for the remaining 4 only 1 block - for a total of 16
350 MOVQ $0x00000004, CX
351 MOVQ SI, R9
352
353openSSEInternalLoop:
354 MOVO X14, 64(BP)
355 PADDD X3, X0
356 PXOR X0, X9
357 ROL16(X9, X14)
358 PADDD X9, X6
359 PXOR X6, X3
360 MOVO X3, X14
361 PSLLL $0x0c, X14
362 PSRLL $0x14, X3
363 PXOR X14, X3
364 PADDD X3, X0
365 PXOR X0, X9
366 ROL8(X9, X14)
367 PADDD X9, X6
368 PXOR X6, X3
369 MOVO X3, X14
370 PSLLL $0x07, X14
371 PSRLL $0x19, X3
372 PXOR X14, X3
373 PADDD X4, X1
374 PXOR X1, X10
375 ROL16(X10, X14)
376 PADDD X10, X7
377 PXOR X7, X4
378 MOVO X4, X14
379 PSLLL $0x0c, X14
380 PSRLL $0x14, X4
381 PXOR X14, X4
382 PADDD X4, X1
383 PXOR X1, X10
384 ROL8(X10, X14)
385 PADDD X10, X7
386 PXOR X7, X4
387 MOVO X4, X14
388 PSLLL $0x07, X14
389 PSRLL $0x19, X4
390 PXOR X14, X4
391 PADDD X5, X2
392 PXOR X2, X11
393 ROL16(X11, X14)
394 PADDD X11, X8
395 PXOR X8, X5
396 MOVO X5, X14
397 PSLLL $0x0c, X14
398 PSRLL $0x14, X5
399 PXOR X14, X5
400 PADDD X5, X2
401 PXOR X2, X11
402 ROL8(X11, X14)
403 PADDD X11, X8
404 PXOR X8, X5
405 MOVO X5, X14
406 PSLLL $0x07, X14
407 PSRLL $0x19, X5
408 PXOR X14, X5
409 MOVO 64(BP), X14
410 MOVO X7, 64(BP)
411 PADDD X13, X12
412 PXOR X12, X15
413 ROL16(X15, X7)
414 PADDD X15, X14
415 PXOR X14, X13
416 MOVO X13, X7
417 PSLLL $0x0c, X7
418 PSRLL $0x14, X13
419 PXOR X7, X13
420 PADDD X13, X12
421 PXOR X12, X15
422 ROL8(X15, X7)
423 PADDD X15, X14
424 PXOR X14, X13
425 MOVO X13, X7
426 PSLLL $0x07, X7
427 PSRLL $0x19, X13
428 PXOR X7, X13
429 MOVO 64(BP), X7
430 ADDQ (R9), R10
431 ADCQ 8(R9), R11
432 ADCQ $0x01, R12
433 BYTE $0x66
434 BYTE $0x0f
435 BYTE $0x3a
436 BYTE $0x0f
437 BYTE $0xdb
438 BYTE $0x04
439 BYTE $0x66
440 BYTE $0x0f
441 BYTE $0x3a
442 BYTE $0x0f
443 BYTE $0xe4
444 BYTE $0x04
445 BYTE $0x66
446 BYTE $0x0f
447 BYTE $0x3a
448 BYTE $0x0f
449 BYTE $0xed
450 BYTE $0x04
451 BYTE $0x66
452 BYTE $0x45
453 BYTE $0x0f
454 BYTE $0x3a
455 BYTE $0x0f
456 BYTE $0xed
457 BYTE $0x04
458 BYTE $0x66
459 BYTE $0x0f
460 BYTE $0x3a
461 BYTE $0x0f
462 BYTE $0xf6
463 BYTE $0x08
464 BYTE $0x66
465 BYTE $0x0f
466 BYTE $0x3a
467 BYTE $0x0f
468 BYTE $0xff
469 BYTE $0x08
470 BYTE $0x66
471 BYTE $0x45
472 BYTE $0x0f
473 BYTE $0x3a
474 BYTE $0x0f
475 BYTE $0xc0
476 BYTE $0x08
477 BYTE $0x66
478 BYTE $0x45
479 BYTE $0x0f
480 BYTE $0x3a
481 BYTE $0x0f
482 BYTE $0xf6
483 BYTE $0x08
484 BYTE $0x66
485 BYTE $0x45
486 BYTE $0x0f
487 BYTE $0x3a
488 BYTE $0x0f
489 BYTE $0xc9
490 BYTE $0x0c
491 BYTE $0x66
492 BYTE $0x45
493 BYTE $0x0f
494 BYTE $0x3a
495 BYTE $0x0f
496 BYTE $0xd2
497 BYTE $0x0c
498 BYTE $0x66
499 BYTE $0x45
500 BYTE $0x0f
501 BYTE $0x3a
502 BYTE $0x0f
503 BYTE $0xdb
504 BYTE $0x0c
505 BYTE $0x66
506 BYTE $0x45
507 BYTE $0x0f
508 BYTE $0x3a
509 BYTE $0x0f
510 BYTE $0xff
511 BYTE $0x0c
512 MOVQ (BP), AX
513 MOVQ AX, R15
514 MULQ R10
515 MOVQ AX, R13
516 MOVQ DX, R14
517 MOVQ (BP), AX
518 MULQ R11
519 IMULQ R12, R15
520 ADDQ AX, R14
521 ADCQ DX, R15
522 MOVQ 8(BP), AX
523 MOVQ AX, R8
524 MULQ R10
525 ADDQ AX, R14
526 ADCQ $0x00, DX
527 MOVQ DX, R10
528 MOVQ 8(BP), AX
529 MULQ R11
530 ADDQ AX, R15
531 ADCQ $0x00, DX
532 LEAQ 16(R9), R9
533 MOVO X14, 64(BP)
534 PADDD X3, X0
535 PXOR X0, X9
536 ROL16(X9, X14)
537 PADDD X9, X6
538 PXOR X6, X3
539 MOVO X3, X14
540 PSLLL $0x0c, X14
541 PSRLL $0x14, X3
542 PXOR X14, X3
543 PADDD X3, X0
544 PXOR X0, X9
545 ROL8(X9, X14)
546 PADDD X9, X6
547 PXOR X6, X3
548 MOVO X3, X14
549 PSLLL $0x07, X14
550 PSRLL $0x19, X3
551 PXOR X14, X3
552 PADDD X4, X1
553 PXOR X1, X10
554 ROL16(X10, X14)
555 PADDD X10, X7
556 PXOR X7, X4
557 MOVO X4, X14
558 PSLLL $0x0c, X14
559 PSRLL $0x14, X4
560 PXOR X14, X4
561 PADDD X4, X1
562 PXOR X1, X10
563 ROL8(X10, X14)
564 PADDD X10, X7
565 PXOR X7, X4
566 MOVO X4, X14
567 PSLLL $0x07, X14
568 PSRLL $0x19, X4
569 PXOR X14, X4
570 PADDD X5, X2
571 PXOR X2, X11
572 ROL16(X11, X14)
573 PADDD X11, X8
574 PXOR X8, X5
575 MOVO X5, X14
576 PSLLL $0x0c, X14
577 PSRLL $0x14, X5
578 PXOR X14, X5
579 PADDD X5, X2
580 PXOR X2, X11
581 ROL8(X11, X14)
582 PADDD X11, X8
583 PXOR X8, X5
584 MOVO X5, X14
585 PSLLL $0x07, X14
586 PSRLL $0x19, X5
587 PXOR X14, X5
588 MOVO 64(BP), X14
589 MOVO X7, 64(BP)
590 IMULQ R12, R8
591 ADDQ R10, R15
592 ADCQ DX, R8
593 PADDD X13, X12
594 PXOR X12, X15
595 ROL16(X15, X7)
596 PADDD X15, X14
597 PXOR X14, X13
598 MOVO X13, X7
599 PSLLL $0x0c, X7
600 PSRLL $0x14, X13
601 PXOR X7, X13
602 PADDD X13, X12
603 PXOR X12, X15
604 ROL8(X15, X7)
605 PADDD X15, X14
606 PXOR X14, X13
607 MOVO X13, X7
608 PSLLL $0x07, X7
609 PSRLL $0x19, X13
610 PXOR X7, X13
611 MOVO 64(BP), X7
612 MOVQ R13, R10
613 MOVQ R14, R11
614 MOVQ R15, R12
615 ANDQ $0x03, R12
616 MOVQ R15, R13
617 ANDQ $-4, R13
618 MOVQ R8, R14
619 SHRQ $0x02, R8, R15
620 SHRQ $0x02, R8
621 ADDQ R13, R10
622 ADCQ R14, R11
623 ADCQ $0x00, R12
624 ADDQ R15, R10
625 ADCQ R8, R11
626 ADCQ $0x00, R12
627 BYTE $0x66
628 BYTE $0x0f
629 BYTE $0x3a
630 BYTE $0x0f
631 BYTE $0xdb
632 BYTE $0x0c
633 BYTE $0x66
634 BYTE $0x0f
635 BYTE $0x3a
636 BYTE $0x0f
637 BYTE $0xe4
638 BYTE $0x0c
639 BYTE $0x66
640 BYTE $0x0f
641 BYTE $0x3a
642 BYTE $0x0f
643 BYTE $0xed
644 BYTE $0x0c
645 BYTE $0x66
646 BYTE $0x45
647 BYTE $0x0f
648 BYTE $0x3a
649 BYTE $0x0f
650 BYTE $0xed
651 BYTE $0x0c
652 BYTE $0x66
653 BYTE $0x0f
654 BYTE $0x3a
655 BYTE $0x0f
656 BYTE $0xf6
657 BYTE $0x08
658 BYTE $0x66
659 BYTE $0x0f
660 BYTE $0x3a
661 BYTE $0x0f
662 BYTE $0xff
663 BYTE $0x08
664 BYTE $0x66
665 BYTE $0x45
666 BYTE $0x0f
667 BYTE $0x3a
668 BYTE $0x0f
669 BYTE $0xc0
670 BYTE $0x08
671 BYTE $0x66
672 BYTE $0x45
673 BYTE $0x0f
674 BYTE $0x3a
675 BYTE $0x0f
676 BYTE $0xf6
677 BYTE $0x08
678 BYTE $0x66
679 BYTE $0x45
680 BYTE $0x0f
681 BYTE $0x3a
682 BYTE $0x0f
683 BYTE $0xc9
684 BYTE $0x04
685 BYTE $0x66
686 BYTE $0x45
687 BYTE $0x0f
688 BYTE $0x3a
689 BYTE $0x0f
690 BYTE $0xd2
691 BYTE $0x04
692 BYTE $0x66
693 BYTE $0x45
694 BYTE $0x0f
695 BYTE $0x3a
696 BYTE $0x0f
697 BYTE $0xdb
698 BYTE $0x04
699 BYTE $0x66
700 BYTE $0x45
701 BYTE $0x0f
702 BYTE $0x3a
703 BYTE $0x0f
704 BYTE $0xff
705 BYTE $0x04
706 DECQ CX
707 JGE openSSEInternalLoop
708 ADDQ (R9), R10
709 ADCQ 8(R9), R11
710 ADCQ $0x01, R12
711 MOVQ (BP), AX
712 MOVQ AX, R15
713 MULQ R10
714 MOVQ AX, R13
715 MOVQ DX, R14
716 MOVQ (BP), AX
717 MULQ R11
718 IMULQ R12, R15
719 ADDQ AX, R14
720 ADCQ DX, R15
721 MOVQ 8(BP), AX
722 MOVQ AX, R8
723 MULQ R10
724 ADDQ AX, R14
725 ADCQ $0x00, DX
726 MOVQ DX, R10
727 MOVQ 8(BP), AX
728 MULQ R11
729 ADDQ AX, R15
730 ADCQ $0x00, DX
731 IMULQ R12, R8
732 ADDQ R10, R15
733 ADCQ DX, R8
734 MOVQ R13, R10
735 MOVQ R14, R11
736 MOVQ R15, R12
737 ANDQ $0x03, R12
738 MOVQ R15, R13
739 ANDQ $-4, R13
740 MOVQ R8, R14
741 SHRQ $0x02, R8, R15
742 SHRQ $0x02, R8
743 ADDQ R13, R10
744 ADCQ R14, R11
745 ADCQ $0x00, R12
746 ADDQ R15, R10
747 ADCQ R8, R11
748 ADCQ $0x00, R12
749 LEAQ 16(R9), R9
750 CMPQ CX, $-6
751 JG openSSEInternalLoop
752
753 // Add in the state
754 PADDD ·chacha20Constants<>+0(SB), X0
755 PADDD ·chacha20Constants<>+0(SB), X1
756 PADDD ·chacha20Constants<>+0(SB), X2
757 PADDD ·chacha20Constants<>+0(SB), X12
758 PADDD 32(BP), X3
759 PADDD 32(BP), X4
760 PADDD 32(BP), X5
761 PADDD 32(BP), X13
762 PADDD 48(BP), X6
763 PADDD 48(BP), X7
764 PADDD 48(BP), X8
765 PADDD 48(BP), X14
766 PADDD 80(BP), X9
767 PADDD 96(BP), X10
768 PADDD 112(BP), X11
769 PADDD 128(BP), X15
770
771 // Load - xor - store
772 MOVO X15, 64(BP)
773 MOVOU (SI), X15
774 PXOR X15, X0
775 MOVOU X0, (DI)
776 MOVOU 16(SI), X15
777 PXOR X15, X3
778 MOVOU X3, 16(DI)
779 MOVOU 32(SI), X15
780 PXOR X15, X6
781 MOVOU X6, 32(DI)
782 MOVOU 48(SI), X15
783 PXOR X15, X9
784 MOVOU X9, 48(DI)
785 MOVOU 64(SI), X9
786 PXOR X9, X1
787 MOVOU X1, 64(DI)
788 MOVOU 80(SI), X9
789 PXOR X9, X4
790 MOVOU X4, 80(DI)
791 MOVOU 96(SI), X9
792 PXOR X9, X7
793 MOVOU X7, 96(DI)
794 MOVOU 112(SI), X9
795 PXOR X9, X10
796 MOVOU X10, 112(DI)
797 MOVOU 128(SI), X9
798 PXOR X9, X2
799 MOVOU X2, 128(DI)
800 MOVOU 144(SI), X9
801 PXOR X9, X5
802 MOVOU X5, 144(DI)
803 MOVOU 160(SI), X9
804 PXOR X9, X8
805 MOVOU X8, 160(DI)
806 MOVOU 176(SI), X9
807 PXOR X9, X11
808 MOVOU X11, 176(DI)
809 MOVOU 192(SI), X9
810 PXOR X9, X12
811 MOVOU X12, 192(DI)
812 MOVOU 208(SI), X9
813 PXOR X9, X13
814 MOVOU X13, 208(DI)
815 MOVOU 224(SI), X9
816 PXOR X9, X14
817 MOVOU X14, 224(DI)
818 MOVOU 240(SI), X9
819 PXOR 64(BP), X9
820 MOVOU X9, 240(DI)
821 LEAQ 256(SI), SI
822 LEAQ 256(DI), DI
823 SUBQ $0x00000100, BX
824 JMP openSSEMainLoop
825
826openSSEMainLoopDone:
827 // Handle the various tail sizes efficiently
828 TESTQ BX, BX
829 JE openSSEFinalize
830 CMPQ BX, $0x40
831 JBE openSSETail64
832 CMPQ BX, $0x80
833 JBE openSSETail128
834 CMPQ BX, $0xc0
835 JBE openSSETail192
836 JMP openSSETail256
837
838openSSEFinalize:
839 // Hash in the PT, AAD lengths
840 ADDQ ad_len+80(FP), R10
841 ADCQ src_len+56(FP), R11
842 ADCQ $0x01, R12
843 MOVQ (BP), AX
844 MOVQ AX, R15
845 MULQ R10
846 MOVQ AX, R13
847 MOVQ DX, R14
848 MOVQ (BP), AX
849 MULQ R11
850 IMULQ R12, R15
851 ADDQ AX, R14
852 ADCQ DX, R15
853 MOVQ 8(BP), AX
854 MOVQ AX, R8
855 MULQ R10
856 ADDQ AX, R14
857 ADCQ $0x00, DX
858 MOVQ DX, R10
859 MOVQ 8(BP), AX
860 MULQ R11
861 ADDQ AX, R15
862 ADCQ $0x00, DX
863 IMULQ R12, R8
864 ADDQ R10, R15
865 ADCQ DX, R8
866 MOVQ R13, R10
867 MOVQ R14, R11
868 MOVQ R15, R12
869 ANDQ $0x03, R12
870 MOVQ R15, R13
871 ANDQ $-4, R13
872 MOVQ R8, R14
873 SHRQ $0x02, R8, R15
874 SHRQ $0x02, R8
875 ADDQ R13, R10
876 ADCQ R14, R11
877 ADCQ $0x00, R12
878 ADDQ R15, R10
879 ADCQ R8, R11
880 ADCQ $0x00, R12
881
882 // Final reduce
883 MOVQ R10, R13
884 MOVQ R11, R14
885 MOVQ R12, R15
886 SUBQ $-5, R10
887 SBBQ $-1, R11
888 SBBQ $0x03, R12
889 CMOVQCS R13, R10
890 CMOVQCS R14, R11
891 CMOVQCS R15, R12
892
893 // Add in the "s" part of the key
894 ADDQ 16(BP), R10
895 ADCQ 24(BP), R11
896
897 // Finally, constant time compare to the tag at the end of the message
898 XORQ AX, AX
899 MOVQ $0x00000001, DX
900 XORQ (SI), R10
901 XORQ 8(SI), R11
902 ORQ R11, R10
903 CMOVQEQ DX, AX
904
905 // Return true iff tags are equal
906 MOVB AX, ret+96(FP)
907 RET
908
909openSSE128:
910 MOVOU ·chacha20Constants<>+0(SB), X0
911 MOVOU 16(R8), X3
912 MOVOU 32(R8), X6
913 MOVOU 48(R8), X9
914 MOVO X0, X1
915 MOVO X3, X4
916 MOVO X6, X7
917 MOVO X9, X10
918 PADDL ·sseIncMask<>+0(SB), X10
919 MOVO X1, X2
920 MOVO X4, X5
921 MOVO X7, X8
922 MOVO X10, X11
923 PADDL ·sseIncMask<>+0(SB), X11
924 MOVO X3, X13
925 MOVO X6, X14
926 MOVO X10, X15
927 MOVQ $0x0000000a, R9
928
929openSSE128InnerCipherLoop:
930 PADDD X3, X0
931 PXOR X0, X9
932 ROL16(X9, X12)
933 PADDD X9, X6
934 PXOR X6, X3
935 MOVO X3, X12
936 PSLLL $0x0c, X12
937 PSRLL $0x14, X3
938 PXOR X12, X3
939 PADDD X3, X0
940 PXOR X0, X9
941 ROL8(X9, X12)
942 PADDD X9, X6
943 PXOR X6, X3
944 MOVO X3, X12
945 PSLLL $0x07, X12
946 PSRLL $0x19, X3
947 PXOR X12, X3
948 PADDD X4, X1
949 PXOR X1, X10
950 ROL16(X10, X12)
951 PADDD X10, X7
952 PXOR X7, X4
953 MOVO X4, X12
954 PSLLL $0x0c, X12
955 PSRLL $0x14, X4
956 PXOR X12, X4
957 PADDD X4, X1
958 PXOR X1, X10
959 ROL8(X10, X12)
960 PADDD X10, X7
961 PXOR X7, X4
962 MOVO X4, X12
963 PSLLL $0x07, X12
964 PSRLL $0x19, X4
965 PXOR X12, X4
966 PADDD X5, X2
967 PXOR X2, X11
968 ROL16(X11, X12)
969 PADDD X11, X8
970 PXOR X8, X5
971 MOVO X5, X12
972 PSLLL $0x0c, X12
973 PSRLL $0x14, X5
974 PXOR X12, X5
975 PADDD X5, X2
976 PXOR X2, X11
977 ROL8(X11, X12)
978 PADDD X11, X8
979 PXOR X8, X5
980 MOVO X5, X12
981 PSLLL $0x07, X12
982 PSRLL $0x19, X5
983 PXOR X12, X5
984 BYTE $0x66
985 BYTE $0x0f
986 BYTE $0x3a
987 BYTE $0x0f
988 BYTE $0xdb
989 BYTE $0x04
990 BYTE $0x66
991 BYTE $0x0f
992 BYTE $0x3a
993 BYTE $0x0f
994 BYTE $0xe4
995 BYTE $0x04
996 BYTE $0x66
997 BYTE $0x0f
998 BYTE $0x3a
999 BYTE $0x0f
1000 BYTE $0xed
1001 BYTE $0x04
1002 BYTE $0x66
1003 BYTE $0x0f
1004 BYTE $0x3a
1005 BYTE $0x0f
1006 BYTE $0xf6
1007 BYTE $0x08
1008 BYTE $0x66
1009 BYTE $0x0f
1010 BYTE $0x3a
1011 BYTE $0x0f
1012 BYTE $0xff
1013 BYTE $0x08
1014 BYTE $0x66
1015 BYTE $0x45
1016 BYTE $0x0f
1017 BYTE $0x3a
1018 BYTE $0x0f
1019 BYTE $0xc0
1020 BYTE $0x08
1021 BYTE $0x66
1022 BYTE $0x45
1023 BYTE $0x0f
1024 BYTE $0x3a
1025 BYTE $0x0f
1026 BYTE $0xc9
1027 BYTE $0x0c
1028 BYTE $0x66
1029 BYTE $0x45
1030 BYTE $0x0f
1031 BYTE $0x3a
1032 BYTE $0x0f
1033 BYTE $0xd2
1034 BYTE $0x0c
1035 BYTE $0x66
1036 BYTE $0x45
1037 BYTE $0x0f
1038 BYTE $0x3a
1039 BYTE $0x0f
1040 BYTE $0xdb
1041 BYTE $0x0c
1042 PADDD X3, X0
1043 PXOR X0, X9
1044 ROL16(X9, X12)
1045 PADDD X9, X6
1046 PXOR X6, X3
1047 MOVO X3, X12
1048 PSLLL $0x0c, X12
1049 PSRLL $0x14, X3
1050 PXOR X12, X3
1051 PADDD X3, X0
1052 PXOR X0, X9
1053 ROL8(X9, X12)
1054 PADDD X9, X6
1055 PXOR X6, X3
1056 MOVO X3, X12
1057 PSLLL $0x07, X12
1058 PSRLL $0x19, X3
1059 PXOR X12, X3
1060 PADDD X4, X1
1061 PXOR X1, X10
1062 ROL16(X10, X12)
1063 PADDD X10, X7
1064 PXOR X7, X4
1065 MOVO X4, X12
1066 PSLLL $0x0c, X12
1067 PSRLL $0x14, X4
1068 PXOR X12, X4
1069 PADDD X4, X1
1070 PXOR X1, X10
1071 ROL8(X10, X12)
1072 PADDD X10, X7
1073 PXOR X7, X4
1074 MOVO X4, X12
1075 PSLLL $0x07, X12
1076 PSRLL $0x19, X4
1077 PXOR X12, X4
1078 PADDD X5, X2
1079 PXOR X2, X11
1080 ROL16(X11, X12)
1081 PADDD X11, X8
1082 PXOR X8, X5
1083 MOVO X5, X12
1084 PSLLL $0x0c, X12
1085 PSRLL $0x14, X5
1086 PXOR X12, X5
1087 PADDD X5, X2
1088 PXOR X2, X11
1089 ROL8(X11, X12)
1090 PADDD X11, X8
1091 PXOR X8, X5
1092 MOVO X5, X12
1093 PSLLL $0x07, X12
1094 PSRLL $0x19, X5
1095 PXOR X12, X5
1096 BYTE $0x66
1097 BYTE $0x0f
1098 BYTE $0x3a
1099 BYTE $0x0f
1100 BYTE $0xdb
1101 BYTE $0x0c
1102 BYTE $0x66
1103 BYTE $0x0f
1104 BYTE $0x3a
1105 BYTE $0x0f
1106 BYTE $0xe4
1107 BYTE $0x0c
1108 BYTE $0x66
1109 BYTE $0x0f
1110 BYTE $0x3a
1111 BYTE $0x0f
1112 BYTE $0xed
1113 BYTE $0x0c
1114 BYTE $0x66
1115 BYTE $0x0f
1116 BYTE $0x3a
1117 BYTE $0x0f
1118 BYTE $0xf6
1119 BYTE $0x08
1120 BYTE $0x66
1121 BYTE $0x0f
1122 BYTE $0x3a
1123 BYTE $0x0f
1124 BYTE $0xff
1125 BYTE $0x08
1126 BYTE $0x66
1127 BYTE $0x45
1128 BYTE $0x0f
1129 BYTE $0x3a
1130 BYTE $0x0f
1131 BYTE $0xc0
1132 BYTE $0x08
1133 BYTE $0x66
1134 BYTE $0x45
1135 BYTE $0x0f
1136 BYTE $0x3a
1137 BYTE $0x0f
1138 BYTE $0xc9
1139 BYTE $0x04
1140 BYTE $0x66
1141 BYTE $0x45
1142 BYTE $0x0f
1143 BYTE $0x3a
1144 BYTE $0x0f
1145 BYTE $0xd2
1146 BYTE $0x04
1147 BYTE $0x66
1148 BYTE $0x45
1149 BYTE $0x0f
1150 BYTE $0x3a
1151 BYTE $0x0f
1152 BYTE $0xdb
1153 BYTE $0x04
1154 DECQ R9
1155 JNE openSSE128InnerCipherLoop
1156
1157 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1158 PADDL ·chacha20Constants<>+0(SB), X0
1159 PADDL ·chacha20Constants<>+0(SB), X1
1160 PADDL ·chacha20Constants<>+0(SB), X2
1161 PADDL X13, X3
1162 PADDL X13, X4
1163 PADDL X13, X5
1164 PADDL X14, X7
1165 PADDL X14, X8
1166 PADDL X15, X10
1167 PADDL ·sseIncMask<>+0(SB), X15
1168 PADDL X15, X11
1169
1170 // Clamp and store the key
1171 PAND ·polyClampMask<>+0(SB), X0
1172 MOVOU X0, (BP)
1173 MOVOU X3, 16(BP)
1174
1175 // Hash
1176 MOVQ ad_len+80(FP), R9
1177 CALL polyHashADInternal<>(SB)
1178
1179openSSE128Open:
1180 CMPQ BX, $0x10
1181 JB openSSETail16
1182 SUBQ $0x10, BX
1183
1184 // Load for hashing
1185 ADDQ (SI), R10
1186 ADCQ 8(SI), R11
1187 ADCQ $0x01, R12
1188
1189 // Load for decryption
1190 MOVOU (SI), X12
1191 PXOR X12, X1
1192 MOVOU X1, (DI)
1193 LEAQ 16(SI), SI
1194 LEAQ 16(DI), DI
1195 MOVQ (BP), AX
1196 MOVQ AX, R15
1197 MULQ R10
1198 MOVQ AX, R13
1199 MOVQ DX, R14
1200 MOVQ (BP), AX
1201 MULQ R11
1202 IMULQ R12, R15
1203 ADDQ AX, R14
1204 ADCQ DX, R15
1205 MOVQ 8(BP), AX
1206 MOVQ AX, R8
1207 MULQ R10
1208 ADDQ AX, R14
1209 ADCQ $0x00, DX
1210 MOVQ DX, R10
1211 MOVQ 8(BP), AX
1212 MULQ R11
1213 ADDQ AX, R15
1214 ADCQ $0x00, DX
1215 IMULQ R12, R8
1216 ADDQ R10, R15
1217 ADCQ DX, R8
1218 MOVQ R13, R10
1219 MOVQ R14, R11
1220 MOVQ R15, R12
1221 ANDQ $0x03, R12
1222 MOVQ R15, R13
1223 ANDQ $-4, R13
1224 MOVQ R8, R14
1225 SHRQ $0x02, R8, R15
1226 SHRQ $0x02, R8
1227 ADDQ R13, R10
1228 ADCQ R14, R11
1229 ADCQ $0x00, R12
1230 ADDQ R15, R10
1231 ADCQ R8, R11
1232 ADCQ $0x00, R12
1233
1234 // Shift the stream "left"
1235 MOVO X4, X1
1236 MOVO X7, X4
1237 MOVO X10, X7
1238 MOVO X2, X10
1239 MOVO X5, X2
1240 MOVO X8, X5
1241 MOVO X11, X8
1242 JMP openSSE128Open
1243
1244openSSETail16:
1245 TESTQ BX, BX
1246 JE openSSEFinalize
1247
1248 // We can safely load the CT from the end, because it is padded with the MAC
1249 MOVQ BX, R9
1250 SHLQ $0x04, R9
1251 LEAQ ·andMask<>+0(SB), R13
1252 MOVOU (SI), X12
1253 ADDQ BX, SI
1254 PAND -16(R13)(R9*1), X12
1255 MOVO X12, 64(BP)
1256 MOVQ X12, R13
1257 MOVQ 72(BP), R14
1258 PXOR X1, X12
1259
1260 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
1261openSSETail16Store:
1262 MOVQ X12, R8
1263 MOVB R8, (DI)
1264 PSRLDQ $0x01, X12
1265 INCQ DI
1266 DECQ BX
1267 JNE openSSETail16Store
1268 ADDQ R13, R10
1269 ADCQ R14, R11
1270 ADCQ $0x01, R12
1271 MOVQ (BP), AX
1272 MOVQ AX, R15
1273 MULQ R10
1274 MOVQ AX, R13
1275 MOVQ DX, R14
1276 MOVQ (BP), AX
1277 MULQ R11
1278 IMULQ R12, R15
1279 ADDQ AX, R14
1280 ADCQ DX, R15
1281 MOVQ 8(BP), AX
1282 MOVQ AX, R8
1283 MULQ R10
1284 ADDQ AX, R14
1285 ADCQ $0x00, DX
1286 MOVQ DX, R10
1287 MOVQ 8(BP), AX
1288 MULQ R11
1289 ADDQ AX, R15
1290 ADCQ $0x00, DX
1291 IMULQ R12, R8
1292 ADDQ R10, R15
1293 ADCQ DX, R8
1294 MOVQ R13, R10
1295 MOVQ R14, R11
1296 MOVQ R15, R12
1297 ANDQ $0x03, R12
1298 MOVQ R15, R13
1299 ANDQ $-4, R13
1300 MOVQ R8, R14
1301 SHRQ $0x02, R8, R15
1302 SHRQ $0x02, R8
1303 ADDQ R13, R10
1304 ADCQ R14, R11
1305 ADCQ $0x00, R12
1306 ADDQ R15, R10
1307 ADCQ R8, R11
1308 ADCQ $0x00, R12
1309 JMP openSSEFinalize
1310
1311openSSETail64:
1312 MOVO ·chacha20Constants<>+0(SB), X0
1313 MOVO 32(BP), X3
1314 MOVO 48(BP), X6
1315 MOVO 128(BP), X9
1316 PADDL ·sseIncMask<>+0(SB), X9
1317 MOVO X9, 80(BP)
1318 XORQ R9, R9
1319 MOVQ BX, CX
1320 CMPQ CX, $0x10
1321 JB openSSETail64LoopB
1322
1323openSSETail64LoopA:
1324 ADDQ (SI)(R9*1), R10
1325 ADCQ 8(SI)(R9*1), R11
1326 ADCQ $0x01, R12
1327 MOVQ (BP), AX
1328 MOVQ AX, R15
1329 MULQ R10
1330 MOVQ AX, R13
1331 MOVQ DX, R14
1332 MOVQ (BP), AX
1333 MULQ R11
1334 IMULQ R12, R15
1335 ADDQ AX, R14
1336 ADCQ DX, R15
1337 MOVQ 8(BP), AX
1338 MOVQ AX, R8
1339 MULQ R10
1340 ADDQ AX, R14
1341 ADCQ $0x00, DX
1342 MOVQ DX, R10
1343 MOVQ 8(BP), AX
1344 MULQ R11
1345 ADDQ AX, R15
1346 ADCQ $0x00, DX
1347 IMULQ R12, R8
1348 ADDQ R10, R15
1349 ADCQ DX, R8
1350 MOVQ R13, R10
1351 MOVQ R14, R11
1352 MOVQ R15, R12
1353 ANDQ $0x03, R12
1354 MOVQ R15, R13
1355 ANDQ $-4, R13
1356 MOVQ R8, R14
1357 SHRQ $0x02, R8, R15
1358 SHRQ $0x02, R8
1359 ADDQ R13, R10
1360 ADCQ R14, R11
1361 ADCQ $0x00, R12
1362 ADDQ R15, R10
1363 ADCQ R8, R11
1364 ADCQ $0x00, R12
1365 SUBQ $0x10, CX
1366
1367openSSETail64LoopB:
1368 ADDQ $0x10, R9
1369 PADDD X3, X0
1370 PXOR X0, X9
1371 ROL16(X9, X12)
1372 PADDD X9, X6
1373 PXOR X6, X3
1374 MOVO X3, X12
1375 PSLLL $0x0c, X12
1376 PSRLL $0x14, X3
1377 PXOR X12, X3
1378 PADDD X3, X0
1379 PXOR X0, X9
1380 ROL8(X9, X12)
1381 PADDD X9, X6
1382 PXOR X6, X3
1383 MOVO X3, X12
1384 PSLLL $0x07, X12
1385 PSRLL $0x19, X3
1386 PXOR X12, X3
1387 BYTE $0x66
1388 BYTE $0x0f
1389 BYTE $0x3a
1390 BYTE $0x0f
1391 BYTE $0xdb
1392 BYTE $0x04
1393 BYTE $0x66
1394 BYTE $0x0f
1395 BYTE $0x3a
1396 BYTE $0x0f
1397 BYTE $0xf6
1398 BYTE $0x08
1399 BYTE $0x66
1400 BYTE $0x45
1401 BYTE $0x0f
1402 BYTE $0x3a
1403 BYTE $0x0f
1404 BYTE $0xc9
1405 BYTE $0x0c
1406 PADDD X3, X0
1407 PXOR X0, X9
1408 ROL16(X9, X12)
1409 PADDD X9, X6
1410 PXOR X6, X3
1411 MOVO X3, X12
1412 PSLLL $0x0c, X12
1413 PSRLL $0x14, X3
1414 PXOR X12, X3
1415 PADDD X3, X0
1416 PXOR X0, X9
1417 ROL8(X9, X12)
1418 PADDD X9, X6
1419 PXOR X6, X3
1420 MOVO X3, X12
1421 PSLLL $0x07, X12
1422 PSRLL $0x19, X3
1423 PXOR X12, X3
1424 BYTE $0x66
1425 BYTE $0x0f
1426 BYTE $0x3a
1427 BYTE $0x0f
1428 BYTE $0xdb
1429 BYTE $0x0c
1430 BYTE $0x66
1431 BYTE $0x0f
1432 BYTE $0x3a
1433 BYTE $0x0f
1434 BYTE $0xf6
1435 BYTE $0x08
1436 BYTE $0x66
1437 BYTE $0x45
1438 BYTE $0x0f
1439 BYTE $0x3a
1440 BYTE $0x0f
1441 BYTE $0xc9
1442 BYTE $0x04
1443 CMPQ CX, $0x10
1444 JAE openSSETail64LoopA
1445 CMPQ R9, $0xa0
1446 JNE openSSETail64LoopB
1447 PADDL ·chacha20Constants<>+0(SB), X0
1448 PADDL 32(BP), X3
1449 PADDL 48(BP), X6
1450 PADDL 80(BP), X9
1451
1452openSSETail64DecLoop:
1453 CMPQ BX, $0x10
1454 JB openSSETail64DecLoopDone
1455 SUBQ $0x10, BX
1456 MOVOU (SI), X12
1457 PXOR X12, X0
1458 MOVOU X0, (DI)
1459 LEAQ 16(SI), SI
1460 LEAQ 16(DI), DI
1461 MOVO X3, X0
1462 MOVO X6, X3
1463 MOVO X9, X6
1464 JMP openSSETail64DecLoop
1465
1466openSSETail64DecLoopDone:
1467 MOVO X0, X1
1468 JMP openSSETail16
1469
1470openSSETail128:
1471 MOVO ·chacha20Constants<>+0(SB), X1
1472 MOVO 32(BP), X4
1473 MOVO 48(BP), X7
1474 MOVO 128(BP), X10
1475 PADDL ·sseIncMask<>+0(SB), X10
1476 MOVO X10, 80(BP)
1477 MOVO X1, X0
1478 MOVO X4, X3
1479 MOVO X7, X6
1480 MOVO X10, X9
1481 PADDL ·sseIncMask<>+0(SB), X9
1482 MOVO X9, 96(BP)
1483 XORQ R9, R9
1484 MOVQ BX, CX
1485 ANDQ $-16, CX
1486
1487openSSETail128LoopA:
1488 ADDQ (SI)(R9*1), R10
1489 ADCQ 8(SI)(R9*1), R11
1490 ADCQ $0x01, R12
1491 MOVQ (BP), AX
1492 MOVQ AX, R15
1493 MULQ R10
1494 MOVQ AX, R13
1495 MOVQ DX, R14
1496 MOVQ (BP), AX
1497 MULQ R11
1498 IMULQ R12, R15
1499 ADDQ AX, R14
1500 ADCQ DX, R15
1501 MOVQ 8(BP), AX
1502 MOVQ AX, R8
1503 MULQ R10
1504 ADDQ AX, R14
1505 ADCQ $0x00, DX
1506 MOVQ DX, R10
1507 MOVQ 8(BP), AX
1508 MULQ R11
1509 ADDQ AX, R15
1510 ADCQ $0x00, DX
1511 IMULQ R12, R8
1512 ADDQ R10, R15
1513 ADCQ DX, R8
1514 MOVQ R13, R10
1515 MOVQ R14, R11
1516 MOVQ R15, R12
1517 ANDQ $0x03, R12
1518 MOVQ R15, R13
1519 ANDQ $-4, R13
1520 MOVQ R8, R14
1521 SHRQ $0x02, R8, R15
1522 SHRQ $0x02, R8
1523 ADDQ R13, R10
1524 ADCQ R14, R11
1525 ADCQ $0x00, R12
1526 ADDQ R15, R10
1527 ADCQ R8, R11
1528 ADCQ $0x00, R12
1529
1530openSSETail128LoopB:
1531 ADDQ $0x10, R9
1532 PADDD X3, X0
1533 PXOR X0, X9
1534 ROL16(X9, X12)
1535 PADDD X9, X6
1536 PXOR X6, X3
1537 MOVO X3, X12
1538 PSLLL $0x0c, X12
1539 PSRLL $0x14, X3
1540 PXOR X12, X3
1541 PADDD X3, X0
1542 PXOR X0, X9
1543 ROL8(X9, X12)
1544 PADDD X9, X6
1545 PXOR X6, X3
1546 MOVO X3, X12
1547 PSLLL $0x07, X12
1548 PSRLL $0x19, X3
1549 PXOR X12, X3
1550 PADDD X4, X1
1551 PXOR X1, X10
1552 ROL16(X10, X12)
1553 PADDD X10, X7
1554 PXOR X7, X4
1555 MOVO X4, X12
1556 PSLLL $0x0c, X12
1557 PSRLL $0x14, X4
1558 PXOR X12, X4
1559 PADDD X4, X1
1560 PXOR X1, X10
1561 ROL8(X10, X12)
1562 PADDD X10, X7
1563 PXOR X7, X4
1564 MOVO X4, X12
1565 PSLLL $0x07, X12
1566 PSRLL $0x19, X4
1567 PXOR X12, X4
1568 BYTE $0x66
1569 BYTE $0x0f
1570 BYTE $0x3a
1571 BYTE $0x0f
1572 BYTE $0xdb
1573 BYTE $0x04
1574 BYTE $0x66
1575 BYTE $0x0f
1576 BYTE $0x3a
1577 BYTE $0x0f
1578 BYTE $0xf6
1579 BYTE $0x08
1580 BYTE $0x66
1581 BYTE $0x45
1582 BYTE $0x0f
1583 BYTE $0x3a
1584 BYTE $0x0f
1585 BYTE $0xc9
1586 BYTE $0x0c
1587 BYTE $0x66
1588 BYTE $0x0f
1589 BYTE $0x3a
1590 BYTE $0x0f
1591 BYTE $0xe4
1592 BYTE $0x04
1593 BYTE $0x66
1594 BYTE $0x0f
1595 BYTE $0x3a
1596 BYTE $0x0f
1597 BYTE $0xff
1598 BYTE $0x08
1599 BYTE $0x66
1600 BYTE $0x45
1601 BYTE $0x0f
1602 BYTE $0x3a
1603 BYTE $0x0f
1604 BYTE $0xd2
1605 BYTE $0x0c
1606 PADDD X3, X0
1607 PXOR X0, X9
1608 ROL16(X9, X12)
1609 PADDD X9, X6
1610 PXOR X6, X3
1611 MOVO X3, X12
1612 PSLLL $0x0c, X12
1613 PSRLL $0x14, X3
1614 PXOR X12, X3
1615 PADDD X3, X0
1616 PXOR X0, X9
1617 ROL8(X9, X12)
1618 PADDD X9, X6
1619 PXOR X6, X3
1620 MOVO X3, X12
1621 PSLLL $0x07, X12
1622 PSRLL $0x19, X3
1623 PXOR X12, X3
1624 PADDD X4, X1
1625 PXOR X1, X10
1626 ROL16(X10, X12)
1627 PADDD X10, X7
1628 PXOR X7, X4
1629 MOVO X4, X12
1630 PSLLL $0x0c, X12
1631 PSRLL $0x14, X4
1632 PXOR X12, X4
1633 PADDD X4, X1
1634 PXOR X1, X10
1635 ROL8(X10, X12)
1636 PADDD X10, X7
1637 PXOR X7, X4
1638 MOVO X4, X12
1639 PSLLL $0x07, X12
1640 PSRLL $0x19, X4
1641 PXOR X12, X4
1642 BYTE $0x66
1643 BYTE $0x0f
1644 BYTE $0x3a
1645 BYTE $0x0f
1646 BYTE $0xdb
1647 BYTE $0x0c
1648 BYTE $0x66
1649 BYTE $0x0f
1650 BYTE $0x3a
1651 BYTE $0x0f
1652 BYTE $0xf6
1653 BYTE $0x08
1654 BYTE $0x66
1655 BYTE $0x45
1656 BYTE $0x0f
1657 BYTE $0x3a
1658 BYTE $0x0f
1659 BYTE $0xc9
1660 BYTE $0x04
1661 BYTE $0x66
1662 BYTE $0x0f
1663 BYTE $0x3a
1664 BYTE $0x0f
1665 BYTE $0xe4
1666 BYTE $0x0c
1667 BYTE $0x66
1668 BYTE $0x0f
1669 BYTE $0x3a
1670 BYTE $0x0f
1671 BYTE $0xff
1672 BYTE $0x08
1673 BYTE $0x66
1674 BYTE $0x45
1675 BYTE $0x0f
1676 BYTE $0x3a
1677 BYTE $0x0f
1678 BYTE $0xd2
1679 BYTE $0x04
1680 CMPQ R9, CX
1681 JB openSSETail128LoopA
1682 CMPQ R9, $0xa0
1683 JNE openSSETail128LoopB
1684 PADDL ·chacha20Constants<>+0(SB), X0
1685 PADDL ·chacha20Constants<>+0(SB), X1
1686 PADDL 32(BP), X3
1687 PADDL 32(BP), X4
1688 PADDL 48(BP), X6
1689 PADDL 48(BP), X7
1690 PADDL 96(BP), X9
1691 PADDL 80(BP), X10
1692 MOVOU (SI), X12
1693 MOVOU 16(SI), X13
1694 MOVOU 32(SI), X14
1695 MOVOU 48(SI), X15
1696 PXOR X12, X1
1697 PXOR X13, X4
1698 PXOR X14, X7
1699 PXOR X15, X10
1700 MOVOU X1, (DI)
1701 MOVOU X4, 16(DI)
1702 MOVOU X7, 32(DI)
1703 MOVOU X10, 48(DI)
1704 SUBQ $0x40, BX
1705 LEAQ 64(SI), SI
1706 LEAQ 64(DI), DI
1707 JMP openSSETail64DecLoop
1708
openSSETail192:
	// Tail path for 129-192 remaining bytes: generate three ChaCha20 block
	// states in parallel (columns X0/X3/X6/X9, X1/X4/X7/X10, X2/X5/X8/X11)
	// with consecutive counters; the counters are spilled to 80/96/112(BP)
	// so they can be re-added after the rounds.
	MOVO ·chacha20Constants<>+0(SB), X2
	MOVO 32(BP), X5
	MOVO 48(BP), X8
	MOVO 128(BP), X11
	PADDL ·sseIncMask<>+0(SB), X11
	MOVO X11, 80(BP)
	MOVO X2, X1
	MOVO X5, X4
	MOVO X8, X7
	MOVO X11, X10
	PADDL ·sseIncMask<>+0(SB), X10
	MOVO X10, 96(BP)
	MOVO X1, X0
	MOVO X4, X3
	MOVO X7, X6
	MOVO X10, X9
	PADDL ·sseIncMask<>+0(SB), X9
	MOVO X9, 112(BP)
	// CX = min(BX, 160) rounded down to a multiple of 16: the number of
	// ciphertext bytes to absorb into Poly1305 from inside the round loop.
	// R9 counts bytes hashed so far.
	MOVQ BX, CX
	MOVQ $0x000000a0, R9
	CMPQ CX, $0xa0
	CMOVQGT R9, CX
	ANDQ $-16, CX
	XORQ R9, R9

openSSLTail192LoopA:
	// Absorb the next 16 ciphertext bytes into the Poly1305 accumulator
	// h = R10:R11:R12, then compute h = (h + m) * r mod 2^130-5, with
	// r0 at 0(BP) and r1 at 8(BP) (schoolbook 64x64 multiplies via MULQ,
	// followed by the standard 2^130-5 partial reduction).
	ADDQ (SI)(R9*1), R10
	ADCQ 8(SI)(R9*1), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12

openSSLTail192LoopB:
	// One ChaCha20 double round over all three states (column round, then
	// diagonal round via PALIGNR lane rotations).
	ADDQ $0x10, R9
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Hand-encoded PALIGNR (66 [45] 0F 3A 0F /r imm8) diagonalization:
	// rotate rows by $4 (X3/X4/X5), $8 (X6/X7/X8), $12 (X9/X10/X11).
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x0c
	// Second (diagonal) quarter-round pass over the three states.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Inverse PALIGNR rotations ($12/$8/$4) to un-diagonalize the states.
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	// While R9 < CX keep hashing ciphertext alongside the rounds; after
	// that, run rounds only until all 10 double rounds (160 = 0xa0) done.
	CMPQ R9, CX
	JB openSSLTail192LoopA
	CMPQ R9, $0xa0
	JNE openSSLTail192LoopB
	// If at least 0xb0 (176) bytes remain, hash ciphertext bytes 160-175
	// (they were not covered by the in-loop hashing, which caps at 160).
	CMPQ BX, $0xb0
	JB openSSLTail192Store
	ADDQ 160(SI), R10
	ADCQ 168(SI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Likewise hash bytes 176-191 when at least 0xc0 (192) bytes remain.
	CMPQ BX, $0xc0
	JB openSSLTail192Store
	ADDQ 176(SI), R10
	ADCQ 184(SI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12

openSSLTail192Store:
	// Add the initial state back into each of the three keystream blocks
	// (constants; key words at 32/48(BP); per-block counters at
	// 112/96/80(BP)), then decrypt the first 128 bytes; the final partial
	// block (in X0/X3/X6/X9) is finished by openSSETail64DecLoop.
	PADDL ·chacha20Constants<>+0(SB), X0
	PADDL ·chacha20Constants<>+0(SB), X1
	PADDL ·chacha20Constants<>+0(SB), X2
	PADDL 32(BP), X3
	PADDL 32(BP), X4
	PADDL 32(BP), X5
	PADDL 48(BP), X6
	PADDL 48(BP), X7
	PADDL 48(BP), X8
	PADDL 112(BP), X9
	PADDL 96(BP), X10
	PADDL 80(BP), X11
	MOVOU (SI), X12
	MOVOU 16(SI), X13
	MOVOU 32(SI), X14
	MOVOU 48(SI), X15
	PXOR X12, X2
	PXOR X13, X5
	PXOR X14, X8
	PXOR X15, X11
	MOVOU X2, (DI)
	MOVOU X5, 16(DI)
	MOVOU X8, 32(DI)
	MOVOU X11, 48(DI)
	MOVOU 64(SI), X12
	MOVOU 80(SI), X13
	MOVOU 96(SI), X14
	MOVOU 112(SI), X15
	PXOR X12, X1
	PXOR X13, X4
	PXOR X14, X7
	PXOR X15, X10
	MOVOU X1, 64(DI)
	MOVOU X4, 80(DI)
	MOVOU X7, 96(DI)
	MOVOU X10, 112(DI)
	SUBQ $0x80, BX
	LEAQ 128(SI), SI
	LEAQ 128(DI), DI
	JMP openSSETail64DecLoop
2136
openSSETail256:
	// Tail path for 193-256 remaining bytes: four parallel ChaCha20 block
	// states. The fourth state lives in X12-X15; because that exhausts the
	// XMM registers, 64(BP) is used as a spill slot during the rounds.
	MOVO ·chacha20Constants<>+0(SB), X0
	MOVO 32(BP), X3
	MOVO 48(BP), X6
	MOVO 128(BP), X9
	PADDL ·sseIncMask<>+0(SB), X9
	MOVO X0, X1
	MOVO X3, X4
	MOVO X6, X7
	MOVO X9, X10
	PADDL ·sseIncMask<>+0(SB), X10
	MOVO X1, X2
	MOVO X4, X5
	MOVO X7, X8
	MOVO X10, X11
	PADDL ·sseIncMask<>+0(SB), X11
	MOVO X2, X12
	MOVO X5, X13
	MOVO X8, X14
	MOVO X11, X15
	PADDL ·sseIncMask<>+0(SB), X15

	// Store counters
	MOVO X9, 80(BP)
	MOVO X10, 96(BP)
	MOVO X11, 112(BP)
	MOVO X15, 128(BP)
	XORQ R9, R9

openSSETail256Loop:
	// Per iteration: hash 16 ciphertext bytes into the Poly1305
	// accumulator (R10:R11:R12) and run one ChaCha20 double round over all
	// four states, interleaving the scalar multiply with the vector work.
	ADDQ (SI)(R9*1), R10
	ADCQ 8(SI)(R9*1), R11
	ADCQ $0x01, R12
	// Column round for states 0-2; X14 is spilled so it can serve as the
	// rotate temporary.
	MOVO X14, 64(BP)
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X14)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X3
	PXOR X14, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X14)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X3
	PXOR X14, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X14)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X4
	PXOR X14, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X14)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X4
	PXOR X14, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X14)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X5
	PXOR X14, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X14)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X5
	PXOR X14, X5
	// Column round for state 3 (X12/X13/X14/X15): restore X14, spill X7 to
	// use it as the temporary instead.
	MOVO 64(BP), X14
	MOVO X7, 64(BP)
	PADDD X13, X12
	PXOR X12, X15
	ROL16(X15, X7)
	PADDD X15, X14
	PXOR X14, X13
	MOVO X13, X7
	PSLLL $0x0c, X7
	PSRLL $0x14, X13
	PXOR X7, X13
	PADDD X13, X12
	PXOR X12, X15
	ROL8(X15, X7)
	PADDD X15, X14
	PXOR X14, X13
	MOVO X13, X7
	PSLLL $0x07, X7
	PSRLL $0x19, X13
	PXOR X7, X13
	MOVO 64(BP), X7
	// Hand-encoded PALIGNR diagonalization of all four states:
	// $4 for X3/X4/X5/X13, $8 for X6/X7/X8/X14, $12 for X9/X10/X11/X15.
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x04
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x04
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x0c
	// Poly1305: h = (h + m) * r mod 2^130-5 (first half of the multiply,
	// interleaved with the diagonal ChaCha round below).
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	// Diagonal round for states 0-2 (X14 spilled again).
	MOVO X14, 64(BP)
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X14)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X3
	PXOR X14, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X14)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X3
	PXOR X14, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X14)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X4
	PXOR X14, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X14)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X4
	PXOR X14, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X14)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X14
	PSLLL $0x0c, X14
	PSRLL $0x14, X5
	PXOR X14, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X14)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X14
	PSLLL $0x07, X14
	PSRLL $0x19, X5
	PXOR X14, X5
	// Diagonal round for state 3 (X7 spilled as temporary, as above).
	MOVO 64(BP), X14
	MOVO X7, 64(BP)
	PADDD X13, X12
	PXOR X12, X15
	ROL16(X15, X7)
	PADDD X15, X14
	PXOR X14, X13
	MOVO X13, X7
	PSLLL $0x0c, X7
	PSRLL $0x14, X13
	PXOR X7, X13
	PADDD X13, X12
	PXOR X12, X15
	ROL8(X15, X7)
	PADDD X15, X14
	PXOR X14, X13
	MOVO X13, X7
	PSLLL $0x07, X7
	PSRLL $0x19, X13
	PXOR X7, X13
	MOVO 64(BP), X7
	// Poly1305: finish the multiply and reduce mod 2^130-5.
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Inverse PALIGNR rotations ($12/$8/$4) to un-diagonalize all states.
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x0c
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x04
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x04
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x04
	// 10 double rounds = 160 (0xa0) bytes hashed in-loop.
	ADDQ $0x10, R9
	CMPQ R9, $0xa0
	JB openSSETail256Loop
	MOVQ BX, CX
	ANDQ $-16, CX

openSSETail256HashLoop:
	// Hash any remaining full 16-byte ciphertext words (bytes 160..CX-1)
	// that were not covered during the round loop.
	ADDQ (SI)(R9*1), R10
	ADCQ 8(SI)(R9*1), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	ADDQ $0x10, R9
	CMPQ R9, CX
	JB openSSETail256HashLoop

	// Add in the state
	PADDD ·chacha20Constants<>+0(SB), X0
	PADDD ·chacha20Constants<>+0(SB), X1
	PADDD ·chacha20Constants<>+0(SB), X2
	PADDD ·chacha20Constants<>+0(SB), X12
	PADDD 32(BP), X3
	PADDD 32(BP), X4
	PADDD 32(BP), X5
	PADDD 32(BP), X13
	PADDD 48(BP), X6
	PADDD 48(BP), X7
	PADDD 48(BP), X8
	PADDD 48(BP), X14
	PADDD 80(BP), X9
	PADDD 96(BP), X10
	PADDD 112(BP), X11
	PADDD 128(BP), X15
	MOVO X15, 64(BP)

	// Load - xor - store
	MOVOU (SI), X15
	PXOR X15, X0
	MOVOU 16(SI), X15
	PXOR X15, X3
	MOVOU 32(SI), X15
	PXOR X15, X6
	MOVOU 48(SI), X15
	PXOR X15, X9
	MOVOU X0, (DI)
	MOVOU X3, 16(DI)
	MOVOU X6, 32(DI)
	MOVOU X9, 48(DI)
	MOVOU 64(SI), X0
	MOVOU 80(SI), X3
	MOVOU 96(SI), X6
	MOVOU 112(SI), X9
	PXOR X0, X1
	PXOR X3, X4
	PXOR X6, X7
	PXOR X9, X10
	MOVOU X1, 64(DI)
	MOVOU X4, 80(DI)
	MOVOU X7, 96(DI)
	MOVOU X10, 112(DI)
	MOVOU 128(SI), X0
	MOVOU 144(SI), X3
	MOVOU 160(SI), X6
	MOVOU 176(SI), X9
	PXOR X0, X2
	PXOR X3, X5
	PXOR X6, X8
	PXOR X9, X11
	MOVOU X2, 128(DI)
	MOVOU X5, 144(DI)
	MOVOU X8, 160(DI)
	MOVOU X11, 176(DI)
	// 192 bytes decrypted; move the fourth keystream block into
	// X0/X3/X6/X9 and let the generic 64-byte tail loop finish up.
	LEAQ 192(SI), SI
	LEAQ 192(DI), DI
	SUBQ $0xc0, BX
	MOVO X12, X0
	MOVO X13, X3
	MOVO X14, X6
	MOVO 64(BP), X9
	JMP openSSETail64DecLoop
2634
chacha20Poly1305Open_AVX2:
	// AVX2 open path. Build the initial ChaCha20 state across Y0 (consts),
	// Y14/Y12 (key halves) and Y4 (counter/nonce). The BYTE runs below
	// decode to hand-encoded VBROADCASTI128 16(R8), Y14 / 32(R8), Y12 /
	// 48(R8), Y4 (mnemonic unavailable to the assembler when generated).
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	BYTE $0xc4
	BYTE $0x42
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x70
	BYTE $0x10
	BYTE $0xc4
	BYTE $0x42
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x60
	BYTE $0x20
	BYTE $0xc4
	BYTE $0xc2
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x60
	BYTE $0x30
	VPADDD ·avx2InitMask<>+0(SB), Y4, Y4

	// Special optimization, for very short buffers
	CMPQ BX, $0xc0
	JBE openAVX2192
	CMPQ BX, $0x00000140
	JBE openAVX2320

	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA Y14, 32(BP)
	VMOVDQA Y12, 64(BP)
	VMOVDQA Y4, 192(BP)
	MOVQ $0x0000000a, R9

openAVX2PreparePolyKey:
	// 10 iterations = 20 ChaCha20 rounds (column round + diagonal round,
	// with VPALIGNR doing the diagonalization) on the single 2-block state.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	DECQ R9
	JNE openAVX2PreparePolyKey
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD 32(BP), Y14, Y14
	VPADDD 64(BP), Y12, Y12
	VPADDD 192(BP), Y4, Y4
	VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
	VPAND ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for the first 64 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14

	// Hash AD + first 64 bytes
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)
	XORQ CX, CX

openAVX2InitialHash64:
	// Hash the first 64 bytes of ciphertext (4 x 16) with Poly1305 using
	// the MULX-based h = (h + m) * r mod 2^130-5 step; r is at 0(BP)/8(BP).
	ADDQ (SI)(CX*1), R10
	ADCQ 8(SI)(CX*1), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	ADDQ $0x10, CX
	CMPQ CX, $0x40
	JNE openAVX2InitialHash64

	// Decrypt the first 64 bytes
	VPXOR (SI), Y0, Y0
	VPXOR 32(SI), Y14, Y14
	VMOVDQU Y0, (DI)
	VMOVDQU Y14, 32(DI)
	LEAQ 64(SI), SI
	LEAQ 64(DI), DI
	SUBQ $0x40, BX
2777
openAVX2MainLoop:
	// Main AVX2 loop: each iteration decrypts 512 bytes using four
	// interleaved 2-block ChaCha20 states (Y0/Y5/Y6/Y7 etc.) while
	// hashing the same 512 ciphertext bytes with Poly1305.
	CMPQ BX, $0x00000200
	JB openAVX2MainLoopDone

	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	XORQ CX, CX

openAVX2InternalLoop:
	// One ChaCha20 double round over all four states interleaved with
	// three MULX-based Poly1305 steps (48 ciphertext bytes per iteration);
	// Y15 is spilled to 224(BP) whenever a scratch register is needed.
	ADDQ (SI)(CX*1), R10
	ADCQ 8(SI)(CX*1), R11
	ADCQ $0x01, R12
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	ADDQ 16(SI)(CX*1), R10
	ADCQ 24(SI)(CX*1), R11
	ADCQ $0x01, R12
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	// Diagonalize all four states for the diagonal round.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	ADDQ 32(SI)(CX*1), R10
	ADCQ 40(SI)(CX*1), R11
	ADCQ $0x01, R12
	LEAQ 48(CX), CX
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Un-diagonalize all four states.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	CMPQ CX, $0x000001e0
	JNE openAVX2InternalLoop
	// Add the original state back into the keystream; counters were saved
	// at 96/128/160/192(BP) before the rounds.
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 32(BP), Y10, Y10
	VPADDD 32(BP), Y11, Y11
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD 64(BP), Y8, Y8
	VPADDD 64(BP), Y15, Y15
	VPADDD 96(BP), Y4, Y4
	VPADDD 128(BP), Y1, Y1
	VPADDD 160(BP), Y2, Y2
	VPADDD 192(BP), Y3, Y3
	VMOVDQA Y15, 224(BP)

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	ADDQ 480(SI), R10
	ADCQ 488(SI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Reassemble 256-bit keystream blocks from the interleaved halves and
	// decrypt bytes 0-255.
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPERM2I128 $0x13, Y0, Y14, Y14
	VPERM2I128 $0x02, Y12, Y4, Y0
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPXOR (SI), Y15, Y15
	VPXOR 32(SI), Y0, Y0
	VPXOR 64(SI), Y14, Y14
	VPXOR 96(SI), Y12, Y12
	VMOVDQU Y15, (DI)
	VMOVDQU Y0, 32(DI)
	VMOVDQU Y14, 64(DI)
	VMOVDQU Y12, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR 128(SI), Y0, Y0
	VPXOR 160(SI), Y14, Y14
	VPXOR 192(SI), Y12, Y12
	VPXOR 224(SI), Y4, Y4
	VMOVDQU Y0, 128(DI)
	VMOVDQU Y14, 160(DI)
	VMOVDQU Y12, 192(DI)
	VMOVDQU Y4, 224(DI)

	// and here
	ADDQ 496(SI), R10
	ADCQ 504(SI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Decrypt bytes 256-511 and advance to the next 512-byte chunk.
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR 256(SI), Y0, Y0
	VPXOR 288(SI), Y14, Y14
	VPXOR 320(SI), Y12, Y12
	VPXOR 352(SI), Y4, Y4
	VMOVDQU Y0, 256(DI)
	VMOVDQU Y14, 288(DI)
	VMOVDQU Y12, 320(DI)
	VMOVDQU Y4, 352(DI)
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	VPXOR 384(SI), Y0, Y0
	VPXOR 416(SI), Y14, Y14
	VPXOR 448(SI), Y12, Y12
	VPXOR 480(SI), Y4, Y4
	VMOVDQU Y0, 384(DI)
	VMOVDQU Y14, 416(DI)
	VMOVDQU Y12, 448(DI)
	VMOVDQU Y4, 480(DI)
	LEAQ 512(SI), SI
	LEAQ 512(DI), DI
	SUBQ $0x00000200, BX
	JMP openAVX2MainLoop

openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ BX, BX
	JE openSSEFinalize
	CMPQ BX, $0x80
	JBE openAVX2Tail128
	CMPQ BX, $0x00000100
	JBE openAVX2Tail256
	CMPQ BX, $0x00000180
	JBE openAVX2Tail384
	JMP openAVX2Tail512
3225
openAVX2192:
	// Short-buffer path (<= 192 bytes): prepare two 2-block ChaCha20
	// states (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1) with consecutive counters;
	// Y6/Y10/Y8/Y2/Y15 keep copies of the initial state for the final add.
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VMOVDQA Y4, Y2
	VMOVDQA Y1, Y15
	MOVQ $0x0000000a, R9
3237
3238openAVX2192InnerCipherLoop:
3239 VPADDD Y14, Y0, Y0
3240 VPXOR Y0, Y4, Y4
3241 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3242 VPADDD Y4, Y12, Y12
3243 VPXOR Y12, Y14, Y14
3244 VPSLLD $0x0c, Y14, Y3
3245 VPSRLD $0x14, Y14, Y14
3246 VPXOR Y3, Y14, Y14
3247 VPADDD Y14, Y0, Y0
3248 VPXOR Y0, Y4, Y4
3249 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3250 VPADDD Y4, Y12, Y12
3251 VPXOR Y12, Y14, Y14
3252 VPSLLD $0x07, Y14, Y3
3253 VPSRLD $0x19, Y14, Y14
3254 VPXOR Y3, Y14, Y14
3255 VPADDD Y9, Y5, Y5
3256 VPXOR Y5, Y1, Y1
3257 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3258 VPADDD Y1, Y13, Y13
3259 VPXOR Y13, Y9, Y9
3260 VPSLLD $0x0c, Y9, Y3
3261 VPSRLD $0x14, Y9, Y9
3262 VPXOR Y3, Y9, Y9
3263 VPADDD Y9, Y5, Y5
3264 VPXOR Y5, Y1, Y1
3265 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3266 VPADDD Y1, Y13, Y13
3267 VPXOR Y13, Y9, Y9
3268 VPSLLD $0x07, Y9, Y3
3269 VPSRLD $0x19, Y9, Y9
3270 VPXOR Y3, Y9, Y9
3271 VPALIGNR $0x04, Y14, Y14, Y14
3272 VPALIGNR $0x04, Y9, Y9, Y9
3273 VPALIGNR $0x08, Y12, Y12, Y12
3274 VPALIGNR $0x08, Y13, Y13, Y13
3275 VPALIGNR $0x0c, Y4, Y4, Y4
3276 VPALIGNR $0x0c, Y1, Y1, Y1
3277 VPADDD Y14, Y0, Y0
3278 VPXOR Y0, Y4, Y4
3279 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3280 VPADDD Y4, Y12, Y12
3281 VPXOR Y12, Y14, Y14
3282 VPSLLD $0x0c, Y14, Y3
3283 VPSRLD $0x14, Y14, Y14
3284 VPXOR Y3, Y14, Y14
3285 VPADDD Y14, Y0, Y0
3286 VPXOR Y0, Y4, Y4
3287 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3288 VPADDD Y4, Y12, Y12
3289 VPXOR Y12, Y14, Y14
3290 VPSLLD $0x07, Y14, Y3
3291 VPSRLD $0x19, Y14, Y14
3292 VPXOR Y3, Y14, Y14
3293 VPADDD Y9, Y5, Y5
3294 VPXOR Y5, Y1, Y1
3295 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3296 VPADDD Y1, Y13, Y13
3297 VPXOR Y13, Y9, Y9
3298 VPSLLD $0x0c, Y9, Y3
3299 VPSRLD $0x14, Y9, Y9
3300 VPXOR Y3, Y9, Y9
3301 VPADDD Y9, Y5, Y5
3302 VPXOR Y5, Y1, Y1
3303 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3304 VPADDD Y1, Y13, Y13
3305 VPXOR Y13, Y9, Y9
3306 VPSLLD $0x07, Y9, Y3
3307 VPSRLD $0x19, Y9, Y9
3308 VPXOR Y3, Y9, Y9
3309 VPALIGNR $0x0c, Y14, Y14, Y14
3310 VPALIGNR $0x0c, Y9, Y9, Y9
3311 VPALIGNR $0x08, Y12, Y12, Y12
3312 VPALIGNR $0x08, Y13, Y13, Y13
3313 VPALIGNR $0x04, Y4, Y4, Y4
3314 VPALIGNR $0x04, Y1, Y1, Y1
3315 DECQ R9
3316 JNE openAVX2192InnerCipherLoop
3317 VPADDD Y6, Y0, Y0
3318 VPADDD Y6, Y5, Y5
3319 VPADDD Y10, Y14, Y14
3320 VPADDD Y10, Y9, Y9
3321 VPADDD Y8, Y12, Y12
3322 VPADDD Y8, Y13, Y13
3323 VPADDD Y2, Y4, Y4
3324 VPADDD Y15, Y1, Y1
3325 VPERM2I128 $0x02, Y0, Y14, Y3
3326
3327 // Clamp and store poly key
3328 VPAND ·polyClampMask<>+0(SB), Y3, Y3
3329 VMOVDQA Y3, (BP)
3330
3331 // Stream for up to 192 bytes
3332 VPERM2I128 $0x13, Y0, Y14, Y0
3333 VPERM2I128 $0x13, Y12, Y4, Y14
3334 VPERM2I128 $0x02, Y5, Y9, Y12
3335 VPERM2I128 $0x02, Y13, Y1, Y4
3336 VPERM2I128 $0x13, Y5, Y9, Y5
3337 VPERM2I128 $0x13, Y13, Y1, Y9
3338
3339openAVX2ShortOpen:
3340 // Hash
3341 MOVQ ad_len+80(FP), R9
3342 CALL polyHashADInternal<>(SB)
3343
3344openAVX2ShortOpenLoop:
3345 CMPQ BX, $0x20
3346 JB openAVX2ShortTail32
3347 SUBQ $0x20, BX
3348
3349 // Load for hashing
3350 ADDQ (SI), R10
3351 ADCQ 8(SI), R11
3352 ADCQ $0x01, R12
3353 MOVQ (BP), DX
3354 MOVQ DX, R15
3355 MULXQ R10, R13, R14
3356 IMULQ R12, R15
3357 MULXQ R11, AX, DX
3358 ADDQ AX, R14
3359 ADCQ DX, R15
3360 MOVQ 8(BP), DX
3361 MULXQ R10, R10, AX
3362 ADDQ R10, R14
3363 MULXQ R11, R11, R8
3364 ADCQ R11, R15
3365 ADCQ $0x00, R8
3366 IMULQ R12, DX
3367 ADDQ AX, R15
3368 ADCQ DX, R8
3369 MOVQ R13, R10
3370 MOVQ R14, R11
3371 MOVQ R15, R12
3372 ANDQ $0x03, R12
3373 MOVQ R15, R13
3374 ANDQ $-4, R13
3375 MOVQ R8, R14
3376 SHRQ $0x02, R8, R15
3377 SHRQ $0x02, R8
3378 ADDQ R13, R10
3379 ADCQ R14, R11
3380 ADCQ $0x00, R12
3381 ADDQ R15, R10
3382 ADCQ R8, R11
3383 ADCQ $0x00, R12
3384 ADDQ 16(SI), R10
3385 ADCQ 24(SI), R11
3386 ADCQ $0x01, R12
3387 MOVQ (BP), DX
3388 MOVQ DX, R15
3389 MULXQ R10, R13, R14
3390 IMULQ R12, R15
3391 MULXQ R11, AX, DX
3392 ADDQ AX, R14
3393 ADCQ DX, R15
3394 MOVQ 8(BP), DX
3395 MULXQ R10, R10, AX
3396 ADDQ R10, R14
3397 MULXQ R11, R11, R8
3398 ADCQ R11, R15
3399 ADCQ $0x00, R8
3400 IMULQ R12, DX
3401 ADDQ AX, R15
3402 ADCQ DX, R8
3403 MOVQ R13, R10
3404 MOVQ R14, R11
3405 MOVQ R15, R12
3406 ANDQ $0x03, R12
3407 MOVQ R15, R13
3408 ANDQ $-4, R13
3409 MOVQ R8, R14
3410 SHRQ $0x02, R8, R15
3411 SHRQ $0x02, R8
3412 ADDQ R13, R10
3413 ADCQ R14, R11
3414 ADCQ $0x00, R12
3415 ADDQ R15, R10
3416 ADCQ R8, R11
3417 ADCQ $0x00, R12
3418
3419 // Load for decryption
3420 VPXOR (SI), Y0, Y0
3421 VMOVDQU Y0, (DI)
3422 LEAQ 32(SI), SI
3423 LEAQ 32(DI), DI
3424
3425 // Shift stream left
3426 VMOVDQA Y14, Y0
3427 VMOVDQA Y12, Y14
3428 VMOVDQA Y4, Y12
3429 VMOVDQA Y5, Y4
3430 VMOVDQA Y9, Y5
3431 VMOVDQA Y13, Y9
3432 VMOVDQA Y1, Y13
3433 VMOVDQA Y6, Y1
3434 VMOVDQA Y10, Y6
3435 JMP openAVX2ShortOpenLoop
3436
3437openAVX2ShortTail32:
3438 CMPQ BX, $0x10
3439 VMOVDQA X0, X1
3440 JB openAVX2ShortDone
3441 SUBQ $0x10, BX
3442
3443 // Load for hashing
3444 ADDQ (SI), R10
3445 ADCQ 8(SI), R11
3446 ADCQ $0x01, R12
3447 MOVQ (BP), DX
3448 MOVQ DX, R15
3449 MULXQ R10, R13, R14
3450 IMULQ R12, R15
3451 MULXQ R11, AX, DX
3452 ADDQ AX, R14
3453 ADCQ DX, R15
3454 MOVQ 8(BP), DX
3455 MULXQ R10, R10, AX
3456 ADDQ R10, R14
3457 MULXQ R11, R11, R8
3458 ADCQ R11, R15
3459 ADCQ $0x00, R8
3460 IMULQ R12, DX
3461 ADDQ AX, R15
3462 ADCQ DX, R8
3463 MOVQ R13, R10
3464 MOVQ R14, R11
3465 MOVQ R15, R12
3466 ANDQ $0x03, R12
3467 MOVQ R15, R13
3468 ANDQ $-4, R13
3469 MOVQ R8, R14
3470 SHRQ $0x02, R8, R15
3471 SHRQ $0x02, R8
3472 ADDQ R13, R10
3473 ADCQ R14, R11
3474 ADCQ $0x00, R12
3475 ADDQ R15, R10
3476 ADCQ R8, R11
3477 ADCQ $0x00, R12
3478
3479 // Load for decryption
3480 VPXOR (SI), X0, X12
3481 VMOVDQU X12, (DI)
3482 LEAQ 16(SI), SI
3483 LEAQ 16(DI), DI
3484 VPERM2I128 $0x11, Y0, Y0, Y0
3485 VMOVDQA X0, X1
3486
3487openAVX2ShortDone:
3488 VZEROUPPER
3489 JMP openSSETail16
3490
3491openAVX2320:
3492 VMOVDQA Y0, Y5
3493 VMOVDQA Y14, Y9
3494 VMOVDQA Y12, Y13
3495 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
3496 VMOVDQA Y0, Y6
3497 VMOVDQA Y14, Y10
3498 VMOVDQA Y12, Y8
3499 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
3500 VMOVDQA Y14, Y7
3501 VMOVDQA Y12, Y11
3502 VMOVDQA Y4, Y15
3503 MOVQ $0x0000000a, R9
3504
3505openAVX2320InnerCipherLoop:
3506 VPADDD Y14, Y0, Y0
3507 VPXOR Y0, Y4, Y4
3508 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3509 VPADDD Y4, Y12, Y12
3510 VPXOR Y12, Y14, Y14
3511 VPSLLD $0x0c, Y14, Y3
3512 VPSRLD $0x14, Y14, Y14
3513 VPXOR Y3, Y14, Y14
3514 VPADDD Y14, Y0, Y0
3515 VPXOR Y0, Y4, Y4
3516 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3517 VPADDD Y4, Y12, Y12
3518 VPXOR Y12, Y14, Y14
3519 VPSLLD $0x07, Y14, Y3
3520 VPSRLD $0x19, Y14, Y14
3521 VPXOR Y3, Y14, Y14
3522 VPADDD Y9, Y5, Y5
3523 VPXOR Y5, Y1, Y1
3524 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3525 VPADDD Y1, Y13, Y13
3526 VPXOR Y13, Y9, Y9
3527 VPSLLD $0x0c, Y9, Y3
3528 VPSRLD $0x14, Y9, Y9
3529 VPXOR Y3, Y9, Y9
3530 VPADDD Y9, Y5, Y5
3531 VPXOR Y5, Y1, Y1
3532 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3533 VPADDD Y1, Y13, Y13
3534 VPXOR Y13, Y9, Y9
3535 VPSLLD $0x07, Y9, Y3
3536 VPSRLD $0x19, Y9, Y9
3537 VPXOR Y3, Y9, Y9
3538 VPADDD Y10, Y6, Y6
3539 VPXOR Y6, Y2, Y2
3540 VPSHUFB ·rol16<>+0(SB), Y2, Y2
3541 VPADDD Y2, Y8, Y8
3542 VPXOR Y8, Y10, Y10
3543 VPSLLD $0x0c, Y10, Y3
3544 VPSRLD $0x14, Y10, Y10
3545 VPXOR Y3, Y10, Y10
3546 VPADDD Y10, Y6, Y6
3547 VPXOR Y6, Y2, Y2
3548 VPSHUFB ·rol8<>+0(SB), Y2, Y2
3549 VPADDD Y2, Y8, Y8
3550 VPXOR Y8, Y10, Y10
3551 VPSLLD $0x07, Y10, Y3
3552 VPSRLD $0x19, Y10, Y10
3553 VPXOR Y3, Y10, Y10
3554 VPALIGNR $0x04, Y14, Y14, Y14
3555 VPALIGNR $0x04, Y9, Y9, Y9
3556 VPALIGNR $0x04, Y10, Y10, Y10
3557 VPALIGNR $0x08, Y12, Y12, Y12
3558 VPALIGNR $0x08, Y13, Y13, Y13
3559 VPALIGNR $0x08, Y8, Y8, Y8
3560 VPALIGNR $0x0c, Y4, Y4, Y4
3561 VPALIGNR $0x0c, Y1, Y1, Y1
3562 VPALIGNR $0x0c, Y2, Y2, Y2
3563 VPADDD Y14, Y0, Y0
3564 VPXOR Y0, Y4, Y4
3565 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3566 VPADDD Y4, Y12, Y12
3567 VPXOR Y12, Y14, Y14
3568 VPSLLD $0x0c, Y14, Y3
3569 VPSRLD $0x14, Y14, Y14
3570 VPXOR Y3, Y14, Y14
3571 VPADDD Y14, Y0, Y0
3572 VPXOR Y0, Y4, Y4
3573 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3574 VPADDD Y4, Y12, Y12
3575 VPXOR Y12, Y14, Y14
3576 VPSLLD $0x07, Y14, Y3
3577 VPSRLD $0x19, Y14, Y14
3578 VPXOR Y3, Y14, Y14
3579 VPADDD Y9, Y5, Y5
3580 VPXOR Y5, Y1, Y1
3581 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3582 VPADDD Y1, Y13, Y13
3583 VPXOR Y13, Y9, Y9
3584 VPSLLD $0x0c, Y9, Y3
3585 VPSRLD $0x14, Y9, Y9
3586 VPXOR Y3, Y9, Y9
3587 VPADDD Y9, Y5, Y5
3588 VPXOR Y5, Y1, Y1
3589 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3590 VPADDD Y1, Y13, Y13
3591 VPXOR Y13, Y9, Y9
3592 VPSLLD $0x07, Y9, Y3
3593 VPSRLD $0x19, Y9, Y9
3594 VPXOR Y3, Y9, Y9
3595 VPADDD Y10, Y6, Y6
3596 VPXOR Y6, Y2, Y2
3597 VPSHUFB ·rol16<>+0(SB), Y2, Y2
3598 VPADDD Y2, Y8, Y8
3599 VPXOR Y8, Y10, Y10
3600 VPSLLD $0x0c, Y10, Y3
3601 VPSRLD $0x14, Y10, Y10
3602 VPXOR Y3, Y10, Y10
3603 VPADDD Y10, Y6, Y6
3604 VPXOR Y6, Y2, Y2
3605 VPSHUFB ·rol8<>+0(SB), Y2, Y2
3606 VPADDD Y2, Y8, Y8
3607 VPXOR Y8, Y10, Y10
3608 VPSLLD $0x07, Y10, Y3
3609 VPSRLD $0x19, Y10, Y10
3610 VPXOR Y3, Y10, Y10
3611 VPALIGNR $0x0c, Y14, Y14, Y14
3612 VPALIGNR $0x0c, Y9, Y9, Y9
3613 VPALIGNR $0x0c, Y10, Y10, Y10
3614 VPALIGNR $0x08, Y12, Y12, Y12
3615 VPALIGNR $0x08, Y13, Y13, Y13
3616 VPALIGNR $0x08, Y8, Y8, Y8
3617 VPALIGNR $0x04, Y4, Y4, Y4
3618 VPALIGNR $0x04, Y1, Y1, Y1
3619 VPALIGNR $0x04, Y2, Y2, Y2
3620 DECQ R9
3621 JNE openAVX2320InnerCipherLoop
3622 VMOVDQA ·chacha20Constants<>+0(SB), Y3
3623 VPADDD Y3, Y0, Y0
3624 VPADDD Y3, Y5, Y5
3625 VPADDD Y3, Y6, Y6
3626 VPADDD Y7, Y14, Y14
3627 VPADDD Y7, Y9, Y9
3628 VPADDD Y7, Y10, Y10
3629 VPADDD Y11, Y12, Y12
3630 VPADDD Y11, Y13, Y13
3631 VPADDD Y11, Y8, Y8
3632 VMOVDQA ·avx2IncMask<>+0(SB), Y3
3633 VPADDD Y15, Y4, Y4
3634 VPADDD Y3, Y15, Y15
3635 VPADDD Y15, Y1, Y1
3636 VPADDD Y3, Y15, Y15
3637 VPADDD Y15, Y2, Y2
3638
3639 // Clamp and store poly key
3640 VPERM2I128 $0x02, Y0, Y14, Y3
3641 VPAND ·polyClampMask<>+0(SB), Y3, Y3
3642 VMOVDQA Y3, (BP)
3643
3644 // Stream for up to 320 bytes
3645 VPERM2I128 $0x13, Y0, Y14, Y0
3646 VPERM2I128 $0x13, Y12, Y4, Y14
3647 VPERM2I128 $0x02, Y5, Y9, Y12
3648 VPERM2I128 $0x02, Y13, Y1, Y4
3649 VPERM2I128 $0x13, Y5, Y9, Y5
3650 VPERM2I128 $0x13, Y13, Y1, Y9
3651 VPERM2I128 $0x02, Y6, Y10, Y13
3652 VPERM2I128 $0x02, Y8, Y2, Y1
3653 VPERM2I128 $0x13, Y6, Y10, Y6
3654 VPERM2I128 $0x13, Y8, Y2, Y10
3655 JMP openAVX2ShortOpen
3656
openAVX2Tail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	VMOVDQA ·chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y13
	VMOVDQA 192(BP), Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
	// Y4 preserves the incremented counter row for the feed-forward add
	// (VPADDD Y4, Y1, Y1) after the round loop.
	VMOVDQA Y1, Y4
	// R9 counts bytes hashed so far; CX = remaining length rounded down to
	// 16, i.e. how much ciphertext gets Poly1305-hashed while the cipher
	// rounds run. Skip straight to the rounds if there is nothing to hash.
	XORQ R9, R9
	MOVQ BX, CX
	ANDQ $-16, CX
	TESTQ CX, CX
	JE openAVX2Tail128LoopB
3670
3671openAVX2Tail128LoopA:
3672 ADDQ (SI)(R9*1), R10
3673 ADCQ 8(SI)(R9*1), R11
3674 ADCQ $0x01, R12
3675 MOVQ (BP), DX
3676 MOVQ DX, R15
3677 MULXQ R10, R13, R14
3678 IMULQ R12, R15
3679 MULXQ R11, AX, DX
3680 ADDQ AX, R14
3681 ADCQ DX, R15
3682 MOVQ 8(BP), DX
3683 MULXQ R10, R10, AX
3684 ADDQ R10, R14
3685 MULXQ R11, R11, R8
3686 ADCQ R11, R15
3687 ADCQ $0x00, R8
3688 IMULQ R12, DX
3689 ADDQ AX, R15
3690 ADCQ DX, R8
3691 MOVQ R13, R10
3692 MOVQ R14, R11
3693 MOVQ R15, R12
3694 ANDQ $0x03, R12
3695 MOVQ R15, R13
3696 ANDQ $-4, R13
3697 MOVQ R8, R14
3698 SHRQ $0x02, R8, R15
3699 SHRQ $0x02, R8
3700 ADDQ R13, R10
3701 ADCQ R14, R11
3702 ADCQ $0x00, R12
3703 ADDQ R15, R10
3704 ADCQ R8, R11
3705 ADCQ $0x00, R12
3706
3707openAVX2Tail128LoopB:
3708 ADDQ $0x10, R9
3709 VPADDD Y9, Y5, Y5
3710 VPXOR Y5, Y1, Y1
3711 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3712 VPADDD Y1, Y13, Y13
3713 VPXOR Y13, Y9, Y9
3714 VPSLLD $0x0c, Y9, Y3
3715 VPSRLD $0x14, Y9, Y9
3716 VPXOR Y3, Y9, Y9
3717 VPADDD Y9, Y5, Y5
3718 VPXOR Y5, Y1, Y1
3719 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3720 VPADDD Y1, Y13, Y13
3721 VPXOR Y13, Y9, Y9
3722 VPSLLD $0x07, Y9, Y3
3723 VPSRLD $0x19, Y9, Y9
3724 VPXOR Y3, Y9, Y9
3725 VPALIGNR $0x04, Y9, Y9, Y9
3726 VPALIGNR $0x08, Y13, Y13, Y13
3727 VPALIGNR $0x0c, Y1, Y1, Y1
3728 VPADDD Y9, Y5, Y5
3729 VPXOR Y5, Y1, Y1
3730 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3731 VPADDD Y1, Y13, Y13
3732 VPXOR Y13, Y9, Y9
3733 VPSLLD $0x0c, Y9, Y3
3734 VPSRLD $0x14, Y9, Y9
3735 VPXOR Y3, Y9, Y9
3736 VPADDD Y9, Y5, Y5
3737 VPXOR Y5, Y1, Y1
3738 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3739 VPADDD Y1, Y13, Y13
3740 VPXOR Y13, Y9, Y9
3741 VPSLLD $0x07, Y9, Y3
3742 VPSRLD $0x19, Y9, Y9
3743 VPXOR Y3, Y9, Y9
3744 VPALIGNR $0x0c, Y9, Y9, Y9
3745 VPALIGNR $0x08, Y13, Y13, Y13
3746 VPALIGNR $0x04, Y1, Y1, Y1
3747 CMPQ R9, CX
3748 JB openAVX2Tail128LoopA
3749 CMPQ R9, $0xa0
3750 JNE openAVX2Tail128LoopB
3751 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
3752 VPADDD 32(BP), Y9, Y9
3753 VPADDD 64(BP), Y13, Y13
3754 VPADDD Y4, Y1, Y1
3755 VPERM2I128 $0x02, Y5, Y9, Y0
3756 VPERM2I128 $0x02, Y13, Y1, Y14
3757 VPERM2I128 $0x13, Y5, Y9, Y12
3758 VPERM2I128 $0x13, Y13, Y1, Y4
3759
openAVX2TailLoop:
	// Drain the prepared keystream (Y0, then Y14, Y12, Y4 in turn) against
	// the ciphertext, 32 bytes at a time. This data was already hashed by
	// the tail round loops, so only decryption happens here.
	CMPQ BX, $0x20
	JB openAVX2Tail
	SUBQ $0x20, BX

	// Load for decryption
	VPXOR (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ 32(SI), SI
	LEAQ 32(DI), DI
	// Shift the keystream queue left by one register.
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	JMP openAVX2TailLoop

openAVX2Tail:
	// Fewer than 32 bytes left. X1 is loaded with the low 16 keystream
	// bytes up front so openSSETail16 always finds the right remainder.
	CMPQ BX, $0x10
	VMOVDQA X0, X1
	JB openAVX2TailDone
	SUBQ $0x10, BX

	// Load for decryption
	VPXOR (SI), X0, X12
	VMOVDQU X12, (DI)
	LEAQ 16(SI), SI
	LEAQ 16(DI), DI
	// Bring the high 128 bits of Y0 down for the final sub-16-byte chunk.
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA X0, X1

openAVX2TailDone:
	// Clear the upper ymm state before falling back to the SSE tail path.
	VZEROUPPER
	JMP openSSETail16
3792
openAVX2Tail256:
	// Need to decrypt up to 256 bytes - prepare four blocks (two 2-block
	// ymm states: Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1).
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	// Y7/Y11 preserve the counter rows for the post-rounds feed-forward.
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

	// Compute the number of iterations that will hash data
	// CX = min((BX - 128) / 16, 10): one 16-byte Poly1305 chunk is absorbed
	// per double-round iteration, capped at the 10 iterations the cipher
	// runs. The full remaining length is saved at 224(BP) for later.
	MOVQ BX, 224(BP)
	MOVQ BX, CX
	SUBQ $0x80, CX
	SHRQ $0x04, CX
	MOVQ $0x0000000a, R9
	CMPQ CX, $0x0a
	CMOVQGT R9, CX
	// BX walks the ciphertext being hashed; R9 counts iterations.
	MOVQ SI, BX
	XORQ R9, R9
3816
3817openAVX2Tail256LoopA:
3818 ADDQ (BX), R10
3819 ADCQ 8(BX), R11
3820 ADCQ $0x01, R12
3821 MOVQ (BP), DX
3822 MOVQ DX, R15
3823 MULXQ R10, R13, R14
3824 IMULQ R12, R15
3825 MULXQ R11, AX, DX
3826 ADDQ AX, R14
3827 ADCQ DX, R15
3828 MOVQ 8(BP), DX
3829 MULXQ R10, R10, AX
3830 ADDQ R10, R14
3831 MULXQ R11, R11, R8
3832 ADCQ R11, R15
3833 ADCQ $0x00, R8
3834 IMULQ R12, DX
3835 ADDQ AX, R15
3836 ADCQ DX, R8
3837 MOVQ R13, R10
3838 MOVQ R14, R11
3839 MOVQ R15, R12
3840 ANDQ $0x03, R12
3841 MOVQ R15, R13
3842 ANDQ $-4, R13
3843 MOVQ R8, R14
3844 SHRQ $0x02, R8, R15
3845 SHRQ $0x02, R8
3846 ADDQ R13, R10
3847 ADCQ R14, R11
3848 ADCQ $0x00, R12
3849 ADDQ R15, R10
3850 ADCQ R8, R11
3851 ADCQ $0x00, R12
3852 LEAQ 16(BX), BX
3853
3854openAVX2Tail256LoopB:
3855 VPADDD Y14, Y0, Y0
3856 VPXOR Y0, Y4, Y4
3857 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3858 VPADDD Y4, Y12, Y12
3859 VPXOR Y12, Y14, Y14
3860 VPSLLD $0x0c, Y14, Y3
3861 VPSRLD $0x14, Y14, Y14
3862 VPXOR Y3, Y14, Y14
3863 VPADDD Y14, Y0, Y0
3864 VPXOR Y0, Y4, Y4
3865 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3866 VPADDD Y4, Y12, Y12
3867 VPXOR Y12, Y14, Y14
3868 VPSLLD $0x07, Y14, Y3
3869 VPSRLD $0x19, Y14, Y14
3870 VPXOR Y3, Y14, Y14
3871 VPADDD Y9, Y5, Y5
3872 VPXOR Y5, Y1, Y1
3873 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3874 VPADDD Y1, Y13, Y13
3875 VPXOR Y13, Y9, Y9
3876 VPSLLD $0x0c, Y9, Y3
3877 VPSRLD $0x14, Y9, Y9
3878 VPXOR Y3, Y9, Y9
3879 VPADDD Y9, Y5, Y5
3880 VPXOR Y5, Y1, Y1
3881 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3882 VPADDD Y1, Y13, Y13
3883 VPXOR Y13, Y9, Y9
3884 VPSLLD $0x07, Y9, Y3
3885 VPSRLD $0x19, Y9, Y9
3886 VPXOR Y3, Y9, Y9
3887 VPALIGNR $0x04, Y14, Y14, Y14
3888 VPALIGNR $0x04, Y9, Y9, Y9
3889 VPALIGNR $0x08, Y12, Y12, Y12
3890 VPALIGNR $0x08, Y13, Y13, Y13
3891 VPALIGNR $0x0c, Y4, Y4, Y4
3892 VPALIGNR $0x0c, Y1, Y1, Y1
3893 INCQ R9
3894 VPADDD Y14, Y0, Y0
3895 VPXOR Y0, Y4, Y4
3896 VPSHUFB ·rol16<>+0(SB), Y4, Y4
3897 VPADDD Y4, Y12, Y12
3898 VPXOR Y12, Y14, Y14
3899 VPSLLD $0x0c, Y14, Y3
3900 VPSRLD $0x14, Y14, Y14
3901 VPXOR Y3, Y14, Y14
3902 VPADDD Y14, Y0, Y0
3903 VPXOR Y0, Y4, Y4
3904 VPSHUFB ·rol8<>+0(SB), Y4, Y4
3905 VPADDD Y4, Y12, Y12
3906 VPXOR Y12, Y14, Y14
3907 VPSLLD $0x07, Y14, Y3
3908 VPSRLD $0x19, Y14, Y14
3909 VPXOR Y3, Y14, Y14
3910 VPADDD Y9, Y5, Y5
3911 VPXOR Y5, Y1, Y1
3912 VPSHUFB ·rol16<>+0(SB), Y1, Y1
3913 VPADDD Y1, Y13, Y13
3914 VPXOR Y13, Y9, Y9
3915 VPSLLD $0x0c, Y9, Y3
3916 VPSRLD $0x14, Y9, Y9
3917 VPXOR Y3, Y9, Y9
3918 VPADDD Y9, Y5, Y5
3919 VPXOR Y5, Y1, Y1
3920 VPSHUFB ·rol8<>+0(SB), Y1, Y1
3921 VPADDD Y1, Y13, Y13
3922 VPXOR Y13, Y9, Y9
3923 VPSLLD $0x07, Y9, Y3
3924 VPSRLD $0x19, Y9, Y9
3925 VPXOR Y3, Y9, Y9
3926 VPALIGNR $0x0c, Y14, Y14, Y14
3927 VPALIGNR $0x0c, Y9, Y9, Y9
3928 VPALIGNR $0x08, Y12, Y12, Y12
3929 VPALIGNR $0x08, Y13, Y13, Y13
3930 VPALIGNR $0x04, Y4, Y4, Y4
3931 VPALIGNR $0x04, Y1, Y1, Y1
3932 CMPQ R9, CX
3933 JB openAVX2Tail256LoopA
3934 CMPQ R9, $0x0a
3935 JNE openAVX2Tail256LoopB
	// Rounds done: R9 = current hash pointer, CX = bytes hashed so far
	// (hash pointer minus SI); restore BX to the remaining length that was
	// saved at 224(BP) before the round loop.
	MOVQ BX, R9
	SUBQ SI, BX
	MOVQ BX, CX
	MOVQ 224(BP), BX
3940
3941openAVX2Tail256Hash:
3942 ADDQ $0x10, CX
3943 CMPQ CX, BX
3944 JGT openAVX2Tail256HashEnd
3945 ADDQ (R9), R10
3946 ADCQ 8(R9), R11
3947 ADCQ $0x01, R12
3948 MOVQ (BP), DX
3949 MOVQ DX, R15
3950 MULXQ R10, R13, R14
3951 IMULQ R12, R15
3952 MULXQ R11, AX, DX
3953 ADDQ AX, R14
3954 ADCQ DX, R15
3955 MOVQ 8(BP), DX
3956 MULXQ R10, R10, AX
3957 ADDQ R10, R14
3958 MULXQ R11, R11, R8
3959 ADCQ R11, R15
3960 ADCQ $0x00, R8
3961 IMULQ R12, DX
3962 ADDQ AX, R15
3963 ADCQ DX, R8
3964 MOVQ R13, R10
3965 MOVQ R14, R11
3966 MOVQ R15, R12
3967 ANDQ $0x03, R12
3968 MOVQ R15, R13
3969 ANDQ $-4, R13
3970 MOVQ R8, R14
3971 SHRQ $0x02, R8, R15
3972 SHRQ $0x02, R8
3973 ADDQ R13, R10
3974 ADCQ R14, R11
3975 ADCQ $0x00, R12
3976 ADDQ R15, R10
3977 ADCQ R8, R11
3978 ADCQ $0x00, R12
3979 LEAQ 16(R9), R9
3980 JMP openAVX2Tail256Hash
3981
3982openAVX2Tail256HashEnd:
3983 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
3984 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
3985 VPADDD 32(BP), Y14, Y14
3986 VPADDD 32(BP), Y9, Y9
3987 VPADDD 64(BP), Y12, Y12
3988 VPADDD 64(BP), Y13, Y13
3989 VPADDD Y7, Y4, Y4
3990 VPADDD Y11, Y1, Y1
3991 VPERM2I128 $0x02, Y0, Y14, Y6
3992 VPERM2I128 $0x02, Y12, Y4, Y10
3993 VPERM2I128 $0x13, Y0, Y14, Y8
3994 VPERM2I128 $0x13, Y12, Y4, Y2
3995 VPERM2I128 $0x02, Y5, Y9, Y0
3996 VPERM2I128 $0x02, Y13, Y1, Y14
3997 VPERM2I128 $0x13, Y5, Y9, Y12
3998 VPERM2I128 $0x13, Y13, Y1, Y4
3999 VPXOR (SI), Y6, Y6
4000 VPXOR 32(SI), Y10, Y10
4001 VPXOR 64(SI), Y8, Y8
4002 VPXOR 96(SI), Y2, Y2
4003 VMOVDQU Y6, (DI)
4004 VMOVDQU Y10, 32(DI)
4005 VMOVDQU Y8, 64(DI)
4006 VMOVDQU Y2, 96(DI)
4007 LEAQ 128(SI), SI
4008 LEAQ 128(DI), DI
4009 SUBQ $0x80, BX
4010 JMP openAVX2TailLoop
4011
4012openAVX2Tail384:
4013 // Need to decrypt up to 384 bytes - prepare six blocks
4014 VMOVDQA ·chacha20Constants<>+0(SB), Y0
4015 VMOVDQA Y0, Y5
4016 VMOVDQA Y0, Y6
4017 VMOVDQA 32(BP), Y14
4018 VMOVDQA Y14, Y9
4019 VMOVDQA Y14, Y10
4020 VMOVDQA 64(BP), Y12
4021 VMOVDQA Y12, Y13
4022 VMOVDQA Y12, Y8
4023 VMOVDQA 192(BP), Y4
4024 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
4025 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
4026 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
4027 VMOVDQA Y4, 96(BP)
4028 VMOVDQA Y1, 128(BP)
4029 VMOVDQA Y2, 160(BP)
4030
4031 // Compute the number of iterations that will hash two blocks of data
4032 MOVQ BX, 224(BP)
4033 MOVQ BX, CX
4034 SUBQ $0x00000100, CX
4035 SHRQ $0x04, CX
4036 ADDQ $0x06, CX
4037 MOVQ $0x0000000a, R9
4038 CMPQ CX, $0x0a
4039 CMOVQGT R9, CX
4040 MOVQ SI, BX
4041 XORQ R9, R9
4042
4043openAVX2Tail384LoopB:
4044 ADDQ (BX), R10
4045 ADCQ 8(BX), R11
4046 ADCQ $0x01, R12
4047 MOVQ (BP), DX
4048 MOVQ DX, R15
4049 MULXQ R10, R13, R14
4050 IMULQ R12, R15
4051 MULXQ R11, AX, DX
4052 ADDQ AX, R14
4053 ADCQ DX, R15
4054 MOVQ 8(BP), DX
4055 MULXQ R10, R10, AX
4056 ADDQ R10, R14
4057 MULXQ R11, R11, R8
4058 ADCQ R11, R15
4059 ADCQ $0x00, R8
4060 IMULQ R12, DX
4061 ADDQ AX, R15
4062 ADCQ DX, R8
4063 MOVQ R13, R10
4064 MOVQ R14, R11
4065 MOVQ R15, R12
4066 ANDQ $0x03, R12
4067 MOVQ R15, R13
4068 ANDQ $-4, R13
4069 MOVQ R8, R14
4070 SHRQ $0x02, R8, R15
4071 SHRQ $0x02, R8
4072 ADDQ R13, R10
4073 ADCQ R14, R11
4074 ADCQ $0x00, R12
4075 ADDQ R15, R10
4076 ADCQ R8, R11
4077 ADCQ $0x00, R12
4078 LEAQ 16(BX), BX
4079
4080openAVX2Tail384LoopA:
4081 VPADDD Y14, Y0, Y0
4082 VPXOR Y0, Y4, Y4
4083 VPSHUFB ·rol16<>+0(SB), Y4, Y4
4084 VPADDD Y4, Y12, Y12
4085 VPXOR Y12, Y14, Y14
4086 VPSLLD $0x0c, Y14, Y3
4087 VPSRLD $0x14, Y14, Y14
4088 VPXOR Y3, Y14, Y14
4089 VPADDD Y14, Y0, Y0
4090 VPXOR Y0, Y4, Y4
4091 VPSHUFB ·rol8<>+0(SB), Y4, Y4
4092 VPADDD Y4, Y12, Y12
4093 VPXOR Y12, Y14, Y14
4094 VPSLLD $0x07, Y14, Y3
4095 VPSRLD $0x19, Y14, Y14
4096 VPXOR Y3, Y14, Y14
4097 VPADDD Y9, Y5, Y5
4098 VPXOR Y5, Y1, Y1
4099 VPSHUFB ·rol16<>+0(SB), Y1, Y1
4100 VPADDD Y1, Y13, Y13
4101 VPXOR Y13, Y9, Y9
4102 VPSLLD $0x0c, Y9, Y3
4103 VPSRLD $0x14, Y9, Y9
4104 VPXOR Y3, Y9, Y9
4105 VPADDD Y9, Y5, Y5
4106 VPXOR Y5, Y1, Y1
4107 VPSHUFB ·rol8<>+0(SB), Y1, Y1
4108 VPADDD Y1, Y13, Y13
4109 VPXOR Y13, Y9, Y9
4110 VPSLLD $0x07, Y9, Y3
4111 VPSRLD $0x19, Y9, Y9
4112 VPXOR Y3, Y9, Y9
4113 VPADDD Y10, Y6, Y6
4114 VPXOR Y6, Y2, Y2
4115 VPSHUFB ·rol16<>+0(SB), Y2, Y2
4116 VPADDD Y2, Y8, Y8
4117 VPXOR Y8, Y10, Y10
4118 VPSLLD $0x0c, Y10, Y3
4119 VPSRLD $0x14, Y10, Y10
4120 VPXOR Y3, Y10, Y10
4121 VPADDD Y10, Y6, Y6
4122 VPXOR Y6, Y2, Y2
4123 VPSHUFB ·rol8<>+0(SB), Y2, Y2
4124 VPADDD Y2, Y8, Y8
4125 VPXOR Y8, Y10, Y10
4126 VPSLLD $0x07, Y10, Y3
4127 VPSRLD $0x19, Y10, Y10
4128 VPXOR Y3, Y10, Y10
4129 VPALIGNR $0x04, Y14, Y14, Y14
4130 VPALIGNR $0x04, Y9, Y9, Y9
4131 VPALIGNR $0x04, Y10, Y10, Y10
4132 VPALIGNR $0x08, Y12, Y12, Y12
4133 VPALIGNR $0x08, Y13, Y13, Y13
4134 VPALIGNR $0x08, Y8, Y8, Y8
4135 VPALIGNR $0x0c, Y4, Y4, Y4
4136 VPALIGNR $0x0c, Y1, Y1, Y1
4137 VPALIGNR $0x0c, Y2, Y2, Y2
4138 ADDQ (BX), R10
4139 ADCQ 8(BX), R11
4140 ADCQ $0x01, R12
4141 MOVQ (BP), DX
4142 MOVQ DX, R15
4143 MULXQ R10, R13, R14
4144 IMULQ R12, R15
4145 MULXQ R11, AX, DX
4146 ADDQ AX, R14
4147 ADCQ DX, R15
4148 MOVQ 8(BP), DX
4149 MULXQ R10, R10, AX
4150 ADDQ R10, R14
4151 MULXQ R11, R11, R8
4152 ADCQ R11, R15
4153 ADCQ $0x00, R8
4154 IMULQ R12, DX
4155 ADDQ AX, R15
4156 ADCQ DX, R8
4157 MOVQ R13, R10
4158 MOVQ R14, R11
4159 MOVQ R15, R12
4160 ANDQ $0x03, R12
4161 MOVQ R15, R13
4162 ANDQ $-4, R13
4163 MOVQ R8, R14
4164 SHRQ $0x02, R8, R15
4165 SHRQ $0x02, R8
4166 ADDQ R13, R10
4167 ADCQ R14, R11
4168 ADCQ $0x00, R12
4169 ADDQ R15, R10
4170 ADCQ R8, R11
4171 ADCQ $0x00, R12
4172 LEAQ 16(BX), BX
4173 INCQ R9
4174 VPADDD Y14, Y0, Y0
4175 VPXOR Y0, Y4, Y4
4176 VPSHUFB ·rol16<>+0(SB), Y4, Y4
4177 VPADDD Y4, Y12, Y12
4178 VPXOR Y12, Y14, Y14
4179 VPSLLD $0x0c, Y14, Y3
4180 VPSRLD $0x14, Y14, Y14
4181 VPXOR Y3, Y14, Y14
4182 VPADDD Y14, Y0, Y0
4183 VPXOR Y0, Y4, Y4
4184 VPSHUFB ·rol8<>+0(SB), Y4, Y4
4185 VPADDD Y4, Y12, Y12
4186 VPXOR Y12, Y14, Y14
4187 VPSLLD $0x07, Y14, Y3
4188 VPSRLD $0x19, Y14, Y14
4189 VPXOR Y3, Y14, Y14
4190 VPADDD Y9, Y5, Y5
4191 VPXOR Y5, Y1, Y1
4192 VPSHUFB ·rol16<>+0(SB), Y1, Y1
4193 VPADDD Y1, Y13, Y13
4194 VPXOR Y13, Y9, Y9
4195 VPSLLD $0x0c, Y9, Y3
4196 VPSRLD $0x14, Y9, Y9
4197 VPXOR Y3, Y9, Y9
4198 VPADDD Y9, Y5, Y5
4199 VPXOR Y5, Y1, Y1
4200 VPSHUFB ·rol8<>+0(SB), Y1, Y1
4201 VPADDD Y1, Y13, Y13
4202 VPXOR Y13, Y9, Y9
4203 VPSLLD $0x07, Y9, Y3
4204 VPSRLD $0x19, Y9, Y9
4205 VPXOR Y3, Y9, Y9
4206 VPADDD Y10, Y6, Y6
4207 VPXOR Y6, Y2, Y2
4208 VPSHUFB ·rol16<>+0(SB), Y2, Y2
4209 VPADDD Y2, Y8, Y8
4210 VPXOR Y8, Y10, Y10
4211 VPSLLD $0x0c, Y10, Y3
4212 VPSRLD $0x14, Y10, Y10
4213 VPXOR Y3, Y10, Y10
4214 VPADDD Y10, Y6, Y6
4215 VPXOR Y6, Y2, Y2
4216 VPSHUFB ·rol8<>+0(SB), Y2, Y2
4217 VPADDD Y2, Y8, Y8
4218 VPXOR Y8, Y10, Y10
4219 VPSLLD $0x07, Y10, Y3
4220 VPSRLD $0x19, Y10, Y10
4221 VPXOR Y3, Y10, Y10
4222 VPALIGNR $0x0c, Y14, Y14, Y14
4223 VPALIGNR $0x0c, Y9, Y9, Y9
4224 VPALIGNR $0x0c, Y10, Y10, Y10
4225 VPALIGNR $0x08, Y12, Y12, Y12
4226 VPALIGNR $0x08, Y13, Y13, Y13
4227 VPALIGNR $0x08, Y8, Y8, Y8
4228 VPALIGNR $0x04, Y4, Y4, Y4
4229 VPALIGNR $0x04, Y1, Y1, Y1
4230 VPALIGNR $0x04, Y2, Y2, Y2
4231 CMPQ R9, CX
4232 JB openAVX2Tail384LoopB
4233 CMPQ R9, $0x0a
4234 JNE openAVX2Tail384LoopA
4235 MOVQ BX, R9
4236 SUBQ SI, BX
4237 MOVQ BX, CX
4238 MOVQ 224(BP), BX
4239
4240openAVX2Tail384Hash:
4241 ADDQ $0x10, CX
4242 CMPQ CX, BX
4243 JGT openAVX2Tail384HashEnd
4244 ADDQ (R9), R10
4245 ADCQ 8(R9), R11
4246 ADCQ $0x01, R12
4247 MOVQ (BP), DX
4248 MOVQ DX, R15
4249 MULXQ R10, R13, R14
4250 IMULQ R12, R15
4251 MULXQ R11, AX, DX
4252 ADDQ AX, R14
4253 ADCQ DX, R15
4254 MOVQ 8(BP), DX
4255 MULXQ R10, R10, AX
4256 ADDQ R10, R14
4257 MULXQ R11, R11, R8
4258 ADCQ R11, R15
4259 ADCQ $0x00, R8
4260 IMULQ R12, DX
4261 ADDQ AX, R15
4262 ADCQ DX, R8
4263 MOVQ R13, R10
4264 MOVQ R14, R11
4265 MOVQ R15, R12
4266 ANDQ $0x03, R12
4267 MOVQ R15, R13
4268 ANDQ $-4, R13
4269 MOVQ R8, R14
4270 SHRQ $0x02, R8, R15
4271 SHRQ $0x02, R8
4272 ADDQ R13, R10
4273 ADCQ R14, R11
4274 ADCQ $0x00, R12
4275 ADDQ R15, R10
4276 ADCQ R8, R11
4277 ADCQ $0x00, R12
4278 LEAQ 16(R9), R9
4279 JMP openAVX2Tail384Hash
4280
4281openAVX2Tail384HashEnd:
4282 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
4283 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
4284 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
4285 VPADDD 32(BP), Y14, Y14
4286 VPADDD 32(BP), Y9, Y9
4287 VPADDD 32(BP), Y10, Y10
4288 VPADDD 64(BP), Y12, Y12
4289 VPADDD 64(BP), Y13, Y13
4290 VPADDD 64(BP), Y8, Y8
4291 VPADDD 96(BP), Y4, Y4
4292 VPADDD 128(BP), Y1, Y1
4293 VPADDD 160(BP), Y2, Y2
4294 VPERM2I128 $0x02, Y0, Y14, Y3
4295 VPERM2I128 $0x02, Y12, Y4, Y7
4296 VPERM2I128 $0x13, Y0, Y14, Y11
4297 VPERM2I128 $0x13, Y12, Y4, Y15
4298 VPXOR (SI), Y3, Y3
4299 VPXOR 32(SI), Y7, Y7
4300 VPXOR 64(SI), Y11, Y11
4301 VPXOR 96(SI), Y15, Y15
4302 VMOVDQU Y3, (DI)
4303 VMOVDQU Y7, 32(DI)
4304 VMOVDQU Y11, 64(DI)
4305 VMOVDQU Y15, 96(DI)
4306 VPERM2I128 $0x02, Y5, Y9, Y3
4307 VPERM2I128 $0x02, Y13, Y1, Y7
4308 VPERM2I128 $0x13, Y5, Y9, Y11
4309 VPERM2I128 $0x13, Y13, Y1, Y15
4310 VPXOR 128(SI), Y3, Y3
4311 VPXOR 160(SI), Y7, Y7
4312 VPXOR 192(SI), Y11, Y11
4313 VPXOR 224(SI), Y15, Y15
4314 VMOVDQU Y3, 128(DI)
4315 VMOVDQU Y7, 160(DI)
4316 VMOVDQU Y11, 192(DI)
4317 VMOVDQU Y15, 224(DI)
4318 VPERM2I128 $0x02, Y6, Y10, Y0
4319 VPERM2I128 $0x02, Y8, Y2, Y14
4320 VPERM2I128 $0x13, Y6, Y10, Y12
4321 VPERM2I128 $0x13, Y8, Y2, Y4
4322 LEAQ 256(SI), SI
4323 LEAQ 256(DI), DI
4324 SUBQ $0x00000100, BX
4325 JMP openAVX2TailLoop
4326
openAVX2Tail512:
	// Need to decrypt up to 512 bytes - prepare eight blocks (four 2-block
	// ymm states in the Y0/Y5/Y6/Y7 columns).
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	// Spill the four incremented counter rows to the frame. Note 224(BP)
	// doubles as the Y15 scratch slot inside the round loop below.
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	// CX counts double-round iterations; R9 walks the ciphertext that gets
	// Poly1305-hashed interleaved with the rounds.
	XORQ CX, CX
	MOVQ SI, R9
4351
4352openAVX2Tail512LoopB:
4353 ADDQ (R9), R10
4354 ADCQ 8(R9), R11
4355 ADCQ $0x01, R12
4356 MOVQ (BP), DX
4357 MOVQ DX, R15
4358 MULXQ R10, R13, R14
4359 IMULQ R12, R15
4360 MULXQ R11, AX, DX
4361 ADDQ AX, R14
4362 ADCQ DX, R15
4363 MOVQ 8(BP), DX
4364 MULXQ R10, R10, AX
4365 ADDQ R10, R14
4366 MULXQ R11, R11, R8
4367 ADCQ R11, R15
4368 ADCQ $0x00, R8
4369 IMULQ R12, DX
4370 ADDQ AX, R15
4371 ADCQ DX, R8
4372 MOVQ R13, R10
4373 MOVQ R14, R11
4374 MOVQ R15, R12
4375 ANDQ $0x03, R12
4376 MOVQ R15, R13
4377 ANDQ $-4, R13
4378 MOVQ R8, R14
4379 SHRQ $0x02, R8, R15
4380 SHRQ $0x02, R8
4381 ADDQ R13, R10
4382 ADCQ R14, R11
4383 ADCQ $0x00, R12
4384 ADDQ R15, R10
4385 ADCQ R8, R11
4386 ADCQ $0x00, R12
4387 LEAQ 16(R9), R9
4388
4389openAVX2Tail512LoopA:
4390 VPADDD Y14, Y0, Y0
4391 VPADDD Y9, Y5, Y5
4392 VPADDD Y10, Y6, Y6
4393 VPADDD Y11, Y7, Y7
4394 VPXOR Y0, Y4, Y4
4395 VPXOR Y5, Y1, Y1
4396 VPXOR Y6, Y2, Y2
4397 VPXOR Y7, Y3, Y3
4398 VPSHUFB ·rol16<>+0(SB), Y4, Y4
4399 VPSHUFB ·rol16<>+0(SB), Y1, Y1
4400 VPSHUFB ·rol16<>+0(SB), Y2, Y2
4401 VPSHUFB ·rol16<>+0(SB), Y3, Y3
4402 VPADDD Y4, Y12, Y12
4403 VPADDD Y1, Y13, Y13
4404 VPADDD Y2, Y8, Y8
4405 VPADDD Y3, Y15, Y15
4406 VPXOR Y12, Y14, Y14
4407 VPXOR Y13, Y9, Y9
4408 VPXOR Y8, Y10, Y10
4409 VPXOR Y15, Y11, Y11
4410 VMOVDQA Y15, 224(BP)
4411 VPSLLD $0x0c, Y14, Y15
4412 VPSRLD $0x14, Y14, Y14
4413 VPXOR Y15, Y14, Y14
4414 VPSLLD $0x0c, Y9, Y15
4415 VPSRLD $0x14, Y9, Y9
4416 VPXOR Y15, Y9, Y9
4417 VPSLLD $0x0c, Y10, Y15
4418 VPSRLD $0x14, Y10, Y10
4419 VPXOR Y15, Y10, Y10
4420 VPSLLD $0x0c, Y11, Y15
4421 VPSRLD $0x14, Y11, Y11
4422 VPXOR Y15, Y11, Y11
4423 VMOVDQA 224(BP), Y15
4424 ADDQ (R9), R10
4425 ADCQ 8(R9), R11
4426 ADCQ $0x01, R12
4427 MOVQ (BP), DX
4428 MOVQ DX, R15
4429 MULXQ R10, R13, R14
4430 IMULQ R12, R15
4431 MULXQ R11, AX, DX
4432 ADDQ AX, R14
4433 ADCQ DX, R15
4434 MOVQ 8(BP), DX
4435 MULXQ R10, R10, AX
4436 ADDQ R10, R14
4437 MULXQ R11, R11, R8
4438 ADCQ R11, R15
4439 ADCQ $0x00, R8
4440 IMULQ R12, DX
4441 ADDQ AX, R15
4442 ADCQ DX, R8
4443 MOVQ R13, R10
4444 MOVQ R14, R11
4445 MOVQ R15, R12
4446 ANDQ $0x03, R12
4447 MOVQ R15, R13
4448 ANDQ $-4, R13
4449 MOVQ R8, R14
4450 SHRQ $0x02, R8, R15
4451 SHRQ $0x02, R8
4452 ADDQ R13, R10
4453 ADCQ R14, R11
4454 ADCQ $0x00, R12
4455 ADDQ R15, R10
4456 ADCQ R8, R11
4457 ADCQ $0x00, R12
4458 VPADDD Y14, Y0, Y0
4459 VPADDD Y9, Y5, Y5
4460 VPADDD Y10, Y6, Y6
4461 VPADDD Y11, Y7, Y7
4462 VPXOR Y0, Y4, Y4
4463 VPXOR Y5, Y1, Y1
4464 VPXOR Y6, Y2, Y2
4465 VPXOR Y7, Y3, Y3
4466 VPSHUFB ·rol8<>+0(SB), Y4, Y4
4467 VPSHUFB ·rol8<>+0(SB), Y1, Y1
4468 VPSHUFB ·rol8<>+0(SB), Y2, Y2
4469 VPSHUFB ·rol8<>+0(SB), Y3, Y3
4470 VPADDD Y4, Y12, Y12
4471 VPADDD Y1, Y13, Y13
4472 VPADDD Y2, Y8, Y8
4473 VPADDD Y3, Y15, Y15
4474 VPXOR Y12, Y14, Y14
4475 VPXOR Y13, Y9, Y9
4476 VPXOR Y8, Y10, Y10
4477 VPXOR Y15, Y11, Y11
4478 VMOVDQA Y15, 224(BP)
4479 VPSLLD $0x07, Y14, Y15
4480 VPSRLD $0x19, Y14, Y14
4481 VPXOR Y15, Y14, Y14
4482 VPSLLD $0x07, Y9, Y15
4483 VPSRLD $0x19, Y9, Y9
4484 VPXOR Y15, Y9, Y9
4485 VPSLLD $0x07, Y10, Y15
4486 VPSRLD $0x19, Y10, Y10
4487 VPXOR Y15, Y10, Y10
4488 VPSLLD $0x07, Y11, Y15
4489 VPSRLD $0x19, Y11, Y11
4490 VPXOR Y15, Y11, Y11
4491 VMOVDQA 224(BP), Y15
4492 VPALIGNR $0x04, Y14, Y14, Y14
4493 VPALIGNR $0x04, Y9, Y9, Y9
4494 VPALIGNR $0x04, Y10, Y10, Y10
4495 VPALIGNR $0x04, Y11, Y11, Y11
4496 VPALIGNR $0x08, Y12, Y12, Y12
4497 VPALIGNR $0x08, Y13, Y13, Y13
4498 VPALIGNR $0x08, Y8, Y8, Y8
4499 VPALIGNR $0x08, Y15, Y15, Y15
4500 VPALIGNR $0x0c, Y4, Y4, Y4
4501 VPALIGNR $0x0c, Y1, Y1, Y1
4502 VPALIGNR $0x0c, Y2, Y2, Y2
4503 VPALIGNR $0x0c, Y3, Y3, Y3
4504 VPADDD Y14, Y0, Y0
4505 VPADDD Y9, Y5, Y5
4506 VPADDD Y10, Y6, Y6
4507 VPADDD Y11, Y7, Y7
4508 VPXOR Y0, Y4, Y4
4509 VPXOR Y5, Y1, Y1
4510 VPXOR Y6, Y2, Y2
4511 VPXOR Y7, Y3, Y3
4512 VPSHUFB ·rol16<>+0(SB), Y4, Y4
4513 VPSHUFB ·rol16<>+0(SB), Y1, Y1
4514 VPSHUFB ·rol16<>+0(SB), Y2, Y2
4515 VPSHUFB ·rol16<>+0(SB), Y3, Y3
4516 VPADDD Y4, Y12, Y12
4517 VPADDD Y1, Y13, Y13
4518 VPADDD Y2, Y8, Y8
4519 VPADDD Y3, Y15, Y15
4520 VPXOR Y12, Y14, Y14
4521 VPXOR Y13, Y9, Y9
4522 VPXOR Y8, Y10, Y10
4523 VPXOR Y15, Y11, Y11
4524 ADDQ 16(R9), R10
4525 ADCQ 24(R9), R11
4526 ADCQ $0x01, R12
4527 MOVQ (BP), DX
4528 MOVQ DX, R15
4529 MULXQ R10, R13, R14
4530 IMULQ R12, R15
4531 MULXQ R11, AX, DX
4532 ADDQ AX, R14
4533 ADCQ DX, R15
4534 MOVQ 8(BP), DX
4535 MULXQ R10, R10, AX
4536 ADDQ R10, R14
4537 MULXQ R11, R11, R8
4538 ADCQ R11, R15
4539 ADCQ $0x00, R8
4540 IMULQ R12, DX
4541 ADDQ AX, R15
4542 ADCQ DX, R8
4543 MOVQ R13, R10
4544 MOVQ R14, R11
4545 MOVQ R15, R12
4546 ANDQ $0x03, R12
4547 MOVQ R15, R13
4548 ANDQ $-4, R13
4549 MOVQ R8, R14
4550 SHRQ $0x02, R8, R15
4551 SHRQ $0x02, R8
4552 ADDQ R13, R10
4553 ADCQ R14, R11
4554 ADCQ $0x00, R12
4555 ADDQ R15, R10
4556 ADCQ R8, R11
4557 ADCQ $0x00, R12
4558 LEAQ 32(R9), R9
4559 VMOVDQA Y15, 224(BP)
4560 VPSLLD $0x0c, Y14, Y15
4561 VPSRLD $0x14, Y14, Y14
4562 VPXOR Y15, Y14, Y14
4563 VPSLLD $0x0c, Y9, Y15
4564 VPSRLD $0x14, Y9, Y9
4565 VPXOR Y15, Y9, Y9
4566 VPSLLD $0x0c, Y10, Y15
4567 VPSRLD $0x14, Y10, Y10
4568 VPXOR Y15, Y10, Y10
4569 VPSLLD $0x0c, Y11, Y15
4570 VPSRLD $0x14, Y11, Y11
4571 VPXOR Y15, Y11, Y11
4572 VMOVDQA 224(BP), Y15
4573 VPADDD Y14, Y0, Y0
4574 VPADDD Y9, Y5, Y5
4575 VPADDD Y10, Y6, Y6
4576 VPADDD Y11, Y7, Y7
4577 VPXOR Y0, Y4, Y4
4578 VPXOR Y5, Y1, Y1
4579 VPXOR Y6, Y2, Y2
4580 VPXOR Y7, Y3, Y3
4581 VPSHUFB ·rol8<>+0(SB), Y4, Y4
4582 VPSHUFB ·rol8<>+0(SB), Y1, Y1
4583 VPSHUFB ·rol8<>+0(SB), Y2, Y2
4584 VPSHUFB ·rol8<>+0(SB), Y3, Y3
4585 VPADDD Y4, Y12, Y12
4586 VPADDD Y1, Y13, Y13
4587 VPADDD Y2, Y8, Y8
4588 VPADDD Y3, Y15, Y15
4589 VPXOR Y12, Y14, Y14
4590 VPXOR Y13, Y9, Y9
4591 VPXOR Y8, Y10, Y10
4592 VPXOR Y15, Y11, Y11
4593 VMOVDQA Y15, 224(BP)
4594 VPSLLD $0x07, Y14, Y15
4595 VPSRLD $0x19, Y14, Y14
4596 VPXOR Y15, Y14, Y14
4597 VPSLLD $0x07, Y9, Y15
4598 VPSRLD $0x19, Y9, Y9
4599 VPXOR Y15, Y9, Y9
4600 VPSLLD $0x07, Y10, Y15
4601 VPSRLD $0x19, Y10, Y10
4602 VPXOR Y15, Y10, Y10
4603 VPSLLD $0x07, Y11, Y15
4604 VPSRLD $0x19, Y11, Y11
4605 VPXOR Y15, Y11, Y11
4606 VMOVDQA 224(BP), Y15
4607 VPALIGNR $0x0c, Y14, Y14, Y14
4608 VPALIGNR $0x0c, Y9, Y9, Y9
4609 VPALIGNR $0x0c, Y10, Y10, Y10
4610 VPALIGNR $0x0c, Y11, Y11, Y11
4611 VPALIGNR $0x08, Y12, Y12, Y12
4612 VPALIGNR $0x08, Y13, Y13, Y13
4613 VPALIGNR $0x08, Y8, Y8, Y8
4614 VPALIGNR $0x08, Y15, Y15, Y15
4615 VPALIGNR $0x04, Y4, Y4, Y4
4616 VPALIGNR $0x04, Y1, Y1, Y1
4617 VPALIGNR $0x04, Y2, Y2, Y2
4618 VPALIGNR $0x04, Y3, Y3, Y3
4619 INCQ CX
4620 CMPQ CX, $0x04
4621 JLT openAVX2Tail512LoopB
4622 CMPQ CX, $0x0a
4623 JNE openAVX2Tail512LoopA
4624 MOVQ BX, CX
4625 SUBQ $0x00000180, CX
4626 ANDQ $-16, CX
4627
4628openAVX2Tail512HashLoop:
4629 TESTQ CX, CX
4630 JE openAVX2Tail512HashEnd
4631 ADDQ (R9), R10
4632 ADCQ 8(R9), R11
4633 ADCQ $0x01, R12
4634 MOVQ (BP), DX
4635 MOVQ DX, R15
4636 MULXQ R10, R13, R14
4637 IMULQ R12, R15
4638 MULXQ R11, AX, DX
4639 ADDQ AX, R14
4640 ADCQ DX, R15
4641 MOVQ 8(BP), DX
4642 MULXQ R10, R10, AX
4643 ADDQ R10, R14
4644 MULXQ R11, R11, R8
4645 ADCQ R11, R15
4646 ADCQ $0x00, R8
4647 IMULQ R12, DX
4648 ADDQ AX, R15
4649 ADCQ DX, R8
4650 MOVQ R13, R10
4651 MOVQ R14, R11
4652 MOVQ R15, R12
4653 ANDQ $0x03, R12
4654 MOVQ R15, R13
4655 ANDQ $-4, R13
4656 MOVQ R8, R14
4657 SHRQ $0x02, R8, R15
4658 SHRQ $0x02, R8
4659 ADDQ R13, R10
4660 ADCQ R14, R11
4661 ADCQ $0x00, R12
4662 ADDQ R15, R10
4663 ADCQ R8, R11
4664 ADCQ $0x00, R12
4665 LEAQ 16(R9), R9
4666 SUBQ $0x10, CX
4667 JMP openAVX2Tail512HashLoop
4668
4669openAVX2Tail512HashEnd:
4670 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
4671 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
4672 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
4673 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
4674 VPADDD 32(BP), Y14, Y14
4675 VPADDD 32(BP), Y9, Y9
4676 VPADDD 32(BP), Y10, Y10
4677 VPADDD 32(BP), Y11, Y11
4678 VPADDD 64(BP), Y12, Y12
4679 VPADDD 64(BP), Y13, Y13
4680 VPADDD 64(BP), Y8, Y8
4681 VPADDD 64(BP), Y15, Y15
4682 VPADDD 96(BP), Y4, Y4
4683 VPADDD 128(BP), Y1, Y1
4684 VPADDD 160(BP), Y2, Y2
4685 VPADDD 192(BP), Y3, Y3
4686 VMOVDQA Y15, 224(BP)
4687 VPERM2I128 $0x02, Y0, Y14, Y15
4688 VPERM2I128 $0x13, Y0, Y14, Y14
4689 VPERM2I128 $0x02, Y12, Y4, Y0
4690 VPERM2I128 $0x13, Y12, Y4, Y12
4691 VPXOR (SI), Y15, Y15
4692 VPXOR 32(SI), Y0, Y0
4693 VPXOR 64(SI), Y14, Y14
4694 VPXOR 96(SI), Y12, Y12
4695 VMOVDQU Y15, (DI)
4696 VMOVDQU Y0, 32(DI)
4697 VMOVDQU Y14, 64(DI)
4698 VMOVDQU Y12, 96(DI)
4699 VPERM2I128 $0x02, Y5, Y9, Y0
4700 VPERM2I128 $0x02, Y13, Y1, Y14
4701 VPERM2I128 $0x13, Y5, Y9, Y12
4702 VPERM2I128 $0x13, Y13, Y1, Y4
4703 VPXOR 128(SI), Y0, Y0
4704 VPXOR 160(SI), Y14, Y14
4705 VPXOR 192(SI), Y12, Y12
4706 VPXOR 224(SI), Y4, Y4
4707 VMOVDQU Y0, 128(DI)
4708 VMOVDQU Y14, 160(DI)
4709 VMOVDQU Y12, 192(DI)
4710 VMOVDQU Y4, 224(DI)
4711 VPERM2I128 $0x02, Y6, Y10, Y0
4712 VPERM2I128 $0x02, Y8, Y2, Y14
4713 VPERM2I128 $0x13, Y6, Y10, Y12
4714 VPERM2I128 $0x13, Y8, Y2, Y4
4715 VPXOR 256(SI), Y0, Y0
4716 VPXOR 288(SI), Y14, Y14
4717 VPXOR 320(SI), Y12, Y12
4718 VPXOR 352(SI), Y4, Y4
4719 VMOVDQU Y0, 256(DI)
4720 VMOVDQU Y14, 288(DI)
4721 VMOVDQU Y12, 320(DI)
4722 VMOVDQU Y4, 352(DI)
4723 VPERM2I128 $0x02, Y7, Y11, Y0
4724 VPERM2I128 $0x02, 224(BP), Y3, Y14
4725 VPERM2I128 $0x13, Y7, Y11, Y12
4726 VPERM2I128 $0x13, 224(BP), Y3, Y4
4727 LEAQ 384(SI), SI
4728 LEAQ 384(DI), DI
4729 SUBQ $0x00000180, BX
4730 JMP openAVX2TailLoop
4731
// ChaCha20 state row 0: the ASCII constant "expand 32-byte k" as four
// little-endian uint32s (RFC 8439, section 2.3). The 16 bytes are
// duplicated so a single 256-bit AVX2 load seeds both 128-bit lanes
// with the same constant row.
DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
4741
// Poly1305 key clamp (RFC 8439, section 2.5.1): ANDing the low 16 bytes
// clears the required bits of the "r" half of the one-time key. The high
// 16 bytes are all-ones so that when the full 32-byte key is masked in
// one 256-bit AND, the "s" half passes through unchanged.
DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
4747
// Adds 1 to the 32-bit block counter held in the low dword of a 128-bit
// ChaCha20 counter row; applied with PADDL when deriving consecutive
// block states (see the counter-increment sequences in the seal/open code).
DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
4751
// Table of 15 incremental byte masks, 16 bytes each (240 bytes total).
// Entry n (at offset n*16, n = 0..14) keeps the low n+1 bytes of a
// 128-bit value and zeroes the rest. Presumably indexed by the length of
// a partial trailing block to isolate its valid bytes — the loads from
// this table are not visible in this chunk; confirm against the tail
// handling in the seal/open paths.
DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
DATA ·andMask<>+8(SB)/8, $0x0000000000000000
DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
DATA ·andMask<>+24(SB)/8, $0x0000000000000000
DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+40(SB)/8, $0x0000000000000000
DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+56(SB)/8, $0x0000000000000000
DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+72(SB)/8, $0x0000000000000000
DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+88(SB)/8, $0x0000000000000000
DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+104(SB)/8, $0x0000000000000000
DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+120(SB)/8, $0x0000000000000000
DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
4783
// 256-bit value {0, 0, 1, 0} (qwords, low to high): adds 1 to the block
// counter of the high 128-bit lane only. Added once to a broadcast
// counter row so the two lanes of an AVX2 state process consecutive
// block numbers.
DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
4789
// PSHUFB/VPSHUFB control mask that rotates each 32-bit lane left by 16
// bits: per dword the byte order 2,3,0,1 swaps the two 16-bit halves.
// The pattern is duplicated for the high 128-bit lane so the same
// constant serves both SSE (low 16 bytes) and AVX2 (all 32 bytes).
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA ·rol16<>+16(SB)/8, $0x0504070601000302
DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
4795
// PSHUFB/VPSHUFB control mask that rotates each 32-bit lane left by 8
// bits: per dword the byte order 3,0,1,2 moves every byte up one
// position with wraparound. Duplicated for the high 128-bit lane so the
// same constant serves both SSE and AVX2 code paths.
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA ·rol8<>+16(SB)/8, $0x0605040702010003
DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
4801
// Adds 2 to the block counter in the low dword of each 128-bit lane of
// an AVX2 counter row — presumably because each 256-bit state row covers
// two interleaved ChaCha20 blocks, so counters advance in steps of two.
DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
4807
4808// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
4809// Requires: AVX, AVX2, BMI2, CMOV, SSE2
4810TEXT ·chacha20Poly1305Seal(SB), $288-96
4811 MOVQ SP, BP
4812 ADDQ $0x20, BP
4813 ANDQ $-32, BP
4814 MOVQ dst_base+0(FP), DI
4815 MOVQ key_base+24(FP), R8
4816 MOVQ src_base+48(FP), SI
4817 MOVQ src_len+56(FP), BX
4818 MOVQ ad_base+72(FP), CX
4819 CMPB ·useAVX2+0(SB), $0x01
4820 JE chacha20Poly1305Seal_AVX2
4821
4822 // Special optimization, for very short buffers
4823 CMPQ BX, $0x80
4824 JBE sealSSE128
4825
4826 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
4827 MOVOU ·chacha20Constants<>+0(SB), X0
4828 MOVOU 16(R8), X3
4829 MOVOU 32(R8), X6
4830 MOVOU 48(R8), X9
4831
4832 // Store state on stack for future use
4833 MOVO X3, 32(BP)
4834 MOVO X6, 48(BP)
4835
4836 // Load state, increment counter blocks
4837 MOVO X0, X1
4838 MOVO X3, X4
4839 MOVO X6, X7
4840 MOVO X9, X10
4841 PADDL ·sseIncMask<>+0(SB), X10
4842 MOVO X1, X2
4843 MOVO X4, X5
4844 MOVO X7, X8
4845 MOVO X10, X11
4846 PADDL ·sseIncMask<>+0(SB), X11
4847 MOVO X2, X12
4848 MOVO X5, X13
4849 MOVO X8, X14
4850 MOVO X11, X15
4851 PADDL ·sseIncMask<>+0(SB), X15
4852
4853 // Store counters
4854 MOVO X9, 80(BP)
4855 MOVO X10, 96(BP)
4856 MOVO X11, 112(BP)
4857 MOVO X15, 128(BP)
4858 MOVQ $0x0000000a, R9
4859
4860sealSSEIntroLoop:
4861 MOVO X14, 64(BP)
4862 PADDD X3, X0
4863 PXOR X0, X9
4864 ROL16(X9, X14)
4865 PADDD X9, X6
4866 PXOR X6, X3
4867 MOVO X3, X14
4868 PSLLL $0x0c, X14
4869 PSRLL $0x14, X3
4870 PXOR X14, X3
4871 PADDD X3, X0
4872 PXOR X0, X9
4873 ROL8(X9, X14)
4874 PADDD X9, X6
4875 PXOR X6, X3
4876 MOVO X3, X14
4877 PSLLL $0x07, X14
4878 PSRLL $0x19, X3
4879 PXOR X14, X3
4880 PADDD X4, X1
4881 PXOR X1, X10
4882 ROL16(X10, X14)
4883 PADDD X10, X7
4884 PXOR X7, X4
4885 MOVO X4, X14
4886 PSLLL $0x0c, X14
4887 PSRLL $0x14, X4
4888 PXOR X14, X4
4889 PADDD X4, X1
4890 PXOR X1, X10
4891 ROL8(X10, X14)
4892 PADDD X10, X7
4893 PXOR X7, X4
4894 MOVO X4, X14
4895 PSLLL $0x07, X14
4896 PSRLL $0x19, X4
4897 PXOR X14, X4
4898 PADDD X5, X2
4899 PXOR X2, X11
4900 ROL16(X11, X14)
4901 PADDD X11, X8
4902 PXOR X8, X5
4903 MOVO X5, X14
4904 PSLLL $0x0c, X14
4905 PSRLL $0x14, X5
4906 PXOR X14, X5
4907 PADDD X5, X2
4908 PXOR X2, X11
4909 ROL8(X11, X14)
4910 PADDD X11, X8
4911 PXOR X8, X5
4912 MOVO X5, X14
4913 PSLLL $0x07, X14
4914 PSRLL $0x19, X5
4915 PXOR X14, X5
4916 MOVO 64(BP), X14
4917 MOVO X7, 64(BP)
4918 PADDD X13, X12
4919 PXOR X12, X15
4920 ROL16(X15, X7)
4921 PADDD X15, X14
4922 PXOR X14, X13
4923 MOVO X13, X7
4924 PSLLL $0x0c, X7
4925 PSRLL $0x14, X13
4926 PXOR X7, X13
4927 PADDD X13, X12
4928 PXOR X12, X15
4929 ROL8(X15, X7)
4930 PADDD X15, X14
4931 PXOR X14, X13
4932 MOVO X13, X7
4933 PSLLL $0x07, X7
4934 PSRLL $0x19, X13
4935 PXOR X7, X13
4936 MOVO 64(BP), X7
4937 BYTE $0x66
4938 BYTE $0x0f
4939 BYTE $0x3a
4940 BYTE $0x0f
4941 BYTE $0xdb
4942 BYTE $0x04
4943 BYTE $0x66
4944 BYTE $0x0f
4945 BYTE $0x3a
4946 BYTE $0x0f
4947 BYTE $0xe4
4948 BYTE $0x04
4949 BYTE $0x66
4950 BYTE $0x0f
4951 BYTE $0x3a
4952 BYTE $0x0f
4953 BYTE $0xed
4954 BYTE $0x04
4955 BYTE $0x66
4956 BYTE $0x45
4957 BYTE $0x0f
4958 BYTE $0x3a
4959 BYTE $0x0f
4960 BYTE $0xed
4961 BYTE $0x04
4962 BYTE $0x66
4963 BYTE $0x0f
4964 BYTE $0x3a
4965 BYTE $0x0f
4966 BYTE $0xf6
4967 BYTE $0x08
4968 BYTE $0x66
4969 BYTE $0x0f
4970 BYTE $0x3a
4971 BYTE $0x0f
4972 BYTE $0xff
4973 BYTE $0x08
4974 BYTE $0x66
4975 BYTE $0x45
4976 BYTE $0x0f
4977 BYTE $0x3a
4978 BYTE $0x0f
4979 BYTE $0xc0
4980 BYTE $0x08
4981 BYTE $0x66
4982 BYTE $0x45
4983 BYTE $0x0f
4984 BYTE $0x3a
4985 BYTE $0x0f
4986 BYTE $0xf6
4987 BYTE $0x08
4988 BYTE $0x66
4989 BYTE $0x45
4990 BYTE $0x0f
4991 BYTE $0x3a
4992 BYTE $0x0f
4993 BYTE $0xc9
4994 BYTE $0x0c
4995 BYTE $0x66
4996 BYTE $0x45
4997 BYTE $0x0f
4998 BYTE $0x3a
4999 BYTE $0x0f
5000 BYTE $0xd2
5001 BYTE $0x0c
5002 BYTE $0x66
5003 BYTE $0x45
5004 BYTE $0x0f
5005 BYTE $0x3a
5006 BYTE $0x0f
5007 BYTE $0xdb
5008 BYTE $0x0c
5009 BYTE $0x66
5010 BYTE $0x45
5011 BYTE $0x0f
5012 BYTE $0x3a
5013 BYTE $0x0f
5014 BYTE $0xff
5015 BYTE $0x0c
5016 MOVO X14, 64(BP)
5017 PADDD X3, X0
5018 PXOR X0, X9
5019 ROL16(X9, X14)
5020 PADDD X9, X6
5021 PXOR X6, X3
5022 MOVO X3, X14
5023 PSLLL $0x0c, X14
5024 PSRLL $0x14, X3
5025 PXOR X14, X3
5026 PADDD X3, X0
5027 PXOR X0, X9
5028 ROL8(X9, X14)
5029 PADDD X9, X6
5030 PXOR X6, X3
5031 MOVO X3, X14
5032 PSLLL $0x07, X14
5033 PSRLL $0x19, X3
5034 PXOR X14, X3
5035 PADDD X4, X1
5036 PXOR X1, X10
5037 ROL16(X10, X14)
5038 PADDD X10, X7
5039 PXOR X7, X4
5040 MOVO X4, X14
5041 PSLLL $0x0c, X14
5042 PSRLL $0x14, X4
5043 PXOR X14, X4
5044 PADDD X4, X1
5045 PXOR X1, X10
5046 ROL8(X10, X14)
5047 PADDD X10, X7
5048 PXOR X7, X4
5049 MOVO X4, X14
5050 PSLLL $0x07, X14
5051 PSRLL $0x19, X4
5052 PXOR X14, X4
5053 PADDD X5, X2
5054 PXOR X2, X11
5055 ROL16(X11, X14)
5056 PADDD X11, X8
5057 PXOR X8, X5
5058 MOVO X5, X14
5059 PSLLL $0x0c, X14
5060 PSRLL $0x14, X5
5061 PXOR X14, X5
5062 PADDD X5, X2
5063 PXOR X2, X11
5064 ROL8(X11, X14)
5065 PADDD X11, X8
5066 PXOR X8, X5
5067 MOVO X5, X14
5068 PSLLL $0x07, X14
5069 PSRLL $0x19, X5
5070 PXOR X14, X5
5071 MOVO 64(BP), X14
5072 MOVO X7, 64(BP)
5073 PADDD X13, X12
5074 PXOR X12, X15
5075 ROL16(X15, X7)
5076 PADDD X15, X14
5077 PXOR X14, X13
5078 MOVO X13, X7
5079 PSLLL $0x0c, X7
5080 PSRLL $0x14, X13
5081 PXOR X7, X13
5082 PADDD X13, X12
5083 PXOR X12, X15
5084 ROL8(X15, X7)
5085 PADDD X15, X14
5086 PXOR X14, X13
5087 MOVO X13, X7
5088 PSLLL $0x07, X7
5089 PSRLL $0x19, X13
5090 PXOR X7, X13
5091 MOVO 64(BP), X7
5092 BYTE $0x66
5093 BYTE $0x0f
5094 BYTE $0x3a
5095 BYTE $0x0f
5096 BYTE $0xdb
5097 BYTE $0x0c
5098 BYTE $0x66
5099 BYTE $0x0f
5100 BYTE $0x3a
5101 BYTE $0x0f
5102 BYTE $0xe4
5103 BYTE $0x0c
5104 BYTE $0x66
5105 BYTE $0x0f
5106 BYTE $0x3a
5107 BYTE $0x0f
5108 BYTE $0xed
5109 BYTE $0x0c
5110 BYTE $0x66
5111 BYTE $0x45
5112 BYTE $0x0f
5113 BYTE $0x3a
5114 BYTE $0x0f
5115 BYTE $0xed
5116 BYTE $0x0c
5117 BYTE $0x66
5118 BYTE $0x0f
5119 BYTE $0x3a
5120 BYTE $0x0f
5121 BYTE $0xf6
5122 BYTE $0x08
5123 BYTE $0x66
5124 BYTE $0x0f
5125 BYTE $0x3a
5126 BYTE $0x0f
5127 BYTE $0xff
5128 BYTE $0x08
5129 BYTE $0x66
5130 BYTE $0x45
5131 BYTE $0x0f
5132 BYTE $0x3a
5133 BYTE $0x0f
5134 BYTE $0xc0
5135 BYTE $0x08
5136 BYTE $0x66
5137 BYTE $0x45
5138 BYTE $0x0f
5139 BYTE $0x3a
5140 BYTE $0x0f
5141 BYTE $0xf6
5142 BYTE $0x08
5143 BYTE $0x66
5144 BYTE $0x45
5145 BYTE $0x0f
5146 BYTE $0x3a
5147 BYTE $0x0f
5148 BYTE $0xc9
5149 BYTE $0x04
5150 BYTE $0x66
5151 BYTE $0x45
5152 BYTE $0x0f
5153 BYTE $0x3a
5154 BYTE $0x0f
5155 BYTE $0xd2
5156 BYTE $0x04
5157 BYTE $0x66
5158 BYTE $0x45
5159 BYTE $0x0f
5160 BYTE $0x3a
5161 BYTE $0x0f
5162 BYTE $0xdb
5163 BYTE $0x04
5164 BYTE $0x66
5165 BYTE $0x45
5166 BYTE $0x0f
5167 BYTE $0x3a
5168 BYTE $0x0f
5169 BYTE $0xff
5170 BYTE $0x04
5171 DECQ R9
5172 JNE sealSSEIntroLoop
5173
5174 // Add in the state
5175 PADDD ·chacha20Constants<>+0(SB), X0
5176 PADDD ·chacha20Constants<>+0(SB), X1
5177 PADDD ·chacha20Constants<>+0(SB), X2
5178 PADDD ·chacha20Constants<>+0(SB), X12
5179 PADDD 32(BP), X3
5180 PADDD 32(BP), X4
5181 PADDD 32(BP), X5
5182 PADDD 32(BP), X13
5183 PADDD 48(BP), X7
5184 PADDD 48(BP), X8
5185 PADDD 48(BP), X14
5186 PADDD 96(BP), X10
5187 PADDD 112(BP), X11
5188 PADDD 128(BP), X15
5189
5190 // Clamp and store the key
5191 PAND ·polyClampMask<>+0(SB), X0
5192 MOVO X0, (BP)
5193 MOVO X3, 16(BP)
5194
5195 // Hash AAD
5196 MOVQ ad_len+80(FP), R9
5197 CALL polyHashADInternal<>(SB)
5198 MOVOU (SI), X0
5199 MOVOU 16(SI), X3
5200 MOVOU 32(SI), X6
5201 MOVOU 48(SI), X9
5202 PXOR X0, X1
5203 PXOR X3, X4
5204 PXOR X6, X7
5205 PXOR X9, X10
5206 MOVOU X1, (DI)
5207 MOVOU X4, 16(DI)
5208 MOVOU X7, 32(DI)
5209 MOVOU X10, 48(DI)
5210 MOVOU 64(SI), X0
5211 MOVOU 80(SI), X3
5212 MOVOU 96(SI), X6
5213 MOVOU 112(SI), X9
5214 PXOR X0, X2
5215 PXOR X3, X5
5216 PXOR X6, X8
5217 PXOR X9, X11
5218 MOVOU X2, 64(DI)
5219 MOVOU X5, 80(DI)
5220 MOVOU X8, 96(DI)
5221 MOVOU X11, 112(DI)
5222 MOVQ $0x00000080, CX
5223 SUBQ $0x80, BX
5224 LEAQ 128(SI), SI
5225 MOVO X12, X1
5226 MOVO X13, X4
5227 MOVO X14, X7
5228 MOVO X15, X10
5229 CMPQ BX, $0x40
5230 JBE sealSSE128SealHash
5231 MOVOU (SI), X0
5232 MOVOU 16(SI), X3
5233 MOVOU 32(SI), X6
5234 MOVOU 48(SI), X9
5235 PXOR X0, X12
5236 PXOR X3, X13
5237 PXOR X6, X14
5238 PXOR X9, X15
5239 MOVOU X12, 128(DI)
5240 MOVOU X13, 144(DI)
5241 MOVOU X14, 160(DI)
5242 MOVOU X15, 176(DI)
5243 ADDQ $0x40, CX
5244 SUBQ $0x40, BX
5245 LEAQ 64(SI), SI
5246 MOVQ $0x00000002, CX
5247 MOVQ $0x00000008, R9
5248 CMPQ BX, $0x40
5249 JBE sealSSETail64
5250 CMPQ BX, $0x80
5251 JBE sealSSETail128
5252 CMPQ BX, $0xc0
5253 JBE sealSSETail192
5254
5255sealSSEMainLoop:
5256 // Load state, increment counter blocks
5257 MOVO ·chacha20Constants<>+0(SB), X0
5258 MOVO 32(BP), X3
5259 MOVO 48(BP), X6
5260 MOVO 128(BP), X9
5261 PADDL ·sseIncMask<>+0(SB), X9
5262 MOVO X0, X1
5263 MOVO X3, X4
5264 MOVO X6, X7
5265 MOVO X9, X10
5266 PADDL ·sseIncMask<>+0(SB), X10
5267 MOVO X1, X2
5268 MOVO X4, X5
5269 MOVO X7, X8
5270 MOVO X10, X11
5271 PADDL ·sseIncMask<>+0(SB), X11
5272 MOVO X2, X12
5273 MOVO X5, X13
5274 MOVO X8, X14
5275 MOVO X11, X15
5276 PADDL ·sseIncMask<>+0(SB), X15
5277
5278 // Store counters
5279 MOVO X9, 80(BP)
5280 MOVO X10, 96(BP)
5281 MOVO X11, 112(BP)
5282 MOVO X15, 128(BP)
5283
5284sealSSEInnerLoop:
5285 MOVO X14, 64(BP)
5286 PADDD X3, X0
5287 PXOR X0, X9
5288 ROL16(X9, X14)
5289 PADDD X9, X6
5290 PXOR X6, X3
5291 MOVO X3, X14
5292 PSLLL $0x0c, X14
5293 PSRLL $0x14, X3
5294 PXOR X14, X3
5295 PADDD X3, X0
5296 PXOR X0, X9
5297 ROL8(X9, X14)
5298 PADDD X9, X6
5299 PXOR X6, X3
5300 MOVO X3, X14
5301 PSLLL $0x07, X14
5302 PSRLL $0x19, X3
5303 PXOR X14, X3
5304 PADDD X4, X1
5305 PXOR X1, X10
5306 ROL16(X10, X14)
5307 PADDD X10, X7
5308 PXOR X7, X4
5309 MOVO X4, X14
5310 PSLLL $0x0c, X14
5311 PSRLL $0x14, X4
5312 PXOR X14, X4
5313 PADDD X4, X1
5314 PXOR X1, X10
5315 ROL8(X10, X14)
5316 PADDD X10, X7
5317 PXOR X7, X4
5318 MOVO X4, X14
5319 PSLLL $0x07, X14
5320 PSRLL $0x19, X4
5321 PXOR X14, X4
5322 PADDD X5, X2
5323 PXOR X2, X11
5324 ROL16(X11, X14)
5325 PADDD X11, X8
5326 PXOR X8, X5
5327 MOVO X5, X14
5328 PSLLL $0x0c, X14
5329 PSRLL $0x14, X5
5330 PXOR X14, X5
5331 PADDD X5, X2
5332 PXOR X2, X11
5333 ROL8(X11, X14)
5334 PADDD X11, X8
5335 PXOR X8, X5
5336 MOVO X5, X14
5337 PSLLL $0x07, X14
5338 PSRLL $0x19, X5
5339 PXOR X14, X5
5340 MOVO 64(BP), X14
5341 MOVO X7, 64(BP)
5342 PADDD X13, X12
5343 PXOR X12, X15
5344 ROL16(X15, X7)
5345 PADDD X15, X14
5346 PXOR X14, X13
5347 MOVO X13, X7
5348 PSLLL $0x0c, X7
5349 PSRLL $0x14, X13
5350 PXOR X7, X13
5351 PADDD X13, X12
5352 PXOR X12, X15
5353 ROL8(X15, X7)
5354 PADDD X15, X14
5355 PXOR X14, X13
5356 MOVO X13, X7
5357 PSLLL $0x07, X7
5358 PSRLL $0x19, X13
5359 PXOR X7, X13
5360 MOVO 64(BP), X7
5361 ADDQ (DI), R10
5362 ADCQ 8(DI), R11
5363 ADCQ $0x01, R12
5364 BYTE $0x66
5365 BYTE $0x0f
5366 BYTE $0x3a
5367 BYTE $0x0f
5368 BYTE $0xdb
5369 BYTE $0x04
5370 BYTE $0x66
5371 BYTE $0x0f
5372 BYTE $0x3a
5373 BYTE $0x0f
5374 BYTE $0xe4
5375 BYTE $0x04
5376 BYTE $0x66
5377 BYTE $0x0f
5378 BYTE $0x3a
5379 BYTE $0x0f
5380 BYTE $0xed
5381 BYTE $0x04
5382 BYTE $0x66
5383 BYTE $0x45
5384 BYTE $0x0f
5385 BYTE $0x3a
5386 BYTE $0x0f
5387 BYTE $0xed
5388 BYTE $0x04
5389 BYTE $0x66
5390 BYTE $0x0f
5391 BYTE $0x3a
5392 BYTE $0x0f
5393 BYTE $0xf6
5394 BYTE $0x08
5395 BYTE $0x66
5396 BYTE $0x0f
5397 BYTE $0x3a
5398 BYTE $0x0f
5399 BYTE $0xff
5400 BYTE $0x08
5401 BYTE $0x66
5402 BYTE $0x45
5403 BYTE $0x0f
5404 BYTE $0x3a
5405 BYTE $0x0f
5406 BYTE $0xc0
5407 BYTE $0x08
5408 BYTE $0x66
5409 BYTE $0x45
5410 BYTE $0x0f
5411 BYTE $0x3a
5412 BYTE $0x0f
5413 BYTE $0xf6
5414 BYTE $0x08
5415 BYTE $0x66
5416 BYTE $0x45
5417 BYTE $0x0f
5418 BYTE $0x3a
5419 BYTE $0x0f
5420 BYTE $0xc9
5421 BYTE $0x0c
5422 BYTE $0x66
5423 BYTE $0x45
5424 BYTE $0x0f
5425 BYTE $0x3a
5426 BYTE $0x0f
5427 BYTE $0xd2
5428 BYTE $0x0c
5429 BYTE $0x66
5430 BYTE $0x45
5431 BYTE $0x0f
5432 BYTE $0x3a
5433 BYTE $0x0f
5434 BYTE $0xdb
5435 BYTE $0x0c
5436 BYTE $0x66
5437 BYTE $0x45
5438 BYTE $0x0f
5439 BYTE $0x3a
5440 BYTE $0x0f
5441 BYTE $0xff
5442 BYTE $0x0c
5443 MOVQ (BP), AX
5444 MOVQ AX, R15
5445 MULQ R10
5446 MOVQ AX, R13
5447 MOVQ DX, R14
5448 MOVQ (BP), AX
5449 MULQ R11
5450 IMULQ R12, R15
5451 ADDQ AX, R14
5452 ADCQ DX, R15
5453 MOVQ 8(BP), AX
5454 MOVQ AX, R8
5455 MULQ R10
5456 ADDQ AX, R14
5457 ADCQ $0x00, DX
5458 MOVQ DX, R10
5459 MOVQ 8(BP), AX
5460 MULQ R11
5461 ADDQ AX, R15
5462 ADCQ $0x00, DX
5463 LEAQ 16(DI), DI
5464 MOVO X14, 64(BP)
5465 PADDD X3, X0
5466 PXOR X0, X9
5467 ROL16(X9, X14)
5468 PADDD X9, X6
5469 PXOR X6, X3
5470 MOVO X3, X14
5471 PSLLL $0x0c, X14
5472 PSRLL $0x14, X3
5473 PXOR X14, X3
5474 PADDD X3, X0
5475 PXOR X0, X9
5476 ROL8(X9, X14)
5477 PADDD X9, X6
5478 PXOR X6, X3
5479 MOVO X3, X14
5480 PSLLL $0x07, X14
5481 PSRLL $0x19, X3
5482 PXOR X14, X3
5483 PADDD X4, X1
5484 PXOR X1, X10
5485 ROL16(X10, X14)
5486 PADDD X10, X7
5487 PXOR X7, X4
5488 MOVO X4, X14
5489 PSLLL $0x0c, X14
5490 PSRLL $0x14, X4
5491 PXOR X14, X4
5492 PADDD X4, X1
5493 PXOR X1, X10
5494 ROL8(X10, X14)
5495 PADDD X10, X7
5496 PXOR X7, X4
5497 MOVO X4, X14
5498 PSLLL $0x07, X14
5499 PSRLL $0x19, X4
5500 PXOR X14, X4
5501 PADDD X5, X2
5502 PXOR X2, X11
5503 ROL16(X11, X14)
5504 PADDD X11, X8
5505 PXOR X8, X5
5506 MOVO X5, X14
5507 PSLLL $0x0c, X14
5508 PSRLL $0x14, X5
5509 PXOR X14, X5
5510 PADDD X5, X2
5511 PXOR X2, X11
5512 ROL8(X11, X14)
5513 PADDD X11, X8
5514 PXOR X8, X5
5515 MOVO X5, X14
5516 PSLLL $0x07, X14
5517 PSRLL $0x19, X5
5518 PXOR X14, X5
5519 MOVO 64(BP), X14
5520 MOVO X7, 64(BP)
5521 IMULQ R12, R8
5522 ADDQ R10, R15
5523 ADCQ DX, R8
5524 PADDD X13, X12
5525 PXOR X12, X15
5526 ROL16(X15, X7)
5527 PADDD X15, X14
5528 PXOR X14, X13
5529 MOVO X13, X7
5530 PSLLL $0x0c, X7
5531 PSRLL $0x14, X13
5532 PXOR X7, X13
5533 PADDD X13, X12
5534 PXOR X12, X15
5535 ROL8(X15, X7)
5536 PADDD X15, X14
5537 PXOR X14, X13
5538 MOVO X13, X7
5539 PSLLL $0x07, X7
5540 PSRLL $0x19, X13
5541 PXOR X7, X13
5542 MOVO 64(BP), X7
5543 MOVQ R13, R10
5544 MOVQ R14, R11
5545 MOVQ R15, R12
5546 ANDQ $0x03, R12
5547 MOVQ R15, R13
5548 ANDQ $-4, R13
5549 MOVQ R8, R14
5550 SHRQ $0x02, R8, R15
5551 SHRQ $0x02, R8
5552 ADDQ R13, R10
5553 ADCQ R14, R11
5554 ADCQ $0x00, R12
5555 ADDQ R15, R10
5556 ADCQ R8, R11
5557 ADCQ $0x00, R12
5558 BYTE $0x66
5559 BYTE $0x0f
5560 BYTE $0x3a
5561 BYTE $0x0f
5562 BYTE $0xdb
5563 BYTE $0x0c
5564 BYTE $0x66
5565 BYTE $0x0f
5566 BYTE $0x3a
5567 BYTE $0x0f
5568 BYTE $0xe4
5569 BYTE $0x0c
5570 BYTE $0x66
5571 BYTE $0x0f
5572 BYTE $0x3a
5573 BYTE $0x0f
5574 BYTE $0xed
5575 BYTE $0x0c
5576 BYTE $0x66
5577 BYTE $0x45
5578 BYTE $0x0f
5579 BYTE $0x3a
5580 BYTE $0x0f
5581 BYTE $0xed
5582 BYTE $0x0c
5583 BYTE $0x66
5584 BYTE $0x0f
5585 BYTE $0x3a
5586 BYTE $0x0f
5587 BYTE $0xf6
5588 BYTE $0x08
5589 BYTE $0x66
5590 BYTE $0x0f
5591 BYTE $0x3a
5592 BYTE $0x0f
5593 BYTE $0xff
5594 BYTE $0x08
5595 BYTE $0x66
5596 BYTE $0x45
5597 BYTE $0x0f
5598 BYTE $0x3a
5599 BYTE $0x0f
5600 BYTE $0xc0
5601 BYTE $0x08
5602 BYTE $0x66
5603 BYTE $0x45
5604 BYTE $0x0f
5605 BYTE $0x3a
5606 BYTE $0x0f
5607 BYTE $0xf6
5608 BYTE $0x08
5609 BYTE $0x66
5610 BYTE $0x45
5611 BYTE $0x0f
5612 BYTE $0x3a
5613 BYTE $0x0f
5614 BYTE $0xc9
5615 BYTE $0x04
5616 BYTE $0x66
5617 BYTE $0x45
5618 BYTE $0x0f
5619 BYTE $0x3a
5620 BYTE $0x0f
5621 BYTE $0xd2
5622 BYTE $0x04
5623 BYTE $0x66
5624 BYTE $0x45
5625 BYTE $0x0f
5626 BYTE $0x3a
5627 BYTE $0x0f
5628 BYTE $0xdb
5629 BYTE $0x04
5630 BYTE $0x66
5631 BYTE $0x45
5632 BYTE $0x0f
5633 BYTE $0x3a
5634 BYTE $0x0f
5635 BYTE $0xff
5636 BYTE $0x04
5637 DECQ R9
5638 JGE sealSSEInnerLoop
5639 ADDQ (DI), R10
5640 ADCQ 8(DI), R11
5641 ADCQ $0x01, R12
5642 MOVQ (BP), AX
5643 MOVQ AX, R15
5644 MULQ R10
5645 MOVQ AX, R13
5646 MOVQ DX, R14
5647 MOVQ (BP), AX
5648 MULQ R11
5649 IMULQ R12, R15
5650 ADDQ AX, R14
5651 ADCQ DX, R15
5652 MOVQ 8(BP), AX
5653 MOVQ AX, R8
5654 MULQ R10
5655 ADDQ AX, R14
5656 ADCQ $0x00, DX
5657 MOVQ DX, R10
5658 MOVQ 8(BP), AX
5659 MULQ R11
5660 ADDQ AX, R15
5661 ADCQ $0x00, DX
5662 IMULQ R12, R8
5663 ADDQ R10, R15
5664 ADCQ DX, R8
5665 MOVQ R13, R10
5666 MOVQ R14, R11
5667 MOVQ R15, R12
5668 ANDQ $0x03, R12
5669 MOVQ R15, R13
5670 ANDQ $-4, R13
5671 MOVQ R8, R14
5672 SHRQ $0x02, R8, R15
5673 SHRQ $0x02, R8
5674 ADDQ R13, R10
5675 ADCQ R14, R11
5676 ADCQ $0x00, R12
5677 ADDQ R15, R10
5678 ADCQ R8, R11
5679 ADCQ $0x00, R12
5680 LEAQ 16(DI), DI
5681 DECQ CX
5682 JG sealSSEInnerLoop
5683
5684 // Add in the state
5685 PADDD ·chacha20Constants<>+0(SB), X0
5686 PADDD ·chacha20Constants<>+0(SB), X1
5687 PADDD ·chacha20Constants<>+0(SB), X2
5688 PADDD ·chacha20Constants<>+0(SB), X12
5689 PADDD 32(BP), X3
5690 PADDD 32(BP), X4
5691 PADDD 32(BP), X5
5692 PADDD 32(BP), X13
5693 PADDD 48(BP), X6
5694 PADDD 48(BP), X7
5695 PADDD 48(BP), X8
5696 PADDD 48(BP), X14
5697 PADDD 80(BP), X9
5698 PADDD 96(BP), X10
5699 PADDD 112(BP), X11
5700 PADDD 128(BP), X15
5701 MOVO X15, 64(BP)
5702
5703 // Load - xor - store
5704 MOVOU (SI), X15
5705 PXOR X15, X0
5706 MOVOU 16(SI), X15
5707 PXOR X15, X3
5708 MOVOU 32(SI), X15
5709 PXOR X15, X6
5710 MOVOU 48(SI), X15
5711 PXOR X15, X9
5712 MOVOU X0, (DI)
5713 MOVOU X3, 16(DI)
5714 MOVOU X6, 32(DI)
5715 MOVOU X9, 48(DI)
5716 MOVO 64(BP), X15
5717 MOVOU 64(SI), X0
5718 MOVOU 80(SI), X3
5719 MOVOU 96(SI), X6
5720 MOVOU 112(SI), X9
5721 PXOR X0, X1
5722 PXOR X3, X4
5723 PXOR X6, X7
5724 PXOR X9, X10
5725 MOVOU X1, 64(DI)
5726 MOVOU X4, 80(DI)
5727 MOVOU X7, 96(DI)
5728 MOVOU X10, 112(DI)
5729 MOVOU 128(SI), X0
5730 MOVOU 144(SI), X3
5731 MOVOU 160(SI), X6
5732 MOVOU 176(SI), X9
5733 PXOR X0, X2
5734 PXOR X3, X5
5735 PXOR X6, X8
5736 PXOR X9, X11
5737 MOVOU X2, 128(DI)
5738 MOVOU X5, 144(DI)
5739 MOVOU X8, 160(DI)
5740 MOVOU X11, 176(DI)
5741 ADDQ $0xc0, SI
5742 MOVQ $0x000000c0, CX
5743 SUBQ $0xc0, BX
5744 MOVO X12, X1
5745 MOVO X13, X4
5746 MOVO X14, X7
5747 MOVO X15, X10
5748 CMPQ BX, $0x40
5749 JBE sealSSE128SealHash
5750 MOVOU (SI), X0
5751 MOVOU 16(SI), X3
5752 MOVOU 32(SI), X6
5753 MOVOU 48(SI), X9
5754 PXOR X0, X12
5755 PXOR X3, X13
5756 PXOR X6, X14
5757 PXOR X9, X15
5758 MOVOU X12, 192(DI)
5759 MOVOU X13, 208(DI)
5760 MOVOU X14, 224(DI)
5761 MOVOU X15, 240(DI)
5762 LEAQ 64(SI), SI
5763 SUBQ $0x40, BX
5764 MOVQ $0x00000006, CX
5765 MOVQ $0x00000004, R9
5766 CMPQ BX, $0xc0
5767 JG sealSSEMainLoop
5768 MOVQ BX, CX
5769 TESTQ BX, BX
5770 JE sealSSE128SealHash
5771 MOVQ $0x00000006, CX
5772 CMPQ BX, $0x40
5773 JBE sealSSETail64
5774 CMPQ BX, $0x80
5775 JBE sealSSETail128
5776 JMP sealSSETail192
5777
5778sealSSETail64:
5779 MOVO ·chacha20Constants<>+0(SB), X1
5780 MOVO 32(BP), X4
5781 MOVO 48(BP), X7
5782 MOVO 128(BP), X10
5783 PADDL ·sseIncMask<>+0(SB), X10
5784 MOVO X10, 80(BP)
5785
5786sealSSETail64LoopA:
5787 ADDQ (DI), R10
5788 ADCQ 8(DI), R11
5789 ADCQ $0x01, R12
5790 MOVQ (BP), AX
5791 MOVQ AX, R15
5792 MULQ R10
5793 MOVQ AX, R13
5794 MOVQ DX, R14
5795 MOVQ (BP), AX
5796 MULQ R11
5797 IMULQ R12, R15
5798 ADDQ AX, R14
5799 ADCQ DX, R15
5800 MOVQ 8(BP), AX
5801 MOVQ AX, R8
5802 MULQ R10
5803 ADDQ AX, R14
5804 ADCQ $0x00, DX
5805 MOVQ DX, R10
5806 MOVQ 8(BP), AX
5807 MULQ R11
5808 ADDQ AX, R15
5809 ADCQ $0x00, DX
5810 IMULQ R12, R8
5811 ADDQ R10, R15
5812 ADCQ DX, R8
5813 MOVQ R13, R10
5814 MOVQ R14, R11
5815 MOVQ R15, R12
5816 ANDQ $0x03, R12
5817 MOVQ R15, R13
5818 ANDQ $-4, R13
5819 MOVQ R8, R14
5820 SHRQ $0x02, R8, R15
5821 SHRQ $0x02, R8
5822 ADDQ R13, R10
5823 ADCQ R14, R11
5824 ADCQ $0x00, R12
5825 ADDQ R15, R10
5826 ADCQ R8, R11
5827 ADCQ $0x00, R12
5828 LEAQ 16(DI), DI
5829
5830sealSSETail64LoopB:
5831 PADDD X4, X1
5832 PXOR X1, X10
5833 ROL16(X10, X13)
5834 PADDD X10, X7
5835 PXOR X7, X4
5836 MOVO X4, X13
5837 PSLLL $0x0c, X13
5838 PSRLL $0x14, X4
5839 PXOR X13, X4
5840 PADDD X4, X1
5841 PXOR X1, X10
5842 ROL8(X10, X13)
5843 PADDD X10, X7
5844 PXOR X7, X4
5845 MOVO X4, X13
5846 PSLLL $0x07, X13
5847 PSRLL $0x19, X4
5848 PXOR X13, X4
5849 BYTE $0x66
5850 BYTE $0x0f
5851 BYTE $0x3a
5852 BYTE $0x0f
5853 BYTE $0xe4
5854 BYTE $0x04
5855 BYTE $0x66
5856 BYTE $0x0f
5857 BYTE $0x3a
5858 BYTE $0x0f
5859 BYTE $0xff
5860 BYTE $0x08
5861 BYTE $0x66
5862 BYTE $0x45
5863 BYTE $0x0f
5864 BYTE $0x3a
5865 BYTE $0x0f
5866 BYTE $0xd2
5867 BYTE $0x0c
5868 PADDD X4, X1
5869 PXOR X1, X10
5870 ROL16(X10, X13)
5871 PADDD X10, X7
5872 PXOR X7, X4
5873 MOVO X4, X13
5874 PSLLL $0x0c, X13
5875 PSRLL $0x14, X4
5876 PXOR X13, X4
5877 PADDD X4, X1
5878 PXOR X1, X10
5879 ROL8(X10, X13)
5880 PADDD X10, X7
5881 PXOR X7, X4
5882 MOVO X4, X13
5883 PSLLL $0x07, X13
5884 PSRLL $0x19, X4
5885 PXOR X13, X4
5886 BYTE $0x66
5887 BYTE $0x0f
5888 BYTE $0x3a
5889 BYTE $0x0f
5890 BYTE $0xe4
5891 BYTE $0x0c
5892 BYTE $0x66
5893 BYTE $0x0f
5894 BYTE $0x3a
5895 BYTE $0x0f
5896 BYTE $0xff
5897 BYTE $0x08
5898 BYTE $0x66
5899 BYTE $0x45
5900 BYTE $0x0f
5901 BYTE $0x3a
5902 BYTE $0x0f
5903 BYTE $0xd2
5904 BYTE $0x04
5905 ADDQ (DI), R10
5906 ADCQ 8(DI), R11
5907 ADCQ $0x01, R12
5908 MOVQ (BP), AX
5909 MOVQ AX, R15
5910 MULQ R10
5911 MOVQ AX, R13
5912 MOVQ DX, R14
5913 MOVQ (BP), AX
5914 MULQ R11
5915 IMULQ R12, R15
5916 ADDQ AX, R14
5917 ADCQ DX, R15
5918 MOVQ 8(BP), AX
5919 MOVQ AX, R8
5920 MULQ R10
5921 ADDQ AX, R14
5922 ADCQ $0x00, DX
5923 MOVQ DX, R10
5924 MOVQ 8(BP), AX
5925 MULQ R11
5926 ADDQ AX, R15
5927 ADCQ $0x00, DX
5928 IMULQ R12, R8
5929 ADDQ R10, R15
5930 ADCQ DX, R8
5931 MOVQ R13, R10
5932 MOVQ R14, R11
5933 MOVQ R15, R12
5934 ANDQ $0x03, R12
5935 MOVQ R15, R13
5936 ANDQ $-4, R13
5937 MOVQ R8, R14
5938 SHRQ $0x02, R8, R15
5939 SHRQ $0x02, R8
5940 ADDQ R13, R10
5941 ADCQ R14, R11
5942 ADCQ $0x00, R12
5943 ADDQ R15, R10
5944 ADCQ R8, R11
5945 ADCQ $0x00, R12
5946 LEAQ 16(DI), DI
5947 DECQ CX
5948 JG sealSSETail64LoopA
5949 DECQ R9
5950 JGE sealSSETail64LoopB
5951 PADDL ·chacha20Constants<>+0(SB), X1
5952 PADDL 32(BP), X4
5953 PADDL 48(BP), X7
5954 PADDL 80(BP), X10
5955 JMP sealSSE128Seal
5956
5957sealSSETail128:
5958 MOVO ·chacha20Constants<>+0(SB), X0
5959 MOVO 32(BP), X3
5960 MOVO 48(BP), X6
5961 MOVO 128(BP), X9
5962 PADDL ·sseIncMask<>+0(SB), X9
5963 MOVO X9, 80(BP)
5964 MOVO X0, X1
5965 MOVO X3, X4
5966 MOVO X6, X7
5967 MOVO X9, X10
	// (Continuation of the 65..128 byte tail setup that begins above this
	// view.) Derive the second ChaCha20 block's counter row by adding the
	// SSE increment mask, and stash it at 96(BP) so the finalizer can
	// re-add it after the rounds.
	PADDL ·sseIncMask<>+0(SB), X10
	MOVO X10, 96(BP)

sealSSETail128LoopA:
	// Poly1305 block: absorb 16 bytes of previously written ciphertext at
	// DI into the 130-bit accumulator held in R10:R11:R12 (with the "1"
	// pad bit added into R12), then multiply by r = 0(BP):8(BP) and reduce
	// modulo 2^130 - 5. R13..R15, R8, AX, DX are scratch.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	// Reduction: keep the low 2 bits of the top limb, fold the rest back
	// in as (h >> 130) * 5 (i.e. add h>>2 and (h>>2)<<2 via the masked /
	// shifted copies below).
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

sealSSETail128LoopB:
	// One ChaCha20 column round over two parallel states
	// (X0/X3/X6/X9 and X1/X4/X7/X10); X12 is rotate scratch.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	// Shuffle rows to diagonal form (hand-encoded 66 0F 3A 0F = PALIGNR):
	// PALIGNR $0x04, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x0c, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x0c
	// PALIGNR $0x04, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x04
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x0c, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x0c
	// Interleaved Poly1305 block for the next 16 bytes at DI (same
	// multiply-and-reduce sequence as in LoopA above).
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
	// Diagonal round for both states.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	// Shuffle rows back to column form (inverse PALIGNR amounts):
	// PALIGNR $0x0c, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x04, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x04
	// PALIGNR $0x0c, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x0c
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x04, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x04
	// CX counts remaining hash+round iterations, R9 rounds-only
	// iterations; run LoopA while there is ciphertext left to hash.
	DECQ CX
	JG sealSSETail128LoopA
	DECQ R9
	JGE sealSSETail128LoopB
	// Add the saved initial state back in to finish both keystream
	// blocks (constants, key rows from 32/48(BP), counters from 80/96(BP)).
	PADDL ·chacha20Constants<>+0(SB), X0
	PADDL ·chacha20Constants<>+0(SB), X1
	PADDL 32(BP), X3
	PADDL 32(BP), X4
	PADDL 48(BP), X6
	PADDL 48(BP), X7
	PADDL 80(BP), X9
	PADDL 96(BP), X10
	// Encrypt the first 64 bytes of plaintext at SI with block 0 and
	// store to DI; block 1 stays in X1/X4/X7/X10 for the seal loop.
	MOVOU (SI), X12
	MOVOU 16(SI), X13
	MOVOU 32(SI), X14
	MOVOU 48(SI), X15
	PXOR X12, X0
	PXOR X13, X3
	PXOR X14, X6
	PXOR X15, X9
	MOVOU X0, (DI)
	MOVOU X3, 16(DI)
	MOVOU X6, 32(DI)
	MOVOU X9, 48(DI)
	// 64 bytes written and not yet hashed: CX = bytes to hash in
	// sealSSE128SealHash; advance src, shrink remaining length.
	MOVQ $0x00000040, CX
	LEAQ 64(SI), SI
	SUBQ $0x40, BX
	JMP sealSSE128SealHash
6234
sealSSETail192:
	// Seal tail path for 129..192 remaining bytes: build three ChaCha20
	// states (columns X0/X3/X6/X9, X1/X4/X7/X10, X2/X5/X8/X11) with
	// consecutive counters derived from 128(BP); counters saved at
	// 80/96/112(BP) for the final re-add.
	MOVO ·chacha20Constants<>+0(SB), X0
	MOVO 32(BP), X3
	MOVO 48(BP), X6
	MOVO 128(BP), X9
	PADDL ·sseIncMask<>+0(SB), X9
	MOVO X9, 80(BP)
	MOVO X0, X1
	MOVO X3, X4
	MOVO X6, X7
	MOVO X9, X10
	PADDL ·sseIncMask<>+0(SB), X10
	MOVO X10, 96(BP)
	MOVO X1, X2
	MOVO X4, X5
	MOVO X7, X8
	MOVO X10, X11
	PADDL ·sseIncMask<>+0(SB), X11
	MOVO X11, 112(BP)

sealSSETail192LoopA:
	// Poly1305: absorb 16 ciphertext bytes at DI into R10:R11:R12 and
	// multiply by r = 0(BP):8(BP), reducing modulo 2^130 - 5.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

sealSSETail192LoopB:
	// ChaCha20 column round over all three states; X12 is rotate scratch.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Shuffle to diagonal form (hand-encoded PALIGNR):
	// PALIGNR $0x04, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x0c, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x0c
	// PALIGNR $0x04, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x04
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x0c, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x0c
	// PALIGNR $0x04, X5, X5
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x04
	// PALIGNR $0x08, X8, X8
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	// PALIGNR $0x0c, X11, X11
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	// Interleaved Poly1305 block for the next 16 bytes at DI.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
	// Diagonal round for all three states.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Shuffle back to column form (inverse PALIGNR amounts):
	// PALIGNR $0x0c, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x04, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x04
	// PALIGNR $0x0c, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x0c
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x04, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x04
	// PALIGNR $0x0c, X5, X5
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x0c
	// PALIGNR $0x08, X8, X8
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	// PALIGNR $0x04, X11, X11
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	// CX counts hash+round iterations, R9 rounds-only iterations.
	DECQ CX
	JG sealSSETail192LoopA
	DECQ R9
	JGE sealSSETail192LoopB
	// Re-add initial state to finish all three keystream blocks.
	PADDL ·chacha20Constants<>+0(SB), X0
	PADDL ·chacha20Constants<>+0(SB), X1
	PADDL ·chacha20Constants<>+0(SB), X2
	PADDL 32(BP), X3
	PADDL 32(BP), X4
	PADDL 32(BP), X5
	PADDL 48(BP), X6
	PADDL 48(BP), X7
	PADDL 48(BP), X8
	PADDL 80(BP), X9
	PADDL 96(BP), X10
	PADDL 112(BP), X11
	// Encrypt and store the first 128 bytes using blocks 0 and 1.
	MOVOU (SI), X12
	MOVOU 16(SI), X13
	MOVOU 32(SI), X14
	MOVOU 48(SI), X15
	PXOR X12, X0
	PXOR X13, X3
	PXOR X14, X6
	PXOR X15, X9
	MOVOU X0, (DI)
	MOVOU X3, 16(DI)
	MOVOU X6, 32(DI)
	MOVOU X9, 48(DI)
	MOVOU 64(SI), X12
	MOVOU 80(SI), X13
	MOVOU 96(SI), X14
	MOVOU 112(SI), X15
	PXOR X12, X1
	PXOR X13, X4
	PXOR X14, X7
	PXOR X15, X10
	MOVOU X1, 64(DI)
	MOVOU X4, 80(DI)
	MOVOU X7, 96(DI)
	MOVOU X10, 112(DI)
	// Move block 2 into the block-1 registers for the seal loop.
	MOVO X2, X1
	MOVO X5, X4
	MOVO X8, X7
	MOVO X11, X10
	// 128 bytes written but not yet hashed.
	MOVQ $0x00000080, CX
	LEAQ 128(SI), SI
	SUBQ $0x80, BX
	JMP sealSSE128SealHash
6614
sealSSE128:
	// Seal path for short messages: generate three ChaCha20 blocks from
	// the state at (R8); block 0 (counter 0) will provide the Poly1305
	// key, blocks 1 and 2 the keystream. X13/X14/X15 keep copies of the
	// initial key/counter rows for the final re-add.
	MOVOU ·chacha20Constants<>+0(SB), X0
	MOVOU 16(R8), X3
	MOVOU 32(R8), X6
	MOVOU 48(R8), X9
	MOVO X0, X1
	MOVO X3, X4
	MOVO X6, X7
	MOVO X9, X10
	PADDL ·sseIncMask<>+0(SB), X10
	MOVO X1, X2
	MOVO X4, X5
	MOVO X7, X8
	MOVO X10, X11
	PADDL ·sseIncMask<>+0(SB), X11
	MOVO X3, X13
	MOVO X6, X14
	MOVO X10, X15
	// R9 = 10 double rounds (20 ChaCha20 rounds).
	MOVQ $0x0000000a, R9

sealSSE128InnerCipherLoop:
	// Column round over the three states; X12 is rotate scratch.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Shuffle to diagonal form (hand-encoded PALIGNR):
	// PALIGNR $0x04, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	// PALIGNR $0x04, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x04
	// PALIGNR $0x04, X5, X5
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x04
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x08, X8, X8
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	// PALIGNR $0x0c, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x0c
	// PALIGNR $0x0c, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x0c
	// PALIGNR $0x0c, X11, X11
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	// Diagonal round.
	PADDD X3, X0
	PXOR X0, X9
	ROL16(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X3
	PXOR X12, X3
	PADDD X3, X0
	PXOR X0, X9
	ROL8(X9, X12)
	PADDD X9, X6
	PXOR X6, X3
	MOVO X3, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X3
	PXOR X12, X3
	PADDD X4, X1
	PXOR X1, X10
	ROL16(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X4
	PXOR X12, X4
	PADDD X4, X1
	PXOR X1, X10
	ROL8(X10, X12)
	PADDD X10, X7
	PXOR X7, X4
	MOVO X4, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X4
	PXOR X12, X4
	PADDD X5, X2
	PXOR X2, X11
	ROL16(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x0c, X12
	PSRLL $0x14, X5
	PXOR X12, X5
	PADDD X5, X2
	PXOR X2, X11
	ROL8(X11, X12)
	PADDD X11, X8
	PXOR X8, X5
	MOVO X5, X12
	PSLLL $0x07, X12
	PSRLL $0x19, X5
	PXOR X12, X5
	// Shuffle back to column form (inverse PALIGNR amounts):
	// PALIGNR $0x0c, X3, X3
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x0c
	// PALIGNR $0x0c, X4, X4
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xe4
	BYTE $0x0c
	// PALIGNR $0x0c, X5, X5
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xed
	BYTE $0x0c
	// PALIGNR $0x08, X6, X6
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xf6
	BYTE $0x08
	// PALIGNR $0x08, X7, X7
	BYTE $0x66
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xff
	BYTE $0x08
	// PALIGNR $0x08, X8, X8
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc0
	BYTE $0x08
	// PALIGNR $0x04, X9, X9
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xc9
	BYTE $0x04
	// PALIGNR $0x04, X10, X10
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xd2
	BYTE $0x04
	// PALIGNR $0x04, X11, X11
	BYTE $0x66
	BYTE $0x45
	BYTE $0x0f
	BYTE $0x3a
	BYTE $0x0f
	BYTE $0xdb
	BYTE $0x04
	DECQ R9
	JNE sealSSE128InnerCipherLoop

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>+0(SB), X0
	PADDL ·chacha20Constants<>+0(SB), X1
	PADDL ·chacha20Constants<>+0(SB), X2
	PADDL X13, X3
	PADDL X13, X4
	PADDL X13, X5
	PADDL X14, X7
	PADDL X14, X8
	PADDL X15, X10
	PADDL ·sseIncMask<>+0(SB), X15
	PADDL X15, X11
	// Clamp r per RFC 8439 and store the Poly1305 key r||s at 0..31(BP).
	PAND ·polyClampMask<>+0(SB), X0
	MOVOU X0, (BP)
	MOVOU X3, 16(BP)

	// Hash
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)
	// No ciphertext written yet, so nothing pending to hash.
	XORQ CX, CX
6883
sealSSE128SealHash:
	// Hash any ciphertext already written but not yet absorbed:
	// CX = bytes pending at DI; consume 16 at a time.
	CMPQ CX, $0x10
	JB sealSSE128Seal
	// Poly1305 block: absorb 16 bytes at DI, multiply by r, reduce.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	SUBQ $0x10, CX
	ADDQ $0x10, DI
	JMP sealSSE128SealHash

sealSSE128Seal:
	// Seal 16 bytes at a time: encrypt with the current keystream block
	// in X1 and immediately absorb the produced ciphertext.
	CMPQ BX, $0x10
	JB sealSSETail
	SUBQ $0x10, BX

	// Load for decryption
	MOVOU (SI), X12
	PXOR X12, X1
	MOVOU X1, (DI)
	LEAQ 16(SI), SI
	LEAQ 16(DI), DI

	// Extract for hashing
	MOVQ X1, R13
	PSRLDQ $0x08, X1
	MOVQ X1, R14
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x01, R12
	// Poly1305 multiply by r and reduce (same sequence as above).
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12

	// Shift the stream "left"
	// Rotate the queued keystream registers so the next 16 bytes land in X1.
	MOVO X4, X1
	MOVO X7, X4
	MOVO X10, X7
	MOVO X2, X10
	MOVO X5, X2
	MOVO X8, X5
	MOVO X11, X8
	JMP sealSSE128Seal
6999
sealSSETail:
	// Handle the final sub-16-byte chunk (BX bytes remain), then finalize.
	TESTQ BX, BX
	JE sealSSEFinalize

	// We can only load the PT one byte at a time to avoid read after end of buffer
	// R9 = BX*16, used below to index the 16-byte ·andMask table entry
	// that keeps exactly BX bytes of the padded block.
	MOVQ BX, R9
	SHLQ $0x04, R9
	LEAQ ·andMask<>+0(SB), R13
	MOVQ BX, CX
	LEAQ -1(SI)(BX*1), SI
	XORQ R15, R15
	XORQ R8, R8
	XORQ AX, AX

sealSSETailLoadLoop:
	// Assemble the last BX plaintext bytes (reading SI backwards) into
	// the 128-bit value R8:R15, shifting left one byte per iteration.
	SHLQ $0x08, R15, R8
	SHLQ $0x08, R15
	MOVB (SI), AX
	XORQ AX, R15
	LEAQ -1(SI), SI
	DECQ CX
	JNE sealSSETailLoadLoop
	// Spill the gathered plaintext, XOR with the keystream block in X1,
	// and write the (over-long) result; only BX bytes of it are valid.
	MOVQ R15, 64(BP)
	MOVQ R8, 72(BP)
	PXOR 64(BP), X1
	MOVOU X1, (DI)
	// Mask the ciphertext down to BX bytes before hashing it.
	MOVOU -16(R13)(R9*1), X12
	PAND X12, X1
	MOVQ X1, R13
	PSRLDQ $0x08, X1
	MOVQ X1, R14
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x01, R12
	// Poly1305 multiply by r and reduce modulo 2^130 - 5.
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	ADDQ BX, DI

sealSSEFinalize:
	// Hash in the buffer lengths
	ADDQ ad_len+80(FP), R10
	ADCQ src_len+56(FP), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12

	// Final reduce
	// Conditionally subtract 2^130 - 5: compute acc - p and keep it only
	// if the subtraction did not borrow (CMOVQCS restores the originals).
	MOVQ R10, R13
	MOVQ R11, R14
	MOVQ R12, R15
	SUBQ $-5, R10
	SBBQ $-1, R11
	SBBQ $0x03, R12
	CMOVQCS R13, R10
	CMOVQCS R14, R11
	CMOVQCS R15, R12

	// Add in the "s" part of the key
	ADDQ 16(BP), R10
	ADCQ 24(BP), R11

	// Finally store the tag at the end of the message
	MOVQ R10, (DI)
	MOVQ R11, 8(DI)
	RET
7137
chacha20Poly1305Seal_AVX2:
	// AVX2 seal entry: broadcast the ChaCha20 state rows into 256-bit
	// registers (two blocks per register) and bump the counters.
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	// VBROADCASTI128 16(R8), Y14 (hand-encoded VEX, C4 42 7D 5A /r)
	BYTE $0xc4
	BYTE $0x42
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x70
	BYTE $0x10
	// VBROADCASTI128 32(R8), Y12
	BYTE $0xc4
	BYTE $0x42
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x60
	BYTE $0x20
	// VBROADCASTI128 48(R8), Y4
	BYTE $0xc4
	BYTE $0xc2
	BYTE $0x7d
	BYTE $0x5a
	BYTE $0x60
	BYTE $0x30
	VPADDD ·avx2InitMask<>+0(SB), Y4, Y4

	// Special optimizations, for very short buffers
	CMPQ BX, $0x000000c0
	JBE seal192AVX2
	CMPQ BX, $0x00000140
	JBE seal320AVX2

	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
	// Four 2-block states: rows replicated into Y0/Y5/Y6/Y7 (consts),
	// Y14/Y9/Y10/Y11 (key1), Y12/Y13/Y8/Y15 (key2); counters built at
	// 96..192(BP) with the AVX2 increment mask.
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA Y14, 32(BP)
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA Y12, 64(BP)
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, 96(BP)
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y1, 128(BP)
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	// R9 = 10 double rounds.
	MOVQ $0x0000000a, R9

sealAVX2IntroLoop:
	// Column round over four 2-block states. Only 16 Y registers exist,
	// so the fourth state's "c" row lives at 224(BP): Y15 is spilled
	// there while it serves as rotate scratch, and Y13 is spilled while
	// the fourth state is processed.
	VMOVDQA Y15, 224(BP)
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VMOVDQA 224(BP), Y15
	VMOVDQA Y13, 224(BP)
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x0c, Y11, Y13
	VPSRLD $0x14, Y11, Y11
	VPXOR Y13, Y11, Y11
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x07, Y11, Y13
	VPSRLD $0x19, Y11, Y11
	VPXOR Y13, Y11, Y11
	VMOVDQA 224(BP), Y13
	// Rotate lanes to diagonal form.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y3, Y3, Y3
	// Diagonal round (same spill dance with 224(BP)).
	VMOVDQA Y15, 224(BP)
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VMOVDQA 224(BP), Y15
	VMOVDQA Y13, 224(BP)
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x0c, Y11, Y13
	VPSRLD $0x14, Y11, Y11
	VPXOR Y13, Y11, Y11
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x07, Y11, Y13
	VPSRLD $0x19, Y11, Y11
	VPXOR Y13, Y11, Y11
	VMOVDQA 224(BP), Y13
	// Rotate lanes back to column form.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y3, Y3, Y3
	DECQ R9
	JNE sealAVX2IntroLoop
	// Finish all four 2-block states by re-adding the initial rows.
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 32(BP), Y10, Y10
	VPADDD 32(BP), Y11, Y11
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD 64(BP), Y8, Y8
	VPADDD 64(BP), Y15, Y15
	VPADDD 96(BP), Y4, Y4
	VPADDD 128(BP), Y1, Y1
	VPADDD 160(BP), Y2, Y2
	VPADDD 192(BP), Y3, Y3
	// Recombine 128-bit halves: low half of state 0 becomes the
	// Poly1305 key material, the rest is keystream.
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPERM2I128 $0x02, Y0, Y14, Y4
	VPERM2I128 $0x13, Y0, Y14, Y0

	// Clamp and store poly key
	VPAND ·polyClampMask<>+0(SB), Y4, Y4
	VMOVDQA Y4, (BP)

	// Hash AD
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)

	// Can store at least 320 bytes
	VPXOR (SI), Y0, Y0
	VPXOR 32(SI), Y12, Y12
	VMOVDQU Y0, (DI)
	VMOVDQU Y12, 32(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR 64(SI), Y0, Y0
	VPXOR 96(SI), Y14, Y14
	VPXOR 128(SI), Y12, Y12
	VPXOR 160(SI), Y4, Y4
	VMOVDQU Y0, 64(DI)
	VMOVDQU Y14, 96(DI)
	VMOVDQU Y12, 128(DI)
	VMOVDQU Y4, 160(DI)
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR 192(SI), Y0, Y0
	VPXOR 224(SI), Y14, Y14
	VPXOR 256(SI), Y12, Y12
	VPXOR 288(SI), Y4, Y4
	VMOVDQU Y0, 192(DI)
	VMOVDQU Y14, 224(DI)
	VMOVDQU Y12, 256(DI)
	VMOVDQU Y4, 288(DI)
	// CX = 320 ciphertext bytes written but not yet hashed.
	MOVQ $0x00000140, CX
	SUBQ $0x00000140, BX
	LEAQ 320(SI), SI
	// Recombine the fourth state's keystream for the next 128 bytes.
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, Y15, Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, Y15, Y3, Y4
	CMPQ BX, $0x80
	JBE sealAVX2SealHash
	VPXOR (SI), Y0, Y0
	VPXOR 32(SI), Y14, Y14
	VPXOR 64(SI), Y12, Y12
	VPXOR 96(SI), Y4, Y4
	VMOVDQU Y0, 320(DI)
	VMOVDQU Y14, 352(DI)
	VMOVDQU Y12, 384(DI)
	VMOVDQU Y4, 416(DI)
	SUBQ $0x80, BX
	LEAQ 128(SI), SI
	// CX/R9 configure the main loop's hash/round interleaving.
	MOVQ $0x00000008, CX
	MOVQ $0x00000002, R9
	CMPQ BX, $0x80
	JBE sealAVX2Tail128
	CMPQ BX, $0x00000100
	JBE sealAVX2Tail256
	CMPQ BX, $0x00000180
	JBE sealAVX2Tail384
	CMPQ BX, $0x00000200
	JBE sealAVX2Tail512

	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	// First column round of the new batch (Y15 spilled to 224(BP)).
	VMOVDQA Y15, 224(BP)
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VMOVDQA 224(BP), Y15
	VMOVDQA Y13, 224(BP)
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x0c, Y11, Y13
	VPSRLD $0x14, Y11, Y11
	VPXOR Y13, Y11, Y11
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x07, Y11, Y13
	VPSRLD $0x19, Y11, Y11
	VPXOR Y13, Y11, Y11
	VMOVDQA 224(BP), Y13
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y3, Y3, Y3
	VMOVDQA Y15, 224(BP)
	// Diagonal round of the new batch.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VMOVDQA 224(BP), Y15
	VMOVDQA Y13, 224(BP)
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x0c, Y11, Y13
	VPSRLD $0x14, Y11, Y11
	VPXOR Y13, Y11, Y11
	VPADDD Y11, Y7, Y7
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y3, Y15, Y15
	VPXOR Y15, Y11, Y11
	VPSLLD $0x07, Y11, Y13
	VPSRLD $0x19, Y11, Y11
	VPXOR Y13, Y11, Y11
	VMOVDQA 224(BP), Y13
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y3, Y3, Y3
	// Partial third round (column adds/xors/rol16/rol12 only) before
	// jumping into the middle of the main loop.
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	// Back DI up 16 bytes so the loop's hash step lines up, and enter
	// the main loop mid-iteration with 9 rounds left.
	SUBQ $0x10, DI
	MOVQ $0x00000009, CX
	JMP sealAVX2InternalLoopStart

sealAVX2MainLoop:
	// Main 512-bytes-per-iteration loop: rebuild four fresh 2-block
	// states. (Loop body continues beyond this view.)
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
7657
// sealAVX2MainLoop: top of the 512-byte-per-iteration seal loop.
// Materializes four parallel ChaCha20 states (rows in Y0/Y5/Y6/Y7,
// Y14/Y9/Y10/Y11, Y12/Y13/Y8/Y15, counters Y4/Y1/Y2/Y3) from the saved
// state at (BP): constants from ·chacha20Constants, key halves from
// 32(BP)/64(BP), and four successive block counters derived from 192(BP).
sealAVX2MainLoop:
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	// Counter rows: each state gets the previous counter + increment mask.
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	// Stash the four counter rows so they can be re-added after the rounds.
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	// CX = 10 double-round iterations (20 ChaCha rounds total).
	MOVQ $0x0000000a, CX
7681
// sealAVX2InternalLoop: one ChaCha double-round over four states,
// interleaved with Poly1305 absorption of 16 bytes of already-written
// ciphertext at (DI). Poly1305 state: h = R10:R11:R12 (130-bit), key limbs
// r0 = (BP), r1 = 8(BP); MULX-based h *= r with reduction mod 2^130-5.
// NOTE(review): falls through into sealAVX2InternalLoopStart below, which
// completes the double round — this label covers only the column half.
sealAVX2InternalLoop:
	// Poly1305: h += 16 ciphertext bytes (little-endian) with the 2^128 pad bit.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	// ChaCha column round, step 1: a += b; d ^= a; d <<<= 16.
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	// Poly1305: start h * r (r0 limb).
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	// Poly1305: h * r (r1 limb).
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	// ChaCha column round, step 2: c += d; b ^= c; b <<<= 12.
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	// Y15 is both a state row and the only free temporary: spill to 224(BP).
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	// Poly1305: reduce the 5-limb product back to h (mod 2^130-5):
	// keep low 130 bits, then h += (top>>2)*5 via the AND/SHR trick.
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
7751
// sealAVX2InternalLoopStart: second half of the column round (<<<8, <<<7),
// the diagonalize/rounds/un-diagonalize for the diagonal round, and two more
// 16-byte Poly1305 absorptions (at 16(DI) and 32(DI); DI advances 48/iter).
// Entered directly (with CX=9) on the first pass after the pre-loop half
// round above, otherwise reached by fall-through from sealAVX2InternalLoop.
sealAVX2InternalLoopStart:
	// Column round, step 3: a += b; d ^= a; d <<<= 8.
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	// Poly1305: h += ciphertext[16:32].
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	// Column round, step 4: c += d; b ^= c; b <<<= 7.
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	// Diagonalize: rotate rows so the diagonal round operates on columns.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3
	// Diagonal round, step 1: a += b; d ^= a; d <<<= 16.
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	// Poly1305 reduction mod 2^130-5 (see sealAVX2InternalLoop).
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Diagonal round, step 2: c += d; b ^= c; b <<<= 12.
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	// Poly1305: h += ciphertext[32:48]; advance DI by the 48 bytes hashed.
	ADDQ 32(DI), R10
	ADCQ 40(DI), R11
	ADCQ $0x01, R12
	LEAQ 48(DI), DI
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	// Diagonal round, step 3: a += b; d ^= a; d <<<= 8.
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	// Diagonal round, step 4: c += d; b ^= c; b <<<= 7.
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Un-diagonalize: rotate rows back to column orientation.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	DECQ CX
	JNE sealAVX2InternalLoop
	// Rounds done: add the original state back into each of the four
	// working states (constants, key rows from 32/64(BP), counters from
	// 96/128/160/192(BP)) to produce 512 bytes of keystream.
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 32(BP), Y10, Y10
	VPADDD 32(BP), Y11, Y11
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD 64(BP), Y8, Y8
	VPADDD 64(BP), Y15, Y15
	VPADDD 96(BP), Y4, Y4
	VPADDD 128(BP), Y1, Y1
	VPADDD 160(BP), Y2, Y2
	VPADDD 192(BP), Y3, Y3
	VMOVDQA Y15, 224(BP)

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	// Re-interleave the row-major keystream into 128-byte output blocks,
	// XOR with plaintext at SI, and store ciphertext at DI (bytes 0-255).
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPERM2I128 $0x13, Y0, Y14, Y14
	VPERM2I128 $0x02, Y12, Y4, Y0
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPXOR (SI), Y15, Y15
	VPXOR 32(SI), Y0, Y0
	VPXOR 64(SI), Y14, Y14
	VPXOR 96(SI), Y12, Y12
	VMOVDQU Y15, (DI)
	VMOVDQU Y0, 32(DI)
	VMOVDQU Y14, 64(DI)
	VMOVDQU Y12, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR 128(SI), Y0, Y0
	VPXOR 160(SI), Y14, Y14
	VPXOR 192(SI), Y12, Y12
	VPXOR 224(SI), Y4, Y4
	VMOVDQU Y0, 128(DI)
	VMOVDQU Y14, 160(DI)
	VMOVDQU Y12, 192(DI)
	VMOVDQU Y4, 224(DI)

	// and here: hash the second 16 bytes of the remaining 32
	// (DI already advanced past them, hence the negative offsets).
	ADDQ -16(DI), R10
	ADCQ -8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Bytes 256-511 of this iteration's output (fourth state's 2nd key row
	// was spilled to 224(BP) above).
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR 256(SI), Y0, Y0
	VPXOR 288(SI), Y14, Y14
	VPXOR 320(SI), Y12, Y12
	VPXOR 352(SI), Y4, Y4
	VMOVDQU Y0, 256(DI)
	VMOVDQU Y14, 288(DI)
	VMOVDQU Y12, 320(DI)
	VMOVDQU Y4, 352(DI)
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	VPXOR 384(SI), Y0, Y0
	VPXOR 416(SI), Y14, Y14
	VPXOR 448(SI), Y12, Y12
	VPXOR 480(SI), Y4, Y4
	VMOVDQU Y0, 384(DI)
	VMOVDQU Y14, 416(DI)
	VMOVDQU Y12, 448(DI)
	VMOVDQU Y4, 480(DI)
	// BX = plaintext bytes remaining; loop while more than 512 remain.
	LEAQ 512(SI), SI
	SUBQ $0x00000200, BX
	CMPQ BX, $0x00000200
	JG sealAVX2MainLoop

	// Tail can only hash 480 bytes: catch up on the last 32 bytes of
	// ciphertext from the final main-loop iteration before dispatching.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	// Dispatch on remaining length to the appropriately sized tail routine.
	MOVQ $0x0000000a, CX
	MOVQ $0x00000000, R9
	CMPQ BX, $0x80
	JBE sealAVX2Tail128
	CMPQ BX, $0x00000100
	JBE sealAVX2Tail256
	CMPQ BX, $0x00000180
	JBE sealAVX2Tail384
	JMP sealAVX2Tail512
8172
// seal192AVX2: short-message path generating up to 192 bytes of keystream
// from two ChaCha20 states (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1). No Poly1305
// work is interleaved here; the first 32 bytes of keystream become the
// Poly1305 key. Y6/Y10/Y8/Y2/Y15 keep copies of the initial state rows so
// they can be re-added after the rounds.
seal192AVX2:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VMOVDQA Y4, Y2
	VMOVDQA Y1, Y15
	// R9 = 10 double rounds (20 ChaCha rounds).
	MOVQ $0x0000000a, R9

// One ChaCha double round over the two states; Y3 is the rotate temporary.
sealAVX2192InnerCipherLoop:
	// Column round, state 1.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	// Column round, state 2.
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	// Diagonalize.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	// Diagonal round, state 1.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	// Diagonal round, state 2.
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	// Un-diagonalize.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	DECQ R9
	JNE sealAVX2192InnerCipherLoop
	// Add the saved initial state back into both working states.
	VPADDD Y6, Y0, Y0
	VPADDD Y6, Y5, Y5
	VPADDD Y10, Y14, Y14
	VPADDD Y10, Y9, Y9
	VPADDD Y8, Y12, Y12
	VPADDD Y8, Y13, Y13
	VPADDD Y2, Y4, Y4
	VPADDD Y15, Y1, Y1
	VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
	VPAND ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for up to 192 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
8285
// sealAVX2ShortSeal: hash the additional data, then fall into the
// encrypt-and-hash loop for short messages. R9 carries ad_len into
// polyHashADInternal; CX counts bytes encrypted but not yet hashed.
sealAVX2ShortSeal:
	// Hash aad
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)
	XORQ CX, CX

// sealAVX2SealHash: absorb any pending whole 16-byte ciphertext blocks at
// DI into the Poly1305 state (MULQ-based h = (h + block) * r mod 2^130-5),
// then continue to the encryption loop.
sealAVX2SealHash:
	// itr1 holds the number of bytes encrypted but not yet hashed
	CMPQ CX, $0x10
	JB sealAVX2ShortSealLoop
	// h += 16 bytes of ciphertext plus the 2^128 pad bit.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	// h *= r (schoolbook, r0/r1 limbs at (BP)/8(BP)).
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	// Reduce mod 2^130-5: keep low 130 bits, add (top>>2)*5.
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	SUBQ $0x10, CX
	ADDQ $0x10, DI
	JMP sealAVX2SealHash
8340
// sealAVX2ShortSealLoop: encrypt and hash 32 bytes per iteration using the
// pre-generated keystream held in Y0,Y14,Y12,Y4,Y5,Y9,Y13,Y1,Y6,Y10,...,
// shifting the register queue down by one 32-byte unit each pass.
// BX = plaintext bytes remaining.
sealAVX2ShortSealLoop:
	CMPQ BX, $0x20
	JB sealAVX2ShortTail32
	SUBQ $0x20, BX

	// Load for encryption
	VPXOR (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ 32(SI), SI

	// Now can hash
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Second 16-byte ciphertext block of this 32-byte chunk.
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI

	// Shift stream left
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	VMOVDQA Y5, Y4
	VMOVDQA Y9, Y5
	VMOVDQA Y13, Y9
	VMOVDQA Y1, Y13
	VMOVDQA Y6, Y1
	VMOVDQA Y10, Y6
	JMP sealAVX2ShortSealLoop
8433
// sealAVX2ShortTail32: fewer than 32 bytes remain. If at least 16 remain,
// encrypt+hash one 16-byte block with the low half of Y0, then shift the
// high half down (VPERM2I128 $0x11) for the final sub-16-byte tail, which
// is handled after the jump by the SSE tail code.
sealAVX2ShortTail32:
	CMPQ BX, $0x10
	// X1 = next keystream chunk for the <16-byte tail handler.
	VMOVDQA X0, X1
	JB sealAVX2ShortDone
	SUBQ $0x10, BX

	// Load for encryption
	VPXOR (SI), X0, X12
	VMOVDQU X12, (DI)
	LEAQ 16(SI), SI

	// Hash
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
	// Move the upper 128 bits of Y0 into the low lane for the final tail.
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA X0, X1

sealAVX2ShortDone:
	// Leaving AVX2 code: clear upper YMM state before running SSE tail code.
	VZEROUPPER
	JMP sealSSETail
8487
// seal320AVX2: short-message path generating up to 320 bytes of keystream
// from three ChaCha20 states. Y7/Y11/Y15 keep copies of the key rows and
// base counter for the post-round re-add; Y3 is the rotate temporary.
seal320AVX2:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y14, Y7
	VMOVDQA Y12, Y11
	VMOVDQA Y4, Y15
	// R9 = 10 double rounds (20 ChaCha rounds).
	MOVQ $0x0000000a, R9

// One ChaCha double round over the three states.
sealAVX2320InnerCipherLoop:
	// Column rounds.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	// Diagonalize.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	// Diagonal rounds.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	// Un-diagonalize.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ R9
	JNE sealAVX2320InnerCipherLoop
	// Add initial state back: constants, key rows (Y7/Y11), and counters
	// reconstructed from Y15 plus successive increments.
	VMOVDQA ·chacha20Constants<>+0(SB), Y3
	VPADDD Y3, Y0, Y0
	VPADDD Y3, Y5, Y5
	VPADDD Y3, Y6, Y6
	VPADDD Y7, Y14, Y14
	VPADDD Y7, Y9, Y9
	VPADDD Y7, Y10, Y10
	VPADDD Y11, Y12, Y12
	VPADDD Y11, Y13, Y13
	VPADDD Y11, Y8, Y8
	VMOVDQA ·avx2IncMask<>+0(SB), Y3
	VPADDD Y15, Y4, Y4
	VPADDD Y3, Y15, Y15
	VPADDD Y15, Y1, Y1
	VPADDD Y3, Y15, Y15
	VPADDD Y15, Y2, Y2

	// Clamp and store poly key
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPAND ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
	VPERM2I128 $0x02, Y6, Y10, Y13
	VPERM2I128 $0x02, Y8, Y2, Y1
	VPERM2I128 $0x13, Y6, Y10, Y6
	VPERM2I128 $0x13, Y8, Y2, Y10
	JMP sealAVX2ShortSeal
8653
// sealAVX2Tail128: final <=128 bytes. Runs a single ChaCha20 state
// (Y0/Y14/Y12/Y4) while continuing to hash the not-yet-hashed ciphertext
// from the preceding main-loop iterations. CX counts extra hash-only
// iterations (LoopA), R9 the remaining double rounds (LoopB).
sealAVX2Tail128:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA 32(BP), Y14
	VMOVDQA 64(BP), Y12
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	// Y1 preserves the counter row for the post-round re-add.
	VMOVDQA Y4, Y1

// LoopA: absorb one extra 16-byte ciphertext block per pass (MULQ-based
// Poly1305 h = (h + block) * r mod 2^130-5).
sealAVX2Tail128LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

// LoopB: one ChaCha double round interleaved with hashing 32 more bytes.
sealAVX2Tail128LoopB:
	// Column round.
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	// Poly1305: h += ciphertext[0:16]; h *= r; reduce.
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	// Diagonalize, diagonal round, then un-diagonalize.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	// Poly1305: h += ciphertext[16:32]; h *= r; reduce.
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	// CX: rounds that also hash; R9: remaining rounds without hashing.
	DECQ CX
	JG sealAVX2Tail128LoopA
	DECQ R9
	JGE sealAVX2Tail128LoopB
	// Finalize: add initial state, re-interleave into output order, and
	// continue in the 32-bytes-at-a-time short seal loop.
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
	VPADDD 32(BP), Y14, Y9
	VPADDD 64(BP), Y12, Y13
	VPADDD Y1, Y4, Y1
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP sealAVX2ShortSealLoop
8841
// sealAVX2Tail256: final <=256 bytes. Sets up two ChaCha20 states
// (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1); Y7/Y11 preserve the counter rows for
// the post-round re-add. LoopA below hashes pending ciphertext while the
// round loop (label past this view) runs the cipher.
sealAVX2Tail256:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA ·chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA 64(BP), Y13
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

// LoopA: absorb one extra 16-byte ciphertext block per pass
// (Poly1305 h = (h + block) * r mod 2^130-5, MULQ form).
sealAVX2Tail256LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
8898
// sealAVX2Tail256LoopB: one ChaCha20 double round over two parallel block
// lanes — lane 0 in Y0/Y14/Y12/Y4 and lane 1 in Y5/Y9/Y13/Y1 — interleaved
// with two Poly1305 block updates covering 32 bytes at (DI). Rotations by
// 16 and 8 use VPSHUFB with the rol16/rol8 shuffle constants; rotations by
// 12 and 7 use the VPSLLD/VPSRLD/VPXOR sequence with Y3 as scratch.
8899sealAVX2Tail256LoopB:
// Column round, lane 0.
8900 VPADDD Y14, Y0, Y0
8901 VPXOR Y0, Y4, Y4
8902 VPSHUFB ·rol16<>+0(SB), Y4, Y4
8903 VPADDD Y4, Y12, Y12
8904 VPXOR Y12, Y14, Y14
8905 VPSLLD $0x0c, Y14, Y3
8906 VPSRLD $0x14, Y14, Y14
8907 VPXOR Y3, Y14, Y14
8908 VPADDD Y14, Y0, Y0
8909 VPXOR Y0, Y4, Y4
8910 VPSHUFB ·rol8<>+0(SB), Y4, Y4
8911 VPADDD Y4, Y12, Y12
8912 VPXOR Y12, Y14, Y14
8913 VPSLLD $0x07, Y14, Y3
8914 VPSRLD $0x19, Y14, Y14
8915 VPXOR Y3, Y14, Y14
// Column round, lane 1.
8916 VPADDD Y9, Y5, Y5
8917 VPXOR Y5, Y1, Y1
8918 VPSHUFB ·rol16<>+0(SB), Y1, Y1
8919 VPADDD Y1, Y13, Y13
8920 VPXOR Y13, Y9, Y9
8921 VPSLLD $0x0c, Y9, Y3
8922 VPSRLD $0x14, Y9, Y9
8923 VPXOR Y3, Y9, Y9
8924 VPADDD Y9, Y5, Y5
8925 VPXOR Y5, Y1, Y1
8926 VPSHUFB ·rol8<>+0(SB), Y1, Y1
8927 VPADDD Y1, Y13, Y13
8928 VPXOR Y13, Y9, Y9
8929 VPSLLD $0x07, Y9, Y3
8930 VPSRLD $0x19, Y9, Y9
8931 VPXOR Y3, Y9, Y9
// Poly1305 update #1: absorb 16 bytes at (DI), multiply by r, reduce
// (same multiply/reduce sequence as sealAVX2Tail256LoopA).
8932 ADDQ (DI), R10
8933 ADCQ 8(DI), R11
8934 ADCQ $0x01, R12
8935 MOVQ (BP), AX
8936 MOVQ AX, R15
8937 MULQ R10
8938 MOVQ AX, R13
8939 MOVQ DX, R14
8940 MOVQ (BP), AX
8941 MULQ R11
8942 IMULQ R12, R15
8943 ADDQ AX, R14
8944 ADCQ DX, R15
8945 MOVQ 8(BP), AX
8946 MOVQ AX, R8
8947 MULQ R10
8948 ADDQ AX, R14
8949 ADCQ $0x00, DX
8950 MOVQ DX, R10
8951 MOVQ 8(BP), AX
8952 MULQ R11
8953 ADDQ AX, R15
8954 ADCQ $0x00, DX
8955 IMULQ R12, R8
8956 ADDQ R10, R15
8957 ADCQ DX, R8
8958 MOVQ R13, R10
8959 MOVQ R14, R11
8960 MOVQ R15, R12
8961 ANDQ $0x03, R12
8962 MOVQ R15, R13
8963 ANDQ $-4, R13
8964 MOVQ R8, R14
8965 SHRQ $0x02, R8, R15
8966 SHRQ $0x02, R8
8967 ADDQ R13, R10
8968 ADCQ R14, R11
8969 ADCQ $0x00, R12
8970 ADDQ R15, R10
8971 ADCQ R8, R11
8972 ADCQ $0x00, R12
// Diagonalize both lanes for the diagonal round.
8973 VPALIGNR $0x04, Y14, Y14, Y14
8974 VPALIGNR $0x04, Y9, Y9, Y9
8975 VPALIGNR $0x08, Y12, Y12, Y12
8976 VPALIGNR $0x08, Y13, Y13, Y13
8977 VPALIGNR $0x0c, Y4, Y4, Y4
8978 VPALIGNR $0x0c, Y1, Y1, Y1
// Diagonal round, lane 0.
8979 VPADDD Y14, Y0, Y0
8980 VPXOR Y0, Y4, Y4
8981 VPSHUFB ·rol16<>+0(SB), Y4, Y4
8982 VPADDD Y4, Y12, Y12
8983 VPXOR Y12, Y14, Y14
8984 VPSLLD $0x0c, Y14, Y3
8985 VPSRLD $0x14, Y14, Y14
8986 VPXOR Y3, Y14, Y14
8987 VPADDD Y14, Y0, Y0
8988 VPXOR Y0, Y4, Y4
8989 VPSHUFB ·rol8<>+0(SB), Y4, Y4
8990 VPADDD Y4, Y12, Y12
8991 VPXOR Y12, Y14, Y14
8992 VPSLLD $0x07, Y14, Y3
8993 VPSRLD $0x19, Y14, Y14
8994 VPXOR Y3, Y14, Y14
// Diagonal round, lane 1.
8995 VPADDD Y9, Y5, Y5
8996 VPXOR Y5, Y1, Y1
8997 VPSHUFB ·rol16<>+0(SB), Y1, Y1
8998 VPADDD Y1, Y13, Y13
8999 VPXOR Y13, Y9, Y9
9000 VPSLLD $0x0c, Y9, Y3
9001 VPSRLD $0x14, Y9, Y9
9002 VPXOR Y3, Y9, Y9
9003 VPADDD Y9, Y5, Y5
9004 VPXOR Y5, Y1, Y1
9005 VPSHUFB ·rol8<>+0(SB), Y1, Y1
9006 VPADDD Y1, Y13, Y13
9007 VPXOR Y13, Y9, Y9
9008 VPSLLD $0x07, Y9, Y3
9009 VPSRLD $0x19, Y9, Y9
9010 VPXOR Y3, Y9, Y9
// Poly1305 update #2: absorb the next 16 bytes at 16(DI).
9011 ADDQ 16(DI), R10
9012 ADCQ 24(DI), R11
9013 ADCQ $0x01, R12
9014 MOVQ (BP), AX
9015 MOVQ AX, R15
9016 MULQ R10
9017 MOVQ AX, R13
9018 MOVQ DX, R14
9019 MOVQ (BP), AX
9020 MULQ R11
9021 IMULQ R12, R15
9022 ADDQ AX, R14
9023 ADCQ DX, R15
9024 MOVQ 8(BP), AX
9025 MOVQ AX, R8
9026 MULQ R10
9027 ADDQ AX, R14
9028 ADCQ $0x00, DX
9029 MOVQ DX, R10
9030 MOVQ 8(BP), AX
9031 MULQ R11
9032 ADDQ AX, R15
9033 ADCQ $0x00, DX
9034 IMULQ R12, R8
9035 ADDQ R10, R15
9036 ADCQ DX, R8
9037 MOVQ R13, R10
9038 MOVQ R14, R11
9039 MOVQ R15, R12
9040 ANDQ $0x03, R12
9041 MOVQ R15, R13
9042 ANDQ $-4, R13
9043 MOVQ R8, R14
9044 SHRQ $0x02, R8, R15
9045 SHRQ $0x02, R8
9046 ADDQ R13, R10
9047 ADCQ R14, R11
9048 ADCQ $0x00, R12
9049 ADDQ R15, R10
9050 ADCQ R8, R11
9051 ADCQ $0x00, R12
9052 LEAQ 32(DI), DI
// Un-diagonalize (inverse of the rotations above).
9053 VPALIGNR $0x0c, Y14, Y14, Y14
9054 VPALIGNR $0x0c, Y9, Y9, Y9
9055 VPALIGNR $0x08, Y12, Y12, Y12
9056 VPALIGNR $0x08, Y13, Y13, Y13
9057 VPALIGNR $0x04, Y4, Y4, Y4
9058 VPALIGNR $0x04, Y1, Y1, Y1
// CX counts iterations that still hash input (LoopA path); R9 counts the
// remaining plain double rounds (LoopB path).
9059 DECQ CX
9060 JG sealAVX2Tail256LoopA
9061 DECQ R9
9062 JGE sealAVX2Tail256LoopB
// Rounds done: add the initial state back (constants, key words at 32(BP),
// 64(BP), and the saved per-lane counters in Y7/Y11).
9063 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
9064 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
9065 VPADDD 32(BP), Y14, Y14
9066 VPADDD 32(BP), Y9, Y9
9067 VPADDD 64(BP), Y12, Y12
9068 VPADDD 64(BP), Y13, Y13
9069 VPADDD Y7, Y4, Y4
9070 VPADDD Y11, Y1, Y1
// Reassemble lane 0's 128 bytes of keystream, XOR with plaintext at (SI),
// and store ciphertext at (DI).
9071 VPERM2I128 $0x02, Y0, Y14, Y3
9072 VPERM2I128 $0x02, Y12, Y4, Y7
9073 VPERM2I128 $0x13, Y0, Y14, Y11
9074 VPERM2I128 $0x13, Y12, Y4, Y15
9075 VPXOR (SI), Y3, Y3
9076 VPXOR 32(SI), Y7, Y7
9077 VPXOR 64(SI), Y11, Y11
9078 VPXOR 96(SI), Y15, Y15
9079 VMOVDQU Y3, (DI)
9080 VMOVDQU Y7, 32(DI)
9081 VMOVDQU Y11, 64(DI)
9082 VMOVDQU Y15, 96(DI)
// 128 bytes sealed: CX = bytes to hash in sealAVX2SealHash, advance input,
// shrink remaining length, and stage lane 1's keystream in Y0/Y14/Y12/Y4
// for the tail handler.
9083 MOVQ $0x00000080, CX
9084 LEAQ 128(SI), SI
9085 SUBQ $0x80, BX
9086 VPERM2I128 $0x02, Y5, Y9, Y0
9087 VPERM2I128 $0x02, Y13, Y1, Y14
9088 VPERM2I128 $0x13, Y5, Y9, Y12
9089 VPERM2I128 $0x13, Y13, Y1, Y4
9090 JMP sealAVX2SealHash
9091
// sealAVX2Tail384: set up three parallel ChaCha20 block lanes for a tail of
// up to 384 bytes. Each lane gets the constants, the key words cached at
// 32(BP)/64(BP), and a counter derived from 192(BP) incremented by the
// avx2IncMask per lane; the three counters are saved in Y7/Y11/Y15 so they
// can be re-added after the rounds.
9092sealAVX2Tail384:
9093 VMOVDQA ·chacha20Constants<>+0(SB), Y0
9094 VMOVDQA Y0, Y5
9095 VMOVDQA Y0, Y6
9096 VMOVDQA 32(BP), Y14
9097 VMOVDQA Y14, Y9
9098 VMOVDQA Y14, Y10
9099 VMOVDQA 64(BP), Y12
9100 VMOVDQA Y12, Y13
9101 VMOVDQA Y12, Y8
9102 VMOVDQA 192(BP), Y4
9103 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
9104 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
9105 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
9106 VMOVDQA Y4, Y7
9107 VMOVDQA Y1, Y11
9108 VMOVDQA Y2, Y15
9109
// sealAVX2Tail384LoopA: Poly1305 absorb-and-multiply of one 16-byte
// ciphertext block at (DI) — identical structure to sealAVX2Tail256LoopA:
// add block + pad bit into R10:R11:R12, multiply by the key limbs at
// 0(BP)/8(BP), reduce mod 2^130-5, advance DI.
9110sealAVX2Tail384LoopA:
9111 ADDQ (DI), R10
9112 ADCQ 8(DI), R11
9113 ADCQ $0x01, R12
9114 MOVQ (BP), AX
9115 MOVQ AX, R15
9116 MULQ R10
9117 MOVQ AX, R13
9118 MOVQ DX, R14
9119 MOVQ (BP), AX
9120 MULQ R11
9121 IMULQ R12, R15
9122 ADDQ AX, R14
9123 ADCQ DX, R15
9124 MOVQ 8(BP), AX
9125 MOVQ AX, R8
9126 MULQ R10
9127 ADDQ AX, R14
9128 ADCQ $0x00, DX
9129 MOVQ DX, R10
9130 MOVQ 8(BP), AX
9131 MULQ R11
9132 ADDQ AX, R15
9133 ADCQ $0x00, DX
9134 IMULQ R12, R8
9135 ADDQ R10, R15
9136 ADCQ DX, R8
// Reduction mod 2^130-5 (fold top bits back in times 5 = 4x + x).
9137 MOVQ R13, R10
9138 MOVQ R14, R11
9139 MOVQ R15, R12
9140 ANDQ $0x03, R12
9141 MOVQ R15, R13
9142 ANDQ $-4, R13
9143 MOVQ R8, R14
9144 SHRQ $0x02, R8, R15
9145 SHRQ $0x02, R8
9146 ADDQ R13, R10
9147 ADCQ R14, R11
9148 ADCQ $0x00, R12
9149 ADDQ R15, R10
9150 ADCQ R8, R11
9151 ADCQ $0x00, R12
9152 LEAQ 16(DI), DI
9153
// sealAVX2Tail384LoopB: one ChaCha20 double round over three parallel lanes
// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2) interleaved with two Poly1305
// block updates over 32 bytes at (DI). Same rotation idioms as the 256-byte
// tail: VPSHUFB for rol16/rol8, shift+xor via Y3 for rol12/rol7.
9154sealAVX2Tail384LoopB:
// Column round, lane 0.
9155 VPADDD Y14, Y0, Y0
9156 VPXOR Y0, Y4, Y4
9157 VPSHUFB ·rol16<>+0(SB), Y4, Y4
9158 VPADDD Y4, Y12, Y12
9159 VPXOR Y12, Y14, Y14
9160 VPSLLD $0x0c, Y14, Y3
9161 VPSRLD $0x14, Y14, Y14
9162 VPXOR Y3, Y14, Y14
9163 VPADDD Y14, Y0, Y0
9164 VPXOR Y0, Y4, Y4
9165 VPSHUFB ·rol8<>+0(SB), Y4, Y4
9166 VPADDD Y4, Y12, Y12
9167 VPXOR Y12, Y14, Y14
9168 VPSLLD $0x07, Y14, Y3
9169 VPSRLD $0x19, Y14, Y14
9170 VPXOR Y3, Y14, Y14
// Column round, lane 1.
9171 VPADDD Y9, Y5, Y5
9172 VPXOR Y5, Y1, Y1
9173 VPSHUFB ·rol16<>+0(SB), Y1, Y1
9174 VPADDD Y1, Y13, Y13
9175 VPXOR Y13, Y9, Y9
9176 VPSLLD $0x0c, Y9, Y3
9177 VPSRLD $0x14, Y9, Y9
9178 VPXOR Y3, Y9, Y9
9179 VPADDD Y9, Y5, Y5
9180 VPXOR Y5, Y1, Y1
9181 VPSHUFB ·rol8<>+0(SB), Y1, Y1
9182 VPADDD Y1, Y13, Y13
9183 VPXOR Y13, Y9, Y9
9184 VPSLLD $0x07, Y9, Y3
9185 VPSRLD $0x19, Y9, Y9
9186 VPXOR Y3, Y9, Y9
// Column round, lane 2.
9187 VPADDD Y10, Y6, Y6
9188 VPXOR Y6, Y2, Y2
9189 VPSHUFB ·rol16<>+0(SB), Y2, Y2
9190 VPADDD Y2, Y8, Y8
9191 VPXOR Y8, Y10, Y10
9192 VPSLLD $0x0c, Y10, Y3
9193 VPSRLD $0x14, Y10, Y10
9194 VPXOR Y3, Y10, Y10
9195 VPADDD Y10, Y6, Y6
9196 VPXOR Y6, Y2, Y2
9197 VPSHUFB ·rol8<>+0(SB), Y2, Y2
9198 VPADDD Y2, Y8, Y8
9199 VPXOR Y8, Y10, Y10
9200 VPSLLD $0x07, Y10, Y3
9201 VPSRLD $0x19, Y10, Y10
9202 VPXOR Y3, Y10, Y10
// Poly1305 update #1: absorb 16 bytes at (DI), multiply by r, reduce.
9203 ADDQ (DI), R10
9204 ADCQ 8(DI), R11
9205 ADCQ $0x01, R12
9206 MOVQ (BP), AX
9207 MOVQ AX, R15
9208 MULQ R10
9209 MOVQ AX, R13
9210 MOVQ DX, R14
9211 MOVQ (BP), AX
9212 MULQ R11
9213 IMULQ R12, R15
9214 ADDQ AX, R14
9215 ADCQ DX, R15
9216 MOVQ 8(BP), AX
9217 MOVQ AX, R8
9218 MULQ R10
9219 ADDQ AX, R14
9220 ADCQ $0x00, DX
9221 MOVQ DX, R10
9222 MOVQ 8(BP), AX
9223 MULQ R11
9224 ADDQ AX, R15
9225 ADCQ $0x00, DX
9226 IMULQ R12, R8
9227 ADDQ R10, R15
9228 ADCQ DX, R8
9229 MOVQ R13, R10
9230 MOVQ R14, R11
9231 MOVQ R15, R12
9232 ANDQ $0x03, R12
9233 MOVQ R15, R13
9234 ANDQ $-4, R13
9235 MOVQ R8, R14
9236 SHRQ $0x02, R8, R15
9237 SHRQ $0x02, R8
9238 ADDQ R13, R10
9239 ADCQ R14, R11
9240 ADCQ $0x00, R12
9241 ADDQ R15, R10
9242 ADCQ R8, R11
9243 ADCQ $0x00, R12
// Diagonalize all three lanes.
9244 VPALIGNR $0x04, Y14, Y14, Y14
9245 VPALIGNR $0x04, Y9, Y9, Y9
9246 VPALIGNR $0x04, Y10, Y10, Y10
9247 VPALIGNR $0x08, Y12, Y12, Y12
9248 VPALIGNR $0x08, Y13, Y13, Y13
9249 VPALIGNR $0x08, Y8, Y8, Y8
9250 VPALIGNR $0x0c, Y4, Y4, Y4
9251 VPALIGNR $0x0c, Y1, Y1, Y1
9252 VPALIGNR $0x0c, Y2, Y2, Y2
// Diagonal round, lane 0.
9253 VPADDD Y14, Y0, Y0
9254 VPXOR Y0, Y4, Y4
9255 VPSHUFB ·rol16<>+0(SB), Y4, Y4
9256 VPADDD Y4, Y12, Y12
9257 VPXOR Y12, Y14, Y14
9258 VPSLLD $0x0c, Y14, Y3
9259 VPSRLD $0x14, Y14, Y14
9260 VPXOR Y3, Y14, Y14
9261 VPADDD Y14, Y0, Y0
9262 VPXOR Y0, Y4, Y4
9263 VPSHUFB ·rol8<>+0(SB), Y4, Y4
9264 VPADDD Y4, Y12, Y12
9265 VPXOR Y12, Y14, Y14
9266 VPSLLD $0x07, Y14, Y3
9267 VPSRLD $0x19, Y14, Y14
9268 VPXOR Y3, Y14, Y14
// Diagonal round, lane 1.
9269 VPADDD Y9, Y5, Y5
9270 VPXOR Y5, Y1, Y1
9271 VPSHUFB ·rol16<>+0(SB), Y1, Y1
9272 VPADDD Y1, Y13, Y13
9273 VPXOR Y13, Y9, Y9
9274 VPSLLD $0x0c, Y9, Y3
9275 VPSRLD $0x14, Y9, Y9
9276 VPXOR Y3, Y9, Y9
9277 VPADDD Y9, Y5, Y5
9278 VPXOR Y5, Y1, Y1
9279 VPSHUFB ·rol8<>+0(SB), Y1, Y1
9280 VPADDD Y1, Y13, Y13
9281 VPXOR Y13, Y9, Y9
9282 VPSLLD $0x07, Y9, Y3
9283 VPSRLD $0x19, Y9, Y9
9284 VPXOR Y3, Y9, Y9
// Diagonal round, lane 2.
9285 VPADDD Y10, Y6, Y6
9286 VPXOR Y6, Y2, Y2
9287 VPSHUFB ·rol16<>+0(SB), Y2, Y2
9288 VPADDD Y2, Y8, Y8
9289 VPXOR Y8, Y10, Y10
9290 VPSLLD $0x0c, Y10, Y3
9291 VPSRLD $0x14, Y10, Y10
9292 VPXOR Y3, Y10, Y10
9293 VPADDD Y10, Y6, Y6
9294 VPXOR Y6, Y2, Y2
9295 VPSHUFB ·rol8<>+0(SB), Y2, Y2
9296 VPADDD Y2, Y8, Y8
9297 VPXOR Y8, Y10, Y10
9298 VPSLLD $0x07, Y10, Y3
9299 VPSRLD $0x19, Y10, Y10
9300 VPXOR Y3, Y10, Y10
// Poly1305 update #2: absorb the next 16 bytes at 16(DI).
9301 ADDQ 16(DI), R10
9302 ADCQ 24(DI), R11
9303 ADCQ $0x01, R12
9304 MOVQ (BP), AX
9305 MOVQ AX, R15
9306 MULQ R10
9307 MOVQ AX, R13
9308 MOVQ DX, R14
9309 MOVQ (BP), AX
9310 MULQ R11
9311 IMULQ R12, R15
9312 ADDQ AX, R14
9313 ADCQ DX, R15
9314 MOVQ 8(BP), AX
9315 MOVQ AX, R8
9316 MULQ R10
9317 ADDQ AX, R14
9318 ADCQ $0x00, DX
9319 MOVQ DX, R10
9320 MOVQ 8(BP), AX
9321 MULQ R11
9322 ADDQ AX, R15
9323 ADCQ $0x00, DX
9324 IMULQ R12, R8
9325 ADDQ R10, R15
9326 ADCQ DX, R8
9327 MOVQ R13, R10
9328 MOVQ R14, R11
9329 MOVQ R15, R12
9330 ANDQ $0x03, R12
9331 MOVQ R15, R13
9332 ANDQ $-4, R13
9333 MOVQ R8, R14
9334 SHRQ $0x02, R8, R15
9335 SHRQ $0x02, R8
9336 ADDQ R13, R10
9337 ADCQ R14, R11
9338 ADCQ $0x00, R12
9339 ADDQ R15, R10
9340 ADCQ R8, R11
9341 ADCQ $0x00, R12
9342 LEAQ 32(DI), DI
// Un-diagonalize all three lanes.
9343 VPALIGNR $0x0c, Y14, Y14, Y14
9344 VPALIGNR $0x0c, Y9, Y9, Y9
9345 VPALIGNR $0x0c, Y10, Y10, Y10
9346 VPALIGNR $0x08, Y12, Y12, Y12
9347 VPALIGNR $0x08, Y13, Y13, Y13
9348 VPALIGNR $0x08, Y8, Y8, Y8
9349 VPALIGNR $0x04, Y4, Y4, Y4
9350 VPALIGNR $0x04, Y1, Y1, Y1
9351 VPALIGNR $0x04, Y2, Y2, Y2
// CX iterations still hash input (LoopA); R9 counts remaining plain rounds.
9352 DECQ CX
9353 JG sealAVX2Tail384LoopA
9354 DECQ R9
9355 JGE sealAVX2Tail384LoopB
// Add the initial state back into all three lanes (constants, cached key
// words, and the saved counters Y7/Y11/Y15).
9356 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
9357 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
9358 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
9359 VPADDD 32(BP), Y14, Y14
9360 VPADDD 32(BP), Y9, Y9
9361 VPADDD 32(BP), Y10, Y10
9362 VPADDD 64(BP), Y12, Y12
9363 VPADDD 64(BP), Y13, Y13
9364 VPADDD 64(BP), Y8, Y8
9365 VPADDD Y7, Y4, Y4
9366 VPADDD Y11, Y1, Y1
9367 VPADDD Y15, Y2, Y2
// Lanes 0 and 1: reassemble 256 bytes of keystream, XOR with plaintext at
// (SI), store ciphertext at (DI).
9368 VPERM2I128 $0x02, Y0, Y14, Y3
9369 VPERM2I128 $0x02, Y12, Y4, Y7
9370 VPERM2I128 $0x13, Y0, Y14, Y11
9371 VPERM2I128 $0x13, Y12, Y4, Y15
9372 VPXOR (SI), Y3, Y3
9373 VPXOR 32(SI), Y7, Y7
9374 VPXOR 64(SI), Y11, Y11
9375 VPXOR 96(SI), Y15, Y15
9376 VMOVDQU Y3, (DI)
9377 VMOVDQU Y7, 32(DI)
9378 VMOVDQU Y11, 64(DI)
9379 VMOVDQU Y15, 96(DI)
9380 VPERM2I128 $0x02, Y5, Y9, Y3
9381 VPERM2I128 $0x02, Y13, Y1, Y7
9382 VPERM2I128 $0x13, Y5, Y9, Y11
9383 VPERM2I128 $0x13, Y13, Y1, Y15
9384 VPXOR 128(SI), Y3, Y3
9385 VPXOR 160(SI), Y7, Y7
9386 VPXOR 192(SI), Y11, Y11
9387 VPXOR 224(SI), Y15, Y15
9388 VMOVDQU Y3, 128(DI)
9389 VMOVDQU Y7, 160(DI)
9390 VMOVDQU Y11, 192(DI)
9391 VMOVDQU Y15, 224(DI)
// 256 bytes sealed: CX = bytes for sealAVX2SealHash to absorb; stage lane
// 2's keystream in Y0/Y14/Y12/Y4 for the final sub-256-byte tail.
9392 MOVQ $0x00000100, CX
9393 LEAQ 256(SI), SI
9394 SUBQ $0x00000100, BX
9395 VPERM2I128 $0x02, Y6, Y10, Y0
9396 VPERM2I128 $0x02, Y8, Y2, Y14
9397 VPERM2I128 $0x13, Y6, Y10, Y12
9398 VPERM2I128 $0x13, Y8, Y2, Y4
9399 JMP sealAVX2SealHash
9400
// sealAVX2Tail512: set up four parallel ChaCha20 block lanes for a tail of
// up to 512 bytes. All YMM registers are occupied by lane state here, so the
// four per-lane counters are spilled to 96..192(BP) instead of being kept in
// registers (and 224(BP) is used later as scratch for Y15).
9401sealAVX2Tail512:
9402 VMOVDQA ·chacha20Constants<>+0(SB), Y0
9403 VMOVDQA Y0, Y5
9404 VMOVDQA Y0, Y6
9405 VMOVDQA Y0, Y7
9406 VMOVDQA 32(BP), Y14
9407 VMOVDQA Y14, Y9
9408 VMOVDQA Y14, Y10
9409 VMOVDQA Y14, Y11
9410 VMOVDQA 64(BP), Y12
9411 VMOVDQA Y12, Y13
9412 VMOVDQA Y12, Y8
9413 VMOVDQA Y12, Y15
// Derive four consecutive counters from 192(BP) and save them for the
// final state re-add.
9414 VMOVDQA 192(BP), Y4
9415 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
9416 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
9417 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
9418 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
9419 VMOVDQA Y4, 96(BP)
9420 VMOVDQA Y1, 128(BP)
9421 VMOVDQA Y2, 160(BP)
9422 VMOVDQA Y3, 192(BP)
9423
// sealAVX2Tail512LoopA: Poly1305 absorb-and-multiply of one 16-byte
// ciphertext block at (DI) — same structure as the other tail LoopA blocks:
// add block + pad bit into R10:R11:R12, multiply by the key limbs at
// 0(BP)/8(BP), reduce mod 2^130-5, advance DI.
9424sealAVX2Tail512LoopA:
9425 ADDQ (DI), R10
9426 ADCQ 8(DI), R11
9427 ADCQ $0x01, R12
9428 MOVQ (BP), AX
9429 MOVQ AX, R15
9430 MULQ R10
9431 MOVQ AX, R13
9432 MOVQ DX, R14
9433 MOVQ (BP), AX
9434 MULQ R11
9435 IMULQ R12, R15
9436 ADDQ AX, R14
9437 ADCQ DX, R15
9438 MOVQ 8(BP), AX
9439 MOVQ AX, R8
9440 MULQ R10
9441 ADDQ AX, R14
9442 ADCQ $0x00, DX
9443 MOVQ DX, R10
9444 MOVQ 8(BP), AX
9445 MULQ R11
9446 ADDQ AX, R15
9447 ADCQ $0x00, DX
9448 IMULQ R12, R8
9449 ADDQ R10, R15
9450 ADCQ DX, R8
// Reduction mod 2^130-5 (fold top bits back in times 5 = 4x + x).
9451 MOVQ R13, R10
9452 MOVQ R14, R11
9453 MOVQ R15, R12
9454 ANDQ $0x03, R12
9455 MOVQ R15, R13
9456 ANDQ $-4, R13
9457 MOVQ R8, R14
9458 SHRQ $0x02, R8, R15
9459 SHRQ $0x02, R8
9460 ADDQ R13, R10
9461 ADCQ R14, R11
9462 ADCQ $0x00, R12
9463 ADDQ R15, R10
9464 ADCQ R8, R11
9465 ADCQ $0x00, R12
9466 LEAQ 16(DI), DI
9467
// sealAVX2Tail512LoopB: one ChaCha20 double round over four parallel lanes
// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3) interleaved
// with two Poly1305 block updates over 32 bytes at (DI). With all sixteen
// YMM registers holding lane state there is no free scratch register, so
// Y15 is spilled to 224(BP) around each shift-rotate group, and the Poly1305
// updates here use the MULX/flagless BMI2 form (DX as the implicit MULX
// multiplicand) instead of the MULQ form used in LoopA.
9468sealAVX2Tail512LoopB:
// Column round, a-phase for all four lanes: a += b; d ^= a; d <<<= 16.
9469 VPADDD Y14, Y0, Y0
9470 VPADDD Y9, Y5, Y5
9471 VPADDD Y10, Y6, Y6
9472 VPADDD Y11, Y7, Y7
9473 VPXOR Y0, Y4, Y4
9474 VPXOR Y5, Y1, Y1
9475 VPXOR Y6, Y2, Y2
9476 VPXOR Y7, Y3, Y3
9477 VPSHUFB ·rol16<>+0(SB), Y4, Y4
9478 VPSHUFB ·rol16<>+0(SB), Y1, Y1
9479 VPSHUFB ·rol16<>+0(SB), Y2, Y2
9480 VPSHUFB ·rol16<>+0(SB), Y3, Y3
// c += d; b ^= c; b <<<= 12 (Y15 spilled so it can serve as shift scratch).
9481 VPADDD Y4, Y12, Y12
9482 VPADDD Y1, Y13, Y13
9483 VPADDD Y2, Y8, Y8
9484 VPADDD Y3, Y15, Y15
9485 VPXOR Y12, Y14, Y14
9486 VPXOR Y13, Y9, Y9
9487 VPXOR Y8, Y10, Y10
9488 VPXOR Y15, Y11, Y11
9489 VMOVDQA Y15, 224(BP)
9490 VPSLLD $0x0c, Y14, Y15
9491 VPSRLD $0x14, Y14, Y14
9492 VPXOR Y15, Y14, Y14
9493 VPSLLD $0x0c, Y9, Y15
9494 VPSRLD $0x14, Y9, Y9
9495 VPXOR Y15, Y9, Y9
9496 VPSLLD $0x0c, Y10, Y15
9497 VPSRLD $0x14, Y10, Y10
9498 VPXOR Y15, Y10, Y10
9499 VPSLLD $0x0c, Y11, Y15
9500 VPSRLD $0x14, Y11, Y11
9501 VPXOR Y15, Y11, Y11
9502 VMOVDQA 224(BP), Y15
// Poly1305 update #1 (MULX variant): absorb 16 bytes at (DI), multiply by
// the key limbs at 0(BP)/8(BP), then reduce mod 2^130-5 as usual.
9503 ADDQ (DI), R10
9504 ADCQ 8(DI), R11
9505 ADCQ $0x01, R12
9506 MOVQ (BP), DX
9507 MOVQ DX, R15
9508 MULXQ R10, R13, R14
9509 IMULQ R12, R15
9510 MULXQ R11, AX, DX
9511 ADDQ AX, R14
9512 ADCQ DX, R15
9513 MOVQ 8(BP), DX
9514 MULXQ R10, R10, AX
9515 ADDQ R10, R14
9516 MULXQ R11, R11, R8
9517 ADCQ R11, R15
9518 ADCQ $0x00, R8
9519 IMULQ R12, DX
9520 ADDQ AX, R15
9521 ADCQ DX, R8
9522 MOVQ R13, R10
9523 MOVQ R14, R11
9524 MOVQ R15, R12
9525 ANDQ $0x03, R12
9526 MOVQ R15, R13
9527 ANDQ $-4, R13
9528 MOVQ R8, R14
9529 SHRQ $0x02, R8, R15
9530 SHRQ $0x02, R8
9531 ADDQ R13, R10
9532 ADCQ R14, R11
9533 ADCQ $0x00, R12
9534 ADDQ R15, R10
9535 ADCQ R8, R11
9536 ADCQ $0x00, R12
// Column round, b-phase: a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7.
9537 VPADDD Y14, Y0, Y0
9538 VPADDD Y9, Y5, Y5
9539 VPADDD Y10, Y6, Y6
9540 VPADDD Y11, Y7, Y7
9541 VPXOR Y0, Y4, Y4
9542 VPXOR Y5, Y1, Y1
9543 VPXOR Y6, Y2, Y2
9544 VPXOR Y7, Y3, Y3
9545 VPSHUFB ·rol8<>+0(SB), Y4, Y4
9546 VPSHUFB ·rol8<>+0(SB), Y1, Y1
9547 VPSHUFB ·rol8<>+0(SB), Y2, Y2
9548 VPSHUFB ·rol8<>+0(SB), Y3, Y3
9549 VPADDD Y4, Y12, Y12
9550 VPADDD Y1, Y13, Y13
9551 VPADDD Y2, Y8, Y8
9552 VPADDD Y3, Y15, Y15
9553 VPXOR Y12, Y14, Y14
9554 VPXOR Y13, Y9, Y9
9555 VPXOR Y8, Y10, Y10
9556 VPXOR Y15, Y11, Y11
9557 VMOVDQA Y15, 224(BP)
9558 VPSLLD $0x07, Y14, Y15
9559 VPSRLD $0x19, Y14, Y14
9560 VPXOR Y15, Y14, Y14
9561 VPSLLD $0x07, Y9, Y15
9562 VPSRLD $0x19, Y9, Y9
9563 VPXOR Y15, Y9, Y9
9564 VPSLLD $0x07, Y10, Y15
9565 VPSRLD $0x19, Y10, Y10
9566 VPXOR Y15, Y10, Y10
9567 VPSLLD $0x07, Y11, Y15
9568 VPSRLD $0x19, Y11, Y11
9569 VPXOR Y15, Y11, Y11
9570 VMOVDQA 224(BP), Y15
// Diagonalize all four lanes.
9571 VPALIGNR $0x04, Y14, Y14, Y14
9572 VPALIGNR $0x04, Y9, Y9, Y9
9573 VPALIGNR $0x04, Y10, Y10, Y10
9574 VPALIGNR $0x04, Y11, Y11, Y11
9575 VPALIGNR $0x08, Y12, Y12, Y12
9576 VPALIGNR $0x08, Y13, Y13, Y13
9577 VPALIGNR $0x08, Y8, Y8, Y8
9578 VPALIGNR $0x08, Y15, Y15, Y15
9579 VPALIGNR $0x0c, Y4, Y4, Y4
9580 VPALIGNR $0x0c, Y1, Y1, Y1
9581 VPALIGNR $0x0c, Y2, Y2, Y2
9582 VPALIGNR $0x0c, Y3, Y3, Y3
// Diagonal round, a-phase.
9583 VPADDD Y14, Y0, Y0
9584 VPADDD Y9, Y5, Y5
9585 VPADDD Y10, Y6, Y6
9586 VPADDD Y11, Y7, Y7
9587 VPXOR Y0, Y4, Y4
9588 VPXOR Y5, Y1, Y1
9589 VPXOR Y6, Y2, Y2
9590 VPXOR Y7, Y3, Y3
9591 VPSHUFB ·rol16<>+0(SB), Y4, Y4
9592 VPSHUFB ·rol16<>+0(SB), Y1, Y1
9593 VPSHUFB ·rol16<>+0(SB), Y2, Y2
9594 VPSHUFB ·rol16<>+0(SB), Y3, Y3
9595 VPADDD Y4, Y12, Y12
9596 VPADDD Y1, Y13, Y13
9597 VPADDD Y2, Y8, Y8
9598 VPADDD Y3, Y15, Y15
9599 VPXOR Y12, Y14, Y14
9600 VPXOR Y13, Y9, Y9
9601 VPXOR Y8, Y10, Y10
9602 VPXOR Y15, Y11, Y11
// Poly1305 update #2 (MULX variant): absorb the next 16 bytes at 16(DI).
9603 ADDQ 16(DI), R10
9604 ADCQ 24(DI), R11
9605 ADCQ $0x01, R12
9606 MOVQ (BP), DX
9607 MOVQ DX, R15
9608 MULXQ R10, R13, R14
9609 IMULQ R12, R15
9610 MULXQ R11, AX, DX
9611 ADDQ AX, R14
9612 ADCQ DX, R15
9613 MOVQ 8(BP), DX
9614 MULXQ R10, R10, AX
9615 ADDQ R10, R14
9616 MULXQ R11, R11, R8
9617 ADCQ R11, R15
9618 ADCQ $0x00, R8
9619 IMULQ R12, DX
9620 ADDQ AX, R15
9621 ADCQ DX, R8
9622 MOVQ R13, R10
9623 MOVQ R14, R11
9624 MOVQ R15, R12
9625 ANDQ $0x03, R12
9626 MOVQ R15, R13
9627 ANDQ $-4, R13
9628 MOVQ R8, R14
9629 SHRQ $0x02, R8, R15
9630 SHRQ $0x02, R8
9631 ADDQ R13, R10
9632 ADCQ R14, R11
9633 ADCQ $0x00, R12
9634 ADDQ R15, R10
9635 ADCQ R8, R11
9636 ADCQ $0x00, R12
9637 LEAQ 32(DI), DI
// Diagonal round continues: b <<<= 12 (Y15 spilled again for scratch).
9638 VMOVDQA Y15, 224(BP)
9639 VPSLLD $0x0c, Y14, Y15
9640 VPSRLD $0x14, Y14, Y14
9641 VPXOR Y15, Y14, Y14
9642 VPSLLD $0x0c, Y9, Y15
9643 VPSRLD $0x14, Y9, Y9
9644 VPXOR Y15, Y9, Y9
9645 VPSLLD $0x0c, Y10, Y15
9646 VPSRLD $0x14, Y10, Y10
9647 VPXOR Y15, Y10, Y10
9648 VPSLLD $0x0c, Y11, Y15
9649 VPSRLD $0x14, Y11, Y11
9650 VPXOR Y15, Y11, Y11
9651 VMOVDQA 224(BP), Y15
// Diagonal round, b-phase: rol8 then rol7.
9652 VPADDD Y14, Y0, Y0
9653 VPADDD Y9, Y5, Y5
9654 VPADDD Y10, Y6, Y6
9655 VPADDD Y11, Y7, Y7
9656 VPXOR Y0, Y4, Y4
9657 VPXOR Y5, Y1, Y1
9658 VPXOR Y6, Y2, Y2
9659 VPXOR Y7, Y3, Y3
9660 VPSHUFB ·rol8<>+0(SB), Y4, Y4
9661 VPSHUFB ·rol8<>+0(SB), Y1, Y1
9662 VPSHUFB ·rol8<>+0(SB), Y2, Y2
9663 VPSHUFB ·rol8<>+0(SB), Y3, Y3
9664 VPADDD Y4, Y12, Y12
9665 VPADDD Y1, Y13, Y13
9666 VPADDD Y2, Y8, Y8
9667 VPADDD Y3, Y15, Y15
9668 VPXOR Y12, Y14, Y14
9669 VPXOR Y13, Y9, Y9
9670 VPXOR Y8, Y10, Y10
9671 VPXOR Y15, Y11, Y11
9672 VMOVDQA Y15, 224(BP)
9673 VPSLLD $0x07, Y14, Y15
9674 VPSRLD $0x19, Y14, Y14
9675 VPXOR Y15, Y14, Y14
9676 VPSLLD $0x07, Y9, Y15
9677 VPSRLD $0x19, Y9, Y9
9678 VPXOR Y15, Y9, Y9
9679 VPSLLD $0x07, Y10, Y15
9680 VPSRLD $0x19, Y10, Y10
9681 VPXOR Y15, Y10, Y10
9682 VPSLLD $0x07, Y11, Y15
9683 VPSRLD $0x19, Y11, Y11
9684 VPXOR Y15, Y11, Y11
9685 VMOVDQA 224(BP), Y15
// Un-diagonalize all four lanes.
9686 VPALIGNR $0x0c, Y14, Y14, Y14
9687 VPALIGNR $0x0c, Y9, Y9, Y9
9688 VPALIGNR $0x0c, Y10, Y10, Y10
9689 VPALIGNR $0x0c, Y11, Y11, Y11
9690 VPALIGNR $0x08, Y12, Y12, Y12
9691 VPALIGNR $0x08, Y13, Y13, Y13
9692 VPALIGNR $0x08, Y8, Y8, Y8
9693 VPALIGNR $0x08, Y15, Y15, Y15
9694 VPALIGNR $0x04, Y4, Y4, Y4
9695 VPALIGNR $0x04, Y1, Y1, Y1
9696 VPALIGNR $0x04, Y2, Y2, Y2
9697 VPALIGNR $0x04, Y3, Y3, Y3
// CX iterations still hash input (LoopA); R9 counts remaining plain rounds.
9698 DECQ CX
9699 JG sealAVX2Tail512LoopA
9700 DECQ R9
9701 JGE sealAVX2Tail512LoopB
// Add the initial state back: constants, cached key words, and the four
// counters spilled at 96..192(BP) during setup. Y15 is parked at 224(BP)
// so its register can be reused as streaming scratch below.
9702 VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
9703 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
9704 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
9705 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
9706 VPADDD 32(BP), Y14, Y14
9707 VPADDD 32(BP), Y9, Y9
9708 VPADDD 32(BP), Y10, Y10
9709 VPADDD 32(BP), Y11, Y11
9710 VPADDD 64(BP), Y12, Y12
9711 VPADDD 64(BP), Y13, Y13
9712 VPADDD 64(BP), Y8, Y8
9713 VPADDD 64(BP), Y15, Y15
9714 VPADDD 96(BP), Y4, Y4
9715 VPADDD 128(BP), Y1, Y1
9716 VPADDD 160(BP), Y2, Y2
9717 VPADDD 192(BP), Y3, Y3
9718 VMOVDQA Y15, 224(BP)
// Lane 0: reassemble, XOR with plaintext at (SI), store at (DI).
9719 VPERM2I128 $0x02, Y0, Y14, Y15
9720 VPXOR (SI), Y15, Y15
9721 VMOVDQU Y15, (DI)
9722 VPERM2I128 $0x02, Y12, Y4, Y15
9723 VPXOR 32(SI), Y15, Y15
9724 VMOVDQU Y15, 32(DI)
9725 VPERM2I128 $0x13, Y0, Y14, Y15
9726 VPXOR 64(SI), Y15, Y15
9727 VMOVDQU Y15, 64(DI)
9728 VPERM2I128 $0x13, Y12, Y4, Y15
9729 VPXOR 96(SI), Y15, Y15
9730 VMOVDQU Y15, 96(DI)
// Lane 1: bytes 128..255.
9731 VPERM2I128 $0x02, Y5, Y9, Y0
9732 VPERM2I128 $0x02, Y13, Y1, Y14
9733 VPERM2I128 $0x13, Y5, Y9, Y12
9734 VPERM2I128 $0x13, Y13, Y1, Y4
9735 VPXOR 128(SI), Y0, Y0
9736 VPXOR 160(SI), Y14, Y14
9737 VPXOR 192(SI), Y12, Y12
9738 VPXOR 224(SI), Y4, Y4
9739 VMOVDQU Y0, 128(DI)
9740 VMOVDQU Y14, 160(DI)
9741 VMOVDQU Y12, 192(DI)
9742 VMOVDQU Y4, 224(DI)
// Lane 2: bytes 256..383.
9743 VPERM2I128 $0x02, Y6, Y10, Y0
9744 VPERM2I128 $0x02, Y8, Y2, Y14
9745 VPERM2I128 $0x13, Y6, Y10, Y12
9746 VPERM2I128 $0x13, Y8, Y2, Y4
9747 VPXOR 256(SI), Y0, Y0
9748 VPXOR 288(SI), Y14, Y14
9749 VPXOR 320(SI), Y12, Y12
9750 VPXOR 352(SI), Y4, Y4
9751 VMOVDQU Y0, 256(DI)
9752 VMOVDQU Y14, 288(DI)
9753 VMOVDQU Y12, 320(DI)
9754 VMOVDQU Y4, 352(DI)
// 384 bytes sealed: CX = bytes for sealAVX2SealHash to absorb; stage lane
// 3's keystream (using Y15 spilled at 224(BP)) in Y0/Y14/Y12/Y4 for the
// final sub-128-byte tail.
9755 MOVQ $0x00000180, CX
9756 LEAQ 384(SI), SI
9757 SUBQ $0x00000180, BX
9758 VPERM2I128 $0x02, Y7, Y11, Y0
9759 VPERM2I128 $0x02, 224(BP), Y3, Y14
9760 VPERM2I128 $0x13, Y7, Y11, Y12
9761 VPERM2I128 $0x13, 224(BP), Y3, Y4
9762 JMP sealAVX2SealHash