
Text file src/crypto/sha512/sha512block_amd64.s

Documentation: crypto/sha512

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7#include "textflag.h"
     8
     9// SHA512 block routine. See sha512block.go for Go equivalent.
    10//
    11// The algorithm is detailed in FIPS 180-4:
    12//
    13//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    14//
    15// Wt = Mt; for 0 <= t <= 15
    16// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    17//
    18// a = H0
    19// b = H1
    20// c = H2
    21// d = H3
    22// e = H4
    23// f = H5
    24// g = H6
    25// h = H7
    26//
    27// for t = 0 to 79 {
    28//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    29//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    30//    h = g
    31//    g = f
    32//    f = e
    33//    e = d + T1
    34//    d = c
    35//    c = b
    36//    b = a
    37//    a = T1 + T2
    38// }
    39//
    40// H0 = a + H0
    41// H1 = b + H1
    42// H2 = c + H2
    43// H3 = d + H3
    44// H4 = e + H4
    45// H5 = f + H5
    46// H6 = g + H6
    47// H7 = h + H7
    48
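For orientation, here is a minimal Go sketch of the routine described by the comment above. It is illustrative only: blockSketch and its helpers are hypothetical names (the package's real pure-Go fallback lives in sha512block.go), and k stands for the 80 round constants that the assembly reads from ·_K. The helper bodies are sketched next to the macros they mirror below.

	package sha512sketch

	import "encoding/binary"

	// blockSketch folds each complete 128-byte block of p into the digest h.
	func blockSketch(h *[8]uint64, p []byte, k *[80]uint64) {
		var w [80]uint64
		for ; len(p) >= 128; p = p[128:] {
			// Wt = Mt; for 0 <= t <= 15 (MSGSCHEDULE0 below)
			for t := 0; t < 16; t++ {
				w[t] = binary.BigEndian.Uint64(p[t*8:])
			}
			// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16 (MSGSCHEDULE1 below)
			for t := 16; t < 80; t++ {
				w[t] = msgSchedule1(&w, t)
			}
			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
			for t := 0; t < 80; t++ {
				t1 := t1Sum(hh, e, f, g, k[t], w[t]) // SHA512T1 below
				t2 := t2Sum(a, b, c)                 // SHA512T2 below
				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
			}
			h[0] += a
			h[1] += b
			h[2] += c
			h[3] += d
			h[4] += e
			h[5] += f
			h[6] += g
			h[7] += hh
		}
	}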
    49// Wt = Mt; for 0 <= t <= 15
    50#define MSGSCHEDULE0(index) \
    51	MOVQ	(index*8)(SI), AX; \
    52	BSWAPQ	AX; \
    53	MOVQ	AX, (index*8)(BP)
    54
    55// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    56//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    57//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
    58#define MSGSCHEDULE1(index) \
    59	MOVQ	((index-2)*8)(BP), AX; \
    60	MOVQ	AX, CX; \
    61	RORQ	$19, AX; \
    62	MOVQ	CX, DX; \
    63	RORQ	$61, CX; \
    64	SHRQ	$6, DX; \
    65	MOVQ	((index-15)*8)(BP), BX; \
    66	XORQ	CX, AX; \
    67	MOVQ	BX, CX; \
    68	XORQ	DX, AX; \
    69	RORQ	$1, BX; \
    70	MOVQ	CX, DX; \
    71	SHRQ	$7, DX; \
    72	RORQ	$8, CX; \
    73	ADDQ	((index-7)*8)(BP), AX; \
    74	XORQ	CX, BX; \
    75	XORQ	DX, BX; \
    76	ADDQ	((index-16)*8)(BP), BX; \
    77	ADDQ	BX, AX; \
    78	MOVQ	AX, ((index)*8)(BP)
    79
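In the Go sketch above, this macro is the hypothetical msgSchedule1 helper; ROTR(n,x) is written as bits.RotateLeft64(x, -n) from math/bits:

	func msgSchedule1(w *[80]uint64, t int) uint64 {
		sig1 := bits.RotateLeft64(w[t-2], -19) ^ bits.RotateLeft64(w[t-2], -61) ^ (w[t-2] >> 6)
		sig0 := bits.RotateLeft64(w[t-15], -1) ^ bits.RotateLeft64(w[t-15], -8) ^ (w[t-15] >> 7)
		return sig1 + w[t-7] + sig0 + w[t-16]
	}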
    80// Calculate T1 in AX - uses AX, CX and DX registers.
    81// h is also used as an accumulator. Wt is passed in AX.
    82//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    83//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    84//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    85#define SHA512T1(const, e, f, g, h) \
    86	MOVQ	$const, DX; \
    87	ADDQ	AX, h; \
    88	MOVQ	e, AX; \
    89	ADDQ	DX, h; \
    90	MOVQ	e, CX; \
    91	RORQ	$14, AX; \
    92	MOVQ	e, DX; \
    93	RORQ	$18, CX; \
    94	XORQ	CX, AX; \
    95	MOVQ	e, CX; \
    96	RORQ	$41, DX; \
    97	ANDQ	f, CX; \
    98	XORQ	AX, DX; \
    99	MOVQ	e, AX; \
   100	NOTQ	AX; \
   101	ADDQ	DX, h; \
   102	ANDQ	g, AX; \
   103	XORQ	CX, AX; \
   104	ADDQ	h, AX
   105
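The corresponding helper in the Go sketch (hypothetical t1Sum; h here is the working variable, not the digest array):

	func t1Sum(h, e, f, g, kt, wt uint64) uint64 {
		bigSigma1 := bits.RotateLeft64(e, -14) ^ bits.RotateLeft64(e, -18) ^ bits.RotateLeft64(e, -41)
		ch := (e & f) ^ (^e & g)
		return h + bigSigma1 + ch + kt + wt
	}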
   106// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   107//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   108//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   109//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   110#define SHA512T2(a, b, c) \
   111	MOVQ	a, DI; \
   112	MOVQ	c, BX; \
   113	RORQ	$28, DI; \
   114	MOVQ	a, DX; \
   115	ANDQ	b, BX; \
   116	RORQ	$34, DX; \
   117	MOVQ	a, CX; \
   118	ANDQ	c, CX; \
   119	XORQ	DX, DI; \
   120	XORQ	CX, BX; \
   121	MOVQ	a, DX; \
   122	MOVQ	b, CX; \
   123	RORQ	$39, DX; \
   124	ANDQ	a, CX; \
   125	XORQ	CX, BX; \
   126	XORQ	DX, DI; \
   127	ADDQ	DI, BX
   128
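And the matching t2Sum helper from the sketch:

	func t2Sum(a, b, c uint64) uint64 {
		bigSigma0 := bits.RotateLeft64(a, -28) ^ bits.RotateLeft64(a, -34) ^ bits.RotateLeft64(a, -39)
		maj := (a & b) ^ (a & c) ^ (b & c)
		return bigSigma0 + maj
	}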
   129// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   130// The values for e and a are stored in d and h, ready for rotation.
   131#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   132	SHA512T1(const, e, f, g, h); \
   133	SHA512T2(a, b, c); \
   134	MOVQ	BX, h; \
   135	ADDQ	AX, d; \
   136	ADDQ	AX, h
   137
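Note that SHA512ROUND never shuffles the other six working variables: the 80 call sites below instead rotate the register arguments one position per round, so the a..h renaming costs no instructions. In the Go sketch this is the single tuple assignment hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2.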
   138#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   139	MSGSCHEDULE0(index); \
   140	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   141
   142#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   143	MSGSCHEDULE1(index); \
   144	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   145
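The $648 frame below holds the 80-qword message schedule (80*8 = 640 bytes, addressed through BP) plus the end-of-input pointer stored at 640(SP).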
   146TEXT ·blockAMD64(SB),0,$648-32
   147	MOVQ	p_base+8(FP), SI
   148	MOVQ	p_len+16(FP), DX
   149	SHRQ	$7, DX		// number of complete 128-byte blocks
   150	SHLQ	$7, DX		// p_len rounded down to a multiple of 128
   151
   152	LEAQ	(SI)(DX*1), DI
   153	MOVQ	DI, 640(SP)
   154	CMPQ	SI, DI
   155	JEQ	end
   156
   157	MOVQ	dig+0(FP), BP
   158	MOVQ	(0*8)(BP), R8		// a = H0
   159	MOVQ	(1*8)(BP), R9		// b = H1
   160	MOVQ	(2*8)(BP), R10		// c = H2
   161	MOVQ	(3*8)(BP), R11		// d = H3
   162	MOVQ	(4*8)(BP), R12		// e = H4
   163	MOVQ	(5*8)(BP), R13		// f = H5
   164	MOVQ	(6*8)(BP), R14		// g = H6
   165	MOVQ	(7*8)(BP), R15		// h = H7
   166
   167loop:
   168	MOVQ	SP, BP			// message schedule
   169
   170	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   171	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   172	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   173	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   174	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   175	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   176	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   177	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   178	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   179	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   180	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   181	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   182	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   183	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   184	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   185	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   186
   187	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   188	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   189	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   190	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   191	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   192	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   193	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   194	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   195	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   196	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   197	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   198	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   199	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   200	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   201	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   202	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   203	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   204	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   205	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   206	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   207	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   208	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   209	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   210	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   211	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   212	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   213	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   214	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   215	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   216	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   217	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   218	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   219	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   220	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   221	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   222	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   223	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   224	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   225	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   226	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   227	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   228	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   229	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   230	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   231	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   232	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   233	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   234	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   235	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   236	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   237	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   238	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   239	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   240	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   241	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   242	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   243	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   244	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   245	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   246	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   247	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   248	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   249	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   250	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   251
   252	MOVQ	dig+0(FP), BP
   253	ADDQ	(0*8)(BP), R8	// H0 = a + H0
   254	MOVQ	R8, (0*8)(BP)
   255	ADDQ	(1*8)(BP), R9	// H1 = b + H1
   256	MOVQ	R9, (1*8)(BP)
   257	ADDQ	(2*8)(BP), R10	// H2 = c + H2
   258	MOVQ	R10, (2*8)(BP)
   259	ADDQ	(3*8)(BP), R11	// H3 = d + H3
   260	MOVQ	R11, (3*8)(BP)
   261	ADDQ	(4*8)(BP), R12	// H4 = e + H4
   262	MOVQ	R12, (4*8)(BP)
   263	ADDQ	(5*8)(BP), R13	// H5 = f + H5
   264	MOVQ	R13, (5*8)(BP)
   265	ADDQ	(6*8)(BP), R14	// H6 = g + H6
   266	MOVQ	R14, (6*8)(BP)
   267	ADDQ	(7*8)(BP), R15	// H7 = h + H7
   268	MOVQ	R15, (7*8)(BP)
   269
   270	ADDQ	$128, SI
   271	CMPQ	SI, 640(SP)
   272	JB	loop
   273
   274end:
   275	RET
   276
   277// The version below is based on the "Fast SHA512 Implementations on Intel
   278// Architecture Processors" white paper
   279// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   280// AVX2 version by Intel, same algorithm in Linux kernel:
   281// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   282
   283// James Guilford <james.guilford@intel.com>
   284// Kirk Yap <kirk.s.yap@intel.com>
   285// Tim Chen <tim.c.chen@linux.intel.com>
   286// David Cote <david.m.cote@intel.com>
   287// Aleksey Sidorov <aleksey.sidorov@intel.com>
   288
   289#define YFER_SIZE (4*8)
   290#define SRND_SIZE (1*8)
   291#define INP_SIZE (1*8)
   292
   293#define frame_YFER (0)
   294#define frame_SRND (frame_YFER + YFER_SIZE)
   295#define frame_INP (frame_SRND + SRND_SIZE)
   296#define frame_INPEND (frame_INP + INP_SIZE)
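These offsets add up to the AVX2 routine's stack frame: 32 bytes of YFER scratch (one YMM register's worth of prescheduled W+K values) plus three 8-byte slots, 56 bytes in total, matching the $56-32 on blockAVX2 below.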
   297
   298#define addm(p1, p2) \
   299	ADDQ p1, p2; \
   300	MOVQ p2, p1
   301
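addm(p1, p2) adds the memory word p1 into register p2 and writes the sum back, implementing the final H = working variable + H step. COPY_YMM_AND_BSWAP below performs an unaligned 32-byte load followed by a VPSHUFB that byte-reverses each of the four qwords, since the message words are big-endian.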
   302#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   303	VMOVDQU p2, p1;    \
   304	VPSHUFB p3, p1, p1
   305
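MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) treats YSRC1:YSRC2 as a single 64-byte vector (YSRC1 in the upper half), shifts it right by RVAL bytes, and keeps the low 32 bytes. With RVAL = 8 this extracts four consecutive qwords starting at YSRC2's second qword, which is how the Wt-7 and Wt-15 windows are sliced out of the last 16 scheduled words.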
   306#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   307	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   308	VPALIGNR   $RVAL, YSRC2, YDST, YDST
   309
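PSHUFFLE_BYTE_FLIP_MASK is the VPSHUFB control used by COPY_YMM_AND_BSWAP: each control byte indexes source bytes 7..0 within its qword, reversing the byte order. MASK_YMM_LO is all-ones in the upper 128 bits only, so a VPAND with it clears the low lane of a YMM register.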
   310DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   311DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   312DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   313DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   314
   315GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
   316
   317DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   318DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   319DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   320DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   321
   322GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
   323
   324TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   325	MOVQ dig+0(FP), SI
   326	MOVQ p_base+8(FP), DI
   327	MOVQ p_len+16(FP), DX
   328
   329	SHRQ $7, DX      // number of complete 128-byte blocks
   330	SHLQ $7, DX      // p_len rounded down to a multiple of 128
   331
   332	JZ   done_hash   // no complete block: ZF still set from SHLQ
   333	ADDQ DI, DX
   334	MOVQ DX, frame_INPEND(SP)
   335
   336	MOVQ (0*8)(SI), AX
   337	MOVQ (1*8)(SI), BX
   338	MOVQ (2*8)(SI), CX
   339	MOVQ (3*8)(SI), R8
   340	MOVQ (4*8)(SI), DX
   341	MOVQ (5*8)(SI), R9
   342	MOVQ (6*8)(SI), R10
   343	MOVQ (7*8)(SI), R11
   344
   345	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
   346
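The per-round scalar code below relies on BMI2's RORXQ, a non-destructive rotate-right that computes the BIGSIGMA terms without clobbering the working variables, and it is interleaved with the AVX2 message-schedule arithmetic so the two instruction streams can execute in parallel.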
   347loop0:
   348	MOVQ ·_K+0(SB), BP
   349
   350	// byte swap the first 16 message qwords
   351	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   352	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   353	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   354	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   355
   356	MOVQ DI, frame_INP(SP)
   357
   358	// schedule the remaining 64 message qwords, doing 4 iterations of 16 rounds each
   359	MOVQ $4, frame_SRND(SP)
   360
   361loop1:
   362	VPADDQ  (BP), Y4, Y0
   363	VMOVDQU Y0, frame_YFER(SP)
   364
   365	MY_VPALIGNR(Y0, Y7, Y6, 8)
   366
   367	VPADDQ Y4, Y0, Y0
   368
   369	MY_VPALIGNR(Y1, Y5, Y4, 8)
   370
   371	VPSRLQ $1, Y1, Y2
   372	VPSLLQ $(64-1), Y1, Y3
   373	VPOR   Y2, Y3, Y3
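AVX2 has no 64-bit vector rotate, so the VPSRLQ/VPSLLQ/VPOR triple above synthesizes ROTR(1, x) per qword lane as (x >> 1) | (x << 63); the same pattern recurs below for the other rotate amounts.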
   374
   375	VPSRLQ $7, Y1, Y8
   376
   377	MOVQ  AX, DI
   378	RORXQ $41, DX, R13
   379	RORXQ $18, DX, R14
   380	ADDQ  frame_YFER(SP), R11
   381	ORQ   CX, DI
   382	MOVQ  R9, R15
   383	RORXQ $34, AX, R12
   384
   385	XORQ  R14, R13
   386	XORQ  R10, R15
   387	RORXQ $14, DX, R14
   388
   389	ANDQ  DX, R15
   390	XORQ  R14, R13
   391	RORXQ $39, AX, R14
   392	ADDQ  R11, R8
   393
   394	ANDQ  BX, DI
   395	XORQ  R12, R14
   396	RORXQ $28, AX, R12
   397
   398	XORQ R10, R15
   399	XORQ R12, R14
   400	MOVQ AX, R12
   401	ANDQ CX, R12
   402
   403	ADDQ R13, R15
   404	ORQ  R12, DI
   405	ADDQ R14, R11
   406
   407	ADDQ R15, R8
   408
   409	ADDQ R15, R11
   410	ADDQ DI, R11
   411
   412	VPSRLQ $8, Y1, Y2
   413	VPSLLQ $(64-8), Y1, Y1
   414	VPOR   Y2, Y1, Y1
   415
   416	VPXOR Y8, Y3, Y3
   417	VPXOR Y1, Y3, Y1
   418
   419	VPADDQ Y1, Y0, Y0
   420
   421	VPERM2F128 $0x0, Y0, Y0, Y4
   422
   423	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   424
   425	VPERM2F128 $0x11, Y7, Y7, Y2
   426	VPSRLQ     $6, Y2, Y8
   427
   428	MOVQ  R11, DI
   429	RORXQ $41, R8, R13
   430	RORXQ $18, R8, R14
   431	ADDQ  1*8+frame_YFER(SP), R10
   432	ORQ   BX, DI
   433
   434	MOVQ  DX, R15
   435	RORXQ $34, R11, R12
   436	XORQ  R14, R13
   437	XORQ  R9, R15
   438
   439	RORXQ $14, R8, R14
   440	XORQ  R14, R13
   441	RORXQ $39, R11, R14
   442	ANDQ  R8, R15
   443	ADDQ  R10, CX
   444
   445	ANDQ AX, DI
   446	XORQ R12, R14
   447
   448	RORXQ $28, R11, R12
   449	XORQ  R9, R15
   450
   451	XORQ R12, R14
   452	MOVQ R11, R12
   453	ANDQ BX, R12
   454	ADDQ R13, R15
   455
   456	ORQ  R12, DI
   457	ADDQ R14, R10
   458
   459	ADDQ R15, CX
   460	ADDQ R15, R10
   461	ADDQ DI, R10
   462
   463	VPSRLQ $19, Y2, Y3
   464	VPSLLQ $(64-19), Y2, Y1
   465	VPOR   Y1, Y3, Y3
   466	VPXOR  Y3, Y8, Y8
   467	VPSRLQ $61, Y2, Y3
   468	VPSLLQ $(64-61), Y2, Y1
   469	VPOR   Y1, Y3, Y3
   470	VPXOR  Y3, Y8, Y8
   471
   472	VPADDQ Y8, Y4, Y4
   473
   474	VPSRLQ $6, Y4, Y8
   475
   476	MOVQ  R10, DI
   477	RORXQ $41, CX, R13
   478	ADDQ  2*8+frame_YFER(SP), R9
   479
   480	RORXQ $18, CX, R14
   481	ORQ   AX, DI
   482	MOVQ  R8, R15
   483	XORQ  DX, R15
   484
   485	RORXQ $34, R10, R12
   486	XORQ  R14, R13
   487	ANDQ  CX, R15
   488
   489	RORXQ $14, CX, R14
   490	ADDQ  R9, BX
   491	ANDQ  R11, DI
   492
   493	XORQ  R14, R13
   494	RORXQ $39, R10, R14
   495	XORQ  DX, R15
   496
   497	XORQ  R12, R14
   498	RORXQ $28, R10, R12
   499
   500	XORQ R12, R14
   501	MOVQ R10, R12
   502	ANDQ AX, R12
   503	ADDQ R13, R15
   504
   505	ORQ  R12, DI
   506	ADDQ R14, R9
   507	ADDQ R15, BX
   508	ADDQ R15, R9
   509
   510	ADDQ DI, R9
   511
   512	VPSRLQ $19, Y4, Y3
   513	VPSLLQ $(64-19), Y4, Y1
   514	VPOR   Y1, Y3, Y3
   515	VPXOR  Y3, Y8, Y8
   516	VPSRLQ $61, Y4, Y3
   517	VPSLLQ $(64-61), Y4, Y1
   518	VPOR   Y1, Y3, Y3
   519	VPXOR  Y3, Y8, Y8
   520
   521	VPADDQ Y8, Y0, Y2
   522
   523	VPBLENDD $0xF0, Y2, Y4, Y4
   524
   525	MOVQ  R9, DI
   526	RORXQ $41, BX, R13
   527	RORXQ $18, BX, R14
   528	ADDQ  3*8+frame_YFER(SP), DX
   529	ORQ   R11, DI
   530
   531	MOVQ  CX, R15
   532	RORXQ $34, R9, R12
   533	XORQ  R14, R13
   534	XORQ  R8, R15
   535
   536	RORXQ $14, BX, R14
   537	ANDQ  BX, R15
   538	ADDQ  DX, AX
   539	ANDQ  R10, DI
   540
   541	XORQ R14, R13
   542	XORQ R8, R15
   543
   544	RORXQ $39, R9, R14
   545	ADDQ  R13, R15
   546
   547	XORQ R12, R14
   548	ADDQ R15, AX
   549
   550	RORXQ $28, R9, R12
   551
   552	XORQ R12, R14
   553	MOVQ R9, R12
   554	ANDQ R11, R12
   555	ORQ  R12, DI
   556
   557	ADDQ R14, DX
   558	ADDQ R15, DX
   559	ADDQ DI, DX
   560
   561	VPADDQ  1*32(BP), Y5, Y0
   562	VMOVDQU Y0, frame_YFER(SP)
   563
   564	MY_VPALIGNR(Y0, Y4, Y7, 8)
   565
   566	VPADDQ Y5, Y0, Y0
   567
   568	MY_VPALIGNR(Y1, Y6, Y5, 8)
   569
   570	VPSRLQ $1, Y1, Y2
   571	VPSLLQ $(64-1), Y1, Y3
   572	VPOR   Y2, Y3, Y3
   573
   574	VPSRLQ $7, Y1, Y8
   575
   576	MOVQ  DX, DI
   577	RORXQ $41, AX, R13
   578	RORXQ $18, AX, R14
   579	ADDQ  frame_YFER(SP), R8
   580	ORQ   R10, DI
   581	MOVQ  BX, R15
   582	RORXQ $34, DX, R12
   583
   584	XORQ  R14, R13
   585	XORQ  CX, R15
   586	RORXQ $14, AX, R14
   587
   588	ANDQ  AX, R15
   589	XORQ  R14, R13
   590	RORXQ $39, DX, R14
   591	ADDQ  R8, R11
   592
   593	ANDQ  R9, DI
   594	XORQ  R12, R14
   595	RORXQ $28, DX, R12
   596
   597	XORQ CX, R15
   598	XORQ R12, R14
   599	MOVQ DX, R12
   600	ANDQ R10, R12
   601
   602	ADDQ R13, R15
   603	ORQ  R12, DI
   604	ADDQ R14, R8
   605
   606	ADDQ R15, R11
   607
   608	ADDQ R15, R8
   609	ADDQ DI, R8
   610
   611	VPSRLQ $8, Y1, Y2
   612	VPSLLQ $(64-8), Y1, Y1
   613	VPOR   Y2, Y1, Y1
   614
   615	VPXOR Y8, Y3, Y3
   616	VPXOR Y1, Y3, Y1
   617
   618	VPADDQ Y1, Y0, Y0
   619
   620	VPERM2F128 $0x0, Y0, Y0, Y5
   621
   622	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   623
   624	VPERM2F128 $0x11, Y4, Y4, Y2
   625	VPSRLQ     $6, Y2, Y8
   626
   627	MOVQ  R8, DI
   628	RORXQ $41, R11, R13
   629	RORXQ $18, R11, R14
   630	ADDQ  1*8+frame_YFER(SP), CX
   631	ORQ   R9, DI
   632
   633	MOVQ  AX, R15
   634	RORXQ $34, R8, R12
   635	XORQ  R14, R13
   636	XORQ  BX, R15
   637
   638	RORXQ $14, R11, R14
   639	XORQ  R14, R13
   640	RORXQ $39, R8, R14
   641	ANDQ  R11, R15
   642	ADDQ  CX, R10
   643
   644	ANDQ DX, DI
   645	XORQ R12, R14
   646
   647	RORXQ $28, R8, R12
   648	XORQ  BX, R15
   649
   650	XORQ R12, R14
   651	MOVQ R8, R12
   652	ANDQ R9, R12
   653	ADDQ R13, R15
   654
   655	ORQ  R12, DI
   656	ADDQ R14, CX
   657
   658	ADDQ R15, R10
   659	ADDQ R15, CX
   660	ADDQ DI, CX
   661
   662	VPSRLQ $19, Y2, Y3
   663	VPSLLQ $(64-19), Y2, Y1
   664	VPOR   Y1, Y3, Y3
   665	VPXOR  Y3, Y8, Y8
   666	VPSRLQ $61, Y2, Y3
   667	VPSLLQ $(64-61), Y2, Y1
   668	VPOR   Y1, Y3, Y3
   669	VPXOR  Y3, Y8, Y8
   670
   671	VPADDQ Y8, Y5, Y5
   672
   673	VPSRLQ $6, Y5, Y8
   674
   675	MOVQ  CX, DI
   676	RORXQ $41, R10, R13
   677	ADDQ  2*8+frame_YFER(SP), BX
   678
   679	RORXQ $18, R10, R14
   680	ORQ   DX, DI
   681	MOVQ  R11, R15
   682	XORQ  AX, R15
   683
   684	RORXQ $34, CX, R12
   685	XORQ  R14, R13
   686	ANDQ  R10, R15
   687
   688	RORXQ $14, R10, R14
   689	ADDQ  BX, R9
   690	ANDQ  R8, DI
   691
   692	XORQ  R14, R13
   693	RORXQ $39, CX, R14
   694	XORQ  AX, R15
   695
   696	XORQ  R12, R14
   697	RORXQ $28, CX, R12
   698
   699	XORQ R12, R14
   700	MOVQ CX, R12
   701	ANDQ DX, R12
   702	ADDQ R13, R15
   703
   704	ORQ  R12, DI
   705	ADDQ R14, BX
   706	ADDQ R15, R9
   707	ADDQ R15, BX
   708
   709	ADDQ DI, BX
   710
   711	VPSRLQ $19, Y5, Y3
   712	VPSLLQ $(64-19), Y5, Y1
   713	VPOR   Y1, Y3, Y3
   714	VPXOR  Y3, Y8, Y8
   715	VPSRLQ $61, Y5, Y3
   716	VPSLLQ $(64-61), Y5, Y1
   717	VPOR   Y1, Y3, Y3
   718	VPXOR  Y3, Y8, Y8
   719
   720	VPADDQ Y8, Y0, Y2
   721
   722	VPBLENDD $0xF0, Y2, Y5, Y5
   723
   724	MOVQ  BX, DI
   725	RORXQ $41, R9, R13
   726	RORXQ $18, R9, R14
   727	ADDQ  3*8+frame_YFER(SP), AX
   728	ORQ   R8, DI
   729
   730	MOVQ  R10, R15
   731	RORXQ $34, BX, R12
   732	XORQ  R14, R13
   733	XORQ  R11, R15
   734
   735	RORXQ $14, R9, R14
   736	ANDQ  R9, R15
   737	ADDQ  AX, DX
   738	ANDQ  CX, DI
   739
   740	XORQ R14, R13
   741	XORQ R11, R15
   742
   743	RORXQ $39, BX, R14
   744	ADDQ  R13, R15
   745
   746	XORQ R12, R14
   747	ADDQ R15, DX
   748
   749	RORXQ $28, BX, R12
   750
   751	XORQ R12, R14
   752	MOVQ BX, R12
   753	ANDQ R8, R12
   754	ORQ  R12, DI
   755
   756	ADDQ R14, AX
   757	ADDQ R15, AX
   758	ADDQ DI, AX
   759
   760	VPADDQ  2*32(BP), Y6, Y0
   761	VMOVDQU Y0, frame_YFER(SP)
   762
   763	MY_VPALIGNR(Y0, Y5, Y4, 8)
   764
   765	VPADDQ Y6, Y0, Y0
   766
   767	MY_VPALIGNR(Y1, Y7, Y6, 8)
   768
   769	VPSRLQ $1, Y1, Y2
   770	VPSLLQ $(64-1), Y1, Y3
   771	VPOR   Y2, Y3, Y3
   772
   773	VPSRLQ $7, Y1, Y8
   774
   775	MOVQ  AX, DI
   776	RORXQ $41, DX, R13
   777	RORXQ $18, DX, R14
   778	ADDQ  frame_YFER(SP), R11
   779	ORQ   CX, DI
   780	MOVQ  R9, R15
   781	RORXQ $34, AX, R12
   782
   783	XORQ  R14, R13
   784	XORQ  R10, R15
   785	RORXQ $14, DX, R14
   786
   787	ANDQ  DX, R15
   788	XORQ  R14, R13
   789	RORXQ $39, AX, R14
   790	ADDQ  R11, R8
   791
   792	ANDQ  BX, DI
   793	XORQ  R12, R14
   794	RORXQ $28, AX, R12
   795
   796	XORQ R10, R15
   797	XORQ R12, R14
   798	MOVQ AX, R12
   799	ANDQ CX, R12
   800
   801	ADDQ R13, R15
   802	ORQ  R12, DI
   803	ADDQ R14, R11
   804
   805	ADDQ R15, R8
   806
   807	ADDQ R15, R11
   808	ADDQ DI, R11
   809
   810	VPSRLQ $8, Y1, Y2
   811	VPSLLQ $(64-8), Y1, Y1
   812	VPOR   Y2, Y1, Y1
   813
   814	VPXOR Y8, Y3, Y3
   815	VPXOR Y1, Y3, Y1
   816
   817	VPADDQ Y1, Y0, Y0
   818
   819	VPERM2F128 $0x0, Y0, Y0, Y6
   820
   821	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   822
   823	VPERM2F128 $0x11, Y5, Y5, Y2
   824	VPSRLQ     $6, Y2, Y8
   825
   826	MOVQ  R11, DI
   827	RORXQ $41, R8, R13
   828	RORXQ $18, R8, R14
   829	ADDQ  1*8+frame_YFER(SP), R10
   830	ORQ   BX, DI
   831
   832	MOVQ  DX, R15
   833	RORXQ $34, R11, R12
   834	XORQ  R14, R13
   835	XORQ  R9, R15
   836
   837	RORXQ $14, R8, R14
   838	XORQ  R14, R13
   839	RORXQ $39, R11, R14
   840	ANDQ  R8, R15
   841	ADDQ  R10, CX
   842
   843	ANDQ AX, DI
   844	XORQ R12, R14
   845
   846	RORXQ $28, R11, R12
   847	XORQ  R9, R15
   848
   849	XORQ R12, R14
   850	MOVQ R11, R12
   851	ANDQ BX, R12
   852	ADDQ R13, R15
   853
   854	ORQ  R12, DI
   855	ADDQ R14, R10
   856
   857	ADDQ R15, CX
   858	ADDQ R15, R10
   859	ADDQ DI, R10
   860
   861	VPSRLQ $19, Y2, Y3
   862	VPSLLQ $(64-19), Y2, Y1
   863	VPOR   Y1, Y3, Y3
   864	VPXOR  Y3, Y8, Y8
   865	VPSRLQ $61, Y2, Y3
   866	VPSLLQ $(64-61), Y2, Y1
   867	VPOR   Y1, Y3, Y3
   868	VPXOR  Y3, Y8, Y8
   869
   870	VPADDQ Y8, Y6, Y6
   871
   872	VPSRLQ $6, Y6, Y8
   873
   874	MOVQ  R10, DI
   875	RORXQ $41, CX, R13
   876	ADDQ  2*8+frame_YFER(SP), R9
   877
   878	RORXQ $18, CX, R14
   879	ORQ   AX, DI
   880	MOVQ  R8, R15
   881	XORQ  DX, R15
   882
   883	RORXQ $34, R10, R12
   884	XORQ  R14, R13
   885	ANDQ  CX, R15
   886
   887	RORXQ $14, CX, R14
   888	ADDQ  R9, BX
   889	ANDQ  R11, DI
   890
   891	XORQ  R14, R13
   892	RORXQ $39, R10, R14
   893	XORQ  DX, R15
   894
   895	XORQ  R12, R14
   896	RORXQ $28, R10, R12
   897
   898	XORQ R12, R14
   899	MOVQ R10, R12
   900	ANDQ AX, R12
   901	ADDQ R13, R15
   902
   903	ORQ  R12, DI
   904	ADDQ R14, R9
   905	ADDQ R15, BX
   906	ADDQ R15, R9
   907
   908	ADDQ DI, R9
   909
   910	VPSRLQ $19, Y6, Y3
   911	VPSLLQ $(64-19), Y6, Y1
   912	VPOR   Y1, Y3, Y3
   913	VPXOR  Y3, Y8, Y8
   914	VPSRLQ $61, Y6, Y3
   915	VPSLLQ $(64-61), Y6, Y1
   916	VPOR   Y1, Y3, Y3
   917	VPXOR  Y3, Y8, Y8
   918
   919	VPADDQ Y8, Y0, Y2
   920
   921	VPBLENDD $0xF0, Y2, Y6, Y6
   922
   923	MOVQ  R9, DI
   924	RORXQ $41, BX, R13
   925	RORXQ $18, BX, R14
   926	ADDQ  3*8+frame_YFER(SP), DX
   927	ORQ   R11, DI
   928
   929	MOVQ  CX, R15
   930	RORXQ $34, R9, R12
   931	XORQ  R14, R13
   932	XORQ  R8, R15
   933
   934	RORXQ $14, BX, R14
   935	ANDQ  BX, R15
   936	ADDQ  DX, AX
   937	ANDQ  R10, DI
   938
   939	XORQ R14, R13
   940	XORQ R8, R15
   941
   942	RORXQ $39, R9, R14
   943	ADDQ  R13, R15
   944
   945	XORQ R12, R14
   946	ADDQ R15, AX
   947
   948	RORXQ $28, R9, R12
   949
   950	XORQ R12, R14
   951	MOVQ R9, R12
   952	ANDQ R11, R12
   953	ORQ  R12, DI
   954
   955	ADDQ R14, DX
   956	ADDQ R15, DX
   957	ADDQ DI, DX
   958
   959	VPADDQ  3*32(BP), Y7, Y0
   960	VMOVDQU Y0, frame_YFER(SP)
   961	ADDQ    $(4*32), BP
   962
   963	MY_VPALIGNR(Y0, Y6, Y5, 8)
   964
   965	VPADDQ Y7, Y0, Y0
   966
   967	MY_VPALIGNR(Y1, Y4, Y7, 8)
   968
   969	VPSRLQ $1, Y1, Y2
   970	VPSLLQ $(64-1), Y1, Y3
   971	VPOR   Y2, Y3, Y3
   972
   973	VPSRLQ $7, Y1, Y8
   974
   975	MOVQ  DX, DI
   976	RORXQ $41, AX, R13
   977	RORXQ $18, AX, R14
   978	ADDQ  frame_YFER(SP), R8
   979	ORQ   R10, DI
   980	MOVQ  BX, R15
   981	RORXQ $34, DX, R12
   982
   983	XORQ  R14, R13
   984	XORQ  CX, R15
   985	RORXQ $14, AX, R14
   986
   987	ANDQ  AX, R15
   988	XORQ  R14, R13
   989	RORXQ $39, DX, R14
   990	ADDQ  R8, R11
   991
   992	ANDQ  R9, DI
   993	XORQ  R12, R14
   994	RORXQ $28, DX, R12
   995
   996	XORQ CX, R15
   997	XORQ R12, R14
   998	MOVQ DX, R12
   999	ANDQ R10, R12
  1000
  1001	ADDQ R13, R15
  1002	ORQ  R12, DI
  1003	ADDQ R14, R8
  1004
  1005	ADDQ R15, R11
  1006
  1007	ADDQ R15, R8
  1008	ADDQ DI, R8
  1009
  1010	VPSRLQ $8, Y1, Y2
  1011	VPSLLQ $(64-8), Y1, Y1
  1012	VPOR   Y2, Y1, Y1
  1013
  1014	VPXOR Y8, Y3, Y3
  1015	VPXOR Y1, Y3, Y1
  1016
  1017	VPADDQ Y1, Y0, Y0
  1018
  1019	VPERM2F128 $0x0, Y0, Y0, Y7
  1020
  1021	VPAND MASK_YMM_LO<>(SB), Y0, Y0
  1022
  1023	VPERM2F128 $0x11, Y6, Y6, Y2
  1024	VPSRLQ     $6, Y2, Y8
  1025
  1026	MOVQ  R8, DI
  1027	RORXQ $41, R11, R13
  1028	RORXQ $18, R11, R14
  1029	ADDQ  1*8+frame_YFER(SP), CX
  1030	ORQ   R9, DI
  1031
  1032	MOVQ  AX, R15
  1033	RORXQ $34, R8, R12
  1034	XORQ  R14, R13
  1035	XORQ  BX, R15
  1036
  1037	RORXQ $14, R11, R14
  1038	XORQ  R14, R13
  1039	RORXQ $39, R8, R14
  1040	ANDQ  R11, R15
  1041	ADDQ  CX, R10
  1042
  1043	ANDQ DX, DI
  1044	XORQ R12, R14
  1045
  1046	RORXQ $28, R8, R12
  1047	XORQ  BX, R15
  1048
  1049	XORQ R12, R14
  1050	MOVQ R8, R12
  1051	ANDQ R9, R12
  1052	ADDQ R13, R15
  1053
  1054	ORQ  R12, DI
  1055	ADDQ R14, CX
  1056
  1057	ADDQ R15, R10
  1058	ADDQ R15, CX
  1059	ADDQ DI, CX
  1060
  1061	VPSRLQ $19, Y2, Y3
  1062	VPSLLQ $(64-19), Y2, Y1
  1063	VPOR   Y1, Y3, Y3
  1064	VPXOR  Y3, Y8, Y8
  1065	VPSRLQ $61, Y2, Y3
  1066	VPSLLQ $(64-61), Y2, Y1
  1067	VPOR   Y1, Y3, Y3
  1068	VPXOR  Y3, Y8, Y8
  1069
  1070	VPADDQ Y8, Y7, Y7
  1071
  1072	VPSRLQ $6, Y7, Y8
  1073
  1074	MOVQ  CX, DI
  1075	RORXQ $41, R10, R13
  1076	ADDQ  2*8+frame_YFER(SP), BX
  1077
  1078	RORXQ $18, R10, R14
  1079	ORQ   DX, DI
  1080	MOVQ  R11, R15
  1081	XORQ  AX, R15
  1082
  1083	RORXQ $34, CX, R12
  1084	XORQ  R14, R13
  1085	ANDQ  R10, R15
  1086
  1087	RORXQ $14, R10, R14
  1088	ADDQ  BX, R9
  1089	ANDQ  R8, DI
  1090
  1091	XORQ  R14, R13
  1092	RORXQ $39, CX, R14
  1093	XORQ  AX, R15
  1094
  1095	XORQ  R12, R14
  1096	RORXQ $28, CX, R12
  1097
  1098	XORQ R12, R14
  1099	MOVQ CX, R12
  1100	ANDQ DX, R12
  1101	ADDQ R13, R15
  1102
  1103	ORQ  R12, DI
  1104	ADDQ R14, BX
  1105	ADDQ R15, R9
  1106	ADDQ R15, BX
  1107
  1108	ADDQ DI, BX
  1109
  1110	VPSRLQ $19, Y7, Y3
  1111	VPSLLQ $(64-19), Y7, Y1
  1112	VPOR   Y1, Y3, Y3
  1113	VPXOR  Y3, Y8, Y8
  1114	VPSRLQ $61, Y7, Y3
  1115	VPSLLQ $(64-61), Y7, Y1
  1116	VPOR   Y1, Y3, Y3
  1117	VPXOR  Y3, Y8, Y8
  1118
  1119	VPADDQ Y8, Y0, Y2
  1120
  1121	VPBLENDD $0xF0, Y2, Y7, Y7
  1122
  1123	MOVQ  BX, DI
  1124	RORXQ $41, R9, R13
  1125	RORXQ $18, R9, R14
  1126	ADDQ  3*8+frame_YFER(SP), AX
  1127	ORQ   R8, DI
  1128
  1129	MOVQ  R10, R15
  1130	RORXQ $34, BX, R12
  1131	XORQ  R14, R13
  1132	XORQ  R11, R15
  1133
  1134	RORXQ $14, R9, R14
  1135	ANDQ  R9, R15
  1136	ADDQ  AX, DX
  1137	ANDQ  CX, DI
  1138
  1139	XORQ R14, R13
  1140	XORQ R11, R15
  1141
  1142	RORXQ $39, BX, R14
  1143	ADDQ  R13, R15
  1144
  1145	XORQ R12, R14
  1146	ADDQ R15, DX
  1147
  1148	RORXQ $28, BX, R12
  1149
  1150	XORQ R12, R14
  1151	MOVQ BX, R12
  1152	ANDQ R8, R12
  1153	ORQ  R12, DI
  1154
  1155	ADDQ R14, AX
  1156	ADDQ R15, AX
  1157	ADDQ DI, AX
  1158
  1159	SUBQ $1, frame_SRND(SP)
  1160	JNE  loop1
  1161
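The last 16 of the 80 rounds need no further message scheduling: loop2 below runs twice, each pass consuming two prescheduled YMM registers of W+K values (8 rounds) and then sliding Y6/Y7 down into Y4/Y5.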
  1162	MOVQ $2, frame_SRND(SP)
  1163
  1164loop2:
  1165	VPADDQ  (BP), Y4, Y0
  1166	VMOVDQU Y0, frame_YFER(SP)
  1167
  1168	MOVQ  R9, R15
  1169	RORXQ $41, DX, R13
  1170	RORXQ $18, DX, R14
  1171	XORQ  R10, R15
  1172
  1173	XORQ  R14, R13
  1174	RORXQ $14, DX, R14
  1175	ANDQ  DX, R15
  1176
  1177	XORQ  R14, R13
  1178	RORXQ $34, AX, R12
  1179	XORQ  R10, R15
  1180	RORXQ $39, AX, R14
  1181	MOVQ  AX, DI
  1182
  1183	XORQ  R12, R14
  1184	RORXQ $28, AX, R12
  1185	ADDQ  frame_YFER(SP), R11
  1186	ORQ   CX, DI
  1187
  1188	XORQ R12, R14
  1189	MOVQ AX, R12
  1190	ANDQ BX, DI
  1191	ANDQ CX, R12
  1192	ADDQ R13, R15
  1193
  1194	ADDQ R11, R8
  1195	ORQ  R12, DI
  1196	ADDQ R14, R11
  1197
  1198	ADDQ R15, R8
  1199
  1200	ADDQ  R15, R11
  1201	MOVQ  DX, R15
  1202	RORXQ $41, R8, R13
  1203	RORXQ $18, R8, R14
  1204	XORQ  R9, R15
  1205
  1206	XORQ  R14, R13
  1207	RORXQ $14, R8, R14
  1208	ANDQ  R8, R15
  1209	ADDQ  DI, R11
  1210
  1211	XORQ  R14, R13
  1212	RORXQ $34, R11, R12
  1213	XORQ  R9, R15
  1214	RORXQ $39, R11, R14
  1215	MOVQ  R11, DI
  1216
  1217	XORQ  R12, R14
  1218	RORXQ $28, R11, R12
  1219	ADDQ  8*1+frame_YFER(SP), R10
  1220	ORQ   BX, DI
  1221
  1222	XORQ R12, R14
  1223	MOVQ R11, R12
  1224	ANDQ AX, DI
  1225	ANDQ BX, R12
  1226	ADDQ R13, R15
  1227
  1228	ADDQ R10, CX
  1229	ORQ  R12, DI
  1230	ADDQ R14, R10
  1231
  1232	ADDQ R15, CX
  1233
  1234	ADDQ  R15, R10
  1235	MOVQ  R8, R15
  1236	RORXQ $41, CX, R13
  1237	RORXQ $18, CX, R14
  1238	XORQ  DX, R15
  1239
  1240	XORQ  R14, R13
  1241	RORXQ $14, CX, R14
  1242	ANDQ  CX, R15
  1243	ADDQ  DI, R10
  1244
  1245	XORQ  R14, R13
  1246	RORXQ $34, R10, R12
  1247	XORQ  DX, R15
  1248	RORXQ $39, R10, R14
  1249	MOVQ  R10, DI
  1250
  1251	XORQ  R12, R14
  1252	RORXQ $28, R10, R12
  1253	ADDQ  8*2+frame_YFER(SP), R9
  1254	ORQ   AX, DI
  1255
  1256	XORQ R12, R14
  1257	MOVQ R10, R12
  1258	ANDQ R11, DI
  1259	ANDQ AX, R12
  1260	ADDQ R13, R15
  1261
  1262	ADDQ R9, BX
  1263	ORQ  R12, DI
  1264	ADDQ R14, R9
  1265
  1266	ADDQ R15, BX
  1267
  1268	ADDQ  R15, R9
  1269	MOVQ  CX, R15
  1270	RORXQ $41, BX, R13
  1271	RORXQ $18, BX, R14
  1272	XORQ  R8, R15
  1273
  1274	XORQ  R14, R13
  1275	RORXQ $14, BX, R14
  1276	ANDQ  BX, R15
  1277	ADDQ  DI, R9
  1278
  1279	XORQ  R14, R13
  1280	RORXQ $34, R9, R12
  1281	XORQ  R8, R15
  1282	RORXQ $39, R9, R14
  1283	MOVQ  R9, DI
  1284
  1285	XORQ  R12, R14
  1286	RORXQ $28, R9, R12
  1287	ADDQ  8*3+frame_YFER(SP), DX
  1288	ORQ   R11, DI
  1289
  1290	XORQ R12, R14
  1291	MOVQ R9, R12
  1292	ANDQ R10, DI
  1293	ANDQ R11, R12
  1294	ADDQ R13, R15
  1295
  1296	ADDQ DX, AX
  1297	ORQ  R12, DI
  1298	ADDQ R14, DX
  1299
  1300	ADDQ R15, AX
  1301
  1302	ADDQ R15, DX
  1303
  1304	ADDQ DI, DX
  1305
  1306	VPADDQ  1*32(BP), Y5, Y0
  1307	VMOVDQU Y0, frame_YFER(SP)
  1308	ADDQ    $(2*32), BP
  1309
  1310	MOVQ  BX, R15
  1311	RORXQ $41, AX, R13
  1312	RORXQ $18, AX, R14
  1313	XORQ  CX, R15
  1314
  1315	XORQ  R14, R13
  1316	RORXQ $14, AX, R14
  1317	ANDQ  AX, R15
  1318
  1319	XORQ  R14, R13
  1320	RORXQ $34, DX, R12
  1321	XORQ  CX, R15
  1322	RORXQ $39, DX, R14
  1323	MOVQ  DX, DI
  1324
  1325	XORQ  R12, R14
  1326	RORXQ $28, DX, R12
  1327	ADDQ  frame_YFER(SP), R8
  1328	ORQ   R10, DI
  1329
  1330	XORQ R12, R14
  1331	MOVQ DX, R12
  1332	ANDQ R9, DI
  1333	ANDQ R10, R12
  1334	ADDQ R13, R15
  1335
  1336	ADDQ R8, R11
  1337	ORQ  R12, DI
  1338	ADDQ R14, R8
  1339
  1340	ADDQ R15, R11
  1341
  1342	ADDQ  R15, R8
  1343	MOVQ  AX, R15
  1344	RORXQ $41, R11, R13
  1345	RORXQ $18, R11, R14
  1346	XORQ  BX, R15
  1347
  1348	XORQ  R14, R13
  1349	RORXQ $14, R11, R14
  1350	ANDQ  R11, R15
  1351	ADDQ  DI, R8
  1352
  1353	XORQ  R14, R13
  1354	RORXQ $34, R8, R12
  1355	XORQ  BX, R15
  1356	RORXQ $39, R8, R14
  1357	MOVQ  R8, DI
  1358
  1359	XORQ  R12, R14
  1360	RORXQ $28, R8, R12
  1361	ADDQ  8*1+frame_YFER(SP), CX
  1362	ORQ   R9, DI
  1363
  1364	XORQ R12, R14
  1365	MOVQ R8, R12
  1366	ANDQ DX, DI
  1367	ANDQ R9, R12
  1368	ADDQ R13, R15
  1369
  1370	ADDQ CX, R10
  1371	ORQ  R12, DI
  1372	ADDQ R14, CX
  1373
  1374	ADDQ R15, R10
  1375
  1376	ADDQ  R15, CX
  1377	MOVQ  R11, R15
  1378	RORXQ $41, R10, R13
  1379	RORXQ $18, R10, R14
  1380	XORQ  AX, R15
  1381
  1382	XORQ  R14, R13
  1383	RORXQ $14, R10, R14
  1384	ANDQ  R10, R15
  1385	ADDQ  DI, CX
  1386
  1387	XORQ  R14, R13
  1388	RORXQ $34, CX, R12
  1389	XORQ  AX, R15
  1390	RORXQ $39, CX, R14
  1391	MOVQ  CX, DI
  1392
  1393	XORQ  R12, R14
  1394	RORXQ $28, CX, R12
  1395	ADDQ  8*2+frame_YFER(SP), BX
  1396	ORQ   DX, DI
  1397
  1398	XORQ R12, R14
  1399	MOVQ CX, R12
  1400	ANDQ R8, DI
  1401	ANDQ DX, R12
  1402	ADDQ R13, R15
  1403
  1404	ADDQ BX, R9
  1405	ORQ  R12, DI
  1406	ADDQ R14, BX
  1407
  1408	ADDQ R15, R9
  1409
  1410	ADDQ  R15, BX
  1411	MOVQ  R10, R15
  1412	RORXQ $41, R9, R13
  1413	RORXQ $18, R9, R14
  1414	XORQ  R11, R15
  1415
  1416	XORQ  R14, R13
  1417	RORXQ $14, R9, R14
  1418	ANDQ  R9, R15
  1419	ADDQ  DI, BX
  1420
  1421	XORQ  R14, R13
  1422	RORXQ $34, BX, R12
  1423	XORQ  R11, R15
  1424	RORXQ $39, BX, R14
  1425	MOVQ  BX, DI
  1426
  1427	XORQ  R12, R14
  1428	RORXQ $28, BX, R12
  1429	ADDQ  8*3+frame_YFER(SP), AX
  1430	ORQ   R8, DI
  1431
  1432	XORQ R12, R14
  1433	MOVQ BX, R12
  1434	ANDQ CX, DI
  1435	ANDQ R8, R12
  1436	ADDQ R13, R15
  1437
  1438	ADDQ AX, DX
  1439	ORQ  R12, DI
  1440	ADDQ R14, AX
  1441
  1442	ADDQ R15, DX
  1443
  1444	ADDQ R15, AX
  1445
  1446	ADDQ DI, AX
  1447
  1448	VMOVDQU Y6, Y4
  1449	VMOVDQU Y7, Y5
  1450
  1451	SUBQ $1, frame_SRND(SP)
  1452	JNE  loop2
  1453
  1454	addm(8*0(SI),AX)
  1455	addm(8*1(SI),BX)
  1456	addm(8*2(SI),CX)
  1457	addm(8*3(SI),R8)
  1458	addm(8*4(SI),DX)
  1459	addm(8*5(SI),R9)
  1460	addm(8*6(SI),R10)
  1461	addm(8*7(SI),R11)
  1462
  1463	MOVQ frame_INP(SP), DI
  1464	ADDQ $128, DI
  1465	CMPQ DI, frame_INPEND(SP)
  1466	JNE  loop0
  1467
  1468done_hash:
  1469	VZEROUPPER
  1470	RET
