Text file
src/crypto/sha512/sha512block_amd64.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !purego
6
7#include "textflag.h"
8
9// SHA512 block routine. See sha512block.go for Go equivalent.
10//
11// The algorithm is detailed in FIPS 180-4:
12//
13// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
14//
15// Wt = Mt; for 0 <= t <= 15
16// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
17//
18// a = H0
19// b = H1
20// c = H2
21// d = H3
22// e = H4
23// f = H5
24// g = H6
25// h = H7
26//
27// for t = 0 to 79 {
28// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
29// T2 = BIGSIGMA0(a) + Maj(a,b,c)
30// h = g
31// g = f
32// f = e
33// e = d + T1
34// d = c
35// c = b
36// b = a
37// a = T1 + T2
38// }
39//
40// H0 = a + H0
41// H1 = b + H1
42// H2 = c + H2
43// H3 = d + H3
44// H4 = e + H4
45// H5 = f + H5
46// H6 = g + H6
47// H7 = h + H7
48
49// Wt = Mt; for 0 <= t <= 15
// Reads message word `index` from the input block addressed by SI, reverses
// its byte order with BSWAPQ, and stores it in schedule slot `index` at BP.
// The swapped word is still in AX afterwards (the store does not modify it),
// which is where SHA512T1 expects Wt. Uses AX only.
50#define MSGSCHEDULE0(index) \
51 MOVQ (index*8)(SI), AX; \
52 BSWAPQ AX; \
53 MOVQ AX, (index*8)(BP)
54
// Message schedule for rounds 16..79:
//   Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16
//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
// BP addresses the schedule. Slot `index` is written and Wt is left in AX.
// Clobbers AX, BX, CX and DX. The SIGMA1(Wt-2) + Wt-7 partial sum is built in
// AX first, then SIGMA0(Wt-15) + Wt-16 in BX, and the two are added at the end.
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	MOVQ	AX, DX; \
	RORQ	$19, AX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	XORQ	CX, AX; \
	XORQ	DX, AX; \
	ADDQ	((index-7)*8)(BP), AX; \
	MOVQ	((index-15)*8)(BP), BX; \
	MOVQ	BX, CX; \
	MOVQ	BX, DX; \
	RORQ	$1, BX; \
	RORQ	$8, CX; \
	SHRQ	$7, DX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)
79
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//   BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
//   Ch(x, y, z)  = (x AND y) XOR (NOT x AND z)
// On entry AX holds Wt; on exit AX holds T1. The h register doubles as the
// accumulator and ends up holding h + Wt + Kt + BIGSIGMA1(e).
// Clobbers AX, CX and DX. Terms are folded into h one at a time:
// Wt, then Kt, then BIGSIGMA1(e); Ch(e, f, g) is formed last in AX.
#define SHA512T1(const, e, f, g, h) \
	ADDQ	AX, h; \
	MOVQ	$const, DX; \
	ADDQ	DX, h; \
	MOVQ	e, DX; \
	RORQ	$41, DX; \
	MOVQ	e, AX; \
	RORQ	$14, AX; \
	MOVQ	e, CX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	XORQ	AX, DX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	ANDQ	f, CX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX
105
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
//   BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
//   Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
// The result is produced in BX. Clobbers BX, CX, DX and DI; a, b and c are
// only read. BIGSIGMA0(a) is completed in DI before Maj(a, b, c) is built
// in BX, and the two are summed by the final instruction.
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	RORQ	$34, DX; \
	XORQ	DX, DI; \
	MOVQ	a, DX; \
	RORQ	$39, DX; \
	XORQ	DX, DI; \
	MOVQ	c, BX; \
	ANDQ	b, BX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	CX, BX; \
	MOVQ	b, CX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	ADDQ	DI, BX
128
129// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
130// The values for e and a are stored in d and h, ready for rotation.
// Expects Wt in AX on entry (consumed by SHA512T1). SHA512T1 leaves T1 in AX
// and SHA512T2 leaves T2 in BX; h is overwritten with T2 and then both d and
// h receive T1. The index argument is not referenced by this expansion.
131#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
132 SHA512T1(const, e, f, g, h); \
133 SHA512T2(a, b, c); \
134 MOVQ BX, h; \
135 ADDQ AX, d; \
136 ADDQ AX, h
137
// Round for 0 <= t <= 15: MSGSCHEDULE0 loads and byte-swaps message word
// `index` into AX and the schedule, then SHA512ROUND performs the round.
138#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
139 MSGSCHEDULE0(index); \
140 SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
141
// Round for 16 <= t <= 79: MSGSCHEDULE1 derives schedule word `index` from
// earlier schedule entries (leaving it in AX), then SHA512ROUND performs the
// round.
142#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
143 MSGSCHEDULE1(index); \
144 SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
145
// blockAMD64 processes every complete 128-byte block of the input, updating
// the eight 64-bit hash words addressed by dig. Arguments are read from
// dig+0(FP), p_base+8(FP) and p_len+16(FP).
// NOTE(review): presumably declared in Go as func blockAMD64(dig *digest, p []byte) — confirm.
//
// Frame (648 bytes): 0..639(SP) is the 80-entry message schedule W[0..79],
// 640(SP) holds the end-of-input pointer.
// Registers: SI = current input block, BP = schedule base inside the rounds
// (and the dig pointer outside them), R8..R15 = working variables a..h,
// AX/BX/CX/DX/DI = scratch used by the round macros.
146TEXT ·blockAMD64(SB),0,$648-32
147 MOVQ p_base+8(FP), SI
148 MOVQ p_len+16(FP), DX
// Shift right then left by 7: round the length down to a multiple of 128.
149 SHRQ $7, DX
150 SHLQ $7, DX
151
// DI = end of the last complete block; nothing to do if that equals the start.
152 LEAQ (SI)(DX*1), DI
153 MOVQ DI, 640(SP)
154 CMPQ SI, DI
155 JEQ end
156
157 MOVQ dig+0(FP), BP
158 MOVQ (0*8)(BP), R8 // a = H0
159 MOVQ (1*8)(BP), R9 // b = H1
160 MOVQ (2*8)(BP), R10 // c = H2
161 MOVQ (3*8)(BP), R11 // d = H3
162 MOVQ (4*8)(BP), R12 // e = H4
163 MOVQ (5*8)(BP), R13 // f = H5
164 MOVQ (6*8)(BP), R14 // g = H6
165 MOVQ (7*8)(BP), R15 // h = H7
166
// One iteration per 128-byte block.
167loop:
168 MOVQ SP, BP // message schedule
169
// Rounds 0..15 take Wt straight from the input block. Instead of moving
// values between registers after each round, every expansion passes the
// a..h register arguments rotated by one position.
170 SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
171 SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
172 SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
173 SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
174 SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
175 SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
176 SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
177 SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
178 SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
179 SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
180 SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
181 SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
182 SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
183 SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
184 SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
185 SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
186
// Rounds 16..79 derive Wt from earlier schedule entries.
187 SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
188 SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
189 SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
190 SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
191 SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
192 SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
193 SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
194 SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
195 SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
196 SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
197 SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
198 SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
199 SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
200 SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
201 SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
202 SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
203 SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
204 SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
205 SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
206 SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
207 SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
208 SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
209 SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
210 SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
211 SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
212 SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
213 SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
214 SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
215 SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
216 SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
217 SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
218 SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
219 SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
220 SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
221 SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
222 SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
223 SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
224 SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
225 SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
226 SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
227 SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
228 SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
229 SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
230 SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
231 SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
232 SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
233 SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
234 SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
235 SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
236 SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
237 SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
238 SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
239 SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
240 SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
241 SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
242 SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
243 SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
244 SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
245 SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
246 SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
247 SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
248 SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
249 SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
250 SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
251
// After 80 rounds (a multiple of 8) the rotation is back at its starting
// position, so R8..R15 again hold a..h. Fold them into the digest; the
// registers keep the updated H0..H7 for the next block.
252 MOVQ dig+0(FP), BP
253 ADDQ (0*8)(BP), R8 // H0 = a + H0
254 MOVQ R8, (0*8)(BP)
255 ADDQ (1*8)(BP), R9 // H1 = b + H1
256 MOVQ R9, (1*8)(BP)
257 ADDQ (2*8)(BP), R10 // H2 = c + H2
258 MOVQ R10, (2*8)(BP)
259 ADDQ (3*8)(BP), R11 // H3 = d + H3
260 MOVQ R11, (3*8)(BP)
261 ADDQ (4*8)(BP), R12 // H4 = e + H4
262 MOVQ R12, (4*8)(BP)
263 ADDQ (5*8)(BP), R13 // H5 = f + H5
264 MOVQ R13, (5*8)(BP)
265 ADDQ (6*8)(BP), R14 // H6 = g + H6
266 MOVQ R14, (6*8)(BP)
267 ADDQ (7*8)(BP), R15 // H7 = h + H7
268 MOVQ R15, (7*8)(BP)
269
// Advance to the next block and continue while SI is below the saved end pointer.
270 ADDQ $128, SI
271 CMPQ SI, 640(SP)
272 JB loop
273
274end:
275 RET
276
277// Version below is based on "Fast SHA512 Implementations on Intel
278// Architecture Processors" White-paper
279// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
280// AVX2 version by Intel, same algorithm in Linux kernel:
281// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
282
283// James Guilford <james.guilford@intel.com>
284// Kirk Yap <kirk.s.yap@intel.com>
285// Tim Chen <tim.c.chen@linux.intel.com>
286// David Cote <david.m.cote@intel.com>
287// Aleksey Sidorov <aleksey.sidorov@intel.com>
288
// Stack frame layout used by blockAVX2. The *_SIZE values are byte sizes and
// the frame_* values are byte offsets from SP:
//   frame_YFER   =  0  (32 bytes: four K+W sums spilled for the scalar rounds)
//   frame_SRND   = 32  ( 8 bytes: loop counter for loop1/loop2)
//   frame_INP    = 40  ( 8 bytes: saved input pointer)
//   frame_INPEND = 48  ( 8 bytes: end-of-input pointer)
// The slots total 56 bytes, matching the $56 frame size on blockAVX2.
289#define YFER_SIZE (4*8)
290#define SRND_SIZE (1*8)
291#define INP_SIZE (1*8)
292
293#define frame_YFER (0)
294#define frame_SRND (frame_YFER + YFER_SIZE)
295#define frame_INP (frame_SRND + SRND_SIZE)
296#define frame_INPEND (frame_INP + INP_SIZE)
297
// addm adds memory operand p1 into register p2 and writes the sum back to
// p1, so both p1 and p2 end up holding p1 + p2.
298#define addm(p1, p2) \
299 ADDQ p1, p2; \
300 MOVQ p2, p1
301
// COPY_YMM_AND_BSWAP loads 32 bytes from p2 into vector register p1
// (unaligned load) and then permutes the bytes of p1 in place with VPSHUFB
// using the shuffle mask in p3.
302#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
303 VMOVDQU p2, p1; \
304 VPSHUFB p3, p1, p1
305
// MY_VPALIGNR combines a 128-bit lane permute of YSRC1/YSRC2 (VPERM2F128)
// with a per-lane byte alignment (VPALIGNR) by RVAL bytes, writing YDST.
// NOTE(review): presumably the net effect is YDST = (YSRC1:YSRC2) shifted
// right by RVAL bytes across the full 256-bit register — confirm against the
// Intel reference implementation cited above.
306#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
307 VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
308 VPALIGNR $RVAL, YSRC2, YDST, YDST
309
// VPSHUFB control mask (32 bytes, read-only). Within each 64-bit element the
// byte indices run 7..0 (15..8 for the second element of a lane), so
// shuffling with it reverses the byte order of every 64-bit word. blockAVX2
// loads it into Y9 and passes it to COPY_YMM_AND_BSWAP.
310DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
311DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
312DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
313DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
314
315GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
316
// 32-byte read-only AND mask: the first 16 bytes are zero and the last 16
// bytes are all ones, so VPAND with it clears the low 128 bits of a YMM
// register and keeps the high 128 bits.
317DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
318DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
319DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
320DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
321
322GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
323
// blockAVX2 processes every complete 128-byte block of the input, updating
// the eight 64-bit hash words addressed by dig. Arguments are read from
// dig+0(FP), p_base+8(FP) and p_len+16(FP).
// NOTE(review): presumably declared in Go as func blockAVX2(dig *digest, p []byte) — confirm.
//
// Register use:
//   SI            digest pointer
//   DI            input pointer on entry to each block, then scratch during
//                 the rounds (the pointer is kept in frame_INP(SP))
//   BP            pointer into the round-constant table loaded from ·_K
//   AX BX CX R8   working variables a, b, c, d
//   DX R9 R10 R11 working variables e, f, g, h
//   R12-R15       scratch
//   Y4-Y7         message schedule words, four 64-bit words per register
//   Y9            byte-flip shuffle mask
//   Y0-Y3, Y8     vector scratch
// Each group of four rounds first stores K+W for those rounds at
// frame_YFER(SP); the scalar round code then adds one 8-byte slot per round.
// Working variables are never moved between registers: each successive round
// is written with the register roles rotated by one.
324TEXT ·blockAVX2(SB), NOSPLIT, $56-32
325 MOVQ dig+0(FP), SI
326 MOVQ p_base+8(FP), DI
327 MOVQ p_len+16(FP), DX
328
// Round the length down to a multiple of 128; SHLQ leaves ZF set when no
// complete block remains.
329 SHRQ $7, DX
330 SHLQ $7, DX
331
332 JZ done_hash
333 ADDQ DI, DX
334 MOVQ DX, frame_INPEND(SP)
335
// Load the current hash state into the working registers.
336 MOVQ (0*8)(SI), AX
337 MOVQ (1*8)(SI), BX
338 MOVQ (2*8)(SI), CX
339 MOVQ (3*8)(SI), R8
340 MOVQ (4*8)(SI), DX
341 MOVQ (5*8)(SI), R9
342 MOVQ (6*8)(SI), R10
343 MOVQ (7*8)(SI), R11
344
345 VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
346
// One iteration per 128-byte block. BP is reloaded because the round loops
// advance it through the constant table.
347loop0:
348 MOVQ ·_K+0(SB), BP
349
350 // load the block's sixteen 64-bit message words into Y4-Y7, byte swapping each word
351 COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
352 COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
353 COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
354 COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
355
// DI is reused as scratch by the rounds below, so save the input pointer.
356 MOVQ DI, frame_INP(SP)
357
358 // loop1 runs 4 times; each pass performs 16 rounds and schedules the next 16 message words (64 rounds in total)
359 MOVQ $4, frame_SRND(SP)
360
361loop1:
// Rounds 0-3 of this pass: K+W for the words in Y4. The vector instructions
// interleaved with the scalar rounds compute the replacement words for Y4.
362 VPADDQ (BP), Y4, Y0
363 VMOVDQU Y0, frame_YFER(SP)
364
365 MY_VPALIGNR(Y0, Y7, Y6, 8)
366
367 VPADDQ Y4, Y0, Y0
368
369 MY_VPALIGNR(Y1, Y5, Y4, 8)
370
371 VPSRLQ $1, Y1, Y2
372 VPSLLQ $(64-1), Y1, Y3
373 VPOR Y2, Y3, Y3
374
375 VPSRLQ $7, Y1, Y8
376
377 MOVQ AX, DI
378 RORXQ $41, DX, R13
379 RORXQ $18, DX, R14
380 ADDQ frame_YFER(SP), R11
381 ORQ CX, DI
382 MOVQ R9, R15
383 RORXQ $34, AX, R12
384
385 XORQ R14, R13
386 XORQ R10, R15
387 RORXQ $14, DX, R14
388
389 ANDQ DX, R15
390 XORQ R14, R13
391 RORXQ $39, AX, R14
392 ADDQ R11, R8
393
394 ANDQ BX, DI
395 XORQ R12, R14
396 RORXQ $28, AX, R12
397
398 XORQ R10, R15
399 XORQ R12, R14
400 MOVQ AX, R12
401 ANDQ CX, R12
402
403 ADDQ R13, R15
404 ORQ R12, DI
405 ADDQ R14, R11
406
407 ADDQ R15, R8
408
409 ADDQ R15, R11
410 ADDQ DI, R11
411
412 VPSRLQ $8, Y1, Y2
413 VPSLLQ $(64-8), Y1, Y1
414 VPOR Y2, Y1, Y1
415
416 VPXOR Y8, Y3, Y3
417 VPXOR Y1, Y3, Y1
418
419 VPADDQ Y1, Y0, Y0
420
421 VPERM2F128 $0x0, Y0, Y0, Y4
422
423 VPAND MASK_YMM_LO<>(SB), Y0, Y0
424
425 VPERM2F128 $0x11, Y7, Y7, Y2
426 VPSRLQ $6, Y2, Y8
427
428 MOVQ R11, DI
429 RORXQ $41, R8, R13
430 RORXQ $18, R8, R14
431 ADDQ 1*8+frame_YFER(SP), R10
432 ORQ BX, DI
433
434 MOVQ DX, R15
435 RORXQ $34, R11, R12
436 XORQ R14, R13
437 XORQ R9, R15
438
439 RORXQ $14, R8, R14
440 XORQ R14, R13
441 RORXQ $39, R11, R14
442 ANDQ R8, R15
443 ADDQ R10, CX
444
445 ANDQ AX, DI
446 XORQ R12, R14
447
448 RORXQ $28, R11, R12
449 XORQ R9, R15
450
451 XORQ R12, R14
452 MOVQ R11, R12
453 ANDQ BX, R12
454 ADDQ R13, R15
455
456 ORQ R12, DI
457 ADDQ R14, R10
458
459 ADDQ R15, CX
460 ADDQ R15, R10
461 ADDQ DI, R10
462
463 VPSRLQ $19, Y2, Y3
464 VPSLLQ $(64-19), Y2, Y1
465 VPOR Y1, Y3, Y3
466 VPXOR Y3, Y8, Y8
467 VPSRLQ $61, Y2, Y3
468 VPSLLQ $(64-61), Y2, Y1
469 VPOR Y1, Y3, Y3
470 VPXOR Y3, Y8, Y8
471
472 VPADDQ Y8, Y4, Y4
473
474 VPSRLQ $6, Y4, Y8
475
476 MOVQ R10, DI
477 RORXQ $41, CX, R13
478 ADDQ 2*8+frame_YFER(SP), R9
479
480 RORXQ $18, CX, R14
481 ORQ AX, DI
482 MOVQ R8, R15
483 XORQ DX, R15
484
485 RORXQ $34, R10, R12
486 XORQ R14, R13
487 ANDQ CX, R15
488
489 RORXQ $14, CX, R14
490 ADDQ R9, BX
491 ANDQ R11, DI
492
493 XORQ R14, R13
494 RORXQ $39, R10, R14
495 XORQ DX, R15
496
497 XORQ R12, R14
498 RORXQ $28, R10, R12
499
500 XORQ R12, R14
501 MOVQ R10, R12
502 ANDQ AX, R12
503 ADDQ R13, R15
504
505 ORQ R12, DI
506 ADDQ R14, R9
507 ADDQ R15, BX
508 ADDQ R15, R9
509
510 ADDQ DI, R9
511
512 VPSRLQ $19, Y4, Y3
513 VPSLLQ $(64-19), Y4, Y1
514 VPOR Y1, Y3, Y3
515 VPXOR Y3, Y8, Y8
516 VPSRLQ $61, Y4, Y3
517 VPSLLQ $(64-61), Y4, Y1
518 VPOR Y1, Y3, Y3
519 VPXOR Y3, Y8, Y8
520
521 VPADDQ Y8, Y0, Y2
522
523 VPBLENDD $0xF0, Y2, Y4, Y4
524
525 MOVQ R9, DI
526 RORXQ $41, BX, R13
527 RORXQ $18, BX, R14
528 ADDQ 3*8+frame_YFER(SP), DX
529 ORQ R11, DI
530
531 MOVQ CX, R15
532 RORXQ $34, R9, R12
533 XORQ R14, R13
534 XORQ R8, R15
535
536 RORXQ $14, BX, R14
537 ANDQ BX, R15
538 ADDQ DX, AX
539 ANDQ R10, DI
540
541 XORQ R14, R13
542 XORQ R8, R15
543
544 RORXQ $39, R9, R14
545 ADDQ R13, R15
546
547 XORQ R12, R14
548 ADDQ R15, AX
549
550 RORXQ $28, R9, R12
551
552 XORQ R12, R14
553 MOVQ R9, R12
554 ANDQ R11, R12
555 ORQ R12, DI
556
557 ADDQ R14, DX
558 ADDQ R15, DX
559 ADDQ DI, DX
560
// Rounds 4-7 of this pass: K+W for the words in Y5; Y5 is rescheduled.
561 VPADDQ 1*32(BP), Y5, Y0
562 VMOVDQU Y0, frame_YFER(SP)
563
564 MY_VPALIGNR(Y0, Y4, Y7, 8)
565
566 VPADDQ Y5, Y0, Y0
567
568 MY_VPALIGNR(Y1, Y6, Y5, 8)
569
570 VPSRLQ $1, Y1, Y2
571 VPSLLQ $(64-1), Y1, Y3
572 VPOR Y2, Y3, Y3
573
574 VPSRLQ $7, Y1, Y8
575
576 MOVQ DX, DI
577 RORXQ $41, AX, R13
578 RORXQ $18, AX, R14
579 ADDQ frame_YFER(SP), R8
580 ORQ R10, DI
581 MOVQ BX, R15
582 RORXQ $34, DX, R12
583
584 XORQ R14, R13
585 XORQ CX, R15
586 RORXQ $14, AX, R14
587
588 ANDQ AX, R15
589 XORQ R14, R13
590 RORXQ $39, DX, R14
591 ADDQ R8, R11
592
593 ANDQ R9, DI
594 XORQ R12, R14
595 RORXQ $28, DX, R12
596
597 XORQ CX, R15
598 XORQ R12, R14
599 MOVQ DX, R12
600 ANDQ R10, R12
601
602 ADDQ R13, R15
603 ORQ R12, DI
604 ADDQ R14, R8
605
606 ADDQ R15, R11
607
608 ADDQ R15, R8
609 ADDQ DI, R8
610
611 VPSRLQ $8, Y1, Y2
612 VPSLLQ $(64-8), Y1, Y1
613 VPOR Y2, Y1, Y1
614
615 VPXOR Y8, Y3, Y3
616 VPXOR Y1, Y3, Y1
617
618 VPADDQ Y1, Y0, Y0
619
620 VPERM2F128 $0x0, Y0, Y0, Y5
621
622 VPAND MASK_YMM_LO<>(SB), Y0, Y0
623
624 VPERM2F128 $0x11, Y4, Y4, Y2
625 VPSRLQ $6, Y2, Y8
626
627 MOVQ R8, DI
628 RORXQ $41, R11, R13
629 RORXQ $18, R11, R14
630 ADDQ 1*8+frame_YFER(SP), CX
631 ORQ R9, DI
632
633 MOVQ AX, R15
634 RORXQ $34, R8, R12
635 XORQ R14, R13
636 XORQ BX, R15
637
638 RORXQ $14, R11, R14
639 XORQ R14, R13
640 RORXQ $39, R8, R14
641 ANDQ R11, R15
642 ADDQ CX, R10
643
644 ANDQ DX, DI
645 XORQ R12, R14
646
647 RORXQ $28, R8, R12
648 XORQ BX, R15
649
650 XORQ R12, R14
651 MOVQ R8, R12
652 ANDQ R9, R12
653 ADDQ R13, R15
654
655 ORQ R12, DI
656 ADDQ R14, CX
657
658 ADDQ R15, R10
659 ADDQ R15, CX
660 ADDQ DI, CX
661
662 VPSRLQ $19, Y2, Y3
663 VPSLLQ $(64-19), Y2, Y1
664 VPOR Y1, Y3, Y3
665 VPXOR Y3, Y8, Y8
666 VPSRLQ $61, Y2, Y3
667 VPSLLQ $(64-61), Y2, Y1
668 VPOR Y1, Y3, Y3
669 VPXOR Y3, Y8, Y8
670
671 VPADDQ Y8, Y5, Y5
672
673 VPSRLQ $6, Y5, Y8
674
675 MOVQ CX, DI
676 RORXQ $41, R10, R13
677 ADDQ 2*8+frame_YFER(SP), BX
678
679 RORXQ $18, R10, R14
680 ORQ DX, DI
681 MOVQ R11, R15
682 XORQ AX, R15
683
684 RORXQ $34, CX, R12
685 XORQ R14, R13
686 ANDQ R10, R15
687
688 RORXQ $14, R10, R14
689 ADDQ BX, R9
690 ANDQ R8, DI
691
692 XORQ R14, R13
693 RORXQ $39, CX, R14
694 XORQ AX, R15
695
696 XORQ R12, R14
697 RORXQ $28, CX, R12
698
699 XORQ R12, R14
700 MOVQ CX, R12
701 ANDQ DX, R12
702 ADDQ R13, R15
703
704 ORQ R12, DI
705 ADDQ R14, BX
706 ADDQ R15, R9
707 ADDQ R15, BX
708
709 ADDQ DI, BX
710
711 VPSRLQ $19, Y5, Y3
712 VPSLLQ $(64-19), Y5, Y1
713 VPOR Y1, Y3, Y3
714 VPXOR Y3, Y8, Y8
715 VPSRLQ $61, Y5, Y3
716 VPSLLQ $(64-61), Y5, Y1
717 VPOR Y1, Y3, Y3
718 VPXOR Y3, Y8, Y8
719
720 VPADDQ Y8, Y0, Y2
721
722 VPBLENDD $0xF0, Y2, Y5, Y5
723
724 MOVQ BX, DI
725 RORXQ $41, R9, R13
726 RORXQ $18, R9, R14
727 ADDQ 3*8+frame_YFER(SP), AX
728 ORQ R8, DI
729
730 MOVQ R10, R15
731 RORXQ $34, BX, R12
732 XORQ R14, R13
733 XORQ R11, R15
734
735 RORXQ $14, R9, R14
736 ANDQ R9, R15
737 ADDQ AX, DX
738 ANDQ CX, DI
739
740 XORQ R14, R13
741 XORQ R11, R15
742
743 RORXQ $39, BX, R14
744 ADDQ R13, R15
745
746 XORQ R12, R14
747 ADDQ R15, DX
748
749 RORXQ $28, BX, R12
750
751 XORQ R12, R14
752 MOVQ BX, R12
753 ANDQ R8, R12
754 ORQ R12, DI
755
756 ADDQ R14, AX
757 ADDQ R15, AX
758 ADDQ DI, AX
759
// Rounds 8-11 of this pass: K+W for the words in Y6; Y6 is rescheduled.
760 VPADDQ 2*32(BP), Y6, Y0
761 VMOVDQU Y0, frame_YFER(SP)
762
763 MY_VPALIGNR(Y0, Y5, Y4, 8)
764
765 VPADDQ Y6, Y0, Y0
766
767 MY_VPALIGNR(Y1, Y7, Y6, 8)
768
769 VPSRLQ $1, Y1, Y2
770 VPSLLQ $(64-1), Y1, Y3
771 VPOR Y2, Y3, Y3
772
773 VPSRLQ $7, Y1, Y8
774
775 MOVQ AX, DI
776 RORXQ $41, DX, R13
777 RORXQ $18, DX, R14
778 ADDQ frame_YFER(SP), R11
779 ORQ CX, DI
780 MOVQ R9, R15
781 RORXQ $34, AX, R12
782
783 XORQ R14, R13
784 XORQ R10, R15
785 RORXQ $14, DX, R14
786
787 ANDQ DX, R15
788 XORQ R14, R13
789 RORXQ $39, AX, R14
790 ADDQ R11, R8
791
792 ANDQ BX, DI
793 XORQ R12, R14
794 RORXQ $28, AX, R12
795
796 XORQ R10, R15
797 XORQ R12, R14
798 MOVQ AX, R12
799 ANDQ CX, R12
800
801 ADDQ R13, R15
802 ORQ R12, DI
803 ADDQ R14, R11
804
805 ADDQ R15, R8
806
807 ADDQ R15, R11
808 ADDQ DI, R11
809
810 VPSRLQ $8, Y1, Y2
811 VPSLLQ $(64-8), Y1, Y1
812 VPOR Y2, Y1, Y1
813
814 VPXOR Y8, Y3, Y3
815 VPXOR Y1, Y3, Y1
816
817 VPADDQ Y1, Y0, Y0
818
819 VPERM2F128 $0x0, Y0, Y0, Y6
820
821 VPAND MASK_YMM_LO<>(SB), Y0, Y0
822
823 VPERM2F128 $0x11, Y5, Y5, Y2
824 VPSRLQ $6, Y2, Y8
825
826 MOVQ R11, DI
827 RORXQ $41, R8, R13
828 RORXQ $18, R8, R14
829 ADDQ 1*8+frame_YFER(SP), R10
830 ORQ BX, DI
831
832 MOVQ DX, R15
833 RORXQ $34, R11, R12
834 XORQ R14, R13
835 XORQ R9, R15
836
837 RORXQ $14, R8, R14
838 XORQ R14, R13
839 RORXQ $39, R11, R14
840 ANDQ R8, R15
841 ADDQ R10, CX
842
843 ANDQ AX, DI
844 XORQ R12, R14
845
846 RORXQ $28, R11, R12
847 XORQ R9, R15
848
849 XORQ R12, R14
850 MOVQ R11, R12
851 ANDQ BX, R12
852 ADDQ R13, R15
853
854 ORQ R12, DI
855 ADDQ R14, R10
856
857 ADDQ R15, CX
858 ADDQ R15, R10
859 ADDQ DI, R10
860
861 VPSRLQ $19, Y2, Y3
862 VPSLLQ $(64-19), Y2, Y1
863 VPOR Y1, Y3, Y3
864 VPXOR Y3, Y8, Y8
865 VPSRLQ $61, Y2, Y3
866 VPSLLQ $(64-61), Y2, Y1
867 VPOR Y1, Y3, Y3
868 VPXOR Y3, Y8, Y8
869
870 VPADDQ Y8, Y6, Y6
871
872 VPSRLQ $6, Y6, Y8
873
874 MOVQ R10, DI
875 RORXQ $41, CX, R13
876 ADDQ 2*8+frame_YFER(SP), R9
877
878 RORXQ $18, CX, R14
879 ORQ AX, DI
880 MOVQ R8, R15
881 XORQ DX, R15
882
883 RORXQ $34, R10, R12
884 XORQ R14, R13
885 ANDQ CX, R15
886
887 RORXQ $14, CX, R14
888 ADDQ R9, BX
889 ANDQ R11, DI
890
891 XORQ R14, R13
892 RORXQ $39, R10, R14
893 XORQ DX, R15
894
895 XORQ R12, R14
896 RORXQ $28, R10, R12
897
898 XORQ R12, R14
899 MOVQ R10, R12
900 ANDQ AX, R12
901 ADDQ R13, R15
902
903 ORQ R12, DI
904 ADDQ R14, R9
905 ADDQ R15, BX
906 ADDQ R15, R9
907
908 ADDQ DI, R9
909
910 VPSRLQ $19, Y6, Y3
911 VPSLLQ $(64-19), Y6, Y1
912 VPOR Y1, Y3, Y3
913 VPXOR Y3, Y8, Y8
914 VPSRLQ $61, Y6, Y3
915 VPSLLQ $(64-61), Y6, Y1
916 VPOR Y1, Y3, Y3
917 VPXOR Y3, Y8, Y8
918
919 VPADDQ Y8, Y0, Y2
920
921 VPBLENDD $0xF0, Y2, Y6, Y6
922
923 MOVQ R9, DI
924 RORXQ $41, BX, R13
925 RORXQ $18, BX, R14
926 ADDQ 3*8+frame_YFER(SP), DX
927 ORQ R11, DI
928
929 MOVQ CX, R15
930 RORXQ $34, R9, R12
931 XORQ R14, R13
932 XORQ R8, R15
933
934 RORXQ $14, BX, R14
935 ANDQ BX, R15
936 ADDQ DX, AX
937 ANDQ R10, DI
938
939 XORQ R14, R13
940 XORQ R8, R15
941
942 RORXQ $39, R9, R14
943 ADDQ R13, R15
944
945 XORQ R12, R14
946 ADDQ R15, AX
947
948 RORXQ $28, R9, R12
949
950 XORQ R12, R14
951 MOVQ R9, R12
952 ANDQ R11, R12
953 ORQ R12, DI
954
955 ADDQ R14, DX
956 ADDQ R15, DX
957 ADDQ DI, DX
958
// Rounds 12-15 of this pass: K+W for the words in Y7; Y7 is rescheduled.
// BP then advances past the 16 constants consumed by this pass.
959 VPADDQ 3*32(BP), Y7, Y0
960 VMOVDQU Y0, frame_YFER(SP)
961 ADDQ $(4*32), BP
962
963 MY_VPALIGNR(Y0, Y6, Y5, 8)
964
965 VPADDQ Y7, Y0, Y0
966
967 MY_VPALIGNR(Y1, Y4, Y7, 8)
968
969 VPSRLQ $1, Y1, Y2
970 VPSLLQ $(64-1), Y1, Y3
971 VPOR Y2, Y3, Y3
972
973 VPSRLQ $7, Y1, Y8
974
975 MOVQ DX, DI
976 RORXQ $41, AX, R13
977 RORXQ $18, AX, R14
978 ADDQ frame_YFER(SP), R8
979 ORQ R10, DI
980 MOVQ BX, R15
981 RORXQ $34, DX, R12
982
983 XORQ R14, R13
984 XORQ CX, R15
985 RORXQ $14, AX, R14
986
987 ANDQ AX, R15
988 XORQ R14, R13
989 RORXQ $39, DX, R14
990 ADDQ R8, R11
991
992 ANDQ R9, DI
993 XORQ R12, R14
994 RORXQ $28, DX, R12
995
996 XORQ CX, R15
997 XORQ R12, R14
998 MOVQ DX, R12
999 ANDQ R10, R12
1000
1001 ADDQ R13, R15
1002 ORQ R12, DI
1003 ADDQ R14, R8
1004
1005 ADDQ R15, R11
1006
1007 ADDQ R15, R8
1008 ADDQ DI, R8
1009
1010 VPSRLQ $8, Y1, Y2
1011 VPSLLQ $(64-8), Y1, Y1
1012 VPOR Y2, Y1, Y1
1013
1014 VPXOR Y8, Y3, Y3
1015 VPXOR Y1, Y3, Y1
1016
1017 VPADDQ Y1, Y0, Y0
1018
1019 VPERM2F128 $0x0, Y0, Y0, Y7
1020
1021 VPAND MASK_YMM_LO<>(SB), Y0, Y0
1022
1023 VPERM2F128 $0x11, Y6, Y6, Y2
1024 VPSRLQ $6, Y2, Y8
1025
1026 MOVQ R8, DI
1027 RORXQ $41, R11, R13
1028 RORXQ $18, R11, R14
1029 ADDQ 1*8+frame_YFER(SP), CX
1030 ORQ R9, DI
1031
1032 MOVQ AX, R15
1033 RORXQ $34, R8, R12
1034 XORQ R14, R13
1035 XORQ BX, R15
1036
1037 RORXQ $14, R11, R14
1038 XORQ R14, R13
1039 RORXQ $39, R8, R14
1040 ANDQ R11, R15
1041 ADDQ CX, R10
1042
1043 ANDQ DX, DI
1044 XORQ R12, R14
1045
1046 RORXQ $28, R8, R12
1047 XORQ BX, R15
1048
1049 XORQ R12, R14
1050 MOVQ R8, R12
1051 ANDQ R9, R12
1052 ADDQ R13, R15
1053
1054 ORQ R12, DI
1055 ADDQ R14, CX
1056
1057 ADDQ R15, R10
1058 ADDQ R15, CX
1059 ADDQ DI, CX
1060
1061 VPSRLQ $19, Y2, Y3
1062 VPSLLQ $(64-19), Y2, Y1
1063 VPOR Y1, Y3, Y3
1064 VPXOR Y3, Y8, Y8
1065 VPSRLQ $61, Y2, Y3
1066 VPSLLQ $(64-61), Y2, Y1
1067 VPOR Y1, Y3, Y3
1068 VPXOR Y3, Y8, Y8
1069
1070 VPADDQ Y8, Y7, Y7
1071
1072 VPSRLQ $6, Y7, Y8
1073
1074 MOVQ CX, DI
1075 RORXQ $41, R10, R13
1076 ADDQ 2*8+frame_YFER(SP), BX
1077
1078 RORXQ $18, R10, R14
1079 ORQ DX, DI
1080 MOVQ R11, R15
1081 XORQ AX, R15
1082
1083 RORXQ $34, CX, R12
1084 XORQ R14, R13
1085 ANDQ R10, R15
1086
1087 RORXQ $14, R10, R14
1088 ADDQ BX, R9
1089 ANDQ R8, DI
1090
1091 XORQ R14, R13
1092 RORXQ $39, CX, R14
1093 XORQ AX, R15
1094
1095 XORQ R12, R14
1096 RORXQ $28, CX, R12
1097
1098 XORQ R12, R14
1099 MOVQ CX, R12
1100 ANDQ DX, R12
1101 ADDQ R13, R15
1102
1103 ORQ R12, DI
1104 ADDQ R14, BX
1105 ADDQ R15, R9
1106 ADDQ R15, BX
1107
1108 ADDQ DI, BX
1109
1110 VPSRLQ $19, Y7, Y3
1111 VPSLLQ $(64-19), Y7, Y1
1112 VPOR Y1, Y3, Y3
1113 VPXOR Y3, Y8, Y8
1114 VPSRLQ $61, Y7, Y3
1115 VPSLLQ $(64-61), Y7, Y1
1116 VPOR Y1, Y3, Y3
1117 VPXOR Y3, Y8, Y8
1118
1119 VPADDQ Y8, Y0, Y2
1120
1121 VPBLENDD $0xF0, Y2, Y7, Y7
1122
1123 MOVQ BX, DI
1124 RORXQ $41, R9, R13
1125 RORXQ $18, R9, R14
1126 ADDQ 3*8+frame_YFER(SP), AX
1127 ORQ R8, DI
1128
1129 MOVQ R10, R15
1130 RORXQ $34, BX, R12
1131 XORQ R14, R13
1132 XORQ R11, R15
1133
1134 RORXQ $14, R9, R14
1135 ANDQ R9, R15
1136 ADDQ AX, DX
1137 ANDQ CX, DI
1138
1139 XORQ R14, R13
1140 XORQ R11, R15
1141
1142 RORXQ $39, BX, R14
1143 ADDQ R13, R15
1144
1145 XORQ R12, R14
1146 ADDQ R15, DX
1147
1148 RORXQ $28, BX, R12
1149
1150 XORQ R12, R14
1151 MOVQ BX, R12
1152 ANDQ R8, R12
1153 ORQ R12, DI
1154
1155 ADDQ R14, AX
1156 ADDQ R15, AX
1157 ADDQ DI, AX
1158
1159 SUBQ $1, frame_SRND(SP)
1160 JNE loop1
1161
// loop2 runs twice; each pass performs 8 rounds using the already scheduled
// words in Y4 and Y5, with no further scheduling (the final 16 rounds).
1162 MOVQ $2, frame_SRND(SP)
1163
1164loop2:
1165 VPADDQ (BP), Y4, Y0
1166 VMOVDQU Y0, frame_YFER(SP)
1167
// In loop2 the last addition of each round (ADDQ DI into the new "a") is
// deferred into the start of the following round.
1168 MOVQ R9, R15
1169 RORXQ $41, DX, R13
1170 RORXQ $18, DX, R14
1171 XORQ R10, R15
1172
1173 XORQ R14, R13
1174 RORXQ $14, DX, R14
1175 ANDQ DX, R15
1176
1177 XORQ R14, R13
1178 RORXQ $34, AX, R12
1179 XORQ R10, R15
1180 RORXQ $39, AX, R14
1181 MOVQ AX, DI
1182
1183 XORQ R12, R14
1184 RORXQ $28, AX, R12
1185 ADDQ frame_YFER(SP), R11
1186 ORQ CX, DI
1187
1188 XORQ R12, R14
1189 MOVQ AX, R12
1190 ANDQ BX, DI
1191 ANDQ CX, R12
1192 ADDQ R13, R15
1193
1194 ADDQ R11, R8
1195 ORQ R12, DI
1196 ADDQ R14, R11
1197
1198 ADDQ R15, R8
1199
1200 ADDQ R15, R11
1201 MOVQ DX, R15
1202 RORXQ $41, R8, R13
1203 RORXQ $18, R8, R14
1204 XORQ R9, R15
1205
1206 XORQ R14, R13
1207 RORXQ $14, R8, R14
1208 ANDQ R8, R15
1209 ADDQ DI, R11
1210
1211 XORQ R14, R13
1212 RORXQ $34, R11, R12
1213 XORQ R9, R15
1214 RORXQ $39, R11, R14
1215 MOVQ R11, DI
1216
1217 XORQ R12, R14
1218 RORXQ $28, R11, R12
1219 ADDQ 8*1+frame_YFER(SP), R10
1220 ORQ BX, DI
1221
1222 XORQ R12, R14
1223 MOVQ R11, R12
1224 ANDQ AX, DI
1225 ANDQ BX, R12
1226 ADDQ R13, R15
1227
1228 ADDQ R10, CX
1229 ORQ R12, DI
1230 ADDQ R14, R10
1231
1232 ADDQ R15, CX
1233
1234 ADDQ R15, R10
1235 MOVQ R8, R15
1236 RORXQ $41, CX, R13
1237 RORXQ $18, CX, R14
1238 XORQ DX, R15
1239
1240 XORQ R14, R13
1241 RORXQ $14, CX, R14
1242 ANDQ CX, R15
1243 ADDQ DI, R10
1244
1245 XORQ R14, R13
1246 RORXQ $34, R10, R12
1247 XORQ DX, R15
1248 RORXQ $39, R10, R14
1249 MOVQ R10, DI
1250
1251 XORQ R12, R14
1252 RORXQ $28, R10, R12
1253 ADDQ 8*2+frame_YFER(SP), R9
1254 ORQ AX, DI
1255
1256 XORQ R12, R14
1257 MOVQ R10, R12
1258 ANDQ R11, DI
1259 ANDQ AX, R12
1260 ADDQ R13, R15
1261
1262 ADDQ R9, BX
1263 ORQ R12, DI
1264 ADDQ R14, R9
1265
1266 ADDQ R15, BX
1267
1268 ADDQ R15, R9
1269 MOVQ CX, R15
1270 RORXQ $41, BX, R13
1271 RORXQ $18, BX, R14
1272 XORQ R8, R15
1273
1274 XORQ R14, R13
1275 RORXQ $14, BX, R14
1276 ANDQ BX, R15
1277 ADDQ DI, R9
1278
1279 XORQ R14, R13
1280 RORXQ $34, R9, R12
1281 XORQ R8, R15
1282 RORXQ $39, R9, R14
1283 MOVQ R9, DI
1284
1285 XORQ R12, R14
1286 RORXQ $28, R9, R12
1287 ADDQ 8*3+frame_YFER(SP), DX
1288 ORQ R11, DI
1289
1290 XORQ R12, R14
1291 MOVQ R9, R12
1292 ANDQ R10, DI
1293 ANDQ R11, R12
1294 ADDQ R13, R15
1295
1296 ADDQ DX, AX
1297 ORQ R12, DI
1298 ADDQ R14, DX
1299
1300 ADDQ R15, AX
1301
1302 ADDQ R15, DX
1303
1304 ADDQ DI, DX
1305
// Second group of four rounds in this pass: K+W for the words in Y5.
// BP advances past the 8 constants consumed by this pass.
1306 VPADDQ 1*32(BP), Y5, Y0
1307 VMOVDQU Y0, frame_YFER(SP)
1308 ADDQ $(2*32), BP
1309
1310 MOVQ BX, R15
1311 RORXQ $41, AX, R13
1312 RORXQ $18, AX, R14
1313 XORQ CX, R15
1314
1315 XORQ R14, R13
1316 RORXQ $14, AX, R14
1317 ANDQ AX, R15
1318
1319 XORQ R14, R13
1320 RORXQ $34, DX, R12
1321 XORQ CX, R15
1322 RORXQ $39, DX, R14
1323 MOVQ DX, DI
1324
1325 XORQ R12, R14
1326 RORXQ $28, DX, R12
1327 ADDQ frame_YFER(SP), R8
1328 ORQ R10, DI
1329
1330 XORQ R12, R14
1331 MOVQ DX, R12
1332 ANDQ R9, DI
1333 ANDQ R10, R12
1334 ADDQ R13, R15
1335
1336 ADDQ R8, R11
1337 ORQ R12, DI
1338 ADDQ R14, R8
1339
1340 ADDQ R15, R11
1341
1342 ADDQ R15, R8
1343 MOVQ AX, R15
1344 RORXQ $41, R11, R13
1345 RORXQ $18, R11, R14
1346 XORQ BX, R15
1347
1348 XORQ R14, R13
1349 RORXQ $14, R11, R14
1350 ANDQ R11, R15
1351 ADDQ DI, R8
1352
1353 XORQ R14, R13
1354 RORXQ $34, R8, R12
1355 XORQ BX, R15
1356 RORXQ $39, R8, R14
1357 MOVQ R8, DI
1358
1359 XORQ R12, R14
1360 RORXQ $28, R8, R12
1361 ADDQ 8*1+frame_YFER(SP), CX
1362 ORQ R9, DI
1363
1364 XORQ R12, R14
1365 MOVQ R8, R12
1366 ANDQ DX, DI
1367 ANDQ R9, R12
1368 ADDQ R13, R15
1369
1370 ADDQ CX, R10
1371 ORQ R12, DI
1372 ADDQ R14, CX
1373
1374 ADDQ R15, R10
1375
1376 ADDQ R15, CX
1377 MOVQ R11, R15
1378 RORXQ $41, R10, R13
1379 RORXQ $18, R10, R14
1380 XORQ AX, R15
1381
1382 XORQ R14, R13
1383 RORXQ $14, R10, R14
1384 ANDQ R10, R15
1385 ADDQ DI, CX
1386
1387 XORQ R14, R13
1388 RORXQ $34, CX, R12
1389 XORQ AX, R15
1390 RORXQ $39, CX, R14
1391 MOVQ CX, DI
1392
1393 XORQ R12, R14
1394 RORXQ $28, CX, R12
1395 ADDQ 8*2+frame_YFER(SP), BX
1396 ORQ DX, DI
1397
1398 XORQ R12, R14
1399 MOVQ CX, R12
1400 ANDQ R8, DI
1401 ANDQ DX, R12
1402 ADDQ R13, R15
1403
1404 ADDQ BX, R9
1405 ORQ R12, DI
1406 ADDQ R14, BX
1407
1408 ADDQ R15, R9
1409
1410 ADDQ R15, BX
1411 MOVQ R10, R15
1412 RORXQ $41, R9, R13
1413 RORXQ $18, R9, R14
1414 XORQ R11, R15
1415
1416 XORQ R14, R13
1417 RORXQ $14, R9, R14
1418 ANDQ R9, R15
1419 ADDQ DI, BX
1420
1421 XORQ R14, R13
1422 RORXQ $34, BX, R12
1423 XORQ R11, R15
1424 RORXQ $39, BX, R14
1425 MOVQ BX, DI
1426
1427 XORQ R12, R14
1428 RORXQ $28, BX, R12
1429 ADDQ 8*3+frame_YFER(SP), AX
1430 ORQ R8, DI
1431
1432 XORQ R12, R14
1433 MOVQ BX, R12
1434 ANDQ CX, DI
1435 ANDQ R8, R12
1436 ADDQ R13, R15
1437
1438 ADDQ AX, DX
1439 ORQ R12, DI
1440 ADDQ R14, AX
1441
1442 ADDQ R15, DX
1443
1444 ADDQ R15, AX
1445
1446 ADDQ DI, AX
1447
// Move the remaining scheduled words (Y6, Y7) into Y4/Y5 so the second loop2
// pass consumes them with the same code.
1448 VMOVDQU Y6, Y4
1449 VMOVDQU Y7, Y5
1450
1451 SUBQ $1, frame_SRND(SP)
1452 JNE loop2
1453
// Add the working variables into the digest; addm also leaves each sum in its
// register, so the registers hold the new state for the next block.
1454 addm(8*0(SI),AX)
1455 addm(8*1(SI),BX)
1456 addm(8*2(SI),CX)
1457 addm(8*3(SI),R8)
1458 addm(8*4(SI),DX)
1459 addm(8*5(SI),R9)
1460 addm(8*6(SI),R10)
1461 addm(8*7(SI),R11)
1462
// Restore the input pointer (DI was used as scratch), step to the next block
// and loop until the saved end pointer is reached.
1463 MOVQ frame_INP(SP), DI
1464 ADDQ $128, DI
1465 CMPQ DI, frame_INPEND(SP)
1466 JNE loop0
1467
1468done_hash:
1469 VZEROUPPER
1470 RET
View as plain text