Text file
src/crypto/sha1/sha1block_amd64.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// AVX2 version by Intel, same algorithm as code in Linux kernel:
6// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
7// Authors:
8// Ilya Albrekht <ilya.albrekht@intel.com>
9// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
10// Ronen Zohar <ronen.zohar@intel.com>
11// Chandramouli Narayanan <mouli@linux.intel.com>
12
13//go:build !purego
14
15#include "textflag.h"
16
17// SHA-1 block routine. See sha1block.go for Go equivalent.
18//
19// There are 80 rounds of 4 types:
20// - rounds 0-15 are type 1 and load data (ROUND1 macro).
21// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
22// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
23// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
24// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
25//
26// Each round loads or shuffles the data, then computes a per-round
27// function of b, c, d, and then mixes the result into and rotates the
28// five registers a, b, c, d, e holding the intermediate results.
29//
30// The register rotation is implemented by rotating the arguments to
31// the round macros instead of by explicit move instructions.
32
// LOAD(idx): fetch 32-bit message word idx from the input block at SI,
// byte-swap it with BSWAPL, and store it in slot idx of the buffer at SP.
// The swapped word is also left in R10, where MIX picks it up.
33#define LOAD(idx) \
34 MOVL ((idx)*4)(SI), R10; \
35 BSWAPL R10; \
36 MOVL R10, ((idx)*4)(SP)
37
// SHUFFLE(idx): compute schedule word idx in place in the 16-slot buffer
// at SP. All slot numbers are reduced mod 16, so slot idx&0xf is both an
// input (the oldest word) and the destination:
//   slot[idx] = rol1(slot[idx-14] ^ slot[idx-8] ^ slot[idx-3] ^ slot[idx])
// The new word is also left in R10, where MIX picks it up.
38#define SHUFFLE(idx) \
39 MOVL ((((idx)-14)&0xf)*4)(SP), R10; \
40 XORL ((((idx)-8)&0xf)*4)(SP), R10; \
41 XORL ((((idx)-3)&0xf)*4)(SP), R10; \
42 XORL (((idx)&0xf)*4)(SP), R10; \
43 ROLL $1, R10; \
44 MOVL R10, (((idx)&0xf)*4)(SP)
45
// FUNC1: per-round function for the type 1 rounds.
// Leaves R9 = d ^ (b & (c ^ d)). Arguments a and e are not referenced.
46#define FUNC1(a, b, c, d, e) \
47 MOVL c, R9; \
48 XORL d, R9; \
49 ANDL b, R9; \
50 XORL d, R9
51
// FUNC2: per-round function for the type 2 rounds.
// Leaves R9 = b ^ c ^ d. Arguments a and e are not referenced.
52#define FUNC2(a, b, c, d, e) \
53 MOVL d, R9; \
54 XORL c, R9; \
55 XORL b, R9
56
// FUNC3: per-round function for the type 3 rounds.
// Leaves R9 = (b & c) | ((b | c) & d). R8 is used as scratch and ends up
// holding (b | c) & d. Arguments a and e are not referenced.
57#define FUNC3(a, b, c, d, e) \
58 MOVL b, R9; \
59 ANDL c, R9; \
60 MOVL b, R8; \
61 ORL c, R8; \
62 ANDL d, R8; \
63 ORL R8, R9
64
// FUNC4: the type 4 rounds reuse FUNC2 unchanged (R9 = b ^ c ^ d).
65#define FUNC4 FUNC2
66
// MIX: fold one round into the working state.
//   e += R9 (round function) + R10 (schedule word) + k + rol5(a)
//   b  = rol30(b)
// R8 is used as scratch for rol5(a). Arguments c and d are not referenced.
// The macro expects a, b and e to be distinct registers other than R8-R10,
// which holds for every ROUNDn invocation in this file.
67#define MIX(a, b, c, d, e, k) \
68 MOVL a, R8; \
69 ROLL $5, R8; \
70 ADDL R9, e; \
71 ROLL $30, b; \
72 LEAL k(e)(R10*1), e; \
73 ADDL R8, e
74
// ROUND1: a type 1 round that also loads message word `index` from the
// input. FUNC1 (writes R9) and LOAD (writes R10 and the SP buffer) are
// independent of each other; MIX then consumes both.
75#define ROUND1(a, b, c, d, e, index) \
76 FUNC1(a, b, c, d, e); \
77 LOAD(index); \
78 MIX(a, b, c, d, e, 0x5A827999)
79
// ROUND1x: a type 1 round whose schedule word comes from SHUFFLE instead of
// the input. FUNC1 (writes R9) and SHUFFLE (writes R10 and the SP buffer)
// are independent of each other; MIX then consumes both.
80#define ROUND1x(a, b, c, d, e, index) \
81 FUNC1(a, b, c, d, e); \
82 SHUFFLE(index); \
83 MIX(a, b, c, d, e, 0x5A827999)
84
// ROUND2: a type 2 round. FUNC2 (writes R9) and SHUFFLE (writes R10 and the
// SP buffer) are independent of each other; MIX then consumes both.
85#define ROUND2(a, b, c, d, e, index) \
86 FUNC2(a, b, c, d, e); \
87 SHUFFLE(index); \
88 MIX(a, b, c, d, e, 0x6ED9EBA1)
89
// ROUND3: a type 3 round. FUNC3 (writes R8 and R9) and SHUFFLE (writes R10
// and the SP buffer) are independent of each other; MIX then consumes both.
90#define ROUND3(a, b, c, d, e, index) \
91 FUNC3(a, b, c, d, e); \
92 SHUFFLE(index); \
93 MIX(a, b, c, d, e, 0x8F1BBCDC)
94
// ROUND4: a type 4 round. FUNC4 (writes R9) and SHUFFLE (writes R10 and the
// SP buffer) are independent of each other; MIX then consumes both.
95#define ROUND4(a, b, c, d, e, index) \
96 FUNC4(a, b, c, d, e); \
97 SHUFFLE(index); \
98 MIX(a, b, c, d, e, 0xCA62C1D6)
99
// blockAMD64 processes every complete 64-byte block of p and updates the
// five 32-bit state words that dig points to.
//
// Frame: 64 bytes of locals, used by LOAD/SHUFFLE as a 16-word buffer at SP;
// 32 bytes of arguments (dig at +0, p_base at +8, p_len at +16).
//
// Register use:
//   SI          current input block
//   DI          end of the last complete block
//   AX,BX,CX,DX,BP   working state a,b,c,d,e (roles rotate every round)
//   R11-R15     copy of the state on entry to the current block
//   R8,R9,R10   scratch used by the round macros
100TEXT ·blockAMD64(SB),NOSPLIT,$64-32
101 MOVQ dig+0(FP), BP
102 MOVQ p_base+8(FP), SI
103 MOVQ p_len+16(FP), DX
	// Round the length down to a multiple of 64 bytes.
104 SHRQ $6, DX
105 SHLQ $6, DX
106
	// DI = one past the last complete block.
107 LEAQ (SI)(DX*1), DI
	// Load the five state words. BP is loaded last because it is also the
	// base pointer for these loads; dig is re-read from the frame at `end`.
108 MOVL (0*4)(BP), AX
109 MOVL (1*4)(BP), BX
110 MOVL (2*4)(BP), CX
111 MOVL (3*4)(BP), DX
112 MOVL (4*4)(BP), BP
113
	// Nothing to do when there is no complete block.
114 CMPQ SI, DI
115 JEQ end
116
117loop:
	// Remember the incoming state; it is added back after round 79.
118 MOVL AX, R11
119 MOVL BX, R12
120 MOVL CX, R13
121 MOVL DX, R14
122 MOVL BP, R15
123
	// Rounds 0-15: type 1, loading message words from the input.
	// The register arguments rotate by one position each round.
124 ROUND1(AX, BX, CX, DX, BP, 0)
125 ROUND1(BP, AX, BX, CX, DX, 1)
126 ROUND1(DX, BP, AX, BX, CX, 2)
127 ROUND1(CX, DX, BP, AX, BX, 3)
128 ROUND1(BX, CX, DX, BP, AX, 4)
129 ROUND1(AX, BX, CX, DX, BP, 5)
130 ROUND1(BP, AX, BX, CX, DX, 6)
131 ROUND1(DX, BP, AX, BX, CX, 7)
132 ROUND1(CX, DX, BP, AX, BX, 8)
133 ROUND1(BX, CX, DX, BP, AX, 9)
134 ROUND1(AX, BX, CX, DX, BP, 10)
135 ROUND1(BP, AX, BX, CX, DX, 11)
136 ROUND1(DX, BP, AX, BX, CX, 12)
137 ROUND1(CX, DX, BP, AX, BX, 13)
138 ROUND1(BX, CX, DX, BP, AX, 14)
139 ROUND1(AX, BX, CX, DX, BP, 15)
140
	// Rounds 16-19: type 1, schedule words derived by SHUFFLE.
141 ROUND1x(BP, AX, BX, CX, DX, 16)
142 ROUND1x(DX, BP, AX, BX, CX, 17)
143 ROUND1x(CX, DX, BP, AX, BX, 18)
144 ROUND1x(BX, CX, DX, BP, AX, 19)
145
	// Rounds 20-39: type 2.
146 ROUND2(AX, BX, CX, DX, BP, 20)
147 ROUND2(BP, AX, BX, CX, DX, 21)
148 ROUND2(DX, BP, AX, BX, CX, 22)
149 ROUND2(CX, DX, BP, AX, BX, 23)
150 ROUND2(BX, CX, DX, BP, AX, 24)
151 ROUND2(AX, BX, CX, DX, BP, 25)
152 ROUND2(BP, AX, BX, CX, DX, 26)
153 ROUND2(DX, BP, AX, BX, CX, 27)
154 ROUND2(CX, DX, BP, AX, BX, 28)
155 ROUND2(BX, CX, DX, BP, AX, 29)
156 ROUND2(AX, BX, CX, DX, BP, 30)
157 ROUND2(BP, AX, BX, CX, DX, 31)
158 ROUND2(DX, BP, AX, BX, CX, 32)
159 ROUND2(CX, DX, BP, AX, BX, 33)
160 ROUND2(BX, CX, DX, BP, AX, 34)
161 ROUND2(AX, BX, CX, DX, BP, 35)
162 ROUND2(BP, AX, BX, CX, DX, 36)
163 ROUND2(DX, BP, AX, BX, CX, 37)
164 ROUND2(CX, DX, BP, AX, BX, 38)
165 ROUND2(BX, CX, DX, BP, AX, 39)
166
	// Rounds 40-59: type 3.
167 ROUND3(AX, BX, CX, DX, BP, 40)
168 ROUND3(BP, AX, BX, CX, DX, 41)
169 ROUND3(DX, BP, AX, BX, CX, 42)
170 ROUND3(CX, DX, BP, AX, BX, 43)
171 ROUND3(BX, CX, DX, BP, AX, 44)
172 ROUND3(AX, BX, CX, DX, BP, 45)
173 ROUND3(BP, AX, BX, CX, DX, 46)
174 ROUND3(DX, BP, AX, BX, CX, 47)
175 ROUND3(CX, DX, BP, AX, BX, 48)
176 ROUND3(BX, CX, DX, BP, AX, 49)
177 ROUND3(AX, BX, CX, DX, BP, 50)
178 ROUND3(BP, AX, BX, CX, DX, 51)
179 ROUND3(DX, BP, AX, BX, CX, 52)
180 ROUND3(CX, DX, BP, AX, BX, 53)
181 ROUND3(BX, CX, DX, BP, AX, 54)
182 ROUND3(AX, BX, CX, DX, BP, 55)
183 ROUND3(BP, AX, BX, CX, DX, 56)
184 ROUND3(DX, BP, AX, BX, CX, 57)
185 ROUND3(CX, DX, BP, AX, BX, 58)
186 ROUND3(BX, CX, DX, BP, AX, 59)
187
	// Rounds 60-79: type 4.
188 ROUND4(AX, BX, CX, DX, BP, 60)
189 ROUND4(BP, AX, BX, CX, DX, 61)
190 ROUND4(DX, BP, AX, BX, CX, 62)
191 ROUND4(CX, DX, BP, AX, BX, 63)
192 ROUND4(BX, CX, DX, BP, AX, 64)
193 ROUND4(AX, BX, CX, DX, BP, 65)
194 ROUND4(BP, AX, BX, CX, DX, 66)
195 ROUND4(DX, BP, AX, BX, CX, 67)
196 ROUND4(CX, DX, BP, AX, BX, 68)
197 ROUND4(BX, CX, DX, BP, AX, 69)
198 ROUND4(AX, BX, CX, DX, BP, 70)
199 ROUND4(BP, AX, BX, CX, DX, 71)
200 ROUND4(DX, BP, AX, BX, CX, 72)
201 ROUND4(CX, DX, BP, AX, BX, 73)
202 ROUND4(BX, CX, DX, BP, AX, 74)
203 ROUND4(AX, BX, CX, DX, BP, 75)
204 ROUND4(BP, AX, BX, CX, DX, 76)
205 ROUND4(DX, BP, AX, BX, CX, 77)
206 ROUND4(CX, DX, BP, AX, BX, 78)
207 ROUND4(BX, CX, DX, BP, AX, 79)
208
	// After 80 rounds the rotation is back at (AX,BX,CX,DX,BP); add the
	// state saved at the top of the loop.
209 ADDL R11, AX
210 ADDL R12, BX
211 ADDL R13, CX
212 ADDL R14, DX
213 ADDL R15, BP
214
	// Advance to the next block and continue while SI < DI (unsigned).
215 ADDQ $64, SI
216 CMPQ SI, DI
217 JB loop
218
219end:
	// BP was reused for the state, so fetch dig again before storing.
220 MOVQ dig+0(FP), DI
221 MOVL AX, (0*4)(DI)
222 MOVL BX, (1*4)(DI)
223 MOVL CX, (2*4)(DI)
224 MOVL DX, (3*4)(DI)
225 MOVL BP, (4*4)(DI)
226 RET
227
228
229// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
230// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
231// From http://software.intel.com/en-us/articles
232// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
233// This implementation is 2x unrolled, and interleaves vector instructions,
234// used to precompute W, with scalar computation of current round
235// for optimal scheduling.
236
237// Trivial helper macros.
// UPDATE_HASH: fold five working registers into the five 32-bit words at R9.
// Each word in memory is added into the matching register and the sum is
// written back, so the registers and memory both hold the new value.
238#define UPDATE_HASH(HASH0,HASH1,HASH2,HASH3,HASH4) \
239 ADDL (R9), HASH0 \
240 MOVL HASH0, (R9) \
241 ADDL 4(R9), HASH1 \
242 MOVL HASH1, 4(R9) \
243 ADDL 8(R9), HASH2 \
244 MOVL HASH2, 8(R9) \
245 ADDL 12(R9), HASH3 \
246 MOVL HASH3, 12(R9) \
247 ADDL 16(R9), HASH4 \
248 MOVL HASH4, 16(R9)
249
250
251
252// Helper macros for PRECALC, which does precomputations
// PRECALC_0: load 16 bytes from OFFSET(R10) into X0.
253#define PRECALC_0(OFFSET) \
254 VMOVDQU OFFSET(R10),X0
255
// PRECALC_1: insert 16 bytes from OFFSET(R13) into the upper 128-bit lane
// of Y0, keeping the lower lane.
256#define PRECALC_1(OFFSET) \
257 VINSERTI128 $1, OFFSET(R13), Y0, Y0
258
// PRECALC_2: byte-shuffle Y0 using the mask in Y10; result goes to YREG.
259#define PRECALC_2(YREG) \
260 VPSHUFB Y10, Y0, YREG
261
// PRECALC_4: Y0 = YREG + the 32 bytes at K_OFFSET(R8), as packed 32-bit adds.
262#define PRECALC_4(YREG,K_OFFSET) \
263 VPADDD K_OFFSET(R8), YREG, Y0
264
// PRECALC_7: store Y0 at byte offset OFFSET*2 in the buffer at R14.
265#define PRECALC_7(OFFSET) \
266 VMOVDQU Y0, (OFFSET*2)(R14)
267
268
269// Message scheduling pre-compute for rounds 0-15
270// R13 is a pointer to even 64-byte block
271// R10 is a pointer to odd 64-byte block
272// R14 is a pointer to temp buffer
273// X0/Y0 is used as temp register
274// YREG is clobbered as part of computation
275// OFFSET chooses 16 byte chunk within a block
276// R8 is a pointer to constants block
277// K_OFFSET chooses K constants relevant to this round
278// Y10 holds swap mask
// The chunk from R10 lands in the low lane and the chunk from R13 in the
// high lane of each 32-byte row stored at (OFFSET*2)(R14).
279#define PRECALC_00_15(OFFSET,YREG) \
280 PRECALC_0(OFFSET) \
281 PRECALC_1(OFFSET) \
282 PRECALC_2(YREG) \
283 PRECALC_4(YREG,0x0) \
284 PRECALC_7(OFFSET)
285
286
287// Helper macros for PRECALC_16_31
// The steps are split into separate macros so that the CALC_n round macros
// can issue them one at a time between scalar instructions. They must run
// in numeric order: each one consumes Y0, Y9 or REG as left by the previous.
//
// PRECALC_16: REG = VPALIGNR $8 of REG_SUB_16 and REG_SUB_12;
//             Y0 = REG_SUB_4 byte-shifted right by 4 within each lane.
288#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
289 VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
290 VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
291
// PRECALC_17: REG ^= REG_SUB_8; Y0 ^= REG_SUB_16.
292#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
293 VPXOR REG_SUB_8, REG, REG \
294 VPXOR REG_SUB_16, Y0, Y0
295
// PRECALC_18: REG ^= Y0; Y9 = REG byte-shifted left by 12 within each lane.
296#define PRECALC_18(REG) \
297 VPXOR Y0, REG, REG \
298 VPSLLDQ $12, REG, Y9
299
// PRECALC_19: first half of a rotate-left-by-1 of REG:
// Y0 = REG << 1, REG = REG >> 31 (per 32-bit element).
300#define PRECALC_19(REG) \
301 VPSLLD $1, REG, Y0 \
302 VPSRLD $31, REG, REG
303
// PRECALC_20: Y0 |= REG (completes the rotate); REG = Y9 << 2.
304#define PRECALC_20(REG) \
305 VPOR REG, Y0, Y0 \
306 VPSLLD $2, Y9, REG
307
// PRECALC_21: Y9 >>= 30; Y0 ^= REG.
308#define PRECALC_21(REG) \
309 VPSRLD $30, Y9, Y9 \
310 VPXOR REG, Y0, Y0
311
// PRECALC_23: REG = Y0 ^ Y9 (the finished schedule words), then
// Y0 = REG + constants at K_OFFSET(R8), stored at OFFSET(R14).
312#define PRECALC_23(REG,K_OFFSET,OFFSET) \
313 VPXOR Y9, Y0, REG \
314 VPADDD K_OFFSET(R8), REG, Y0 \
315 VMOVDQU Y0, (OFFSET)(R14)
316
317// Message scheduling pre-compute for rounds 16-31
318// calculating last 32 w[i] values in 8 XMM registers
319// pre-calculate K+w[i] values and store to mem
320// for later load by ALU add instruction.
321// "brute force" vectorization for rounds 16-31 only
322// due to w[i]->w[i-3] dependency.
323// writes REG; the four REG_SUB_* inputs are only read
324// uses Y0 and Y9 as temp registers
325// As always, R8 is a pointer to constants block
326// and R14 is a pointer to temp buffer
327#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
328 PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
329 PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
330 PRECALC_18(REG) \
331 PRECALC_19(REG) \
332 PRECALC_20(REG) \
333 PRECALC_21(REG) \
334 PRECALC_23(REG,K_OFFSET,OFFSET)
335
336
337// Helper macros for PRECALC_32_79
// As with the 16-31 helpers, these are separate macros so the CALC_n round
// macros can issue them one at a time. They must run in numeric order.
//
// PRECALC_32: Y0 = VPALIGNR $8 of REG_SUB_8 and REG_SUB_4.
338#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
339 VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
340
// PRECALC_33: REG ^= REG_SUB_28.
341#define PRECALC_33(REG_SUB_28,REG) \
342 VPXOR REG_SUB_28, REG, REG
343
// PRECALC_34: Y0 ^= REG_SUB_16.
344#define PRECALC_34(REG_SUB_16) \
345 VPXOR REG_SUB_16, Y0, Y0
346
// PRECALC_35: REG ^= Y0.
347#define PRECALC_35(REG) \
348 VPXOR Y0, REG, REG
349
// PRECALC_36: first half of a rotate-left-by-2 of REG: Y0 = REG << 2.
350#define PRECALC_36(REG) \
351 VPSLLD $2, REG, Y0
352
// PRECALC_37: REG = (REG >> 30) | Y0, completing the rotate.
353#define PRECALC_37(REG) \
354 VPSRLD $30, REG, REG \
355 VPOR REG, Y0, REG
356
// PRECALC_39: Y0 = REG + constants at K_OFFSET(R8), stored at OFFSET(R14).
357#define PRECALC_39(REG,K_OFFSET,OFFSET) \
358 VPADDD K_OFFSET(R8), REG, Y0 \
359 VMOVDQU Y0, (OFFSET)(R14)
360
361// Message scheduling pre-compute for rounds 32-79
362// In SHA-1 specification we have:
363// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
364// Which is the same as:
365// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
366// This allows for more efficient vectorization,
367// since w[i]->w[i-3] dependency is broken
// REG holds w[i-32] on entry and is overwritten with w[i]; Y0 is scratch.
// K+w[i] is stored at OFFSET(R14) using the constants at K_OFFSET(R8).
368#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
369 PRECALC_32(REG_SUB_8,REG_SUB_4) \
370 PRECALC_33(REG_SUB_28,REG) \
371 PRECALC_34(REG_SUB_16) \
372 PRECALC_35(REG) \
373 PRECALC_36(REG) \
374 PRECALC_37(REG) \
375 PRECALC_39(REG,K_OFFSET,OFFSET)
376
// PRECALC: run the whole schedule pre-computation back to back, with no
// scalar rounds interleaved. It fills 20 consecutive 32-byte rows
// (offsets 0x0 through 0x260) of the buffer at R14.
// The K_OFFSET column steps 0x0 -> 0x20 -> 0x40 -> 0x60 every five rows,
// i.e. every 20 rounds.
// The Y registers are reused cyclically: each PRECALC_32_79 line overwrites
// the register holding the oldest row with the newest one.
377#define PRECALC \
378 PRECALC_00_15(0,Y15) \
379 PRECALC_00_15(0x10,Y14) \
380 PRECALC_00_15(0x20,Y13) \
381 PRECALC_00_15(0x30,Y12) \
382 PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
383 PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
384 PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
385 PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
386 PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
387 PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
388 PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
389 PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
390 PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
391 PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
392 PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
393 PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
394 PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
395 PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
396 PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
397 PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
398
399// Macros calculating individual rounds have general form
400// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
401// CALC_ROUND_{PRE,POST} macros follow
402
// CALC_F1_PRE: first half of a round whose successor uses F1.
//   REG_E += word at OFFSET(R15)      (precomputed K+w)
//   BP     = ~REG_A & REG_C           (one term of the next round's F1)
//   REG_E += REG_B                    (F computed by the previous round)
//   R12    = REG_A rotated right by 27 (i.e. left by 5)
//   REG_B  = REG_A rotated right by 2  (i.e. left by 30)
// REG_B is an input (previous F) and is then overwritten.
403#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
404 ADDL OFFSET(R15),REG_E \
405 ANDNL REG_C,REG_A,BP \
406 LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
407 RORXL $0x1b, REG_A, R12 \
408 RORXL $2, REG_A, REG_B // for next round
409
410// Calculate F for the next round
// CALC_F1_POST: second half. Consumes BP and R12 left by CALC_F1_PRE;
// REG_A is overwritten with F1 and REG_E receives the rotated value in R12.
411#define CALC_F1_POST(REG_A,REG_B,REG_E) \
412 ANDL REG_B,REG_A \ // b&c
413 XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
414 LEAL (REG_E)(R12*1), REG_E // E += A >>> 5
415
416
417// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
// CALC_0: first round of the first block. There is no previous round to
// supply F, so the prologue computes it here:
//   BX = (SI & DI) ^ (~SI & AX), using SI before it is rotated right by 2.
// The round itself then reads K+w at 0x0(R15) and starts loading the next
// input chunk with PRECALC_0(0x80).
418#define CALC_0 \
419 MOVL SI, BX \ // Precalculating first round
420 RORXL $2, SI, SI \
421 ANDNL AX, BX, BP \
422 ANDL DI, BX \
423 XORL BP, BX \
424 CALC_F1_PRE(0x0,CX,BX,DI,DX) \
425 PRECALC_0(0x80) \
426 CALC_F1_POST(CX,SI,DX)
427
// CALC_1 .. CALC_18: rounds 1-18 of the first block.
// Each macro is one round: CALC_F1_PRE adds the K+w word at the given R15
// offset plus the previous round's F, an optional PRECALC_* step advances
// the vector schedule computation, and CALC_F1_POST prepares F1 for the
// following round. The offsets cover the low 16 bytes of each 32-byte row
// (0x0-0xc, 0x20-0x2c, ...), and the register arguments rotate every round.
// The PRECALC steps read input at 0x80 and beyond from R10/R13 —
// presumably the next pair of blocks; confirm against the caller.
428#define CALC_1 \
429 CALC_F1_PRE(0x4,DX,CX,SI,AX) \
430 PRECALC_1(0x80) \
431 CALC_F1_POST(DX,BX,AX)
432
433#define CALC_2 \
434 CALC_F1_PRE(0x8,AX,DX,BX,DI) \
435 PRECALC_2(Y15) \
436 CALC_F1_POST(AX,CX,DI)
437
438#define CALC_3 \
439 CALC_F1_PRE(0xc,DI,AX,CX,SI) \
440 CALC_F1_POST(DI,DX,SI)
441
442#define CALC_4 \
443 CALC_F1_PRE(0x20,SI,DI,DX,BX) \
444 PRECALC_4(Y15,0x0) \
445 CALC_F1_POST(SI,AX,BX)
446
447#define CALC_5 \
448 CALC_F1_PRE(0x24,BX,SI,AX,CX) \
449 CALC_F1_POST(BX,DI,CX)
450
451#define CALC_6 \
452 CALC_F1_PRE(0x28,CX,BX,DI,DX) \
453 CALC_F1_POST(CX,SI,DX)
454
455#define CALC_7 \
456 CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
457 PRECALC_7(0x0) \
458 CALC_F1_POST(DX,BX,AX)
459
460#define CALC_8 \
461 CALC_F1_PRE(0x40,AX,DX,BX,DI) \
462 PRECALC_0(0x90) \
463 CALC_F1_POST(AX,CX,DI)
464
465#define CALC_9 \
466 CALC_F1_PRE(0x44,DI,AX,CX,SI) \
467 PRECALC_1(0x90) \
468 CALC_F1_POST(DI,DX,SI)
469
470#define CALC_10 \
471 CALC_F1_PRE(0x48,SI,DI,DX,BX) \
472 PRECALC_2(Y14) \
473 CALC_F1_POST(SI,AX,BX)
474
475#define CALC_11 \
476 CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
477 CALC_F1_POST(BX,DI,CX)
478
479#define CALC_12 \
480 CALC_F1_PRE(0x60,CX,BX,DI,DX) \
481 PRECALC_4(Y14,0x0) \
482 CALC_F1_POST(CX,SI,DX)
483
484#define CALC_13 \
485 CALC_F1_PRE(0x64,DX,CX,SI,AX) \
486 CALC_F1_POST(DX,BX,AX)
487
488#define CALC_14 \
489 CALC_F1_PRE(0x68,AX,DX,BX,DI) \
490 CALC_F1_POST(AX,CX,DI)
491
492#define CALC_15 \
493 CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
494 PRECALC_7(0x10) \
495 CALC_F1_POST(DI,DX,SI)
496
497#define CALC_16 \
498 CALC_F1_PRE(0x80,SI,DI,DX,BX) \
499 PRECALC_0(0xa0) \
500 CALC_F1_POST(SI,AX,BX)
501
502#define CALC_17 \
503 CALC_F1_PRE(0x84,BX,SI,AX,CX) \
504 PRECALC_1(0xa0) \
505 CALC_F1_POST(BX,DI,CX)
506
507#define CALC_18 \
508 CALC_F1_PRE(0x88,CX,BX,DI,DX) \
509 PRECALC_2(Y13) \
510 CALC_F1_POST(CX,SI,DX)
511
512
// CALC_F2_PRE: first half of a round whose successor uses F2.
//   REG_E += word at OFFSET(R15)      (precomputed K+w)
//   REG_E += REG_B                    (F computed by the previous round)
//   R12    = REG_A rotated right by 27 (i.e. left by 5)
//   REG_B  = REG_A rotated right by 2  (i.e. left by 30)
513#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
514 ADDL OFFSET(R15),REG_E \
515 LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
516 RORXL $0x1b, REG_A, R12 \
517 RORXL $2, REG_A, REG_B // for next round
518
// CALC_F2_POST: second half.
//   REG_A  = REG_A ^ REG_B ^ REG_C    (F2 for the next round)
//   REG_E += R12                      (rotated value from CALC_F2_PRE)
519#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
520 XORL REG_B, REG_A \
521 ADDL R12, REG_E \
522 XORL REG_C, REG_A
523
// CALC_19 .. CALC_38: rounds 19-38 of the first block.
// These use the F2 helpers because each round prepares F for the round that
// follows it (so round 19 already prepares an F2 value).
// From CALC_32 on, the interleaved vector steps switch from the
// PRECALC_00_15 pieces to the PRECALC_16_31 pieces (PRECALC_16..21).
524#define CALC_19 \
525 CALC_F2_PRE(0x8c,DX,CX,AX) \
526 CALC_F2_POST(DX,BX,SI,AX)
527
528#define CALC_20 \
529 CALC_F2_PRE(0xa0,AX,DX,DI) \
530 PRECALC_4(Y13,0x0) \
531 CALC_F2_POST(AX,CX,BX,DI)
532
533#define CALC_21 \
534 CALC_F2_PRE(0xa4,DI,AX,SI) \
535 CALC_F2_POST(DI,DX,CX,SI)
536
537#define CALC_22 \
538 CALC_F2_PRE(0xa8,SI,DI,BX) \
539 CALC_F2_POST(SI,AX,DX,BX)
540
541#define CALC_23 \
542 CALC_F2_PRE(0xac,BX,SI,CX) \
543 PRECALC_7(0x20) \
544 CALC_F2_POST(BX,DI,AX,CX)
545
546#define CALC_24 \
547 CALC_F2_PRE(0xc0,CX,BX,DX) \
548 PRECALC_0(0xb0) \
549 CALC_F2_POST(CX,SI,DI,DX)
550
551#define CALC_25 \
552 CALC_F2_PRE(0xc4,DX,CX,AX) \
553 PRECALC_1(0xb0) \
554 CALC_F2_POST(DX,BX,SI,AX)
555
556#define CALC_26 \
557 CALC_F2_PRE(0xc8,AX,DX,DI) \
558 PRECALC_2(Y12) \
559 CALC_F2_POST(AX,CX,BX,DI)
560
561#define CALC_27 \
562 CALC_F2_PRE(0xcc,DI,AX,SI) \
563 CALC_F2_POST(DI,DX,CX,SI)
564
565#define CALC_28 \
566 CALC_F2_PRE(0xe0,SI,DI,BX) \
567 PRECALC_4(Y12,0x0) \
568 CALC_F2_POST(SI,AX,DX,BX)
569
570#define CALC_29 \
571 CALC_F2_PRE(0xe4,BX,SI,CX) \
572 CALC_F2_POST(BX,DI,AX,CX)
573
574#define CALC_30 \
575 CALC_F2_PRE(0xe8,CX,BX,DX) \
576 CALC_F2_POST(CX,SI,DI,DX)
577
578#define CALC_31 \
579 CALC_F2_PRE(0xec,DX,CX,AX) \
580 PRECALC_7(0x30) \
581 CALC_F2_POST(DX,BX,SI,AX)
582
583#define CALC_32 \
584 CALC_F2_PRE(0x100,AX,DX,DI) \
585 PRECALC_16(Y15,Y14,Y12,Y8) \
586 CALC_F2_POST(AX,CX,BX,DI)
587
588#define CALC_33 \
589 CALC_F2_PRE(0x104,DI,AX,SI) \
590 PRECALC_17(Y15,Y13,Y8) \
591 CALC_F2_POST(DI,DX,CX,SI)
592
593#define CALC_34 \
594 CALC_F2_PRE(0x108,SI,DI,BX) \
595 PRECALC_18(Y8) \
596 CALC_F2_POST(SI,AX,DX,BX)
597
598#define CALC_35 \
599 CALC_F2_PRE(0x10c,BX,SI,CX) \
600 PRECALC_19(Y8) \
601 CALC_F2_POST(BX,DI,AX,CX)
602
603#define CALC_36 \
604 CALC_F2_PRE(0x120,CX,BX,DX) \
605 PRECALC_20(Y8) \
606 CALC_F2_POST(CX,SI,DI,DX)
607
608#define CALC_37 \
609 CALC_F2_PRE(0x124,DX,CX,AX) \
610 PRECALC_21(Y8) \
611 CALC_F2_POST(DX,BX,SI,AX)
612
613#define CALC_38 \
614 CALC_F2_PRE(0x128,AX,DX,DI) \
615 CALC_F2_POST(AX,CX,BX,DI)
616
617
// CALC_F3_PRE: first half of a round whose successor uses F3.
//   REG_E += word at OFFSET(R15)      (precomputed K+w)
618#define CALC_F3_PRE(OFFSET,REG_E) \
619 ADDL OFFSET(R15),REG_E
620
// CALC_F3_POST: second half.
//   REG_E += REG_TB                   (F computed by the previous round)
//   BP     = (REG_B | REG_A) & REG_C
//   R12    = REG_A rotated right by 27 (i.e. left by 5)
//   REG_TB = REG_A rotated right by 2  (i.e. left by 30)
//   REG_A  = (REG_A & REG_B) | BP     (F3 for the next round)
//   REG_E += R12
// REG_TB is an input (previous F) and is then overwritten.
621#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
622 LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
623 MOVL REG_B, BP \
624 ORL REG_A, BP \
625 RORXL $0x1b, REG_A, R12 \
626 RORXL $2, REG_A, REG_TB \
627 ANDL REG_C, BP \ // Calculate F for the next round
628 ANDL REG_B, REG_A \
629 ORL BP, REG_A \
630 ADDL R12, REG_E
631
// CALC_39 .. CALC_46: rounds 39-46 of the first block, using the F3
// helpers. CALC_39 finishes the Y8 row with PRECALC_23; CALC_40..45 run the
// PRECALC_16..21 steps for the Y7 row.
632#define CALC_39 \
633 CALC_F3_PRE(0x12c,SI) \
634 PRECALC_23(Y8,0x0,0x80) \
635 CALC_F3_POST(DI,DX,CX,SI,AX)
636
637#define CALC_40 \
638 CALC_F3_PRE(0x140,BX) \
639 PRECALC_16(Y14,Y13,Y8,Y7) \
640 CALC_F3_POST(SI,AX,DX,BX,DI)
641
642#define CALC_41 \
643 CALC_F3_PRE(0x144,CX) \
644 PRECALC_17(Y14,Y12,Y7) \
645 CALC_F3_POST(BX,DI,AX,CX,SI)
646
647#define CALC_42 \
648 CALC_F3_PRE(0x148,DX) \
649 PRECALC_18(Y7) \
650 CALC_F3_POST(CX,SI,DI,DX,BX)
651
652#define CALC_43 \
653 CALC_F3_PRE(0x14c,AX) \
654 PRECALC_19(Y7) \
655 CALC_F3_POST(DX,BX,SI,AX,CX)
656
657#define CALC_44 \
658 CALC_F3_PRE(0x160,DI) \
659 PRECALC_20(Y7) \
660 CALC_F3_POST(AX,CX,BX,DI,DX)
661
662#define CALC_45 \
663 CALC_F3_PRE(0x164,SI) \
664 PRECALC_21(Y7) \
665 CALC_F3_POST(DI,DX,CX,SI,AX)
666
667#define CALC_46 \
668 CALC_F3_PRE(0x168,BX) \
669 CALC_F3_POST(SI,AX,DX,BX,DI)
670
// CALC_47: round 47 of the first block, interleaved with the last step of
// the vector schedule computation for the Y7 row.
// The vector step is PRECALC_23(Y7,0x20,0xa0): Y7 = Y0 ^ Y9, then
// Y0 = Y7 + constants at 0x20(R8), stored at 0xa0(R14). It is written via
// the helper, matching CALC_39, CALC_55 and CALC_63, instead of being
// expanded by hand; the emitted instructions are the same.
671#define CALC_47 \
672 CALC_F3_PRE(0x16c,CX) \
673 PRECALC_23(Y7,0x20,0xa0) \
676 CALC_F3_POST(BX,DI,AX,CX,SI)
677
// CALC_48 .. CALC_59: rounds 48-59 of the first block.
// CALC_48..58 use the F3 helpers; CALC_59 switches to the F2 helpers
// because it prepares F for round 60. The interleaved vector steps compute
// the Y5 row (finished by PRECALC_23 in CALC_55) and start the Y3 row.
678#define CALC_48 \
679 CALC_F3_PRE(0x180,DX) \
680 PRECALC_16(Y13,Y12,Y7,Y5) \
681 CALC_F3_POST(CX,SI,DI,DX,BX)
682
683#define CALC_49 \
684 CALC_F3_PRE(0x184,AX) \
685 PRECALC_17(Y13,Y8,Y5) \
686 CALC_F3_POST(DX,BX,SI,AX,CX)
687
688#define CALC_50 \
689 CALC_F3_PRE(0x188,DI) \
690 PRECALC_18(Y5) \
691 CALC_F3_POST(AX,CX,BX,DI,DX)
692
693#define CALC_51 \
694 CALC_F3_PRE(0x18c,SI) \
695 PRECALC_19(Y5) \
696 CALC_F3_POST(DI,DX,CX,SI,AX)
697
698#define CALC_52 \
699 CALC_F3_PRE(0x1a0,BX) \
700 PRECALC_20(Y5) \
701 CALC_F3_POST(SI,AX,DX,BX,DI)
702
703#define CALC_53 \
704 CALC_F3_PRE(0x1a4,CX) \
705 PRECALC_21(Y5) \
706 CALC_F3_POST(BX,DI,AX,CX,SI)
707
708#define CALC_54 \
709 CALC_F3_PRE(0x1a8,DX) \
710 CALC_F3_POST(CX,SI,DI,DX,BX)
711
712#define CALC_55 \
713 CALC_F3_PRE(0x1ac,AX) \
714 PRECALC_23(Y5,0x20,0xc0) \
715 CALC_F3_POST(DX,BX,SI,AX,CX)
716
717#define CALC_56 \
718 CALC_F3_PRE(0x1c0,DI) \
719 PRECALC_16(Y12,Y8,Y5,Y3) \
720 CALC_F3_POST(AX,CX,BX,DI,DX)
721
722#define CALC_57 \
723 CALC_F3_PRE(0x1c4,SI) \
724 PRECALC_17(Y12,Y7,Y3) \
725 CALC_F3_POST(DI,DX,CX,SI,AX)
726
727#define CALC_58 \
728 CALC_F3_PRE(0x1c8,BX) \
729 PRECALC_18(Y3) \
730 CALC_F3_POST(SI,AX,DX,BX,DI)
731
732#define CALC_59 \
733 CALC_F2_PRE(0x1cc,BX,SI,CX) \
734 PRECALC_19(Y3) \
735 CALC_F2_POST(BX,DI,AX,CX)
736
// CALC_60 .. CALC_78: rounds 60-78 of the first block, all using the F2
// helpers. CALC_60..63 finish the Y3 row (PRECALC_20, 21, 23); from CALC_64
// on the interleaved vector steps are the PRECALC_32_79 pieces
// (PRECALC_32..39), one row per eight rounds.
737#define CALC_60 \
738 CALC_F2_PRE(0x1e0,CX,BX,DX) \
739 PRECALC_20(Y3) \
740 CALC_F2_POST(CX,SI,DI,DX)
741
742#define CALC_61 \
743 CALC_F2_PRE(0x1e4,DX,CX,AX) \
744 PRECALC_21(Y3) \
745 CALC_F2_POST(DX,BX,SI,AX)
746
747#define CALC_62 \
748 CALC_F2_PRE(0x1e8,AX,DX,DI) \
749 CALC_F2_POST(AX,CX,BX,DI)
750
751#define CALC_63 \
752 CALC_F2_PRE(0x1ec,DI,AX,SI) \
753 PRECALC_23(Y3,0x20,0xe0) \
754 CALC_F2_POST(DI,DX,CX,SI)
755
756#define CALC_64 \
757 CALC_F2_PRE(0x200,SI,DI,BX) \
758 PRECALC_32(Y5,Y3) \
759 CALC_F2_POST(SI,AX,DX,BX)
760
761#define CALC_65 \
762 CALC_F2_PRE(0x204,BX,SI,CX) \
763 PRECALC_33(Y14,Y15) \
764 CALC_F2_POST(BX,DI,AX,CX)
765
766#define CALC_66 \
767 CALC_F2_PRE(0x208,CX,BX,DX) \
768 PRECALC_34(Y8) \
769 CALC_F2_POST(CX,SI,DI,DX)
770
771#define CALC_67 \
772 CALC_F2_PRE(0x20c,DX,CX,AX) \
773 PRECALC_35(Y15) \
774 CALC_F2_POST(DX,BX,SI,AX)
775
776#define CALC_68 \
777 CALC_F2_PRE(0x220,AX,DX,DI) \
778 PRECALC_36(Y15) \
779 CALC_F2_POST(AX,CX,BX,DI)
780
781#define CALC_69 \
782 CALC_F2_PRE(0x224,DI,AX,SI) \
783 PRECALC_37(Y15) \
784 CALC_F2_POST(DI,DX,CX,SI)
785
786#define CALC_70 \
787 CALC_F2_PRE(0x228,SI,DI,BX) \
788 CALC_F2_POST(SI,AX,DX,BX)
789
790#define CALC_71 \
791 CALC_F2_PRE(0x22c,BX,SI,CX) \
792 PRECALC_39(Y15,0x20,0x100) \
793 CALC_F2_POST(BX,DI,AX,CX)
794
795#define CALC_72 \
796 CALC_F2_PRE(0x240,CX,BX,DX) \
797 PRECALC_32(Y3,Y15) \
798 CALC_F2_POST(CX,SI,DI,DX)
799
800#define CALC_73 \
801 CALC_F2_PRE(0x244,DX,CX,AX) \
802 PRECALC_33(Y13,Y14) \
803 CALC_F2_POST(DX,BX,SI,AX)
804
805#define CALC_74 \
806 CALC_F2_PRE(0x248,AX,DX,DI) \
807 PRECALC_34(Y7) \
808 CALC_F2_POST(AX,CX,BX,DI)
809
810#define CALC_75 \
811 CALC_F2_PRE(0x24c,DI,AX,SI) \
812 PRECALC_35(Y14) \
813 CALC_F2_POST(DI,DX,CX,SI)
814
815#define CALC_76 \
816 CALC_F2_PRE(0x260,SI,DI,BX) \
817 PRECALC_36(Y14) \
818 CALC_F2_POST(SI,AX,DX,BX)
819
820#define CALC_77 \
821 CALC_F2_PRE(0x264,BX,SI,CX) \
822 PRECALC_37(Y14) \
823 CALC_F2_POST(BX,DI,AX,CX)
824
825#define CALC_78 \
826 CALC_F2_PRE(0x268,CX,BX,DX) \
827 CALC_F2_POST(CX,SI,DI,DX)
828
// CALC_79: last round of the first block. It is written out by hand rather
// than through CALC_F2_PRE/POST because no F value is needed for a
// following round:
//   AX += word at 0x26c(R15) + CX (previous F) + (DX rotated left by 5)
// The PRECALC_39 step in the middle finishes the Y14 row.
829#define CALC_79 \
830 ADDL 0x26c(R15), AX \
831 LEAL (AX)(CX*1), AX \
832 RORXL $0x1b, DX, R12 \
833 PRECALC_39(Y14,0x20,0x120) \
834 ADDL R12, AX
835
836// Similar to CALC_0
// CALC_80: first round of the second block. As in CALC_0 there is no
// previous round to supply F, so the prologue computes it:
//   DX = (CX & BX) ^ (~CX & SI), using CX before it is rotated right by 2.
// The round reads K+w at 0x10(R15), i.e. the upper 16 bytes of the first
// 32-byte row.
837#define CALC_80 \
838 MOVL CX, DX \
839 RORXL $2, CX, CX \
840 ANDNL SI, DX, BP \
841 ANDL BX, DX \
842 XORL BP, DX \
843 CALC_F1_PRE(0x10,AX,DX,BX,DI) \
844 PRECALC_32(Y15,Y14) \
845 CALC_F1_POST(AX,CX,DI)
846
// CALC_81 .. CALC_98: rounds 1-18 of the second block, using the F1
// helpers. The K+w offsets cover the upper 16 bytes of each 32-byte row
// (0x10-0x1c, 0x30-0x3c, ...). The interleaved vector steps continue the
// PRECALC_32_79 sequence for rows 0x140, 0x160 and 0x180.
847#define CALC_81 \
848 CALC_F1_PRE(0x14,DI,AX,CX,SI) \
849 PRECALC_33(Y12,Y13) \
850 CALC_F1_POST(DI,DX,SI)
851
852#define CALC_82 \
853 CALC_F1_PRE(0x18,SI,DI,DX,BX) \
854 PRECALC_34(Y5) \
855 CALC_F1_POST(SI,AX,BX)
856
857#define CALC_83 \
858 CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
859 PRECALC_35(Y13) \
860 CALC_F1_POST(BX,DI,CX)
861
862#define CALC_84 \
863 CALC_F1_PRE(0x30,CX,BX,DI,DX) \
864 PRECALC_36(Y13) \
865 CALC_F1_POST(CX,SI,DX)
866
867#define CALC_85 \
868 CALC_F1_PRE(0x34,DX,CX,SI,AX) \
869 PRECALC_37(Y13) \
870 CALC_F1_POST(DX,BX,AX)
871
872#define CALC_86 \
873 CALC_F1_PRE(0x38,AX,DX,BX,DI) \
874 CALC_F1_POST(AX,CX,DI)
875
876#define CALC_87 \
877 CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
878 PRECALC_39(Y13,0x40,0x140) \
879 CALC_F1_POST(DI,DX,SI)
880
881#define CALC_88 \
882 CALC_F1_PRE(0x50,SI,DI,DX,BX) \
883 PRECALC_32(Y14,Y13) \
884 CALC_F1_POST(SI,AX,BX)
885
886#define CALC_89 \
887 CALC_F1_PRE(0x54,BX,SI,AX,CX) \
888 PRECALC_33(Y8,Y12) \
889 CALC_F1_POST(BX,DI,CX)
890
891#define CALC_90 \
892 CALC_F1_PRE(0x58,CX,BX,DI,DX) \
893 PRECALC_34(Y3) \
894 CALC_F1_POST(CX,SI,DX)
895
896#define CALC_91 \
897 CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
898 PRECALC_35(Y12) \
899 CALC_F1_POST(DX,BX,AX)
900
901#define CALC_92 \
902 CALC_F1_PRE(0x70,AX,DX,BX,DI) \
903 PRECALC_36(Y12) \
904 CALC_F1_POST(AX,CX,DI)
905
906#define CALC_93 \
907 CALC_F1_PRE(0x74,DI,AX,CX,SI) \
908 PRECALC_37(Y12) \
909 CALC_F1_POST(DI,DX,SI)
910
911#define CALC_94 \
912 CALC_F1_PRE(0x78,SI,DI,DX,BX) \
913 CALC_F1_POST(SI,AX,BX)
914
915#define CALC_95 \
916 CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
917 PRECALC_39(Y12,0x40,0x160) \
918 CALC_F1_POST(BX,DI,CX)
919
920#define CALC_96 \
921 CALC_F1_PRE(0x90,CX,BX,DI,DX) \
922 PRECALC_32(Y13,Y12) \
923 CALC_F1_POST(CX,SI,DX)
924
925#define CALC_97 \
926 CALC_F1_PRE(0x94,DX,CX,SI,AX) \
927 PRECALC_33(Y7,Y8) \
928 CALC_F1_POST(DX,BX,AX)
929
930#define CALC_98 \
931 CALC_F1_PRE(0x98,AX,DX,BX,DI) \
932 PRECALC_34(Y15) \
933 CALC_F1_POST(AX,CX,DI)
934
// CALC_99 .. CALC_118: rounds 19-38 of the second block, using the F2
// helpers. The interleaved vector steps finish row 0x180 and compute rows
// 0x1a0 and 0x1c0 (up to PRECALC_37 for the latter).
935#define CALC_99 \
936 CALC_F2_PRE(0x9c,DI,AX,SI) \
937 PRECALC_35(Y8) \
938 CALC_F2_POST(DI,DX,CX,SI)
939
940#define CALC_100 \
941 CALC_F2_PRE(0xb0,SI,DI,BX) \
942 PRECALC_36(Y8) \
943 CALC_F2_POST(SI,AX,DX,BX)
944
945#define CALC_101 \
946 CALC_F2_PRE(0xb4,BX,SI,CX) \
947 PRECALC_37(Y8) \
948 CALC_F2_POST(BX,DI,AX,CX)
949
950#define CALC_102 \
951 CALC_F2_PRE(0xb8,CX,BX,DX) \
952 CALC_F2_POST(CX,SI,DI,DX)
953
954#define CALC_103 \
955 CALC_F2_PRE(0xbc,DX,CX,AX) \
956 PRECALC_39(Y8,0x40,0x180) \
957 CALC_F2_POST(DX,BX,SI,AX)
958
959#define CALC_104 \
960 CALC_F2_PRE(0xd0,AX,DX,DI) \
961 PRECALC_32(Y12,Y8) \
962 CALC_F2_POST(AX,CX,BX,DI)
963
964#define CALC_105 \
965 CALC_F2_PRE(0xd4,DI,AX,SI) \
966 PRECALC_33(Y5,Y7) \
967 CALC_F2_POST(DI,DX,CX,SI)
968
969#define CALC_106 \
970 CALC_F2_PRE(0xd8,SI,DI,BX) \
971 PRECALC_34(Y14) \
972 CALC_F2_POST(SI,AX,DX,BX)
973
974#define CALC_107 \
975 CALC_F2_PRE(0xdc,BX,SI,CX) \
976 PRECALC_35(Y7) \
977 CALC_F2_POST(BX,DI,AX,CX)
978
979#define CALC_108 \
980 CALC_F2_PRE(0xf0,CX,BX,DX) \
981 PRECALC_36(Y7) \
982 CALC_F2_POST(CX,SI,DI,DX)
983
984#define CALC_109 \
985 CALC_F2_PRE(0xf4,DX,CX,AX) \
986 PRECALC_37(Y7) \
987 CALC_F2_POST(DX,BX,SI,AX)
988
989#define CALC_110 \
990 CALC_F2_PRE(0xf8,AX,DX,DI) \
991 CALC_F2_POST(AX,CX,BX,DI)
992
993#define CALC_111 \
994 CALC_F2_PRE(0xfc,DI,AX,SI) \
995 PRECALC_39(Y7,0x40,0x1a0) \
996 CALC_F2_POST(DI,DX,CX,SI)
997
998#define CALC_112 \
999 CALC_F2_PRE(0x110,SI,DI,BX) \
1000 PRECALC_32(Y8,Y7) \
1001 CALC_F2_POST(SI,AX,DX,BX)
1002
1003#define CALC_113 \
1004 CALC_F2_PRE(0x114,BX,SI,CX) \
1005 PRECALC_33(Y3,Y5) \
1006 CALC_F2_POST(BX,DI,AX,CX)
1007
1008#define CALC_114 \
1009 CALC_F2_PRE(0x118,CX,BX,DX) \
1010 PRECALC_34(Y13) \
1011 CALC_F2_POST(CX,SI,DI,DX)
1012
1013#define CALC_115 \
1014 CALC_F2_PRE(0x11c,DX,CX,AX) \
1015 PRECALC_35(Y5) \
1016 CALC_F2_POST(DX,BX,SI,AX)
1017
1018#define CALC_116 \
1019 CALC_F2_PRE(0x130,AX,DX,DI) \
1020 PRECALC_36(Y5) \
1021 CALC_F2_POST(AX,CX,BX,DI)
1022
1023#define CALC_117 \
1024 CALC_F2_PRE(0x134,DI,AX,SI) \
1025 PRECALC_37(Y5) \
1026 CALC_F2_POST(DI,DX,CX,SI)
1027
1028#define CALC_118 \
1029 CALC_F2_PRE(0x138,SI,DI,BX) \
1030 CALC_F2_POST(SI,AX,DX,BX)
1031
// CALC_119 .. CALC_138: rounds 39-58 of the second block, using the F3
// helpers. The interleaved vector steps finish row 0x1c0, compute rows
// 0x1e0 and 0x200, and start row 0x220.
1032#define CALC_119 \
1033 CALC_F3_PRE(0x13c,CX) \
1034 PRECALC_39(Y5,0x40,0x1c0) \
1035 CALC_F3_POST(BX,DI,AX,CX,SI)
1036
1037#define CALC_120 \
1038 CALC_F3_PRE(0x150,DX) \
1039 PRECALC_32(Y7,Y5) \
1040 CALC_F3_POST(CX,SI,DI,DX,BX)
1041
1042#define CALC_121 \
1043 CALC_F3_PRE(0x154,AX) \
1044 PRECALC_33(Y15,Y3) \
1045 CALC_F3_POST(DX,BX,SI,AX,CX)
1046
1047#define CALC_122 \
1048 CALC_F3_PRE(0x158,DI) \
1049 PRECALC_34(Y12) \
1050 CALC_F3_POST(AX,CX,BX,DI,DX)
1051
1052#define CALC_123 \
1053 CALC_F3_PRE(0x15c,SI) \
1054 PRECALC_35(Y3) \
1055 CALC_F3_POST(DI,DX,CX,SI,AX)
1056
1057#define CALC_124 \
1058 CALC_F3_PRE(0x170,BX) \
1059 PRECALC_36(Y3) \
1060 CALC_F3_POST(SI,AX,DX,BX,DI)
1061
1062#define CALC_125 \
1063 CALC_F3_PRE(0x174,CX) \
1064 PRECALC_37(Y3) \
1065 CALC_F3_POST(BX,DI,AX,CX,SI)
1066
1067#define CALC_126 \
1068 CALC_F3_PRE(0x178,DX) \
1069 CALC_F3_POST(CX,SI,DI,DX,BX)
1070
1071#define CALC_127 \
1072 CALC_F3_PRE(0x17c,AX) \
1073 PRECALC_39(Y3,0x60,0x1e0) \
1074 CALC_F3_POST(DX,BX,SI,AX,CX)
1075
1076#define CALC_128 \
1077 CALC_F3_PRE(0x190,DI) \
1078 PRECALC_32(Y5,Y3) \
1079 CALC_F3_POST(AX,CX,BX,DI,DX)
1080
1081#define CALC_129 \
1082 CALC_F3_PRE(0x194,SI) \
1083 PRECALC_33(Y14,Y15) \
1084 CALC_F3_POST(DI,DX,CX,SI,AX)
1085
1086#define CALC_130 \
1087 CALC_F3_PRE(0x198,BX) \
1088 PRECALC_34(Y8) \
1089 CALC_F3_POST(SI,AX,DX,BX,DI)
1090
1091#define CALC_131 \
1092 CALC_F3_PRE(0x19c,CX) \
1093 PRECALC_35(Y15) \
1094 CALC_F3_POST(BX,DI,AX,CX,SI)
1095
1096#define CALC_132 \
1097 CALC_F3_PRE(0x1b0,DX) \
1098 PRECALC_36(Y15) \
1099 CALC_F3_POST(CX,SI,DI,DX,BX)
1100
1101#define CALC_133 \
1102 CALC_F3_PRE(0x1b4,AX) \
1103 PRECALC_37(Y15) \
1104 CALC_F3_POST(DX,BX,SI,AX,CX)
1105
1106#define CALC_134 \
1107 CALC_F3_PRE(0x1b8,DI) \
1108 CALC_F3_POST(AX,CX,BX,DI,DX)
1109
1110#define CALC_135 \
1111 CALC_F3_PRE(0x1bc,SI) \
1112 PRECALC_39(Y15,0x60,0x200) \
1113 CALC_F3_POST(DI,DX,CX,SI,AX)
1114
1115#define CALC_136 \
1116 CALC_F3_PRE(0x1d0,BX) \
1117 PRECALC_32(Y3,Y15) \
1118 CALC_F3_POST(SI,AX,DX,BX,DI)
1119
1120#define CALC_137 \
1121 CALC_F3_PRE(0x1d4,CX) \
1122 PRECALC_33(Y13,Y14) \
1123 CALC_F3_POST(BX,DI,AX,CX,SI)
1124
1125#define CALC_138 \
1126 CALC_F3_PRE(0x1d8,DX) \
1127 PRECALC_34(Y7) \
1128 CALC_F3_POST(CX,SI,DI,DX,BX)
1129
// CALC_139 .. CALC_158: rounds 59-78 of the second block, using the F2
// helpers. The interleaved vector steps finish row 0x220, compute row
// 0x240, and run row 0x260 up to PRECALC_37.
1130#define CALC_139 \
1131 CALC_F2_PRE(0x1dc,DX,CX,AX) \
1132 PRECALC_35(Y14) \
1133 CALC_F2_POST(DX,BX,SI,AX)
1134
1135#define CALC_140 \
1136 CALC_F2_PRE(0x1f0,AX,DX,DI) \
1137 PRECALC_36(Y14) \
1138 CALC_F2_POST(AX,CX,BX,DI)
1139
1140#define CALC_141 \
1141 CALC_F2_PRE(0x1f4,DI,AX,SI) \
1142 PRECALC_37(Y14) \
1143 CALC_F2_POST(DI,DX,CX,SI)
1144
1145#define CALC_142 \
1146 CALC_F2_PRE(0x1f8,SI,DI,BX) \
1147 CALC_F2_POST(SI,AX,DX,BX)
1148
1149#define CALC_143 \
1150 CALC_F2_PRE(0x1fc,BX,SI,CX) \
1151 PRECALC_39(Y14,0x60,0x220) \
1152 CALC_F2_POST(BX,DI,AX,CX)
1153
1154#define CALC_144 \
1155 CALC_F2_PRE(0x210,CX,BX,DX) \
1156 PRECALC_32(Y15,Y14) \
1157 CALC_F2_POST(CX,SI,DI,DX)
1158
1159#define CALC_145 \
1160 CALC_F2_PRE(0x214,DX,CX,AX) \
1161 PRECALC_33(Y12,Y13) \
1162 CALC_F2_POST(DX,BX,SI,AX)
1163
1164#define CALC_146 \
1165 CALC_F2_PRE(0x218,AX,DX,DI) \
1166 PRECALC_34(Y5) \
1167 CALC_F2_POST(AX,CX,BX,DI)
1168
1169#define CALC_147 \
1170 CALC_F2_PRE(0x21c,DI,AX,SI) \
1171 PRECALC_35(Y13) \
1172 CALC_F2_POST(DI,DX,CX,SI)
1173
1174#define CALC_148 \
1175 CALC_F2_PRE(0x230,SI,DI,BX) \
1176 PRECALC_36(Y13) \
1177 CALC_F2_POST(SI,AX,DX,BX)
1178
1179#define CALC_149 \
1180 CALC_F2_PRE(0x234,BX,SI,CX) \
1181 PRECALC_37(Y13) \
1182 CALC_F2_POST(BX,DI,AX,CX)
1183
1184#define CALC_150 \
1185 CALC_F2_PRE(0x238,CX,BX,DX) \
1186 CALC_F2_POST(CX,SI,DI,DX)
1187
1188#define CALC_151 \
1189 CALC_F2_PRE(0x23c,DX,CX,AX) \
1190 PRECALC_39(Y13,0x60,0x240) \
1191 CALC_F2_POST(DX,BX,SI,AX)
1192
1193#define CALC_152 \
1194 CALC_F2_PRE(0x250,AX,DX,DI) \
1195 PRECALC_32(Y14,Y13) \
1196 CALC_F2_POST(AX,CX,BX,DI)
1197
1198#define CALC_153 \
1199 CALC_F2_PRE(0x254,DI,AX,SI) \
1200 PRECALC_33(Y8,Y12) \
1201 CALC_F2_POST(DI,DX,CX,SI)
1202
1203#define CALC_154 \
1204 CALC_F2_PRE(0x258,SI,DI,BX) \
1205 PRECALC_34(Y3) \
1206 CALC_F2_POST(SI,AX,DX,BX)
1207
1208#define CALC_155 \
1209 CALC_F2_PRE(0x25c,BX,SI,CX) \
1210 PRECALC_35(Y12) \
1211 CALC_F2_POST(BX,DI,AX,CX)
1212
1213#define CALC_156 \
1214 CALC_F2_PRE(0x270,CX,BX,DX) \
1215 PRECALC_36(Y12) \
1216 CALC_F2_POST(CX,SI,DI,DX)
1217
1218#define CALC_157 \
1219 CALC_F2_PRE(0x274,DX,CX,AX) \
1220 PRECALC_37(Y12) \
1221 CALC_F2_POST(DX,BX,SI,AX)
1222
1223#define CALC_158 \
1224 CALC_F2_PRE(0x278,AX,DX,DI) \
1225 CALC_F2_POST(AX,CX,BX,DI)
1226
// CALC_159: last round of the second block, written out by hand like
// CALC_79 because no F value is needed for a following round:
//   SI += word at 0x27c(R15) + AX (previous F) + (DI rotated left by 5)
// The PRECALC_39 step in the middle finishes the Y12 row at 0x260(R14).
1227#define CALC_159 \
1228 ADDL 0x27c(R15),SI \
1229 LEAL (SI)(AX*1), SI \
1230 RORXL $0x1b, DI, R12 \
1231 PRECALC_39(Y12,0x60,0x260) \
1232 ADDL R12, SI
1233
1234
1235
// CALC is the complete body of blockAVX2: it loads the state, runs PRECALC
// once, and then executes the fully unrolled main loop. One trip through the
// loop expands the 160 steps CALC_0..CALC_159, with an UPDATE_HASH after
// step 79 and another after step 159 (presumably 80 steps per 64-byte block,
// i.e. two blocks per trip - the step macros are defined earlier in the file).
//
// Registers expected on entry (blockAVX2 sets them up before invoking CALC):
//	R8  = address of K_XMM_AR; R10 is overwritten with this value to mark
//	      "no more input", so R10 == R8 is the loop's termination test
//	R9  = dig; the five 32-bit state words are read from 0..16(R9)
//	R10 = p_base, advanced by 128 in each loop trip
//	R13 = p_base+64 (or R8), advanced by 128 in each loop trip
//	R11 = end bound that R10 and R13 are compared against
//
// R14 and R15 hold the addresses SP and SP+(2*4*80+32). They are exchanged
// once after PRECALC and again at the end of every loop trip, so the two
// stack buffers alternate roles.
//
// The macro contains the function's only RET (preceded by VZEROUPPER).
// The labels "loop" and "begin" are defined inside the macro, so it can be
// expanded only once per function.
#define CALC \
	MOVL	(R9), CX \ // load the five state words from dig
	MOVL	4(R9), SI \
	MOVL	8(R9), DI \
	MOVL	12(R9), AX \
	MOVL	16(R9), DX \
	MOVQ	SP, R14 \ // R14 = first stack buffer
	LEAQ	(2*4*80+32)(SP), R15 \ // R15 = second stack buffer
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ	R15, R14 \
loop: \ // this loop is unrolled
	CMPQ	R10, R8 \ // R10 == R8 (set by blockAVX2 and the CMOVs below) signals the last block
	JNE	begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ	$128, R10 \ // advance R10 by two 64-byte blocks
	CMPQ	R10, R11 \ // has R10 reached the end bound?
	CMOVQCC	R8, R10 \ // if R10 >= R11 (unsigned): R10 = R8, signalling the last iteration
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \
	CMPQ	R10, R8 \ // is current block the last one?
	JE	loop \ // yes: the test at "loop" is equal again, so it falls through to VZEROUPPER/RET
	MOVL	DX, CX \
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ	$128, R13 \ // advance R13 by two 64-byte blocks
	CMPQ	R13, R11 \ // has R13 reached the end bound?
	CMOVQCC	R8, R10 \ // if R13 >= R11 (unsigned): R10 = R8 (note: the marker goes into R10, not R13)
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation (R12 is the temporary)
	MOVL	DI, SI \
	MOVL	DX, DI \
	MOVL	BX, DX \
	MOVL	CX, AX \
	MOVL	R12, CX \
	XCHGQ	R15, R14 \ // swap the two stack buffers for the next trip
	JMP	loop
1431
1432
1433
// blockAVX2 is the AVX2 SHA-1 block routine.
//
// Arguments (32 bytes, read through FP):
//	dig+0(FP)     pointer whose first five 32-bit words CALC loads as the state
//	p_base+8(FP)  start of the input data
//	p_len+16(FP)  input length in bytes; rounded down to a multiple of 64 here
// (presumably the Go declaration is func blockAVX2(dig *digest, p []byte) -
// confirm against the Go side of the package.)
//
// The function reserves a 1408-byte frame, which CALC addresses through
// R14/R15. There is no RET in this body: CALC contains the only return.
//
// Register setup handed to CALC:
//	R8  = address of K_XMM_AR (also used as the end-of-input marker)
//	R9  = dig
//	R10 = p_base
//	R13 = p_base+64, or R8 if that is not below R11
//	R11 = p_base + (p_len rounded down to 64) + 64
//	Y10 = BSWAP_SHUFB_CTL
TEXT ·blockAVX2(SB),$1408-32

	MOVQ	dig+0(FP), DI
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$6, DX	// DX = p_len with the low 6 bits cleared,
	SHLQ	$6, DX	// i.e. the length in whole 64-byte blocks

	MOVQ	$K_XMM_AR<>(SB), R8

	MOVQ	DI, R9
	MOVQ	SI, R10
	LEAQ	64(SI), R13

	ADDQ	SI, DX
	ADDQ	$64, DX
	MOVQ	DX, R11	// R11 = p_base + rounded length + 64

	CMPQ	R13, R11
	CMOVQCC	R8, R13	// if R13 >= R11 (unsigned): R13 = R8

	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10

	CALC // RET is inside macros
1458
// K_XMM_AR is a 128-byte read-only table holding four 32-bit constants, each
// stored as eight consecutive copies (32 bytes per constant):
//	0x00-0x1f: 0x5a827999
//	0x20-0x3f: 0x6ed9eba1
//	0x40-0x5f: 0x8f1bbcdc
//	0x60-0x7f: 0xca62c1d6
// These are the four SHA-1 round constants. blockAVX2 loads the table's
// address into R8.
DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
1492
// BSWAP_SHUFB_CTL is a 32-byte read-only constant that blockAVX2 loads into
// Y10. It is the same 16-byte pattern stored twice (offsets 0x00 and 0x10).
// Because each 32-bit word is stored little-endian, the bytes in memory read
// 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 - the index pattern that reverses
// the bytes inside every 32-bit word when used as a byte-shuffle control
// (presumably with VPSHUFB in the PRECALC macros - confirm there).
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
View as plain text