Text file src/crypto/sha1/sha1block_amd64.s

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// AVX2 version by Intel, same algorithm as code in Linux kernel:
     6// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
     7// Authors:
     8// Ilya Albrekht <ilya.albrekht@intel.com>
     9// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
    10// Ronen Zohar <ronen.zohar@intel.com>
    11// Chandramouli Narayanan <mouli@linux.intel.com>
    12
    13//go:build !purego
    14
    15#include "textflag.h"
    16
    17// SHA-1 block routine. See sha1block.go for Go equivalent.
    18//
    19// There are 80 rounds of 4 types:
    20//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
    21//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
    22//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
    23//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
    24//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
    25//
    26// Each round loads or shuffles the data, then computes a per-round
    27// function of b, c, d, and then mixes the result into and rotates the
    28// five registers a, b, c, d, e holding the intermediate results.
    29//
    30// The register rotation is implemented by rotating the arguments to
    31// the round macros instead of by explicit move instructions.
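// As a rough Go-style sketch (see sha1block.go for the real code),
// one type-1 round computes:
//
//	f := ((c ^ d) & b) ^ d                  // FUNC1, i.e. (b&c) | (^b&d)
//	e += bits.RotateLeft32(a, 5) + f + w[i] + 0x5A827999
//	b = bits.RotateLeft32(b, 30)
//
// and the next round macro is invoked with the arguments shifted one
// position, to (e, a, b, c, d).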
    32
    33#define LOAD(index) \
    34	MOVL	(index*4)(SI), R10; \
    35	BSWAPL	R10; \
    36	MOVL	R10, (index*4)(SP)
    37
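// SHUFFLE computes the usual SHA-1 message schedule
//	w[i&0xf] = rol1(w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i-16)&0xf])
// in the 16-word circular buffer kept in the stack frame
// (w[(i-16)&0xf] is the same slot as w[i&0xf]).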
    38#define SHUFFLE(index) \
    39	MOVL	(((index)&0xf)*4)(SP), R10; \
    40	XORL	(((index-3)&0xf)*4)(SP), R10; \
    41	XORL	(((index-8)&0xf)*4)(SP), R10; \
    42	XORL	(((index-14)&0xf)*4)(SP), R10; \
    43	ROLL	$1, R10; \
    44	MOVL	R10, (((index)&0xf)*4)(SP)
    45
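// The three round functions below are the standard SHA-1 choices:
// FUNC1 is Ch(b,c,d)  = (b&c) | (^b&d), computed as ((c^d)&b)^d,
// FUNC2 is Parity     = b ^ c ^ d,
// FUNC3 is Maj(b,c,d) = (b&c) | (b&d) | (c&d).
// Each leaves its result in R9 for MIX.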
    46#define FUNC1(a, b, c, d, e) \
    47	MOVL	d, R9; \
    48	XORL	c, R9; \
    49	ANDL	b, R9; \
    50	XORL	d, R9
    51
    52#define FUNC2(a, b, c, d, e) \
    53	MOVL	b, R9; \
    54	XORL	c, R9; \
    55	XORL	d, R9
    56
    57#define FUNC3(a, b, c, d, e) \
    58	MOVL	b, R8; \
    59	ORL	c, R8; \
    60	ANDL	d, R8; \
    61	MOVL	b, R9; \
    62	ANDL	c, R9; \
    63	ORL	R8, R9
    64
    65#define FUNC4 FUNC2
    66
    67#define MIX(a, b, c, d, e, const) \
    68	ROLL	$30, b; \
    69	ADDL	R9, e; \
    70	MOVL	a, R8; \
    71	ROLL	$5, R8; \
    72	LEAL	const(e)(R10*1), e; \
    73	ADDL	R8, e
    74
    75#define ROUND1(a, b, c, d, e, index) \
    76	LOAD(index); \
    77	FUNC1(a, b, c, d, e); \
    78	MIX(a, b, c, d, e, 0x5A827999)
    79
    80#define ROUND1x(a, b, c, d, e, index) \
    81	SHUFFLE(index); \
    82	FUNC1(a, b, c, d, e); \
    83	MIX(a, b, c, d, e, 0x5A827999)
    84
    85#define ROUND2(a, b, c, d, e, index) \
    86	SHUFFLE(index); \
    87	FUNC2(a, b, c, d, e); \
    88	MIX(a, b, c, d, e, 0x6ED9EBA1)
    89
    90#define ROUND3(a, b, c, d, e, index) \
    91	SHUFFLE(index); \
    92	FUNC3(a, b, c, d, e); \
    93	MIX(a, b, c, d, e, 0x8F1BBCDC)
    94
    95#define ROUND4(a, b, c, d, e, index) \
    96	SHUFFLE(index); \
    97	FUNC4(a, b, c, d, e); \
    98	MIX(a, b, c, d, e, 0xCA62C1D6)
    99
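// blockAMD64 is the plain (non-AVX2) implementation. The 64-byte
// stack frame holds the 16-word circular w buffer used by LOAD and
// SHUFFLE, and the five hash words live in AX, BX, CX, DX, BP.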
   100TEXT ·blockAMD64(SB),NOSPLIT,$64-32
   101	MOVQ	dig+0(FP),	BP
   102	MOVQ	p_base+8(FP),	SI
   103	MOVQ	p_len+16(FP),	DX
   104	SHRQ	$6,		DX
   105	SHLQ	$6,		DX
   106
   107	LEAQ	(SI)(DX*1),	DI
   108	MOVL	(0*4)(BP),	AX
   109	MOVL	(1*4)(BP),	BX
   110	MOVL	(2*4)(BP),	CX
   111	MOVL	(3*4)(BP),	DX
   112	MOVL	(4*4)(BP),	BP
   113
   114	CMPQ	SI,		DI
   115	JEQ	end
   116
   117loop:
   118	MOVL	AX,	R11
   119	MOVL	BX,	R12
   120	MOVL	CX,	R13
   121	MOVL	DX,	R14
   122	MOVL	BP,	R15
   123
   124	ROUND1(AX, BX, CX, DX, BP, 0)
   125	ROUND1(BP, AX, BX, CX, DX, 1)
   126	ROUND1(DX, BP, AX, BX, CX, 2)
   127	ROUND1(CX, DX, BP, AX, BX, 3)
   128	ROUND1(BX, CX, DX, BP, AX, 4)
   129	ROUND1(AX, BX, CX, DX, BP, 5)
   130	ROUND1(BP, AX, BX, CX, DX, 6)
   131	ROUND1(DX, BP, AX, BX, CX, 7)
   132	ROUND1(CX, DX, BP, AX, BX, 8)
   133	ROUND1(BX, CX, DX, BP, AX, 9)
   134	ROUND1(AX, BX, CX, DX, BP, 10)
   135	ROUND1(BP, AX, BX, CX, DX, 11)
   136	ROUND1(DX, BP, AX, BX, CX, 12)
   137	ROUND1(CX, DX, BP, AX, BX, 13)
   138	ROUND1(BX, CX, DX, BP, AX, 14)
   139	ROUND1(AX, BX, CX, DX, BP, 15)
   140
   141	ROUND1x(BP, AX, BX, CX, DX, 16)
   142	ROUND1x(DX, BP, AX, BX, CX, 17)
   143	ROUND1x(CX, DX, BP, AX, BX, 18)
   144	ROUND1x(BX, CX, DX, BP, AX, 19)
   145
   146	ROUND2(AX, BX, CX, DX, BP, 20)
   147	ROUND2(BP, AX, BX, CX, DX, 21)
   148	ROUND2(DX, BP, AX, BX, CX, 22)
   149	ROUND2(CX, DX, BP, AX, BX, 23)
   150	ROUND2(BX, CX, DX, BP, AX, 24)
   151	ROUND2(AX, BX, CX, DX, BP, 25)
   152	ROUND2(BP, AX, BX, CX, DX, 26)
   153	ROUND2(DX, BP, AX, BX, CX, 27)
   154	ROUND2(CX, DX, BP, AX, BX, 28)
   155	ROUND2(BX, CX, DX, BP, AX, 29)
   156	ROUND2(AX, BX, CX, DX, BP, 30)
   157	ROUND2(BP, AX, BX, CX, DX, 31)
   158	ROUND2(DX, BP, AX, BX, CX, 32)
   159	ROUND2(CX, DX, BP, AX, BX, 33)
   160	ROUND2(BX, CX, DX, BP, AX, 34)
   161	ROUND2(AX, BX, CX, DX, BP, 35)
   162	ROUND2(BP, AX, BX, CX, DX, 36)
   163	ROUND2(DX, BP, AX, BX, CX, 37)
   164	ROUND2(CX, DX, BP, AX, BX, 38)
   165	ROUND2(BX, CX, DX, BP, AX, 39)
   166
   167	ROUND3(AX, BX, CX, DX, BP, 40)
   168	ROUND3(BP, AX, BX, CX, DX, 41)
   169	ROUND3(DX, BP, AX, BX, CX, 42)
   170	ROUND3(CX, DX, BP, AX, BX, 43)
   171	ROUND3(BX, CX, DX, BP, AX, 44)
   172	ROUND3(AX, BX, CX, DX, BP, 45)
   173	ROUND3(BP, AX, BX, CX, DX, 46)
   174	ROUND3(DX, BP, AX, BX, CX, 47)
   175	ROUND3(CX, DX, BP, AX, BX, 48)
   176	ROUND3(BX, CX, DX, BP, AX, 49)
   177	ROUND3(AX, BX, CX, DX, BP, 50)
   178	ROUND3(BP, AX, BX, CX, DX, 51)
   179	ROUND3(DX, BP, AX, BX, CX, 52)
   180	ROUND3(CX, DX, BP, AX, BX, 53)
   181	ROUND3(BX, CX, DX, BP, AX, 54)
   182	ROUND3(AX, BX, CX, DX, BP, 55)
   183	ROUND3(BP, AX, BX, CX, DX, 56)
   184	ROUND3(DX, BP, AX, BX, CX, 57)
   185	ROUND3(CX, DX, BP, AX, BX, 58)
   186	ROUND3(BX, CX, DX, BP, AX, 59)
   187
   188	ROUND4(AX, BX, CX, DX, BP, 60)
   189	ROUND4(BP, AX, BX, CX, DX, 61)
   190	ROUND4(DX, BP, AX, BX, CX, 62)
   191	ROUND4(CX, DX, BP, AX, BX, 63)
   192	ROUND4(BX, CX, DX, BP, AX, 64)
   193	ROUND4(AX, BX, CX, DX, BP, 65)
   194	ROUND4(BP, AX, BX, CX, DX, 66)
   195	ROUND4(DX, BP, AX, BX, CX, 67)
   196	ROUND4(CX, DX, BP, AX, BX, 68)
   197	ROUND4(BX, CX, DX, BP, AX, 69)
   198	ROUND4(AX, BX, CX, DX, BP, 70)
   199	ROUND4(BP, AX, BX, CX, DX, 71)
   200	ROUND4(DX, BP, AX, BX, CX, 72)
   201	ROUND4(CX, DX, BP, AX, BX, 73)
   202	ROUND4(BX, CX, DX, BP, AX, 74)
   203	ROUND4(AX, BX, CX, DX, BP, 75)
   204	ROUND4(BP, AX, BX, CX, DX, 76)
   205	ROUND4(DX, BP, AX, BX, CX, 77)
   206	ROUND4(CX, DX, BP, AX, BX, 78)
   207	ROUND4(BX, CX, DX, BP, AX, 79)
   208
   209	ADDL	R11, AX
   210	ADDL	R12, BX
   211	ADDL	R13, CX
   212	ADDL	R14, DX
   213	ADDL	R15, BP
   214
   215	ADDQ	$64, SI
   216	CMPQ	SI, DI
   217	JB	loop
   218
   219end:
   220	MOVQ	dig+0(FP), DI
   221	MOVL	AX, (0*4)(DI)
   222	MOVL	BX, (1*4)(DI)
   223	MOVL	CX, (2*4)(DI)
   224	MOVL	DX, (3*4)(DI)
   225	MOVL	BP, (4*4)(DI)
   226	RET
   227
   228
   229// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
   230// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
   231// From http://software.intel.com/en-us/articles
   232// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
   233// This implementation is 2x unrolled, and interleaves vector instructions,
   234// used to precompute W, with scalar computation of current round
   235// for optimal scheduling.
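// Rough shape of the schedule below (a sketch): the W+K values for a
// pair of 64-byte blocks are precomputed with AVX2 into one of two
// temp buffers while the 80 scalar rounds for the previous pair run
// from the other buffer; XCHGQ R15, R14 swaps the two buffers between
// pairs of blocks.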
   236
   237// Trivial helper macros.
   238#define UPDATE_HASH(A,TB,C,D,E) \
   239	ADDL	(R9), A \
   240	MOVL	A, (R9) \
   241	ADDL	4(R9), TB \
   242	MOVL	TB, 4(R9) \
   243	ADDL	8(R9), C \
   244	MOVL	C, 8(R9) \
   245	ADDL	12(R9), D \
   246	MOVL	D, 12(R9) \
   247	ADDL	16(R9), E \
   248	MOVL	E, 16(R9)
   249
   250
   251
   252// Helper macros for PRECALC, which does precomputations
   253#define PRECALC_0(OFFSET) \
   254	VMOVDQU   OFFSET(R10),X0
   255
   256#define PRECALC_1(OFFSET) \
   257	VINSERTI128 $1, OFFSET(R13), Y0, Y0
   258
   259#define PRECALC_2(YREG) \
   260	VPSHUFB Y10, Y0, YREG
   261
   262#define PRECALC_4(YREG,K_OFFSET) \
   263	VPADDD K_OFFSET(R8), YREG, Y0
   264
   265#define PRECALC_7(OFFSET) \
   266	VMOVDQU Y0, (OFFSET*2)(R14)
   267
   268
   269// Message scheduling pre-compute for rounds 0-15
   270// R13 is a pointer to even 64-byte block
   271// R10 is a pointer to odd 64-byte block
   272// R14 is a pointer to temp buffer
   273// X0 is used as temp register
   274// YREG is clobbered as part of computation
   275// OFFSET chooses 16 byte chunk within a block
   276// R8 is a pointer to constants block
   277// K_OFFSET chooses K constants relevant to this round
   278// X10 holds swap mask
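// Net effect: 16 message bytes from each of the two blocks are
// byte-swapped, K is added, and the eight resulting W+K words are
// stored to the temp buffer for the scalar rounds to ADD from memory.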
   279#define PRECALC_00_15(OFFSET,YREG) \
   280	PRECALC_0(OFFSET) \
   281	PRECALC_1(OFFSET) \
   282	PRECALC_2(YREG) \
   283	PRECALC_4(YREG,0x0) \
   284	PRECALC_7(OFFSET)
   285
   286
   287// Helper macros for PRECALC_16_31
   288#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   289	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
   290	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
   291
   292#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   293	VPXOR  REG_SUB_8, REG, REG \
   294	VPXOR  REG_SUB_16, Y0, Y0
   295
   296#define PRECALC_18(REG) \
   297	VPXOR Y0, REG, REG \
   298	VPSLLDQ $12, REG, Y9
   299
   300#define PRECALC_19(REG) \
   301	VPSLLD $1, REG, Y0 \
   302	VPSRLD $31, REG, REG
   303
   304#define PRECALC_20(REG) \
   305	VPOR REG, Y0, Y0 \
   306	VPSLLD $2, Y9,  REG
   307
   308#define PRECALC_21(REG) \
   309	VPSRLD $30, Y9, Y9 \
   310	VPXOR REG, Y0, Y0
   311
   312#define PRECALC_23(REG,K_OFFSET,OFFSET) \
   313	VPXOR Y9, Y0, REG \
   314	VPADDD K_OFFSET(R8), REG, Y0 \
   315	VMOVDQU Y0, (OFFSET)(R14)
   316
   317// Message scheduling pre-compute for rounds 16-31
   318// calculating last 32 w[i] values in 8 XMM registers
   319// pre-calculate K+w[i] values and store to mem
   320// for later load by ALU add instruction.
   321// "brute force" vectorization for rounds 16-31 only
   322// due to w[i]->w[i-3] dependency.
   323// clobbers 5 input ymm registers REG_SUB*
   324// uses X0 and X9 as temp registers
   325// As always, R8 is a pointer to constants block
   326// and R14 is a pointer to temp buffer
   327#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
   328	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   329	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   330	PRECALC_18(REG) \
   331	PRECALC_19(REG) \
   332	PRECALC_20(REG) \
   333	PRECALC_21(REG) \
   334	PRECALC_23(REG,K_OFFSET,OFFSET)
   335
   336
   337// Helper macros for PRECALC_32_79
   338#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
   339	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
   340
   341#define PRECALC_33(REG_SUB_28,REG) \
   342	VPXOR REG_SUB_28, REG, REG
   343
   344#define PRECALC_34(REG_SUB_16) \
   345	VPXOR REG_SUB_16, Y0, Y0
   346
   347#define PRECALC_35(REG) \
   348	VPXOR Y0, REG, REG
   349
   350#define PRECALC_36(REG) \
   351	VPSLLD $2, REG, Y0
   352
   353#define PRECALC_37(REG) \
   354	VPSRLD $30, REG, REG \
   355	VPOR REG, Y0, REG
   356
   357#define PRECALC_39(REG,K_OFFSET,OFFSET) \
   358	VPADDD K_OFFSET(R8), REG, Y0 \
   359	VMOVDQU Y0, (OFFSET)(R14)
   360
   361// Message scheduling pre-compute for rounds 32-79
   362// In SHA-1 specification we have:
   363// w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
   364// Which is the same as:
   365// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
   366// This allows for more efficient vectorization,
   367// since w[i]->w[i-3] dependency is broken
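// A quick way to check this identity in Go (a sketch using math/bits,
// not part of this package):
//
//	w := make([]uint32, 80)
//	// fill w[0:16] with arbitrary message words, then:
//	for i := 16; i < 80; i++ {
//		w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
//	}
//	for i := 32; i < 80; i++ {
//		alt := bits.RotateLeft32(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
//		if alt != w[i] {
//			panic("identity does not hold")
//		}
//	}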
   368#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
   369	PRECALC_32(REG_SUB_8,REG_SUB_4) \
   370	PRECALC_33(REG_SUB_28,REG) \
   371	PRECALC_34(REG_SUB_16) \
   372	PRECALC_35(REG) \
   373	PRECALC_36(REG) \
   374	PRECALC_37(REG) \
   375	PRECALC_39(REG,K_OFFSET,OFFSET)
   376
   377#define PRECALC \
   378	PRECALC_00_15(0,Y15) \
   379	PRECALC_00_15(0x10,Y14) \
   380	PRECALC_00_15(0x20,Y13) \
   381	PRECALC_00_15(0x30,Y12) \
   382	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
   383	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
   384	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
   385	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
   386	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
   387	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
   388	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
   389	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
   390	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
   391	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
   392	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
   393	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
   394	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
   395	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
   396	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
   397	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
   398
   399// Macros calculating individual rounds have general form
   400// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
   401// CALC_ROUND_{PRE,POST} macros follow
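// Common to all three variants: the precomputed W+K word is added into
// e straight from the temp buffer (OFFSET(R15)), BMI2 RORX is used to
// form a rol 5 (as ror 27, kept in R12) and a rol 30 (as ror 2) of a
// without clobbering its source or the flags, and R12 is folded into e
// in the POST half. F is computed one round ahead, which is what the
// "Add F from the previous round" comments refer to.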
   402
   403#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
   404	ADDL OFFSET(R15),REG_E \
   405	ANDNL REG_C,REG_A,BP \
   406	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   407	RORXL $0x1b, REG_A, R12 \
   408	RORXL $2, REG_A, REG_B         // for next round
   409
   410// Calculate F for the next round
   411#define CALC_F1_POST(REG_A,REG_B,REG_E) \
   412	ANDL REG_B,REG_A \             // b&c
   413	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
   414	LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5
   415
   416
   417// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
   418#define CALC_0 \
   419	MOVL SI, BX \ // Precalculating first round
   420	RORXL $2, SI, SI \
   421	ANDNL AX, BX, BP \
   422	ANDL DI, BX \
   423	XORL BP, BX \
   424	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
   425	PRECALC_0(0x80) \
   426	CALC_F1_POST(CX,SI,DX)
   427
   428#define CALC_1 \
   429	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
   430	PRECALC_1(0x80) \
   431	CALC_F1_POST(DX,BX,AX)
   432
   433#define CALC_2 \
   434	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
   435	PRECALC_2(Y15) \
   436	CALC_F1_POST(AX,CX,DI)
   437
   438#define CALC_3 \
   439	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
   440	CALC_F1_POST(DI,DX,SI)
   441
   442#define CALC_4 \
   443	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
   444	PRECALC_4(Y15,0x0) \
   445	CALC_F1_POST(SI,AX,BX)
   446
   447#define CALC_5 \
   448	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
   449	CALC_F1_POST(BX,DI,CX)
   450
   451#define CALC_6 \
   452	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
   453	CALC_F1_POST(CX,SI,DX)
   454
   455#define CALC_7 \
   456	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
   457	PRECALC_7(0x0) \
   458	CALC_F1_POST(DX,BX,AX)
   459
   460#define CALC_8 \
   461	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
   462	PRECALC_0(0x90) \
   463	CALC_F1_POST(AX,CX,DI)
   464
   465#define CALC_9 \
   466	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
   467	PRECALC_1(0x90) \
   468	CALC_F1_POST(DI,DX,SI)
   469
   470#define CALC_10 \
   471	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
   472	PRECALC_2(Y14) \
   473	CALC_F1_POST(SI,AX,BX)
   474
   475#define CALC_11 \
   476	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
   477	CALC_F1_POST(BX,DI,CX)
   478
   479#define CALC_12 \
   480	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
   481	PRECALC_4(Y14,0x0) \
   482	CALC_F1_POST(CX,SI,DX)
   483
   484#define CALC_13 \
   485	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
   486	CALC_F1_POST(DX,BX,AX)
   487
   488#define CALC_14 \
   489	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
   490	CALC_F1_POST(AX,CX,DI)
   491
   492#define CALC_15 \
   493	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
   494	PRECALC_7(0x10) \
   495	CALC_F1_POST(DI,DX,SI)
   496
   497#define CALC_16 \
   498	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
   499	PRECALC_0(0xa0) \
   500	CALC_F1_POST(SI,AX,BX)
   501
   502#define CALC_17 \
   503	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
   504	PRECALC_1(0xa0) \
   505	CALC_F1_POST(BX,DI,CX)
   506
   507#define CALC_18 \
   508	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
   509	PRECALC_2(Y13) \
   510	CALC_F1_POST(CX,SI,DX)
   511
   512
   513#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
   514	ADDL OFFSET(R15),REG_E \
   515	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   516	RORXL $0x1b, REG_A, R12 \
   517	RORXL $2, REG_A, REG_B         // for next round
   518
   519#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
   520	XORL REG_B, REG_A \
   521	ADDL R12, REG_E \
   522	XORL REG_C, REG_A
   523
   524#define CALC_19 \
   525	CALC_F2_PRE(0x8c,DX,CX,AX) \
   526	CALC_F2_POST(DX,BX,SI,AX)
   527
   528#define CALC_20 \
   529	CALC_F2_PRE(0xa0,AX,DX,DI) \
   530	PRECALC_4(Y13,0x0) \
   531	CALC_F2_POST(AX,CX,BX,DI)
   532
   533#define CALC_21 \
   534	CALC_F2_PRE(0xa4,DI,AX,SI) \
   535	CALC_F2_POST(DI,DX,CX,SI)
   536
   537#define CALC_22 \
   538	CALC_F2_PRE(0xa8,SI,DI,BX) \
   539	CALC_F2_POST(SI,AX,DX,BX)
   540
   541#define CALC_23 \
   542	CALC_F2_PRE(0xac,BX,SI,CX) \
   543	PRECALC_7(0x20) \
   544	CALC_F2_POST(BX,DI,AX,CX)
   545
   546#define CALC_24 \
   547	CALC_F2_PRE(0xc0,CX,BX,DX) \
   548	PRECALC_0(0xb0) \
   549	CALC_F2_POST(CX,SI,DI,DX)
   550
   551#define CALC_25 \
   552	CALC_F2_PRE(0xc4,DX,CX,AX) \
   553	PRECALC_1(0xb0) \
   554	CALC_F2_POST(DX,BX,SI,AX)
   555
   556#define CALC_26 \
   557	CALC_F2_PRE(0xc8,AX,DX,DI) \
   558	PRECALC_2(Y12) \
   559	CALC_F2_POST(AX,CX,BX,DI)
   560
   561#define CALC_27 \
   562	CALC_F2_PRE(0xcc,DI,AX,SI) \
   563	CALC_F2_POST(DI,DX,CX,SI)
   564
   565#define CALC_28 \
   566	CALC_F2_PRE(0xe0,SI,DI,BX) \
   567	PRECALC_4(Y12,0x0) \
   568	CALC_F2_POST(SI,AX,DX,BX)
   569
   570#define CALC_29 \
   571	CALC_F2_PRE(0xe4,BX,SI,CX) \
   572	CALC_F2_POST(BX,DI,AX,CX)
   573
   574#define CALC_30 \
   575	CALC_F2_PRE(0xe8,CX,BX,DX) \
   576	CALC_F2_POST(CX,SI,DI,DX)
   577
   578#define CALC_31 \
   579	CALC_F2_PRE(0xec,DX,CX,AX) \
   580	PRECALC_7(0x30) \
   581	CALC_F2_POST(DX,BX,SI,AX)
   582
   583#define CALC_32 \
   584	CALC_F2_PRE(0x100,AX,DX,DI) \
   585	PRECALC_16(Y15,Y14,Y12,Y8) \
   586	CALC_F2_POST(AX,CX,BX,DI)
   587
   588#define CALC_33 \
   589	CALC_F2_PRE(0x104,DI,AX,SI) \
   590	PRECALC_17(Y15,Y13,Y8) \
   591	CALC_F2_POST(DI,DX,CX,SI)
   592
   593#define CALC_34 \
   594	CALC_F2_PRE(0x108,SI,DI,BX) \
   595	PRECALC_18(Y8) \
   596	CALC_F2_POST(SI,AX,DX,BX)
   597
   598#define CALC_35 \
   599	CALC_F2_PRE(0x10c,BX,SI,CX) \
   600	PRECALC_19(Y8) \
   601	CALC_F2_POST(BX,DI,AX,CX)
   602
   603#define CALC_36 \
   604	CALC_F2_PRE(0x120,CX,BX,DX) \
   605	PRECALC_20(Y8) \
   606	CALC_F2_POST(CX,SI,DI,DX)
   607
   608#define CALC_37 \
   609	CALC_F2_PRE(0x124,DX,CX,AX) \
   610	PRECALC_21(Y8) \
   611	CALC_F2_POST(DX,BX,SI,AX)
   612
   613#define CALC_38 \
   614	CALC_F2_PRE(0x128,AX,DX,DI) \
   615	CALC_F2_POST(AX,CX,BX,DI)
   616
   617
   618#define CALC_F3_PRE(OFFSET,REG_E) \
   619	ADDL OFFSET(R15),REG_E
   620
   621#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
   622	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
   623	MOVL REG_B, BP \
   624	ORL  REG_A, BP \
   625	RORXL $0x1b, REG_A, R12 \
   626	RORXL $2, REG_A, REG_TB \
   627	ANDL REG_C, BP \		// Calculate F for the next round
   628	ANDL REG_B, REG_A \
   629	ORL  BP, REG_A \
   630	ADDL R12, REG_E
   631
   632#define CALC_39 \
   633	CALC_F3_PRE(0x12c,SI) \
   634	PRECALC_23(Y8,0x0,0x80) \
   635	CALC_F3_POST(DI,DX,CX,SI,AX)
   636
   637#define CALC_40 \
   638	CALC_F3_PRE(0x140,BX) \
   639	PRECALC_16(Y14,Y13,Y8,Y7) \
   640	CALC_F3_POST(SI,AX,DX,BX,DI)
   641
   642#define CALC_41 \
   643	CALC_F3_PRE(0x144,CX) \
   644	PRECALC_17(Y14,Y12,Y7) \
   645	CALC_F3_POST(BX,DI,AX,CX,SI)
   646
   647#define CALC_42 \
   648	CALC_F3_PRE(0x148,DX) \
   649	PRECALC_18(Y7) \
   650	CALC_F3_POST(CX,SI,DI,DX,BX)
   651
   652#define CALC_43 \
   653	CALC_F3_PRE(0x14c,AX) \
   654	PRECALC_19(Y7) \
   655	CALC_F3_POST(DX,BX,SI,AX,CX)
   656
   657#define CALC_44 \
   658	CALC_F3_PRE(0x160,DI) \
   659	PRECALC_20(Y7) \
   660	CALC_F3_POST(AX,CX,BX,DI,DX)
   661
   662#define CALC_45 \
   663	CALC_F3_PRE(0x164,SI) \
   664	PRECALC_21(Y7) \
   665	CALC_F3_POST(DI,DX,CX,SI,AX)
   666
   667#define CALC_46 \
   668	CALC_F3_PRE(0x168,BX) \
   669	CALC_F3_POST(SI,AX,DX,BX,DI)
   670
   671#define CALC_47 \
   672	CALC_F3_PRE(0x16c,CX) \
   673	VPXOR Y9, Y0, Y7 \
   674	VPADDD 0x20(R8), Y7, Y0 \
   675	VMOVDQU Y0, 0xa0(R14) \
   676	CALC_F3_POST(BX,DI,AX,CX,SI)
   677
   678#define CALC_48 \
   679	CALC_F3_PRE(0x180,DX) \
   680	PRECALC_16(Y13,Y12,Y7,Y5) \
   681	CALC_F3_POST(CX,SI,DI,DX,BX)
   682
   683#define CALC_49 \
   684	CALC_F3_PRE(0x184,AX) \
   685	PRECALC_17(Y13,Y8,Y5) \
   686	CALC_F3_POST(DX,BX,SI,AX,CX)
   687
   688#define CALC_50 \
   689	CALC_F3_PRE(0x188,DI) \
   690	PRECALC_18(Y5) \
   691	CALC_F3_POST(AX,CX,BX,DI,DX)
   692
   693#define CALC_51 \
   694	CALC_F3_PRE(0x18c,SI) \
   695	PRECALC_19(Y5) \
   696	CALC_F3_POST(DI,DX,CX,SI,AX)
   697
   698#define CALC_52 \
   699	CALC_F3_PRE(0x1a0,BX) \
   700	PRECALC_20(Y5) \
   701	CALC_F3_POST(SI,AX,DX,BX,DI)
   702
   703#define CALC_53 \
   704	CALC_F3_PRE(0x1a4,CX) \
   705	PRECALC_21(Y5) \
   706	CALC_F3_POST(BX,DI,AX,CX,SI)
   707
   708#define CALC_54 \
   709	CALC_F3_PRE(0x1a8,DX) \
   710	CALC_F3_POST(CX,SI,DI,DX,BX)
   711
   712#define CALC_55 \
   713	CALC_F3_PRE(0x1ac,AX) \
   714	PRECALC_23(Y5,0x20,0xc0) \
   715	CALC_F3_POST(DX,BX,SI,AX,CX)
   716
   717#define CALC_56 \
   718	CALC_F3_PRE(0x1c0,DI) \
   719	PRECALC_16(Y12,Y8,Y5,Y3) \
   720	CALC_F3_POST(AX,CX,BX,DI,DX)
   721
   722#define CALC_57 \
   723	CALC_F3_PRE(0x1c4,SI) \
   724	PRECALC_17(Y12,Y7,Y3) \
   725	CALC_F3_POST(DI,DX,CX,SI,AX)
   726
   727#define CALC_58 \
   728	CALC_F3_PRE(0x1c8,BX) \
   729	PRECALC_18(Y3) \
   730	CALC_F3_POST(SI,AX,DX,BX,DI)
   731
   732#define CALC_59 \
   733	CALC_F2_PRE(0x1cc,BX,SI,CX) \
   734	PRECALC_19(Y3) \
   735	CALC_F2_POST(BX,DI,AX,CX)
   736
   737#define CALC_60 \
   738	CALC_F2_PRE(0x1e0,CX,BX,DX) \
   739	PRECALC_20(Y3) \
   740	CALC_F2_POST(CX,SI,DI,DX)
   741
   742#define CALC_61 \
   743	CALC_F2_PRE(0x1e4,DX,CX,AX) \
   744	PRECALC_21(Y3) \
   745	CALC_F2_POST(DX,BX,SI,AX)
   746
   747#define CALC_62 \
   748	CALC_F2_PRE(0x1e8,AX,DX,DI) \
   749	CALC_F2_POST(AX,CX,BX,DI)
   750
   751#define CALC_63 \
   752	CALC_F2_PRE(0x1ec,DI,AX,SI) \
   753	PRECALC_23(Y3,0x20,0xe0) \
   754	CALC_F2_POST(DI,DX,CX,SI)
   755
   756#define CALC_64 \
   757	CALC_F2_PRE(0x200,SI,DI,BX) \
   758	PRECALC_32(Y5,Y3) \
   759	CALC_F2_POST(SI,AX,DX,BX)
   760
   761#define CALC_65 \
   762	CALC_F2_PRE(0x204,BX,SI,CX) \
   763	PRECALC_33(Y14,Y15) \
   764	CALC_F2_POST(BX,DI,AX,CX)
   765
   766#define CALC_66 \
   767	CALC_F2_PRE(0x208,CX,BX,DX) \
   768	PRECALC_34(Y8) \
   769	CALC_F2_POST(CX,SI,DI,DX)
   770
   771#define CALC_67 \
   772	CALC_F2_PRE(0x20c,DX,CX,AX) \
   773	PRECALC_35(Y15) \
   774	CALC_F2_POST(DX,BX,SI,AX)
   775
   776#define CALC_68 \
   777	CALC_F2_PRE(0x220,AX,DX,DI) \
   778	PRECALC_36(Y15) \
   779	CALC_F2_POST(AX,CX,BX,DI)
   780
   781#define CALC_69 \
   782	CALC_F2_PRE(0x224,DI,AX,SI) \
   783	PRECALC_37(Y15) \
   784	CALC_F2_POST(DI,DX,CX,SI)
   785
   786#define CALC_70 \
   787	CALC_F2_PRE(0x228,SI,DI,BX) \
   788	CALC_F2_POST(SI,AX,DX,BX)
   789
   790#define CALC_71 \
   791	CALC_F2_PRE(0x22c,BX,SI,CX) \
   792	PRECALC_39(Y15,0x20,0x100) \
   793	CALC_F2_POST(BX,DI,AX,CX)
   794
   795#define CALC_72 \
   796	CALC_F2_PRE(0x240,CX,BX,DX) \
   797	PRECALC_32(Y3,Y15) \
   798	CALC_F2_POST(CX,SI,DI,DX)
   799
   800#define CALC_73 \
   801	CALC_F2_PRE(0x244,DX,CX,AX) \
   802	PRECALC_33(Y13,Y14) \
   803	CALC_F2_POST(DX,BX,SI,AX)
   804
   805#define CALC_74 \
   806	CALC_F2_PRE(0x248,AX,DX,DI) \
   807	PRECALC_34(Y7) \
   808	CALC_F2_POST(AX,CX,BX,DI)
   809
   810#define CALC_75 \
   811	CALC_F2_PRE(0x24c,DI,AX,SI) \
   812	PRECALC_35(Y14) \
   813	CALC_F2_POST(DI,DX,CX,SI)
   814
   815#define CALC_76 \
   816	CALC_F2_PRE(0x260,SI,DI,BX) \
   817	PRECALC_36(Y14) \
   818	CALC_F2_POST(SI,AX,DX,BX)
   819
   820#define CALC_77 \
   821	CALC_F2_PRE(0x264,BX,SI,CX) \
   822	PRECALC_37(Y14) \
   823	CALC_F2_POST(BX,DI,AX,CX)
   824
   825#define CALC_78 \
   826	CALC_F2_PRE(0x268,CX,BX,DX) \
   827	CALC_F2_POST(CX,SI,DI,DX)
   828
   829#define CALC_79 \
   830	ADDL 0x26c(R15), AX \
   831	LEAL (AX)(CX*1), AX \
   832	RORXL $0x1b, DX, R12 \
   833	PRECALC_39(Y14,0x20,0x120) \
   834	ADDL R12, AX
   835
   836// Similar to CALC_0
   837#define CALC_80 \
   838	MOVL CX, DX \
   839	RORXL $2, CX, CX \
   840	ANDNL SI, DX, BP \
   841	ANDL BX, DX \
   842	XORL BP, DX \
   843	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
   844	PRECALC_32(Y15,Y14) \
   845	CALC_F1_POST(AX,CX,DI)
   846
   847#define CALC_81 \
   848	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
   849	PRECALC_33(Y12,Y13) \
   850	CALC_F1_POST(DI,DX,SI)
   851
   852#define CALC_82 \
   853	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
   854	PRECALC_34(Y5) \
   855	CALC_F1_POST(SI,AX,BX)
   856
   857#define CALC_83 \
   858	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
   859	PRECALC_35(Y13) \
   860	CALC_F1_POST(BX,DI,CX)
   861
   862#define CALC_84 \
   863	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
   864	PRECALC_36(Y13) \
   865	CALC_F1_POST(CX,SI,DX)
   866
   867#define CALC_85 \
   868	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
   869	PRECALC_37(Y13) \
   870	CALC_F1_POST(DX,BX,AX)
   871
   872#define CALC_86 \
   873	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
   874	CALC_F1_POST(AX,CX,DI)
   875
   876#define CALC_87 \
   877	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
   878	PRECALC_39(Y13,0x40,0x140) \
   879	CALC_F1_POST(DI,DX,SI)
   880
   881#define CALC_88 \
   882	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
   883	PRECALC_32(Y14,Y13) \
   884	CALC_F1_POST(SI,AX,BX)
   885
   886#define CALC_89 \
   887	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
   888	PRECALC_33(Y8,Y12) \
   889	CALC_F1_POST(BX,DI,CX)
   890
   891#define CALC_90 \
   892	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
   893	PRECALC_34(Y3) \
   894	CALC_F1_POST(CX,SI,DX)
   895
   896#define CALC_91 \
   897	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
   898	PRECALC_35(Y12) \
   899	CALC_F1_POST(DX,BX,AX)
   900
   901#define CALC_92 \
   902	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
   903	PRECALC_36(Y12) \
   904	CALC_F1_POST(AX,CX,DI)
   905
   906#define CALC_93 \
   907	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
   908	PRECALC_37(Y12) \
   909	CALC_F1_POST(DI,DX,SI)
   910
   911#define CALC_94 \
   912	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
   913	CALC_F1_POST(SI,AX,BX)
   914
   915#define CALC_95 \
   916	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
   917	PRECALC_39(Y12,0x40,0x160) \
   918	CALC_F1_POST(BX,DI,CX)
   919
   920#define CALC_96 \
   921	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
   922	PRECALC_32(Y13,Y12) \
   923	CALC_F1_POST(CX,SI,DX)
   924
   925#define CALC_97 \
   926	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
   927	PRECALC_33(Y7,Y8) \
   928	CALC_F1_POST(DX,BX,AX)
   929
   930#define CALC_98 \
   931	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
   932	PRECALC_34(Y15) \
   933	CALC_F1_POST(AX,CX,DI)
   934
   935#define CALC_99 \
   936	CALC_F2_PRE(0x9c,DI,AX,SI) \
   937	PRECALC_35(Y8) \
   938	CALC_F2_POST(DI,DX,CX,SI)
   939
   940#define CALC_100 \
   941	CALC_F2_PRE(0xb0,SI,DI,BX) \
   942	PRECALC_36(Y8) \
   943	CALC_F2_POST(SI,AX,DX,BX)
   944
   945#define CALC_101 \
   946	CALC_F2_PRE(0xb4,BX,SI,CX) \
   947	PRECALC_37(Y8) \
   948	CALC_F2_POST(BX,DI,AX,CX)
   949
   950#define CALC_102 \
   951	CALC_F2_PRE(0xb8,CX,BX,DX) \
   952	CALC_F2_POST(CX,SI,DI,DX)
   953
   954#define CALC_103 \
   955	CALC_F2_PRE(0xbc,DX,CX,AX) \
   956	PRECALC_39(Y8,0x40,0x180) \
   957	CALC_F2_POST(DX,BX,SI,AX)
   958
   959#define CALC_104 \
   960	CALC_F2_PRE(0xd0,AX,DX,DI) \
   961	PRECALC_32(Y12,Y8) \
   962	CALC_F2_POST(AX,CX,BX,DI)
   963
   964#define CALC_105 \
   965	CALC_F2_PRE(0xd4,DI,AX,SI) \
   966	PRECALC_33(Y5,Y7) \
   967	CALC_F2_POST(DI,DX,CX,SI)
   968
   969#define CALC_106 \
   970	CALC_F2_PRE(0xd8,SI,DI,BX) \
   971	PRECALC_34(Y14) \
   972	CALC_F2_POST(SI,AX,DX,BX)
   973
   974#define CALC_107 \
   975	CALC_F2_PRE(0xdc,BX,SI,CX) \
   976	PRECALC_35(Y7) \
   977	CALC_F2_POST(BX,DI,AX,CX)
   978
   979#define CALC_108 \
   980	CALC_F2_PRE(0xf0,CX,BX,DX) \
   981	PRECALC_36(Y7) \
   982	CALC_F2_POST(CX,SI,DI,DX)
   983
   984#define CALC_109 \
   985	CALC_F2_PRE(0xf4,DX,CX,AX) \
   986	PRECALC_37(Y7) \
   987	CALC_F2_POST(DX,BX,SI,AX)
   988
   989#define CALC_110 \
   990	CALC_F2_PRE(0xf8,AX,DX,DI) \
   991	CALC_F2_POST(AX,CX,BX,DI)
   992
   993#define CALC_111 \
   994	CALC_F2_PRE(0xfc,DI,AX,SI) \
   995	PRECALC_39(Y7,0x40,0x1a0) \
   996	CALC_F2_POST(DI,DX,CX,SI)
   997
   998#define CALC_112 \
   999	CALC_F2_PRE(0x110,SI,DI,BX) \
  1000	PRECALC_32(Y8,Y7) \
  1001	CALC_F2_POST(SI,AX,DX,BX)
  1002
  1003#define CALC_113 \
  1004	CALC_F2_PRE(0x114,BX,SI,CX) \
  1005	PRECALC_33(Y3,Y5) \
  1006	CALC_F2_POST(BX,DI,AX,CX)
  1007
  1008#define CALC_114 \
  1009	CALC_F2_PRE(0x118,CX,BX,DX) \
  1010	PRECALC_34(Y13) \
  1011	CALC_F2_POST(CX,SI,DI,DX)
  1012
  1013#define CALC_115 \
  1014	CALC_F2_PRE(0x11c,DX,CX,AX) \
  1015	PRECALC_35(Y5) \
  1016	CALC_F2_POST(DX,BX,SI,AX)
  1017
  1018#define CALC_116 \
  1019	CALC_F2_PRE(0x130,AX,DX,DI) \
  1020	PRECALC_36(Y5) \
  1021	CALC_F2_POST(AX,CX,BX,DI)
  1022
  1023#define CALC_117 \
  1024	CALC_F2_PRE(0x134,DI,AX,SI) \
  1025	PRECALC_37(Y5) \
  1026	CALC_F2_POST(DI,DX,CX,SI)
  1027
  1028#define CALC_118 \
  1029	CALC_F2_PRE(0x138,SI,DI,BX) \
  1030	CALC_F2_POST(SI,AX,DX,BX)
  1031
  1032#define CALC_119 \
  1033	CALC_F3_PRE(0x13c,CX) \
  1034	PRECALC_39(Y5,0x40,0x1c0) \
  1035	CALC_F3_POST(BX,DI,AX,CX,SI)
  1036
  1037#define CALC_120 \
  1038	CALC_F3_PRE(0x150,DX) \
  1039	PRECALC_32(Y7,Y5) \
  1040	CALC_F3_POST(CX,SI,DI,DX,BX)
  1041
  1042#define CALC_121 \
  1043	CALC_F3_PRE(0x154,AX) \
  1044	PRECALC_33(Y15,Y3) \
  1045	CALC_F3_POST(DX,BX,SI,AX,CX)
  1046
  1047#define CALC_122 \
  1048	CALC_F3_PRE(0x158,DI) \
  1049	PRECALC_34(Y12) \
  1050	CALC_F3_POST(AX,CX,BX,DI,DX)
  1051
  1052#define CALC_123 \
  1053	CALC_F3_PRE(0x15c,SI) \
  1054	PRECALC_35(Y3) \
  1055	CALC_F3_POST(DI,DX,CX,SI,AX)
  1056
  1057#define CALC_124 \
  1058	CALC_F3_PRE(0x170,BX) \
  1059	PRECALC_36(Y3) \
  1060	CALC_F3_POST(SI,AX,DX,BX,DI)
  1061
  1062#define CALC_125 \
  1063	CALC_F3_PRE(0x174,CX) \
  1064	PRECALC_37(Y3) \
  1065	CALC_F3_POST(BX,DI,AX,CX,SI)
  1066
  1067#define CALC_126 \
  1068	CALC_F3_PRE(0x178,DX) \
  1069	CALC_F3_POST(CX,SI,DI,DX,BX)
  1070
  1071#define CALC_127 \
  1072	CALC_F3_PRE(0x17c,AX) \
  1073	PRECALC_39(Y3,0x60,0x1e0) \
  1074	CALC_F3_POST(DX,BX,SI,AX,CX)
  1075
  1076#define CALC_128 \
  1077	CALC_F3_PRE(0x190,DI) \
  1078	PRECALC_32(Y5,Y3) \
  1079	CALC_F3_POST(AX,CX,BX,DI,DX)
  1080
  1081#define CALC_129 \
  1082	CALC_F3_PRE(0x194,SI) \
  1083	PRECALC_33(Y14,Y15) \
  1084	CALC_F3_POST(DI,DX,CX,SI,AX)
  1085
  1086#define CALC_130 \
  1087	CALC_F3_PRE(0x198,BX) \
  1088	PRECALC_34(Y8) \
  1089	CALC_F3_POST(SI,AX,DX,BX,DI)
  1090
  1091#define CALC_131 \
  1092	CALC_F3_PRE(0x19c,CX) \
  1093	PRECALC_35(Y15) \
  1094	CALC_F3_POST(BX,DI,AX,CX,SI)
  1095
  1096#define CALC_132 \
  1097	CALC_F3_PRE(0x1b0,DX) \
  1098	PRECALC_36(Y15) \
  1099	CALC_F3_POST(CX,SI,DI,DX,BX)
  1100
  1101#define CALC_133 \
  1102	CALC_F3_PRE(0x1b4,AX) \
  1103	PRECALC_37(Y15) \
  1104	CALC_F3_POST(DX,BX,SI,AX,CX)
  1105
  1106#define CALC_134 \
  1107	CALC_F3_PRE(0x1b8,DI) \
  1108	CALC_F3_POST(AX,CX,BX,DI,DX)
  1109
  1110#define CALC_135 \
  1111	CALC_F3_PRE(0x1bc,SI) \
  1112	PRECALC_39(Y15,0x60,0x200) \
  1113	CALC_F3_POST(DI,DX,CX,SI,AX)
  1114
  1115#define CALC_136 \
  1116	CALC_F3_PRE(0x1d0,BX) \
  1117	PRECALC_32(Y3,Y15) \
  1118	CALC_F3_POST(SI,AX,DX,BX,DI)
  1119
  1120#define CALC_137 \
  1121	CALC_F3_PRE(0x1d4,CX) \
  1122	PRECALC_33(Y13,Y14) \
  1123	CALC_F3_POST(BX,DI,AX,CX,SI)
  1124
  1125#define CALC_138 \
  1126	CALC_F3_PRE(0x1d8,DX) \
  1127	PRECALC_34(Y7) \
  1128	CALC_F3_POST(CX,SI,DI,DX,BX)
  1129
  1130#define CALC_139 \
  1131	CALC_F2_PRE(0x1dc,DX,CX,AX) \
  1132	PRECALC_35(Y14) \
  1133	CALC_F2_POST(DX,BX,SI,AX)
  1134
  1135#define CALC_140 \
  1136	CALC_F2_PRE(0x1f0,AX,DX,DI) \
  1137	PRECALC_36(Y14) \
  1138	CALC_F2_POST(AX,CX,BX,DI)
  1139
  1140#define CALC_141 \
  1141	CALC_F2_PRE(0x1f4,DI,AX,SI) \
  1142	PRECALC_37(Y14) \
  1143	CALC_F2_POST(DI,DX,CX,SI)
  1144
  1145#define CALC_142 \
  1146	CALC_F2_PRE(0x1f8,SI,DI,BX) \
  1147	CALC_F2_POST(SI,AX,DX,BX)
  1148
  1149#define CALC_143 \
  1150	CALC_F2_PRE(0x1fc,BX,SI,CX) \
  1151	PRECALC_39(Y14,0x60,0x220) \
  1152	CALC_F2_POST(BX,DI,AX,CX)
  1153
  1154#define CALC_144 \
  1155	CALC_F2_PRE(0x210,CX,BX,DX) \
  1156	PRECALC_32(Y15,Y14) \
  1157	CALC_F2_POST(CX,SI,DI,DX)
  1158
  1159#define CALC_145 \
  1160	CALC_F2_PRE(0x214,DX,CX,AX) \
  1161	PRECALC_33(Y12,Y13) \
  1162	CALC_F2_POST(DX,BX,SI,AX)
  1163
  1164#define CALC_146 \
  1165	CALC_F2_PRE(0x218,AX,DX,DI) \
  1166	PRECALC_34(Y5) \
  1167	CALC_F2_POST(AX,CX,BX,DI)
  1168
  1169#define CALC_147 \
  1170	CALC_F2_PRE(0x21c,DI,AX,SI) \
  1171	PRECALC_35(Y13) \
  1172	CALC_F2_POST(DI,DX,CX,SI)
  1173
  1174#define CALC_148 \
  1175	CALC_F2_PRE(0x230,SI,DI,BX) \
  1176	PRECALC_36(Y13) \
  1177	CALC_F2_POST(SI,AX,DX,BX)
  1178
  1179#define CALC_149 \
  1180	CALC_F2_PRE(0x234,BX,SI,CX) \
  1181	PRECALC_37(Y13) \
  1182	CALC_F2_POST(BX,DI,AX,CX)
  1183
  1184#define CALC_150 \
  1185	CALC_F2_PRE(0x238,CX,BX,DX) \
  1186	CALC_F2_POST(CX,SI,DI,DX)
  1187
  1188#define CALC_151 \
  1189	CALC_F2_PRE(0x23c,DX,CX,AX) \
  1190	PRECALC_39(Y13,0x60,0x240) \
  1191	CALC_F2_POST(DX,BX,SI,AX)
  1192
  1193#define CALC_152 \
  1194	CALC_F2_PRE(0x250,AX,DX,DI) \
  1195	PRECALC_32(Y14,Y13) \
  1196	CALC_F2_POST(AX,CX,BX,DI)
  1197
  1198#define CALC_153 \
  1199	CALC_F2_PRE(0x254,DI,AX,SI) \
  1200	PRECALC_33(Y8,Y12) \
  1201	CALC_F2_POST(DI,DX,CX,SI)
  1202
  1203#define CALC_154 \
  1204	CALC_F2_PRE(0x258,SI,DI,BX) \
  1205	PRECALC_34(Y3) \
  1206	CALC_F2_POST(SI,AX,DX,BX)
  1207
  1208#define CALC_155 \
  1209	CALC_F2_PRE(0x25c,BX,SI,CX) \
  1210	PRECALC_35(Y12) \
  1211	CALC_F2_POST(BX,DI,AX,CX)
  1212
  1213#define CALC_156 \
  1214	CALC_F2_PRE(0x270,CX,BX,DX) \
  1215	PRECALC_36(Y12) \
  1216	CALC_F2_POST(CX,SI,DI,DX)
  1217
  1218#define CALC_157 \
  1219	CALC_F2_PRE(0x274,DX,CX,AX) \
  1220	PRECALC_37(Y12) \
  1221	CALC_F2_POST(DX,BX,SI,AX)
  1222
  1223#define CALC_158 \
  1224	CALC_F2_PRE(0x278,AX,DX,DI) \
  1225	CALC_F2_POST(AX,CX,BX,DI)
  1226
  1227#define CALC_159 \
  1228	ADDL 0x27c(R15),SI \
  1229	LEAL (SI)(AX*1), SI \
  1230	RORXL $0x1b, DI, R12 \
  1231	PRECALC_39(Y12,0x60,0x260) \
  1232	ADDL R12, SI
  1233
  1234
  1235
  1236#define CALC \
  1237	MOVL	(R9), CX \
  1238	MOVL	4(R9), SI \
  1239	MOVL	8(R9), DI \
  1240	MOVL	12(R9), AX \
  1241	MOVL	16(R9), DX \
  1242	MOVQ    SP, R14 \
  1243	LEAQ    (2*4*80+32)(SP), R15 \
  1244	PRECALC \ // Precalc WK for first 2 blocks
  1245	XCHGQ   R15, R14 \
   1246loop: \  // this loop is unrolled
   1247	CMPQ    R10, R8 \ // we use the R8 value (set below) as a signal of the last block
  1248	JNE	begin \
  1249	VZEROUPPER \
  1250	RET \
  1251begin: \
  1252	CALC_0 \
  1253	CALC_1 \
  1254	CALC_2 \
  1255	CALC_3 \
  1256	CALC_4 \
  1257	CALC_5 \
  1258	CALC_6 \
  1259	CALC_7 \
  1260	CALC_8 \
  1261	CALC_9 \
  1262	CALC_10 \
  1263	CALC_11 \
  1264	CALC_12 \
  1265	CALC_13 \
  1266	CALC_14 \
  1267	CALC_15 \
  1268	CALC_16 \
  1269	CALC_17 \
  1270	CALC_18 \
  1271	CALC_19 \
  1272	CALC_20 \
  1273	CALC_21 \
  1274	CALC_22 \
  1275	CALC_23 \
  1276	CALC_24 \
  1277	CALC_25 \
  1278	CALC_26 \
  1279	CALC_27 \
  1280	CALC_28 \
  1281	CALC_29 \
  1282	CALC_30 \
  1283	CALC_31 \
  1284	CALC_32 \
  1285	CALC_33 \
  1286	CALC_34 \
  1287	CALC_35 \
  1288	CALC_36 \
  1289	CALC_37 \
  1290	CALC_38 \
  1291	CALC_39 \
  1292	CALC_40 \
  1293	CALC_41 \
  1294	CALC_42 \
  1295	CALC_43 \
  1296	CALC_44 \
  1297	CALC_45 \
  1298	CALC_46 \
  1299	CALC_47 \
  1300	CALC_48 \
  1301	CALC_49 \
  1302	CALC_50 \
  1303	CALC_51 \
  1304	CALC_52 \
  1305	CALC_53 \
  1306	CALC_54 \
  1307	CALC_55 \
  1308	CALC_56 \
  1309	CALC_57 \
  1310	CALC_58 \
  1311	CALC_59 \
  1312	ADDQ $128, R10 \ // move to next even-64-byte block
  1313	CMPQ R10, R11 \ // is current block the last one?
  1314	CMOVQCC R8, R10 \ // signal the last iteration smartly
  1315	CALC_60 \
  1316	CALC_61 \
  1317	CALC_62 \
  1318	CALC_63 \
  1319	CALC_64 \
  1320	CALC_65 \
  1321	CALC_66 \
  1322	CALC_67 \
  1323	CALC_68 \
  1324	CALC_69 \
  1325	CALC_70 \
  1326	CALC_71 \
  1327	CALC_72 \
  1328	CALC_73 \
  1329	CALC_74 \
  1330	CALC_75 \
  1331	CALC_76 \
  1332	CALC_77 \
  1333	CALC_78 \
  1334	CALC_79 \
  1335	UPDATE_HASH(AX,DX,BX,SI,DI) \
  1336	CMPQ R10, R8 \ // is current block the last one?
  1337	JE loop\
  1338	MOVL DX, CX \
  1339	CALC_80 \
  1340	CALC_81 \
  1341	CALC_82 \
  1342	CALC_83 \
  1343	CALC_84 \
  1344	CALC_85 \
  1345	CALC_86 \
  1346	CALC_87 \
  1347	CALC_88 \
  1348	CALC_89 \
  1349	CALC_90 \
  1350	CALC_91 \
  1351	CALC_92 \
  1352	CALC_93 \
  1353	CALC_94 \
  1354	CALC_95 \
  1355	CALC_96 \
  1356	CALC_97 \
  1357	CALC_98 \
  1358	CALC_99 \
  1359	CALC_100 \
  1360	CALC_101 \
  1361	CALC_102 \
  1362	CALC_103 \
  1363	CALC_104 \
  1364	CALC_105 \
  1365	CALC_106 \
  1366	CALC_107 \
  1367	CALC_108 \
  1368	CALC_109 \
  1369	CALC_110 \
  1370	CALC_111 \
  1371	CALC_112 \
  1372	CALC_113 \
  1373	CALC_114 \
  1374	CALC_115 \
  1375	CALC_116 \
  1376	CALC_117 \
  1377	CALC_118 \
  1378	CALC_119 \
  1379	CALC_120 \
  1380	CALC_121 \
  1381	CALC_122 \
  1382	CALC_123 \
  1383	CALC_124 \
  1384	CALC_125 \
  1385	CALC_126 \
  1386	CALC_127 \
  1387	CALC_128 \
  1388	CALC_129 \
  1389	CALC_130 \
  1390	CALC_131 \
  1391	CALC_132 \
  1392	CALC_133 \
  1393	CALC_134 \
  1394	CALC_135 \
  1395	CALC_136 \
  1396	CALC_137 \
  1397	CALC_138 \
  1398	CALC_139 \
   1399	ADDQ $128, R13 \ // move to next even-64-byte block
   1400	CMPQ R13, R11 \ // is current block the last one?
  1401	CMOVQCC R8, R10 \
  1402	CALC_140 \
  1403	CALC_141 \
  1404	CALC_142 \
  1405	CALC_143 \
  1406	CALC_144 \
  1407	CALC_145 \
  1408	CALC_146 \
  1409	CALC_147 \
  1410	CALC_148 \
  1411	CALC_149 \
  1412	CALC_150 \
  1413	CALC_151 \
  1414	CALC_152 \
  1415	CALC_153 \
  1416	CALC_154 \
  1417	CALC_155 \
  1418	CALC_156 \
  1419	CALC_157 \
  1420	CALC_158 \
  1421	CALC_159 \
  1422	UPDATE_HASH(SI,DI,DX,CX,BX) \
   1423	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
  1424	MOVL	DI, SI \
  1425	MOVL	DX, DI \
  1426	MOVL	BX, DX \
  1427	MOVL	CX, AX \
  1428	MOVL	R12, CX \
  1429	XCHGQ   R15, R14 \
  1430	JMP     loop
  1431
  1432
  1433
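// blockAVX2 processes the input two 64-byte blocks at a time:
//	R8  - pointer to the replicated K constants (also reused as the
//	      "past the end" marker for the block pointers)
//	R9  - pointer to the digest state
//	R10, R13 - pointers to the two blocks currently being scheduled
//	R11 - limit (end of data + 64) used to detect the last block
//	Y10 - VPSHUFB byte-swap mask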
  1434TEXT ·blockAVX2(SB),$1408-32
  1435
  1436	MOVQ	dig+0(FP),	DI
  1437	MOVQ	p_base+8(FP),	SI
  1438	MOVQ	p_len+16(FP),	DX
  1439	SHRQ	$6,		DX
  1440	SHLQ	$6,		DX
  1441
  1442	MOVQ	$K_XMM_AR<>(SB), R8
  1443
  1444	MOVQ	DI, R9
  1445	MOVQ	SI, R10
  1446	LEAQ	64(SI), R13
  1447
  1448	ADDQ	SI, DX
  1449	ADDQ	$64, DX
  1450	MOVQ	DX, R11
  1451
  1452	CMPQ	R13, R11
  1453	CMOVQCC	R8, R13
  1454
  1455	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
  1456
  1457	CALC // RET is inside macros
  1458
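// K_XMM_AR holds each of the four SHA-1 round constants repeated
// eight times, so a single VPADDD adds K to a full YMM register
// (four schedule words from each of the two blocks being processed).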
  1459DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
  1460DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
  1461DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
  1462DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
  1463DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
  1464DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
  1465DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
  1466DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
  1467DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
  1468DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
  1469DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
  1470DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
  1471DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
  1472DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
  1473DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
  1474DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
  1475DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
  1476DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
  1477DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
  1478DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
  1479DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
  1480DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
  1481DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
  1482DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
  1483DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
  1484DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
  1485DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
  1486DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
  1487DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
  1488DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
  1489DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
  1490DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
  1491GLOBL K_XMM_AR<>(SB),RODATA,$128
  1492
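// BSWAP_SHUFB_CTL is the VPSHUFB shuffle mask that byte-swaps each
// 32-bit lane, converting the big-endian message words to host order
// before the arithmetic above.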
  1493DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
  1494DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
  1495DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
  1496DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
  1497DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
  1498DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
  1499DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
  1500DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
  1501GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
