// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// RISCV64 version of md5block.go
// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go

//go:build !purego

#include "textflag.h"

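// LOAD32U loads the unaligned little-endian 32-bit value at offset(base)
// into dest one byte at a time, using tmp as a scratch register.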
#define LOAD32U(base, offset, tmp, dest) \
	MOVBU	(offset+0*1)(base), dest; \
	MOVBU	(offset+1*1)(base), tmp; \
	SLL	$8, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+2*1)(base), tmp; \
	SLL	$16, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+3*1)(base), tmp; \
	SLL	$24, tmp; \
	OR	tmp, dest

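// LOAD64U loads the unaligned little-endian 64-bit value at offset(base)
// into dst as two unaligned 32-bit loads, so dst ends up holding two
// consecutive message words with the odd-indexed word in its upper 32 bits.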
#define LOAD64U(base, offset, tmp1, tmp2, dst) \
	LOAD32U(base, offset, tmp1, dst); \
	LOAD32U(base, offset+4, tmp1, tmp2); \
	SLL	$32, tmp2; \
	OR	tmp2, dst

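// ROUND1EVN and ROUND1ODD perform a single round 1 MD5 step,
//
//	a = b + ((a + F(b,c,d) + x + const) <<< shift)
//
// where F(b,c,d) = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d.
// The EVN variant takes its message word from the low 32 bits of x, the
// ODD variant from the upper 32 bits.  The left rotate by shift is
// implemented as a 32-bit right rotate by 32-shift.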
#define ROUND1EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND1ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

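// ROUND2EVN and ROUND2ODD perform a single round 2 MD5 step.  They follow
// the same pattern as the round 1 macros but use
// G(b,c,d) = (b & d) | (c & ~d), computed here as ((b ^ c) & d) ^ c.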
#define ROUND2EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND2ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

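// ROUND3EVN and ROUND3ODD perform a single round 3 MD5 step using
// H(b,c,d) = b ^ c ^ d.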
#define ROUND3EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND3ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

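// ROUND4EVN and ROUND4ODD perform a single round 4 MD5 step using
// I(b,c,d) = c ^ (b | ~d), with b | ~d computed by a single ORN.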
#define ROUND4EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND4ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

// Register use for the block function
//
// X5 - X12	: contain the sixteen 32-bit message words of the block we're
//		  processing.  Odd-numbered words, e.g., x1 and x3, are stored
//		  in the upper 32 bits of each register.
// X13 - X16	: a, b, c, d
// X17 - X20	: used to store the old values of a, b, c, d, i.e., aa, bb, cc,
//		  dd.  X17 and X18 are also used as temporary registers when
//		  loading unaligned data.
// X22		: pointer to dig.s
// X23		: temporary register
// X28		: pointer to the first byte beyond the last full 64-byte block
//		  of p
// X29		: pointer to the current 64-byte block of data, initially set
//		  to &p[0]
// X30		: temporary register

TEXT	·block(SB),NOSPLIT,$0-32
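	// X29 = &p[0], X30 = len(p) rounded down to a multiple of 64,
	// the MD5 block size.  If there is no full block to process,
	// return immediately.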
	MOV	p+8(FP), X29
	MOV	p_len+16(FP), X30
	SRL	$6, X30
	SLL	$6, X30
	BEQZ	X30, zero

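	// X28 points to the first byte after the last full block.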
	ADD	X29, X30, X28

	MOV	dig+0(FP), X22
	MOVWU	(0*4)(X22), X13	// a = s[0]
	MOVWU	(1*4)(X22), X14	// b = s[1]
	MOVWU	(2*4)(X22), X15	// c = s[2]
	MOVWU	(3*4)(X22), X16	// d = s[3]

loop:

	// Load the 64 bytes of data, message words x0-x15, into the eight
	// 64-bit registers X5-X12.  Different paths are taken to load the
	// values depending on whether the buffer is 8-byte aligned or not.
	// We load all the values up front here at the start of the loop to
	// avoid multiple alignment checks and to reduce code size.  It takes
	// 10 instructions to load an unaligned 32-bit value, and each value
	// is used four times in the main body of the loop below.

	AND	$7, X29, X30
	BEQZ	X30, aligned

	LOAD64U(X29,0, X17, X18, X5)
	LOAD64U(X29,8, X17, X18, X6)
	LOAD64U(X29,16, X17, X18, X7)
	LOAD64U(X29,24, X17, X18, X8)
	LOAD64U(X29,32, X17, X18, X9)
	LOAD64U(X29,40, X17, X18, X10)
	LOAD64U(X29,48, X17, X18, X11)
	LOAD64U(X29,56, X17, X18, X12)
	JMP block_loaded

aligned:
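	// The buffer is 8-byte aligned.  Each aligned 64-bit load picks up
	// two consecutive little-endian 32-bit message words at once.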
	MOV	(0*8)(X29), X5
	MOV	(1*8)(X29), X6
	MOV	(2*8)(X29), X7
	MOV	(3*8)(X29), X8
	MOV	(4*8)(X29), X9
	MOV	(5*8)(X29), X10
	MOV	(6*8)(X29), X11
	MOV	(7*8)(X29), X12

block_loaded:
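	// Save a, b, c and d so they can be added back in after the rounds.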
	MOV	X13, X17
	MOV	X14, X18
	MOV	X15, X19
	MOV	X16, X20

	// Some of the hex constants below are too large to fit into a
	// signed 32-bit value.  The assembler handles such constants in a
	// special way to ensure that they are zero-extended.  Our algorithm
	// only cares about the bottom 32 bits and doesn't mind whether
	// constants are sign- or zero-extended when moved into 64-bit
	// registers.  So when bit 31 is set we write the constant as a
	// signed decimal value instead of hex, allowing every constant to
	// be loaded with a single lui+addi pair.

	ROUND1EVN(X13,X14,X15,X16,X5,  -680876936, 7); // 0xd76aa478
	ROUND1ODD(X16,X13,X14,X15,X5,  -389564586,12); // 0xe8c7b756
	ROUND1EVN(X15,X16,X13,X14,X6,  0x242070db,17); // 0x242070db
	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
	ROUND1EVN(X13,X14,X15,X16,X7,  -176418897, 7); // 0xf57c0faf
	ROUND1ODD(X16,X13,X14,X15,X7,  0x4787c62a,12); // 0x4787c62a
	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
	ROUND1ODD(X14,X15,X16,X13,X8,   -45705983,22); // 0xfd469501
	ROUND1EVN(X13,X14,X15,X16,X9,  0x698098d8, 7); // 0x698098d8
	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
	ROUND1EVN(X15,X16,X13,X14,X10,     -42063,17); // 0xffff5bb1
	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
	ROUND1ODD(X16,X13,X14,X15,X11,  -40341101,12); // 0xfd987193
	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821

	ROUND2ODD(X13,X14,X15,X16,X5,  -165796510, 5); // f61e2562
	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340
	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51
	ROUND2EVN(X14,X15,X16,X13,X5,  -373897302,20); // e9b6c7aa
	ROUND2ODD(X13,X14,X15,X16,X7,  -701558691, 5); // d62f105d
	ROUND2EVN(X16,X13,X14,X15,X10,  0x2441453, 9); // 2441453
	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681
	ROUND2EVN(X14,X15,X16,X13,X7,  -405537848,20); // e7d3fbc8
	ROUND2ODD(X13,X14,X15,X16,X9,  0x21e1cde6, 5); // 21e1cde6
	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6
	ROUND2ODD(X15,X16,X13,X14,X6,  -187363961,14); // f4d50d87
	ROUND2EVN(X14,X15,X16,X13,X9,  0x455a14ed,20); // 455a14ed
	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905
	ROUND2EVN(X16,X13,X14,X15,X6,   -51403784, 9); // fcefa3f8
	ROUND2ODD(X15,X16,X13,X14,X8,  0x676f02d9,14); // 676f02d9
	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a

	ROUND3ODD(X13,X14,X15,X16,X7,     -378558, 4); // fffa3942
	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681
	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122
	ROUND3EVN(X14,X15,X16,X13,X12,  -35309556,23); // fde5380c
	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44
	ROUND3EVN(X16,X13,X14,X15,X7,  0x4bdecfa9,11); // 4bdecfa9
	ROUND3ODD(X15,X16,X13,X14,X8,  -155497632,16); // f6bb4b60
	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70
	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6
	ROUND3EVN(X16,X13,X14,X15,X5,  -358537222,11); // eaa127fa
	ROUND3ODD(X15,X16,X13,X14,X6,  -722521979,16); // d4ef3085
	ROUND3EVN(X14,X15,X16,X13,X8,   0x4881d05,23); // 4881d05
	ROUND3ODD(X13,X14,X15,X16,X9,  -640364487, 4); // d9d4d039
	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5
	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8
	ROUND3EVN(X14,X15,X16,X13,X6,  -995338651,23); // c4ac5665

	ROUND4EVN(X13,X14,X15,X16,X5,  -198630844, 6); // f4292244
	ROUND4ODD(X16,X13,X14,X15,X8,  0x432aff97,10); // 432aff97
	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7
	ROUND4ODD(X14,X15,X16,X13,X7,   -57434055,21); // fc93a039
	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3
	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92
	ROUND4EVN(X15,X16,X13,X14,X10,   -1051523,15); // ffeff47d
	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1
	ROUND4EVN(X13,X14,X15,X16,X9,  0x6fa87e4f, 6); // 6fa87e4f
	ROUND4ODD(X16,X13,X14,X15,X12,  -30611744,10); // fe2ce6e0
	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314
	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1
	ROUND4EVN(X13,X14,X15,X16,X7,  -145523070, 6); // f7537e82
	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235
	ROUND4EVN(X15,X16,X13,X14,X6,  0x2ad7d2bb,15); // 2ad7d2bb
	ROUND4ODD(X14,X15,X16,X13,X9,  -343485551,21); // eb86d391

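	// Add in the values of a, b, c and d saved at the start of the block.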
	ADDW	X17, X13
	ADDW	X18, X14
	ADDW	X19, X15
	ADDW	X20, X16

	ADD	$64, X29
	BNE	X28, X29, loop

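	// Store the updated state back into dig.s.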
	MOVW	X13, (0*4)(X22)
	MOVW	X14, (1*4)(X22)
	MOVW	X15, (2*4)(X22)
	MOVW	X16, (3*4)(X22)

zero:
	RET