// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
//go:build gc && !purego

#include "textflag.h"

// Register aliases for the function arguments (see chaCha20_ctr32_vsx below).
#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TMP R15

// CONSTBASE points into the consts<> table; BLOCKS = LEN / 64.
#define CONSTBASE R16
#define BLOCKS R17

// for VPERMXOR
#define MASK R18
38
// ChaCha20 constant table (little-endian word order).
// 0x00: sigma "expand 32-byte k" as four 32-bit words
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
// 0x10: counter increment of 1 (used when advancing a single block)
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
// 0x20: counter increment of 4 (four blocks per outer iteration)
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
// 0x30: byte-permutation masks (rotate-left shuffles)
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
// 0x50: sigma words splatted per lane ("expa", "nd 3", "2-by", "te k")
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
// 0x90: per-lane counter offsets {0,1,2,3}
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
// 0xa0: VPERMXOR masks implementing 32-bit rotates by 16 and by 8
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0
64
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
// XORs len bytes of inp with the ChaCha20 keystream derived from key/counter
// and writes the result to out, then advances *counter by len/64 blocks.
// Four 64-byte blocks are computed per outer-loop iteration: state words are
// splatted across vector lanes (V0-V15 each hold one state word for 4 blocks),
// with V26 carrying the per-lane block counters.
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

	// 20 rounds = 10 double-rounds per block
	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values (rotate counts for VRLW)
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
	PCALIGN $16
loop_vsx:
	// Column round: a += b; d ^= a; d <<<= 16 (via VPERMXOR byte shuffle)
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	// c += d; b ^= c; b <<<= 12
	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	// a += b; d ^= a; d <<<= 8 (via VPERMXOR)
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	// c += d; b ^= c; b <<<= 7
	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// Diagonal round (operands rotated across b/c/d)
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ loop_vsx

	// Add per-lane counters back into word 12.
	VADDUWM V12, V26, V12

	// Transpose: gather word i of each of the 4 blocks into one vector.
	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

	// Advance the lane counters by 4 for the next outer iteration.
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// Feed-forward: add the initial state to block 0's output.
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Second unrolled 64-byte block.
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	// Fix: index register must be R10 (offset 48), matching the parallel
	// store groups above/below; V10 is a vector register and is invalid here.
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Third unrolled 64-byte block.
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Fourth unrolled 64-byte block.
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	MOVD (CNT), R14
	ADD BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
	// Partial final block: spill keystream (V0,V4,V8,V12) to the stack
	// and XOR byte-by-byte.
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BDNZ looptail_vsx

	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx