1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build gc && !purego
6
7#include "textflag.h"
8
// NUM_ROUNDS is the iteration count of the chacha loop in xorKeyStreamVX.
// Each iteration performs one column round followed by one diagonal round,
// so 10 iterations give 20 rounds in total.
#define NUM_ROUNDS 10
10
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha keystream and writes the result to dst.
// Input is consumed in chunks of 256 bytes (four 64-byte blocks computed in
// parallel, one block per 32-bit vector lane).
//
//   dst, src  Only the largest multiple of 256 bytes not exceeding len(src)
//             is processed; any remainder is left untouched and a src shorter
//             than 256 bytes is a no-op. len(dst) is never read, so the
//             caller must provide at least as many bytes in dst.
//   key       Eight 32-bit key words (state words 4..11).
//   nonce     Three 32-bit nonce words (state words 13..15).
//   counter   32-bit block counter (state word 12). It is read on entry and
//             written back, advanced by 4, after every 256-byte chunk.
//
// Register roles:
//   R1  dst cursor            R2  src cursor          R12 src end (full chunks)
//   R4  key                   R6  nonce               R7  counter pointer
//   R10 ·constants            R11 ·incRotMatrix
//   R20 running counter       R21 remaining loop iterations
//   V0..V15   working state, word i of all four blocks held in Vi
//   V16..V29  scratch (rotate temporaries, reloaded input state, src data)
//   V30       lane increments {0,1,2,3} while the rounds run
//   V31       byte-shuffle table implementing rotate-left-by-8
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	// R13 = len(src) rounded down to a multiple of 256; R12 = end of that span.
	AND	$~255, R3, R13
	ADD	R2, R13, R12
	// Nothing to do without at least one full 256-byte chunk. Without this
	// check the loop body would still run once and overrun both slices.
	CBZ	R13, done

loop:
	MOVD	$NUM_ROUNDS, R21
	// V30 and V31 are overwritten further down in every iteration, so they
	// are reloaded here: V30 = lane increments, V31 = rotate-by-8 table.
	VLD1	(R11), [V30.S4, V31.S4]

	// load constants (each word replicated across all four lanes)
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// Both loads advance R4 by 16; the SUB below puts R4 back.
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// give each lane its own block counter: counter+0 .. counter+3
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// ---- column round ----
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// swapping the halfwords of each word is a rotate by 16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	// rotate left by 12: shift left 12, then insert the top 12 bits at the bottom
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// rotate left by 8 as a byte shuffle through the table in V31
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// ---- diagonal round ----
	// Same four steps, with the operand registers chosen along the
	// diagonals (V0,V5,V10,V15), (V1,V6,V11,V12), (V2,V7,V8,V13),
	// (V3,V4,V9,V14) instead of shuffling the state.
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// ---- add the input state back onto the working state ----
	// The input words are reloaded from memory into V16..V31 rather than
	// kept live across the rounds.
	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	// The per-lane counter offsets must be added now: V30 is about to be
	// overwritten by the nonce load, and V28 below only holds the
	// un-offset counter replicated into every lane.
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// ---- transpose from word-per-register to block-per-register ----
	// Interleave 32-bit lanes first, then 64-bit halves. Afterwards
	// V0..V3 hold block 0, V4..V7 block 1, V8..V11 block 2 and
	// V12..V15 block 3. The src loads are interleaved with the second
	// stage as soon as their destination registers become free.
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]

	// ---- dst = src XOR keystream, 64 bytes per store ----
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four blocks were consumed. The counter is stored back every
	// iteration because the next pass reloads it from memory.
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// R2 and R12 are addresses, so compare them unsigned.
	CMP	R2, R12
	BHI	loop

done:
	RET
291
292
// ·constants holds the four fixed 32-bit words that xorKeyStreamVX loads
// into state words 0..3. Stored little-endian, the 16 bytes spell the ASCII
// string "expand 32-byte k". The symbol is declared as 32 bytes although
// only these 16 are initialised here.
DATA	·constants+0(SB)/4, $0x61707865	// "expa"
DATA	·constants+4(SB)/4, $0x3320646e	// "nd 3"
DATA	·constants+8(SB)/4, $0x79622d32	// "2-by"
DATA	·constants+12(SB)/4, $0x6b206574	// "te k"
GLOBL	·constants(SB), NOPTR|RODATA, $32
298
// ·incRotMatrix is two 16-byte vectors that xorKeyStreamVX loads together
// into V30 and V31.
//
// Bytes 0..15: the 32-bit lane values {0, 1, 2, 3}, added to the replicated
// block counter so each lane works on a consecutive block.
DATA	·incRotMatrix+0(SB)/4, $0x00000000
DATA	·incRotMatrix+4(SB)/4, $0x00000001
DATA	·incRotMatrix+8(SB)/4, $0x00000002
DATA	·incRotMatrix+12(SB)/4, $0x00000003
// Bytes 16..31: byte indices for VTBL. Within every 32-bit word the output
// bytes are taken from source bytes 3,0,1,2, which rotates the word left
// by 8 bits.
DATA	·incRotMatrix+16(SB)/4, $0x02010003
DATA	·incRotMatrix+20(SB)/4, $0x06050407
DATA	·incRotMatrix+24(SB)/4, $0x0A09080B
DATA	·incRotMatrix+28(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32
View as plain text