// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

10// 4K (smallest case) page size offset mask for PPC64.
11#define PAGE_OFFSET 4095
12
13// Likewise, the BC opcode is hard to read, and no extended
14// mnemonics are offered for these forms.
15#define BGELR_CR6 BC 4, CR6LT, (LR)
16#define BEQLR BC 12, CR0EQ, (LR)
17
18// memequal(a, b unsafe.Pointer, size uintptr) bool
19TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
20 // R3 = a
21 // R4 = b
22 // R5 = size
23 BR memeqbody<>(SB)
24
25// memequal_varlen(a, b unsafe.Pointer) bool
26TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
27 // R3 = a
28 // R4 = b
29 CMP R3, R4
30 BEQ eq
31 MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
32 BR memeqbody<>(SB)
33eq:
34 MOVD $1, R3
35 RET
36
37// Do an efficient memequal for ppc64
38// R3 = s1
39// R4 = s2
40// R5 = len
41// On exit:
42// R3 = return value
43TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
44 MOVD R3, R8 // Move s1 into R8
45 ADD R5, R3, R9 // &s1[len(s1)]
46 ADD R5, R4, R10 // &s2[len(s2)]
47 MOVD $1, R11
48 CMP R5, $16 // Use GPR checks for check for len <= 16
49 BLE check0_16
50 MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
51 CMP R5, $32 // Use overlapping VSX loads for len <= 32
52 BLE check17_32 // Do a pair of overlapping VSR compares
53 CMP R5, $64
54 BLE check33_64 // Hybrid check + overlap compare.
55
56setup64:
57 SRD $6, R5, R6 // number of 64 byte chunks to compare
58 MOVD R6, CTR
59 MOVD $16, R14 // index for VSX loads and stores
60 MOVD $32, R15
61 MOVD $48, R16
62 ANDCC $0x3F, R5, R5 // len%64==0?
63
64 PCALIGN $16
65loop64:
66 LXVD2X (R8+R0), V0
67 LXVD2X (R4+R0), V1
68 VCMPEQUBCC V0, V1, V2 // compare, setting CR6
69 BGELR_CR6
70 LXVD2X (R8+R14), V0
71 LXVD2X (R4+R14), V1
72 VCMPEQUBCC V0, V1, V2
73 BGELR_CR6
74 LXVD2X (R8+R15), V0
75 LXVD2X (R4+R15), V1
76 VCMPEQUBCC V0, V1, V2
77 BGELR_CR6
78 LXVD2X (R8+R16), V0
79 LXVD2X (R4+R16), V1
80 VCMPEQUBCC V0, V1, V2
81 BGELR_CR6
82 ADD $64,R8 // bump up to next 64
83 ADD $64,R4
84 BDNZ loop64
85
86 ISEL CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
87 BEQLR // return if no tail.
88
89 ADD $-64, R9, R8
90 ADD $-64, R10, R4
91 LXVD2X (R8+R0), V0
92 LXVD2X (R4+R0), V1
93 VCMPEQUBCC V0, V1, V2
94 BGELR_CR6
95 LXVD2X (R8+R14), V0
96 LXVD2X (R4+R14), V1
97 VCMPEQUBCC V0, V1, V2
98 BGELR_CR6
99 LXVD2X (R8+R15), V0
100 LXVD2X (R4+R15), V1
101 VCMPEQUBCC V0, V1, V2
102 BGELR_CR6
103 LXVD2X (R8+R16), V0
104 LXVD2X (R4+R16), V1
105 VCMPEQUBCC V0, V1, V2
106 ISEL CR6LT, R11, R0, R3
107 RET
108
109check33_64:
110 // Bytes 0-15
111 LXVD2X (R8+R0), V0
112 LXVD2X (R4+R0), V1
113 VCMPEQUBCC V0, V1, V2
114 BGELR_CR6
115 ADD $16, R8
116 ADD $16, R4
117
118 // Bytes 16-31
119 LXVD2X (R8+R0), V0
120 LXVD2X (R4+R0), V1
121 VCMPEQUBCC V0, V1, V2
122 BGELR_CR6
123
124 // A little tricky, but point R4,R8 to &sx[len-32],
125 // and reuse check17_32 to check the next 1-31 bytes (with some overlap)
126 ADD $-32, R9, R8
127 ADD $-32, R10, R4
128 // Fallthrough
129
130check17_32:
131 LXVD2X (R8+R0), V0
132 LXVD2X (R4+R0), V1
133 VCMPEQUBCC V0, V1, V2
134 ISEL CR6LT, R11, R0, R5
135
136 // Load sX[len(sX)-16:len(sX)] and compare.
137 ADD $-16, R9
138 ADD $-16, R10
139 LXVD2X (R9+R0), V0
140 LXVD2X (R10+R0), V1
141 VCMPEQUBCC V0, V1, V2
142 ISEL CR6LT, R5, R0, R3
143 RET
144
145check0_16:
146#ifdef GOPPC64_power10
147 SLD $56, R5, R7
148 LXVL R8, R7, V0
149 LXVL R4, R7, V1
150 VCMPEQUDCC V0, V1, V2
151 ISEL CR6LT, R11, R0, R3
152 RET
153#else
154 CMP R5, $8
155 BLT check0_7
156 // Load sX[0:7] and compare.
157 MOVD (R8), R6
158 MOVD (R4), R7
159 CMP R6, R7
160 ISEL CR0EQ, R11, R0, R5
161 // Load sX[len(sX)-8:len(sX)] and compare.
162 MOVD -8(R9), R6
163 MOVD -8(R10), R7
164 CMP R6, R7
165 ISEL CR0EQ, R5, R0, R3
166 RET
167
168check0_7:
169 CMP R5,$0
170 MOVD $1, R3
171 BEQLR // return if len == 0
172
173 // Check < 8B loads with a single compare, but select the load address
174 // such that it cannot cross a page boundary. Load a few bytes from the
175 // lower address if that does not cross the lower page. Or, load a few
176 // extra bytes from the higher addresses. And align those values
177 // consistently in register as either address may have differing
178 // alignment requirements.
179 ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
180 ANDCC $PAGE_OFFSET, R4, R9
181 SUBC R5, $8, R12 // 8-len
182 SLD $3, R12, R14 // (8-len)*8
183 CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
184 CMPU R9, R12, CR0
185 SUB R12, R8, R6 // compute lower load address
186 SUB R12, R4, R9
187 ISEL CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
188 ISEL CR0LT, R4, R9, R4 // Similar for s2
189 MOVD (R8), R15
190 MOVD (R4), R16
191 SLD R14, R15, R7
192 SLD R14, R16, R17
193 SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
194 SRD R14, R17, R17
195 SRD R14, R15, R6 // Clear the lower (8-len) bytes
196 SRD R14, R16, R9
197#ifdef GOARCH_ppc64le
198 ISEL CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
199 ISEL CR0LT, R17, R9, R4
200#else
201 ISEL CR1LT, R6, R7, R8
202 ISEL CR0LT, R9, R17, R4
203#endif
204 CMP R4, R8
205 ISEL CR0EQ, R11, R0, R3
206 RET
207#endif // tail processing if !defined(GOPPC64_power10)