// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"
9
// Count is the byte-slice entry point: it counts how many bytes of the
// input equal a given byte. It only prepares the registers that
// countbytebody<> expects and then branches to it; countbytebody<> performs
// the RET, so the result is whatever countbytebody<> leaves in R3.
//
// In:
//	R3 = byte array pointer
//	R4 = length
//	R6 = byte to count
//	(R5 is not read here; presumably it holds the slice capacity --
//	confirm against the Go declaration.)
// Out (via countbytebody<>):
//	R3 = number of matching bytes
// Clobbers here: R5, V1 (plus everything countbytebody<> clobbers).
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	MTVRD	R6, V1		// move compare byte into the vector register file
	MOVD	R6, R5		// countbytebody<> takes the byte in R5 (read by its non-POWER10 scalar tail)
	VSPLTB	$7, V1, V1	// replicate byte across all 16 lanes of V1
	BR	countbytebody<>(SB)
18
// CountString is the string entry point: it counts how many bytes of the
// input equal a given byte. The byte already arrives in R5, which is where
// countbytebody<> wants it, so only the splatted vector copy has to be
// built before branching. countbytebody<> performs the RET.
//
// In:
//	R3 = byte array pointer
//	R4 = length
//	R5 = byte to count
// Out (via countbytebody<>):
//	R3 = number of matching bytes
// Clobbers here: V1 (plus everything countbytebody<> clobbers).
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	MTVRD	R5, V1		// move compare byte into the vector register file
	VSPLTB	$7, V1, V1	// replicate byte across all 16 lanes of V1
	BR	countbytebody<>(SB)
26
// countbytebody<> is the shared implementation behind Count and CountString.
//
// On entry:
//	R3: addr of string
//	R4: len of string
//	R5: byte to count
//	V1: byte to count, splatted.
// On exit:
//	R3: return value (number of bytes equal to the byte to count)
//
// Strategy:
//   1. While at least 32 bytes remain, compare 32 bytes per iteration with
//      VCMPEQUB and accumulate VPOPCNTD of the compare masks. A matching
//      byte yields 0xFF, i.e. 8 set bits, so the running total in R18 is
//      8x the real count.
//   2. Count the remaining 0-31 bytes:
//      - GOPPC64_power10: the total is first divided by 8, then the tail
//        adds exact match counts obtained from VCNTMBB and returns
//        directly.
//      - otherwise: the tail uses CMPB/POPCNTD on general registers, which
//        keeps the same 8x scaling, and falls through to tail_0.
//   3. tail_0 divides the 8x total by 8 and returns.
//
// Registers written: R3, R4, R12, R14, R18, R20, R21, CTR, V0, V2, V4, V5,
// the condition register, plus R5 (non-POWER10 build) or R6 (POWER10 build).
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18		// byte count (scaled by 8 until fixed up)

#ifndef GOPPC64_power10
	// The scalar tail compares 8 bytes at a time with CMPB, so it needs
	// the byte replicated into every byte lane of R5.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// R20 = number of whole 32-byte chunks
	MOVD	R20, CTR
	MOVD	$16, R21	// offset of the second 16-byte half of each chunk
	XXLXOR	V4, V4, V4	// zero both vector accumulators
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop

	// Reduce the two accumulators (two doublewords each) to one scalar.
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18		// first doubleword
	VSLDOI	$8, V5, V5, V5	// rotate by 8 bytes to expose the other doubleword
	MFVSRD	V5, R21
	ADD	R21, R18, R18	// R18 = 8x the matches seen by the vector loop
	ANDCC	$31, R4, R4	// R4 = bytes left over after the 32-byte chunks
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

#ifdef GOPPC64_power10
	// Only reached by falling out of the vector loop with a non-empty
	// tail. A direct branch to tail from the size check arrives with
	// R18 == 0, so it does not need this fixup.
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10
	// More than 16 bytes left: count one full vector, then the rest.
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4	// R4 = 1 - 15 bytes still to count

small_tail_p10:
	SLD	$56, R4, R6	// LXVLL reads its byte count from the top byte of R6
	LXVLL	R3, R6, V0	// load only the R4 remaining bytes
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// R18 is already the true count here; no 8x fixup needed
	RET

#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12	// 0xFF in every byte lane that matches
	CMPB	R14, R5, R14
	POPCNTD	R12, R12	// 8 bits per match, same scaling as the vector loop
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12	// zero-extended: the upper 4 byte lanes are 0
	CMPB	R12, R5, R12
	// The zero-filled upper lanes compare equal when the byte to count
	// is 0, so shift them out before counting.
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12	// zero-extended: the upper 6 byte lanes are 0
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12
	// Keep one bit of the low byte lane, worth exactly 8, so a match
	// contributes 8 like every other match in the scaled total.
	ANDCC	$0x8, R12, R12
	ADD	R12, R18, R18
#endif

tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET