// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Condition code masks.
//
// These are passed as the first operand of the compare-and-jump instructions
// (CGIJ, CIJ) and of the load-on-condition instruction (LOCGR) used below.
// Viewed as 4-bit masks, NE (0b0111) is the bitwise complement of EQ (0b1000).
#define EQ 8
#define NE 7

// General-purpose register assignments.
//
//   R_ZERO  constant 0, source for LOCGR (scalar path only)
//   R_VAL   byte most recently read from the input (scalar path only)
//   R_TMP   address, then value, of the "has vector facility" flag;
//           reused as the per-byte 0/1 increment in the scalar loop
//   R_PTR   address of the next input byte to examine
//   R_LEN   number of input bytes; on the vector path it is rewritten to
//           (length mod 16) - 1 for use as a VLL index
//   R_CHAR  byte value being counted
//   R_RET   address at which the result is stored
//   R_ITER  number of whole 16-byte chunks still to process
//   R_CNT   running count (scalar path only)
//   R_MPTR  address of countbytemask<>
//
// R_PTR and R_LEN must stay consecutive registers: Count and CountString
// load both with a single LMG.
#define R_ZERO R0
#define R_VAL  R1
#define R_TMP  R2
#define R_PTR  R3
#define R_LEN  R4
#define R_CHAR R5
#define R_RET  R6
#define R_ITER R7
#define R_CNT  R8
#define R_MPTR R9

// Vector register assignments (vector path only).
//
//   V_ZERO  all-zero vector, second operand of the VSUMB/VSUMQF reductions
//   V_CHAR  the byte being counted, replicated into all 16 byte elements
//   V_MASK  0x01 in every byte element that takes part in the count
//   V_VAL   current input chunk, then its intermediate comparison/sum results
//   V_CNT   128-bit running count
#define V_ZERO V0
#define V_CHAR V1
#define V_MASK V2
#define V_VAL  V3
#define V_CNT  V4

// mask for trailing bytes in vector implementation
//
// countbytemask<> is 16 bytes of 0x01. For a trailing partial chunk,
// countbytebody<> loads it with VLL using the same length index as the input
// load, so the mask holds 0x01 only in the byte positions that were actually
// read from the input.
GLOBL countbytemask<>(SB), RODATA, $16
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
DATA countbytemask<>+8(SB)/8, $0x0101010101010101

// func Count(b []byte, c byte) int
//
// Count moves its arguments into the registers that countbytebody<> expects
// and branches to it. The transfer is a plain branch (BR), not a call, so
// countbytebody<> stores the result and returns directly to Count's caller.
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	LMG   b+0(FP), R_PTR, R_LEN // one load fills both: R_PTR = data pointer, R_LEN = length
	MOVBZ c+24(FP), R_CHAR      // c follows the three-word slice header; zero-extend it
	MOVD  $ret+32(FP), R_RET    // address of the result slot, not its value
	BR    countbytebody<>(SB)

// func CountString(s string, c byte) int
//
// CountString moves its arguments into the registers that countbytebody<>
// expects and branches to it. The transfer is a plain branch (BR), not a call,
// so countbytebody<> stores the result and returns directly to CountString's
// caller.
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	LMG   s+0(FP), R_PTR, R_LEN // one load fills both: R_PTR = data pointer, R_LEN = length
	MOVBZ c+16(FP), R_CHAR      // c follows the two-word string header; zero-extend it
	MOVD  $ret+24(FP), R_RET    // address of the result slot, not its value
	BR    countbytebody<>(SB)

// countbytebody counts the bytes equal to R_CHAR among the R_LEN bytes
// starting at R_PTR and stores the count as a 64-bit value at the address
// held in R_RET.
//
// input:
// R_PTR  = address of array of bytes
// R_LEN  = number of bytes in array
// R_CHAR = byte value to count (zero extended to register width)
// R_RET  = address of return value
//
// It is entered by a branch from Count and CountString, so each RET below
// returns to the caller of those functions.
//
// Registers that may be written: R_ZERO, R_VAL, R_TMP, R_PTR, R_LEN, R_ITER,
// R_CNT, R_MPTR and, on the vector path, V_ZERO, V_CHAR, V_MASK, V_VAL, V_CNT.
//
// Three paths:
//   - length 0: store 0 and return (ret0);
//   - vector facility available: process 16-byte chunks, then a 1-15 byte tail;
//   - otherwise: one byte per iteration (novx).
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
	MOVD  $countbytemask<>(SB), R_MPTR
	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

	// Start of vector code (have vector facility).
	//
	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
	// vector 'load with length' (VLL). It will be in the range [-1,14].
	// Also replicate c across a 16-byte vector and initialize V_ZERO.
	ANDW  $0xf, R_LEN
	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
	ADDW  $-1, R_LEN
	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

	// Jump to loop if we have more than 15 bytes to process.
	CGIJ $NE, R_ITER, $0, vxchunks

	// Fewer than 16 bytes in total: the length is non-zero (checked above)
	// and there are no whole chunks, so R_LEN is in [0,14] here.
	//
	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Sum the matches straight into V_CNT; there is no earlier count to add to.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}

	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

vxchunks:
	// Load 0x01 into every byte element in the 16-byte mask vector.
	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
	VZERO  V_CNT      // initial uint128 count of 0

vxloop:
	// Load input bytes in 16-byte chunks.
	VL (R_PTR), V_VAL

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Increment input string address.
	MOVD $16(R_PTR), R_PTR

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

	// Repeat until all 16-byte chunks are done.
	BRCTG R_ITER, vxloop

	// Skip to end if there are no trailing bytes.
	// R_LEN is (length mod 16) - 1, so -1 means the length was a multiple of 16.
	CIJ $EQ, R_LEN, $-1, vxret

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	// This overwrites the all-ones V_MASK, which is no longer needed.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

vxret:
	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

novx:
	// Start of non-vector code (the vector facility not available).
	//
	// R_LEN still holds the full, non-zero byte count here: the branch to
	// novx is taken before the vector path rewrites it.
	//
	// Initialise counter and constant zero.
	MOVD $0, R_CNT
	MOVD $0, R_ZERO

loop:
	// Read 1-byte from input and compare.
	// Note: avoid putting LOCGR in critical path.
	MOVBZ (R_PTR), R_VAL
	MOVD  $1, R_TMP
	MOVD  $1(R_PTR), R_PTR
	CMPW  R_VAL, R_CHAR
	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
	ADD   R_TMP, R_CNT       // accumulate 64-bit result

	// Repeat until all bytes have been checked.
	BRCTG R_LEN, loop

ret:
	// Reached by falling out of the scalar loop.
	MOVD R_CNT, (R_RET)
	RET

ret0:
	// Empty input: the count is 0.
	MOVD $0, (R_RET)
	RET