...

Text file src/internal/bytealg/count_loong64.s

Documentation: internal/bytealg

     1// Copyright 2025 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
      	// Entry stub for Count: moves the byte argument into R6, the register
      	// countbody<> reads it from, and jumps there. countbody<> leaves the
      	// match count in R4 and executes the RET.
     9	// R4 = b_base
    10	// R5 = b_len
    11	// R6 = b_cap (unused)
    12	// R7 = byte to count
    13	AND	$0xff, R7, R6	// R6 = R7 & 0xff; overwrites the unused b_cap
    14	JMP	countbody<>(SB)	// plain jump (no link): countbody<> returns for us
    15
    16TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
      	// Entry stub for CountString: the arguments already sit in the
      	// registers countbody<> expects, so only the byte is masked to its
      	// low 8 bits before jumping. countbody<> leaves the match count in
      	// R4 and executes the RET.
    17	// R4 = s_base
    18	// R5 = s_len
    19	// R6 = byte to count
    20	AND	$0xff, R6	// R6 &= 0xff
    21	JMP	countbody<>(SB)	// plain jump (no link): countbody<> returns for us
    22
    23// input:
    24//   R4 = s_base
    25//   R5 = s_len
    26//   R6 = byte to count
      //        (both entry points in this file mask it to 8 bits first)
      // output:
      //   R4 = number of bytes in s equal to R6
      // clobbers:
      //   R4, R5, R7-R14, and vector registers 0-8 (X or V form) on the
      //   SIMD paths
      //
      // Strategy: inputs shorter than 8 bytes go straight to the
      // byte-at-a-time tail. Otherwise the widest available path is used:
      // LASX (128-, 32-, then 8-byte steps), else LSX (64-, 16-, then 8-byte
      // steps), else a generic 4-byte loop. The SIMD paths consume the data
      // from the front (R4 advances, R5 shrinks); the generic loop and the
      // tail consume it from the back (R5 shrinks and indexes R4+R5).
    27TEXT countbody<>(SB),NOSPLIT,$0
    28	MOVV	R0, R7	// count
    29
    30	// short path to handle 0-byte case
    31	BEQ	R5, done
    32
    33	// jump directly to tail if length < 8
    34	MOVV	$8, R8
    35	BLT	R5, R8, tail
    36
    37	// Implemented using 256-bit SIMD instructions
    38lasxCountBody:
      	// Fall through to the 128-bit path when the HasLASX flag byte is zero.
    39	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8
    40	BEQ	R8, lsxCountBody
    41	XVMOVQ	R6, X0.B32	// X0 = target byte replicated into all 32 byte lanes
    42
    43	// jump directly to lasx32 if length < 128
    44	MOVV	$128, R8
    45	BLT	R5, R8, lasx32
    46lasx128:
    47lasx128Loop:
      	// Each iteration handles 128 bytes as four 32-byte vectors.
    48	XVMOVQ	0(R4), X1
    49	XVMOVQ	32(R4), X2
    50	XVMOVQ	64(R4), X3
    51	XVMOVQ	96(R4), X4
    52
      	// Byte-wise compare against X0: matching lanes become 0xff, others 0.
    53	XVSEQB  X0, X1, X5
    54	XVSEQB  X0, X2, X6
    55	XVSEQB  X0, X3, X7
    56	XVSEQB  X0, X4, X8
    57
      	// Reduce every matching lane from 0xff to 1.
    58	XVANDB  $1, X5, X5
    59	XVANDB  $1, X6, X6
    60	XVANDB  $1, X7, X7
    61	XVANDB  $1, X8, X8
    62
      	// Population count per 64-bit element = matches in that 8-byte group.
    63	XVPCNTV	X5, X1
    64	XVPCNTV	X6, X2
    65	XVPCNTV	X7, X3
    66	XVPCNTV	X8, X4
    67
      	// Sum the four vectors element-wise into X1.
    68	XVADDV	X2, X1
    69	XVADDV	X4, X3
    70	XVADDV	X3, X1
    71
      	// Pull out the four 64-bit partial sums.
    72	XVMOVQ	X1.V[0], R9
    73	XVMOVQ	X1.V[1], R10
    74	XVMOVQ	X1.V[2], R11
    75	XVMOVQ	X1.V[3], R12
    76
      	// count += R9 + R10 + R11 + R12
    77	ADDV	R9, R10
    78	ADDV	R11, R12
    79	ADDV	R10, R7
    80	ADDV	R12, R7
    81
    82	ADDV	$-128, R5	// remaining length -= 128
    83	ADDV	$128, R4	// advance the data pointer
    84	BGE	R5, R8, lasx128Loop	// R8 still holds 128
    85
    86lasx32:
    87	// jump directly to lasx8 if length < 32
    88	MOVV	$32, R8
    89	BLT	R5, R8, lasx8
    90lasx32Loop:
      	// Same compare / mask / popcount sequence on a single 32-byte vector.
    91	XVMOVQ	0(R4), X1
    92	XVSEQB  X0, X1, X2
    93	XVANDB  $1, X2, X2
    94	XVPCNTV	X2, X1
    95	XVMOVQ	X1.V[0], R9
    96	XVMOVQ	X1.V[1], R10
    97	XVMOVQ	X1.V[2], R11
    98	XVMOVQ	X1.V[3], R12
    99	ADDV	R9, R10
   100	ADDV	R11, R12
   101	ADDV	R10, R7
   102	ADDV	R12, R7
   103	ADDV	$-32, R5
   104	ADDV	$32, R4
   105	BGE	R5, R8, lasx32Loop
   106lasx8:
   107	// jump directly to tail if length < 8
   108	MOVV	$8, R8
   109	BLT	R5, R8, tail
   110lasx8Loop:
      	// 8 bytes per iteration using 128-bit (V) instructions. V0 is read
      	// here without being written on this path: the code relies on V0
      	// being the low 128 bits of X0, which holds the replicated byte.
   111	MOVV	0(R4), R9
   112	VMOVQ	R9, V1.V[0]	// place the 8 loaded bytes in element 0 of V1
   113	VSEQB	V0, V1, V2
   114	VANDB	$1, V2, V2
   115	VPCNTV	V2, V1
   116
   117	VMOVQ	V1.V[0], R9	// only element 0 is read back; element 1 is ignored
   118	ADDV	R9, R7
   119	ADDV	$-8, R5
   120	ADDV	$8, R4
   121	BGE	R5, R8, lasx8Loop
   122	JMP	tail
   123
   124	// Implemented using 128-bit SIMD instructions
   125lsxCountBody:
      	// Fall through to the generic path when the HasLSX flag byte is zero.
   126	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8
   127	BEQ	R8, genericCountBody
   128	VMOVQ	R6, V0.B16	// V0 = target byte replicated into all 16 byte lanes
   129
   130	// jump directly to lsx16 if length < 64
   131	MOVV	$64, R8
   132	BLT	R5, R8, lsx16
   133lsx64:
   134lsx64Loop:
      	// Each iteration handles 64 bytes as four 16-byte vectors.
   135	VMOVQ	0(R4),  V1
   136	VMOVQ	16(R4), V2
   137	VMOVQ	32(R4), V3
   138	VMOVQ	48(R4), V4
   139
      	// Byte-wise compare against V0: matching lanes become 0xff, others 0.
   140	VSEQB  V0, V1, V5
   141	VSEQB  V0, V2, V6
   142	VSEQB  V0, V3, V7
   143	VSEQB  V0, V4, V8
   144
      	// Reduce every matching lane from 0xff to 1.
   145	VANDB  $1, V5, V5
   146	VANDB  $1, V6, V6
   147	VANDB  $1, V7, V7
   148	VANDB  $1, V8, V8
   149
      	// Population count per 64-bit element = matches in that 8-byte group.
   150	VPCNTV	V5, V1
   151	VPCNTV	V6, V2
   152	VPCNTV	V7, V3
   153	VPCNTV	V8, V4
   154
      	// Sum the four vectors element-wise into V1.
   155	VADDV	V2, V1
   156	VADDV	V4, V3
   157	VADDV	V3, V1
   158
      	// Add both 64-bit partial sums to the count.
   159	VMOVQ	V1.V[0], R9
   160	VMOVQ	V1.V[1], R10
   161	ADDV	R9, R7
   162	ADDV	R10, R7
   163
   164	ADDV	$-64, R5	// remaining length -= 64
   165	ADDV	$64, R4	// advance the data pointer
   166	BGE	R5, R8, lsx64Loop	// R8 still holds 64
   167
   168lsx16:
   169	// jump directly to lsx8 if length < 16
   170	MOVV	$16, R8
   171	BLT	R5, R8, lsx8
   172lsx16Loop:
      	// Same compare / mask / popcount sequence on a single 16-byte vector.
   173	VMOVQ	0(R4), V1
   174	VSEQB	V0, V1, V2
   175	VANDB  $1, V2, V2
   176	VPCNTV	V2, V1
   177	VMOVQ	V1.V[0], R9
   178	VMOVQ	V1.V[1], R10
   179	ADDV	R9, R7
   180	ADDV	R10, R7
   181	ADDV	$-16, R5
   182	ADDV	$16, R4
   183	BGE	R5, R8, lsx16Loop
   184lsx8:
   185	// jump directly to tail if length < 8
   186	MOVV	$8, R8
   187	BLT	R5, R8, tail
   188lsx8Loop:
      	// 8 bytes per iteration: only element 0 of V1 is filled and read back.
   189	MOVV	0(R4), R9
   190	VMOVQ	R9, V1.V[0]
   191	VSEQB	V0, V1, V2
   192	VANDB	$1, V2, V2
   193	VPCNTV	V2, V1
   194
   195	VMOVQ	V1.V[0], R9
   196	ADDV	R9, R7
   197	ADDV	$-8, R5
   198	ADDV	$8, R4
   199	BGE	R5, R8, lsx8Loop
   200	JMP	tail
   201
   202	// Implemented using general instructions
   203genericCountBody:
   204	MOVV	$4, R8	// step size: 4 bytes per iteration
   205	MOVV	$1, R9	// constant 1 that MASKNEZ passes through on a match
   206genericLoop:
   207	BLT	R5, R8, tail	// fewer than 4 bytes left: finish in tail
   208	ADDV	$-4, R5
   209	MOVWU	(R4)(R5), R10	// load the last 4 unprocessed bytes, at R4+R5
   210	BSTRPICKW	$7, R10, $0, R11	// R11 = bits 7..0 of R10
   211	BSTRPICKW	$15, R10, $8, R12	// R12 = bits 15..8 of R10
   212	XOR	R6, R11	// zero iff that byte equals R6
   213	XOR	R6, R12
   214	MASKNEZ	R11, R9, R13	// R13 = (R11 != 0) ? 0 : R9, i.e. 1 on a match
   215	MASKNEZ	R12, R9, R14
   216	ADDV	R13, R7
   217	ADDV	R14, R7
   218	BSTRPICKW	$23, R10, $16, R11	// R11 = bits 23..16 of R10
   219	BSTRPICKW	$31, R10, $24, R12	// R12 = bits 31..24 of R10
   220	XOR	R6, R11
   221	XOR	R6, R12
   222	MASKNEZ	R11, R9, R13
   223	MASKNEZ	R12, R9, R14
   224	ADDV	R13, R7
   225	ADDV	R14, R7
   226	JMP	genericLoop
   227
   228	// Work with tail shorter than 8 bytes
   229tail:
   230	BEQ	R5, done	// nothing left
   231	ADDV	$-1, R5
   232	MOVBU   (R4)(R5), R8	// R8 = byte at R4+R5; walks backwards from the end
   233	BNE	R6, R8, tail	// no match: next byte
   234	ADDV	$1, R7
   235	JMP	tail
   236done:
   237	MOVV	R7, R4	// result register R4 = count
   238	RET

View as plain text