...

Text file src/internal/bytealg/count_arm64.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8// func Count(b []byte, c byte) int
      //
      // Count has no counting loop of its own: it moves its byte argument into
      // the register where CountString expects it and branches there, so the
      // RET executed in CountString returns straight to Count's caller.
     9// input:
    10//   R0: b ptr
    11//   R1: b len
    12//   R2: b cap (not needed for counting; overwritten below)
    13//   R3: c byte to search
    14// return:
    15//   R0: result (produced by CountString)
    16TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
    17	MOVD	R3, R2	// R2 = c, the register CountString reads its byte argument from
    18	B	·CountString<ABIInternal>(SB)	// branch, not a call: R0/R1 already hold ptr/len
    19
    20// func CountString(s string, c byte) int
      //
      // Counts the bytes of s that are equal to c.
      //
      // Strings shorter than 32 bytes are scanned one byte at a time (tail).
      // Longer strings are scanned one byte at a time until the pointer is
      // 32-byte aligned (head), then 32 bytes per iteration with vector
      // instructions (chunk); whatever is left over is finished by the same
      // byte loop (tail).
    21// input:
    22//   R0: s ptr
    23//   R1: s len
    24//   R2: c byte to search (due to ABIInternal upper bits can contain junk)
      //       Only the low byte is ever used: R2.UXTB in the scalar loops and a
      //       byte-lane duplicate in the vector loop.
    25// return:
    26//   R0: result
      // registers written: R0, R1, R3, R5, R6, R9, R11, V0-V8, condition flags
    27TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
    28	// R11 = running count of bytes equal to c
    29	MOVD	$0, R11
    30	// Short path: an empty string returns 0 without touching memory
    31	CBZ	R1, done
    32	CMP	$0x20, R1
    33	// Jump directly to head if length >= 32 (unsigned); otherwise fall through into tail with 1 <= R1 <= 31
    34	BHS	head
    35tail:
    36	// Byte-at-a-time loop for a run shorter than 32 bytes.
      	// R1 (bytes remaining) must be non-zero on entry: it is decremented before it is tested.
    37	MOVBU.P	1(R0), R5	// R5 = *R0 (zero-extended byte), then R0 += 1
    38	SUB	$1, R1, R1	// one byte fewer remaining
    39	CMP	R2.UXTB, R5	// compare the loaded byte with the low byte of R2
    40	CINC	EQ, R11, R11	// count++ on a match
    41	CBNZ	R1, tail
    42done:
    43	MOVD	R11, R0	// return the accumulated count
    44	RET
    45	PCALIGN	$16
    46head:
    47	ANDS	$0x1f, R0, R9	// R9 = R0 & 31; only the resulting Z flag is used
    48	BEQ	chunk	// pointer is already 32-byte aligned
    49	// Pointer is not 32-byte aligned: count bytes one at a time up to the next 32-byte boundary
    50	BIC	$0x1f, R0, R3	// R3 = R0 rounded down to a multiple of 32
    51	ADD	$0x20, R3	// R3 = first 32-byte boundary above R0
    52	PCALIGN $16
    53head_loop:
    54	MOVBU.P	1(R0), R5	// R5 = *R0 (zero-extended byte), then R0 += 1
    55	CMP	R2.UXTB, R5	// compare the loaded byte with the low byte of R2
    56	CINC	EQ, R11, R11	// count++ on a match
    57	SUB	$1, R1, R1	// one byte fewer remaining
    58	CMP	R0, R3	// stop once R0 reaches the boundary in R3
    59	BNE	head_loop
    60chunk:
    61	BIC	$0x1f, R1, R9	// R9 = remaining length rounded down to a multiple of 32
    62	// The first chunk can also be the last: fewer than 32 bytes remain, so finish in tail.
      	// R1 is non-zero here: the length was >= 32 and head consumed at most 31 bytes.
    63	CBZ	R9, tail
    64	// R3 = end of 32-byte chunks
    65	ADD	R0, R9, R3
    66	MOVD	$1, R5
    67	VMOV	R5, V5.B16	// V5 = 0x01 in every byte lane (mask applied after the compares)
    68	// R1 = length of tail
    69	SUB	R9, R1, R1
    70	// Duplicate R2 (byte to search) to 16 1-byte elements of V0
    71	VMOV	R2, V0.B16
    72	// Clear the low 64-bit element of V7 and V8
    73	VEOR	V7.B8, V7.B8, V7.B8
    74	VEOR	V8.B8, V8.B8, V8.B8
    75	PCALIGN $16
    76	// Count the target byte in 32-byte chunk
    77chunk_loop:
    78	VLD1.P	(R0), [V1.B16, V2.B16]	// load the next 32 bytes into V1, V2; .P post-increments R0 past them
    79	CMP	R0, R3	// end-of-chunks test; the flags are consumed by BNE at the bottom of the loop
    80	VCMEQ	V0.B16, V1.B16, V3.B16	// V3 lane = 0xff where the byte equals c, else 0x00
    81	VCMEQ	V0.B16, V2.B16, V4.B16	// same for the second 16 bytes
    82	// Clear the higher 7 bits, turning each matching lane into 1
    83	VAND	V5.B16, V3.B16, V3.B16
    84	VAND	V5.B16, V4.B16, V4.B16
    85	// Count the lanes that match the requested byte
    86	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B: pairwise add, each lane of V6 is 0..2
    87	VUADDLV	V6.B16, V7	// V7 = sum of all 16 lanes = matches in this chunk (at most 32)
    88	// Accumulate the count in low 64-bit element of V8 when inside the loop
    89	VADD	V7, V8
    90	BNE	chunk_loop
    91	VMOV	V8.D[0], R6	// move the vector total to a general-purpose register
    92	ADD	R6, R11, R11	// add it to the matches already counted by head_loop
    93	CBZ	R1, done	// no tail bytes left (tail requires R1 != 0)
    94	B	tail

View as plain text