...

Text file src/internal/bytealg/indexbyte_arm64.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6
     7TEXT ·IndexByte(SB),NOSPLIT,$0-40
     8	MOVD	b_base+0(FP), R0
     9	MOVD	b_len+8(FP), R2
    10	MOVBU	c+24(FP), R1
    11	MOVD	$ret+32(FP), R8
    12	B	indexbytebody<>(SB)
    13
    14TEXT ·IndexByteString(SB),NOSPLIT,$0-32
    15	MOVD	s_base+0(FP), R0
    16	MOVD	s_len+8(FP), R2
    17	MOVBU	c+16(FP), R1
    18	MOVD	$ret+24(FP), R8
    19	B	indexbytebody<>(SB)
    20
    21// input:
    22//   R0: data
    23//   R1: byte to search
    24//   R2: data len
    25//   R8: address to put result
    26TEXT indexbytebody<>(SB),NOSPLIT,$0
    27	// Core algorithm:
    28	// For each 32-byte chunk we calculate a 64-bit syndrome value,
    29	// with two bits per byte. For each tuple, bit 0 is set if the
    30	// relevant byte matched the requested character and bit 1 is
    31	// not used (faster than using a 32bit syndrome). Since the bits
    32	// in the syndrome reflect exactly the order in which things occur
    33	// in the original string, counting trailing zeros allows to
    34	// identify exactly which byte has matched.
    35
    36	CBZ	R2, fail
    37	MOVD	R0, R11
    38	// Magic constant 0x40100401 allows us to identify
    39	// which lane matches the requested byte.
    40	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    41	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    42	MOVD	$0x40100401, R5
    43	VMOV	R1, V0.B16
    44	// Work with aligned 32-byte chunks
    45	BIC	$0x1f, R0, R3
    46	VMOV	R5, V5.S4
    47	ANDS	$0x1f, R0, R9
    48	AND	$0x1f, R2, R10
    49	BEQ	loop
    50
    51	// Input string is not 32-byte aligned. We calculate the
    52	// syndrome value for the aligned 32 bytes block containing
    53	// the first bytes and mask off the irrelevant part.
    54	VLD1.P	(R3), [V1.B16, V2.B16]
    55	SUB	$0x20, R9, R4
    56	ADDS	R4, R2, R2
    57	VCMEQ	V0.B16, V1.B16, V3.B16
    58	VCMEQ	V0.B16, V2.B16, V4.B16
    59	VAND	V5.B16, V3.B16, V3.B16
    60	VAND	V5.B16, V4.B16, V4.B16
    61	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
    62	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
    63	VMOV	V6.D[0], R6
    64	// Clear the irrelevant lower bits
    65	LSL	$1, R9, R4
    66	LSR	R4, R6, R6
    67	LSL	R4, R6, R6
    68	// The first block can also be the last
    69	BLS	masklast
    70	// Have we found something already?
    71	CBNZ	R6, tail
    72
    73loop:
    74	VLD1.P	(R3), [V1.B16, V2.B16]
    75	SUBS	$0x20, R2, R2
    76	VCMEQ	V0.B16, V1.B16, V3.B16
    77	VCMEQ	V0.B16, V2.B16, V4.B16
    78	// If we're out of data we finish regardless of the result
    79	BLS	end
    80	// Use a fast check for the termination condition
    81	VORR	V4.B16, V3.B16, V6.B16
    82	VADDP	V6.D2, V6.D2, V6.D2
    83	VMOV	V6.D[0], R6
    84	// We're not out of data, loop if we haven't found the character
    85	CBZ	R6, loop
    86
    87end:
    88	// Termination condition found, let's calculate the syndrome value
    89	VAND	V5.B16, V3.B16, V3.B16
    90	VAND	V5.B16, V4.B16, V4.B16
    91	VADDP	V4.B16, V3.B16, V6.B16
    92	VADDP	V6.B16, V6.B16, V6.B16
    93	VMOV	V6.D[0], R6
    94	// Only do the clear for the last possible block with less than 32 bytes
    95	// Condition flags come from SUBS in the loop
    96	BHS	tail
    97
    98masklast:
    99	// Clear the irrelevant upper bits
   100	ADD	R9, R10, R4
   101	AND	$0x1f, R4, R4
   102	SUB	$0x20, R4, R4
   103	NEG	R4<<1, R4
   104	LSL	R4, R6, R6
   105	LSR	R4, R6, R6
   106
   107tail:
   108	// Check that we have found a character
   109	CBZ	R6, fail
   110	// Count the trailing zeros using bit reversing
   111	RBIT	R6, R6
   112	// Compensate the last post-increment
   113	SUB	$0x20, R3, R3
   114	// And count the leading zeros
   115	CLZ	R6, R6
   116	// R6 is twice the offset into the fragment
   117	ADD	R6>>1, R3, R0
   118	// Compute the offset result
   119	SUB	R11, R0, R0
   120	MOVD	R0, (R8)
   121	RET
   122
   123fail:
   124	MOVD	$-1, R0
   125	MOVD	R0, (R8)
   126	RET

View as plain text