compare_amd64.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "asm_amd64.h"
     7#include "textflag.h"
     8
     9TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
      	// Entry point for comparing two byte slices a and b.
      	// It only moves the incoming argument registers to the places cmpbody
      	// expects and then jumps (not calls) to cmpbody, so cmpbody's RET
      	// returns straight to Compare's caller with the result in AX.
    10	// AX = a_base (want in SI)
    11	// BX = a_len  (want in BX)
    12	// CX = a_cap  (unused)
    13	// DI = b_base (want in DI)
    14	// SI = b_len  (want in DX)
    15	// R8 = b_cap  (unused)
    16	MOVQ	SI, DX	// b_len -> DX; must come first, the next move overwrites SI
    17	MOVQ	AX, SI	// a_base -> SI
    18	JMP	cmpbody<>(SB)
    19
    20TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
    21	// Incoming:      AX = a_base, BX = a_len, CX = b_base, DI = b_len.
    22	// cmpbody wants: SI = a_base, BX = a_len, DI = b_base, DX = b_len.
    23	// a_len is already in place. b_len has to leave DI before b_base lands
    24	// there; the a_base move touches neither register, so it can go last.
    25	MOVQ	DI, DX	// b_len  -> DX
    26	MOVQ	CX, DI	// b_base -> DI
    27	MOVQ	AX, SI	// a_base -> SI
    28	JMP	cmpbody<>(SB)
    29
    30// cmpbody is the shared comparison body that ·Compare and runtime·cmpstring
      // jump to after arranging their arguments in the registers listed below.
      // It compares the first min(alen, blen) bytes of a and b. The first
      // differing byte (compared as unsigned) decides the result; if there is
      // no difference, the lengths decide.
      // input:
    31//   SI = a
    32//   DI = b
    33//   BX = alen
    34//   DX = blen
    35// output:
    36//   AX = output (-1/0/1)
    37TEXT cmpbody<>(SB),NOSPLIT,$0-0
    38	CMPQ	SI, DI
    39	JEQ	allsame	// same start address: only the lengths can differ
    40	CMPQ	BX, DX
    41	MOVQ	DX, R8
    42	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
    43	CMPQ	R8, $8
    44	JB	small	// fewer than 8 bytes in common
    45
    46	CMPQ	R8, $63
    47	JBE	loop	// 8..63 bytes: go straight to the 16-byte loop
      	// 64 or more bytes: use one of the 64-bytes-per-iteration loops.
      	// If hasAVX2 is not defined at assembly time, pick the loop at run time
      	// from the X86HasAVX2 flag; otherwise always use the AVX2 loop.
    48#ifndef hasAVX2
    49	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    50	JEQ     big_loop_avx2
    51	JMP	big_loop
    52#else
    53	JMP	big_loop_avx2
    54#endif
      	// Compare 16 bytes per iteration while more than 16 bytes remain.
    55loop:
    56	CMPQ	R8, $16
    57	JBE	_0through16
    58	MOVOU	(SI), X0
    59	MOVOU	(DI), X1
    60	PCMPEQB X0, X1
    61	PMOVMSKB X1, AX
    62	XORQ	$0xffff, AX	// convert EQ to NE
    63	JNE	diff16	// branch if at least one byte is not equal
    64	ADDQ	$16, SI
    65	ADDQ	$16, DI
    66	SUBQ	$16, R8
    67	JMP	loop
    68
      	// diff64/diff48/diff32 advance SI and DI by 48/32/16 bytes so that bit i
      	// of AX refers to byte i counted from SI/DI, then continue at diff16.
    69diff64:
    70	ADDQ	$48, SI
    71	ADDQ	$48, DI
    72	JMP	diff16
    73diff48:
    74	ADDQ	$32, SI
    75	ADDQ	$32, DI
    76	JMP	diff16
    77diff32:
    78	ADDQ	$16, SI
    79	ADDQ	$16, DI
    80	// AX = bit mask of differences
    81diff16:
    82	BSFQ	AX, BX	// index of first byte that differs
    83	XORQ	AX, AX
    84	MOVB	(SI)(BX*1), CX
    85	CMPB	CX, (DI)(BX*1)
    86	SETHI	AX	// AX = 1 if a's byte is higher (unsigned) than b's, else 0
    87	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
    88	RET
    89
    90	// 0 through 16 bytes left, alen>=8, blen>=8
    91_0through16:
    92	CMPQ	R8, $8
    93	JBE	_0through8
    94	MOVQ	(SI), AX
    95	MOVQ	(DI), CX
    96	CMPQ	AX, CX
    97	JNE	diff8
      	// Compare the 8 bytes that end at offset R8, i.e. the tail of the
      	// common prefix. This may re-read bytes that were already compared.
    98_0through8:
    99	MOVQ	-8(SI)(R8*1), AX
   100	MOVQ	-8(DI)(R8*1), CX
   101	CMPQ	AX, CX
   102	JEQ	allsame
   103
   104	// AX and CX contain parts of a and b that differ.
      	// After the byte swap the earliest byte in memory is the most
      	// significant one, so the highest differing bit lies in the first
      	// differing byte and a's value of that bit gives the ordering.
   105diff8:
   106	BSWAPQ	AX	// reverse order of bytes
   107	BSWAPQ	CX
   108	XORQ	AX, CX
   109	BSRQ	CX, CX	// index of highest bit difference
   110	SHRQ	CX, AX	// move a's bit to bottom
   111	ANDQ	$1, AX	// mask bit
   112	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
   113	RET
   114
   115	// 0-7 bytes in common
   116small:
   117	LEAQ	(R8*8), CX	// bytes left -> bits left
   118	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
   119	JEQ	allsame	// zero bytes in common: the lengths decide
   120
   121	// load bytes of a into high bytes of SI
      	// The low byte of the address is tested first. If it is above 0xf8 the
      	// 8 bytes ending at a+R8 are loaded instead of the 8 bytes starting at a
      	// (presumably so the load cannot run onto a following page - TODO confirm).
   122	CMPB	SI, $0xf8
   123	JA	si_high
   124	MOVQ	(SI), SI
   125	JMP	si_finish
   126si_high:
   127	MOVQ	-8(SI)(R8*1), SI
   128	SHRQ	CX, SI	// wanted bytes arrived in the high bytes; move them to the low bytes
   129si_finish:
   130	SHLQ	CX, SI	// wanted bytes -> high bytes, everything else cleared
   131
   132	// load bytes of b into high bytes of DI (same scheme as for a above)
   133	CMPB	DI, $0xf8
   134	JA	di_high
   135	MOVQ	(DI), DI
   136	JMP	di_finish
   137di_high:
   138	MOVQ	-8(DI)(R8*1), DI
   139	SHRQ	CX, DI
   140di_finish:
   141	SHLQ	CX, DI
   142
   143	BSWAPQ	SI	// reverse order of bytes
   144	BSWAPQ	DI
   145	XORQ	SI, DI	// find bit differences
   146	JEQ	allsame
   147	BSRQ	DI, CX	// index of highest bit difference
   148	SHRQ	CX, SI	// move a's bit to bottom
   149	ANDQ	$1, SI	// mask bit
   150	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
   151	RET
   152
      	// All compared bytes are equal (or a and b start at the same address):
      	// the result is decided by comparing alen (BX) with blen (DX).
   153allsame:
   154	XORQ	AX, AX
   155	XORQ	CX, CX
   156	CMPQ	BX, DX
   157	SETGT	AX	// 1 if alen > blen
   158	SETEQ	CX	// 1 if alen == blen
   159	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
   160	RET
   161
   162	// this works for >= 64 bytes of data.
      	// SSE version: four 16-byte compares per iteration. Each failing compare
      	// jumps to the diffNN label that adjusts SI/DI for its 16-byte lane.
   163#ifndef hasAVX2
   164big_loop:
   165	MOVOU	(SI), X0
   166	MOVOU	(DI), X1
   167	PCMPEQB X0, X1
   168	PMOVMSKB X1, AX
   169	XORQ	$0xffff, AX
   170	JNE	diff16
   171
   172	MOVOU	16(SI), X0
   173	MOVOU	16(DI), X1
   174	PCMPEQB X0, X1
   175	PMOVMSKB X1, AX
   176	XORQ	$0xffff, AX
   177	JNE	diff32
   178
   179	MOVOU	32(SI), X0
   180	MOVOU	32(DI), X1
   181	PCMPEQB X0, X1
   182	PMOVMSKB X1, AX
   183	XORQ	$0xffff, AX
   184	JNE	diff48
   185
   186	MOVOU	48(SI), X0
   187	MOVOU	48(DI), X1
   188	PCMPEQB X0, X1
   189	PMOVMSKB X1, AX
   190	XORQ	$0xffff, AX
   191	JNE	diff64
   192
   193	ADDQ	$64, SI
   194	ADDQ	$64, DI
   195	SUBQ	$64, R8
   196	CMPQ	R8, $64
   197	JBE	loop	// 64 or fewer bytes left: finish in the 16-byte loop
   198	JMP	big_loop
   199#endif
   200
   201	// Compare 64-bytes per loop iteration.
   202	// Loop is unrolled and uses AVX2.
   203big_loop_avx2:
   204	VMOVDQU	(SI), Y2
   205	VMOVDQU	(DI), Y3
   206	VMOVDQU	32(SI), Y4
   207	VMOVDQU	32(DI), Y5
   208	VPCMPEQB Y2, Y3, Y0
   209	VPMOVMSKB Y0, AX
   210	XORL	$0xffffffff, AX	// convert EQ to NE for bytes 0..31
   211	JNE	diff32_avx2
   212	VPCMPEQB Y4, Y5, Y6
   213	VPMOVMSKB Y6, AX
   214	XORL	$0xffffffff, AX	// convert EQ to NE for bytes 32..63
   215	JNE	diff64_avx2
   216
   217	ADDQ	$64, SI
   218	ADDQ	$64, DI
   219	SUBQ	$64, R8
   220	CMPQ	R8, $64
   221	JB	big_loop_avx2_exit	// fewer than 64 bytes left
   222	JMP	big_loop_avx2
   223
   224	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
      	// AX holds a 32-bit difference mask here; diff16 uses it without moving SI/DI.
   225diff32_avx2:
   226	VZEROUPPER
   227	JMP diff16
   228
   229	// Same as diff32_avx2, but for last 32 bytes.
      	// diff48 advances SI and DI by 32 before using the mask in AX.
   230diff64_avx2:
   231	VZEROUPPER
   232	JMP diff48
   233
   234	// For <64 bytes remainder jump to normal loop.
   235big_loop_avx2_exit:
   236	VZEROUPPER
   237	JMP loop
View as plain text