src/runtime/memclr_amd64.s
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX

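	// AX is zeroed so the scalar MOV stores below write zero bytes/words/quads.
	// The SIMD paths store X15 directly: under ABIInternal on amd64, X15 is the
	// fixed all-zero register, so it needs no explicit clearing here.
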
	// MOVOU seems to always be faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

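	// Sizes of 256 bytes or less were dispatched above to the branch-free,
	// overlapping-store cases at the end of this file. Larger sizes fall through
	// here and pick between REP STOSQ (ERMS), the SSE loop, the AVX2 loop, or
	// non-temporal stores, depending on CPU features and size.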
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2KB, do not use ERMS, as it has a large start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2KB. Benchmarks show a similar threshold for REP STOS vs AVX.
	CMPQ	BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

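	// SSE fallback: clear 256 bytes per iteration with 16-byte MOVOU stores of
	// the always-zero X15 register, then jump back to tail to dispatch the
	// remaining < 256 bytes.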
loop:
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR	X0, X0, X0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30MB cache.
	// TODO: take into account the actual LLC size. E.g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

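	// AVX2 path for sizes below 32 MiB (0x2000000 bytes): clear 128 bytes per
	// iteration with 32-byte stores of Y0 (zeroed by VPXOR above), then finish
	// the final partial block with four overlapping stores addressed back from
	// the end of the buffer (DI+BX).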
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

loop_preheader_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR	X0, X0, X0
	// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
	// write protocol, ERMS may be as fast as or slower than non-temporal stores
	// when the size is bigger than the LLC, depending on hardware.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

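	// ERMS path: REP STOSQ stores CX quadwords of zero (from AX) starting at DI,
	// advancing DI as it goes. The remaining BX&7 tail bytes are handled by
	// jumping back to tail.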
loop_erms:
	// STOSQ is used to guarantee that each zeroed pointer-sized word is visible
	// to the memory subsystem as a whole, as the GC requires.
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	STOSQ
	JMP	tail

loop_preheader_avx2_huge:
	// Align to a 32-byte boundary.
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
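	// The first 32 bytes were cleared unaligned above; DI is then rounded up to
	// the next 32-byte boundary and BX is reduced by the bytes already covered,
	// so the non-temporal stores below are all 32-byte aligned.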
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
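	// Clear the final < 128 bytes with overlapping regular stores addressed
	// back from the end of the buffer (DI+BX).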
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

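	// Small-size cases: each clears the range with a pair (or two pairs) of
	// stores anchored at the start and at the end of the buffer. The stores may
	// overlap, which avoids any further branching on the exact size.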
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET