Text file src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
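	// The register-based internal ABI delivers the arguments in AX, BX and CX
	// (as annotated above); they are copied into DI, SI and BX below so that
	// the REP MOVS-based paths can use the conventional string registers
	// (DI = destination, SI = source).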
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straight-line code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2KB.
tail:
	// move_129through256 and smaller work whether or not the source and the
	// destination memory regions overlap, because they load all data into
	// registers before writing any of it back. move_256through2048, on the
	// other hand, can be used only when the memory regions don't overlap or
	// the copy direction is forward.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
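	// For example, n == 100 falls through every check above until
	// CMPQ BX, $128, whose JBE sends it to move_65through128: that routine
	// copies the first 64 and the last 64 bytes with SSE loads and stores,
	// and the two halves simply overlap in the middle when n < 128.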

	MOVB	runtime·memmoveBits(SB), AX
	// We have AVX but we don't want to use REP MOVSx.
	CMPB	AX, $const_avxSupported
	JEQ	avxUnaligned
/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JL	check_avx
	// REP MOVSx is slow if the destination address is unaligned.
	TESTQ	$15, DI
	JNZ	check_avx
	TESTB	$const_repmovsPreferred, AX
	JNZ	fwdBy8
	// For backward copy, REP MOVSx performs worse than AVX.
check_avx:
	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned

	CMPQ	BX, $2048
	JLS	move_256through2048
	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Load the last (possibly partially overlapping) word up front and
	// store it at the very end, after the REP MOVSQ below.
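	// For example, write n = 8k+r with 1 <= r <= 8; then CX = (n-1)>>3 = k,
	// so REP MOVSQ copies bytes [0, 8k), and the final 8-byte store at
	// DI+n-8 covers the remaining r bytes (overlapping already-copied bytes
	// when r < 8, which is harmless since the word was loaded up front).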
	MOVQ	-8(SI)(BX*1), AX
	LEAQ	-8(DI)(BX*1), DX
	// Do 8 bytes at a time
	LEAQ	-1(BX), CX
	SHRQ	$3, CX
	REP;	MOVSQ
	MOVQ	AX, (DX)
	RET

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
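	// CX = SI + BX is the end of the source. If it is at or below DI, the
	// source ends before the destination begins, so even though SI <= DI the
	// regions don't overlap and the forward path is safe; otherwise the
	// destination starts inside the source and we must copy backwards.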

	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned
/*
 * copy the whole thing backwards,
 * with adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
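	// CX = n/8 quadwords are copied backwards by REP MOVSQ (direction flag
	// set by STD above); the remaining n&7 bytes, which sit at the start of
	// the region, are finished by jumping back to tail with SI, DI and BX
	// adjusted accordingly.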
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
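	// A single 8-byte MOVQ keeps a pointer-sized, pointer-aligned word
	// indivisible, so the garbage collector can never observe a partially
	// written pointer (see the memmove Go doc referenced at the top).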
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
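	// Both the first and the last 8 bytes are loaded before either store, so
	// the two (possibly overlapping) stores are correct even when the source
	// and destination overlap. E.g. for n == 12 this copies bytes 0-7 and
	// 4-11. The larger fixed-size cases below use the same trick with wider
	// SSE registers.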
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
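	// (Under the register-based internal ABI on amd64, X15 is reserved as a
	// fixed zero register, so it has to be re-zeroed after being clobbered.)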
	PXOR	X15, X15
	RET
move_256through2048:
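	// Copy 256 bytes per iteration, loading the whole block into X0-X15
	// before storing it; once fewer than 256 bytes remain, jump back to tail
	// to finish the remainder with the fixed-size cases above.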
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	// X15 must be zero on return
	PXOR	X15, X15
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm:
	// one for non-overlapping memory regions, which copies forward,
	// and one for overlapping regions, which copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX holds the distance between the destination and the source.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
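	// This single unsigned comparison handles both cases: if DI < SI the
	// subtraction wraps around to a huge value, the carry is clear, and the
	// forward path is taken (always safe when the destination precedes the
	// source); if DI >= SI and DI-SI < BX, the destination starts inside the
	// source, so the backward path is taken. For example SI=0x1000,
	// DI=0x1040, BX=0x100 gives CX=0x40 < 0x100, hence copy_backward.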

	// Non-temporal copying is better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Store the head at its new place
	// 5. Store the tail at its new place
	// Keeping the pipeline busy matters for small sizes, because the cost of
	// copying the unaligned head and tail is comparable to the cost of the
	// main loop, so the code below interleaves the two. A cleaner
	// implementation of the same algorithm, for bigger sizes where the cost
	// of the unaligned parts is negligible, follows the gobble_big_data_fwd
	// label.
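	// R11 = ((DI & -32) + 32) - DI is the number of bytes (1..32) needed to
	// reach the next 32-byte boundary past DI; e.g. DI = 0x1234 gives
	// (0x1220 + 32) - 0x1234 = 12. Note that an already aligned destination
	// still gets a 32-byte head. Y4 holds the unaligned 32 bytes covering
	// this head.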
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so the tail is addressed with negative offsets.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Reduce the bytes-to-copy count, since the unaligned head is handled separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be stored in place after the main body is copied.
	// Now handle the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body, 128 bytes per iteration.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned head and tail.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically,
	// following the approach to prefetching described in 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
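	// (The two prefetches above pull in cache lines 0x1C0 = 448 and
	// 0x280 = 640 bytes ahead of the current source position.)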
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// Non-temporal stores don't follow the normal cache-coherency rules;
	// an SFENCE is needed to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying works much like the forward version.
	// First, load the unaligned tail, which for a backward copy is the
	// beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
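	// R11 = DI & 0x1F is the end address's offset past the previous 32-byte
	// boundary; since exactly those low bits are set in DI, the XORQ clears
	// them and rounds DI down to a 32-byte boundary, so the main loop can
	// use aligned 32-byte stores. SI and BX are reduced by the same amount
	// below.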
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
