...

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1// Copyright 2025 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
     6
     7//go:build !math_big_pure_go
     8
     9#include "textflag.h"
    10
    // func addVV(z, x, y []Word) (c Word)
    //
    // addVV writes x[i] + y[i] + carry to z[i] for each i < len(z) and returns
    // the carry out of the final word in c (0 or 1; 0 when len(z) is 0).
    // Only len(z) is consulted: x and y are read for that many words with no
    // length check in this routine.
    //
    // The carry travels between iterations in R10 as 0 or -1. SBBQ R10, R10
    // turns CF into that value, and ADDQ R10, R10 turns it back into CF
    // (-1 + -1 carries out, 0 + 0 does not).
    TEXT ·addVV(SB), NOSPLIT, $0
    	MOVQ z_base+0(FP), R8	// R8 = &z[0]
    	MOVQ x_base+24(FP), SI	// SI = &x[0]
    	MOVQ y_base+48(FP), DI	// DI = &y[0]
    	MOVQ z_len+8(FP), BX	// BX = len(z)
    	// split the length: R9 single words, then BX groups of four
    	MOVQ BX, R9
    	ANDQ $3, R9	// R9 = len(z) % 4
    	SHRQ $2, BX	// BX = len(z) / 4
    	MOVQ $0, R10	// no carry pending
    one:
    	TESTQ R9, R9
    	JZ oneDone
    oneLoop:
    	// one word per pass
    	ADDQ R10, R10	// CF = pending carry
    	MOVQ 0(SI), R10	// R10 now holds x[i]; MOVQ leaves CF intact
    	ADCQ 0(DI), R10	// R10 = x[i] + y[i] + CF
    	MOVQ R10, 0(R8)	// z[i] = R10
    	SBBQ R10, R10	// R10 = -CF (0 or -1)
    	LEAQ 8(SI), SI	// next word of x
    	LEAQ 8(DI), DI	// next word of y
    	LEAQ 8(R8), R8	// next word of z
    	SUBQ $1, R9
    	JNZ oneLoop
    oneDone:
    four:
    	TESTQ BX, BX
    	JZ fourDone
    fourLoop:
    	// four words per pass, one unbroken ADCQ chain
    	ADDQ R10, R10	// CF = pending carry
    	MOVQ 0(SI), R9
    	MOVQ 8(SI), R10
    	MOVQ 16(SI), R11
    	MOVQ 24(SI), R12
    	ADCQ 0(DI), R9
    	ADCQ 8(DI), R10
    	ADCQ 16(DI), R11
    	ADCQ 24(DI), R12
    	MOVQ R9, 0(R8)
    	MOVQ R10, 8(R8)
    	MOVQ R11, 16(R8)
    	MOVQ R12, 24(R8)
    	SBBQ R10, R10	// R10 = -CF (0 or -1)
    	LEAQ 32(SI), SI	// next four words of x
    	LEAQ 32(DI), DI	// next four words of y
    	LEAQ 32(R8), R8	// next four words of z
    	SUBQ $1, BX
    	JNZ fourLoop
    fourDone:
    	NEGQ R10	// 0 or -1 becomes 0 or 1
    	MOVQ R10, c+72(FP)
    	RET
    62
    // func subVV(z, x, y []Word) (c Word)
    //
    // subVV writes x[i] - y[i] - borrow to z[i] for each i < len(z) and returns
    // the borrow out of the final word in c (0 or 1; 0 when len(z) is 0).
    // Only len(z) is consulted: x and y are read for that many words with no
    // length check in this routine.
    //
    // The borrow travels between iterations in R10 as 0 or -1. SBBQ R10, R10
    // turns CF into that value, and ADDQ R10, R10 turns it back into CF
    // (-1 + -1 carries out, 0 + 0 does not).
    TEXT ·subVV(SB), NOSPLIT, $0
    	MOVQ z_base+0(FP), R8	// R8 = &z[0]
    	MOVQ x_base+24(FP), SI	// SI = &x[0]
    	MOVQ y_base+48(FP), DI	// DI = &y[0]
    	MOVQ z_len+8(FP), BX	// BX = len(z)
    	// split the length: R9 single words, then BX groups of four
    	MOVQ BX, R9
    	ANDQ $3, R9	// R9 = len(z) % 4
    	SHRQ $2, BX	// BX = len(z) / 4
    	MOVQ $0, R10	// no borrow pending
    one:
    	TESTQ R9, R9
    	JZ oneDone
    oneLoop:
    	// one word per pass
    	ADDQ R10, R10	// CF = pending borrow
    	MOVQ 0(SI), R10	// R10 now holds x[i]; MOVQ leaves CF intact
    	SBBQ 0(DI), R10	// R10 = x[i] - y[i] - CF
    	MOVQ R10, 0(R8)	// z[i] = R10
    	SBBQ R10, R10	// R10 = -CF (0 or -1)
    	LEAQ 8(SI), SI	// next word of x
    	LEAQ 8(DI), DI	// next word of y
    	LEAQ 8(R8), R8	// next word of z
    	SUBQ $1, R9
    	JNZ oneLoop
    oneDone:
    four:
    	TESTQ BX, BX
    	JZ fourDone
    fourLoop:
    	// four words per pass, one unbroken SBBQ chain
    	ADDQ R10, R10	// CF = pending borrow
    	MOVQ 0(SI), R9
    	MOVQ 8(SI), R10
    	MOVQ 16(SI), R11
    	MOVQ 24(SI), R12
    	SBBQ 0(DI), R9
    	SBBQ 8(DI), R10
    	SBBQ 16(DI), R11
    	SBBQ 24(DI), R12
    	MOVQ R9, 0(R8)
    	MOVQ R10, 8(R8)
    	MOVQ R11, 16(R8)
    	MOVQ R12, 24(R8)
    	SBBQ R10, R10	// R10 = -CF (0 or -1)
    	LEAQ 32(SI), SI	// next four words of x
    	LEAQ 32(DI), DI	// next four words of y
    	LEAQ 32(R8), R8	// next four words of z
    	SUBQ $1, BX
    	JNZ fourLoop
    fourDone:
    	NEGQ R10	// 0 or -1 becomes 0 or 1
    	MOVQ R10, c+72(FP)
    	RET
   114
   // func lshVU(z, x []Word, s uint) (c Word)
   //
   // lshVU shifts the len(z)-word vector x left by s bits into z and returns
   // in c the bits pushed out of the top word, i.e. x[len(z)-1] >> (64 - s).
   // With len(z) == 0 nothing is stored and c is 0. Only len(z) is read;
   // x is indexed for that many words.
   //
   // Words are processed from the most significant down. R8 always holds the
   // source word whose shifted value is still to be stored; the three-operand
   // SHLQ shifts its last operand left by CX and fills the vacated low bits
   // from the top of the middle operand.
   //
   // NOTE(review): s is handed to the shifts unchecked; presumably callers
   // keep 0 < s < 64 — confirm against the Go callers.
   TEXT ·lshVU(SB), NOSPLIT, $0
   	MOVQ z_len+8(FP), BX	// BX = len(z)
   	TESTQ BX, BX
   	JZ empty
   	MOVQ x_base+24(FP), SI
   	MOVQ z_base+0(FP), DI
   	MOVQ s+48(FP), CX	// CX = shift count
   	// start one word past the end of each vector and walk downward
   	LEAQ (SI)(BX*8), SI	// SI = &x[len(z)]
   	LEAQ (DI)(BX*8), DI	// DI = &z[len(z)]
   	// the bits leaving the top word are the result c
   	MOVQ -8(SI), R8	// R8 = x[len(z)-1]
   	MOVQ $0, R9
   	SHLQ CX, R8, R9	// R9 = top s bits of R8
   	MOVQ R9, c+56(FP)
   	// len(z)-1 words still need a lower neighbour
   	SUBQ $1, BX
   	// split that count: R9 single words, then BX groups of four
   	MOVQ BX, R9
   	ANDQ $3, R9
   	SHRQ $2, BX
   one:
   	TESTQ R9, R9
   	JZ oneDone
   oneLoop:
   	// one word per pass
   	MOVQ -16(SI), R10	// R10 = next lower source word
   	SHLQ CX, R10, R8	// R8 = R8<<s | R10>>(64-s)
   	MOVQ R8, -8(DI)
   	MOVQ R10, R8	// lower word becomes the pending word
   	LEAQ -8(SI), SI	// step x down one word
   	LEAQ -8(DI), DI	// step z down one word
   	SUBQ $1, R9
   	JNZ oneLoop
   oneDone:
   four:
   	TESTQ BX, BX
   	JZ fourDone
   fourLoop:
   	// four words per pass; R9 is free here because its count reached zero
   	MOVQ -16(SI), R9
   	MOVQ -24(SI), R10
   	MOVQ -32(SI), R11
   	MOVQ -40(SI), R12
   	SHLQ CX, R9, R8
   	SHLQ CX, R10, R9
   	SHLQ CX, R11, R10
   	SHLQ CX, R12, R11
   	MOVQ R8, -8(DI)
   	MOVQ R9, -16(DI)
   	MOVQ R10, -24(DI)
   	MOVQ R11, -32(DI)
   	MOVQ R12, R8	// lowest word loaded becomes the pending word
   	LEAQ -32(SI), SI	// step x down four words
   	LEAQ -32(DI), DI	// step z down four words
   	SUBQ $1, BX
   	JNZ fourLoop
   fourDone:
   	// lowest word has no neighbour below it: plain shift, zeros enter
   	SHLQ CX, R8
   	MOVQ R8, -8(DI)	// DI is &z[1] here, so this stores z[0]
   	RET
   empty:
   	MOVQ $0, c+56(FP)
   	RET
   176
   // func rshVU(z, x []Word, s uint) (c Word)
   //
   // rshVU shifts the len(z)-word vector x right by s bits into z and returns
   // in c the bits pushed out of the bottom word, placed at the top of the
   // result word, i.e. x[0] << (64 - s). With len(z) == 0 nothing is stored
   // and c is 0. Only len(z) is read; x is indexed for that many words.
   //
   // Words are processed from the least significant up. R8 always holds the
   // source word whose shifted value is still to be stored; the three-operand
   // SHRQ shifts its last operand right by CX and fills the vacated high bits
   // from the bottom of the middle operand.
   //
   // NOTE(review): s is handed to the shifts unchecked; presumably callers
   // keep 0 < s < 64 — confirm against the Go callers.
   TEXT ·rshVU(SB), NOSPLIT, $0
   	MOVQ z_len+8(FP), BX	// BX = len(z)
   	TESTQ BX, BX
   	JZ empty
   	MOVQ x_base+24(FP), SI	// SI = &x[0]
   	MOVQ z_base+0(FP), DI	// DI = &z[0]
   	MOVQ s+48(FP), CX	// CX = shift count
   	// the bits leaving the bottom word are the result c
   	MOVQ 0(SI), R8	// R8 = x[0]
   	MOVQ $0, R9
   	SHRQ CX, R8, R9	// R9 = low s bits of R8, moved to the top
   	MOVQ R9, c+56(FP)
   	// len(z)-1 words still need an upper neighbour
   	SUBQ $1, BX
   	// split that count: R9 single words, then BX groups of four
   	MOVQ BX, R9
   	ANDQ $3, R9
   	SHRQ $2, BX
   one:
   	TESTQ R9, R9
   	JZ oneDone
   oneLoop:
   	// one word per pass
   	MOVQ 8(SI), R10	// R10 = next higher source word
   	SHRQ CX, R10, R8	// R8 = R8>>s | R10<<(64-s)
   	MOVQ R8, 0(DI)
   	MOVQ R10, R8	// higher word becomes the pending word
   	LEAQ 8(SI), SI	// step x up one word
   	LEAQ 8(DI), DI	// step z up one word
   	SUBQ $1, R9
   	JNZ oneLoop
   oneDone:
   four:
   	TESTQ BX, BX
   	JZ fourDone
   fourLoop:
   	// four words per pass; R9 is free here because its count reached zero
   	MOVQ 8(SI), R9
   	MOVQ 16(SI), R10
   	MOVQ 24(SI), R11
   	MOVQ 32(SI), R12
   	SHRQ CX, R9, R8
   	SHRQ CX, R10, R9
   	SHRQ CX, R11, R10
   	SHRQ CX, R12, R11
   	MOVQ R8, 0(DI)
   	MOVQ R9, 8(DI)
   	MOVQ R10, 16(DI)
   	MOVQ R11, 24(DI)
   	MOVQ R12, R8	// highest word loaded becomes the pending word
   	LEAQ 32(SI), SI	// step x up four words
   	LEAQ 32(DI), DI	// step z up four words
   	SUBQ $1, BX
   	JNZ fourLoop
   fourDone:
   	// top word has no neighbour above it: plain shift, zeros enter
   	SHRQ CX, R8
   	MOVQ R8, 0(DI)	// DI is &z[len(z)-1] here
   	RET
   empty:
   	MOVQ $0, c+56(FP)
   	RET
   235
   // func mulAddVWW(z, x []Word, m, a Word) (c Word)
   //
   // mulAddVWW computes z = x*m + a one word at a time. For each i < len(z)
   // the 128-bit product x[i]*m plus the running carry is formed; its low
   // word is stored to z[i] and its high word becomes the next carry. The
   // carry starts as a, and the final carry is returned in c (so c == a when
   // len(z) is 0). Only len(z) is read; x is indexed for that many words.
   //
   // Registers: BX = m, SI = running carry, R8 walks x, R9 walks z.
   // MULQ leaves the product in DX:AX.
   TEXT ·mulAddVWW(SB), NOSPLIT, $0
   	MOVQ z_base+0(FP), R9
   	MOVQ x_base+24(FP), R8
   	MOVQ m+48(FP), BX
   	MOVQ a+56(FP), SI
   	MOVQ z_len+8(FP), DI
   	// split the length: R10 single words, then DI groups of four
   	MOVQ DI, R10
   	ANDQ $3, R10
   	SHRQ $2, DI
   one:
   	TESTQ R10, R10
   	JZ oneDone
   oneLoop:
   	// one word per pass
   	MOVQ 0(R8), AX
   	MULQ BX	// DX:AX = x[i] * m
   	ADDQ SI, AX	// low word += carry in
   	MOVQ DX, SI	// MOVQ keeps CF from the ADDQ
   	ADCQ $0, SI	// carry out = high word + CF
   	MOVQ AX, 0(R9)
   	LEAQ 8(R8), R8	// next word of x
   	LEAQ 8(R9), R9	// next word of z
   	SUBQ $1, R10
   	JNZ oneLoop
   oneDone:
   four:
   	TESTQ DI, DI
   	JZ fourDone
   fourLoop:
   	// four words per pass; each step is the single-word sequence above
   	// word 0
   	MOVQ 0(R8), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	MOVQ AX, 0(R9)
   	// word 1
   	MOVQ 8(R8), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	MOVQ AX, 8(R9)
   	// word 2
   	MOVQ 16(R8), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	MOVQ AX, 16(R9)
   	// word 3
   	MOVQ 24(R8), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	MOVQ AX, 24(R9)
   	LEAQ 32(R8), R8	// next four words of x
   	LEAQ 32(R9), R9	// next four words of z
   	SUBQ $1, DI
   	JNZ fourLoop
   fourDone:
   	MOVQ SI, c+64(FP)
   	RET
   300
   // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
   //
   // addMulVVWW computes z = x + y*m + a one word at a time. For each
   // i < len(z), y[i]*m plus the running carry plus x[i] is formed; its low
   // word is stored to z[i] and the overflow becomes the next carry. The
   // carry starts as a, and the final carry is returned in c (so c == a when
   // len(z) is 0). Only len(z) is read; x and y are indexed for that many
   // words.
   //
   // Two implementations follow, chosen by the byte ·hasADX (defined outside
   // this function; presumably set when the CPU supports the MULX/ADCX/ADOX
   // instructions used below — confirm at its definition):
   //   - zero: MULQ with ADDQ/ADCQ, unrolled four times;
   //   - non-zero: MULXQ with two independent carry chains (ADCXQ uses CF,
   //     ADOXQ uses OF), unrolled eight times.
   TEXT ·addMulVVWW(SB), NOSPLIT, $0
   	CMPB ·hasADX(SB), $0
   	JNZ adx
   	// MULQ path. BX = m, SI = running carry, R8/R9/R10 walk x/y/z.
   	MOVQ z_base+0(FP), R10
   	MOVQ x_base+24(FP), R8
   	MOVQ y_base+48(FP), R9
   	MOVQ m+72(FP), BX
   	MOVQ a+80(FP), SI
   	MOVQ z_len+8(FP), DI
   	// split the length: R11 single words, then DI groups of four
   	MOVQ DI, R11
   	ANDQ $3, R11
   	SHRQ $2, DI
   one:
   	TESTQ R11, R11
   	JZ oneDone
   oneLoop:
   	// one word per pass
   	MOVQ 0(R9), AX
   	MULQ BX	// DX:AX = y[i] * m
   	ADDQ SI, AX	// low word += carry in
   	MOVQ DX, SI	// MOVQ keeps CF from the ADDQ
   	ADCQ $0, SI	// carry = high word + CF
   	ADDQ 0(R8), AX	// low word += x[i]
   	ADCQ $0, SI	// carry += CF
   	MOVQ AX, 0(R10)
   	LEAQ 8(R8), R8	// next word of x
   	LEAQ 8(R9), R9	// next word of y
   	LEAQ 8(R10), R10	// next word of z
   	SUBQ $1, R11
   	JNZ oneLoop
   oneDone:
   four:
   	TESTQ DI, DI
   	JZ fourDone
   fourLoop:
   	// four words per pass; each step is the single-word sequence above
   	// word 0
   	MOVQ 0(R9), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	ADDQ 0(R8), AX
   	ADCQ $0, SI
   	MOVQ AX, 0(R10)
   	// word 1
   	MOVQ 8(R9), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	ADDQ 8(R8), AX
   	ADCQ $0, SI
   	MOVQ AX, 8(R10)
   	// word 2
   	MOVQ 16(R9), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	ADDQ 16(R8), AX
   	ADCQ $0, SI
   	MOVQ AX, 16(R10)
   	// word 3
   	MOVQ 24(R9), AX
   	MULQ BX
   	ADDQ SI, AX
   	MOVQ DX, SI
   	ADCQ $0, SI
   	ADDQ 24(R8), AX
   	ADCQ $0, SI
   	MOVQ AX, 24(R10)
   	LEAQ 32(R8), R8	// next four words of x
   	LEAQ 32(R9), R9	// next four words of y
   	LEAQ 32(R10), R10	// next four words of z
   	SUBQ $1, DI
   	JNZ fourLoop
   fourDone:
   	MOVQ SI, c+88(FP)
   	RET
   adx:
   	// MULX path. DX = m (implicit MULXQ operand), BX = running carry,
   	// DI = constant 0 used to fold the flags into BX, R8/R9/R10 walk x/y/z.
   	MOVQ z_base+0(FP), R10
   	MOVQ x_base+24(FP), R8
   	MOVQ y_base+48(FP), R9
   	MOVQ m+72(FP), DX
   	MOVQ a+80(FP), BX
   	MOVQ $0, DI
   	MOVQ z_len+8(FP), SI
   	// split the length: R11 single words, then SI groups of eight
   	MOVQ SI, R11
   	ANDQ $7, R11
   	SHRQ $3, SI
   adxOne:
   	TESTQ R11, R11
   	JZ adxOneDone
   adxOneLoop:
   	// one word per pass
   	TESTQ AX, AX	// clears CF and OF; the value in AX is irrelevant
   	TESTQ AX, AX	// repeated as in the generated original; flags already clear
   	MULXQ 0(R9), R13, R12	// R12:R13 = y[i] * m, flags untouched
   	ADCXQ BX, R13	// low word += carry in (CF chain)
   	ADOXQ 0(R8), R13	// low word += x[i] (OF chain)
   	MOVQ R13, 0(R10)
   	MOVQ R12, BX	// carry = high word ...
   	ADCXQ DI, BX	// ... + CF
   	ADOXQ DI, BX	// ... + OF
   	LEAQ 8(R8), R8	// next word of x
   	LEAQ 8(R9), R9	// next word of y
   	LEAQ 8(R10), R10	// next word of z
   	SUBQ $1, R11
   	JNZ adxOneLoop
   adxOneDone:
   adxEight:
   	TESTQ SI, SI
   	JZ adxEightDone
   adxEightLoop:
   	// Eight words per pass, handled as four pairs. Within a pair the high
   	// word of the first product (R11) feeds the second word's CF chain and
   	// the high word of the second product (BX) feeds the next pair. CF and
   	// OF stay live across all eight words and are folded into BX at the end.
   	// R11 is free here because its single-word count reached zero.
   	TESTQ AX, AX	// clears CF and OF; the value in AX is irrelevant
   	TESTQ AX, AX	// repeated as in the generated original; flags already clear
   	// words 0 and 1
   	MULXQ 0(R9), R13, R11
   	ADCXQ BX, R13
   	ADOXQ 0(R8), R13
   	MULXQ 8(R9), R14, BX
   	ADCXQ R11, R14
   	ADOXQ 8(R8), R14
   	MOVQ R13, 0(R10)
   	MOVQ R14, 8(R10)
   	// words 2 and 3
   	MULXQ 16(R9), R13, R11
   	ADCXQ BX, R13
   	ADOXQ 16(R8), R13
   	MULXQ 24(R9), R14, BX
   	ADCXQ R11, R14
   	ADOXQ 24(R8), R14
   	MOVQ R13, 16(R10)
   	MOVQ R14, 24(R10)
   	// words 4 and 5
   	MULXQ 32(R9), R13, R11
   	ADCXQ BX, R13
   	ADOXQ 32(R8), R13
   	MULXQ 40(R9), R14, BX
   	ADCXQ R11, R14
   	ADOXQ 40(R8), R14
   	MOVQ R13, 32(R10)
   	MOVQ R14, 40(R10)
   	// words 6 and 7
   	MULXQ 48(R9), R13, R11
   	ADCXQ BX, R13
   	ADOXQ 48(R8), R13
   	MULXQ 56(R9), R14, BX
   	ADCXQ R11, R14
   	ADOXQ 56(R8), R14
   	MOVQ R13, 48(R10)
   	MOVQ R14, 56(R10)
   	// fold both pending flags into the carry word
   	ADCXQ DI, BX	// BX += CF
   	ADOXQ DI, BX	// BX += OF
   	LEAQ 64(R8), R8	// next eight words of x
   	LEAQ 64(R9), R9	// next eight words of y
   	LEAQ 64(R10), R10	// next eight words of z
   	SUBQ $1, SI
   	JNZ adxEightLoop
   adxEightDone:
   	MOVQ BX, c+88(FP)
   	RET

View as plain text