...

Text file src/math/big/arith_ppc64x.s

Documentation: math/big

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !math_big_pure_go && (ppc64 || ppc64le)
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// func addVV(z, x, y []Word) (c Word)
    13// z[i] = x[i] + y[i] for all i, propagating the carry; c is the carry out of the last word
    14TEXT ·addVV(SB), NOSPLIT, $0
	// Register roles:
	//   R7          = words still to process (starts at z_len)
	//   R8, R9, R10 = cursors into x, y and z
	//   R4          = c; zero until CA is folded into it at "final"
	//   CTR         = iteration count of the 4-way unrolled loop
	// The carry lives in CA for the whole pass: ADDC starts the chain and
	// every ADDE consumes and regenerates it.
    15	MOVD  z_len+8(FP), R7   // R7 = z_len
    16	MOVD  x+24(FP), R8      // R8 = x[]
    17	MOVD  y+48(FP), R9      // R9 = y[]
    18	MOVD  z+0(FP), R10      // R10 = z[]
    19
    20	// If z_len = 0, we are done
    21	CMP   R7, $0
    22	MOVD  R0, R4            // R4 = c = 0 (R0 is the zero register under Go's ppc64 convention)
    23	BEQ   done
    24
    25	// Process the first iteration out of the loop so we can
    26	// use MOVDU and avoid 3 index registers updates.
    27	MOVD  0(R8), R11      // R11 = x[0]
    28	MOVD  0(R9), R12      // R12 = y[0]
    29	ADD   $-1, R7         // R7 = z_len - 1
    30	ADDC  R12, R11, R15   // R15 = x[0] + y[0], set CA
    31	CMP   R7, $0
    32	MOVD  R15, 0(R10)     // z[0]
    33	BEQ   final          // If z_len was 1, we are done
    34
    35	SRD   $2, R7, R5      // R5 = (z_len-1)/4, i.e. remaining words / 4
    36	CMP   R5, $0
    37	MOVD  R5, CTR         // Set up loop counter
    38	BEQ   tail            // If R5 = 0, we can't use the loop
    39
    40	// Process 4 elements per iteration. Unrolling this loop
    41	// means a performance trade-off: we will lose performance
    42	// for small values of z_len (0.90x in the worst case), but
    43	// gain significant performance as z_len increases (up to
    44	// 1.45x).
    45
    46	PCALIGN $16
    47loop:
    48	MOVD  8(R8), R11      // R11 = x[i]
    49	MOVD  16(R8), R12     // R12 = x[i+1]
    50	MOVD  24(R8), R14     // R14 = x[i+2]
    51	MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32
    52	MOVD  8(R9), R16      // R16 = y[i]
    53	MOVD  16(R9), R17     // R17 = y[i+1]
    54	MOVD  24(R9), R18     // R18 = y[i+2]
    55	MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32
    56	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    57	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    58	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    59	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    60	MOVD  R20, 8(R10)     // z[i]
    61	MOVD  R21, 16(R10)    // z[i+1]
    62	MOVD  R22, 24(R10)    // z[i+2]
    63	MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32
    64	ADD   $-4, R7         // R7 -= 4 (words still to process)
    65	BDNZ  loop
    66
    67	// We may have more elements to read
    68	CMP   R7, $0
    69	BEQ   final
    70
    71	// Process the remaining elements, one at a time.
	// Only 1 to 3 words can remain here, so the tail is three unrolled
	// single-word steps; the last one needs neither a pointer update
	// nor a length check.
    72tail:
    73	MOVDU 8(R8), R11      // R11 = x[i]
    74	MOVDU 8(R9), R16      // R16 = y[i]
    75	ADD   $-1, R7         // R7 -= 1
    76	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    77	CMP   R7, $0
    78	MOVDU R20, 8(R10)     // z[i]
    79	BEQ   final           // If R7 = 0, we are done
    80
    81	MOVDU 8(R8), R11
    82	MOVDU 8(R9), R16
    83	ADD   $-1, R7
    84	ADDE  R11, R16, R20
    85	CMP   R7, $0
    86	MOVDU R20, 8(R10)
    87	BEQ   final
    88
    89	MOVD  8(R8), R11
    90	MOVD  8(R9), R16
    91	ADDE  R11, R16, R20
    92	MOVD  R20, 8(R10)
    93
    94final:
    95	ADDZE R4              // Capture CA: R4 = 0 + CA
    96
    97done:
    98	MOVD  R4, c+72(FP)
    99	RET
   100
   101// func subVV(z, x, y []Word) (c Word)
   102// z[i] = x[i] - y[i] for all i, propagating the borrow; c is the borrow out of the last word
   103TEXT ·subVV(SB), NOSPLIT, $0
	// Register roles:
	//   R7          = words still to process (starts at z_len)
	//   R8, R9, R10 = cursors into x, y and z
	//   R4          = c; zero until the borrow is derived from CA at "final"
	//   CTR         = iteration count of the 4-way unrolled loop
	// The borrow is carried in CA in inverted form (CA = 1 means "no
	// borrow"), which is why "final" flips the captured bit.
   104	MOVD  z_len+8(FP), R7 // R7 = z_len
   105	MOVD  x+24(FP), R8    // R8 = x[]
   106	MOVD  y+48(FP), R9    // R9 = y[]
   107	MOVD  z+0(FP), R10    // R10 = z[]
   108
   109	// If z_len = 0, we are done
   110	CMP   R7, $0
   111	MOVD  R0, R4          // R4 = c = 0 (R0 is the zero register under Go's ppc64 convention)
   112	BEQ   done
   113
   114	// Process the first iteration out of the loop so we can
   115	// use MOVDU and avoid 3 index registers updates.
   116	MOVD  0(R8), R11      // R11 = x[0]
   117	MOVD  0(R9), R12      // R12 = y[0]
   118	ADD   $-1, R7         // R7 = z_len - 1
   119	SUBC  R12, R11, R15   // R15 = x[0] - y[0], set CA
   120	CMP   R7, $0
   121	MOVD  R15, 0(R10)     // z[0]
   122	BEQ   final           // If z_len was 1, we are done
   123
   124	SRD   $2, R7, R5      // R5 = (z_len-1)/4, i.e. remaining words / 4
   125	CMP   R5, $0
   126	MOVD  R5, CTR         // Set up loop counter
   127	BEQ   tail            // If R5 = 0, we can't use the loop
   128
   129	// Process 4 elements per iteration. Unrolling this loop
   130	// means a performance trade-off: we will lose performance
   131	// for small values of z_len (0.92x in the worst case), but
   132	// gain significant performance as z_len increases (up to
   133	// 1.45x).
   134
   135	PCALIGN $16
   136loop:
   137	MOVD  8(R8), R11      // R11 = x[i]
   138	MOVD  16(R8), R12     // R12 = x[i+1]
   139	MOVD  24(R8), R14     // R14 = x[i+2]
   140	MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32
   141	MOVD  8(R9), R16      // R16 = y[i]
   142	MOVD  16(R9), R17     // R17 = y[i+1]
   143	MOVD  24(R9), R18     // R18 = y[i+2]
   144	MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32
   145	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = !CA)
   146	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - borrow
   147	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - borrow
   148	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - borrow
   149	MOVD  R20, 8(R10)     // z[i]
   150	MOVD  R21, 16(R10)    // z[i+1]
   151	MOVD  R22, 24(R10)    // z[i+2]
   152	MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32
   153	ADD   $-4, R7         // R7 -= 4 (words still to process)
   154	BDNZ  loop
   155
   156	// We may have more elements to read
   157	CMP   R7, $0
   158	BEQ   final
   159
   160	// Process the remaining elements, one at a time.
	// Only 1 to 3 words can remain here, so the tail is three unrolled
	// single-word steps; the last one needs neither a pointer update
	// nor a length check.
   161tail:
   162	MOVDU 8(R8), R11      // R11 = x[i]
   163	MOVDU 8(R9), R16      // R16 = y[i]
   164	ADD   $-1, R7         // R7 -= 1
   165	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow
   166	CMP   R7, $0
   167	MOVDU R20, 8(R10)     // z[i]
   168	BEQ   final           // If R7 = 0, we are done
   169
   170	MOVDU 8(R8), R11
   171	MOVDU 8(R9), R16
   172	ADD   $-1, R7
   173	SUBE  R16, R11, R20
   174	CMP   R7, $0
   175	MOVDU R20, 8(R10)
   176	BEQ   final
   177
   178	MOVD  8(R8), R11
   179	MOVD  8(R9), R16
   180	SUBE  R16, R11, R20
   181	MOVD  R20, 8(R10)
   182
   183final:
   184	ADDZE R4              // R4 = 0 + CA
   185	XOR   $1, R4          // c = CA ^ 1: turn "no borrow" (CA = 1) into 0 and a borrow into 1
   186
   187done:
   188	MOVD  R4, c+72(FP)
   189	RET
   190
   191// func addVW(z, x []Word, y Word) (c Word)
   192TEXT ·addVW(SB), NOSPLIT, $0
	// z[i] = x[i] + (carry in), where the carry into word 0 is y; c is the
	// carry out of the last word. When z_len is 0 the function returns
	// c = y, because R4 still holds y when control reaches "done".
	// Register roles:
	//   R10, R8 = cursors into z and x
	//   R4      = y on entry, c on exit
	//   R11     = words still to process
	//   CTR     = iteration count of the 4-way unrolled loop
   193	MOVD z+0(FP), R10	// R10 = z[]
   194	MOVD x+24(FP), R8	// R8 = x[]
   195	MOVD y+48(FP), R4	// R4 = y = c
   196	MOVD z_len+8(FP), R11	// R11 = z_len
   197
   198	CMP   R11, $0		// If z_len is zero, return
   199	BEQ   done
   200
   201	// We will process the first iteration out of the loop so we capture
   202	// the value of c. In the subsequent iterations, we will rely on the
   203	// value of CA set here.
   204	MOVD  0(R8), R20	// R20 = x[0]
   205	ADD   $-1, R11		// R11 = z_len - 1
   206	ADDC  R20, R4, R6	// R6 = x[0] + c, set CA
   207	CMP   R11, $0		// If z_len was 1, we are done
   208	MOVD  R6, 0(R10)	// z[0]
   209	BEQ   final
   210
   211	// We will read 4 elements per iteration
   212	SRDCC $2, R11, R9	// R9 = (z_len-1)/4; the CC form also sets the condition tested by BEQ below
   213	DCBT  (R8)		// cache-touch hint for the x data
   214	MOVD  R9, CTR		// Set up the loop counter
   215	BEQ   tail		// If R9 = 0, we can't use the loop
   216	PCALIGN $16
   217
   218loop:
   219	MOVD  8(R8), R20	// R20 = x[i]
   220	MOVD  16(R8), R21	// R21 = x[i+1]
   221	MOVD  24(R8), R22	// R22 = x[i+2]
   222	MOVDU 32(R8), R23	// R23 = x[i+3], and advance R8 by 32
   223	ADDZE R20, R24		// R24 = x[i] + CA
   224	ADDZE R21, R25		// R25 = x[i+1] + CA
   225	ADDZE R22, R26		// R26 = x[i+2] + CA
   226	ADDZE R23, R27		// R27 = x[i+3] + CA
   227	MOVD  R24, 8(R10)	// z[i]
   228	MOVD  R25, 16(R10)	// z[i+1]
   229	MOVD  R26, 24(R10)	// z[i+2]
   230	MOVDU R27, 32(R10)	// z[i+3], and advance R10 by 32
   231	ADD   $-4, R11		// R11 -= 4 (words still to process)
   232	BDNZ  loop
   233
   234	// We may have some elements to read
   235	CMP R11, $0
   236	BEQ final
   237
	// Only 1 to 3 words can remain here: three unrolled single-word
	// steps, the last one without pointer update or length check.
   238tail:
   239	MOVDU 8(R8), R20	// R20 = x[i]
   240	ADDZE R20, R24		// R24 = x[i] + CA
   241	ADD $-1, R11
   242	MOVDU R24, 8(R10)	// z[i]
   243	CMP R11, $0
   244	BEQ final
   245
   246	MOVDU 8(R8), R20
   247	ADDZE R20, R24
   248	ADD $-1, R11
   249	MOVDU R24, 8(R10)
   250	CMP R11, $0
   251	BEQ final
   252
   253	MOVD 8(R8), R20
   254	ADDZE R20, R24
   255	MOVD R24, 8(R10)
   256
   257final:
   258	ADDZE R0, R4		// c = CA
   259done:
   260	MOVD  R4, c+56(FP)
   261	RET
   262
   263// func subVW(z, x []Word, y Word) (c Word)
   264TEXT ·subVW(SB), NOSPLIT, $0
	// z[i] = x[i] - (borrow in), where the borrow into word 0 is y; c is
	// the borrow out of the last word. When z_len is 0 the function
	// returns c = y, because R4 still holds y when control reaches "done".
	// Register roles:
	//   R10, R8 = cursors into z and x
	//   R4      = y on entry, c on exit
	//   R11     = words still to process
	//   CTR     = iteration count of the 4-way unrolled loop
	// The borrow is carried in CA in inverted form (CA = 1 means "no
	// borrow"); subtracting R0 (zero) with SUBE therefore subtracts just
	// the pending borrow.
   265	MOVD  z+0(FP), R10	// R10 = z[]
   266	MOVD  x+24(FP), R8	// R8 = x[]
   267	MOVD  y+48(FP), R4	// R4 = y = c
   268	MOVD  z_len+8(FP), R11	// R11 = z_len
   269
   270	CMP   R11, $0		// If z_len is zero, return
   271	BEQ   done
   272
   273	// We will process the first iteration out of the loop so we capture
   274	// the value of c. In the subsequent iterations, we will rely on the
   275	// value of CA set here.
   276	MOVD  0(R8), R20	// R20 = x[0]
   277	ADD   $-1, R11		// R11 = z_len - 1
   278	SUBC  R4, R20, R6	// R6 = x[0] - c, set CA
   279	CMP   R11, $0		// If z_len was 1, we are done
   280	MOVD  R6, 0(R10)	// z[0]
   281	BEQ   final
   282
   283	// We will read 4 elements per iteration
   284	SRDCC $2, R11, R9	// R9 = (z_len-1)/4; the CC form also sets the condition tested by BEQ below
   285	DCBT  (R8)		// cache-touch hint for the x data
   286	MOVD  R9, CTR		// Set up the loop counter
   287	BEQ   tail		// If R9 = 0, we can't use the loop
   288
   289	// The loop here is almost the same as the one used in s390x, but
   290	// we don't need to capture CA every iteration because we've already
   291	// done that above.
   292
   293	PCALIGN $16
   294loop:
   295	MOVD  8(R8), R20	// R20 = x[i]
   296	MOVD  16(R8), R21	// R21 = x[i+1]
   297	MOVD  24(R8), R22	// R22 = x[i+2]
   298	MOVDU 32(R8), R23	// R23 = x[i+3], and advance R8 by 32
   299	SUBE  R0, R20		// R20 = x[i] - 0 - borrow
   300	SUBE  R0, R21		// R21 = x[i+1] - 0 - borrow
   301	SUBE  R0, R22		// R22 = x[i+2] - 0 - borrow
   302	SUBE  R0, R23		// R23 = x[i+3] - 0 - borrow
   303	MOVD  R20, 8(R10)	// z[i]
   304	MOVD  R21, 16(R10)	// z[i+1]
   305	MOVD  R22, 24(R10)	// z[i+2]
   306	MOVDU R23, 32(R10)	// z[i+3], and advance R10 by 32
   307	ADD   $-4, R11		// R11 -= 4 (words still to process)
   308	BDNZ  loop
   309
   310	// We may have some elements to read
   311	CMP   R11, $0
   312	BEQ   final
   313
	// Only 1 to 3 words can remain here: three unrolled single-word
	// steps, the last one without pointer update or length check.
   314tail:
   315	MOVDU 8(R8), R20	// R20 = x[i]
   316	SUBE  R0, R20		// R20 = x[i] - borrow
   317	ADD   $-1, R11
   318	MOVDU R20, 8(R10)	// z[i]
   319	CMP   R11, $0
   320	BEQ   final
   321
   322	MOVDU 8(R8), R20
   323	SUBE  R0, R20
   324	ADD   $-1, R11
   325	MOVDU R20, 8(R10)
   326	CMP   R11, $0
   327	BEQ   final
   328
   329	MOVD  8(R8), R20
   330	SUBE  R0, R20
   331	MOVD  R20, 8(R10)
   332
   333final:
   334	// Capture CA
   335	SUBE  R4, R4		// R4 = R4 - R4 - borrow: 0 if no borrow, -1 if there was one
   336	NEG   R4, R4		// c = 0 or 1
   337
   338done:
   339	MOVD  R4, c+56(FP)
   340	RET
   341
   342//func shlVU(z, x []Word, s uint) (c Word)
   343TEXT ·shlVU(SB), NOSPLIT, $0
	// Shifts x left by s bits into z, working from the most significant
	// word down to word 0, and returns in c the bits shifted out of
	// x[len(z)-1], i.e. x[len(z)-1]>>(64-s).
	// Special cases, both returning c = 0:
	//   s == 0      -> copy min(len(x), len(z)) words from x to z
	//   len(z) == 0 -> nothing to do
	// Register roles on the shifting path:
	//   R3, R6   = base addresses of z and x
	//   R9       = s, R4 = ŝ = 64-s
	//   R15, R16 = cursors into x and z
	//   R7       = the value returned in c
   344	MOVD    z+0(FP), R3
   345	MOVD    x+24(FP), R6
   346	MOVD    s+48(FP), R9
   347	MOVD    z_len+8(FP), R4
   348	MOVD    x_len+32(FP), R7
   349	CMP     R9, $0          // s==0 copy(z,x)
   350	BEQ     zeroshift
   351	CMP     R4, $0          // len(z)==0 return
   352	BEQ     done
   353
   354	ADD     $-1, R4, R5     // len(z)-1
   355	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   356	SLD     $3, R5, R7
   357	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
   358	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
   359	MOVD    (R6)(R7), R14
   360	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
   361	CMP     R5, $0          // iterate from i=len(z)-1 to 0
   362	BEQ     loopexit        // Already at end?
   363	MOVD	0(R15),R10	// x[i]
   364	PCALIGN $16
   365shloop:
   366	SLD     R9, R10, R10    // x[i]<<s
   367	MOVDU   -8(R15), R14    // load x[i-1] and step the x cursor down one word
   368	SRD     R4, R14, R11    // x[i-1]>>ŝ
   369	OR      R11, R10, R10
   370	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
   371	MOVD	R14, R10	// reuse x[i-1] for next iteration
   372	ADD     $-8, R16        // i--
   373	CMP     R15, R6         // &x[i-1]>&x[0]?
   374	BGT     shloop
   375loopexit:
   376	MOVD    0(R6), R4
   377	SLD     R9, R4, R4
   378	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
   379	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
   380	RET
   381
   382zeroshift:
   383	CMP     R6, $0          // x is null, nothing to copy
   384	BEQ     done
   385	CMP     R6, R3          // if x is same as z, nothing to copy
   386	BEQ     done
   387	CMP     R7, R4
   388	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
	// Both copy loops below move a word before testing their bound, so a
	// zero word count must be filtered out here; otherwise one word would
	// be read from x and written to z even though there is nothing to copy.
	CMP     R7, $0          // min(len(x), len(z)) == 0, nothing to copy
	BEQ     done
   389	SLD     $3, R7, R7      // word count -> byte count
   390	SUB     R6, R3, R11     // dest - src
   391	CMPU    R11, R7, CR2    // < len?
   392	BLT     CR2, backward   // there is overlap, copy backwards
   393	MOVD    $0, R14
   394	// shlVU processes backwards, but a forward copy is used when the
   395	// ranges do not overlap since it's faster on POWER
   396repeat:
   397	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
   398	MOVD    R15, (R3)(R14)
   399	ADD     $8, R14
   400	CMP     R14, R7         // More 8 bytes left?
   401	BLT     repeat
   402	BR      done
   403backward:
   404	ADD     $-8,R7, R14     // byte offset of the last word to copy
   405repeatback:
   406	MOVD    (R6)(R14), R15  // copy x into z backwards
   407	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
   408	SUB     $8, R14
   409	CMP     R14, $-8        // More 8 bytes left?
   410	BGT     repeatback
   411
   412done:
   413	MOVD    R0, c+56(FP)    // c=0
   414	RET
   415
   416//func shrVU(z, x []Word, s uint) (c Word)
   417TEXT ·shrVU(SB), NOSPLIT, $0
	// Shifts x right by s bits into z, working from word 0 upwards, and
	// returns in c the bits shifted out of x[0], i.e. x[0]<<(64-s).
	// Special cases, both returning c = 0:
	//   s == 0      -> copy min(len(x), len(z)) words from x to z
	//   len(z) == 0 -> nothing to do
	// Register roles on the shifting path:
	//   R3, R6 = base addresses of z and x
	//   R9     = s, R5 = ŝ = 64-s
	//   R8     = i, the index of the word being combined with x[i-1]
	//   R7     = the value returned in c
	// With len(z) >= 3 two result words are produced per iteration using
	// vector registers; VS32..VS39 are the VSX names of V0..V7 used below.
   418	MOVD    z+0(FP), R3
   419	MOVD    x+24(FP), R6
   420	MOVD    s+48(FP), R9
   421	MOVD    z_len+8(FP), R4
   422	MOVD    x_len+32(FP), R7
   423
   424	CMP     R9, $0          // s==0, copy(z,x)
   425	BEQ     zeroshift
   426	CMP     R4, $0          // len(z)==0 return
   427	BEQ     done
   428	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   429
   430	MOVD    0(R6), R7
   431	SLD     R5, R7, R7      // compute x[0]<<ŝ
   432	MOVD    $1, R8          // iterate from i=1 to i<len(z)
   433	CMP     R8, R4
   434	BGE     loopexit        // Already at end?
   435
   436	// vectorize if len(z) is >=3, else jump to scalar loop
   437	CMP     R4, $3
   438	BLT     scalar
   439	MTVSRD  R9, VS38        // s
   440	VSPLTB  $7, V6, V4      // V4 = s replicated into every byte
   441	MTVSRD  R5, VS39        // ŝ
   442	VSPLTB  $7, V7, V2      // V2 = ŝ replicated into every byte
   443	ADD     $-2, R4, R16    // R16 = len(z)-2, last i for which x[i+1] exists
   444	PCALIGN $16
   445loopback:
   446	ADD     $-1, R8, R10
   447	SLD     $3, R10         // R10 = byte offset of word i-1
   448	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
   449	SLD     $3, R8, R12     // R12 = byte offset of word i
   450	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]
   451
   452	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
   453	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
   454	VOR     V3, V5, V5      // Or(|) the two registers together
   455	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
   456	ADD     $2, R8          // Done processing 2 entries, i and i+1
   457	CMP     R8, R16         // Are there at least a couple of more entries left?
   458	BLE     loopback
   459	CMP     R8, R4          // Are we at the last element?
   460	BEQ     loopexit
   461scalar:	
   462	ADD     $-1, R8, R10
   463	SLD     $3, R10
   464	MOVD    (R6)(R10),R11
   465	SRD     R9, R11, R11    // x[len(z)-2] >> s
   466	SLD     $3, R8, R12
   467	MOVD    (R6)(R12), R12
   468	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
   469	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   470	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   471loopexit:
   472	ADD     $-1, R4
   473	SLD     $3, R4          // R4 = byte offset of word len(z)-1
   474	MOVD    (R6)(R4), R5
   475	SRD     R9, R5, R5      // x[len(z)-1]>>s
   476	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
   477	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
   478	RET
   479
   480zeroshift:
   481	CMP     R6, $0          // x is null, nothing to copy
   482	BEQ     done
   483	CMP     R6, R3          // if x is same as z, nothing to copy
   484	BEQ     done
   485	CMP     R7, R4
   486	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
	// The copy loop below moves a word before testing its bound, so a zero
	// word count must be filtered out here; otherwise one word would be
	// read from x and written to z even though there is nothing to copy.
	CMP     R7, $0          // min(len(x), len(z)) == 0, nothing to copy
	BEQ     done
   487	SLD     $3, R7, R7      // word count -> byte count
   488	MOVD    $0, R14
   489repeat:
   490	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
   491	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
   492	ADD     $8, R14
   493	CMP     R14, R7         // More 8 bytes left?
   494	BLT     repeat
   495done:
   496	MOVD    R0, c+56(FP)    // c=0
   497	RET
   498
   499// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   500TEXT ·mulAddVWW(SB), NOSPLIT, $0
	// For each i: the 128-bit value x[i]*y + c is formed, its low word is
	// stored in z[i] and its high word becomes the next c. c starts as r
	// and its final value is returned. When z_len is 0 the function
	// returns c = r, because R4 still holds r when control reaches "done".
	// Register roles:
	//   R10, R8 = cursors into z and x
	//   R9      = y
	//   R4      = r on entry, running c afterwards
	//   R11     = words still to process
	//   CTR     = iteration count of the 4-way unrolled loop
   501	MOVD    z+0(FP), R10      // R10 = z[]
   502	MOVD    x+24(FP), R8      // R8 = x[]
   503	MOVD    y+48(FP), R9      // R9 = y
   504	MOVD    r+56(FP), R4      // R4 = r = c
   505	MOVD    z_len+8(FP), R11  // R11 = z_len
   506
   507	CMP     R11, $0
   508	BEQ     done
   509
	// First word is handled outside the loop so the loop and tail can
	// use the update-form loads/stores at offset 8.
   510	MOVD    0(R8), R20
   511	ADD     $-1, R11
   512	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   513	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   514	ADDC    R4, R6            // R6 = z0 + r
   515	ADDZE   R7, R4            // R4 = z1 + CA
   516	CMP     R11, $0
   517	MOVD    R6, 0(R10)        // z[i]
   518	BEQ     done
   519
   520	// We will read 4 elements per iteration
   521	SRDCC   $2, R11, R14      // R14 = (z_len-1)/4; the CC form also sets the condition tested by BEQ below
   522	DCBT    (R8)
   523	MOVD    R14, CTR          // Set up the loop counter
   524	BEQ     tail              // If R14 = 0, we can't use the loop
   525	PCALIGN $16
   526
	// Within one iteration the carry chain is:
	//   z[i]   = z0[i]   + c       (ADDC)
	//   z[i+1] = z0[i+1] + z1[i]   + CA (ADDE)
	//   z[i+2] = z0[i+2] + z1[i+1] + CA (ADDE)
	//   z[i+3] = z0[i+3] + z1[i+2] + CA (ADDE)
	//   c      = z1[i+3] + CA           (ADDZE)
	// The multiplies and stores interleaved with it do not take part in
	// the chain.
   527loop:
   528	MOVD    8(R8), R20        // R20 = x[i]
   529	MOVD    16(R8), R21       // R21 = x[i+1]
   530	MOVD    24(R8), R22       // R22 = x[i+2]
   531	MOVDU   32(R8), R23       // R23 = x[i+3]
   532	MULLD   R9, R20, R24      // R24 = z0[i]
   533	MULHDU  R9, R20, R20      // R20 = z1[i]
   534	ADDC    R4, R24           // R24 = z0[i] + c
   535	MULLD   R9, R21, R25      // R25 = z0[i+1]
   536	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   537	ADDE    R20, R25          // R25 = z0[i+1] + z1[i] + CA
   538	MULLD   R9, R22, R26      // R26 = z0[i+2]
   539	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   540	MULLD   R9, R23, R27      // R27 = z0[i+3]
   541	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   542	ADDE    R21, R26          // R26 = z0[i+2] + z1[i+1] + CA
   543	MOVD    R24, 8(R10)       // z[i]
   544	MOVD    R25, 16(R10)      // z[i+1]
   545	ADDE    R22, R27          // R27 = z0[i+3] + z1[i+2] + CA
   546	ADDZE   R23,R4		  // update carry
   547	MOVD    R26, 24(R10)      // z[i+2]
   548	MOVDU   R27, 32(R10)      // z[i+3]
   549	ADD     $-4, R11          // R11 -= 4 (words still to process)
   550	BDNZ    loop
   551
   552	// We may have some elements to read
   553	CMP   R11, $0
   554	BEQ   done
   555
   556	// Process the remaining elements, one at a time.
	// Only 1 to 3 words can remain here, hence three unrolled steps.
   557tail:
   558	MOVDU   8(R8), R20        // R20 = x[i]
   559	MULLD   R9, R20, R24      // R24 = z0[i]
   560	MULHDU  R9, R20, R25      // R25 = z1[i]
   561	ADD     $-1, R11          // R11 -= 1
   562	ADDC    R4, R24           // R24 = z0[i] + c
   563	ADDZE   R25, R4           // c = z1[i] + CA
   564	MOVDU   R24, 8(R10)       // z[i]
   565	CMP     R11, $0
   566	BEQ     done              // If R11 = 0, we are done
   567
   568	MOVDU   8(R8), R20
   569	MULLD   R9, R20, R24
   570	MULHDU  R9, R20, R25
   571	ADD     $-1, R11
   572	ADDC    R4, R24
   573	ADDZE   R25, R4
   574	MOVDU   R24, 8(R10)
   575	CMP     R11, $0
   576	BEQ     done
   577
   578	MOVD    8(R8), R20
   579	MULLD   R9, R20, R24
   580	MULHDU  R9, R20, R25
   581	ADD     $-1, R11          // R11 is not read again after this point
   582	ADDC    R4, R24
   583	ADDZE   R25,R4
   584	MOVD    R24, 8(R10)
   585
   586done:
   587	MOVD    R4, c+64(FP)
   588	RET
   589
   590// func addMulVVW(z, x []Word, y Word) (c Word)
   591TEXT ·addMulVVW(SB), NOSPLIT, $0
	// For each i: the 128-bit value x[i]*y + z[i] + c is formed, its low
	// word is stored back to z[i] and its high word becomes the next c.
	// c starts at 0 and its final value is returned.
	// Register roles:
	//   R3, R4 = cursors into z and x
	//   R5     = y
	//   R6     = z_len, later z_len%4 for the tail
	//   R9     = running c
	//   CTR    = loop counter for both the unrolled loop and the tail loop
	// Each word uses the same four-instruction fold:
	//   ADDC  z[i], lo   ; ADDZE hi      -> add z[i], carry into hi
	//   ADDC  c, lo      ; ADDZE hi, c   -> add c, new c = hi + carry
   592	MOVD	z+0(FP), R3	// R3 = z[]
   593	MOVD	x+24(FP), R4	// R4 = x[]
   594	MOVD	y+48(FP), R5	// R5 = y
   595	MOVD	z_len+8(FP), R6	// R6 = z_len
   596
   597	CMP	R6, $4
   598	MOVD	R0, R9		// R9 = c = 0
   599	BLT	tail		// fewer than 4 words: tail loop only
   600	SRD	$2, R6, R7	// R7 = z_len/4
   601	MOVD	R7, CTR		// Initialize loop counter
   602	PCALIGN	$16
   603
	// Note that the registers holding loaded x/z words are reused for
	// later products once their original value has been consumed.
   604loop:
   605	MOVD	0(R4), R14	// x[i]
   606	MOVD	8(R4), R16	// x[i+1]
   607	MOVD	16(R4), R18	// x[i+2]
   608	MOVD	24(R4), R20	// x[i+3]
   609	MOVD	0(R3), R15	// z[i]
   610	MOVD	8(R3), R17	// z[i+1]
   611	MOVD	16(R3), R19	// z[i+2]
   612	MOVD	24(R3), R21	// z[i+3]
   613	MULLD	R5, R14, R10	// low x[i]*y
   614	MULHDU	R5, R14, R11	// high x[i]*y
   615	ADDC	R15, R10	// low += z[i]
   616	ADDZE	R11		// high += CA
   617	ADDC	R9, R10		// low += c
   618	ADDZE	R11, R9		// c = high + CA
   619	MULLD	R5, R16, R14	// low x[i+1]*y
   620	MULHDU	R5, R16, R15	// high x[i+1]*y
   621	ADDC	R17, R14	// low += z[i+1]
   622	ADDZE	R15		// high += CA
   623	ADDC	R9, R14		// low += c
   624	ADDZE	R15, R9		// c = high + CA
   625	MULLD	R5, R18, R16    // low x[i+2]*y
   626	MULHDU	R5, R18, R17    // high x[i+2]*y
   627	ADDC	R19, R16	// low += z[i+2]
   628	ADDZE	R17		// high += CA
   629	ADDC	R9, R16		// low += c
   630	ADDZE	R17, R9		// c = high + CA
   631	MULLD	R5, R20, R18    // low x[i+3]*y
   632	MULHDU	R5, R20, R19    // high x[i+3]*y
   633	ADDC	R21, R18	// low += z[i+3]
   634	ADDZE	R19		// high += CA
   635	ADDC	R9, R18		// low += c
   636	ADDZE	R19, R9		// c = high + CA
   637	MOVD	R10, 0(R3)	// z[i]
   638	MOVD	R14, 8(R3)	// z[i+1]
   639	MOVD	R16, 16(R3)	// z[i+2]
   640	MOVD	R18, 24(R3)	// z[i+3]
   641	ADD	$32, R3
   642	ADD	$32, R4
   643	BDNZ	loop
   644
   645	ANDCC	$3, R6		// R6 = z_len%4, words left for the tail
   646tail:
   647	CMP	R6, $0
   648	BEQ	done
   649	MOVD	R6, CTR		// one tail iteration per remaining word
   650	PCALIGN $16
   651tailloop:
   652	MOVD	0(R4), R14	// x[i]
   653	MOVD	0(R3), R15	// z[i]
   654	MULLD	R5, R14, R10	// low x[i]*y
   655	MULHDU	R5, R14, R11	// high x[i]*y
   656	ADDC	R15, R10	// low += z[i]
   657	ADDZE	R11		// high += CA
   658	ADDC	R9, R10		// low += c
   659	ADDZE	R11, R9		// c = high + CA
   660	MOVD	R10, 0(R3)	// z[i]
   661	ADD	$8, R3
   662	ADD	$8, R4
   663	BDNZ	tailloop
   664
   665done:
   666	MOVD	R9, c+56(FP)
   667	RET
   668

View as plain text