1 // Copyright 2018 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file provides the generic implementation of Sum and MAC. Other files 6 // might provide optimized assembly implementations of some of this code. 7 8 package poly1305 9 10 import ( 11 "encoding/binary" 12 "math/bits" 13 ) 14 15 // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag 16 // for a 64 bytes message is approximately 17 // 18 // s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5 19 // 20 // for some secret r and s. It can be computed sequentially like 21 // 22 // for len(msg) > 0: 23 // h += read(msg, 16) 24 // h *= r 25 // h %= 2¹³⁰ - 5 26 // return h + s 27 // 28 // All the complexity is about doing performant constant-time math on numbers 29 // larger than any available numeric type. 30 31 func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) { 32 h := newMACGeneric(key) 33 h.Write(msg) 34 h.Sum(out) 35 } 36 37 func newMACGeneric(key *[32]byte) macGeneric { 38 m := macGeneric{} 39 initialize(key, &m.macState) 40 return m 41 } 42 43 // macState holds numbers in saturated 64-bit little-endian limbs. That is, 44 // the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸. 45 type macState struct { 46 // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but 47 // can grow larger during and after rounds. It must, however, remain below 48 // 2 * (2¹³⁰ - 5). 49 h [3]uint64 50 // r and s are the private key components. 51 r [2]uint64 52 s [2]uint64 53 } 54 55 type macGeneric struct { 56 macState 57 58 buffer [TagSize]byte 59 offset int 60 } 61 62 // Write splits the incoming message into TagSize chunks, and passes them to 63 // update. It buffers incomplete chunks. 64 func (h *macGeneric) Write(p []byte) (int, error) { 65 nn := len(p) 66 if h.offset > 0 { 67 n := copy(h.buffer[h.offset:], p) 68 if h.offset+n < TagSize { 69 h.offset += n 70 return nn, nil 71 } 72 p = p[n:] 73 h.offset = 0 74 updateGeneric(&h.macState, h.buffer[:]) 75 } 76 if n := len(p) - (len(p) % TagSize); n > 0 { 77 updateGeneric(&h.macState, p[:n]) 78 p = p[n:] 79 } 80 if len(p) > 0 { 81 h.offset += copy(h.buffer[h.offset:], p) 82 } 83 return nn, nil 84 } 85 86 // Sum flushes the last incomplete chunk from the buffer, if any, and generates 87 // the MAC output. It does not modify its state, in order to allow for multiple 88 // calls to Sum, even if no Write is allowed after Sum. 89 func (h *macGeneric) Sum(out *[TagSize]byte) { 90 state := h.macState 91 if h.offset > 0 { 92 updateGeneric(&state, h.buffer[:h.offset]) 93 } 94 finalize(out, &state.h, &state.s) 95 } 96 97 // [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It 98 // clears some bits of the secret coefficient to make it possible to implement 99 // multiplication more efficiently. 100 const ( 101 rMask0 = 0x0FFFFFFC0FFFFFFF 102 rMask1 = 0x0FFFFFFC0FFFFFFC 103 ) 104 105 // initialize loads the 256-bit key into the two 128-bit secret values r and s. 106 func initialize(key *[32]byte, m *macState) { 107 m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0 108 m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1 109 m.s[0] = binary.LittleEndian.Uint64(key[16:24]) 110 m.s[1] = binary.LittleEndian.Uint64(key[24:32]) 111 } 112 113 // uint128 holds a 128-bit number as two 64-bit limbs, for use with the 114 // bits.Mul64 and bits.Add64 intrinsics. 115 type uint128 struct { 116 lo, hi uint64 117 } 118 119 func mul64(a, b uint64) uint128 { 120 hi, lo := bits.Mul64(a, b) 121 return uint128{lo, hi} 122 } 123 124 func add128(a, b uint128) uint128 { 125 lo, c := bits.Add64(a.lo, b.lo, 0) 126 hi, c := bits.Add64(a.hi, b.hi, c) 127 if c != 0 { 128 panic("poly1305: unexpected overflow") 129 } 130 return uint128{lo, hi} 131 } 132 133 func shiftRightBy2(a uint128) uint128 { 134 a.lo = a.lo>>2 | (a.hi&3)<<62 135 a.hi = a.hi >> 2 136 return a 137 } 138 139 // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of 140 // 128 bits of message, it computes 141 // 142 // h₊ = (h + m) * r mod 2¹³⁰ - 5 143 // 144 // If the msg length is not a multiple of TagSize, it assumes the last 145 // incomplete chunk is the final one. 146 func updateGeneric(state *macState, msg []byte) { 147 h0, h1, h2 := state.h[0], state.h[1], state.h[2] 148 r0, r1 := state.r[0], state.r[1] 149 150 for len(msg) > 0 { 151 var c uint64 152 153 // For the first step, h + m, we use a chain of bits.Add64 intrinsics. 154 // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially 155 // reduced at the end of the multiplication below. 156 // 157 // The spec requires us to set a bit just above the message size, not to 158 // hide leading zeroes. For full chunks, that's 1 << 128, so we can just 159 // add 1 to the most significant (2¹²⁸) limb, h2. 160 if len(msg) >= TagSize { 161 h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) 162 h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) 163 h2 += c + 1 164 165 msg = msg[TagSize:] 166 } else { 167 var buf [TagSize]byte 168 copy(buf[:], msg) 169 buf[len(msg)] = 1 170 171 h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) 172 h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) 173 h2 += c 174 175 msg = nil 176 } 177 178 // Multiplication of big number limbs is similar to elementary school 179 // columnar multiplication. Instead of digits, there are 64-bit limbs. 180 // 181 // We are multiplying a 3 limbs number, h, by a 2 limbs number, r. 182 // 183 // h2 h1 h0 x 184 // r1 r0 = 185 // ---------------- 186 // h2r0 h1r0 h0r0 <-- individual 128-bit products 187 // + h2r1 h1r1 h0r1 188 // ------------------------ 189 // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs 190 // ------------------------ 191 // m3.hi m2.hi m1.hi m0.hi <-- carry propagation 192 // + m3.lo m2.lo m1.lo m0.lo 193 // ------------------------------- 194 // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs 195 // 196 // The main difference from pen-and-paper multiplication is that we do 197 // carry propagation in a separate step, as if we wrote two digit sums 198 // at first (the 128-bit limbs), and then carried the tens all at once. 199 200 h0r0 := mul64(h0, r0) 201 h1r0 := mul64(h1, r0) 202 h2r0 := mul64(h2, r0) 203 h0r1 := mul64(h0, r1) 204 h1r1 := mul64(h1, r1) 205 h2r1 := mul64(h2, r1) 206 207 // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their 208 // top 4 bits cleared by rMask{0,1}, we know that their product is not going 209 // to overflow 64 bits, so we can ignore the high part of the products. 210 // 211 // This also means that the product doesn't have a fifth limb (t4). 212 if h2r0.hi != 0 { 213 panic("poly1305: unexpected overflow") 214 } 215 if h2r1.hi != 0 { 216 panic("poly1305: unexpected overflow") 217 } 218 219 m0 := h0r0 220 m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again 221 m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1. 222 m3 := h2r1 223 224 t0 := m0.lo 225 t1, c := bits.Add64(m1.lo, m0.hi, 0) 226 t2, c := bits.Add64(m2.lo, m1.hi, c) 227 t3, _ := bits.Add64(m3.lo, m2.hi, c) 228 229 // Now we have the result as 4 64-bit limbs, and we need to reduce it 230 // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do 231 // a cheap partial reduction according to the reduction identity 232 // 233 // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5 234 // 235 // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is 236 // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the 237 // assumptions we make about h in the rest of the code. 238 // 239 // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23 240 241 // We split the final result at the 2¹³⁰ mark into h and cc, the carry. 242 // Note that the carry bits are effectively shifted left by 2, in other 243 // words, cc = c * 4 for the c in the reduction identity. 244 h0, h1, h2 = t0, t1, t2&maskLow2Bits 245 cc := uint128{t2 & maskNotLow2Bits, t3} 246 247 // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c. 248 249 h0, c = bits.Add64(h0, cc.lo, 0) 250 h1, c = bits.Add64(h1, cc.hi, c) 251 h2 += c 252 253 cc = shiftRightBy2(cc) 254 255 h0, c = bits.Add64(h0, cc.lo, 0) 256 h1, c = bits.Add64(h1, cc.hi, c) 257 h2 += c 258 259 // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most 260 // 261 // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1 262 } 263 264 state.h[0], state.h[1], state.h[2] = h0, h1, h2 265 } 266 267 const ( 268 maskLow2Bits uint64 = 0x0000000000000003 269 maskNotLow2Bits uint64 = ^maskLow2Bits 270 ) 271 272 // select64 returns x if v == 1 and y if v == 0, in constant time. 273 func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y } 274 275 // [p0, p1, p2] is 2¹³⁰ - 5 in little endian order. 276 const ( 277 p0 = 0xFFFFFFFFFFFFFFFB 278 p1 = 0xFFFFFFFFFFFFFFFF 279 p2 = 0x0000000000000003 280 ) 281 282 // finalize completes the modular reduction of h and computes 283 // 284 // out = h + s mod 2¹²⁸ 285 func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { 286 h0, h1, h2 := h[0], h[1], h[2] 287 288 // After the partial reduction in updateGeneric, h might be more than 289 // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction 290 // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the 291 // result if the subtraction underflows, and t otherwise. 292 293 hMinusP0, b := bits.Sub64(h0, p0, 0) 294 hMinusP1, b := bits.Sub64(h1, p1, b) 295 _, b = bits.Sub64(h2, p2, b) 296 297 // h = h if h < p else h - p 298 h0 = select64(b, h0, hMinusP0) 299 h1 = select64(b, h1, hMinusP1) 300 301 // Finally, we compute the last Poly1305 step 302 // 303 // tag = h + s mod 2¹²⁸ 304 // 305 // by just doing a wide addition with the 128 low bits of h and discarding 306 // the overflow. 307 h0, c := bits.Add64(h0, s[0], 0) 308 h1, _ = bits.Add64(h1, s[1], c) 309 310 binary.LittleEndian.PutUint64(out[0:8], h0) 311 binary.LittleEndian.PutUint64(out[8:16], h1) 312 } 313