chore: modernize CI and update Go toolchain

- Bump Go from 1.19 to 1.26 and update all dependencies - Rewrite CI workflow with matrix strategy (Linux, macOS, Windows) - Update GitHub Actions to current versions (checkout@v4, setup-go@v5) - Update CodeQL actions from v1 to v3 - Fix cross-platform bug in mock/path.go (path.Join -> filepath.Join) - Clean up dependabot config (weekly schedule, remove stale ignore) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-08 02:23:51 +02:00 · 2026-02-14 20:58:51 -05:00
parent cc85a4bdb1
commit 2a19755804
657 changed files with 49050 additions and 32001 deletions
--- a/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go
@@ -1,39 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !go1.13
-
-package poly1305
-
-// Generic fallbacks for the math/bits intrinsics, copied from
-// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had
-// variable time fallbacks until Go 1.13.
-
-func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
-	sum = x + y + carry
-	carryOut = ((x & y) | ((x | y) &^ sum)) >> 63
-	return
-}
-
-func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
-	diff = x - y - borrow
-	borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63
-	return
-}
-
-func bitsMul64(x, y uint64) (hi, lo uint64) {
-	const mask32 = 1<<32 - 1
-	x0 := x & mask32
-	x1 := x >> 32
-	y0 := y & mask32
-	y1 := y >> 32
-	w0 := x0 * y0
-	t := x1*y0 + w0>>32
-	w1 := t & mask32
-	w2 := t >> 32
-	w1 += x0 * y1
-	hi = x1*y1 + w2 + w1>>32
-	lo = x * y
-	return
-}
--- a/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go
@@ -1,21 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build go1.13
-
-package poly1305
-
-import "math/bits"
-
-func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
-	return bits.Add64(x, y, carry)
-}
-
-func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
-	return bits.Sub64(x, y, borrow)
-}
-
-func bitsMul64(x, y uint64) (hi, lo uint64) {
-	return bits.Mul64(x, y)
-}
--- a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego

 package poly1305

--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.

 //go:build gc && !purego

-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
-	ADDQ 0(msg), h0;  \
-	ADCQ 8(msg), h1;  \
-	ADCQ $1, h2;      \
-	LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
-	MOVQ  r0, AX;                  \
-	MULQ  h0;                      \
-	MOVQ  AX, t0;                  \
-	MOVQ  DX, t1;                  \
-	MOVQ  r0, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  r0, t2;                  \
-	IMULQ h2, t2;                  \
-	ADDQ  DX, t2;                  \
-	                               \
-	MOVQ  r1, AX;                  \
-	MULQ  h0;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  DX, h0;                  \
-	MOVQ  r1, t3;                  \
-	IMULQ h2, t3;                  \
-	MOVQ  r1, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t2;                  \
-	ADCQ  DX, t3;                  \
-	ADDQ  h0, t2;                  \
-	ADCQ  $0, t3;                  \
-	                               \
-	MOVQ  t0, h0;                  \
-	MOVQ  t1, h1;                  \
-	MOVQ  t2, h2;                  \
-	ANDQ  $3, h2;                  \
-	MOVQ  t2, t0;                  \
-	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
-	ADDQ  t0, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2;                  \
-	SHRQ  $2, t3, t2;              \
-	SHRQ  $2, t3;                  \
-	ADDQ  t2, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVQ state+0(FP), DI
 	MOVQ msg_base+8(FP), SI
 	MOVQ msg_len+16(FP), R15
-
-	MOVQ 0(DI), R8   // h0
-	MOVQ 8(DI), R9   // h1
-	MOVQ 16(DI), R10 // h2
-	MOVQ 24(DI), R11 // r0
-	MOVQ 32(DI), R12 // r1
-
-	CMPQ R15, $16
+	MOVQ (DI), R8
+	MOVQ 8(DI), R9
+	MOVQ 16(DI), R10
+	MOVQ 24(DI), R11
+	MOVQ 32(DI), R12
+	CMPQ R15, $0x10
 	JB   bytes_between_0_and_15

 loop:
-	POLY1305_ADD(SI, R8, R9, R10)
+	ADDQ (SI), R8
+	ADCQ 8(SI), R9
+	ADCQ $0x01, R10
+	LEAQ 16(SI), SI

 multiply:
-	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
-	SUBQ $16, R15
-	CMPQ R15, $16
-	JAE  loop
+	MOVQ  R11, AX
+	MULQ  R8
+	MOVQ  AX, BX
+	MOVQ  DX, CX
+	MOVQ  R11, AX
+	MULQ  R9
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  R11, R13
+	IMULQ R10, R13
+	ADDQ  DX, R13
+	MOVQ  R12, AX
+	MULQ  R8
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  DX, R8
+	MOVQ  R12, R14
+	IMULQ R10, R14
+	MOVQ  R12, AX
+	MULQ  R9
+	ADDQ  AX, R13
+	ADCQ  DX, R14
+	ADDQ  R8, R13
+	ADCQ  $0x00, R14
+	MOVQ  BX, R8
+	MOVQ  CX, R9
+	MOVQ  R13, R10
+	ANDQ  $0x03, R10
+	MOVQ  R13, BX
+	ANDQ  $-4, BX
+	ADDQ  BX, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SHRQ  $0x02, R14, R13
+	SHRQ  $0x02, R14
+	ADDQ  R13, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SUBQ  $0x10, R15
+	CMPQ  R15, $0x10
+	JAE   loop

 bytes_between_0_and_15:
 	TESTQ R15, R15
 	JZ    done
-	MOVQ  $1, BX
+	MOVQ  $0x00000001, BX
 	XORQ  CX, CX
 	XORQ  R13, R13
 	ADDQ  R15, SI

 flush_buffer:
-	SHLQ $8, BX, CX
-	SHLQ $8, BX
+	SHLQ $0x08, BX, CX
+	SHLQ $0x08, BX
 	MOVB -1(SI), R13
 	XORQ R13, BX
 	DECQ SI
 	DECQ R15
 	JNZ  flush_buffer
-
 	ADDQ BX, R8
 	ADCQ CX, R9
-	ADCQ $0, R10
-	MOVQ $16, R15
+	ADCQ $0x00, R10
+	MOVQ $0x00000010, R15
 	JMP  multiply

 done:
-	MOVQ R8, 0(DI)
+	MOVQ R8, (DI)
 	MOVQ R9, 8(DI)
 	MOVQ R10, 16(DI)
 	RET
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build gc && !purego
+//go:build gc && !purego && (amd64 || loong64 || ppc64 || ppc64le)

 package poly1305

--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go
@@ -7,7 +7,10 @@

 package poly1305

-import "encoding/binary"
+import (
+	"encoding/binary"
+	"math/bits"
+)

 // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
 // for a 64 bytes message is approximately
@@ -114,13 +117,13 @@ type uint128 struct {
 }

 func mul64(a, b uint64) uint128 {
-	hi, lo := bitsMul64(a, b)
+	hi, lo := bits.Mul64(a, b)
 	return uint128{lo, hi}
 }

 func add128(a, b uint128) uint128 {
-	lo, c := bitsAdd64(a.lo, b.lo, 0)
-	hi, c := bitsAdd64(a.hi, b.hi, c)
+	lo, c := bits.Add64(a.lo, b.lo, 0)
+	hi, c := bits.Add64(a.hi, b.hi, c)
 	if c != 0 {
 		panic("poly1305: unexpected overflow")
 	}
@@ -155,8 +158,8 @@ func updateGeneric(state *macState, msg []byte) {
 		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
 		// add 1 to the most significant (2¹²⁸) limb, h2.
 		if len(msg) >= TagSize {
-			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
-			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
+			h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
+			h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
 			h2 += c + 1

 			msg = msg[TagSize:]
@@ -165,8 +168,8 @@ func updateGeneric(state *macState, msg []byte) {
 			copy(buf[:], msg)
 			buf[len(msg)] = 1

-			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
-			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
+			h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
+			h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
 			h2 += c

 			msg = nil
@@ -219,9 +222,9 @@ func updateGeneric(state *macState, msg []byte) {
 		m3 := h2r1

 		t0 := m0.lo
-		t1, c := bitsAdd64(m1.lo, m0.hi, 0)
-		t2, c := bitsAdd64(m2.lo, m1.hi, c)
-		t3, _ := bitsAdd64(m3.lo, m2.hi, c)
+		t1, c := bits.Add64(m1.lo, m0.hi, 0)
+		t2, c := bits.Add64(m2.lo, m1.hi, c)
+		t3, _ := bits.Add64(m3.lo, m2.hi, c)

 		// Now we have the result as 4 64-bit limbs, and we need to reduce it
 		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
@@ -243,14 +246,14 @@ func updateGeneric(state *macState, msg []byte) {

 		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.

-		h0, c = bitsAdd64(h0, cc.lo, 0)
-		h1, c = bitsAdd64(h1, cc.hi, c)
+		h0, c = bits.Add64(h0, cc.lo, 0)
+		h1, c = bits.Add64(h1, cc.hi, c)
 		h2 += c

 		cc = shiftRightBy2(cc)

-		h0, c = bitsAdd64(h0, cc.lo, 0)
-		h1, c = bitsAdd64(h1, cc.hi, c)
+		h0, c = bits.Add64(h0, cc.lo, 0)
+		h1, c = bits.Add64(h1, cc.hi, c)
 		h2 += c

 		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
@@ -287,9 +290,9 @@ func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
 	// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
 	// result if the subtraction underflows, and t otherwise.

-	hMinusP0, b := bitsSub64(h0, p0, 0)
-	hMinusP1, b := bitsSub64(h1, p1, b)
-	_, b = bitsSub64(h2, p2, b)
+	hMinusP0, b := bits.Sub64(h0, p0, 0)
+	hMinusP1, b := bits.Sub64(h1, p1, b)
+	_, b = bits.Sub64(h2, p2, b)

 	// h = h if h < p else h - p
 	h0 = select64(b, h0, hMinusP0)
@@ -301,8 +304,8 @@ func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
 	//
 	// by just doing a wide addition with the 128 low bits of h and discarding
 	// the overflow.
-	h0, c := bitsAdd64(h0, s[0], 0)
-	h1, _ = bitsAdd64(h1, s[1], c)
+	h0, c := bits.Add64(h0, s[0], 0)
+	h1, _ = bits.Add64(h1, s[1], c)

 	binary.LittleEndian.PutUint64(out[0:8], h0)
 	binary.LittleEndian.PutUint64(out[8:16], h1)
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s
@@ -0,0 +1,123 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+// func update(state *macState, msg []byte)
+TEXT ·update(SB), $0-32
+	MOVV	state+0(FP), R4
+	MOVV	msg_base+8(FP), R5
+	MOVV	msg_len+16(FP), R6
+
+	MOVV	$0x10, R7
+
+	MOVV	(R4), R8	// h0
+	MOVV	8(R4), R9	// h1
+	MOVV	16(R4), R10	// h2
+	MOVV	24(R4), R11	// r0
+	MOVV	32(R4), R12	// r1
+
+	BLT	R6, R7, bytes_between_0_and_15
+
+loop:
+	MOVV	(R5), R14	// msg[0:8]
+	MOVV	8(R5), R16	// msg[8:16]
+	ADDV	R14, R8, R8	// h0 (x1 + y1 = z1', if z1' < x1 then z1' overflow)
+	ADDV	R16, R9, R27
+	SGTU	R14, R8, R24	// h0.carry
+	SGTU	R9, R27, R28
+	ADDV	R27, R24, R9	// h1
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24	// h1.carry
+	ADDV	$0x01, R24, R24
+	ADDV	R10, R24, R10	// h2
+
+	ADDV	$16, R5, R5	// msg = msg[16:]
+
+multiply:
+	MULV	R8, R11, R14	// h0r0.lo
+	MULHVU	R8, R11, R15	// h0r0.hi
+	MULV	R9, R11, R13	// h1r0.lo
+	MULHVU	R9, R11, R16	// h1r0.hi
+	ADDV	R13, R15, R15
+	SGTU	R13, R15, R24
+	ADDV	R24, R16, R16
+	MULV	R10, R11, R25
+	ADDV	R16, R25, R25
+	MULV	R8, R12, R13	// h0r1.lo
+	MULHVU	R8, R12, R16	// h0r1.hi
+	ADDV	R13, R15, R15
+	SGTU	R13, R15, R24
+	ADDV	R24, R16, R16
+	MOVV	R16, R8
+	MULV	R10, R12, R26	// h2r1
+	MULV	R9, R12, R13	// h1r1.lo
+	MULHVU	R9, R12, R16	// h1r1.hi
+	ADDV	R13, R25, R25
+	ADDV	R16, R26, R27
+	SGTU	R13, R25, R24
+	ADDV	R27, R24, R26
+	ADDV	R8, R25, R25
+	SGTU	R8, R25, R24
+	ADDV	R24, R26, R26
+	AND	$3, R25, R10
+	AND	$-4, R25, R17
+	ADDV	R17, R14, R8
+	ADDV	R26, R15, R27
+	SGTU	R17, R8, R24
+	SGTU	R26, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R24, R10, R10
+	SLLV	$62, R26, R27
+	SRLV	$2, R25, R28
+	SRLV	$2, R26, R26
+	OR	R27, R28, R25
+	ADDV	R25, R8, R8
+	ADDV	R26, R9, R27
+	SGTU	R25, R8, R24
+	SGTU	R26, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R24, R10, R10
+
+	SUBV	$16, R6, R6
+	BGE	R6, R7, loop
+
+bytes_between_0_and_15:
+	BEQ	R6, R0, done
+	MOVV	$1, R14
+	XOR	R15, R15
+	ADDV	R6, R5, R5
+
+flush_buffer:
+	MOVBU	-1(R5), R25
+	SRLV	$56, R14, R24
+	SLLV	$8, R15, R28
+	SLLV	$8, R14, R14
+	OR	R24, R28, R15
+	XOR	R25, R14, R14
+	SUBV	$1, R6, R6
+	SUBV	$1, R5, R5
+	BNE	R6, R0, flush_buffer
+
+	ADDV	R14, R8, R8
+	SGTU	R14, R8, R24
+	ADDV	R15, R9, R27
+	SGTU	R15, R27, R28
+	ADDV	R27, R24, R9
+	SGTU	R27, R9, R24
+	OR	R24, R28, R24
+	ADDV	R10, R24, R10
+
+	MOVV	$16, R6
+	JMP	multiply
+
+done:
+	MOVV	R8, (R4)
+	MOVV	R9, 8(R4)
+	MOVV	R10, 16(R4)
+	RET
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
@@ -1,47 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build gc && !purego
-
-package poly1305
-
-//go:noescape
-func update(state *macState, msg []byte)
-
-// mac is a wrapper for macGeneric that redirects calls that would have gone to
-// updateGeneric to update.
-//
-// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
-// using function pointers would carry a major performance cost.
-type mac struct{ macGeneric }
-
-func (h *mac) Write(p []byte) (int, error) {
-	nn := len(p)
-	if h.offset > 0 {
-		n := copy(h.buffer[h.offset:], p)
-		if h.offset+n < TagSize {
-			h.offset += n
-			return nn, nil
-		}
-		p = p[n:]
-		h.offset = 0
-		update(&h.macState, h.buffer[:])
-	}
-	if n := len(p) - (len(p) % TagSize); n > 0 {
-		update(&h.macState, p[:n])
-		p = p[n:]
-	}
-	if len(p) > 0 {
-		h.offset += copy(h.buffer[h.offset:], p)
-	}
-	return nn, nil
-}
-
-func (h *mac) Sum(out *[16]byte) {
-	state := h.macState
-	if h.offset > 0 {
-		update(&state, h.buffer[:h.offset])
-	}
-	finalize(out, &state.h, &state.s)
-}
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
@@ -2,15 +2,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)

 #include "textflag.h"

 // This was ported from the amd64 implementation.

+#ifdef GOARCH_ppc64le
+#define LE_MOVD MOVD
+#define LE_MOVWZ MOVWZ
+#define LE_MOVHZ MOVHZ
+#else
+#define LE_MOVD MOVDBR
+#define LE_MOVWZ MOVWBR
+#define LE_MOVHZ MOVHBR
+#endif
+
 #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
-	MOVD (msg), t0;  \
-	MOVD 8(msg), t1; \
+	LE_MOVD (msg)( R0), t0; \
+	LE_MOVD (msg)(R24), t1; \
 	MOVD $1, t2;     \
 	ADDC t0, h0, h0; \
 	ADDE t1, h1, h1; \
@@ -19,15 +29,14 @@

 #define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
 	MULLD  r0, h0, t0;  \
-	MULLD  r0, h1, t4;  \
 	MULHDU r0, h0, t1;  \
+	MULLD  r0, h1, t4;  \
 	MULHDU r0, h1, t5;  \
 	ADDC   t4, t1, t1;  \
 	MULLD  r0, h2, t2;  \
-	ADDZE  t5;          \
 	MULHDU r1, h0, t4;  \
 	MULLD  r1, h0, h0;  \
-	ADD    t5, t2, t2;  \
+	ADDE   t5, t2, t2;  \
 	ADDC   h0, t1, t1;  \
 	MULLD  h2, r1, t3;  \
 	ADDZE  t4, h0;      \
@@ -37,13 +46,11 @@
 	ADDE   t5, t3, t3;  \
 	ADDC   h0, t2, t2;  \
 	MOVD   $-4, t4;     \
-	MOVD   t0, h0;      \
-	MOVD   t1, h1;      \
 	ADDZE  t3;          \
-	ANDCC  $3, t2, h2;  \
-	AND    t2, t4, t0;  \
+	RLDICL $0, t2, $62, h2; \
+	AND    t2, t4, h0;  \
 	ADDC   t0, h0, h0;  \
-	ADDE   t3, h1, h1;  \
+	ADDE   t3, t1, h1;  \
 	SLD    $62, t3, t4; \
 	SRD    $2, t2;      \
 	ADDZE  h2;          \
@@ -53,10 +60,6 @@
 	ADDE   t3, h1, h1;  \
 	ADDZE  h2

-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVD state+0(FP), R3
@@ -69,12 +72,15 @@ TEXT ·update(SB), $0-32
 	MOVD 24(R3), R11 // r0
 	MOVD 32(R3), R12 // r1

+	MOVD $8, R24
+
 	CMP R5, $16
 	BLT bytes_between_0_and_15

 loop:
 	POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)

+	PCALIGN $16
 multiply:
 	POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
 	ADD $-16, R5
@@ -96,7 +102,7 @@ flush_buffer:

 	// Greater than 8 -- load the rightmost remaining bytes in msg
 	// and put into R17 (h1)
-	MOVD (R4)(R21), R17
+	LE_MOVD (R4)(R21), R17
 	MOVD $16, R22

 	// Find the offset to those bytes
@@ -120,7 +126,7 @@ just1:
 	BLT less8

 	// Exactly 8
-	MOVD (R4), R16
+	LE_MOVD (R4), R16

 	CMP R17, $0

@@ -135,7 +141,7 @@ less8:
 	MOVD  $0, R22   // shift count
 	CMP   R5, $4
 	BLT   less4
-	MOVWZ (R4), R16
+	LE_MOVWZ (R4), R16
 	ADD   $4, R4
 	ADD   $-4, R5
 	MOVD  $32, R22
@@ -143,7 +149,7 @@ less8:
 less4:
 	CMP   R5, $2
 	BLT   less2
-	MOVHZ (R4), R21
+	LE_MOVHZ (R4), R21
 	SLD   R22, R21, R21
 	OR    R16, R21, R16
 	ADD   $16, R22