check in v3.8.1 source

2023-09-03 18:24:16 -07:00
parent fbb21ed921
commit b31c897352
1205 changed files with 561101 additions and 0 deletions
--- a/crypto/bn/arch/aarch64/bn_arch.h
+++ b/crypto/bn/arch/aarch64/bn_arch.h
@@ -0,0 +1,369 @@
+/*	$OpenBSD: bn_arch.h,v 1.13 2023/07/24 10:21:29 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#if defined(__GNUC__)
+
+#define HAVE_BN_CLZW
+/* Returns the number of leading zero bits in w, using the CLZ instruction. */
+static inline int
+bn_clzw(BN_ULONG w)
+{
+	BN_ULONG n;
+
+	__asm__ ("clz   %[n], %[w]"
+	    : [n]"=r"(n)
+	    : [w]"r"(w));
+	/* The count is narrowed from BN_ULONG to int on return. */
+	return n;
+}
+
+#define HAVE_BN_ADDW
+/* Computes a + b: low word to *out_r0, carry out (0 or 1) to *out_r1. */
+static inline void
+bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG carry, r0;
+	/* ADDS sets the C flag on unsigned overflow; CSET turns it into 0/1. */
+	__asm__ (
+	    "adds  %[r0], %[a], %[b] \n"
+	    "cset  %[carry], cs \n"
+	    : [carry]"=r"(carry), [r0]"=r"(r0)
+	    : [a]"r"(a), [b]"r"(b)
+	    : "cc");
+
+	*out_r1 = carry;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_ADDW_ADDW
+/* Computes a + b + c: low word to *out_r0, carries (0..2) to *out_r1. */
+static inline void
+bn_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG carry, r0;
+	/* Outputs are early-clobber ("=&r"): both are written before c is read. */
+	__asm__ (
+	    "adds  %[r0], %[a], %[b] \n"
+	    "cset  %[carry], cs \n"
+	    "adds  %[r0], %[r0], %[c] \n"
+	    "cinc  %[carry], %[carry], cs \n"
+	    : [carry]"=&r"(carry), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");
+
+	*out_r1 = carry;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_QWADDQW
+/* Adds two 4-word values (a3..a0, b3..b0) plus carry; r0 is the low word. */
+static inline void
+bn_qwaddqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3,
+    BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG carry, BN_ULONG *out_carry,
+    BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r3, r2, r1, r0;
+	/* ADDS with -1 sets the C flag iff carry != 0, seeding the ADCS chain. */
+	__asm__ (
+	    "adds  xzr, %[carry], #-1 \n"
+	    "adcs  %[r0], %[a0], %[b0] \n"
+	    "adcs  %[r1], %[a1], %[b1] \n"
+	    "adcs  %[r2], %[a2], %[b2] \n"
+	    "adcs  %[r3], %[a3], %[b3] \n"
+	    "cset  %[carry], cs \n"
+	    : [carry]"+r"(carry), [r3]"=&r"(r3), [r2]"=&r"(r2),
+		[r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0),
+		[b3]"r"(b3), [b2]"r"(b2), [b1]"r"(b1), [b0]"r"(b0)
+	    : "cc");
+	/* CSET has left the carry out of the top word (0 or 1) in carry. */
+	*out_carry = carry;
+	*out_r3 = r3;
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW
+/* Full a * b product: high word to *out_r1, low word to *out_r0. */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	/* r1 is early-clobber because MUL re-reads a and b after UMULH wrote r1. */
+	/* Unsigned multiplication using a umulh/mul pair. */
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"
+	    "mul   %[r0], %[a], %[b] \n"
+	    : [r1]"=&r"(r1), [r0]"=r"(r0)
+	    : [a]"r"(a), [b]"r"(b));
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW_ADDW
+/* Computes a * b + c: high word to *out_r1, low word to *out_r0. */
+static inline void
+bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	/* ADC with xzr folds the carry from the low-word ADDS into r1. */
+	__asm__ (
+	    "umulh  %[r1], %[a], %[b] \n"
+	    "mul    %[r0], %[a], %[b] \n"
+	    "adds   %[r0], %[r0], %[c] \n"
+	    "adc    %[r1], %[r1], xzr \n"
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW_ADDW_ADDW
+/* Computes a * b + c + d: high word to *out_r1, low word to *out_r0. */
+static inline void
+bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
+    BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	/* Each ADDS/ADC pair adds one word and propagates its carry into r1. */
+	__asm__ (
+	    "umulh  %[r1], %[a], %[b] \n"
+	    "mul    %[r0], %[a], %[b] \n"
+	    "adds   %[r0], %[r0], %[c] \n"
+	    "adc    %[r1], %[r1], xzr \n"
+	    "adds   %[r0], %[r0], %[d] \n"
+	    "adc    %[r1], %[r1], xzr \n"
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c), [d]"r"(d)
+	    : "cc");
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW_ADDTW
+/* Computes (c2:c1:c0) + a * b as three words, r2 being the top word. */
+static inline void
+bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
+    BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r2, r1, r0;
+	/* The carry chain runs r0 -> r1 -> r2; the final ADC adds it to c2. */
+	__asm__ (
+	    "umulh  %[r1], %[a], %[b] \n"
+	    "mul    %[r0], %[a], %[b] \n"
+	    "adds   %[r0], %[r0], %[c0] \n"
+	    "adcs   %[r1], %[r1], %[c1] \n"
+	    "adc    %[r2], xzr, %[c2] \n"
+	    : [r2]"=&r"(r2), [r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0)
+	    : "cc");
+
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MUL2_MULW_ADDTW
+/* Computes (c2:c1:c0) + 2 * a * b as three words, r2 being the top word. */
+static inline void
+bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
+    BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r2, r1, r0, x1, x0;
+	/* The product x1:x0 is formed once and then added in twice. */
+	__asm__ (
+	    "umulh  %[x1], %[a], %[b] \n"
+	    "mul    %[x0], %[a], %[b] \n"
+	    "adds   %[r0], %[c0], %[x0] \n"
+	    "adcs   %[r1], %[c1], %[x1] \n"
+	    "adc    %[r2], xzr, %[c2] \n"
+	    "adds   %[r0], %[r0], %[x0] \n"
+	    "adcs   %[r1], %[r1], %[x1] \n"
+	    "adc    %[r2], xzr, %[r2] \n"
+	    : [r2]"=&r"(r2), [r1]"=&r"(r1), [r0]"=&r"(r0), [x1]"=&r"(x1),
+		[x0]"=&r"(x0)
+	    : [a]"r"(a), [b]"r"(b), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0)
+	    : "cc");
+
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_QWMULW_ADDW
+/* Computes (a3:a2:a1:a0) * b + c as five words, r4 being the top word. */
+static inline void
+bn_qwmulw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b,
+    BN_ULONG c, BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2,
+    BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r4, r3, r2, r1, r0;
+	/* c doubles as scratch ("+&r"); MUL/UMULH leave the flags for ADCS intact. */
+	__asm__ (
+	    "umulh  %[r1], %[a0], %[b] \n"
+	    "mul    %[r0], %[a0], %[b] \n"
+	    "adds   %[r0], %[r0], %[c] \n"
+	    "umulh  %[r2], %[a1], %[b] \n"
+	    "mul     %[c], %[a1], %[b] \n"
+	    "adcs   %[r1], %[r1], %[c] \n"
+	    "umulh  %[r3], %[a2], %[b] \n"
+	    "mul     %[c], %[a2], %[b] \n"
+	    "adcs   %[r2], %[r2], %[c] \n"
+	    "umulh  %[r4], %[a3], %[b] \n"
+	    "mul     %[c], %[a3], %[b] \n"
+	    "adcs   %[r3], %[r3], %[c] \n"
+	    "adc    %[r4], %[r4], xzr  \n"
+	    : [c]"+&r"(c), [r4]"=&r"(r4), [r3]"=&r"(r3), [r2]"=&r"(r2),
+		[r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), [b]"r"(b)
+	    : "cc");
+
+	*out_r4 = r4;
+	*out_r3 = r3;
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_QWMULW_ADDQW_ADDW
+/* Computes (a3:a2:a1:a0) * b + (c3:c2:c1:c0) + d as five words (r4 top). */
+static inline void
+bn_qwmulw_addqw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0,
+    BN_ULONG b, BN_ULONG c3, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, BN_ULONG d,
+    BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG r4, r3, r2, r1, r0;
+	/* First chain: a * b + d (d reused as scratch); second chain adds c3..c0. */
+	__asm__ (
+	    "umulh  %[r1], %[a0], %[b]  \n"
+	    "mul    %[r0], %[a0], %[b]  \n"
+	    "adds   %[r0], %[r0], %[d]  \n"
+	    "umulh  %[r2], %[a1], %[b]  \n"
+	    "mul     %[d], %[a1], %[b]  \n"
+	    "adcs   %[r1], %[r1], %[d]  \n"
+	    "umulh  %[r3], %[a2], %[b]  \n"
+	    "mul     %[d], %[a2], %[b]  \n"
+	    "adcs   %[r2], %[r2], %[d]  \n"
+	    "umulh  %[r4], %[a3], %[b]  \n"
+	    "mul     %[d], %[a3], %[b]  \n"
+	    "adcs   %[r3], %[r3], %[d]  \n"
+	    "adc    %[r4], %[r4], xzr   \n"
+	    "adds   %[r0], %[r0], %[c0] \n"
+	    "adcs   %[r1], %[r1], %[c1] \n"
+	    "adcs   %[r2], %[r2], %[c2] \n"
+	    "adcs   %[r3], %[r3], %[c3] \n"
+	    "adc    %[r4], %[r4], xzr   \n"
+	    : [d]"+&r"(d), [r4]"=&r"(r4), [r3]"=&r"(r3), [r2]"=&r"(r2),
+		[r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), [b]"r"(b),
+		[c3]"r"(c3), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0)
+	    : "cc");
+
+	*out_r4 = r4;
+	*out_r3 = r3;
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_SUBW
+/* Computes a - b: result to *out_r0, borrow (0 or 1) to *out_borrow. */
+static inline void
+bn_subw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_borrow, BN_ULONG *out_r0)
+{
+	BN_ULONG borrow, r0;
+	/* SUBS clears the C flag when a borrow occurs, so CSET cc yields the borrow. */
+	__asm__ (
+	    "subs  %[r0], %[a], %[b] \n"
+	    "cset  %[borrow], cc \n"
+	    : [borrow]"=r"(borrow), [r0]"=r"(r0)
+	    : [a]"r"(a), [b]"r"(b)
+	    : "cc");
+
+	*out_borrow = borrow;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_SUBW_SUBW
+/* Computes a - b - c: result to *out_r0, borrows (0..2) to *out_borrow. */
+static inline void
+bn_subw_subw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_borrow,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG borrow, r0;
+	/* Outputs are early-clobber ("=&r"): both are written before c is read. */
+	__asm__ (
+	    "subs  %[r0], %[a], %[b] \n"
+	    "cset  %[borrow], cc \n"
+	    "subs  %[r0], %[r0], %[c] \n"
+	    "cinc  %[borrow], %[borrow], cc \n"
+	    : [borrow]"=&r"(borrow), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");
+
+	*out_borrow = borrow;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_QWSUBQW
+/* Subtracts 4-word b3..b0 and borrow from a3..a0; r0 is the low word. */
+static inline void
+bn_qwsubqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3,
+    BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG borrow, BN_ULONG *out_borrow,
+    BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r3, r2, r1, r0;
+	/* 0 - borrow clears the C flag iff borrow != 0, seeding the SBCS chain. */
+	__asm__ (
+	    "subs  xzr, xzr, %[borrow] \n"
+	    "sbcs  %[r0], %[a0], %[b0] \n"
+	    "sbcs  %[r1], %[a1], %[b1] \n"
+	    "sbcs  %[r2], %[a2], %[b2] \n"
+	    "sbcs  %[r3], %[a3], %[b3] \n"
+	    "cset  %[borrow], cc \n"
+	    : [borrow]"+r"(borrow), [r3]"=&r"(r3), [r2]"=&r"(r2),
+		[r1]"=&r"(r1), [r0]"=&r"(r0)
+	    : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0),
+		[b3]"r"(b3), [b2]"r"(b2), [b1]"r"(b1), [b0]"r"(b0)
+	    : "cc");
+	/* CSET has left the borrow out of the top word (0 or 1) in borrow. */
+	*out_borrow = borrow;
+	*out_r3 = r3;
+	*out_r2 = r2;
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#endif /* __GNUC__ */
+
+#endif
+#endif
--- a/crypto/bn/arch/alpha/bn_arch.h
+++ b/crypto/bn/arch/alpha/bn_arch.h
@@ -0,0 +1,44 @@
+/*	$OpenBSD: bn_arch.h,v 1.4 2023/02/16 10:41:03 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#if 0 /* Needs testing and enabling. */
+#if defined(__GNUC__)
+#define HAVE_BN_MULW
+/* Full a * b product: high word to *out_r1, low word to *out_r0. */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+	/* r1 is early-clobber: a and b are read again by mulq after umulh. */
+	/* Unsigned multiplication using a umulh/mulq pair. */
+	__asm__ ("umulh %2, %3, %0; mulq %2, %3, %1"
+	    : "=&r"(r1), "=r"(r0)
+	    : "r"(a), "r"(b));
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+#endif /* __GNUC__ */
+#endif
+
+#endif
+#endif
--- a/crypto/bn/arch/amd64/bignum_add.S
+++ b/crypto/bn/arch/amd64/bignum_add.S
@@ -0,0 +1,165 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Add, z := x + y
+// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
+//
+//    extern uint64_t bignum_add
+//     (uint64_t p, uint64_t *z,
+//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the z := x + y operation, truncating modulo p words in general and
+// returning a top carry (0 or 1) in the p'th place, only adding the input
+// words below p (as well as m and n respectively) to get the sum and carry.
+//
+// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
+// Microsoft x64 ABI:   RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add)
+        .text
+
+#define p rdi
+#define z rsi
+#define m rdx
+#define x rcx
+#define n r8
+#define y r9
+#define i r10
+#define a rax
+
+#define ashort eax
+// i is the running word index; a is the working word and final return value
+
+
+S2N_BN_SYMBOL(bignum_add):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+        mov     r9, [rsp+64]
+#endif
+
+// Zero the main index counter for both branches
+
+        xor     i, i
+
+// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
+// we'll never need words past the p'th. Can now assume m <= p and n <= p.
+// Then compare the modified m and n and branch accordingly
+// (cmovc copies when the preceding cmp set CF, i.e. when p < m or p < n)
+        cmp     p, m
+        cmovc   m, p
+        cmp     p, n
+        cmovc   n, p
+        cmp     m, n
+        jc      ylonger
+
+// The case where x is longer or of the same size (p >= m >= n)
+// p := tail length, m := top-loop count + 1; test clears CF, inc/dec keep it
+        sub     p, m
+        sub     m, n
+        inc     m
+        test    n, n
+        jz      xtest
+xmainloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i],a
+        inc     i
+        dec     n
+        jnz     xmainloop
+        jmp     xtest
+xtoploop:
+        mov     a, [x+8*i]
+        adc     a, 0
+        mov     [z+8*i],a
+        inc     i
+xtest:
+        dec     m
+        jnz     xtoploop
+        mov     ashort, 0
+        adc     a, 0
+        test    p, p
+        jnz     tails
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+// The case where y is longer (p >= n > m)
+
+ylonger:
+// p := tail length, n := top-loop count (> 0); test clears CF, inc/dec keep it
+        sub     p, n
+        sub     n, m
+        test    m, m
+        jz      ytoploop
+ymainloop:
+        mov     a, [x+8*i]
+        adc     a, [y+8*i]
+        mov     [z+8*i],a
+        inc     i
+        dec     m
+        jnz     ymainloop
+ytoploop:
+        mov     a, [y+8*i]
+        adc     a, 0
+        mov     [z+8*i],a
+        inc     i
+        dec     n
+        jnz     ytoploop
+        mov     ashort, 0
+        adc     a, 0
+        test    p, p
+        jnz     tails
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+// Adding a non-trivial tail, when p > max(m,n)
+// The first tail word receives the carry, the rest are zeroed; rax returns 0
+tails:
+        mov     [z+8*i],a
+        xor     a, a
+        jmp     tail
+tailloop:
+        mov     [z+8*i],a
+tail:
+        inc     i
+        dec     p
+        jnz     tailloop
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_cmadd.S
+++ b/crypto/bn/arch/amd64/bignum_cmadd.S
@@ -0,0 +1,155 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Multiply-add with single-word multiplier, z := z + c * y
+// Inputs c, y[n]; outputs function return (carry-out) and z[k]
+//
+//    extern uint64_t bignum_cmadd
+//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//
+// Does the "z := z + c * y" operation where y is n digits, result z is k.
+// Truncates the result in general.
+//
+// The return value is a high/carry word that is meaningful when k = n + 1, or
+// more generally when n <= k and the result fits in k + 1 digits. In these
+// cases it gives the top digit of the (k + 1)-digit result.
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
+        .text
+
+#define p rdi
+#define z rsi
+#define c r9
+#define n rcx
+#define x r8
+
+#define i r10
+#define h r11
+
+#define r rbx
+
+#define hshort r11d
+#define ishort r10d
+// p holds the prototype's k and x points at its y; r receives a 0/-1 carry mask
+
+
+S2N_BN_SYMBOL(bignum_cmadd):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+#endif
+
+// Seems hard to avoid one more register
+
+        push    rbx
+
+// First clamp the input size n := min(p,n) since we can never need to read
+// past the p'th term of the input to generate p-digit output.
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed
+
+        cmp     p, n
+        cmovc   n, p
+        sub     p, n
+
+// Initialize high part h = 0; if n = 0 do nothing but return that zero
+
+        xor     h, h
+        test    n, n
+        jz      end
+
+// Move c into a safer register as multiplies overwrite rdx
+
+        mov     c, rdx
+
+// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0
+
+        mov     rax, [x]
+        mul     c
+        add     [z], rax
+        mov     h, rdx
+        mov     ishort, 1
+        dec     n
+        jz      hightail
+
+// Main loop, where we always have CF + previous high part h to add in
+// sbb r, r captures CF as 0/-1 so that sub rdx, r adds it to the new high word
+loop:
+        adc     h, [z+8*i]
+        sbb     r, r
+        mov     rax, [x+8*i]
+        mul     c
+        sub     rdx, r
+        add     rax, h
+        mov     [z+8*i], rax
+        mov     h, rdx
+        inc     i
+        dec     n
+        jnz     loop
+
+hightail:
+        adc     h, 0
+
+// Propagate the carry all the way to the end with h as extra carry word
+
+tail:
+        test    p, p
+        jz      end
+// Add h to the first extra word, then zero h with mov (CF kept) and ripple CF
+        add     [z+8*i], h
+        mov     hshort, 0
+        inc     i
+        dec     p
+        jz      highend
+
+tloop:
+        adc     [z+8*i], h
+        inc     i
+        dec     p
+        jnz     tloop
+
+highend:
+
+        adc     h, 0
+
+// Return the high/carry word
+
+end:
+        mov     rax, h
+
+        pop     rbx
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_cmul.S
+++ b/crypto/bn/arch/amd64/bignum_cmul.S
@@ -0,0 +1,138 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Multiply by a single word, z := c * y
+// Inputs c, y[n]; outputs function return (carry-out) and z[k]
+//
+//    extern uint64_t bignum_cmul
+//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//
+// Does the "z := c * y" operation where y is n digits, result z is k.
+// Truncates the result in general unless k >= n + 1.
+//
+// The return value is a high/carry word that is meaningful when k >= n as
+// giving the high part of the result. Since this is always zero if k > n,
+// it is mainly of interest in the special case k = n, i.e. where the source
+// and destination have the same nominal size, when it gives the extra word
+// of the full result.
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul)
+        .text
+
+#define p rdi
+#define z rsi
+#define c r9
+#define n rcx
+#define x r8
+
+#define i r10
+#define h r11
+// p holds the prototype's k and x points at its y; h carries the high word
+
+
+S2N_BN_SYMBOL(bignum_cmul):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+#endif
+
+// First clamp the input size n := min(p,n) since we can never need to read
+// past the p'th term of the input to generate p-digit output. Now we can
+// assume that n <= p
+
+        cmp     p, n
+        cmovc   n, p
+
+// Initialize current input/output pointer offset i and high part h.
+// But then if n = 0 skip the multiplication and go to the tail part
+
+        xor     h, h
+        xor     i, i
+        test    n, n
+        jz      tail
+
+// Move c into a safer register as multiplies overwrite rdx
+
+        mov     c, rdx
+
+// Initialization of the loop: [h,l] = c * x_0
+
+        mov     rax, [x]
+        mul     c
+        mov     [z], rax
+        mov     h, rdx
+        inc     i
+        cmp     i, n
+        jz      tail
+
+// Main loop doing the multiplications
+// rdx:rax = c * x[i]; the previous high word h is added in and becomes rdx next
+loop:
+        mov     rax, [x+8*i]
+        mul     c
+        add     rax, h
+        adc     rdx, 0
+        mov     [z+8*i], rax
+        mov     h, rdx
+        inc     i
+        cmp     i, n
+        jc      loop
+
+// Add a tail when the destination is longer
+// The first extra word gets the final high part h; further words are zeroed
+tail:
+        cmp     i, p
+        jnc     end
+        mov     [z+8*i], h
+        xor     h, h
+        inc     i
+        cmp     i, p
+        jnc     end
+
+tloop:
+        mov     [z+8*i], h
+        inc     i
+        cmp     i, p
+        jc      tloop
+
+// Return the high/carry word
+
+end:
+        mov     rax, h
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_mul.S
+++ b/crypto/bn/arch/amd64/bignum_mul.S
@@ -0,0 +1,167 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[m], y[n]; output z[k]
+//
+//    extern void bignum_mul
+//     (uint64_t k, uint64_t *z,
+//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
+// Truncates the result in general unless k >= m + n
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
+        .text
+
+// These are actually right
+
+#define p rdi
+#define z rsi
+#define n r8
+
+// These are not
+
+#define c r15
+#define h r14
+#define l r13
+#define x r12
+#define y r11
+#define i rbx
+#define k r10
+#define m rbp
+
+// These are always local scratch since multiplier result is in these
+
+#define a rax
+#define d rdx
+// The x and y arguments stay in rcx and r9; m is copied from rdx to rbp below
+
+
+S2N_BN_SYMBOL(bignum_mul):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]
+        mov     r9, [rsp+64]
+#endif
+
+// We use too many registers, and also we need rax:rdx for multiplications
+
+        push    rbx
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+        mov     m, rdx
+
+// If the result size is zero, do nothing
+// Note that even if either or both inputs has size zero, we can't
+// just give up because we at least need to zero the output array
+// If we did a multiply-add variant, however, then we could
+
+        test    p, p
+        jz      end
+
+// Set initial 2-part sum to zero (we zero c inside the body)
+
+        xor     h,h
+        xor     l,l
+
+// Otherwise do outer loop k = 0 ... k = p - 1
+
+        xor     k, k
+
+outerloop:
+
+// Zero our carry term first; we eventually want it and a zero is useful now
+// Set a =  max 0 (k + 1 - n), i = min (k + 1) m
+// This defines the range a <= j < i for the inner summation
+// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow
+// And since we want to increment it anyway, we might as well do it now
+
+        xor     c, c            // c = 0
+        inc     k               // k = k + 1
+
+        mov     a, k            // a = k + 1
+        sub     a, n            // a = k + 1 - n
+        cmovc   a, c            // a = max 0 (k + 1 - n)
+
+        mov     i, m            // i = m
+        cmp     k, m            // CF <=> k + 1 < m
+        cmovc   i, k            // i = min (k + 1) m
+
+// Turn i into a loop count, and skip things if it's <= 0
+// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]
+// and then launch into the main inner loop, postdecrementing i
+
+        mov     d, k
+        sub     d, i
+        sub     i, a
+        jbe     innerend
+        lea     x,[rcx+8*a]
+        lea     y,[r9+8*d-8]
+// Each pass adds one x word times one y word into (c,h,l); x walks up, i down
+innerloop:
+        mov     rax, [y+8*i]
+        mul     QWORD PTR  [x]
+        add     x, 8
+        add     l, rax
+        adc     h, rdx
+        adc     c, 0
+        dec     i
+        jnz     innerloop
+
+innerend:
+// Emit the low word and shift the 3-word accumulator (c,h,l) down by one word
+        mov     [z], l
+        mov     l, h
+        mov     h, c
+        add     z, 8
+
+        cmp     k, p
+        jc      outerloop
+
+end:
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+        pop     rbx
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_mul_4_8_alt.S
+++ b/crypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -0,0 +1,157 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+//
+//    extern void bignum_mul_4_8_alt
+//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt)
+        .text
+
+// These are actually right
+
+#define z rdi
+#define x rsi
+
+// This is moved from rdx to free it for muls
+
+#define y rcx
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Macro for the key "multiply and add to (c,h,l)" step
+
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+// A minutely shorter form for when c = 0 initially
+// (c is zero on entry, so adc c, c simply leaves the carry flag in c)
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+
+// A short form where we don't expect a top carry
+
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+
+S2N_BN_SYMBOL(bignum_mul_4_8_alt):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+
+// Copy y into a safe register to start with
+
+        mov     y, rdx
+
+// Result term 0
+
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+
+// Result term 1
+
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+
+// Result term 2
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+
+// Result term 3
+
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+
+// Result term 4
+
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        mov     [z+32], t0
+
+// Result term 5
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        mov     [z+40], t1
+
+// Result term 6
+// NOTE(review): the xor of t1 below looks redundant; t1 is not read afterwards
+        xor     t1, t1
+        combads(t0,t2,[x+24],[y+24])
+        mov     [z+48], t2
+
+// Result term 7
+
+        mov     [z+56], t0
+
+// Return
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_mul_8_16_alt.S
+++ b/crypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -0,0 +1,244 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+//    extern void bignum_mul_8_16_alt
+//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
+        .text
+
+// z and x stay in the registers the standard ABI passes them in (rdi, rsi)
+
+#define z rdi
+#define x rsi
+
+// y arrives in rdx but is copied to rcx, because mul writes rdx:rax
+
+#define y rcx
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Key step (c,h,l) += numa * numb on a 3-word window; clobbers rax and rdx
+
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+// Shorter form valid only when c = 0 on entry: "adc c, c" then acts as "adc c, 0"
+
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+
+// Two-word form (h,l) += numa * numb, used where no top carry is expected
+
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+
+S2N_BN_SYMBOL(bignum_mul_8_16_alt):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+
+// Copy the y pointer out of rdx, which every mul below overwrites (rdx:rax)
+
+        mov     y, rdx
+
+// Result term 0: z[0] = low word of x[0]*y[0]; the high word starts the window
+
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+
+// Result term 1: products x[i]*y[j] with i + j = 1
+
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+
+// Result term 2: products x[i]*y[j] with i + j = 2
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+
+// Result term 3: products x[i]*y[j] with i + j = 3
+
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+
+// Result term 4: products x[i]*y[j] with i + j = 4
+
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+
+// Result term 5: products x[i]*y[j] with i + j = 5
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+
+// Result term 6: products x[i]*y[j] with i + j = 6
+
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+48])
+        combadd(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        combadd(t1,t0,t2,[x+48],[y])
+        mov     [z+48], t2
+
+// Result term 7: products x[i]*y[j] with i + j = 7 (the longest column)
+
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+56])
+        combadd(t2,t1,t0,[x+8],[y+48])
+        combadd(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        combadd(t2,t1,t0,[x+48],[y+8])
+        combadd(t2,t1,t0,[x+56],[y])
+        mov     [z+56], t0
+
+// Result term 8: products x[i]*y[j] with i + j = 8
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+8],[y+56])
+        combadd(t0,t2,t1,[x+16],[y+48])
+        combadd(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        combadd(t0,t2,t1,[x+48],[y+16])
+        combadd(t0,t2,t1,[x+56],[y+8])
+        mov     [z+64], t1
+
+// Result term 9: products x[i]*y[j] with i + j = 9
+
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+16],[y+56])
+        combadd(t1,t0,t2,[x+24],[y+48])
+        combadd(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        combadd(t1,t0,t2,[x+48],[y+24])
+        combadd(t1,t0,t2,[x+56],[y+16])
+        mov     [z+72], t2
+
+// Result term 10: products x[i]*y[j] with i + j = 10
+
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+24],[y+56])
+        combadd(t2,t1,t0,[x+32],[y+48])
+        combadd(t2,t1,t0,[x+40],[y+40])
+        combadd(t2,t1,t0,[x+48],[y+32])
+        combadd(t2,t1,t0,[x+56],[y+24])
+        mov     [z+80], t0
+
+// Result term 11: products x[i]*y[j] with i + j = 11
+
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+32],[y+56])
+        combadd(t0,t2,t1,[x+40],[y+48])
+        combadd(t0,t2,t1,[x+48],[y+40])
+        combadd(t0,t2,t1,[x+56],[y+32])
+        mov     [z+88], t1
+
+// Result term 12: products x[i]*y[j] with i + j = 12
+
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+40],[y+56])
+        combadd(t1,t0,t2,[x+48],[y+48])
+        combadd(t1,t0,t2,[x+56],[y+40])
+        mov     [z+96], t2
+
+// Result term 13: products x[i]*y[j] with i + j = 13
+
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+48],[y+56])
+        combadd(t2,t1,t0,[x+56],[y+48])
+        mov     [z+104], t0
+
+// Result term 14: x[7]*y[7], added with the two-word form
+
+        combads(t2,t1,[x+56],[y+56])
+        mov     [z+112], t1
+
+// Result term 15: the top word left in the window
+
+        mov     [z+120], t2
+
+// Return (under WINDOWS_ABI, first restore the rdi and rsi pushed on entry)
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_sqr.S
+++ b/crypto/bn/arch/amd64/bignum_sqr.S
@@ -0,0 +1,197 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Square z := x^2
+// Input x[n]; output z[k]
+//
+//    extern void bignum_sqr
+//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+//
+// Does the "z := x^2" operation where x is n digits and result z is k.
+// Truncates the result in general unless k >= 2 * n
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = n, R9 = x
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
+        .text
+
+// p, z and x stay in their incoming registers; n arrives in rdx and is moved to r8
+
+#define p rdi
+#define z rsi
+#define x rcx
+#define n r8
+
+// rax and rdx are always local scratch, since mul leaves its result in rdx:rax
+
+#define a rax
+#define d rdx
+
+// Other variables (rbx, rbp and r12-r15 are pushed on entry and popped on exit)
+
+#define i rbx
+#define ll rbp
+#define hh r9
+#define k r10
+#define y r11
+#define htop r12
+#define l r13
+#define h r14
+#define c r15
+
+// 32-bit alias of ll, used for the zeroing xor
+
+#define llshort ebp
+
+S2N_BN_SYMBOL(bignum_sqr):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+#endif
+
+// Save rbx, rbp and r12-r15 for use as locals; move n since mul needs rax:rdx
+
+        push    rbx
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+        mov     n, rdx
+
+// If p = 0 the result is trivial; jump to the common exit so the pops balance
+
+        test    p, p
+        jz      end
+
+// Initialize the running carry (hh,ll) = 0; the 32-bit xor clears all of rbp
+
+        xor     llshort, llshort
+        xor     hh, hh
+
+// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
+
+        xor     k, k
+
+outerloop:
+
+// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
+// We want to accumulate all x[i] * x[k - i] for bot <= i < top
+// For the optimization of squaring we avoid duplication and do
+// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
+// Initialize i = bot; in fact just compute bot as i directly.
+
+        xor     c, c
+        lea     i, [k+1]
+        mov     htop, i
+        shr     htop, 1
+        sub     i, n
+        cmovc   i, c                    // borrow means k + 1 < n, so bot = 0
+        cmp     htop, n
+        cmovnc  htop, n                 // clamp htop to at most n
+
+// Initialize the three-part local sum (c,h,l); c was already done above
+
+        xor     l, l
+        xor     h, h
+
+// If htop <= bot then main doubled part of the sum is empty
+
+        cmp     i, htop
+        jnc     nosumming
+
+// Use a moving pointer for [y] = x[k-i] for the cofactor
+
+        mov     a, k
+        sub     a, i
+        lea     y, [x+8*a]
+
+// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
+
+innerloop:
+        mov     a, [x+8*i]
+        mul     QWORD PTR [y]
+        add     l, a
+        adc     h, d
+        adc     c, 0
+        sub     y, 8                    // cofactor index k - i falls as i rises
+        inc     i
+        cmp     i, htop
+        jc      innerloop
+
+// Now double it, so each off-diagonal product is counted twice
+
+        add     l, l
+        adc     h, h
+        adc     c, c
+
+// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
+
+nosumming:
+        test    k, 1
+        jnz     innerend
+        cmp     i, n
+        jnc     innerend
+
+        mov     a, [x+8*i]
+        mul     a
+        add     l, a
+        adc     h, d
+        adc     c, 0
+
+// Add the carried-in (hh,ll) to the local sum, store digit k, and shift down
+
+innerend:
+        add     l, ll
+        mov     [z+8*k], l
+        adc     h, hh
+        mov     ll, h
+        adc     c, 0
+        mov     hh, c
+
+        inc     k
+        cmp     k, p
+        jc      outerloop
+
+// Restore registers (in reverse push order) and return
+
+end:
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+        pop     rbx
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
+++ b/crypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -0,0 +1,145 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+//    extern void bignum_sqr_4_8_alt
+//      (uint64_t z[static 8], uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt)
+        .text
+
+// Input arguments, used in place in rdi and rsi
+
+#define z rdi
+#define x rsi
+
+// Rotating 3-word window that the column sums are accumulated into
+
+#define t0 rcx
+#define t1 r8
+#define t2 r9
+
+// Square-term step (c,h,l) += numa^2 on a 3-word window; clobbers rax and rdx
+
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+// Two-word form (h,l) += numa^2, used where no top carry is expected
+
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+
+// Non-square step (c,h,l) += 2 * numa * numb: doubles rdx:rax before adding
+
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+
+// Result term 0: z[0] = low word of x[0]^2; the high word starts the window
+
+        mov     rax, [x]
+        mul     rax
+
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+
+// Result term 1: 2 * x[0]*x[1]
+
+       xor     t2, t2
+       combadd2(t2,t1,t0,[x],[x+8])
+       mov     [z+8], t0
+
+// Result term 2: x[1]^2 + 2 * x[0]*x[2]
+
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+
+// Result term 3: 2 * x[0]*x[3] + 2 * x[1]*x[2]
+
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x],[x+24])
+        combadd2(t1,t0,t2,[x+8],[x+16])
+        mov     [z+24], t2
+
+// Result term 4: 2 * x[1]*x[3] + x[2]^2
+
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x+8],[x+24])
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+
+// Result term 5: 2 * x[2]*x[3]
+
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+16],[x+24])
+        mov     [z+40], t1
+
+// Result term 6: x[3]^2 (t1 is zeroed here but is not read again below)
+
+        xor     t1, t1
+        combads(t0,t2,[x+24])
+        mov     [z+48], t2
+
+// Result term 7: the top word left in the window
+
+        mov     [z+56], t0
+
+// Return (under WINDOWS_ABI, first restore the rdi and rsi pushed on entry)
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
+++ b/crypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -0,0 +1,242 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+//    extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
+        .text
+
+// Input arguments, used in place in rdi and rsi
+
+#define z rdi
+#define x rsi
+
+// Rotating 3-word window that the column sums are accumulated into
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Local window (with a t register on top) summing products before one doubling
+
+#define u0 rcx
+#define u1 r11
+
+// Key step (c,h,l) += numa * numb on a 3-word window; clobbers rax and rdx
+
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+// Set up initial window (c,h,l) = numa * numb, zeroing c (no prior xor needed)
+
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are overwritten
+
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+
+// Square term incorporation (c,h,l) += numa^2
+
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+// Two-word form (h,l) += numa^2, used where no top carry is expected
+
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+
+// (c,h,l) += 2 * numa * numb, doubling rdx:rax in place, for lone non-square terms
+
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+
+S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+
+// Result term 0: z[0] = low word of x[0]^2; the high word starts the window
+
+        mov     rax, [x]
+        mul     rax
+
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+
+// Result term 1: 2 * x[0]*x[1]
+
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+
+// Result term 2: x[1]^2 + 2 * x[0]*x[2]
+
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+
+// Result term 3: 2 * (x[0]*x[3] + x[1]*x[2]), summed in (t1,u1,u0) then doubled
+
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+
+// Result term 4: 2 * (x[0]*x[4] + x[1]*x[3]) + x[2]^2
+
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+
+// Result term 5: 2 * (x[0]*x[5] + x[1]*x[4] + x[2]*x[3])
+
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+
+// Result term 6: 2 * (x[0]*x[6] + x[1]*x[5] + x[2]*x[4]) + x[3]^2
+
+        combaddz(t1,u1,u0,[x],[x+48])
+        combadd(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+
+// Result term 7: 2 * (x[0]*x[7] + x[1]*x[6] + x[2]*x[5] + x[3]*x[4])
+
+        combaddz(t2,u1,u0,[x],[x+56])
+        combadd(t2,u1,u0,[x+8],[x+48])
+        combadd(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+
+// Result term 8: 2 * (x[1]*x[7] + x[2]*x[6] + x[3]*x[5]) + x[4]^2
+
+        combaddz(t0,u1,u0,[x+8],[x+56])
+        combadd(t0,u1,u0,[x+16],[x+48])
+        combadd(t0,u1,u0,[x+24],[x+40])
+        doubladd(t0,t2,t1,u1,u0)
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+
+// Result term 9: 2 * (x[2]*x[7] + x[3]*x[6] + x[4]*x[5])
+
+        combaddz(t1,u1,u0,[x+16],[x+56])
+        combadd(t1,u1,u0,[x+24],[x+48])
+        combadd(t1,u1,u0,[x+32],[x+40])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+72], t2
+
+// Result term 10: 2 * (x[3]*x[7] + x[4]*x[6]) + x[5]^2
+
+        combaddz(t2,u1,u0,[x+24],[x+56])
+        combadd(t2,u1,u0,[x+32],[x+48])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+40])
+        mov     [z+80], t0
+
+// Result term 11: 2 * (x[4]*x[7] + x[5]*x[6])
+
+        combaddz(t0,u1,u0,[x+32],[x+56])
+        combadd(t0,u1,u0,[x+40],[x+48])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+88], t1
+
+// Result term 12: 2 * x[5]*x[7] + x[6]^2
+
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+40],[x+56])
+        combadd1(t1,t0,t2,[x+48])
+        mov     [z+96], t2
+
+// Result term 13: 2 * x[6]*x[7]
+
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x+48],[x+56])
+        mov     [z+104], t0
+
+// Result term 14: x[7]^2, added with the two-word form
+
+        combads(t2,t1,[x+56])
+        mov     [z+112], t1
+
+// Result term 15: the top word left in the window
+
+        mov     [z+120], t2
+
+// Return (under WINDOWS_ABI, first restore the rdi and rsi pushed on entry)
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bignum_sub.S
+++ b/crypto/bn/arch/amd64/bignum_sub.S
@@ -0,0 +1,153 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Subtract, z := x - y
+// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
+//
+//    extern uint64_t bignum_sub
+//     (uint64_t p, uint64_t *z,
+//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the z := x - y operation, truncating modulo p words in general and
+// returning a top borrow (0 or 1) in the p'th place, only subtracting input
+// words below p (as well as m and n respectively) to get the diff and borrow.
+//
+// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
+// Microsoft x64 ABI:   RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub)
+        .text
+
+#define p rdi
+#define z rsi
+#define m rdx
+#define x rcx
+#define n r8
+#define y r9
+#define i r10
+#define a rax
+
+#define ashort eax
+
+
+
+S2N_BN_SYMBOL(bignum_sub):
+	endbr64
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+        mov     r8, [rsp+56]            // n: [RSP+40] at entry, plus the two pushes
+        mov     r9, [rsp+64]            // y: [RSP+48] at entry, plus the two pushes
+#endif
+
+// Zero the main index counter for both branches
+
+        xor     i, i
+
+// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
+// we'll never need words past the p'th. Can now assume m <= p and n <= p.
+// Then compare the modified m and n and branch accordingly
+
+        cmp     p, m
+        cmovc   m, p
+        cmp     p, n
+        cmovc   n, p
+        cmp     m, n
+        jc      ylonger
+
+// The case where x is longer or of the same size (p >= m >= n)
+
+        sub     p, m
+        sub     m, n
+        inc     m
+        test    n, n                    // also clears CF: no borrow into word 0
+        jz      xtest
+xmainloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i],a
+        inc     i
+        dec     n                       // inc/dec leave CF (the borrow) intact
+        jnz     xmainloop
+        jmp     xtest
+xtoploop:
+        mov     a, [x+8*i]
+        sbb     a, 0
+        mov     [z+8*i],a
+        inc     i
+xtest:
+        dec     m
+        jnz     xtoploop
+        sbb     a, a                    // a = 0 or all ones, from the final borrow
+        test    p, p
+        jz      tailskip
+tailloop:
+        mov     [z+8*i],a
+        inc     i
+        dec     p
+        jnz     tailloop
+tailskip:
+        neg     a                       // 0 / all ones becomes the 0 / 1 result
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+// The case where y is longer (p >= n > m)
+
+ylonger:
+
+        sub     p, n
+        sub     n, m
+        test    m, m                    // also clears CF: no borrow into word 0
+        jz      ytoploop
+ymainloop:
+        mov     a, [x+8*i]
+        sbb     a, [y+8*i]
+        mov     [z+8*i],a
+        inc     i
+        dec     m
+        jnz     ymainloop
+ytoploop:
+        mov     ashort, 0               // zero a with mov, which leaves CF alone
+        sbb     a, [y+8*i]
+        mov     [z+8*i],a
+        inc     i
+        dec     n
+        jnz     ytoploop
+        sbb     a, a
+        test    p, p
+        jnz     tailloop
+        neg     a
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/amd64/bn_arch.c
+++ b/crypto/bn/arch/amd64/bn_arch.c
@@ -0,0 +1,131 @@
+/*	$OpenBSD: bn_arch.c,v 1.7 2023/06/24 16:01:44 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#include "bn_arch.h"
+#include "bn_local.h"
+#include "s2n_bignum.h"
+
+#ifdef HAVE_BN_ADD /* pass-through to bignum_add(); casts drop const on a, b */
+BN_ULONG
+bn_add(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
+    int b_len)
+{
+	return bignum_add(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+	    b_len, (uint64_t *)b);
+}
+#endif /* HAVE_BN_ADD */
+
+
+#ifdef HAVE_BN_ADD_WORDS /* bignum_add() with all three lengths equal to n */
+BN_ULONG
+bn_add_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
+{
+	return bignum_add(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+	    (uint64_t *)bd);
+}
+#endif /* HAVE_BN_ADD_WORDS */
+
+#ifdef HAVE_BN_SUB /* r = a - b over r_len words; returns bignum_sub()'s borrow */
+BN_ULONG
+bn_sub(BN_ULONG *r, int r_len, const BN_ULONG *a, int a_len, const BN_ULONG *b,
+    int b_len)
+{
+	return bignum_sub(r_len, (uint64_t *)r, a_len, (uint64_t *)a,
+	    b_len, (uint64_t *)b);
+}
+#endif /* HAVE_BN_SUB */
+
+#ifdef HAVE_BN_SUB_WORDS /* bignum_sub() with all three lengths equal to n */
+BN_ULONG
+bn_sub_words(BN_ULONG *rd, const BN_ULONG *ad, const BN_ULONG *bd, int n)
+{
+	return bignum_sub(n, (uint64_t *)rd, n, (uint64_t *)ad, n,
+	    (uint64_t *)bd);
+}
+#endif /* HAVE_BN_SUB_WORDS */
+
+#ifdef HAVE_BN_MUL_ADD_WORDS /* bignum_cmadd() with rd and ad both num words */
+BN_ULONG
+bn_mul_add_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
+{
+	return bignum_cmadd(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+}
+#endif /* HAVE_BN_MUL_ADD_WORDS */
+
+#ifdef HAVE_BN_MUL_WORDS /* bignum_cmul() with rd and ad both num words */
+BN_ULONG
+bn_mul_words(BN_ULONG *rd, const BN_ULONG *ad, int num, BN_ULONG w)
+{
+	return bignum_cmul(num, (uint64_t *)rd, w, num, (uint64_t *)ad);
+}
+#endif /* HAVE_BN_MUL_WORDS */
+
+#ifdef HAVE_BN_MUL_COMBA4 /* fixed-size multiply via bignum_mul_4_8_alt() */
+void
+bn_mul_comba4(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+{
+	/* XXX - consider using non-alt on CPUs that have the ADX extension. */
+	bignum_mul_4_8_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+}
+#endif /* HAVE_BN_MUL_COMBA4 */
+
+#ifdef HAVE_BN_MUL_COMBA8 /* rd[16] = ad[8] * bd[8] via bignum_mul_8_16_alt() */
+void
+bn_mul_comba8(BN_ULONG *rd, BN_ULONG *ad, BN_ULONG *bd)
+{
+	/* XXX - consider using non-alt on CPUs that have the ADX extension. */
+	bignum_mul_8_16_alt((uint64_t *)rd, (uint64_t *)ad, (uint64_t *)bd);
+}
+#endif /* HAVE_BN_MUL_COMBA8 */
+
+#ifdef HAVE_BN_SQR /* r->d = a^2 cut to r_len words; ctx unused; always returns 1 */
+int
+bn_sqr(BIGNUM *r, const BIGNUM *a, int r_len, BN_CTX *ctx)
+{
+	bignum_sqr(r_len, (uint64_t *)r->d, a->top, (uint64_t *)a->d);
+
+	return 1;
+}
+#endif /* HAVE_BN_SQR */
+
+#ifdef HAVE_BN_SQR_COMBA4 /* rd[8] = ad[4]^2 via bignum_sqr_4_8_alt() */
+void
+bn_sqr_comba4(BN_ULONG *rd, const BN_ULONG *ad)
+{
+	/* XXX - consider using non-alt on CPUs that have the ADX extension. */
+	bignum_sqr_4_8_alt((uint64_t *)rd, (uint64_t *)ad);
+}
+#endif /* HAVE_BN_SQR_COMBA4 */
+
+#ifdef HAVE_BN_SQR_COMBA8 /* rd[16] = ad[8]^2 via bignum_sqr_8_16_alt() */
+void
+bn_sqr_comba8(BN_ULONG *rd, const BN_ULONG *ad)
+{
+	/* XXX - consider using non-alt on CPUs that have the ADX extension. */
+	bignum_sqr_8_16_alt((uint64_t *)rd, (uint64_t *)ad);
+}
+#endif /* HAVE_BN_SQR_COMBA8 */
+
+#ifdef HAVE_BN_WORD_CLZ /* returns word_clz(w) unchanged */
+int
+bn_word_clz(BN_ULONG w)
+{
+	return word_clz(w);
+}
+#endif /* HAVE_BN_WORD_CLZ */
--- a/crypto/bn/arch/amd64/bn_arch.h
+++ b/crypto/bn/arch/amd64/bn_arch.h
@@ -0,0 +1,95 @@
+/*	$OpenBSD: bn_arch.h,v 1.13 2023/02/16 11:13:05 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifdef _WIN32
+#define OPENSSL_NO_ASM
+#else
+
+#ifndef OPENSSL_NO_ASM
+
+#define HAVE_BN_ADD
+#define HAVE_BN_ADD_WORDS
+
+#define HAVE_BN_DIV_WORDS
+
+#define HAVE_BN_MUL_ADD_WORDS
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+#define HAVE_BN_MUL_WORDS
+
+#define HAVE_BN_SQR
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
+#define HAVE_BN_SUB
+#define HAVE_BN_SUB_WORDS
+
+#define HAVE_BN_WORD_CLZ
+
+#if defined(__GNUC__)
+#define HAVE_BN_DIV_REM_WORDS_INLINE
+
+/*
+ * bn_div_rem_words_inline() divides the double word h:l by d using a single
+ * divq instruction, storing the quotient in *out_q and the remainder in
+ * *out_r.
+ *
+ * NOTE(review): divq raises a divide error if d is zero or if the quotient
+ * does not fit in a single word - presumably callers guarantee h < d; confirm.
+ */
+static inline void
+bn_div_rem_words_inline(BN_ULONG h, BN_ULONG l, BN_ULONG d, BN_ULONG *out_q,
+    BN_ULONG *out_r)
+{
+	BN_ULONG q, r;
+
+	/*
+	 * Unsigned division of %rdx:%rax by d with quotient being stored in
+	 * %rax and remainder in %rdx. The constraints place h in %rdx and l
+	 * in %rax, while d (operand %4) may be in a register or in memory.
+	 */
+	__asm__ volatile ("divq %4"
+	    : "=a"(q), "=d"(r)
+	    : "d"(h), "a"(l), "rm"(d)
+	    : "cc");
+
+	*out_q = q;
+	*out_r = r;
+}
+#endif /* __GNUC__ */
+
+#if defined(__GNUC__)
+#define HAVE_BN_MULW
+
+/*
+ * bn_mulw() multiplies a by b using a single mulq instruction, storing the
+ * high word of the double word result in *out_r1 and the low word in *out_r0.
+ */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	/*
+	 * Unsigned multiplication of %rax, with the double word result being
+	 * stored in %rdx:%rax. The constraints place a in %rax, while b
+	 * (operand %3) may be in a register or in memory.
+	 */
+	__asm__ ("mulq %3"
+	    : "=d"(r1), "=a"(r0)
+	    : "a"(a), "rm"(b)
+	    : "cc");
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+#endif /* __GNUC__ */
+#endif /* !OPENSSL_NO_ASM */
+
+#endif /* _WIN32 */
+#endif /* HEADER_BN_ARCH_H */
--- a/crypto/bn/arch/amd64/word_clz.S
+++ b/crypto/bn/arch/amd64/word_clz.S
@@ -0,0 +1,60 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// ----------------------------------------------------------------------------
+// Count leading zero bits in a single word
+// Input a; output function return
+//
+//    extern uint64_t word_clz (uint64_t a);
+//
+// Standard x86-64 ABI: RDI = a, returns RAX
+// Microsoft x64 ABI:   RCX = a, returns RAX
+// ----------------------------------------------------------------------------
+
+#include "s2n_bignum_internal.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz)
+        .text
+
+S2N_BN_SYMBOL(word_clz):
+	endbr64
+
+// For the Microsoft x64 ABI, save rdi and rsi and move the argument from
+// rcx into rdi, so that the code below only has to handle the standard ABI
+// register assignment.
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+#endif
+
+// First do rax = 63 - bsr(a), which is right except (maybe) for zero inputs.
+// For a bit index in the range 0..63, xor with 63 gives the same result as
+// subtracting it from 63.
+
+        bsr     rax, rdi
+        xor     rax, 63
+
+// Force return of 64 in the zero-input case. Writing edx zero-extends into
+// rdx, so rdx holds 64 for the conditional move.
+
+        mov     edx, 64
+        test    rdi, rdi
+        cmove   rax, rdx
+
+// Restore the registers saved on entry for the Microsoft x64 ABI.
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- a/crypto/bn/arch/arm/bn_arch.h
+++ b/crypto/bn/arch/arm/bn_arch.h
@@ -0,0 +1,73 @@
+/*	$OpenBSD: bn_arch.h,v 1.2 2023/06/24 15:51:47 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#if defined(__GNUC__)
+
+#define HAVE_BN_SUBW
+
+/*
+ * bn_subw() computes a - b, storing the result in *out_r0 and the borrow
+ * (0 or 1) in *out_borrow.
+ */
+static inline void
+bn_subw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_borrow, BN_ULONG *out_r0)
+{
+	BN_ULONG borrow, r0;
+
+	/*
+	 * subs clears the carry flag when the subtraction borrows, in which
+	 * case sbc turns the zeroed borrow into -1 and neg converts that to 1.
+	 * The borrow operand is marked earlyclobber since it is written before
+	 * a and b are read.
+	 */
+	__asm__ (
+	    "mov  %[borrow], #0 \n"
+	    "subs %[r0], %[a], %[b] \n"
+	    "sbc  %[borrow], %[borrow], #0 \n"
+	    "neg  %[borrow], %[borrow] \n"
+	    : [borrow]"=&r"(borrow), [r0]"=r"(r0)
+	    : [a]"r"(a), [b]"r"(b)
+	    : "cc");
+
+	*out_borrow = borrow;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_SUBW_SUBW
+
+/*
+ * bn_subw_subw() computes a - b - c, storing the result in *out_r0 and the
+ * total number of borrows produced by the two subtractions in *out_borrow.
+ */
+static inline void
+bn_subw_subw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_borrow,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG borrow, r0;
+
+	/*
+	 * Each subs/sbc pair subtracts one from borrow when that subtraction
+	 * borrows (carry flag clear); the final neg turns the negative count
+	 * into a positive one. Both outputs are marked earlyclobber since
+	 * they are written before c is read.
+	 */
+	__asm__ (
+	    "mov  %[borrow], #0 \n"
+	    "subs %[r0], %[a], %[b] \n"
+	    "sbc  %[borrow], %[borrow], #0 \n"
+	    "subs %[r0], %[r0], %[c] \n"
+	    "sbc  %[borrow], %[borrow], #0 \n"
+	    "neg  %[borrow], %[borrow] \n"
+	    : [borrow]"=&r"(borrow), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");
+
+	*out_borrow = borrow;
+	*out_r0 = r0;
+}
+
+#endif /* __GNUC__ */
+
+#endif
+#endif
--- a/crypto/bn/arch/hppa/bn_arch.h
+++ b/crypto/bn/arch/hppa/bn_arch.h
@@ -0,0 +1,24 @@
+/*	$OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:33 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#endif
+#endif
--- a/crypto/bn/arch/i386/bn_arch.h
+++ b/crypto/bn/arch/i386/bn_arch.h
@@ -0,0 +1,86 @@
+/*	$OpenBSD: bn_arch.h,v 1.9 2023/02/16 10:41:03 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#define HAVE_BN_ADD_WORDS
+
+#define HAVE_BN_DIV_WORDS
+
+#define HAVE_BN_MUL_ADD_WORDS
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+#define HAVE_BN_MUL_WORDS
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+#define HAVE_BN_SQR_WORDS
+
+#define HAVE_BN_SUB_WORDS
+
+#if defined(__GNUC__)
+#define HAVE_BN_DIV_REM_WORDS_INLINE
+
+/*
+ * bn_div_rem_words_inline() divides the double word h:l by d using a single
+ * divl instruction, storing the quotient in *out_q and the remainder in
+ * *out_r.
+ *
+ * NOTE(review): divl raises a divide error if d is zero or if the quotient
+ * does not fit in a single word - presumably callers guarantee h < d; confirm.
+ */
+static inline void
+bn_div_rem_words_inline(BN_ULONG h, BN_ULONG l, BN_ULONG d, BN_ULONG *out_q,
+    BN_ULONG *out_r)
+{
+	BN_ULONG q, r;
+
+	/*
+	 * Unsigned division of %edx:%eax by d with quotient being stored in
+	 * %eax and remainder in %edx. The constraints place h in %edx and l
+	 * in %eax, while d (operand %4) may be in a register or in memory.
+	 */
+	__asm__ volatile ("divl %4"
+	    : "=a"(q), "=d"(r)
+	    : "a"(l), "d"(h), "rm"(d)
+	    : "cc");
+
+	*out_q = q;
+	*out_r = r;
+}
+#endif /* __GNUC__ */
+
+#if defined(__GNUC__)
+#define HAVE_BN_MULW
+
+/*
+ * bn_mulw() multiplies a by b using a single mull instruction, storing the
+ * high word of the double word result in *out_r1 and the low word in *out_r0.
+ */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	/*
+	 * Unsigned multiplication of %eax, with the double word result being
+	 * stored in %edx:%eax. The constraints place a in %eax, while b
+	 * (operand %3) may be in a register or in memory.
+	 */
+	__asm__ ("mull %3"
+	    : "=d"(r1), "=a"(r0)
+	    : "a"(a), "rm"(b)
+	    : "cc");
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+#endif /* __GNUC__ */
+
+#endif
+#endif
--- a/crypto/bn/arch/m88k/bn_arch.h
+++ b/crypto/bn/arch/m88k/bn_arch.h
@@ -0,0 +1,24 @@
+/*	$OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:33 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#endif
+#endif
--- a/crypto/bn/arch/mips64/bn_arch.h
+++ b/crypto/bn/arch/mips64/bn_arch.h
@@ -0,0 +1,40 @@
+/*	$OpenBSD: bn_arch.h,v 1.7 2023/01/23 12:17:58 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#define HAVE_BN_ADD_WORDS
+
+#define HAVE_BN_DIV_WORDS
+#define HAVE_BN_DIV_3_WORDS
+
+#define HAVE_BN_MUL_ADD_WORDS
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+#define HAVE_BN_MUL_WORDS
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+#define HAVE_BN_SQR_WORDS
+
+#define HAVE_BN_SUB_WORDS
+
+#endif
+#endif
--- a/crypto/bn/arch/powerpc/bn_arch.h
+++ b/crypto/bn/arch/powerpc/bn_arch.h
@@ -0,0 +1,39 @@
+/*	$OpenBSD: bn_arch.h,v 1.6 2023/01/23 12:17:58 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#define HAVE_BN_ADD_WORDS
+
+#define HAVE_BN_DIV_WORDS
+
+#define HAVE_BN_MUL_ADD_WORDS
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+#define HAVE_BN_MUL_WORDS
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+#define HAVE_BN_SQR_WORDS
+
+#define HAVE_BN_SUB_WORDS
+
+#endif
+#endif
--- a/crypto/bn/arch/powerpc64/bn_arch.h
+++ b/crypto/bn/arch/powerpc64/bn_arch.h
@@ -0,0 +1,44 @@
+/*	$OpenBSD: bn_arch.h,v 1.4 2023/02/16 10:41:03 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#if 0 /* Needs testing and enabling. */
+#if defined(__GNUC__)
+#define HAVE_BN_MULW
+
+/*
+ * bn_mulw() multiplies a by b, storing the high word of the double word
+ * result in *out_r1 and the low word in *out_r0.
+ */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	/*
+	 * Unsigned multiplication using a mulhdu/mulld pair: mulhdu yields the
+	 * high doubleword of the product and mulld the low doubleword. The
+	 * bare "mul" mnemonic is not a 64-bit PowerPC instruction. r1 is
+	 * marked earlyclobber since a and b are read again by mulld.
+	 */
+	__asm__ ("mulhdu %0, %2, %3; mulld %1, %2, %3"
+	    : "=&r"(r1), "=r"(r0)
+	    : "r"(a), "r"(b));
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+#endif /* __GNUC__ */
+#endif
+
+#endif
+#endif
--- a/crypto/bn/arch/riscv64/bn_arch.h
+++ b/crypto/bn/arch/riscv64/bn_arch.h
@@ -0,0 +1,86 @@
+/*	$OpenBSD: bn_arch.h,v 1.7 2023/07/09 10:37:32 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/bn.h>
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#if defined(__GNUC__)
+
+#define HAVE_BN_ADDW
+
+/*
+ * bn_addw() computes a + b, storing the low word of the result in *out_r0
+ * and the carry (0 or 1) in *out_r1.
+ */
+static inline void
+bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG carry, r0;
+
+	/*
+	 * The addition wrapped if and only if the result is less than a, which
+	 * sltu captures as the carry. r0 is marked earlyclobber since a is
+	 * still read after r0 has been written.
+	 */
+	__asm__ (
+	    "add   %[r0], %[a], %[b] \n"
+	    "sltu  %[carry], %[r0], %[a] \n"
+	    : [carry]"=r"(carry), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b));
+
+	*out_r1 = carry;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_MULW
+ 
+/*
+ * bn_mulw() multiplies a by b, storing the high word of the double word
+ * result in *out_r1 and the low word in *out_r0.
+ */
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	/*
+	 * Unsigned multiplication using a mulhu/mul pair. Note that the order
+	 * of these instructions is important, as they can potentially be fused
+	 * into a single operation. r1 is marked earlyclobber since a and b are
+	 * read again by mul after r1 has been written.
+	 */
+	__asm__ (
+	    "mulhu %[r1], %[a], %[b] \n"
+	    "mul   %[r0], %[a], %[b] \n"
+	    : [r1]"=&r"(r1), [r0]"=r"(r0)
+	    : [a]"r"(a), [b]"r"(b));
+
+	*out_r1 = r1;
+	*out_r0 = r0;
+}
+
+#define HAVE_BN_SUBW
+
+/*
+ * bn_subw() computes a - b, storing the result in *out_r0 and the borrow
+ * (0 or 1) in *out_borrow.
+ */
+static inline void
+bn_subw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_borrow, BN_ULONG *out_r0)
+{
+	BN_ULONG borrow, r0;
+
+	/*
+	 * The subtraction wrapped if and only if a is less than the result,
+	 * which sltu captures as the borrow. r0 is marked earlyclobber since
+	 * a is still read after r0 has been written.
+	 */
+	__asm__ (
+	    "sub   %[r0], %[a], %[b] \n"
+	    "sltu  %[borrow], %[a], %[r0] \n"
+	    : [borrow]"=r"(borrow), [r0]"=&r"(r0)
+	    : [a]"r"(a), [b]"r"(b));
+
+	*out_borrow = borrow;
+	*out_r0 = r0;
+}
+
+#endif /* __GNUC__ */
+
+#endif
+#endif
--- a/crypto/bn/arch/sh/bn_arch.h
+++ b/crypto/bn/arch/sh/bn_arch.h
@@ -0,0 +1,24 @@
+/*	$OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:34 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#endif
+#endif
--- a/crypto/bn/arch/sparc64/bn_arch.h
+++ b/crypto/bn/arch/sparc64/bn_arch.h
@@ -0,0 +1,24 @@
+/*	$OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:34 jsing Exp $ */
+/*
+ * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HEADER_BN_ARCH_H
+#define HEADER_BN_ARCH_H
+
+#ifndef OPENSSL_NO_ASM
+
+#endif
+#endif