check in v3.8.2 source
This commit is contained in:
53
asm/arch/aarch64/arm64cpuid.S
Normal file
53
asm/arch/aarch64/arm64cpuid.S
Normal file
@@ -0,0 +1,53 @@
|
||||
#include "arm64_arch.h"
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto+sha3
|
||||
|
||||
.align 5
|
||||
// _armv7_neon_probe: execute one Advanced SIMD instruction, then return.
// NOTE(review): presumably called by the CPU-capability detection code with
// a SIGILL handler armed, so a fault means "feature absent" - confirm at the
// call site.
	.type	_armv7_neon_probe,%function
	.globl	_armv7_neon_probe
_armv7_neon_probe:
	bti	c				// valid target for indirect calls (HINT space)
	orr	v15.16b, v15.16b, v15.16b	// v15 |= v15: leaves v15 unchanged
	ret
	.size	_armv7_neon_probe,.-_armv7_neon_probe
|
||||
|
||||
// _armv8_aes_probe: execute one AES instruction (aese), then return.
// The value left in v0 is not consumed by this routine.
// NOTE(review): presumably run under a SIGILL handler to detect the AES
// extension - confirm at the call site.
	.type	_armv8_aes_probe,%function
	.globl	_armv8_aes_probe
_armv8_aes_probe:
	bti	c			// valid target for indirect calls (HINT space)
	aese	v0.16b, v0.16b		// the probed instruction
	ret
	.size	_armv8_aes_probe,.-_armv8_aes_probe
|
||||
|
||||
// _armv8_sha1_probe: execute one SHA-1 instruction (sha1h), then return.
// The value left in s0 is not consumed by this routine.
// NOTE(review): presumably run under a SIGILL handler to detect the SHA-1
// extension - confirm at the call site.
	.type	_armv8_sha1_probe,%function
	.globl	_armv8_sha1_probe
_armv8_sha1_probe:
	bti	c			// valid target for indirect calls (HINT space)
	sha1h	s0, s0			// the probed instruction
	ret
	.size	_armv8_sha1_probe,.-_armv8_sha1_probe
|
||||
|
||||
// _armv8_sha256_probe: execute one SHA-256 instruction (sha256su0), then
// return. The value left in v0 is not consumed by this routine.
// NOTE(review): presumably run under a SIGILL handler to detect the SHA-256
// extension - confirm at the call site.
	.type	_armv8_sha256_probe,%function
	.globl	_armv8_sha256_probe
_armv8_sha256_probe:
	bti	c			// valid target for indirect calls (HINT space)
	sha256su0	v0.4s, v0.4s	// the probed instruction
	ret
	.size	_armv8_sha256_probe,.-_armv8_sha256_probe
|
||||
|
||||
// _armv8_pmull_probe: execute one 64x64-bit polynomial multiply (pmull with
// a .1q destination), then return. The value left in v0 is not consumed by
// this routine.
// NOTE(review): presumably run under a SIGILL handler to detect PMULL
// support - confirm at the call site.
	.type	_armv8_pmull_probe,%function
	.globl	_armv8_pmull_probe
_armv8_pmull_probe:
	bti	c				// valid target for indirect calls (HINT space)
	pmull	v0.1q, v0.1d, v0.1d		// the probed instruction
	ret
	.size	_armv8_pmull_probe,.-_armv8_pmull_probe
|
||||
|
||||
// _armv8_sha512_probe: execute one SHA-512 instruction (sha512su0), then
// return. The value left in v0 is not consumed by this routine.
// NOTE(review): presumably run under a SIGILL handler to detect the SHA-512
// extension - confirm at the call site.
	.type	_armv8_sha512_probe,%function
	.globl	_armv8_sha512_probe
_armv8_sha512_probe:
	bti	c			// valid target for indirect calls (HINT space)
	sha512su0	v0.2d,v0.2d	// the probed instruction
	ret
	.size	_armv8_sha512_probe,.-_armv8_sha512_probe
|
69
asm/arch/arm/armv4cpuid.S
Normal file
69
asm/arch/arm/armv4cpuid.S
Normal file
@@ -0,0 +1,69 @@
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__) && !defined(__APPLE__)
|
||||
.syntax unified
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#undef __thumb2__
|
||||
#endif
|
||||
|
||||
#if __ARM_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.align 5
|
||||
@ _armv7_neon_probe: execute one NEON instruction, then return.
@ NOTE(review): presumably called with a SIGILL handler armed so that a
@ fault means the feature is absent; confirm at the call site.
	.type	_armv7_neon_probe,%function
	.globl	_armv7_neon_probe
_armv7_neon_probe:
	vorr	q0,q0,q0		@ q0 |= q0: leaves q0 unchanged
	bx	lr
	.size	_armv7_neon_probe,.-_armv7_neon_probe
|
||||
|
||||
@ _armv8_aes_probe: execute one AES instruction, then return.
@ The instruction is emitted as raw bytes; the two variants carry the same
@ instruction in Thumb-2 and in ARM byte order respectively.
@ NOTE(review): raw bytes are presumably used so that assemblers without
@ the crypto mnemonics can still build this file; confirm.
	.type	_armv8_aes_probe,%function
	.globl	_armv8_aes_probe
_armv8_aes_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0xb0,0xff,0x00,0x03	@ aese.8	q0,q0
#else
	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
#endif
	bx	lr
	.size	_armv8_aes_probe,.-_armv8_aes_probe
|
||||
|
||||
@ _armv8_sha1_probe: execute one SHA-1 instruction, then return.
@ The instruction is emitted as raw bytes; the two variants carry the same
@ instruction in Thumb-2 and in ARM byte order respectively.
@ NOTE(review): raw bytes are presumably used so that assemblers without
@ the crypto mnemonics can still build this file; confirm.
	.type	_armv8_sha1_probe,%function
	.globl	_armv8_sha1_probe
_armv8_sha1_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0x00,0xef,0x40,0x0c	@ sha1c.32	q0,q0,q0
#else
	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
#endif
	bx	lr
	.size	_armv8_sha1_probe,.-_armv8_sha1_probe
|
||||
|
||||
@ _armv8_sha256_probe: execute one SHA-256 instruction, then return.
@ The instruction is emitted as raw bytes; the two variants carry the same
@ instruction in Thumb-2 and in ARM byte order respectively.
@ NOTE(review): raw bytes are presumably used so that assemblers without
@ the crypto mnemonics can still build this file; confirm.
	.type	_armv8_sha256_probe,%function
	.globl	_armv8_sha256_probe
_armv8_sha256_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0x00,0xff,0x40,0x0c	@ sha256h.32	q0,q0,q0
#else
	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
#endif
	bx	lr
	.size	_armv8_sha256_probe,.-_armv8_sha256_probe
|
||||
@ _armv8_pmull_probe: execute one 64-bit polynomial multiply, then return.
@ The instruction is emitted as raw bytes; the two variants carry the same
@ instruction in Thumb-2 and in ARM byte order respectively.
@ NOTE(review): raw bytes are presumably used so that assemblers without
@ the crypto mnemonics can still build this file; confirm.
	.type	_armv8_pmull_probe,%function
	.globl	_armv8_pmull_probe
_armv8_pmull_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0xa0,0xef,0x00,0x0e	@ vmull.p64	q0,d0,d0
#else
	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
#endif
	bx	lr
	.size	_armv8_pmull_probe,.-_armv8_pmull_probe
|
||||
#endif
|
||||
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
164
asm/bn/arch/amd64/bignum_add.S
Normal file
164
asm/bn/arch/amd64/bignum_add.S
Normal file
@@ -0,0 +1,164 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Add, z := x + y
|
||||
// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
|
||||
//
|
||||
// extern uint64_t bignum_add
|
||||
// (uint64_t p, uint64_t *z,
|
||||
// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
|
||||
//
|
||||
// Does the z := x + y operation, truncating modulo p words in general and
|
||||
// returning a top carry (0 or 1) in the p'th place, only adding the input
|
||||
// words below p (as well as m and n respectively) to get the sum and carry.
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
|
||||
// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add)
|
||||
.text
|
||||
|
||||
// Register roles. The first six are the standard-ABI argument registers;
// under WINDOWS_ABI the entry shim below moves the arguments into them.
//
//   p  output size in words; later the number of tail words still to write
//   z  output pointer
//   m  size of x; clamped to p, later a loop count for the x-only words
//   x  first input pointer
//   n  size of y; clamped to p, later a loop count
//   y  second input pointer
//   i  word index shared by all loops
//   a  scratch word, and the return value

#define p rdi
#define z rsi
#define m rdx
#define x rcx
#define n r8
#define y r9
#define i r10
#define a rax

// 32-bit view of a, for a short "mov reg, 0" that leaves the flags alone

#define ashort eax

S2N_BN_SYMBOL(bignum_add):

// Microsoft x64 entry: rdi/rsi are callee-saved there, so preserve them and
// shuffle the arguments into the standard-ABI registers. The two pushes move
// the stack arguments from [rsp+40]/[rsp+48] to [rsp+56]/[rsp+64].

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
        mov     r9, [rsp+64]
#endif

// i = 0 is needed whichever branch is taken

        xor     i, i

// m := min(p,m) and n := min(p,n): words at or above index p never matter.
// Then dispatch on which (clamped) input is longer.

        cmp     p, m
        cmovc   m, p
        cmp     p, n
        cmovc   n, p
        cmp     m, n
        jc      ylonger

// Case p >= m >= n. Afterwards p = p - m (tail words), m = m - n + 1
// (x-only words plus one, because xtest decrements before testing).
// The "test" also clears CF, so the first adc starts without a carry;
// inc/dec leave CF untouched, which keeps the carry chain alive in the loops.

        sub     p, m
        sub     m, n
        inc     m
        test    n, n
        jz      xtest
xmainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     n
        jnz     xmainloop
        jmp     xtest
xtoploop:
        mov     a, [x+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
xtest:
        dec     m
        jnz     xtoploop

// a := final carry (mov rather than xor so that CF survives)

        mov     ashort, 0
        adc     a, 0
        test    p, p
        jnz     tails
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

// Case p >= n > m. Afterwards p = p - n (tail words) and n = n - m >= 1
// (y-only words), so ytoploop always runs at least once.

ylonger:

        sub     p, n
        sub     n, m
        test    m, m
        jz      ytoploop
ymainloop:
        mov     a, [x+8*i]
        adc     a, [y+8*i]
        mov     [z+8*i],a
        inc     i
        dec     m
        jnz     ymainloop
ytoploop:
        mov     a, [y+8*i]
        adc     a, 0
        mov     [z+8*i],a
        inc     i
        dec     n
        jnz     ytoploop
        mov     ashort, 0
        adc     a, 0
        test    p, p
        jnz     tails
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

// Non-empty tail (p > max(m,n)): the carry becomes the next output word,
// the remaining words are zeroed, and the function returns 0.

tails:
        mov     [z+8*i],a
        xor     a, a
        jmp     tail
tailloop:
        mov     [z+8*i],a
tail:
        inc     i
        dec     p
        jnz     tailloop
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
154
asm/bn/arch/amd64/bignum_cmadd.S
Normal file
154
asm/bn/arch/amd64/bignum_cmadd.S
Normal file
@@ -0,0 +1,154 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Multiply-add with single-word multiplier, z := z + c * y
|
||||
// Inputs c, y[n]; outputs function return (carry-out) and z[k]
|
||||
//
|
||||
// extern uint64_t bignum_cmadd
|
||||
// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
|
||||
//
|
||||
// Does the "z := z + c * y" operation where y is n digits and the result z is p digits (p is the size parameter called k in the prototype above).
|
||||
// Truncates the result in general.
|
||||
//
|
||||
// The return value is a high/carry word that is meaningful when p = n + 1, or
|
||||
// more generally when n <= p and the result fits in p + 1 digits. In these
|
||||
// cases it gives the top digit of the (p + 1)-digit result.
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
|
||||
// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
|
||||
.text
|
||||
|
||||
// Register roles (standard-ABI registers; the WINDOWS_ABI shim below maps
// the Microsoft argument registers onto them).
//
//   p  size of z in words; after clamping, the number of tail words
//   z  pointer to the in/out accumulator
//   c  the single-word multiplier, copied out of rdx before any mul
//   n  number of input words, clamped to p, then used as a loop count
//   x  pointer to the multiplied input (the "y" of the C prototype)
//   i  word index
//   h  running high word / carry word; also the return value
//   r  carry mask (0 or all ones) inside the main loop; callee-saved rbx

#define p rdi
#define z rsi
#define c r9
#define n rcx
#define x r8

#define i r10
#define h r11

#define r rbx

// 32-bit views, for short flag-preserving "mov reg, imm" forms

#define hshort r11d
#define ishort r10d

S2N_BN_SYMBOL(bignum_cmadd):

// Microsoft x64 entry: save callee-saved rdi/rsi and move the arguments
// into the standard-ABI registers. After the two pushes the fifth argument
// is at [rsp+56].

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
#endif

// rbx is callee-saved and is needed as the extra scratch register r

        push    rbx

// n := min(p,n), since input words at or above index p cannot affect a
// p-word result; then p := p - n is the length of the carry tail.

        cmp     p, n
        cmovc   n, p
        sub     p, n

// h = 0; with no input words there is nothing to add, so return that zero

        xor     h, h
        test    n, n
        jz      end

// Every mul overwrites rdx, so keep the multiplier somewhere else

        mov     c, rdx

// First word: 2^64 * CF + [h, z_0'] = z_0 + c * x_0.
// The mov/dec that follow leave CF as set by the add.

        mov     rax, [x]
        mul     c
        add     [z], rax
        mov     h, rdx
        mov     ishort, 1
        dec     n
        jz      hightail

// Main loop. On entry to each iteration CF and h are both still to be added.
//   h += z[i] + CF, and r = 0 - (carry out of that), i.e. 0 or all ones
//   rdx:rax = c * x[i], then rdx -= r folds that carry into the high word
//   z[i] = rax + h, leaving a new CF for the next round; h = rdx

loop:
        adc     h, [z+8*i]
        sbb     r, r
        mov     rax, [x+8*i]
        mul     c
        sub     rdx, r
        add     rax, h
        mov     [z+8*i], rax
        mov     h, rdx
        inc     i
        dec     n
        jnz     loop

// Absorb the outstanding carry into the high word

hightail:
        adc     h, 0

// If z is longer than the input, add h at position i and ripple the carry
// through the remaining words. h is zeroed with mov so CF is preserved.

tail:
        test    p, p
        jz      end

        add     [z+8*i], h
        mov     hshort, 0
        inc     i
        dec     p
        jz      highend

tloop:
        adc     [z+8*i], h
        inc     i
        dec     p
        jnz     tloop

// h := carry out of the top word

highend:

        adc     h, 0

// Return the high/carry word

end:
        mov     rax, h

        pop     rbx
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
137
asm/bn/arch/amd64/bignum_cmul.S
Normal file
137
asm/bn/arch/amd64/bignum_cmul.S
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Multiply by a single word, z := c * y
|
||||
// Inputs c, y[n]; outputs function return (carry-out) and z[k]
|
||||
//
|
||||
// extern uint64_t bignum_cmul
|
||||
// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
|
||||
//
|
||||
// Does the "z := c * y" operation where y is n digits and the result z is p digits (p is the size parameter called k in the prototype above).
|
||||
// Truncates the result in general unless p >= n + 1.
|
||||
//
|
||||
// The return value is a high/carry word that is meaningful when p >= n as
|
||||
// giving the high part of the result. Since this is always zero if p > n,
|
||||
// it is mainly of interest in the special case p = n, i.e. where the source
|
||||
// and destination have the same nominal size, when it gives the extra word
|
||||
// of the full result.
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
|
||||
// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul)
|
||||
.text
|
||||
|
||||
// Register roles (standard-ABI registers; the WINDOWS_ABI shim below maps
// the Microsoft argument registers onto them).
//
//   p  size of z in words
//   z  output pointer
//   c  the single-word multiplier, copied out of rdx before any mul
//   n  number of input words, clamped to p
//   x  pointer to the multiplied input (the "y" of the C prototype)
//   i  word index
//   h  running high word; also the return value

#define p rdi
#define z rsi
#define c r9
#define n rcx
#define x r8

#define i r10
#define h r11

S2N_BN_SYMBOL(bignum_cmul):

// Microsoft x64 entry: save callee-saved rdi/rsi and move the arguments
// into the standard-ABI registers. After the two pushes the fifth argument
// is at [rsp+56].

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
#endif

// n := min(p,n): input words at or above index p cannot affect a p-word
// result. From here on n <= p.

        cmp     p, n
        cmovc   n, p

// i = 0 and h = 0. With no input words, skip straight to the tail, which
// then just zero-fills z.

        xor     h, h
        xor     i, i
        test    n, n
        jz      tail

// Every mul overwrites rdx, so keep the multiplier somewhere else

        mov     c, rdx

// First word: [h, z_0] = c * x_0

        mov     rax, [x]
        mul     c
        mov     [z], rax
        mov     h, rdx
        inc     i
        cmp     i, n
        jz      tail

// Remaining words: [h', z_i] = c * x_i + h. The sum cannot overflow two
// words, so "adc rdx, 0" never carries out.

loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
        adc     rdx, 0
        mov     [z+8*i], rax
        mov     h, rdx
        inc     i
        cmp     i, n
        jc      loop

// If the destination is longer (i < p): store the high word, then return
// zero and zero-fill whatever is left.

tail:
        cmp     i, p
        jnc     end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
        jnc     end

tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
        jc      tloop

// Return the high/carry word

end:
        mov     rax, h

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
166
asm/bn/arch/amd64/bignum_mul.S
Normal file
166
asm/bn/arch/amd64/bignum_mul.S
Normal file
@@ -0,0 +1,166 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Multiply z := x * y
|
||||
// Inputs x[m], y[n]; output z[k]
|
||||
//
|
||||
// extern void bignum_mul
|
||||
// (uint64_t k, uint64_t *z,
|
||||
// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
|
||||
//
|
||||
// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
|
||||
// Truncates the result in general unless k >= m + n
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
|
||||
// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
|
||||
.text
|
||||
|
||||
// These three names alias the registers in which the corresponding
// arguments arrive (after the WINDOWS_ABI shim, where applicable):
// p is the output size, z the output pointer, n the size of y.

#define p rdi
#define z rsi
#define n r8

// These are working registers chosen by the code, not argument registers.
// The x and y base pointers stay in rcx and r9 and are addressed directly;
// m is copied from rdx because mul overwrites rdx.
//
//   c,h,l  three-word column accumulator (top, high, low)
//   x,y    moving pointers for the inner summation
//   i      inner loop count
//   k      outer loop counter (already incremented inside the body)
//   m      size of x

#define c r15
#define h r14
#define l r13
#define x r12
#define y r11
#define i rbx
#define k r10
#define m rbp

// rax:rdx receive every product, so they are only ever local scratch

#define a rax
#define d rdx

S2N_BN_SYMBOL(bignum_mul):

// Microsoft x64 entry: save callee-saved rdi/rsi and move the arguments
// into the standard-ABI registers. After the two pushes the stack arguments
// are at [rsp+56] and [rsp+64].

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
        mov     r9, [rsp+64]
#endif

// Save every callee-saved register used as a working register, and move m
// out of rdx before the first multiply destroys it

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        mov     m, rdx

// Nothing to do for a zero-size result. Zero-size inputs are not an early
// exit, because the output array still has to be zeroed.

        test    p, p
        jz      end

// Start with an empty two-word running sum; c is cleared inside the loop

        xor     h,h
        xor     l,l

// Outer loop over result words k = 0 ... p - 1

        xor     k, k

outerloop:

// Clear the top accumulator word (the zero is also handy for the cmov).
// Increment k up front: k < p < 2^64, so k + 1 cannot overflow.
// The inner summation runs over a <= j < i with
//   a = max 0 (k + 1 - n),  i = min (k + 1) m

        xor     c, c                    // c = 0
        inc     k                       // k = k + 1

        mov     a, k                    // a = k + 1
        sub     a, n                    // a = k + 1 - n
        cmovc   a, c                    // a = max 0 (k + 1 - n)

        mov     i, m                    // i = m
        cmp     k, m                    // CF <=> k + 1 < m
        cmovc   i, k                    // i = min (k + 1) m

// Turn i into a trip count i - a and skip the inner loop if it is <= 0.
// Otherwise point x at x0[a] and bias y so that [y+8*i], with i counting
// down, walks the matching words of y0 starting from y0[k - a] (k here
// being the index of the result word under construction).

        mov     d, k
        sub     d, i
        sub     i, a
        jbe     innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]

// (c,h,l) += x[j] * y[k - j] for each j in range

innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR [x]
        add     x, 8
        add     l, rax
        adc     h, rdx
        adc     c, 0
        dec     i
        jnz     innerloop

// Emit the low word and shift the accumulator window down by one word

innerend:

        mov     [z], l
        mov     l, h
        mov     h, c
        add     z, 8

        cmp     k, p
        jc      outerloop

// Restore callee-saved registers in reverse order of the pushes

end:
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
156
asm/bn/arch/amd64/bignum_mul_4_8_alt.S
Normal file
156
asm/bn/arch/amd64/bignum_mul_4_8_alt.S
Normal file
@@ -0,0 +1,156 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Multiply z := x * y
|
||||
// Inputs x[4], y[4]; output z[8]
|
||||
//
|
||||
// extern void bignum_mul_4_8_alt
|
||||
// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
|
||||
// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt)
|
||||
.text
|
||||
|
||||
// z and x alias the registers in which those arguments arrive (after the
// WINDOWS_ABI shim, where applicable).

#define z rdi
#define x rsi

// y arrives in rdx and is copied to rcx, because every mul overwrites rdx

#define y rcx

// Rotating three-word window (low, high, top) that product terms are added
// into; after each result word is stored the roles rotate by one register.

#define t0 r8
#define t1 r9
#define t2 r10

// (c,h,l) += numa * numb, the general "multiply and accumulate" step.
// Clobbers rax, rdx and the flags.

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// Same step when c is known to be zero beforehand: "adc c, c" then simply
// sets c to the carry. Slightly shorter encoding.

#define combadz(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, c

// Two-word form, used where no carry out of h is expected

#define combads(h,l,numa,numb)                  \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx

// Computes z[0..7] := x[0..3] * y[0..3] one result word (column) at a time.

S2N_BN_SYMBOL(bignum_mul_4_8_alt):

// Microsoft x64 entry: save callee-saved rdi/rsi and move the three
// register arguments into the standard-ABI registers.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
#endif

// Copy y into a register that mul does not overwrite

        mov     y, rdx

// Result term 0

        mov     rax, [x]
        mul     QWORD PTR [y]

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1

        xor     t2, t2
        combads(t1,t0,[x],[y+8])
        combadz(t2,t1,t0,[x+8],[y])
        mov     [z+8], t0

// Result term 2

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+16])
        combadd(t0,t2,t1,[x+8],[y+8])
        combadd(t0,t2,t1,[x+16],[y])
        mov     [z+16], t1

// Result term 3

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+24])
        combadd(t1,t0,t2,[x+8],[y+16])
        combadd(t1,t0,t2,[x+16],[y+8])
        combadd(t1,t0,t2,[x+24],[y])
        mov     [z+24], t2

// Result term 4

        xor     t2, t2
        combadz(t2,t1,t0,[x+8],[y+24])
        combadd(t2,t1,t0,[x+16],[y+16])
        combadd(t2,t1,t0,[x+24],[y+8])
        mov     [z+32], t0

// Result term 5

        xor     t0, t0
        combadz(t0,t2,t1,[x+16],[y+24])
        combadd(t0,t2,t1,[x+24],[y+16])
        mov     [z+40], t1

// Result term 6. Only the two-word form is needed here, so the third window
// register is not zeroed: nothing reads it from this point on.

        combads(t0,t2,[x+24],[y+24])
        mov     [z+48], t2

// Result term 7

        mov     [z+56], t0

// Return

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
243
asm/bn/arch/amd64/bignum_mul_8_16_alt.S
Normal file
243
asm/bn/arch/amd64/bignum_mul_8_16_alt.S
Normal file
@@ -0,0 +1,243 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Multiply z := x * y
|
||||
// Inputs x[8], y[8]; output z[16]
|
||||
//
|
||||
// extern void bignum_mul_8_16_alt
|
||||
// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
|
||||
// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
|
||||
.text
|
||||
|
||||
// These are actually right
|
||||
|
||||
#define z rdi
|
||||
#define x rsi
|
||||
|
||||
// This is moved from rdx to free it for muls
|
||||
|
||||
#define y rcx
|
||||
|
||||
// Other variables used as a rotating 3-word window to add terms to
|
||||
|
||||
#define t0 r8
|
||||
#define t1 r9
|
||||
#define t2 r10
|
||||
|
||||
// Macro for the key "multiply and add to (c,h,l)" step
|
||||
|
||||
#define combadd(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// A minutely shorter form for when c = 0 initially
|
||||
|
||||
#define combadz(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, c
|
||||
|
||||
// A short form where we don't expect a top carry
|
||||
|
||||
#define combads(h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add l, rax; \
|
||||
adc h, rdx
|
||||
|
||||
S2N_BN_SYMBOL(bignum_mul_8_16_alt):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi
|
||||
push rsi
|
||||
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
#endif
|
||||
|
||||
// Copy y into a safe register to start with
|
||||
|
||||
mov y, rdx
|
||||
|
||||
// Result term 0
|
||||
|
||||
mov rax, [x]
|
||||
mul QWORD PTR [y]
|
||||
|
||||
mov [z], rax
|
||||
mov t0, rdx
|
||||
xor t1, t1
|
||||
|
||||
// Result term 1
|
||||
|
||||
xor t2, t2
|
||||
combads(t1,t0,[x],[y+8])
|
||||
combadz(t2,t1,t0,[x+8],[y])
|
||||
mov [z+8], t0
|
||||
|
||||
// Result term 2
|
||||
|
||||
xor t0, t0
|
||||
combadz(t0,t2,t1,[x],[y+16])
|
||||
combadd(t0,t2,t1,[x+8],[y+8])
|
||||
combadd(t0,t2,t1,[x+16],[y])
|
||||
mov [z+16], t1
|
||||
|
||||
// Result term 3
|
||||
|
||||
xor t1, t1
|
||||
combadz(t1,t0,t2,[x],[y+24])
|
||||
combadd(t1,t0,t2,[x+8],[y+16])
|
||||
combadd(t1,t0,t2,[x+16],[y+8])
|
||||
combadd(t1,t0,t2,[x+24],[y])
|
||||
mov [z+24], t2
|
||||
|
||||
// Result term 4
|
||||
|
||||
xor t2, t2
|
||||
combadz(t2,t1,t0,[x],[y+32])
|
||||
combadd(t2,t1,t0,[x+8],[y+24])
|
||||
combadd(t2,t1,t0,[x+16],[y+16])
|
||||
combadd(t2,t1,t0,[x+24],[y+8])
|
||||
combadd(t2,t1,t0,[x+32],[y])
|
||||
mov [z+32], t0
|
||||
|
||||
// Result term 5
|
||||
|
||||
xor t0, t0
|
||||
combadz(t0,t2,t1,[x],[y+40])
|
||||
combadd(t0,t2,t1,[x+8],[y+32])
|
||||
combadd(t0,t2,t1,[x+16],[y+24])
|
||||
combadd(t0,t2,t1,[x+24],[y+16])
|
||||
combadd(t0,t2,t1,[x+32],[y+8])
|
||||
combadd(t0,t2,t1,[x+40],[y])
|
||||
mov [z+40], t1
|
||||
|
||||
// Result term 6
|
||||
|
||||
xor t1, t1
|
||||
combadz(t1,t0,t2,[x],[y+48])
|
||||
combadd(t1,t0,t2,[x+8],[y+40])
|
||||
combadd(t1,t0,t2,[x+16],[y+32])
|
||||
combadd(t1,t0,t2,[x+24],[y+24])
|
||||
combadd(t1,t0,t2,[x+32],[y+16])
|
||||
combadd(t1,t0,t2,[x+40],[y+8])
|
||||
combadd(t1,t0,t2,[x+48],[y])
|
||||
mov [z+48], t2
|
||||
|
||||
// Result term 7
|
||||
|
||||
xor t2, t2
|
||||
combadz(t2,t1,t0,[x],[y+56])
|
||||
combadd(t2,t1,t0,[x+8],[y+48])
|
||||
combadd(t2,t1,t0,[x+16],[y+40])
|
||||
combadd(t2,t1,t0,[x+24],[y+32])
|
||||
combadd(t2,t1,t0,[x+32],[y+24])
|
||||
combadd(t2,t1,t0,[x+40],[y+16])
|
||||
combadd(t2,t1,t0,[x+48],[y+8])
|
||||
combadd(t2,t1,t0,[x+56],[y])
|
||||
mov [z+56], t0
|
||||
|
||||
// Result term 8
|
||||
|
||||
xor t0, t0
|
||||
combadz(t0,t2,t1,[x+8],[y+56])
|
||||
combadd(t0,t2,t1,[x+16],[y+48])
|
||||
combadd(t0,t2,t1,[x+24],[y+40])
|
||||
combadd(t0,t2,t1,[x+32],[y+32])
|
||||
combadd(t0,t2,t1,[x+40],[y+24])
|
||||
combadd(t0,t2,t1,[x+48],[y+16])
|
||||
combadd(t0,t2,t1,[x+56],[y+8])
|
||||
mov [z+64], t1
|
||||
|
||||
// Result term 9
|
||||
|
||||
xor t1, t1
|
||||
combadz(t1,t0,t2,[x+16],[y+56])
|
||||
combadd(t1,t0,t2,[x+24],[y+48])
|
||||
combadd(t1,t0,t2,[x+32],[y+40])
|
||||
combadd(t1,t0,t2,[x+40],[y+32])
|
||||
combadd(t1,t0,t2,[x+48],[y+24])
|
||||
combadd(t1,t0,t2,[x+56],[y+16])
|
||||
mov [z+72], t2
|
||||
|
||||
// Result term 10
|
||||
|
||||
xor t2, t2
|
||||
combadz(t2,t1,t0,[x+24],[y+56])
|
||||
combadd(t2,t1,t0,[x+32],[y+48])
|
||||
combadd(t2,t1,t0,[x+40],[y+40])
|
||||
combadd(t2,t1,t0,[x+48],[y+32])
|
||||
combadd(t2,t1,t0,[x+56],[y+24])
|
||||
mov [z+80], t0
|
||||
|
||||
// Result term 11
|
||||
|
||||
xor t0, t0
|
||||
combadz(t0,t2,t1,[x+32],[y+56])
|
||||
combadd(t0,t2,t1,[x+40],[y+48])
|
||||
combadd(t0,t2,t1,[x+48],[y+40])
|
||||
combadd(t0,t2,t1,[x+56],[y+32])
|
||||
mov [z+88], t1
|
||||
|
||||
// Result term 12
|
||||
|
||||
xor t1, t1
|
||||
combadz(t1,t0,t2,[x+40],[y+56])
|
||||
combadd(t1,t0,t2,[x+48],[y+48])
|
||||
combadd(t1,t0,t2,[x+56],[y+40])
|
||||
mov [z+96], t2
|
||||
|
||||
// Result term 13
|
||||
|
||||
xor t2, t2
|
||||
combadz(t2,t1,t0,[x+48],[y+56])
|
||||
combadd(t2,t1,t0,[x+56],[y+48])
|
||||
mov [z+104], t0
|
||||
|
||||
// Result term 14
|
||||
|
||||
combads(t2,t1,[x+56],[y+56])
|
||||
mov [z+112], t1
|
||||
|
||||
// Result term 15
|
||||
|
||||
mov [z+120], t2
|
||||
|
||||
// Return
|
||||
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
196
asm/bn/arch/amd64/bignum_sqr.S
Normal file
196
asm/bn/arch/amd64/bignum_sqr.S
Normal file
@@ -0,0 +1,196 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Square z := x^2
|
||||
// Input x[n]; output z[k]
|
||||
//
|
||||
// extern void bignum_sqr
|
||||
// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
|
||||
//
|
||||
// Does the "z := x^2" operation where x is n digits and the result z is k digits.
|
||||
// Truncates the result in general unless k >= 2 * n
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
|
||||
// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
|
||||
.text
|
||||
|
||||
// p, z and x stay in the registers their arguments arrive in; n arrives in rdx but is moved to r8, since rdx receives multiplication results.
|
||||
|
||||
#define p rdi
|
||||
#define z rsi
|
||||
#define x rcx
|
||||
#define n r8
|
||||
|
||||
// These are always local scratch since multiplier result is in these
|
||||
|
||||
#define a rax
|
||||
#define d rdx
|
||||
|
||||
// Other variables
|
||||
|
||||
#define i rbx
|
||||
#define ll rbp
|
||||
#define hh r9
|
||||
#define k r10
|
||||
#define y r11
|
||||
#define htop r12
|
||||
#define l r13
|
||||
#define h r14
|
||||
#define c r15
|
||||
|
||||
// Short versions
|
||||
|
||||
#define llshort ebp
|
||||
|
||||
// Entry point. Writes the p low digits of x^2 to z, one digit per pass of
// outerloop. Each pass builds a three-word column sum (c,h,l), adds in the
// carry (hh,ll) left by the previous pass, stores the low word and keeps the
// upper two words as the next (hh,ll).
S2N_BN_SYMBOL(bignum_sqr):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi  // rdi and rsi are saved here and restored just before ret
|
||||
push rsi
|
||||
mov rdi, rcx  // copy the Microsoft-ABI argument registers into the standard ones
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
#endif
|
||||
|
||||
// We use too many registers, and also we need rax:rdx for multiplications
|
||||
|
||||
push rbx
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov n, rdx  // n arrives in rdx, which every mul below overwrites
|
||||
|
||||
// If p = 0 the result is trivial and nothing needs doing
|
||||
|
||||
test p, p
|
||||
jz end
|
||||
|
||||
// initialize (hh,ll) = 0
|
||||
|
||||
xor llshort, llshort  // 32-bit write also clears the upper half of ll
|
||||
xor hh, hh
|
||||
|
||||
// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
|
||||
|
||||
xor k, k
|
||||
|
||||
outerloop:
|
||||
|
||||
// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
|
||||
// We want to accumulate all x[i] * x[k - i] for bot <= i < top
|
||||
// For the optimization of squaring we avoid duplication and do
|
||||
// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
|
||||
// Initialize i = bot; in fact just compute bot as i directly.
|
||||
|
||||
xor c, c
|
||||
lea i, [k+1]
|
||||
mov htop, i
|
||||
shr htop, 1  // htop = (k+1)/2
|
||||
sub i, n  // i = k + 1 - n, with CF set when that is negative
|
||||
cmovc i, c  // c is zero here, so a negative value is clamped to 0
|
||||
cmp htop, n
|
||||
cmovnc htop, n  // htop = MIN htop n (unsigned)
|
||||
|
||||
// Initialize the three-part local sum (c,h,l); c was already done above
|
||||
|
||||
xor l, l
|
||||
xor h, h
|
||||
|
||||
// If htop <= bot then main doubled part of the sum is empty
|
||||
|
||||
cmp i, htop
|
||||
jnc nosumming
|
||||
|
||||
// Use a moving pointer for [y] = x[k-i] for the cofactor
|
||||
|
||||
mov a, k
|
||||
sub a, i
|
||||
lea y, [x+8*a]
|
||||
|
||||
// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
|
||||
|
||||
innerloop:
|
||||
mov a, [x+8*i]
|
||||
mul QWORD PTR [y]  // d:a = x[i] * x[k-i]
|
||||
add l, a
|
||||
adc h, d
|
||||
adc c, 0
|
||||
sub y, 8  // step the cofactor pointer down as i steps up
|
||||
inc i
|
||||
cmp i, htop
|
||||
jc innerloop  // continue while i < htop (unsigned)
|
||||
|
||||
// Now double it
|
||||
|
||||
add l, l
|
||||
adc h, h
|
||||
adc c, c
|
||||
|
||||
// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
|
||||
|
||||
nosumming:
|
||||
test k, 1
|
||||
jnz innerend
|
||||
cmp i, n
|
||||
jnc innerend
|
||||
|
||||
mov a, [x+8*i]
|
||||
mul a  // d:a = x[i]^2
|
||||
add l, a
|
||||
adc h, d
|
||||
adc c, 0
|
||||
|
||||
// Now add the local sum into the global sum, store and shift
|
||||
|
||||
innerend:
|
||||
add l, ll
|
||||
mov [z+8*k], l  // mov leaves the flags alone, so the carry from add reaches the adc below
|
||||
adc h, hh
|
||||
mov ll, h
|
||||
adc c, 0
|
||||
mov hh, c
|
||||
|
||||
inc k
|
||||
cmp k, p
|
||||
jc outerloop  // continue while k < p (unsigned)
|
||||
|
||||
// Restore registers and return
|
||||
|
||||
end:
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
144
asm/bn/arch/amd64/bignum_sqr_4_8_alt.S
Normal file
144
asm/bn/arch/amd64/bignum_sqr_4_8_alt.S
Normal file
@@ -0,0 +1,144 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Square, z := x^2
|
||||
// Input x[4]; output z[8]
|
||||
//
|
||||
// extern void bignum_sqr_4_8_alt
|
||||
// (uint64_t z[static 8], uint64_t x[static 4]);
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = z, RSI = x
|
||||
// Microsoft x64 ABI: RCX = z, RDX = x
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt)
|
||||
.text
|
||||
|
||||
// Input arguments
|
||||
|
||||
#define z rdi
|
||||
#define x rsi
|
||||
|
||||
// Other variables used as a rotating 3-word window to add terms to
|
||||
|
||||
#define t0 rcx
|
||||
#define t1 r8
|
||||
#define t2 r9
|
||||
|
||||
// Macro for the key "multiply and add to (c,h,l)" step, for square term
|
||||
|
||||
// (c,h,l) += numa^2. Overwrites rax and rdx, which receive the product from mul.
#define combadd1(c,h,l,numa) \
|
||||
mov rax, numa; \
|
||||
mul rax; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// A short form where we don't expect a top carry
|
||||
|
||||
// (h,l) += numa^2 with no third word; any carry out of h is dropped.
// Overwrites rax and rdx, which receive the product from mul.
#define combads(h,l,numa) \
|
||||
mov rax, numa; \
|
||||
mul rax; \
|
||||
add l, rax; \
|
||||
adc h, rdx
|
||||
|
||||
// A version doubling before adding, for non-square terms
|
||||
|
||||
// (c,h,l) += 2 * numa * numb. The product in rdx:rax is doubled in place; the
// bit shifted out of rdx goes into c first, then the doubled product is added
// to (h,l) with its carry also going into c. Overwrites rax and rdx.
#define combadd2(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add rax, rax; \
|
||||
adc rdx, rdx; \
|
||||
adc c, 0; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// Entry point. Computes the eight result words of x^2 one column at a time.
// The registers t0, t1, t2 rotate through the roles (c,h,l): in each column
// the register stored in the previous column is zeroed and becomes the new
// top word, then the current low word is written to z.
S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi  // rdi and rsi are saved here and restored just before ret
|
||||
push rsi
|
||||
mov rdi, rcx  // copy the Microsoft-ABI argument registers into the standard ones
|
||||
mov rsi, rdx
|
||||
#endif
|
||||
|
||||
// Result term 0
|
||||
|
||||
mov rax, [x]
|
||||
mul rax  // rdx:rax = x[0]^2
|
||||
|
||||
mov [z], rax
|
||||
mov t0, rdx  // high half carries into column 1
|
||||
xor t1, t1
|
||||
|
||||
// Result term 1
|
||||
|
||||
xor t2, t2
|
||||
combadd2(t2,t1,t0,[x],[x+8])
|
||||
mov [z+8], t0
|
||||
|
||||
// Result term 2
|
||||
|
||||
xor t0, t0
|
||||
combadd1(t0,t2,t1,[x+8])
|
||||
combadd2(t0,t2,t1,[x],[x+16])
|
||||
mov [z+16], t1
|
||||
|
||||
// Result term 3
|
||||
|
||||
xor t1, t1
|
||||
combadd2(t1,t0,t2,[x],[x+24])
|
||||
combadd2(t1,t0,t2,[x+8],[x+16])
|
||||
mov [z+24], t2
|
||||
|
||||
// Result term 4
|
||||
|
||||
xor t2, t2
|
||||
combadd2(t2,t1,t0,[x+8],[x+24])
|
||||
combadd1(t2,t1,t0,[x+16])
|
||||
mov [z+32], t0
|
||||
|
||||
// Result term 5
|
||||
|
||||
xor t0, t0
|
||||
combadd2(t0,t2,t1,[x+16],[x+24])
|
||||
mov [z+40], t1
|
||||
|
||||
// Result term 6
|
||||
|
||||
xor t1, t1  // NOTE(review): t1 is not read again after this point in this function
|
||||
combads(t0,t2,[x+24])
|
||||
mov [z+48], t2
|
||||
|
||||
// Result term 7
|
||||
|
||||
mov [z+56], t0
|
||||
|
||||
// Return
|
||||
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
241
asm/bn/arch/amd64/bignum_sqr_8_16_alt.S
Normal file
241
asm/bn/arch/amd64/bignum_sqr_8_16_alt.S
Normal file
@@ -0,0 +1,241 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Square, z := x^2
|
||||
// Input x[8]; output z[16]
|
||||
//
|
||||
// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = z, RSI = x
|
||||
// Microsoft x64 ABI: RCX = z, RDX = x
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
|
||||
.text
|
||||
|
||||
// Input arguments
|
||||
|
||||
#define z rdi
|
||||
#define x rsi
|
||||
|
||||
// Other variables used as a rotating 3-word window to add terms to
|
||||
|
||||
#define t0 r8
|
||||
#define t1 r9
|
||||
#define t2 r10
|
||||
|
||||
// Additional temporaries for local windows to share doublings
|
||||
|
||||
#define u0 rcx
|
||||
#define u1 r11
|
||||
|
||||
// Macro for the key "multiply and add to (c,h,l)" step
|
||||
|
||||
// (c,h,l) += numa * numb. Overwrites rax and rdx, which receive the product from mul.
#define combadd(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// Set up initial window (c,h,l) = numa * numb
|
||||
|
||||
// Starts a fresh window: c is zeroed and (h,l) receives the product, so the
// previous contents of c, h and l are all discarded. Overwrites rax and rdx.
#define combaddz(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
xor c, c; \
|
||||
mov l, rax; \
|
||||
mov h, rdx
|
||||
|
||||
// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
|
||||
|
||||
// Doubles (c,hh,ll) in place, which also modifies hh and ll, then adds the
// doubled (hh,ll) into (h,l) with the carry going into c.
#define doubladd(c,h,l,hh,ll) \
|
||||
add ll, ll; \
|
||||
adc hh, hh; \
|
||||
adc c, c; \
|
||||
add l, ll; \
|
||||
adc h, hh; \
|
||||
adc c, 0
|
||||
|
||||
// Square term incorporation (c,h,l) += numa^2
|
||||
|
||||
// (c,h,l) += numa^2. Overwrites rax and rdx, which receive the product from mul.
#define combadd1(c,h,l,numa) \
|
||||
mov rax, numa; \
|
||||
mul rax; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// A short form where we don't expect a top carry
|
||||
|
||||
// (h,l) += numa^2 with no third word; any carry out of h is dropped.
// Overwrites rax and rdx, which receive the product from mul.
#define combads(h,l,numa) \
|
||||
mov rax, numa; \
|
||||
mul rax; \
|
||||
add l, rax; \
|
||||
adc h, rdx
|
||||
|
||||
// A version doubling directly before adding, for single non-square terms
|
||||
|
||||
// (c,h,l) += 2 * numa * numb. The product in rdx:rax is doubled in place; the
// bit shifted out of rdx goes into c first, then the doubled product is added
// to (h,l) with its carry also going into c. Overwrites rax and rdx.
#define combadd2(c,h,l,numa,numb) \
|
||||
mov rax, numa; \
|
||||
mul QWORD PTR numb; \
|
||||
add rax, rax; \
|
||||
adc rdx, rdx; \
|
||||
adc c, 0; \
|
||||
add l, rax; \
|
||||
adc h, rdx; \
|
||||
adc c, 0
|
||||
|
||||
// Entry point. Computes the sixteen result words of x^2 one column at a time.
// The registers t0, t1, t2 rotate through the roles (c,h,l) of the running
// window. In columns 3 to 11 the off-diagonal products are first summed in
// the side window (c,u1,u0) and doubled once by doubladd; columns with a
// single off-diagonal product use combadd2, and a column's square term, when
// it has one, is added with combadd1.
S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi  // rdi and rsi are saved here and restored just before ret
|
||||
push rsi
|
||||
mov rdi, rcx  // copy the Microsoft-ABI argument registers into the standard ones
|
||||
mov rsi, rdx
|
||||
#endif
|
||||
|
||||
// Result term 0
|
||||
|
||||
mov rax, [x]
|
||||
mul rax  // rdx:rax = x[0]^2
|
||||
|
||||
mov [z], rax
|
||||
mov t0, rdx  // high half carries into column 1
|
||||
xor t1, t1
|
||||
|
||||
// Result term 1
|
||||
|
||||
xor t2, t2
|
||||
combadd2(t2,t1,t0,[x],[x+8])
|
||||
mov [z+8], t0
|
||||
|
||||
// Result term 2
|
||||
|
||||
xor t0, t0
|
||||
combadd1(t0,t2,t1,[x+8])
|
||||
combadd2(t0,t2,t1,[x],[x+16])
|
||||
mov [z+16], t1
|
||||
|
||||
// Result term 3
|
||||
|
||||
combaddz(t1,u1,u0,[x],[x+24])  // zeroes t1, whose old value was stored in the previous column
|
||||
combadd(t1,u1,u0,[x+8],[x+16])
|
||||
doubladd(t1,t0,t2,u1,u0)
|
||||
mov [z+24], t2
|
||||
|
||||
// Result term 4
|
||||
|
||||
combaddz(t2,u1,u0,[x],[x+32])
|
||||
combadd(t2,u1,u0,[x+8],[x+24])
|
||||
doubladd(t2,t1,t0,u1,u0)
|
||||
combadd1(t2,t1,t0,[x+16])
|
||||
mov [z+32], t0
|
||||
|
||||
// Result term 5
|
||||
|
||||
combaddz(t0,u1,u0,[x],[x+40])
|
||||
combadd(t0,u1,u0,[x+8],[x+32])
|
||||
combadd(t0,u1,u0,[x+16],[x+24])
|
||||
doubladd(t0,t2,t1,u1,u0)
|
||||
mov [z+40], t1
|
||||
|
||||
// Result term 6
|
||||
|
||||
combaddz(t1,u1,u0,[x],[x+48])
|
||||
combadd(t1,u1,u0,[x+8],[x+40])
|
||||
combadd(t1,u1,u0,[x+16],[x+32])
|
||||
doubladd(t1,t0,t2,u1,u0)
|
||||
combadd1(t1,t0,t2,[x+24])
|
||||
mov [z+48], t2
|
||||
|
||||
// Result term 7
|
||||
|
||||
combaddz(t2,u1,u0,[x],[x+56])
|
||||
combadd(t2,u1,u0,[x+8],[x+48])
|
||||
combadd(t2,u1,u0,[x+16],[x+40])
|
||||
combadd(t2,u1,u0,[x+24],[x+32])
|
||||
doubladd(t2,t1,t0,u1,u0)
|
||||
mov [z+56], t0
|
||||
|
||||
// Result term 8
|
||||
|
||||
combaddz(t0,u1,u0,[x+8],[x+56])
|
||||
combadd(t0,u1,u0,[x+16],[x+48])
|
||||
combadd(t0,u1,u0,[x+24],[x+40])
|
||||
doubladd(t0,t2,t1,u1,u0)
|
||||
combadd1(t0,t2,t1,[x+32])
|
||||
mov [z+64], t1
|
||||
|
||||
// Result term 9
|
||||
|
||||
combaddz(t1,u1,u0,[x+16],[x+56])
|
||||
combadd(t1,u1,u0,[x+24],[x+48])
|
||||
combadd(t1,u1,u0,[x+32],[x+40])
|
||||
doubladd(t1,t0,t2,u1,u0)
|
||||
mov [z+72], t2
|
||||
|
||||
// Result term 10
|
||||
|
||||
combaddz(t2,u1,u0,[x+24],[x+56])
|
||||
combadd(t2,u1,u0,[x+32],[x+48])
|
||||
doubladd(t2,t1,t0,u1,u0)
|
||||
combadd1(t2,t1,t0,[x+40])
|
||||
mov [z+80], t0
|
||||
|
||||
// Result term 11
|
||||
|
||||
combaddz(t0,u1,u0,[x+32],[x+56])
|
||||
combadd(t0,u1,u0,[x+40],[x+48])
|
||||
doubladd(t0,t2,t1,u1,u0)
|
||||
mov [z+88], t1
|
||||
|
||||
// Result term 12
|
||||
|
||||
xor t1, t1
|
||||
combadd2(t1,t0,t2,[x+40],[x+56])
|
||||
combadd1(t1,t0,t2,[x+48])
|
||||
mov [z+96], t2
|
||||
|
||||
// Result term 13
|
||||
|
||||
xor t2, t2
|
||||
combadd2(t2,t1,t0,[x+48],[x+56])
|
||||
mov [z+104], t0
|
||||
|
||||
// Result term 14
|
||||
|
||||
combads(t2,t1,[x+56])
|
||||
mov [z+112], t1
|
||||
|
||||
// Result term 15
|
||||
|
||||
mov [z+120], t2
|
||||
|
||||
// Return
|
||||
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
152
asm/bn/arch/amd64/bignum_sub.S
Normal file
152
asm/bn/arch/amd64/bignum_sub.S
Normal file
@@ -0,0 +1,152 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Subtract, z := x - y
|
||||
// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
|
||||
//
|
||||
// extern uint64_t bignum_sub
|
||||
// (uint64_t p, uint64_t *z,
|
||||
// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
|
||||
//
|
||||
// Does the z := x - y operation, truncating modulo p words in general and
|
||||
// returning a top borrow (0 or 1) in the p'th place, only subtracting input
|
||||
// words below p (as well as m and n respectively) to get the diff and borrow.
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
|
||||
// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub)
|
||||
.text
|
||||
|
||||
#define p rdi
|
||||
#define z rsi
|
||||
#define m rdx
|
||||
#define x rcx
|
||||
#define n r8
|
||||
#define y r9
|
||||
#define i r10
|
||||
#define a rax
|
||||
|
||||
#define ashort eax
|
||||
|
||||
|
||||
|
||||
// Entry point. Writes p words of x - y to z and returns the final borrow
// (0 or 1) in rax. The borrow travels in CF through each loop: the loop
// bodies only use mov, sbb, inc, dec and jnz, none of which disturb CF
// apart from sbb itself.
S2N_BN_SYMBOL(bignum_sub):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi  // rdi and rsi are saved here and restored before each ret
|
||||
push rsi
|
||||
mov rdi, rcx  // copy the Microsoft-ABI argument registers into the standard ones
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
mov r8, [rsp+56]  // stack arguments n and y: entry offsets 40 and 48 plus 16 for the two pushes above
|
||||
mov r9, [rsp+64]
|
||||
#endif
|
||||
|
||||
// Zero the main index counter for both branches
|
||||
|
||||
xor i, i
|
||||
|
||||
// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
|
||||
// we'll never need words past the p'th. Can now assume m <= p and n <= p.
|
||||
// Then compare the modified m and n and branch accordingly
|
||||
|
||||
cmp p, m
|
||||
cmovc m, p
|
||||
cmp p, n
|
||||
cmovc n, p
|
||||
cmp m, n
|
||||
jc ylonger
|
||||
|
||||
// The case where x is longer or of the same size (p >= m >= n)
|
||||
|
||||
sub p, m  // p = number of tail words above x
|
||||
sub m, n
|
||||
inc m  // m = (m - n) + 1, because xtest decrements before testing
|
||||
test n, n  // also clears CF, so the first sbb sees no incoming borrow
|
||||
jz xtest
|
||||
xmainloop:
|
||||
mov a, [x+8*i]
|
||||
sbb a, [y+8*i]
|
||||
mov [z+8*i],a
|
||||
inc i
|
||||
dec n
|
||||
jnz xmainloop
|
||||
jmp xtest
|
||||
xtoploop:
|
||||
mov a, [x+8*i]
|
||||
sbb a, 0  // words of x above y: only the borrow is subtracted
|
||||
mov [z+8*i],a
|
||||
inc i
|
||||
xtest:
|
||||
dec m
|
||||
jnz xtoploop
|
||||
sbb a, a  // a = 0, or all ones if a borrow is pending
|
||||
test p, p
|
||||
jz tailskip
|
||||
tailloop:
|
||||
mov [z+8*i],a  // fill the remaining p words with the borrow mask
|
||||
inc i
|
||||
dec p
|
||||
jnz tailloop
|
||||
tailskip:
|
||||
neg a  // turn the 0 / all-ones mask into the 0 / 1 return value
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
// The case where y is longer (p >= n > m)
|
||||
|
||||
ylonger:
|
||||
|
||||
sub p, n  // p = number of tail words above y
|
||||
sub n, m  // n = number of words of y above x (nonzero on this path)
|
||||
test m, m  // also clears CF, so the first sbb sees no incoming borrow
|
||||
jz ytoploop
|
||||
ymainloop:
|
||||
mov a, [x+8*i]
|
||||
sbb a, [y+8*i]
|
||||
mov [z+8*i],a
|
||||
inc i
|
||||
dec m
|
||||
jnz ymainloop
|
||||
ytoploop:
|
||||
mov ashort, 0  // zero a with mov, which leaves the pending borrow in CF intact
|
||||
sbb a, [y+8*i]
|
||||
mov [z+8*i],a
|
||||
inc i
|
||||
dec n
|
||||
jnz ytoploop
|
||||
sbb a, a  // a = 0, or all ones if a borrow is pending
|
||||
test p, p
|
||||
jnz tailloop  // shares the tail fill and return sequence of the other branch
|
||||
neg a
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
59
asm/bn/arch/amd64/word_clz.S
Normal file
59
asm/bn/arch/amd64/word_clz.S
Normal file
@@ -0,0 +1,59 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Count leading zero bits in a single word
|
||||
// Input a; output function return
|
||||
//
|
||||
// extern uint64_t word_clz (uint64_t a);
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = a, returns RAX
|
||||
// Microsoft x64 ABI: RCX = a, returns RAX
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz)
|
||||
.text
|
||||
|
||||
// Entry point. Returns in rax the number of leading zero bits of the input
// word, with 64 returned for a zero input.
S2N_BN_SYMBOL(word_clz):
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi  // rdi and rsi are saved here and restored just before ret
|
||||
push rsi
|
||||
mov rdi, rcx  // copy the Microsoft-ABI argument register into the standard one
|
||||
#endif
|
||||
|
||||
// First do rax = 63 - bsr(a), which is right except (maybe) for zero inputs
|
||||
|
||||
bsr rax, rdi
|
||||
xor rax, 63  // for a bit index in 0..63, xor with 63 equals 63 minus the index
|
||||
|
||||
// Force return of 64 in the zero-input case
|
||||
|
||||
mov edx, 64  // 32-bit write zero-extends, so rdx = 64
|
||||
test rdi, rdi
|
||||
cmove rax, rdx
|
||||
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
101
asm/sparccpuid.S
Normal file
101
asm/sparccpuid.S
Normal file
@@ -0,0 +1,101 @@
|
||||
#if defined(__SUNPRO_C) && defined(__sparcv9)
|
||||
# define ABI64 /* They've said -xarch=v9 at command line */
|
||||
#elif defined(__GNUC__) && defined(__arch64__)
|
||||
# define ABI64 /* They've said -m64 at command line */
|
||||
#endif
|
||||
|
||||
#ifdef ABI64
|
||||
.register %g2,#scratch
|
||||
.register %g3,#scratch
|
||||
# define FRAME -192
|
||||
# define BIAS 2047
|
||||
#else
|
||||
# define FRAME -96
|
||||
# define BIAS 0
|
||||
#endif
|
||||
|
||||
! Executes two instructions given as raw opcode words (annotated as ldda with
! ASI_FP16_P and fxor) and returns. The fxor sits in the delay slot of retl.
! NOTE(review): presumably the caller traps the illegal-instruction fault on
! CPUs without VIS1 - confirm against the C caller.
.global _sparcv9_vis1_probe
|
||||
.align 8
|
||||
_sparcv9_vis1_probe:
|
||||
add %sp,BIAS+2,%o1  ! %o1 = %sp + BIAS + 2, the address used by the load below
|
||||
.word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
|
||||
retl
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.type _sparcv9_vis1_probe,#function
|
||||
.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
|
||||
|
||||
! Probe and instrument VIS1 instruction. Output is number of cycles it
|
||||
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
|
||||
! is slow (documented to be 6 cycles on T2) and the core is in-order
|
||||
! single-issue, it should be possible to distinguish Tx reliably...
|
||||
! Observed return values are:
|
||||
!
|
||||
! UltraSPARC IIe 7
|
||||
! UltraSPARC III 7
|
||||
! UltraSPARC T1 24
|
||||
!
|
||||
! Numbers for T2 and SPARC64 V-VII are more than welcome.
|
||||
!
|
||||
! It would be possible to detect specifically US-T1 by instrumenting
|
||||
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
|
||||
! a lot of %tick-s, couple of thousand on Linux...
|
||||
! Reads %tick five times with a pair of fxor instructions between consecutive
! reads, forms the four differences between neighbouring reads and returns
! the smallest of them in %o0. All instructions that may be unknown to the
! assembler are emitted as raw opcode words.
.global _sparcv9_vis1_instrument
|
||||
.align 8
|
||||
_sparcv9_vis1_instrument:
|
||||
.word 0x91410000 !rd %tick,%o0
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.word 0x85b08d82 !fxor %f2,%f2,%f2
|
||||
.word 0x93410000 !rd %tick,%o1
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.word 0x85b08d82 !fxor %f2,%f2,%f2
|
||||
.word 0x95410000 !rd %tick,%o2
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.word 0x85b08d82 !fxor %f2,%f2,%f2
|
||||
.word 0x97410000 !rd %tick,%o3
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.word 0x85b08d82 !fxor %f2,%f2,%f2
|
||||
.word 0x99410000 !rd %tick,%o4
|
||||
|
||||
! calculate intervals
|
||||
sub %o1,%o0,%o0  ! each destination receives (later read) - (earlier read)
|
||||
sub %o2,%o1,%o1
|
||||
sub %o3,%o2,%o2
|
||||
sub %o4,%o3,%o3
|
||||
|
||||
! find minimum value
|
||||
cmp %o0,%o1
|
||||
.word 0x38680002 !bgu,a %xcc,.+8
|
||||
mov %o1,%o0  ! annulled delay slot: runs only when %o0 > %o1 (unsigned)
|
||||
cmp %o0,%o2
|
||||
.word 0x38680002 !bgu,a %xcc,.+8
|
||||
mov %o2,%o0
|
||||
cmp %o0,%o3
|
||||
.word 0x38680002 !bgu,a %xcc,.+8
|
||||
mov %o3,%o0
|
||||
|
||||
retl
|
||||
nop
|
||||
.type _sparcv9_vis1_instrument,#function
|
||||
.size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
|
||||
|
||||
! Executes one instruction given as a raw opcode word (annotated as bshuffle)
! in the delay slot of retl and returns.
! NOTE(review): presumably the caller traps the illegal-instruction fault on
! CPUs without VIS2 - confirm against the C caller.
.global _sparcv9_vis2_probe
|
||||
.align 8
|
||||
_sparcv9_vis2_probe:
|
||||
retl
|
||||
.word 0x81b00980 !bshuffle %f0,%f0,%f0
|
||||
.type _sparcv9_vis2_probe,#function
|
||||
.size _sparcv9_vis2_probe,.-_sparcv9_vis2_probe
|
||||
|
||||
! Clears %f0 and %f2 with fxor, then executes one instruction given as a raw
! opcode word (annotated as fmaddd) in the delay slot of retl.
! NOTE(review): presumably the caller traps the illegal-instruction fault on
! CPUs without fused multiply-add - confirm against the C caller.
.global _sparcv9_fmadd_probe
|
||||
.align 8
|
||||
_sparcv9_fmadd_probe:
|
||||
.word 0x81b00d80 !fxor %f0,%f0,%f0
|
||||
.word 0x85b08d82 !fxor %f2,%f2,%f2
|
||||
retl
|
||||
.word 0x81b80440 !fmaddd %f0,%f0,%f2,%f0
|
||||
.type _sparcv9_fmadd_probe,#function
|
||||
.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
|
||||
|
||||
! Emits a call to OPENSSL_cpuid_setup into the .init section; the nop fills
! the delay slot of the call.
! NOTE(review): presumably this makes the setup routine run when the object
! is loaded - confirm for the target toolchain.
.section ".init",#alloc,#execinstr
|
||||
call OPENSSL_cpuid_setup
|
||||
nop
|
Reference in New Issue
Block a user