libressl-portable/asm/bn/arch/amd64/bignum_mul.S

// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply z := x * y
// Inputs x[m], y[n]; output z[k]
//
//    extern void bignum_mul
//     (uint64_t k, uint64_t *z,
//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
//
// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
// Truncates the result in general unless k >= m + n
//
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
        .text

// These are actually right

#define p rdi
#define z rsi
#define n r8

// These are not

#define c r15
#define h r14
#define l r13
#define x r12
#define y r11
#define i rbx
#define k r10
#define m rbp

// These are always local scratch since multiplier result is in these

#define a rax
#define d rdx


S2N_BN_SYMBOL(bignum_mul):

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
        mov     r9, [rsp+64]
#endif

// We use too many registers, and also we need rax:rdx for multiplications

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        mov     m, rdx

// If the result size is zero, do nothing
// Note that even if either or both inputs has size zero, we can't
// just give up because we at least need to zero the output array
// If we did a multiply-add variant, however, then we could

        test    p, p
        jz      end

// Set initial 2-part sum to zero (we zero c inside the body)

        xor     h,h
        xor     l,l

// Otherwise do outer loop k = 0 ... k = p - 1

        xor     k, k

outerloop:

// Zero our carry term first; we eventually want it and a zero is useful now
// Set a =  max 0 (k + 1 - n), i = min (k + 1) m
// This defines the range a <= j < i for the inner summation
// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow
// And since we want to increment it anyway, we might as well do it now

        xor     c, c            // c = 0
        inc     k               // k = k + 1

        mov     a, k            // a = k + 1
        sub     a, n            // a = k + 1 - n
        cmovc   a, c            // a = max 0 (k + 1 - n)

        mov     i, m            // i = m
        cmp     k, m            // CF <=> k + 1 < m
        cmovc   i, k            // i = min (k + 1) m

// Turn i into a loop count, and skip things if it's <= 0
// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]
// and then launch into the main inner loop, postdecrementing i

        mov     d, k
        sub     d, i
        sub     i, a
        jbe     innerend
        lea     x,[rcx+8*a]
        lea     y,[r9+8*d-8]

innerloop:
        mov     rax, [y+8*i]
        mul     QWORD PTR  [x]
        add     x, 8
        add     l, rax
        adc     h, rdx
        adc     c, 0
        dec     i
        jnz     innerloop

innerend:

        mov     [z], l
        mov     l, h
        mov     h, c
        add     z, 8

        cmp     k, p
        jc      outerloop

end:
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
check in v3.8.2 source 2023-11-06 23:46:37 -08:00			`// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.`
			`//`
			`// Permission to use, copy, modify, and/or distribute this software for any`
			`// purpose with or without fee is hereby granted, provided that the above`
			`// copyright notice and this permission notice appear in all copies.`
			`//`
			`// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES`
			`// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF`
			`// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR`
			`// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES`
			`// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN`
			`// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF`
			`// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.`

			`// ----------------------------------------------------------------------------`
			`// Multiply z := x * y`
			`// Inputs x[m], y[n]; output z[k]`
			`//`
			`// extern void bignum_mul`
			`// (uint64_t k, uint64_t *z,`
			`// uint64_t m, uint64_t x, uint64_t n, uint64_t y);`
			`//`
			`// Does the "z := x * y" operation where x is m digits, y is n, result z is k.`
			`// Truncates the result in general unless k >= m + n`
			`//`
			`// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y`
			`// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y`
			`// ----------------------------------------------------------------------------`

			`#include "s2n_bignum_internal.h"`

			`.intel_syntax noprefix`
			`S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)`
			`S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)`
			`.text`

			`// These are actually right`

			`#define p rdi`
			`#define z rsi`
			`#define n r8`

			`// These are not`

			`#define c r15`
			`#define h r14`
			`#define l r13`
			`#define x r12`
			`#define y r11`
			`#define i rbx`
			`#define k r10`
			`#define m rbp`

			`// These are always local scratch since multiplier result is in these`

			`#define a rax`
			`#define d rdx`



			`S2N_BN_SYMBOL(bignum_mul):`

			`#if WINDOWS_ABI`
			`push rdi`
			`push rsi`
			`mov rdi, rcx`
			`mov rsi, rdx`
			`mov rdx, r8`
			`mov rcx, r9`
			`mov r8, [rsp+56]`
			`mov r9, [rsp+64]`
			`#endif`

			`// We use too many registers, and also we need rax:rdx for multiplications`

			`push rbx`
			`push rbp`
			`push r12`
			`push r13`
			`push r14`
			`push r15`
			`mov m, rdx`

			`// If the result size is zero, do nothing`
			`// Note that even if either or both inputs has size zero, we can't`
			`// just give up because we at least need to zero the output array`
			`// If we did a multiply-add variant, however, then we could`

			`test p, p`
			`jz end`

			`// Set initial 2-part sum to zero (we zero c inside the body)`

			`xor h,h`
			`xor l,l`

			`// Otherwise do outer loop k = 0 ... k = p - 1`

			`xor k, k`

			`outerloop:`

			`// Zero our carry term first; we eventually want it and a zero is useful now`
			`// Set a = max 0 (k + 1 - n), i = min (k + 1) m`
			`// This defines the range a <= j < i for the inner summation`
			`// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow`
			`// And since we want to increment it anyway, we might as well do it now`

			`xor c, c // c = 0`
			`inc k // k = k + 1`

			`mov a, k // a = k + 1`
			`sub a, n // a = k + 1 - n`
			`cmovc a, c // a = max 0 (k + 1 - n)`

			`mov i, m // i = m`
			`cmp k, m // CF <=> k + 1 < m`
			`cmovc i, k // i = min (k + 1) m`

			`// Turn i into a loop count, and skip things if it's <= 0`
			`// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]`
			`// and then launch into the main inner loop, postdecrementing i`

			`mov d, k`
			`sub d, i`
			`sub i, a`
			`jbe innerend`
			`lea x,[rcx+8*a]`
			`lea y,[r9+8*d-8]`

			`innerloop:`
			`mov rax, [y+8*i]`
			`mul QWORD PTR [x]`
			`add x, 8`
			`add l, rax`
			`adc h, rdx`
			`adc c, 0`
			`dec i`
			`jnz innerloop`

			`innerend:`

			`mov [z], l`
			`mov l, h`
			`mov h, c`
			`add z, 8`

			`cmp k, p`
			`jc outerloop`

			`end:`
			`pop r15`
			`pop r14`
			`pop r13`
			`pop r12`
			`pop rbp`
			`pop rbx`
			`#if WINDOWS_ABI`
			`pop rsi`
			`pop rdi`
			`#endif`
			`ret`

			`#if defined(__linux__) && defined(__ELF__)`
			`.section .note.GNU-stack,"",%progbits`
			`#endif`