diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 887ee1c..272f11f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -18,9 +18,12 @@ jobs:
       - cfg: {name: 'gcc10', runson: 'ubuntu-latest', container: 'debian:bullseye'}
       - cfg: {name: 'gcc11', runson: 'ubuntu-latest', container: 'ubuntu:jammy'}
       - cfg: {name: 'gcc12', runson: 'ubuntu-latest', container: 'debian:bookworm'}
+      - cfg: {name: 'gcc13', runson: 'ubuntu-latest', container: 'ubuntu:noble'}
       - cfg: {name: 'gcc-arch', runson: 'ubuntu-latest', container: 'archlinux'}
       - cfg: {name: 'clang-arch', runson: 'ubuntu-latest', container: 'archlinux',
              cmake: '-DCMAKE_CXX_COMPILER=clang++'}
+      - cfg: {name: 'clang-lto-arch', runson: 'ubuntu-latest', container: 'archlinux',
+             cmake: '-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_INTERPROCEDURAL_OPTIMIZATION=On'}
       - cfg: {name: 'clang-libc++-arch', runson: 'ubuntu-latest', container: 'archlinux',
              cmake: '-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-stdlib=libc++'}
       - cfg: {name: 'clang-UBSAN-arch', runson: 'ubuntu-latest', container: 'archlinux',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 238a0a7..3e7c7cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR AND NOT CMAKE_BUILD_TYPE A
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-project(bls12-381 VERSION 1.0.0)
+project(bls12-381 VERSION 1.0.0 LANGUAGES CXX ASM)
 
 file(GLOB BLS12-381_SOURCES CONFIGURE_DEPENDS src/*.cpp)
 file(GLOB BLS12-381_HEADERS CONFIGURE_DEPENDS include/bls12-381/*.hpp)
@@ -15,6 +15,11 @@ target_include_directories(bls12-381 PUBLIC include)
 set_target_properties(bls12-381 PROPERTIES PUBLIC_HEADER "${BLS12-381_HEADERS}")
 target_compile_features(bls12-381 PUBLIC cxx_std_20)
 
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64)
+    target_sources(bls12-381 PRIVATE src/arithmetic.s)
+    set_source_files_properties(src/arithmetic.s PROPERTIES COMPILE_FLAGS "-Wno-unused-command-line-argument")
+endif()
+
 # the tests
 enable_testing()
 add_subdirectory(test)
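
Since the assembly is now only compiled in on x86_64, a tiny ABI smoke test is cheap insurance. A minimal sketch of such a test (hypothetical, not part of this patch; it assumes `fp` exposes a public `uint64_t d[6]` and that the umbrella header is `bls12-381/bls12-381.hpp`):

```cpp
#include <bls12-381/bls12-381.hpp> // header path is an assumption
#include <cassert>

int main()
{
    using namespace bls12_381;
    fp a{}, b{}, z{};
    a.d[0] = 1;
    b.d[0] = 2;
    _add(&z, &a, &b); // 1 + 2 = 3, far below the modulus, so no reduction
    assert(z.d[0] == 3);
    for (int i = 1; i < 6; i++) assert(z.d[i] == 0);
    return 0;
}
```

The same test exercises the portable fallbacks when built on a non-x86_64 host, so one binary covers both code paths.
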
diff --git a/src/arithmetic.cpp b/src/arithmetic.cpp
index d5f2650..acbe906 100644
--- a/src/arithmetic.cpp
+++ b/src/arithmetic.cpp
@@ -8,68 +8,7 @@ using namespace std;
 namespace bls12_381
 {
 
-#ifdef __x86_64__
-void _add(fp* z, const fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // y => %rdx
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%r12;");
-    asm("mov 0x28(%rsi),%r13;");
-    asm("add (%rdx),%r8;");
-    asm("adc 0x08(%rdx),%r9;");
-    asm("adc 0x10(%rdx),%r10;");
-    asm("adc 0x18(%rdx),%r11;");
-    asm("adc 0x20(%rdx),%r12;");
-    asm("adc 0x28(%rdx),%r13;");
-    asm("mov %r8,%r14;");
-    asm("mov %r9,%r15;");
-    asm("mov %r10,%rcx;");
-    asm("mov %r11,%rdx;");
-    asm("mov %r12,%rsi;");
-    asm("mov %r13,%rbx;");
-    asm("mov $0xb9feffffffffaaab,%rax;");
-    asm("sub %rax,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%rax;");
-    asm("sbb %rax,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rax;");
-    asm("sbb %rax,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rax;");
-    asm("sbb %rax,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rax;");
-    asm("sbb %rax,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("sbb %rax,%rbx;");
-    asm("cmovae %r14,%r8;");
-    asm("cmovae %r15,%r9;");
-    asm("cmovae %rcx,%r10;");
-    asm("cmovae %rdx,%r11;");
-    asm("cmovae %rsi,%r12;");
-    asm("cmovae %rbx,%r13;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
+#ifndef __x86_64__
 void _add(fp* z, const fp* x, const fp* y)
 {
     uint64_t carry, _;
@@ -94,36 +33,7 @@ void _add(fp* z, const fp* x, const fp* y)
         tie(z->d[5], _) = Sub64(z->d[5], fp::MODULUS.d[5], b);
     }
 }
-#endif
 
-#ifdef __x86_64__
-void _ladd(fp* z, const fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // y => %rdx
-    // callee needs to restore registers r15, r14, r13, r12, rbx (if clobbered) before returning
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%rcx;");
-    asm("mov 0x28(%rsi),%rax;");
-    asm("add (%rdx),%r8;");
-    asm("adc 0x08(%rdx),%r9;");
-    asm("adc 0x10(%rdx),%r10;");
-    asm("adc 0x18(%rdx),%r11;");
-    asm("adc 0x20(%rdx),%rcx;");
-    asm("adc 0x28(%rdx),%rax;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %rcx,0x20(%rdi);");
-    asm("mov %rax,0x28(%rdi);");
-}
-#else
 void _ladd(fp* z, const fp* x, const fp* y)
 {
     uint64_t carry, _;
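
The portable branches lean on `Add64`/`Sub64` full-adder helpers throughout. A minimal sketch of their presumed semantics (my assumption: they mirror Go's `math/bits.Add64`/`Sub64`, returning the result plus the carry/borrow-out; the real definitions live elsewhere in this repository):

```cpp
#include <cstdint>
#include <tuple>

// (sum, carryOut) = x + y + carry, with carry in {0, 1}
std::tuple<uint64_t, uint64_t> Add64(uint64_t x, uint64_t y, uint64_t carry)
{
    uint64_t sum = x + y + carry;
    // carry-out trick from Hacker's Delight; valid because carry <= 1
    uint64_t carryOut = ((x & y) | ((x | y) & ~sum)) >> 63;
    return {sum, carryOut};
}

// (diff, borrowOut) = x - y - borrow, with borrow in {0, 1}
std::tuple<uint64_t, uint64_t> Sub64(uint64_t x, uint64_t y, uint64_t borrow)
{
    uint64_t diff = x - y - borrow;
    uint64_t borrowOut = ((~x & y) | (~(x ^ y) & diff)) >> 63;
    return {diff, borrowOut};
}
```

The assembly versions exist precisely because compilers do not reliably turn chains of these helpers into a single `add`/`adc` sequence.
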
@@ -134,69 +44,7 @@ void _ladd(fp* z, const fp* x, const fp* y)
     tie(z->d[4], carry) = Add64(x->d[4], y->d[4], carry);
     tie(z->d[5], _) = Add64(x->d[5], y->d[5], carry);
 }
-#endif
 
-#ifdef __x86_64__
-void _double(fp* z, const fp* x)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%r12;");
-    asm("mov 0x28(%rsi),%r13;");
-    asm("add %r8,%r8;");
-    asm("adc %r9,%r9;");
-    asm("adc %r10,%r10;");
-    asm("adc %r11,%r11;");
-    asm("adc %r12,%r12;");
-    asm("adc %r13,%r13;");
-    asm("mov %r8,%r14;");
-    asm("mov %r9,%r15;");
-    asm("mov %r10,%rcx;");
-    asm("mov %r11,%rdx;");
-    asm("mov %r12,%rsi;");
-    asm("mov %r13,%rbx;");
-    asm("mov $0xb9feffffffffaaab,%rax;");
-    asm("sub %rax,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%rax;");
-    asm("sbb %rax,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rax;");
-    asm("sbb %rax,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rax;");
-    asm("sbb %rax,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rax;");
-    asm("sbb %rax,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("sbb %rax,%rbx;");
-    asm("cmovae %r14,%r8;");
-    asm("cmovae %r15,%r9;");
-    asm("cmovae %rcx,%r10;");
-    asm("cmovae %rdx,%r11;");
-    asm("cmovae %rsi,%r12;");
-    asm("cmovae %rbx,%r13;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
 void _double(fp* z, const fp* x)
 {
     uint64_t carry, _;
@@ -221,35 +69,7 @@ void _double(fp* z, const fp* x)
         tie(z->d[5], _) = Sub64(z->d[5], fp::MODULUS.d[5], b);
     }
 }
-#endif
 
-#ifdef __x86_64__
-void _ldouble(fp* z, const fp* x)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx (if clobbered) before returning
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%rcx;");
-    asm("mov 0x28(%rsi),%rax;");
-    asm("add %r8,%r8;");
-    asm("adc %r9,%r9;");
-    asm("adc %r10,%r10;");
-    asm("adc %r11,%r11;");
-    asm("adc %rcx,%rcx;");
-    asm("adc %rax,%rax;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %rcx,0x20(%rdi);");
-    asm("mov %rax,0x28(%rdi);");
-}
-#else
 void _ldouble(fp* z, const fp* x)
 {
     uint64_t carry, _;
@@ -261,65 +81,7 @@ void _ldouble(fp* z, const fp* x)
     tie(z->d[4], carry) = Add64(x->d[4], x->d[4], carry);
     tie(z->d[5], _) = Add64(x->d[5], x->d[5], carry);
 }
-#endif
 
-#ifdef __x86_64__
-void _subtract(fp* z, const fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // y => %rdx
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("xor %rax,%rax;");
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%r12;");
-    asm("mov 0x28(%rsi),%r13;");
-    asm("sub (%rdx),%r8;");
-    asm("sbb 0x08(%rdx),%r9;");
-    asm("sbb 0x10(%rdx),%r10;");
-    asm("sbb 0x18(%rdx),%r11;");
-    asm("sbb 0x20(%rdx),%r12;");
-    asm("sbb 0x28(%rdx),%r13;");
-    asm("mov $0xb9feffffffffaaab,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rbx;");
-    asm("cmovae %rax,%r14;");
-    asm("cmovae %rax,%r15;");
-    asm("cmovae %rax,%rcx;");
-    asm("cmovae %rax,%rdx;");
-    asm("cmovae %rax,%rsi;");
-    asm("cmovae %rax,%rbx;");
-    asm("add %r14,%r8;");
-    asm("adc %r15,%r9;");
-    asm("adc %rcx,%r10;");
-    asm("adc %rdx,%r11;");
-    asm("adc %rsi,%r12;");
-    asm("adc %rbx,%r13;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
 void _subtract(fp* z, const fp* x, const fp* y)
 {
     uint64_t b;
@@ -340,36 +102,7 @@ void _subtract(fp* z, const fp* x, const fp* y)
         tie(z->d[5], _) = Add64(z->d[5], fp::MODULUS.d[5], c);
     }
 }
-#endif
 
-#ifdef __x86_64__
-void _lsubtract(fp* z, const fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // y => %rdx
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("mov (%rsi),%r8;");
-    asm("mov 0x08(%rsi),%r9;");
-    asm("mov 0x10(%rsi),%r10;");
-    asm("mov 0x18(%rsi),%r11;");
-    asm("mov 0x20(%rsi),%rcx;");
-    asm("mov 0x28(%rsi),%rax;");
-    asm("sub (%rdx),%r8;");
-    asm("sbb 0x08(%rdx),%r9;");
-    asm("sbb 0x10(%rdx),%r10;");
-    asm("sbb 0x18(%rdx),%r11;");
-    asm("sbb 0x20(%rdx),%rcx;");
-    asm("sbb 0x28(%rdx),%rax;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %rcx,0x20(%rdi);");
-    asm("mov %rax,0x28(%rdi);");
-}
-#else
 void _lsubtract(fp* z, const fp* x, const fp* y)
 {
     uint64_t b, _;
@@ -383,40 +116,15 @@ void _lsubtract(fp* z, const fp* x, const fp* y)
 }
 #endif
 
 #ifdef __x86_64__
-void __negate(fp* z, const fp* x)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("mov $0xb9feffffffffaaab,%r8;");
-    asm("mov $0x1eabfffeb153ffff,%r9;");
-    asm("mov $0x6730d2a0f6b0f624,%r10;");
-    asm("mov $0x64774b84f38512bf,%r11;");
-    asm("mov $0x4b1ba7b6434bacd7,%rcx;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("sub (%rsi),%r8;");
-    asm("sbb 0x08(%rsi),%r9;");
-    asm("sbb 0x10(%rsi),%r10;");
-    asm("sbb 0x18(%rsi),%r11;");
-    asm("sbb 0x20(%rsi),%rcx;");
-    asm("sbb 0x28(%rsi),%rax;");
-    asm("mov %r8, (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %rcx,0x20(%rdi);");
-    asm("mov %rax,0x28(%rdi);");
-}
+void __negate(fp* z, const fp* x);
 
 void _negate(fp* z, const fp* x)
 {
-    __negate(z, x);
-    // put zero check after __neg because gcc messes up %rdi in -O3 (doesn't restore it before inlining asm code)
     if(x->isZero())
     {
         *z = *x;
        return;
    }
+    __negate(z, x);
 }
 #else
 void _negate(fp* z, const fp* x)
%rdx,%r12;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - // i1 - asm("mov 0x08(%rsi),%rcx;"); - asm("mov $0,%rbx;"); - asm("mov (%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r8;"); - asm("adc %rdx,%r9;"); - asm("adc $0,%r10;"); - asm("adc $0,%rbx;"); - asm("push %r8;"); - asm("mov $0,%r8;"); - asm("mov 0x08(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r9;"); - asm("adc %rdx,%r10;"); - asm("adc %rbx,%r11;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x10(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r10;"); - asm("adc %rdx,%r11;"); - asm("adc %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x18(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r11;"); - asm("adc %rdx,%r12;"); - asm("adc %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x20(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc %rbx,%r14;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%r14;"); - // i2 - asm("mov 0x10(%rsi),%rcx;"); - asm("mov $0,%rbx;"); - asm("mov (%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r9;"); - asm("adc %rdx,%r10;"); - asm("adc $0,%r11;"); - asm("adc $0,%rbx;"); - asm("push %r9;"); - asm("mov $0,%r9;"); - asm("mov 0x08(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r10;"); - asm("adc %rdx,%r11;"); - asm("adc %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x10(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r11;"); - asm("adc %rdx,%r12;"); - asm("adc %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x18(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x20(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%r14;"); - asm("adc %rbx,%r15;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r14;"); - asm("adc %rdx,%r15;"); - // i3 - asm("mov 0x18(%rsi),%rcx;"); - asm("mov $0,%rbx;"); - asm("mov (%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r10;"); - asm("adc %rdx,%r11;"); - asm("adc $0,%r12;"); - asm("adc $0,%rbx;"); - asm("mov 0x08(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r11;"); - asm("adc %rdx,%r12;"); - asm("adc %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x10(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x18(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%r14;"); - asm("adc %rbx,%r15;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x20(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r14;"); - asm("adc %rdx,%r15;"); - asm("adc %rbx,%r8;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r15;"); - asm("adc %rdx,%r8;"); - // i4 - asm("mov 0x20(%rsi),%rcx;"); - asm("mov $0,%rbx;"); - asm("mov (%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r11;"); - asm("adc %rdx,%r12;"); - asm("adc $0,%r13;"); - asm("adc $0,%rbx;"); - asm("mov 0x08(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x10(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%r14;"); - asm("adc %rbx,%r15;"); - asm("mov 
$0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x18(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r14;"); - asm("adc %rdx,%r15;"); - asm("adc %rbx,%r8;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x20(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r15;"); - asm("adc %rdx,%r8;"); - asm("adc %rbx,%r9;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r8;"); - asm("adc %rdx,%r9;"); - // i5 - asm("mov 0x28(%rsi),%rcx;"); - asm("mov $0,%rbx;"); - asm("mov (%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc $0,%r14;"); - asm("adc $0,%rbx;"); - asm("mov 0x08(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%r14;"); - asm("adc %rbx,%r15;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x10(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r14;"); - asm("adc %rdx,%r15;"); - asm("adc %rbx,%r8;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x18(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r15;"); - asm("adc %rdx,%r8;"); - asm("adc %rbx,%r9;"); - asm("mov $0,%rbx;"); - asm("adc $0,%rbx;"); - asm("mov 0x20(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r8;"); - asm("adc %rdx,%r9;"); - asm("adc $0,%rbx;"); - asm("mov 0x28(%rdi),%rax;"); - asm("mul %rcx;"); - asm("add %rax,%r9;"); - asm("adc %rdx,%rbx;"); - - asm("pop %rsi;"); - asm("pop %rdi;"); - asm("pop %rcx;"); - asm("push %rbx;"); - asm("push %r9;"); - - // montgomery reduction - // i0 - asm("mov %rcx,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rcx;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rdi;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%rdi;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rsi;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%rsi;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r10;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r10;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r11;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r11;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r12;"); - asm("adc %rdx,%r13;"); - asm("adc $0,%rcx;"); - // i1 - asm("mov %rdi,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rdi;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rsi;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%rsi;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r10;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r10;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r11;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r11;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - 
asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r13;"); - asm("adc %rdx,%rcx;"); - asm("add %rbx,%r13;"); - asm("adc %rcx,%r14;"); - asm("mov $0,%rcx;"); - asm("adc $0,%rcx;"); - // i2 - asm("mov %rsi,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rsi;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r10;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r10;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r11;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r11;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r13;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r14;"); - asm("adc %rdx,%rcx;"); - asm("add %rbx,%r14;"); - asm("adc %rcx,%r15;"); - asm("mov $0,%rcx;"); - asm("adc $0,%rcx;"); - // i3 - asm("mov %r10,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r10;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r11;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r11;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r13;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r14;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r15;"); - asm("adc %rdx,%rcx;"); - asm("add %rbx,%r15;"); - asm("adc %rcx,%r8;"); - asm("mov $0,%rcx;"); - asm("adc $0,%rcx;"); - // i4 - asm("mov %r11,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r11;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r12;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r13;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r14;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r15;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r15;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r8;"); - asm("adc %rdx,%rcx;"); - asm("add %rbx,%r8;"); - asm("pop %rdi;"); - 
asm("adc %rcx,%rdi;"); - asm("mov $0,%rcx;"); - asm("adc $0,%rcx;"); - // i5 - asm("mov %r12,%rax;"); - asm("mulq (%rbp);"); - asm("mov %rax,%r9;"); - asm("mov $0,%rbx;"); - // j0 - asm("mov -0x08(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r12;"); - asm("adc %rdx,%rbx;"); - // j1 - asm("mov -0x10(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r13;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r13;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j2 - asm("mov -0x18(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r14;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r14;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j3 - asm("mov -0x20(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r15;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r15;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j4 - asm("mov -0x28(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%r8;"); - asm("adc $0,%rdx;"); - asm("add %rbx,%r8;"); - asm("mov $0,%rbx;"); - asm("adc %rdx,%rbx;"); - // j5 - asm("mov -0x30(%rbp),%rax;"); - asm("mul %r9;"); - asm("add %rax,%rdi;"); - asm("adc %rdx,%rcx;"); - asm("add %rbx,%rdi;"); - asm("pop %rsi;"); - asm("adc %rsi,%rcx;"); - - // modular reduction - asm("mov %r13,%r10;"); - asm("sub -0x08(%rbp),%r10;"); - asm("mov %r14,%r11;"); - asm("sbb -0x10(%rbp),%r11;"); - asm("mov %r15,%r12;"); - asm("sbb -0x18(%rbp),%r12;"); - asm("mov %r8,%rax;"); - asm("sbb -0x20(%rbp),%rax;"); - asm("mov %rdi,%rbx;"); - asm("sbb -0x28(%rbp),%rbx;"); - asm("mov %rcx,%r9;"); - asm("sbb -0x30(%rbp),%r9;"); - - // out - asm("pop %rsi;"); - asm("cmovae %r10,%r13;"); - asm("mov %r13, (%rsi);"); - asm("cmovae %r11,%r14;"); - asm("mov %r14,0x08(%rsi);"); - asm("cmovae %r12,%r15;"); - asm("mov %r15,0x10(%rsi);"); - asm("cmovae %rax,%r8;"); - asm("mov %r8,0x18(%rsi);"); - asm("cmovae %rbx,%rdi;"); - asm("mov %rdi,0x20(%rsi);"); - asm("cmovae %r9,%rcx;"); - asm("mov %rcx,0x28(%rsi);"); - - asm("pop %rax;"); - asm("pop %rax;"); - asm("pop %rax;"); - asm("pop %rax;"); - asm("pop %rax;"); - asm("pop %rax;"); - asm("pop %rax;"); - - asm("pop %rbx;"); - asm("pop %r12;"); - asm("pop %r13;"); - asm("pop %r14;"); - asm("pop %r15;"); - asm("pop %rbp;"); -} -void __mul_ex(fp* z, const fp* x, const fp* y) -{ - // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): - // z => %rdi (=> stack) - // x => %rsi - // y => %rdx (=> %rdi) - // callee needs to restore registers r15, r14, r13, r12, rbx before returning - asm("push %rbp;"); - asm("push %r15;"); - asm("push %r14;"); - asm("push %r13;"); - asm("push %r12;"); - asm("push %rbx;"); - - // this pushes the constants (INP and MODULUS) used in this function to the stack - // TODO: should be referenced from data section instead - asm("mov $0x89f3fffcfffcfffd,%rax;"); - asm("push %rax;"); - asm("mov %rsp,%rbp;"); - asm("mov $0xb9feffffffffaaab,%rax;"); - asm("push %rax;"); - asm("mov $0x1eabfffeb153ffff,%rax;"); - asm("push %rax;"); - asm("mov $0x6730d2a0f6b0f624,%rax;"); - asm("push %rax;"); - asm("mov $0x64774b84f38512bf,%rax;"); - asm("push %rax;"); - asm("mov $0x4b1ba7b6434bacd7,%rax;"); - asm("push %rax;"); - asm("mov $0x1a0111ea397fe69a,%rax;"); - asm("push %rax;"); - - asm("push %rdi;"); // save z for later - asm("mov %rdx,%rdi;"); // move y to %rdi to free up %rdx for use in mulx - asm("xor %rax,%rax;"); - // i0 - asm("mov (%rsi),%rdx;"); // x0 - asm("mulx (%rdi),%rax,%rcx;"); // x0 * y0 - asm("push %rax;"); - asm("mulx 0x08(%rdi),%rax,%r8;"); // x0 * y1 - asm("adcx %rax,%rcx;"); - asm("mulx 
-void __mul_ex(fp* z, const fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi (=> stack)
-    // x => %rsi
-    // y => %rdx (=> %rdi)
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %rbp;");
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-
-    // this pushes the constants (INP and MODULUS) used in this function to the stack
-    // TODO: should be referenced from data section instead
-    asm("mov $0x89f3fffcfffcfffd,%rax;");
-    asm("push %rax;");
-    asm("mov %rsp,%rbp;");
-    asm("mov $0xb9feffffffffaaab,%rax;");
-    asm("push %rax;");
-    asm("mov $0x1eabfffeb153ffff,%rax;");
-    asm("push %rax;");
-    asm("mov $0x6730d2a0f6b0f624,%rax;");
-    asm("push %rax;");
-    asm("mov $0x64774b84f38512bf,%rax;");
-    asm("push %rax;");
-    asm("mov $0x4b1ba7b6434bacd7,%rax;");
-    asm("push %rax;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("push %rax;");
-
-    asm("push %rdi;"); // save z for later
-    asm("mov %rdx,%rdi;"); // move y to %rdi to free up %rdx for use in mulx
-    asm("xor %rax,%rax;");
-    // i0
-    asm("mov (%rsi),%rdx;"); // x0
-    asm("mulx (%rdi),%rax,%rcx;"); // x0 * y0
-    asm("push %rax;");
-    asm("mulx 0x08(%rdi),%rax,%r8;"); // x0 * y1
-    asm("adcx %rax,%rcx;");
-    asm("mulx 0x10(%rdi),%rax,%r9;"); // x0 * y2
-    asm("adcx %rax,%r8;");
-    asm("mulx 0x18(%rdi),%rax,%r10;"); // x0 * y3
-    asm("adcx %rax,%r9;");
-    asm("mulx 0x20(%rdi),%rax,%r11;"); // x0 * y4
-    asm("adcx %rax,%r10;");
-    asm("mulx 0x28(%rdi),%rax,%r12;"); // x0 * y5
-    asm("adcx %rax,%r11;");
-    asm("adc $0,%r12;");
-    // i1
-    asm("mov 0x08(%rsi),%rdx;"); // x1
-    asm("xor %r13,%r13;");
-    asm("mulx (%rdi),%rax,%rbx;"); // x1 * y0
-    asm("adox %rax,%rcx;");
-    asm("adcx %rbx,%r8;");
-    asm("push %rcx;");
-    asm("mulx 0x08(%rdi),%rax,%rbx;"); // x1 * y1
-    asm("adox %rax,%r8;");
-    asm("adcx %rbx,%r9;");
-    asm("mulx 0x10(%rdi),%rax,%rbx;"); // x1 * y2
-    asm("adox %rax,%r9;");
-    asm("adcx %rbx,%r10;");
-    asm("mulx 0x18(%rdi),%rax,%rbx;"); // x1 * y3
-    asm("adox %rax,%r10;");
-    asm("adcx %rbx,%r11;");
-    asm("mulx 0x20(%rdi),%rax,%rbx;"); // x1 * y4
-    asm("adox %rax,%r11;");
-    asm("adcx %rbx,%r12;");
-    asm("mulx 0x28(%rdi),%rax,%rbx;"); // x1 * y5
-    asm("adox %rax,%r12;");
-    asm("adox %r13,%r13;");
-    asm("adcx %rbx,%r13;");
-    // i2
-    asm("mov 0x10(%rsi),%rdx;"); // x2
-    asm("xor %r14,%r14;");
-    asm("mulx (%rdi),%rax,%rbx;"); // x2 * y0
-    asm("adox %rax,%r8;");
-    asm("adcx %rbx,%r9;");
-    asm("mulx 0x08(%rdi),%rax,%rbx;"); // x2 * y1
-    asm("adox %rax,%r9;");
-    asm("adcx %rbx,%r10;");
-    asm("mulx 0x10(%rdi),%rax,%rbx;"); // x2 * y2
-    asm("adox %rax,%r10;");
-    asm("adcx %rbx,%r11;");
-    asm("mulx 0x18(%rdi),%rax,%rbx;"); // x2 * y3
-    asm("adox %rax,%r11;");
-    asm("adcx %rbx,%r12;");
-    asm("mulx 0x20(%rdi),%rax,%rbx;"); // x2 * y4
-    asm("adox %rax,%r12;");
-    asm("adcx %rbx,%r13;");
-    asm("mulx 0x28(%rdi),%rax,%rbx;"); // x2 * y5
-    asm("adox %rax,%r13;");
-    asm("adox %r14,%r14;");
-    asm("adcx %rbx,%r14;");
-    // i3
-    asm("mov 0x18(%rsi),%rdx;"); // x3
-    asm("xor %r15,%r15;");
-    asm("mulx (%rdi),%rax,%rbx;"); // x3 * y0
-    asm("adox %rax,%r9;");
-    asm("adcx %rbx,%r10;");
-    asm("mulx 0x08(%rdi),%rax,%rbx;"); // x3 * y1
-    asm("adox %rax,%r10;");
-    asm("adcx %rbx,%r11;");
-    asm("mulx 0x10(%rdi),%rax,%rbx;"); // x3 * y2
-    asm("adox %rax,%r11;");
-    asm("adcx %rbx,%r12;");
-    asm("mulx 0x18(%rdi),%rax,%rbx;"); // x3 * y3
-    asm("adox %rax,%r12;");
-    asm("adcx %rbx,%r13;");
-    asm("mulx 0x20(%rdi),%rax,%rbx;"); // x3 * y4
-    asm("adox %rax,%r13;");
-    asm("adcx %rbx,%r14;");
-    asm("mulx 0x28(%rdi),%rax,%rbx;"); // x3 * y5
-    asm("adox %rax,%r14;");
-    asm("adox %r15,%r15;");
-    asm("adcx %rbx,%r15;");
-    // i4
-    asm("mov 0x20(%rsi),%rdx;"); // x4
-    asm("xor %rcx,%rcx;");
-    asm("mulx (%rdi),%rax,%rbx;"); // x4 * y0
-    asm("adox %rax,%r10;");
-    asm("adcx %rbx,%r11;");
-    asm("mulx 0x08(%rdi),%rax,%rbx;"); // x4 * y1
-    asm("adox %rax,%r11;");
-    asm("adcx %rbx,%r12;");
-    asm("mulx 0x10(%rdi),%rax,%rbx;"); // x4 * y2
-    asm("adox %rax,%r12;");
-    asm("adcx %rbx,%r13;");
-    asm("mulx 0x18(%rdi),%rax,%rbx;"); // x4 * y3
-    asm("adox %rax,%r13;");
-    asm("adcx %rbx,%r14;");
-    asm("mulx 0x20(%rdi),%rax,%rbx;"); // x4 * y4
-    asm("adox %rax,%r14;");
-    asm("adcx %rbx,%r15;");
-    asm("mulx 0x28(%rdi),%rax,%rbx;"); // x4 * y5
-    asm("adox %rax,%r15;");
-    asm("adox %rcx,%rcx;");
-    asm("adcx %rbx,%rcx;");
-    // i5
-    asm("mov 0x28(%rsi),%rdx;"); // x5
-    asm("xor %rsi,%rsi;");
-    asm("mulx (%rdi),%rax,%rbx;"); // x5 * y0
-    asm("adox %rax,%r11;");
-    asm("adcx %rbx,%r12;");
-    asm("mulx 0x08(%rdi),%rax,%rbx;"); // x5 * y1
-    asm("adox %rax,%r12;");
-    asm("adcx %rbx,%r13;");
-    asm("mulx 0x10(%rdi),%rax,%rbx;"); // x5 * y2
-    asm("adox %rax,%r13;");
-    asm("adcx %rbx,%r14;");
-    asm("mulx 0x18(%rdi),%rax,%rbx;"); // x5 * y3
-    asm("adox %rax,%r14;");
-    asm("adcx %rbx,%r15;");
-    asm("mulx 0x20(%rdi),%rax,%rbx;"); // x5 * y4
-    asm("adox %rax,%r15;");
-    asm("adcx %rbx,%rcx;");
-    asm("mulx 0x28(%rdi),%rax,%rbx;"); // x5 * y5
-    asm("adox %rax,%rcx;");
-    asm("adox %rbx,%rsi;");
-    asm("adc $0,%rsi;");
-
-    asm("pop %rdi;");
-    asm("pop %rbx;");
-    asm("push %rsi;");
-
-    // montgomery reduction
-    asm("xor %rax,%rax;");
-    // i0
-    asm("mov %rbx, %rdx");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%rbx;");
-    asm("adcx %rsi,%rdi;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%rdi;");
-    asm("adcx %rsi,%r8;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r8;");
-    asm("adcx %rsi,%r9;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r9;");
-    asm("adcx %rsi,%r10;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r10;");
-    asm("adcx %rsi,%r11;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("adox %rbx,%r12;");
-    asm("adcx %rbx,%rbx;");
-    asm("mov $0,%rax;");
-    asm("adox %rax,%rbx;");
-    asm("xor %rax,%rax;");
-    // i1
-    asm("mov %rdi,%rdx;");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%rdi;");
-    asm("adcx %rsi,%r8;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%r8;");
-    asm("adcx %rsi,%r9;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r9;");
-    asm("adcx %rsi,%r10;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r10;");
-    asm("adcx %rsi,%r11;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%r12;");
-    asm("adcx %rsi,%r13;");
-    asm("adox %rbx,%r13;");
-    asm("adcx %rdi,%rdi;");
-    asm("mov $0,%rax;");
-    asm("adox %rax,%rdi;");
-    asm("xor %rax,%rax;");
-    // i2
-    asm("mov %r8,%rdx;");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%r8;");
-    asm("adcx %rsi,%r9;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%r9;");
-    asm("adcx %rsi,%r10;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r10;");
-    asm("adcx %rsi,%r11;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r12;");
-    asm("adcx %rsi,%r13;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%r13;");
-    asm("adcx %rsi,%r14;");
-    asm("adox %rdi,%r14;");
-    asm("adcx %r8,%r8;");
-    asm("mov $0,%rax;");
-    asm("adox %rax,%r8;");
-    asm("xor %rax,%rax;");
-    // i3
-    asm("mov %r9,%rdx;");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%r9;");
-    asm("adcx %rsi,%r10;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%r10;");
-    asm("adcx %rsi,%r11;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r12;");
-    asm("adcx %rsi,%r13;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r13;");
-    asm("adcx %rsi,%r14;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%r14;");
-    asm("adcx %rsi,%r15;");
-    asm("adox %r8,%r15;");
-    asm("adcx %r9,%r9;");
-    asm("mov $0,%rax;");
-    asm("adox %rax,%r9;");
-    asm("xor %rax,%rax;");
-    // i4
-    asm("mov %r10,%rdx;");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%r10;");
-    asm("adcx %rsi,%r11;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r12;");
-    asm("adcx %rsi,%r13;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r13;");
-    asm("adcx %rsi,%r14;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r14;");
-    asm("adcx %rsi,%r15;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%r15;");
-    asm("adcx %rsi,%rcx;");
-    asm("adox %r9,%rcx;");
-    asm("adcx %r10,%r10;");
-    asm("mov $0,%rax;");
-    asm("adox %rax,%r10;");
-    asm("xor %rax,%rax;");
-    // i5
-    asm("mov %r11,%rdx;");
-    asm("mulx (%rbp),%rdx,%rsi;");
-    asm("mulx -0x08(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); // j0
-    asm("adox %rax,%r11;");
-    asm("adcx %rsi,%r12;");
-    asm("mulx -0x10(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); // j1
-    asm("adox %rax,%r12;");
-    asm("adcx %rsi,%r13;");
-    asm("mulx -0x18(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); // j2
-    asm("adox %rax,%r13;");
-    asm("adcx %rsi,%r14;");
-    asm("mulx -0x20(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); // j3
-    asm("adox %rax,%r14;");
-    asm("adcx %rsi,%r15;");
-    asm("mulx -0x28(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); // j4
-    asm("adox %rax,%r15;");
-    asm("adcx %rsi,%rcx;");
-    asm("mulx -0x30(%rbp),%rax,%rsi;");
-    //asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); // j5
-    asm("adox %rax,%rcx;");
-    asm("pop %rbx;");
-    asm("adcx %rsi,%rbx;");
-    asm("adox %r10,%rbx;");
-    // modular reduction
-    asm("mov %r12,%rax;");
-    asm("sub -0x08(%rbp),%rax;");
-    //asm("sub %0,%%rax;" : : "m" (fp::MODULUS.d[0]));
-    asm("mov %r13,%rsi;");
-    asm("sbb -0x10(%rbp),%rsi;");
-    //asm("sbb %0,%%rsi;" : : "m" (fp::MODULUS.d[1]));
-    asm("mov %r14,%rdi;");
-    asm("sbb -0x18(%rbp),%rdi;");
-    //asm("sbb %0,%%rdi;" : : "m" (fp::MODULUS.d[2]));
-    asm("mov %r15,%r8;");
-    asm("sbb -0x20(%rbp),%r8;");
-    //asm("sbb %0,%%r8;" : : "m" (fp::MODULUS.d[3]));
-    asm("mov %rcx,%r9;");
-    asm("sbb -0x28(%rbp),%r9;");
-    //asm("sbb %0,%%r9;" : : "m" (fp::MODULUS.d[4]));
-    asm("mov %rbx,%r10;");
-    asm("sbb -0x30(%rbp),%r10;");
-    //asm("sbb %0,%%r10;" : : "m" (fp::MODULUS.d[5]));
-    // out
-    asm("pop %r11;");
-    asm("cmovae %rax,%r12;");
-    asm("mov %r12,(%r11);");
-    asm("cmovae %rsi,%r13;");
-    asm("mov %r13,0x8(%r11);");
-    asm("cmovae %rdi,%r14;");
-    asm("mov %r14,0x10(%r11);");
-    asm("cmovae %r8,%r15;");
-    asm("mov %r15,0x18(%r11);");
-    asm("cmovae %r9,%rcx");
-    asm("mov %rcx,0x20(%r11);");
-    asm("cmovae %r10,%rbx;");
-    asm("mov %rbx,0x28(%r11);");
-
-    asm("pop %rax;");
-    asm("pop %rax;");
-    asm("pop %rax;");
-    asm("pop %rax;");
-    asm("pop %rax;");
-    asm("pop %rax;");
-    asm("pop %rax;");
-
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-    asm("pop %rbp;");
-}
+void __multiply(fp* z, const fp* x, const fp* y);
+void __mul_ex(fp* z, const fp* x, const fp* y);
 
 typedef void (*blsmul_func_t)(fp*, const fp*, const fp*);
 
@@ -1646,16 +319,6 @@ void _multiply(fp* z, const fp* x, const fp* y)
 #ifdef __x86_64__
 void _square(fp* z, const fp* x)
 {
-    #ifdef __clang__
-    // The clang compiler completely optimizes out the _square() function and inlines __multiply() wherever
-    // it occurs. However, for some reason the compiler forgets that it has to move the third
-    // parameter ('y') of __multiply() into %rdx according to the calling convention. The first two
-    // parameters, 'z' (%rdi) and 'x' (%rsi), are set properly because they are the exact same as for
-    // __square(). But the third parameter (which should be 'x' as well) is somehow ignored by the
-    // clang compiler. So we need to help out by moving it into %rdx before calling __multiply().
-    // This is probably a bug in clang!
-    asm("mov %rsi,%rdx;");
-    #endif
     __multiply(z, x, x);
 }
 #else
- asm("mov %rsi,%rdx;"); - #endif __multiply(z, x, x); } #else diff --git a/src/arithmetic.s b/src/arithmetic.s new file mode 100644 index 0000000..38c69c9 --- /dev/null +++ b/src/arithmetic.s @@ -0,0 +1,1311 @@ +.file "arithmetic.s" +.text + +# void bls12_381::_add(fp* z, const fp* x, const fp* y) +.globl _ZN9bls12_3814_addEPNS_2fpEPKS0_S3_ +.type _ZN9bls12_3814_addEPNS_2fpEPKS0_S3_, @function +_ZN9bls12_3814_addEPNS_2fpEPKS0_S3_: + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + mov (%rsi),%r8 + mov 0x08(%rsi),%r9 + mov 0x10(%rsi),%r10 + mov 0x18(%rsi),%r11 + mov 0x20(%rsi),%r12 + mov 0x28(%rsi),%r13 + add (%rdx),%r8 + adc 0x08(%rdx),%r9 + adc 0x10(%rdx),%r10 + adc 0x18(%rdx),%r11 + adc 0x20(%rdx),%r12 + adc 0x28(%rdx),%r13 + mov %r8,%r14 + mov %r9,%r15 + mov %r10,%rcx + mov %r11,%rdx + mov %r12,%rsi + mov %r13,%rbx + mov $0xb9feffffffffaaab,%rax + sub %rax,%r14 + mov $0x1eabfffeb153ffff,%rax + sbb %rax,%r15 + mov $0x6730d2a0f6b0f624,%rax + sbb %rax,%rcx + mov $0x64774b84f38512bf,%rax + sbb %rax,%rdx + mov $0x4b1ba7b6434bacd7,%rax + sbb %rax,%rsi + mov $0x1a0111ea397fe69a,%rax + sbb %rax,%rbx + cmovae %r14,%r8 + cmovae %r15,%r9 + cmovae %rcx,%r10 + cmovae %rdx,%r11 + cmovae %rsi,%r12 + cmovae %rbx,%r13 + mov %r8, (%rdi) + mov %r9, 0x08(%rdi) + mov %r10,0x10(%rdi) + mov %r11,0x18(%rdi) + mov %r12,0x20(%rdi) + mov %r13,0x28(%rdi) + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + retq +.size _ZN9bls12_3814_addEPNS_2fpEPKS0_S3_, .-_ZN9bls12_3814_addEPNS_2fpEPKS0_S3_ + +# void bls12_381::_ladd(fp* z, const fp* x, const fp* y) +.globl _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_ +.type _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_, @function +_ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_: + mov (%rsi),%r8 + mov 0x08(%rsi),%r9 + mov 0x10(%rsi),%r10 + mov 0x18(%rsi),%r11 + mov 0x20(%rsi),%rcx + mov 0x28(%rsi),%rax + add (%rdx),%r8 + adc 0x08(%rdx),%r9 + adc 0x10(%rdx),%r10 + adc 0x18(%rdx),%r11 + adc 0x20(%rdx),%rcx + adc 0x28(%rdx),%rax + mov %r8, (%rdi) + mov %r9, 0x08(%rdi) + mov %r10,0x10(%rdi) + mov %r11,0x18(%rdi) + mov %rcx,0x20(%rdi) + mov %rax,0x28(%rdi) + retq +.size _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_, .-_ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_ + +# void bls12_381::_double(fp* z, const fp* x) +.globl _ZN9bls12_3817_doubleEPNS_2fpEPKS0_ +.type _ZN9bls12_3817_doubleEPNS_2fpEPKS0_, @function +_ZN9bls12_3817_doubleEPNS_2fpEPKS0_: + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + mov (%rsi),%r8 + mov 0x08(%rsi),%r9 + mov 0x10(%rsi),%r10 + mov 0x18(%rsi),%r11 + mov 0x20(%rsi),%r12 + mov 0x28(%rsi),%r13 + add %r8,%r8 + adc %r9,%r9 + adc %r10,%r10 + adc %r11,%r11 + adc %r12,%r12 + adc %r13,%r13 + mov %r8,%r14 + mov %r9,%r15 + mov %r10,%rcx + mov %r11,%rdx + mov %r12,%rsi + mov %r13,%rbx + mov $0xb9feffffffffaaab,%rax + sub %rax,%r14 + mov $0x1eabfffeb153ffff,%rax + sbb %rax,%r15 + mov $0x6730d2a0f6b0f624,%rax + sbb %rax,%rcx + mov $0x64774b84f38512bf,%rax + sbb %rax,%rdx + mov $0x4b1ba7b6434bacd7,%rax + sbb %rax,%rsi + mov $0x1a0111ea397fe69a,%rax + sbb %rax,%rbx + cmovae %r14,%r8 + cmovae %r15,%r9 + cmovae %rcx,%r10 + cmovae %rdx,%r11 + cmovae %rsi,%r12 + cmovae %rbx,%r13 + mov %r8, (%rdi) + mov %r9, 0x08(%rdi) + mov %r10,0x10(%rdi) + mov %r11,0x18(%rdi) + mov %r12,0x20(%rdi) + mov %r13,0x28(%rdi) + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + retq +.size _ZN9bls12_3817_doubleEPNS_2fpEPKS0_, .-_ZN9bls12_3817_doubleEPNS_2fpEPKS0_ + +# void bls12_381::_ldouble(fp* z, const fp* x) +.globl _ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_ +.type _ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_, @function 
+
+# void bls12_381::_ladd(fp* z, const fp* x, const fp* y)
+.globl _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_
+.type _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_, @function
+_ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_:
+    mov (%rsi),%r8
+    mov 0x08(%rsi),%r9
+    mov 0x10(%rsi),%r10
+    mov 0x18(%rsi),%r11
+    mov 0x20(%rsi),%rcx
+    mov 0x28(%rsi),%rax
+    add (%rdx),%r8
+    adc 0x08(%rdx),%r9
+    adc 0x10(%rdx),%r10
+    adc 0x18(%rdx),%r11
+    adc 0x20(%rdx),%rcx
+    adc 0x28(%rdx),%rax
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %rcx,0x20(%rdi)
+    mov %rax,0x28(%rdi)
+    retq
+.size _ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_, .-_ZN9bls12_3815_laddEPNS_2fpEPKS0_S3_
+
+# void bls12_381::_double(fp* z, const fp* x)
+.globl _ZN9bls12_3817_doubleEPNS_2fpEPKS0_
+.type _ZN9bls12_3817_doubleEPNS_2fpEPKS0_, @function
+_ZN9bls12_3817_doubleEPNS_2fpEPKS0_:
+    push %r15
+    push %r14
+    push %r13
+    push %r12
+    push %rbx
+    mov (%rsi),%r8
+    mov 0x08(%rsi),%r9
+    mov 0x10(%rsi),%r10
+    mov 0x18(%rsi),%r11
+    mov 0x20(%rsi),%r12
+    mov 0x28(%rsi),%r13
+    add %r8,%r8
+    adc %r9,%r9
+    adc %r10,%r10
+    adc %r11,%r11
+    adc %r12,%r12
+    adc %r13,%r13
+    mov %r8,%r14
+    mov %r9,%r15
+    mov %r10,%rcx
+    mov %r11,%rdx
+    mov %r12,%rsi
+    mov %r13,%rbx
+    mov $0xb9feffffffffaaab,%rax
+    sub %rax,%r14
+    mov $0x1eabfffeb153ffff,%rax
+    sbb %rax,%r15
+    mov $0x6730d2a0f6b0f624,%rax
+    sbb %rax,%rcx
+    mov $0x64774b84f38512bf,%rax
+    sbb %rax,%rdx
+    mov $0x4b1ba7b6434bacd7,%rax
+    sbb %rax,%rsi
+    mov $0x1a0111ea397fe69a,%rax
+    sbb %rax,%rbx
+    cmovae %r14,%r8
+    cmovae %r15,%r9
+    cmovae %rcx,%r10
+    cmovae %rdx,%r11
+    cmovae %rsi,%r12
+    cmovae %rbx,%r13
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %r12,0x20(%rdi)
+    mov %r13,0x28(%rdi)
+    pop %rbx
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+    retq
+.size _ZN9bls12_3817_doubleEPNS_2fpEPKS0_, .-_ZN9bls12_3817_doubleEPNS_2fpEPKS0_
+
+# void bls12_381::_ldouble(fp* z, const fp* x)
+.globl _ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_
+.type _ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_, @function
+_ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_:
+    mov (%rsi),%r8
+    mov 0x08(%rsi),%r9
+    mov 0x10(%rsi),%r10
+    mov 0x18(%rsi),%r11
+    mov 0x20(%rsi),%rcx
+    mov 0x28(%rsi),%rax
+    add %r8,%r8
+    adc %r9,%r9
+    adc %r10,%r10
+    adc %r11,%r11
+    adc %rcx,%rcx
+    adc %rax,%rax
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %rcx,0x20(%rdi)
+    mov %rax,0x28(%rdi)
+    retq
+.size _ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_, .-_ZN9bls12_3818_ldoubleEPNS_2fpEPKS0_
+
+# void bls12_381::_subtract(fp* z, const fp* x, const fp* y)
+.globl _ZN9bls12_3819_subtractEPNS_2fpEPKS0_S3_
+.type _ZN9bls12_3819_subtractEPNS_2fpEPKS0_S3_, @function
+_ZN9bls12_3819_subtractEPNS_2fpEPKS0_S3_:
+    push %r15
+    push %r14
+    push %r13
+    push %r12
+    push %rbx
+    xor %rax,%rax
+    mov (%rsi),%r8
+    mov 0x08(%rsi),%r9
+    mov 0x10(%rsi),%r10
+    mov 0x18(%rsi),%r11
+    mov 0x20(%rsi),%r12
+    mov 0x28(%rsi),%r13
+    sub (%rdx),%r8
+    sbb 0x08(%rdx),%r9
+    sbb 0x10(%rdx),%r10
+    sbb 0x18(%rdx),%r11
+    sbb 0x20(%rdx),%r12
+    sbb 0x28(%rdx),%r13
+    mov $0xb9feffffffffaaab,%r14
+    mov $0x1eabfffeb153ffff,%r15
+    mov $0x6730d2a0f6b0f624,%rcx
+    mov $0x64774b84f38512bf,%rdx
+    mov $0x4b1ba7b6434bacd7,%rsi
+    mov $0x1a0111ea397fe69a,%rbx
+    cmovae %rax,%r14
+    cmovae %rax,%r15
+    cmovae %rax,%rcx
+    cmovae %rax,%rdx
+    cmovae %rax,%rsi
+    cmovae %rax,%rbx
+    add %r14,%r8
+    adc %r15,%r9
+    adc %rcx,%r10
+    adc %rdx,%r11
+    adc %rsi,%r12
+    adc %rbx,%r13
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %r12,0x20(%rdi)
+    mov %r13,0x28(%rdi)
+    pop %rbx
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+    retq
+.size _ZN9bls12_3819_subtractEPNS_2fpEPKS0_S3_, .-_ZN9bls12_3819_subtractEPNS_2fpEPKS0_S3_
+
+# void bls12_381::_lsubtract(fp* z, const fp* x, const fp* y)
+.globl _ZN9bls12_38110_lsubtractEPNS_2fpEPKS0_S3_
+.type _ZN9bls12_38110_lsubtractEPNS_2fpEPKS0_S3_, @function
+_ZN9bls12_38110_lsubtractEPNS_2fpEPKS0_S3_:
+    mov (%rsi),%r8
+    mov 0x08(%rsi),%r9
+    mov 0x10(%rsi),%r10
+    mov 0x18(%rsi),%r11
+    mov 0x20(%rsi),%rcx
+    mov 0x28(%rsi),%rax
+    sub (%rdx),%r8
+    sbb 0x08(%rdx),%r9
+    sbb 0x10(%rdx),%r10
+    sbb 0x18(%rdx),%r11
+    sbb 0x20(%rdx),%rcx
+    sbb 0x28(%rdx),%rax
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %rcx,0x20(%rdi)
+    mov %rax,0x28(%rdi)
+    retq
+.size _ZN9bls12_38110_lsubtractEPNS_2fpEPKS0_S3_, .-_ZN9bls12_38110_lsubtractEPNS_2fpEPKS0_S3_
+
+# void bls12_381::__negate(fp* z, const fp* x)
+.globl _ZN9bls12_3818__negateEPNS_2fpEPKS0_
+.type _ZN9bls12_3818__negateEPNS_2fpEPKS0_, @function
+_ZN9bls12_3818__negateEPNS_2fpEPKS0_:
+    mov $0xb9feffffffffaaab,%r8
+    mov $0x1eabfffeb153ffff,%r9
+    mov $0x6730d2a0f6b0f624,%r10
+    mov $0x64774b84f38512bf,%r11
+    mov $0x4b1ba7b6434bacd7,%rcx
+    mov $0x1a0111ea397fe69a,%rax
+    sub (%rsi),%r8
+    sbb 0x08(%rsi),%r9
+    sbb 0x10(%rsi),%r10
+    sbb 0x18(%rsi),%r11
+    sbb 0x20(%rsi),%rcx
+    sbb 0x28(%rsi),%rax
+    mov %r8, (%rdi)
+    mov %r9, 0x08(%rdi)
+    mov %r10,0x10(%rdi)
+    mov %r11,0x18(%rdi)
+    mov %rcx,0x20(%rdi)
+    mov %rax,0x28(%rdi)
+    retq
+.size _ZN9bls12_3818__negateEPNS_2fpEPKS0_, .-_ZN9bls12_3818__negateEPNS_2fpEPKS0_
+
+# void bls12_381::__multiply(fp* z, const fp* x, const fp* y)
+.globl _ZN9bls12_38110__multiplyEPNS_2fpEPKS0_S3_
+.type _ZN9bls12_38110__multiplyEPNS_2fpEPKS0_S3_, @function
+_ZN9bls12_38110__multiplyEPNS_2fpEPKS0_S3_:
+    push %rbp
+    push %r15
+    push %r14
+    push %r13
+    push %r12
+    push %rbx
+
+    # this pushes the constants (INP and MODULUS) used in this function to the stack
+    # TODO: should be referenced from data section instead
+    mov $0x89f3fffcfffcfffd,%rax
+    push %rax
+    mov %rsp,%rbp
+    mov $0xb9feffffffffaaab,%rax
+    push %rax
+    mov $0x1eabfffeb153ffff,%rax
+    push %rax
+    mov $0x6730d2a0f6b0f624,%rax
+    push %rax
+    mov $0x64774b84f38512bf,%rax
+    push %rax
+    mov $0x4b1ba7b6434bacd7,%rax
+    push %rax
+    mov $0x1a0111ea397fe69a,%rax
+    push %rax
+
+    mov $0,%r9
+    mov $0,%r10
+    mov $0,%r11
+    mov $0,%r12
+    mov $0,%r13
+    mov $0,%r14
+    mov $0,%r15
+
+    push %rdi # save z for later
+    mov %rdx,%rdi # move y to %rdi to free up %rdx for use in mul
+    # i0
+    mov (%rsi),%rcx
+    mov (%rdi),%rax
+    mul %rcx
+    push %rax
+    mov %rdx,%r8
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r8
+    adc %rdx,%r9
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r9
+    adc %rdx,%r10
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r10
+    adc %rdx,%r11
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r11
+    adc %rdx,%r12
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    # i1
+    mov 0x08(%rsi),%rcx
+    mov $0,%rbx
+    mov (%rdi),%rax
+    mul %rcx
+    add %rax,%r8
+    adc %rdx,%r9
+    adc $0,%r10
+    adc $0,%rbx
+    push %r8
+    mov $0,%r8
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r9
+    adc %rdx,%r10
+    adc %rbx,%r11
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r10
+    adc %rdx,%r11
+    adc %rbx,%r12
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r11
+    adc %rdx,%r12
+    adc %rbx,%r13
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    adc %rbx,%r14
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r13
+    adc %rdx,%r14
+    # i2
+    mov 0x10(%rsi),%rcx
+    mov $0,%rbx
+    mov (%rdi),%rax
+    mul %rcx
+    add %rax,%r9
+    adc %rdx,%r10
+    adc $0,%r11
+    adc $0,%rbx
+    push %r9
+    mov $0,%r9
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r10
+    adc %rdx,%r11
+    adc %rbx,%r12
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r11
+    adc %rdx,%r12
+    adc %rbx,%r13
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    adc %rbx,%r14
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r13
+    adc %rdx,%r14
+    adc %rbx,%r15
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r14
+    adc %rdx,%r15
+    # i3
+    mov 0x18(%rsi),%rcx
+    mov $0,%rbx
+    mov (%rdi),%rax
+    mul %rcx
+    add %rax,%r10
+    adc %rdx,%r11
+    adc $0,%r12
+    adc $0,%rbx
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r11
+    adc %rdx,%r12
+    adc %rbx,%r13
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    adc %rbx,%r14
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r13
+    adc %rdx,%r14
+    adc %rbx,%r15
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r14
+    adc %rdx,%r15
+    adc %rbx,%r8
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r15
+    adc %rdx,%r8
+    # i4
+    mov 0x20(%rsi),%rcx
+    mov $0,%rbx
+    mov (%rdi),%rax
+    mul %rcx
+    add %rax,%r11
+    adc %rdx,%r12
+    adc $0,%r13
+    adc $0,%rbx
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    adc %rbx,%r14
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r13
+    adc %rdx,%r14
+    adc %rbx,%r15
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r14
+    adc %rdx,%r15
+    adc %rbx,%r8
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r15
+    adc %rdx,%r8
+    adc %rbx,%r9
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r8
+    adc %rdx,%r9
+    # i5
+    mov 0x28(%rsi),%rcx
+    mov $0,%rbx
+    mov (%rdi),%rax
+    mul %rcx
+    add %rax,%r12
+    adc %rdx,%r13
+    adc $0,%r14
+    adc $0,%rbx
+    mov 0x08(%rdi),%rax
+    mul %rcx
+    add %rax,%r13
+    adc %rdx,%r14
+    adc %rbx,%r15
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x10(%rdi),%rax
+    mul %rcx
+    add %rax,%r14
+    adc %rdx,%r15
+    adc %rbx,%r8
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x18(%rdi),%rax
+    mul %rcx
+    add %rax,%r15
+    adc %rdx,%r8
+    adc %rbx,%r9
+    mov $0,%rbx
+    adc $0,%rbx
+    mov 0x20(%rdi),%rax
+    mul %rcx
+    add %rax,%r8
+    adc %rdx,%r9
+    adc $0,%rbx
+    mov 0x28(%rdi),%rax
+    mul %rcx
+    add %rax,%r9
+    adc %rdx,%rbx
+
+    pop %rsi
+    pop %rdi
+    pop %rcx
+    push %rbx
+    push %r9
+
+    # montgomery reduction
+    # i0
+    mov %rcx,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%rcx
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%rdi
+    adc $0,%rdx
+    add %rbx,%rdi
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%rsi
+    adc $0,%rdx
+    add %rbx,%rsi
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r10
+    adc $0,%rdx
+    add %rbx,%r10
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r11
+    adc $0,%rdx
+    add %rbx,%r11
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc $0,%rdx
+    add %rbx,%r12
+    adc %rdx,%r13
+    adc $0,%rcx
+    # i1
+    mov %rdi,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%rdi
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%rsi
+    adc $0,%rdx
+    add %rbx,%rsi
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%r10
+    adc $0,%rdx
+    add %rbx,%r10
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r11
+    adc $0,%rdx
+    add %rbx,%r11
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc $0,%rdx
+    add %rbx,%r12
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%r13
+    adc %rdx,%rcx
+    add %rbx,%r13
+    adc %rcx,%r14
+    mov $0,%rcx
+    adc $0,%rcx
+    # i2
+    mov %rsi,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%rsi
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%r10
+    adc $0,%rdx
+    add %rbx,%r10
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%r11
+    adc $0,%rdx
+    add %rbx,%r11
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc $0,%rdx
+    add %rbx,%r12
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r13
+    adc $0,%rdx
+    add %rbx,%r13
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%r14
+    adc %rdx,%rcx
+    add %rbx,%r14
+    adc %rcx,%r15
+    mov $0,%rcx
+    adc $0,%rcx
+    # i3
+    mov %r10,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%r10
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%r11
+    adc $0,%rdx
+    add %rbx,%r11
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc $0,%rdx
+    add %rbx,%r12
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r13
+    adc $0,%rdx
+    add %rbx,%r13
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r14
+    adc $0,%rdx
+    add %rbx,%r14
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%r15
+    adc %rdx,%rcx
+    add %rbx,%r15
+    adc %rcx,%r8
+    mov $0,%rcx
+    adc $0,%rcx
+    # i4
+    mov %r11,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%r11
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc $0,%rdx
+    add %rbx,%r12
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%r13
+    adc $0,%rdx
+    add %rbx,%r13
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r14
+    adc $0,%rdx
+    add %rbx,%r14
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r15
+    adc $0,%rdx
+    add %rbx,%r15
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%r8
+    adc %rdx,%rcx
+    add %rbx,%r8
+    pop %rdi
+    adc %rcx,%rdi
+    mov $0,%rcx
+    adc $0,%rcx
+    # i5
+    mov %r12,%rax
+    mulq (%rbp)
+    mov %rax,%r9
+    mov $0,%rbx
+    # j0
+    mov -0x08(%rbp),%rax
+    mul %r9
+    add %rax,%r12
+    adc %rdx,%rbx
+    # j1
+    mov -0x10(%rbp),%rax
+    mul %r9
+    add %rax,%r13
+    adc $0,%rdx
+    add %rbx,%r13
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j2
+    mov -0x18(%rbp),%rax
+    mul %r9
+    add %rax,%r14
+    adc $0,%rdx
+    add %rbx,%r14
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j3
+    mov -0x20(%rbp),%rax
+    mul %r9
+    add %rax,%r15
+    adc $0,%rdx
+    add %rbx,%r15
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j4
+    mov -0x28(%rbp),%rax
+    mul %r9
+    add %rax,%r8
+    adc $0,%rdx
+    add %rbx,%r8
+    mov $0,%rbx
+    adc %rdx,%rbx
+    # j5
+    mov -0x30(%rbp),%rax
+    mul %r9
+    add %rax,%rdi
+    adc %rdx,%rcx
+    add %rbx,%rdi
+    pop %rsi
+    adc %rsi,%rcx
+
+    # modular reduction
+    mov %r13,%r10
+    sub -0x08(%rbp),%r10
+    mov %r14,%r11
+    sbb -0x10(%rbp),%r11
+    mov %r15,%r12
+    sbb -0x18(%rbp),%r12
+    mov %r8,%rax
+    sbb -0x20(%rbp),%rax
+    mov %rdi,%rbx
+    sbb -0x28(%rbp),%rbx
+    mov %rcx,%r9
+    sbb -0x30(%rbp),%r9
+
+    # out
+    pop %rsi
+    cmovae %r10,%r13
+    mov %r13, (%rsi)
+    cmovae %r11,%r14
+    mov %r14,0x08(%rsi)
+    cmovae %r12,%r15
+    mov %r15,0x10(%rsi)
+    cmovae %rax,%r8
+    mov %r8,0x18(%rsi)
+    cmovae %rbx,%rdi
+    mov %rdi,0x20(%rsi)
+    cmovae %r9,%rcx
+    mov %rcx,0x28(%rsi)
+
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+
+    pop %rbx
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+    pop %rbp
+    retq
+.size _ZN9bls12_38110__multiplyEPNS_2fpEPKS0_S3_, .-_ZN9bls12_38110__multiplyEPNS_2fpEPKS0_S3_
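
`__mul_ex` below is the same multiply-then-reduce routine rewritten around `mulx` (BMI2) and the dual carry chains of `adcx`/`adox` (ADX): `mulx` reads one operand implicitly from %rdx and leaves the flags untouched, so two independent addition chains can run interleaved through CF and OF. The same pairing is reachable from C++ via intrinsics; a sketch of one partial-product row (illustrative only, not code from this repository; compile with `-mbmi2 -madx`):

```cpp
#include <immintrin.h>

// t[0..7] += a * b[0..5]; low halves go through one carry chain,
// high halves through a second one, as in the hand-written rows below
inline void muladd_row(unsigned long long t[8], unsigned long long a,
                       const unsigned long long b[6])
{
    unsigned long long hi[6], lo;
    unsigned char cf = 0;
    for (int j = 0; j < 6; j++)
    {
        lo = _mulx_u64(a, b[j], &hi[j]); // flag-free 64x64 -> 128 multiply
        cf = _addcarry_u64(cf, t[j], lo, &t[j]);
    }
    cf = _addcarry_u64(cf, t[6], 0, &t[6]);
    t[7] += cf; // spill of the low-half chain
    unsigned char of = 0;
    for (int j = 0; j < 6; j++) // fold in the high halves, second chain
        of = _addcarryx_u64(of, t[j + 1], hi[j], &t[j + 1]);
    t[7] += of; // spill of the high-half chain
}
```

Whether a compiler actually emits interleaved `adcx`/`adox` from the intrinsics is its own decision, which is exactly why this file spells the schedule out by hand.
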
+    # i2
+    mov 0x10(%rsi),%rdx # x2
+    xor %r14,%r14
+    mulx (%rdi),%rax,%rbx # x2 * y0
+    adox %rax,%r8
+    adcx %rbx,%r9
+    mulx 0x08(%rdi),%rax,%rbx # x2 * y1
+    adox %rax,%r9
+    adcx %rbx,%r10
+    mulx 0x10(%rdi),%rax,%rbx # x2 * y2
+    adox %rax,%r10
+    adcx %rbx,%r11
+    mulx 0x18(%rdi),%rax,%rbx # x2 * y3
+    adox %rax,%r11
+    adcx %rbx,%r12
+    mulx 0x20(%rdi),%rax,%rbx # x2 * y4
+    adox %rax,%r12
+    adcx %rbx,%r13
+    mulx 0x28(%rdi),%rax,%rbx # x2 * y5
+    adox %rax,%r13
+    adox %r14,%r14
+    adcx %rbx,%r14
+    # i3
+    mov 0x18(%rsi),%rdx # x3
+    xor %r15,%r15
+    mulx (%rdi),%rax,%rbx # x3 * y0
+    adox %rax,%r9
+    adcx %rbx,%r10
+    mulx 0x08(%rdi),%rax,%rbx # x3 * y1
+    adox %rax,%r10
+    adcx %rbx,%r11
+    mulx 0x10(%rdi),%rax,%rbx # x3 * y2
+    adox %rax,%r11
+    adcx %rbx,%r12
+    mulx 0x18(%rdi),%rax,%rbx # x3 * y3
+    adox %rax,%r12
+    adcx %rbx,%r13
+    mulx 0x20(%rdi),%rax,%rbx # x3 * y4
+    adox %rax,%r13
+    adcx %rbx,%r14
+    mulx 0x28(%rdi),%rax,%rbx # x3 * y5
+    adox %rax,%r14
+    adox %r15,%r15
+    adcx %rbx,%r15
+    # i4
+    mov 0x20(%rsi),%rdx # x4
+    xor %rcx,%rcx
+    mulx (%rdi),%rax,%rbx # x4 * y0
+    adox %rax,%r10
+    adcx %rbx,%r11
+    mulx 0x08(%rdi),%rax,%rbx # x4 * y1
+    adox %rax,%r11
+    adcx %rbx,%r12
+    mulx 0x10(%rdi),%rax,%rbx # x4 * y2
+    adox %rax,%r12
+    adcx %rbx,%r13
+    mulx 0x18(%rdi),%rax,%rbx # x4 * y3
+    adox %rax,%r13
+    adcx %rbx,%r14
+    mulx 0x20(%rdi),%rax,%rbx # x4 * y4
+    adox %rax,%r14
+    adcx %rbx,%r15
+    mulx 0x28(%rdi),%rax,%rbx # x4 * y5
+    adox %rax,%r15
+    adox %rcx,%rcx
+    adcx %rbx,%rcx
+    # i5
+    mov 0x28(%rsi),%rdx # x5
+    xor %rsi,%rsi
+    mulx (%rdi),%rax,%rbx # x5 * y0
+    adox %rax,%r11
+    adcx %rbx,%r12
+    mulx 0x08(%rdi),%rax,%rbx # x5 * y1
+    adox %rax,%r12
+    adcx %rbx,%r13
+    mulx 0x10(%rdi),%rax,%rbx # x5 * y2
+    adox %rax,%r13
+    adcx %rbx,%r14
+    mulx 0x18(%rdi),%rax,%rbx # x5 * y3
+    adox %rax,%r14
+    adcx %rbx,%r15
+    mulx 0x20(%rdi),%rax,%rbx # x5 * y4
+    adox %rax,%r15
+    adcx %rbx,%rcx
+    mulx 0x28(%rdi),%rax,%rbx # x5 * y5
+    adox %rax,%rcx
+    adox %rbx,%rsi
+    adc $0,%rsi
+
+    pop %rdi
+    pop %rbx
+    push %rsi
+
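+    # note: each reduction round below multiplies the lowest live limb by INP
+    # (at (%rbp)) via mulx to get the Montgomery factor in %rdx, then folds
+    # factor*MODULUS (limbs at -0x08(%rbp)..-0x30(%rbp)) into the running sum
+    # so the low limb cancels; the commented asm() statements appear to be
+    # carried over from the former inline-asm version in arithmetic.cpp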
+    # montgomery reduction
+    xor %rax,%rax
+    # i0
+    mov %rbx,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%rbx
+    adcx %rsi,%rdi
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%rdi
+    adcx %rsi,%r8
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r8
+    adcx %rsi,%r9
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r9
+    adcx %rsi,%r10
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r10
+    adcx %rsi,%r11
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%r11
+    adcx %rsi,%r12
+    adox %rbx,%r12
+    adcx %rbx,%rbx
+    mov $0,%rax
+    adox %rax,%rbx
+    xor %rax,%rax
+    # i1
+    mov %rdi,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%rdi
+    adcx %rsi,%r8
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%r8
+    adcx %rsi,%r9
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r9
+    adcx %rsi,%r10
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r10
+    adcx %rsi,%r11
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r11
+    adcx %rsi,%r12
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%r12
+    adcx %rsi,%r13
+    adox %rbx,%r13
+    adcx %rdi,%rdi
+    mov $0,%rax
+    adox %rax,%rdi
+    xor %rax,%rax
+    # i2
+    mov %r8,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%r8
+    adcx %rsi,%r9
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%r9
+    adcx %rsi,%r10
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r10
+    adcx %rsi,%r11
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r11
+    adcx %rsi,%r12
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r12
+    adcx %rsi,%r13
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%r13
+    adcx %rsi,%r14
+    adox %rdi,%r14
+    adcx %r8,%r8
+    mov $0,%rax
+    adox %rax,%r8
+    xor %rax,%rax
+    # i3
+    mov %r9,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%r9
+    adcx %rsi,%r10
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%r10
+    adcx %rsi,%r11
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r11
+    adcx %rsi,%r12
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r12
+    adcx %rsi,%r13
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r13
+    adcx %rsi,%r14
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%r14
+    adcx %rsi,%r15
+    adox %r8,%r15
+    adcx %r9,%r9
+    mov $0,%rax
+    adox %rax,%r9
+    xor %rax,%rax
+    # i4
+    mov %r10,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%r10
+    adcx %rsi,%r11
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%r11
+    adcx %rsi,%r12
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r12
+    adcx %rsi,%r13
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r13
+    adcx %rsi,%r14
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r14
+    adcx %rsi,%r15
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%r15
+    adcx %rsi,%rcx
+    adox %r9,%rcx
+    adcx %r10,%r10
+    mov $0,%rax
+    adox %rax,%r10
+    xor %rax,%rax
+    # i5
+    mov %r11,%rdx
+    mulx (%rbp),%rdx,%rsi
+    mulx -0x08(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rdx,%%rsi;" : : "m" (fp::INP));
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[0])); # j0
+    adox %rax,%r11
+    adcx %rsi,%r12
+    mulx -0x10(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[1])); # j1
+    adox %rax,%r12
+    adcx %rsi,%r13
+    mulx -0x18(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[2])); # j2
+    adox %rax,%r13
+    adcx %rsi,%r14
+    mulx -0x20(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[3])); # j3
+    adox %rax,%r14
+    adcx %rsi,%r15
+    mulx -0x28(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[4])); # j4
+    adox %rax,%r15
+    adcx %rsi,%rcx
+    mulx -0x30(%rbp),%rax,%rsi
+    #asm("mulx %0,%%rax,%%rsi;" : : "m" (fp::MODULUS.d[5])); # j5
+    adox %rax,%rcx
+    pop %rbx
+    adcx %rsi,%rbx
+    adox %r10,%rbx
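+    # note: the sub/sbb chain below leaves CF clear exactly when no borrow
+    # occurred (i.e. the value is >= MODULUS), so the cmovae instructions keep
+    # the subtracted copy in that case and the reduction stays branch-free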
+    # modular reduction
+    mov %r12,%rax
+    sub -0x08(%rbp),%rax
+    #asm("sub %0,%%rax;" : : "m" (fp::MODULUS.d[0]));
+    mov %r13,%rsi
+    sbb -0x10(%rbp),%rsi
+    #asm("sbb %0,%%rsi;" : : "m" (fp::MODULUS.d[1]));
+    mov %r14,%rdi
+    sbb -0x18(%rbp),%rdi
+    #asm("sbb %0,%%rdi;" : : "m" (fp::MODULUS.d[2]));
+    mov %r15,%r8
+    sbb -0x20(%rbp),%r8
+    #asm("sbb %0,%%r8;" : : "m" (fp::MODULUS.d[3]));
+    mov %rcx,%r9
+    sbb -0x28(%rbp),%r9
+    #asm("sbb %0,%%r9;" : : "m" (fp::MODULUS.d[4]));
+    mov %rbx,%r10
+    sbb -0x30(%rbp),%r10
+    #asm("sbb %0,%%r10;" : : "m" (fp::MODULUS.d[5]));
+    # out
+    pop %r11
+    cmovae %rax,%r12
+    mov %r12,(%r11)
+    cmovae %rsi,%r13
+    mov %r13,0x8(%r11)
+    cmovae %rdi,%r14
+    mov %r14,0x10(%r11)
+    cmovae %r8,%r15
+    mov %r15,0x18(%r11)
+    cmovae %r9,%rcx
+    mov %rcx,0x20(%r11)
+    cmovae %r10,%rbx
+    mov %rbx,0x28(%r11)
+
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+    pop %rax
+
+    pop %rbx
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+    pop %rbp
+    retq
+.size _ZN9bls12_3818__mul_exEPNS_2fpEPKS0_S3_, .-_ZN9bls12_3818__mul_exEPNS_2fpEPKS0_S3_
+
+.section .note.GNU-stack,"",@progbits # non-executable stack
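+
+# note: __mul_ex relies on the BMI2 (mulx) and ADX (adcx/adox) extensions,
+# while __multiply above only needs baseline x86_64 mul/adc; presumably this
+# lets callers select the variant supported by the host CPU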