diff --git a/src/blst/LICENSE b/src/blst/LICENSE new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/src/blst/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/blst/README.md b/src/blst/README.md new file mode 100644 index 0000000000..f865a3d03d --- /dev/null +++ b/src/blst/README.md @@ -0,0 +1,226 @@ +[![Build Status](https://api.travis-ci.com/supranational/blst.svg?branch=master)](https://travis-ci.com/github/supranational/blst) [![Actions status](https://github.com/supranational/blst/workflows/build/badge.svg)](https://github.com/supranational/blst/actions) [![CodeQL status](https://github.com/supranational/blst/workflows/CodeQL/badge.svg)](https://github.com/supranational/blst/actions/workflows/codeql-analysis.yml) +
+ +
+
+# blst
+blst (pronounced 'blast') is a BLS12-381 signature library focused on performance and security. It is written in C and assembly.
+
+## Table of Contents
+
+  * [Status](#status)
+  * [General notes on implementation](#general-notes-on-implementation)
+  * [Platform and Language Compatibility](#platform-and-language-compatibility)
+  * [API](#api)
+  * [Introductory Tutorial](#introductory-tutorial)
+    + [Public Keys and Signatures](#public-keys-and-signatures)
+    + [Signature Verification](#signature-verification)
+    + [Signature Aggregation](#signature-aggregation)
+    + [Serialization Format](#serialization-format)
+  * [Build](#build)
+    + [C static library](#c-static-library)
+  * [Language-specific notes](#language-specific-notes)
+    + [Go](#go)
+    + [Rust](#rust)
+  * [Repository Structure](#repository-structure)
+  * [Performance](#performance)
+  * [License](#license)
+
+## Status
+**This library is under active development**
+
+An initial audit of this library was conducted by NCC Group in January 2021 and can be found [here](https://research.nccgroup.com/wp-content/uploads/2021/01/NCC_Group_EthereumFoundation_ETHF002_Report_2021-01-20_v1.0.pdf).
+
+Formal verification of this library by Galois is ongoing and can be found [here](https://github.com/GaloisInc/BLST-Verification).
+
+This library is compliant with the following IETF specifications:
+- [IETF BLS Signature V5](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature)
+- [IETF RFC 9380 Hashing to Elliptic Curves](https://www.rfc-editor.org/rfc/rfc9380.html)
+
+The serialization format is implemented according to [the Zcash definition](#serialization-format).
+
+## General notes on implementation
+The goal of the blst library is to provide a foundational component for applications and other libraries that require high-performance, formally verified BLS12-381 operations. With that in mind, some decisions were made to maximize the public good beyond BLS12-381. For example, the field operations are optimized for general 384-bit usage, as opposed to being tuned specifically for the 381-bit BLS12-381 curve parameters. With the formal verification of these foundational components, we believe they can provide a reliable building block for other curves that call for high performance and an extra element of security.
+
+The library deliberately abstains from dealing with memory management and multi-threading, with the rationale that these ultimately belong in language-specific bindings. Another responsibility left to the application is random number generation. All of this is in the name of run-time neutrality, which makes integration into more stringent environments such as Intel SGX or ARM TrustZone trivial.
+
+## Platform and Language Compatibility
+
+This library primarily supports the x86_64 and ARM64 hardware platforms, and the Linux, Mac, and Windows operating systems. It does, however, have a portable replacement for the assembly modules, which can be compiled for a plethora of other platforms. Problem reports for these will be considered and are likely to be addressed.
+
+This repository includes explicit bindings for:
+- [Go](bindings/go)
+- [Rust](bindings/rust)
+
+Unless deemed appropriate to implement explicitly, bindings for other languages are provided using [SWIG](http://swig.org).
+Proof-of-concept scripts are available for:
+- [Python](bindings/python)
+- [Java](bindings/java)
+- [Node.js](bindings/node.js)
+- [Emscripten](bindings/emscripten)
+- [C#](bindings/c%23)
+
+## API
+
+The blst API is defined in the C header [bindings/blst.h](bindings/blst.h). The API can be categorized as follows, with some example operations:
+- Field Operations (add, sub, mul, neg, inv, to/from Montgomery)
+- Curve Operations (add, double, mul, to/from affine, group check)
+- Intermediate (hash to curve, pairing, serdes)
+- BLS12-381 signature (sign, verify, aggregate)
+
+Note: there is also an auxiliary header file, [bindings/blst_aux.h](bindings/blst_aux.h), that is used as a staging area for experimental interfaces that may or may not get promoted to blst.h.
+
+## Introductory Tutorial
+
+Programming is understanding, and understanding implies mastering the lingo. So we have a pair of additive groups being mapped to a multiplicative one... What does it mean? Well, this tutorial is not about explaining that, but rather about making the connection between what you're supposed to know about [pairing-based cryptography](https://en.wikipedia.org/wiki/Pairing-based_cryptography) and the interface provided by the library.
+
+### Public Keys and Signatures
+
+We have two elliptic curves, E1 and E2, points on which are contained in `blst_p1` and `blst_p2`, or `blst_p1_affine` and `blst_p2_affine` structures. Elements in the multiplicative group are held in a `blst_fp12` structure. One of the curves, or more specifically, a subset of its points that form a cyclic group, is chosen for public keys, and the other for signatures. The choice is denoted by the subroutines' suffixes, `_pk_in_g1` or `_pk_in_g2`. The most common choice appears to be the former, that is, `blst_p1` for public keys and `blst_p2` for signatures. But it all starts with a secret key...
+
+The secret key is held in a 256-bit `blst_scalar` structure, which can be instantiated with [`blst_keygen`](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature#section-2.3), or deserialized with `blst_scalar_from_bendian` or `blst_scalar_from_lendian` from a previously serialized byte sequence. It shouldn't come as a surprise that there are two uses for a secret key:
+
+- generating the associated public key, either with `blst_sk_to_pk_in_g1` or `blst_sk_to_pk_in_g2`;
+- performing a sign operation, either with `blst_sign_pk_in_g1` or `blst_sign_pk_in_g2`.
+
+As for signing, unlike what your intuition might suggest, `blst_sign_*` doesn't sign a message, but rather a point on the corresponding elliptic curve. You can obtain this point from a message by calling `blst_hash_to_g2` or `blst_encode_to_g2` (see the [IETF hash-to-curve](https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve#section-3) draft for the distinction). Another counter-intuitive aspect is the apparent g1 vs. g2 naming mismatch, in the sense that `blst_sign_pk_in_g1` accepts output from `blst_hash_to_g2`, and `blst_sign_pk_in_g2` accepts output from `blst_hash_to_g1`. This is because, as you should recall, public keys and signatures come from complementary groups.
+
+Now that you have a public key and signature, as points on the corresponding elliptic curves, you can serialize them with `blst_p1_serialize`/`blst_p1_compress` and `blst_p2_serialize`/`blst_p2_compress`, and send the resulting byte sequences over the network for deserialization/uncompression and verification.
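+
+To make that flow concrete, here is a short C sketch of the minimal-pubkey-size ("pk in G1") variant, using only the entry points mentioned so far. The `ikm` buffer, the message, and the domain separation tag are application-supplied placeholders, and error handling is omitted:
+```
+#include <string.h>
+#include "blst.h"
+
+/* ikm must hold at least 32 bytes of secret, unbiased randomness */
+void sign_example(const byte ikm[32], const byte *msg, size_t msg_len,
+                  byte pk_out[48], byte sig_out[96])
+{
+    const char *dst = "EXAMPLE-APP-V01-CS01-with-BLS12381G2_XMD:SHA-256_SSWU_RO_";
+
+    blst_scalar sk;
+    blst_keygen(&sk, ikm, 32, NULL, 0);         /* secret key                */
+
+    blst_p1 pk;
+    blst_sk_to_pk_in_g1(&pk, &sk);              /* public key is a G1 point  */
+    blst_p1_compress(pk_out, &pk);              /* 48-byte compressed form   */
+
+    blst_p2 msg_point, sig;
+    blst_hash_to_g2(&msg_point, msg, msg_len,
+                    (const byte *)dst, strlen(dst), NULL, 0);
+    blst_sign_pk_in_g1(&sig, &msg_point, &sk);  /* signature is a G2 point   */
+    blst_p2_compress(sig_out, &sig);            /* 96-byte compressed form   */
+}
+```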
+
+### Signature Verification
+
+Even though there are "single-shot" `blst_core_verify_pk_in_g1` and `blst_core_verify_pk_in_g2` subroutines, you should really familiarize yourself with the more generalized pairing interface. `blst_pairing` is an opaque structure, and the only thing you know about it is `blst_pairing_sizeof`, which is how much memory you're supposed to allocate for it. In order to verify an aggregated signature for a set of public keys and messages, or just one[!], you would:
+```
+blst_pairing_init(ctx, hash_or_encode, domain_separation_tag);
+blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, message[0]);
+blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, message[1]);
+...
+blst_pairing_commit(ctx);
+result = blst_pairing_finalverify(ctx, NULL);
+```
+**The essential point to note** is that it's the caller's responsibility to ensure that public keys are group-checked with `blst_p1_affine_in_g1`. This is because it's a relatively expensive operation, and it's naturally assumed that the application would cache the check's outcome. Signatures are group-checked internally. Not shown in the pseudo-code snippet above: the `aggregate` and `commit` calls return a `BLST_ERROR` denoting success or failure of the operation. The `finalverify` call, on the other hand, returns a boolean.
+
+Another, potentially more useful usage pattern is:
+```
+blst_p2_affine_in_g2(signature);
+blst_aggregated_in_g2(gtsig, signature);
+blst_pairing_init(ctx, hash_or_encode, domain_separation_tag);
+blst_pairing_aggregate_pk_in_g1(ctx, PK[0], NULL, message[0]);
+blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, message[1]);
+...
+blst_pairing_commit(ctx);
+result = blst_pairing_finalverify(ctx, gtsig);
+```
+What is useful about this pattern is that the aggregated signature can be handled (group-checked and converted with `blst_aggregated_in_g2`) in a separate thread. And while we are at it, the aggregate calls can also be executed in different threads. This naturally implies that each thread will operate on its own `blst_pairing` context, and the contexts will have to be combined with `blst_pairing_merge` as the threads join.
+
+### Signature Aggregation
+
+Aggregation is a trivial matter of performing point additions, with `blst_p2_add_or_double_affine` or `blst_p1_add_or_double_affine`. Note that the accumulator is a non-affine point.
+
+---
+
+That's about all you need to know to get started with the nitty-gritty of the actual function declarations.
+
+### Serialization Format
+
+From the Zcash BLS12-381 specification:
+
+* Fq elements are encoded in big-endian form. They occupy 48 bytes in this form.
+* Fq2 elements are encoded in big-endian form, meaning that the Fq2 element c0 + c1 * u is represented by the Fq element c1 followed by the Fq element c0. This means Fq2 elements occupy 96 bytes in this form.
+* The group G1 uses Fq elements for coordinates. The group G2 uses Fq2 elements for coordinates.
+* G1 and G2 elements can be encoded in uncompressed form (the x-coordinate followed by the y-coordinate) or in compressed form (just the x-coordinate). G1 elements occupy 96 bytes in uncompressed form, and 48 bytes in compressed form. G2 elements occupy 192 bytes in uncompressed form, and 96 bytes in compressed form.
+
+The most-significant three bits of a G1 or G2 encoding should be masked away before the coordinate(s) are interpreted. These bits are used to unambiguously represent the underlying element:
+
+* The most significant bit, when set, indicates that the point is in compressed form. Otherwise, the point is in uncompressed form.
+* The second-most significant bit indicates that the point is at infinity. If this bit is set, the remaining bits of the group element's encoding should be set to zero.
+* The third-most significant bit is set if (and only if) this point is in compressed form _and_ it is not the point at infinity _and_ its y-coordinate is the lexicographically largest of the two associated with the encoded x-coordinate.
+
+## Build
+The build process is very simple and only requires a C compiler. It's integrated into the Go and Rust ecosystems, so that respective users can go about it as they would with any other external module. Otherwise, a binary library would have to be compiled.
+
+### C static library
+A static library called libblst.a can be built in the current working directory of the user's choice:
+
+Linux, Mac, and Windows (in MinGW or Cygwin environments)
+```
+/some/where/build.sh
+```
+
+Windows (Visual C)
+```
+\some\where\build.bat
+```
+
+If the final application crashes with an "illegal instruction" exception [after copying to another system], pass `-D__BLST_PORTABLE__` on the `build.sh` command line. If you don't use build.sh, add said option to the `CFLAGS` environment variable. If you compile a Go application, you will need to modify the `CGO_CFLAGS` variable instead. And if you compile a Rust application, you can pass `--features portable` to `cargo build`. Alternatively, if you compile on an older Intel system but will execute the application on a newer one, consider instead passing `--features force-adx` for better performance.
+
+## Language-specific notes
+
+### [Go](bindings/go)
+There are two primary modes of operation that can be chosen based on type definitions in the application.
+
+For minimal-pubkey-size operations:
+```
+type PublicKey = blst.P1Affine
+type Signature = blst.P2Affine
+type AggregateSignature = blst.P2Aggregate
+type AggregatePublicKey = blst.P1Aggregate
+```
+
+For minimal-signature-size operations:
+```
+type PublicKey = blst.P2Affine
+type Signature = blst.P1Affine
+type AggregateSignature = blst.P1Aggregate
+type AggregatePublicKey = blst.P2Aggregate
+```
+
+For more details see the Go binding [readme](bindings/go/README.md).
+
+### [Rust](bindings/rust)
+[`blst`](https://crates.io/crates/blst) is the Rust binding crate.
+
+To use the min-pk version:
+```
+use blst::min_pk::*;
+```
+
+To use the min-sig version:
+```
+use blst::min_sig::*;
+```
+
+For more details see the Rust binding [readme](bindings/rust/README.md).
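+
+Whichever binding you use, the calls ultimately resolve to the C API in [bindings/blst.h](bindings/blst.h). As a point of reference, here is a short C sketch of the receiving side of the tutorial's flow: uncompress the public key and signature, group-check the public key, and verify with the one-shot CoreVerify entry point. The buffer names and the domain separation tag are illustrative placeholders:
+```
+#include <string.h>
+#include "blst.h"
+
+/* pk_comp/sig_comp are the 48- and 96-byte compressed points produced
+   by the signer; returns BLST_SUCCESS on a valid signature */
+BLST_ERROR verify_example(const byte pk_comp[48], const byte sig_comp[96],
+                          const byte *msg, size_t msg_len)
+{
+    const char *dst = "EXAMPLE-APP-V01-CS01-with-BLS12381G2_XMD:SHA-256_SSWU_RO_";
+    blst_p1_affine pk;
+    blst_p2_affine sig;
+    BLST_ERROR err;
+
+    if ((err = blst_p1_uncompress(&pk, pk_comp)) != BLST_SUCCESS)
+        return err;
+    if (!blst_p1_affine_in_g1(&pk))     /* reject points outside G1 */
+        return BLST_POINT_NOT_IN_GROUP;
+    if ((err = blst_p2_uncompress(&sig, sig_comp)) != BLST_SUCCESS)
+        return err;
+
+    return blst_core_verify_pk_in_g1(&pk, &sig, 1 /* hash, not encode */,
+                                     msg, msg_len,
+                                     (const byte *)dst, strlen(dst),
+                                     NULL, 0);
+}
+```
+Note that the domain separation tag must match the one used when the message was hashed for signing.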
+
+## Repository Structure
+
+**Root** - Contains various configuration files, documentation, licensing, and a build script
+* **Bindings** - Contains the files that define the blst interface
+  * blst.h - provides the C API to the blst library
+  * blst_aux.h - contains experimental functions not yet committed for long-term maintenance
+  * blst.hpp - provides a foundational class-oriented C++ interface to the blst library
+  * blst.swg - provides SWIG definitions for creating blst bindings for other languages, such as Java and Python
+  * **C#** - folder containing C# bindings and an example of how to use them
+  * **Emscripten** - folder containing an example of how to use Emscripten WebAssembly bindings from JavaScript
+  * **Go** - folder containing Go bindings for blst, including tests and benchmarks
+  * **Java** - folder containing an example of how to use SWIG Java bindings for blst
+  * **Node.js** - folder containing an example of how to use SWIG JavaScript bindings for blst
+  * **Python** - folder containing an example of how to use SWIG Python bindings for blst
+  * **Rust** - folder containing Rust bindings for blst, including tests and benchmarks
+  * **Vectors**
+    * **Hash_to_curve** - folder containing tests for hash_to_curve from the IETF specification
+* **Src** - folder containing C code for lower-level blst functions such as field operations, extension field operations, hash-to-field, and more
+  * **Asm** - folder containing Perl scripts that are used to generate assembly code for different hardware platforms, including x86 with ADX instructions, x86 without ADX instructions, and ARMv8, and for different [ABI](https://en.wikipedia.org/wiki/Application_binary_interface)s[1]
+* **Build** - folder containing a set of pre-generated assembly files for a variety of operating systems, along with maintenance scripts
+  * **Cheri** - assembly code for use on [CHERI](https://www.cl.cam.ac.uk/research/security/ctsrd/cheri/) platforms
+  * **Coff** - assembly code for use on Windows systems with the GNU toolchain
+  * **Elf** - assembly code for use on Unix systems
+  * **Mach-o** - assembly code for use on Apple operating systems
+  * **Win64** - assembly code for use on Windows systems with the Microsoft toolchain
+
+[1]: See [refresh.sh](build/refresh.sh) for usage. This method allows for simple reuse of optimized assembly across various platforms with minimal effort.
+
+## Performance
+Currently both the [Go](bindings/go) and [Rust](bindings/rust) bindings provide benchmarks for a variety of signature-related operations.
+
+## License
+The blst library is licensed under the [Apache License Version 2.0](LICENSE) software license.
diff --git a/src/blst/SECURITY.md b/src/blst/SECURITY.md
new file mode 100644
index 0000000000..547127b42a
--- /dev/null
+++ b/src/blst/SECURITY.md
@@ -0,0 +1,9 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+To report security issues, please send an e-mail to hello@supranational.net.
+
+For sensitive information or critical issues, please contact the above e-mail address with 'CRITICAL' in the subject line, and we will respond with a mechanism to communicate securely.
+
+Please try to provide a clear description of any issue reported, along with how to reproduce the issue if possible.
diff --git a/src/blst/bindings/blst.h b/src/blst/bindings/blst.h
new file mode 100644
index 0000000000..8e0a488cd7
--- /dev/null
+++ b/src/blst/bindings/blst.h
@@ -0,0 +1,488 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif !defined(bool) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +# else +# define bool int +# endif +# define __blst_h_bool__ +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specific Fr operations. 
+ */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specific Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specific Fp2 operations. + */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specific Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(void); +#endif // SWIG + +/* + * BLS12-381-specific point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(void); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(void); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(void); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(void); + +/* + * Multi-scalar multiplications and other multi-point operations. 
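+ *
+ * As illustrated by the C++ bindings in blst.hpp, the |points[]|
+ * arguments are arrays of pointers; a contiguous vector of points can
+ * be passed as a two-element array { base, NULL }. Scratch buffers are
+ * sized with the corresponding *_scratch_sizeof helpers below.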
+ */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. + */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. 
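+ *
+ * G1 points serialize to 96 bytes uncompressed or 48 bytes compressed,
+ * and G2 points to 192 or 96 bytes respectively. The top three bits of
+ * the first byte carry the compression/infinity/sign flags described in
+ * the README's "Serialization Format" section.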
+ */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. + */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], + const blst_p1_affine *const Ps[], + size_t n); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(void); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR 
blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. + */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#elif defined(__blst_h_bool__) +# undef __blst_h_bool__ +# undef bool +#endif +#endif diff --git a/src/blst/bindings/blst.hpp b/src/blst/bindings/blst.hpp new file mode 100644 index 0000000000..127755be05 --- /dev/null +++ b/src/blst/bindings/blst.hpp @@ -0,0 +1,979 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_HPP__ +#define __BLST_HPP__ + +#include +#include +#include +#include + +#if __cplusplus >= 201703L +# include +# ifndef app__string_view +# define app__string_view std::string_view // std::basic_string_view +# endif +#endif + +namespace blst { + +#if __cplusplus >= 201703L +static const app__string_view None; +#endif + +#if __cplusplus < 201103L && !defined(nullptr) +# ifdef __GNUG__ +# define nullptr __null +# elif !defined(_MSVC_LANG) || _MSVC_LANG < 201103L +# define nullptr 0 +# endif +#endif + +#ifdef __clang__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wextern-c-compat" +#endif + +#include "blst.h" + +#ifdef __clang__ +# pragma GCC diagnostic pop +#endif + +class P1_Affine; +class P1; +class P2_Affine; +class P2; +class Pairing; + +inline const byte *C_bytes(const void *ptr) +{ return static_cast(ptr); } + +/* + * As for SecretKey being struct and not class, and lack of constructors + * with one accepting for example |IKM|. We can't make assumptions about + * application's policy toward handling secret key material. Hence it's + * argued that application is entitled for transparent structure, not + * opaque or semi-opaque class. And in the context it's appropriate not + * to "entice" developers with idiomatic constructors:-) Though this + * doesn't really apply to SWIG-assisted interfaces... + */ +struct SecretKey { +#ifdef SWIG +private: +#endif + blst_scalar key; + +#ifdef SWIG +public: +#endif + void keygen(const byte* IKM, size_t IKM_len, + const std::string& info = "") + { blst_keygen(&key, IKM, IKM_len, C_bytes(info.data()), info.size()); } + void keygen_v3(const byte* IKM, size_t IKM_len, + const std::string& info = "") + { blst_keygen_v3(&key, IKM, IKM_len, C_bytes(info.data()), info.size()); } + void keygen_v4_5(const byte* IKM, size_t IKM_len, + const byte* salt, size_t salt_len, + const std::string& info = "") + { blst_keygen_v4_5(&key, IKM, IKM_len, salt, salt_len, + C_bytes(info.data()), info.size()); + } + void keygen_v5(const byte* IKM, size_t IKM_len, + const byte* salt, size_t salt_len, + const std::string& info = "") + { blst_keygen_v5(&key, IKM, IKM_len, salt, salt_len, + C_bytes(info.data()), info.size()); + } +#if __cplusplus >= 201703L + void keygen(const app__string_view IKM, // string_view by value, cool! + const std::string& info = "") + { keygen(C_bytes(IKM.data()), IKM.size(), info); } + void keygen_v3(const app__string_view IKM, // string_view by value, cool! + const std::string& info = "") + { keygen_v3(C_bytes(IKM.data()), IKM.size(), info); } + void keygen_v4_5(const app__string_view IKM, // string_view by value, cool! + const app__string_view salt, + const std::string& info = "") + { keygen_v4_5(C_bytes(IKM.data()), IKM.size(), + C_bytes(salt.data()), salt.size(), info); + } + void keygen_v5(const app__string_view IKM, // string_view by value, cool! 
+ const app__string_view salt, + const std::string& info = "") + { keygen_v5(C_bytes(IKM.data()), IKM.size(), + C_bytes(salt.data()), salt.size(), info); + } +#endif + void derive_master_eip2333(const byte* IKM, size_t IKM_len) + { blst_derive_master_eip2333(&key, IKM, IKM_len); } + void derive_child_eip2333(const SecretKey& SK, unsigned int child_index) + { blst_derive_child_eip2333(&key, &SK.key, child_index); } + + void from_bendian(const byte in[32]) { blst_scalar_from_bendian(&key, in); } + void from_lendian(const byte in[32]) { blst_scalar_from_lendian(&key, in); } + + void to_bendian(byte out[32]) const + { blst_bendian_from_scalar(out, &key); } + void to_lendian(byte out[32]) const + { blst_lendian_from_scalar(out, &key); } +}; + +class Scalar { +private: + blst_scalar val; + +public: + Scalar() { memset(&val, 0, sizeof(val)); } + Scalar(const byte* scalar, size_t nbits) + { blst_scalar_from_le_bytes(&val, scalar, (nbits+7)/8); } + Scalar(const byte *msg, size_t msg_len, const std::string& DST) + { (void)hash_to(msg, msg_len, DST); } +#if __cplusplus >= 201703L + Scalar(const app__string_view msg, const std::string& DST = "") + { (void)hash_to(C_bytes(msg.data()), msg.size(), DST); } +#endif + + Scalar* hash_to(const byte *msg, size_t msg_len, const std::string& DST = "") + { byte elem[48]; + blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, + C_bytes(DST.data()), DST.size()); + blst_scalar_from_be_bytes(&val, elem, sizeof(elem)); + return this; + } +#if __cplusplus >= 201703L + Scalar* hash_to(const app__string_view msg, const std::string& DST = "") + { return hash_to(C_bytes(msg.data()), msg.size(), DST); } +#endif + + Scalar dup() const { return *this; } + Scalar* from_bendian(const byte *msg, size_t msg_len) + { blst_scalar_from_be_bytes(&val, msg, msg_len); return this; } + Scalar* from_lendian(const byte *msg, size_t msg_len) + { blst_scalar_from_le_bytes(&val, msg, msg_len); return this; } + void to_bendian(byte out[32]) const + { blst_bendian_from_scalar(out, &val); } + void to_lendian(byte out[32]) const + { blst_lendian_from_scalar(out, &val); } + + Scalar* add(const Scalar& a) + { if (!blst_sk_add_n_check(&val, &val, a)) + throw BLST_BAD_SCALAR; + return this; + } + Scalar* add(const SecretKey& a) + { if (!blst_sk_add_n_check(&val, &val, &a.key)) + throw BLST_BAD_SCALAR; + return this; + } + Scalar* sub(const Scalar& a) + { if (!blst_sk_sub_n_check(&val, &val, a)) + throw BLST_BAD_SCALAR; + return this; + } + Scalar* mul(const Scalar& a) + { if (!blst_sk_mul_n_check(&val, &val, a)) + throw BLST_BAD_SCALAR; + return this; + } + Scalar* inverse() + { blst_sk_inverse(&val, &val); return this; } + +private: + friend class P1; + friend class P2; + operator const blst_scalar*() const { return &val; } + operator const byte*() const { return val.b; } +}; + +class P1_Affine { +private: + blst_p1_affine point; + + P1_Affine(const blst_p1_affine *cptr) { point = *cptr; } +public: + P1_Affine() { memset(&point, 0, sizeof(point)); } +#ifndef SWIG + P1_Affine(const byte *in) + { BLST_ERROR err = blst_p1_deserialize(&point, in); + if (err != BLST_SUCCESS) + throw err; + } +#endif + P1_Affine(const byte *in, size_t len) + { if (len == 0 || len != (in[0]&0x80 ? 
48 : 96)) + throw BLST_BAD_ENCODING; + BLST_ERROR err = blst_p1_deserialize(&point, in); + if (err != BLST_SUCCESS) + throw err; + } + P1_Affine(const P1& jacobian); + + P1_Affine dup() const { return *this; } + P1 to_jacobian() const; + void serialize(byte out[96]) const + { blst_p1_affine_serialize(out, &point); } + void compress(byte out[48]) const + { blst_p1_affine_compress(out, &point); } + bool on_curve() const { return blst_p1_affine_on_curve(&point); } + bool in_group() const { return blst_p1_affine_in_g1(&point); } + bool is_inf() const { return blst_p1_affine_is_inf(&point); } + bool is_equal(const P1_Affine& p) const + { return blst_p1_affine_is_equal(&point, &p.point); } + BLST_ERROR core_verify(const P2_Affine& pk, bool hash_or_encode, + const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) const; +#if __cplusplus >= 201703L + BLST_ERROR core_verify(const P2_Affine& pk, bool hash_or_encode, + const app__string_view msg, + const std::string& DST = "", + const app__string_view aug = None) const; +#endif + static P1_Affine generator() + { return P1_Affine(blst_p1_affine_generator()); } + +private: + friend class Pairing; + friend class P2_Affine; + friend class PT; + friend class P1; + friend class P1_Affines; + operator const blst_p1_affine*() const { return &point; } + operator blst_p1_affine*() { return &point; } +}; + +class P1 { +private: + blst_p1 point; + + P1(const blst_p1 *cptr) { point = *cptr; } +public: + P1() { memset(&point, 0, sizeof(point)); } + P1(const SecretKey& sk) { blst_sk_to_pk_in_g1(&point, &sk.key); } +#ifndef SWIG + P1(const byte *in) + { blst_p1_affine a; + BLST_ERROR err = blst_p1_deserialize(&a, in); + if (err != BLST_SUCCESS) + throw err; + blst_p1_from_affine(&point, &a); + } +#endif + P1(const byte *in, size_t len) + { if (len == 0 || len != (in[0]&0x80 ? 
48 : 96)) + throw BLST_BAD_ENCODING; + blst_p1_affine a; + BLST_ERROR err = blst_p1_deserialize(&a, in); + if (err != BLST_SUCCESS) + throw err; + blst_p1_from_affine(&point, &a); + } + P1(const P1_Affine& affine) { blst_p1_from_affine(&point, affine); } + + P1 dup() const { return *this; } + P1_Affine to_affine() const { return P1_Affine(*this); } + void serialize(byte out[96]) const { blst_p1_serialize(out, &point); } + void compress(byte out[48]) const { blst_p1_compress(out, &point); } + bool on_curve() const { return blst_p1_on_curve(&point); } + bool in_group() const { return blst_p1_in_g1(&point); } + bool is_inf() const { return blst_p1_is_inf(&point); } + bool is_equal(const P1& p) const + { return blst_p1_is_equal(&point, &p.point); } + void aggregate(const P1_Affine& in) + { if (blst_p1_affine_in_g1(in)) + blst_p1_add_or_double_affine(&point, &point, in); + else + throw BLST_POINT_NOT_IN_GROUP; + } + P1* sign_with(const SecretKey& sk) + { blst_sign_pk_in_g2(&point, &point, &sk.key); return this; } + P1* sign_with(const Scalar& scalar) + { blst_sign_pk_in_g2(&point, &point, scalar); return this; } + P1* hash_to(const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) + { blst_hash_to_g1(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), + aug, aug_len); + return this; + } + P1* encode_to(const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) + { blst_encode_to_g1(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), + aug, aug_len); + return this; + } +#if __cplusplus >= 201703L + P1* hash_to(const app__string_view msg, const std::string& DST = "", + const app__string_view aug = None) + { return hash_to(C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); + } + P1* encode_to(const app__string_view msg, const std::string& DST = "", + const app__string_view aug = None) + { return encode_to(C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); + } +#endif + P1* mult(const byte* scalar, size_t nbits) + { blst_p1_mult(&point, &point, scalar, nbits); return this; } + P1* mult(const Scalar& scalar) + { blst_p1_mult(&point, &point, scalar, 255); return this; } + P1* cneg(bool flag) + { blst_p1_cneg(&point, flag); return this; } + P1* neg() + { blst_p1_cneg(&point, true); return this; } + P1* add(const P1& a) + { blst_p1_add_or_double(&point, &point, a); return this; } + P1* add(const P1_Affine &a) + { blst_p1_add_or_double_affine(&point, &point, a); return this; } + P1* dbl() + { blst_p1_double(&point, &point); return this; } +#ifndef SWIG + static P1 add(const P1& a, const P1& b) + { P1 ret; blst_p1_add_or_double(ret, a, b); return ret; } + static P1 add(const P1& a, const P1_Affine& b) + { P1 ret; blst_p1_add_or_double_affine(ret, a, b); return ret; } + static P1 dbl(const P1& a) + { P1 ret; blst_p1_double(ret, a); return ret; } +#endif + static P1 generator() + { return P1(blst_p1_generator()); } + +private: + friend class P1_Affine; + friend class P1_Affines; + operator const blst_p1*() const { return &point; } + operator blst_p1*() { return &point; } +}; + +class P1_Affines { +private: + struct p1_affine_no_init { + blst_p1_affine point; + p1_affine_no_init() { } + operator blst_p1_affine*() { return &point; } + operator const blst_p1_affine*() const { return &point; } + }; + + std::vector table; + size_t wbits, npoints; + +public: +#ifndef SWIG + P1_Affines() {} + P1_Affines(size_t wbits, const P1_Affine* const points[], 
size_t npoints) + { this->wbits = wbits; + this->npoints = npoints; + table.resize(npoints << (wbits-1)); + blst_p1s_mult_wbits_precompute(table[0], wbits, + reinterpret_cast(points), + npoints); + } + P1_Affines(size_t wbits, const P1_Affine points[], size_t npoints) + { const P1_Affine* const ptrs[2] = { points, nullptr }; + P1_Affines(wbits, ptrs, npoints); + } + P1_Affines(size_t wbits, const std::vector points) + { P1_Affines(wbits, &points[0], points.size()); } + + P1_Affines(size_t wbits, const P1* const points[], size_t npoints) + { size_t cap = npoints << (wbits-1); + + this->wbits = wbits; + this->npoints = npoints; + table.resize(cap); + blst_p1s_to_affine(table[cap-npoints], + reinterpret_cast(points), + npoints); + const blst_p1_affine* const ptrs[2] = { table[cap-npoints], nullptr }; + blst_p1s_mult_wbits_precompute(table[0], wbits, ptrs, npoints); + } + P1_Affines(size_t wbits, const P1 points[], size_t npoints) + { const P1* const ptrs[2] = { points, nullptr }; + P1_Affines(wbits, ptrs, npoints); + } + P1_Affines(size_t wbits, const std::vector points) + { P1_Affines(wbits, &points[0], points.size()); } + + P1_Affines(const P1* const points[], size_t npoints) + { this->wbits = 0; + this->npoints = npoints; + table.resize(npoints); + blst_p1s_to_affine(table[0], + reinterpret_cast(points), + npoints); + } + P1_Affines(const P1 points[], size_t npoints) + { const P1* const ptrs[2] = { points, nullptr }; + P1_Affines(ptrs, npoints); + } + P1_Affines(const std::vector points) + { P1_Affines(&points[0], points.size()); } + + P1 mult(const byte* const scalars[], size_t nbits) const + { P1 ret; + + if (wbits != 0) { + size_t sz = blst_p1s_mult_wbits_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + blst_p1s_mult_wbits(ret, table[0], wbits, npoints, + scalars, nbits, scratch.get()); + } else { + size_t sz = blst_p1s_mult_pippenger_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + const blst_p1_affine* const ptrs[2] = { table[0], nullptr }; + blst_p1s_mult_pippenger(ret, ptrs, npoints, + scalars, nbits, scratch.get()); + } + return ret; + } + + static std::vector from(const P1* const points[], size_t npoints) + { std::vector ret; + ret.resize(npoints); + blst_p1s_to_affine(ret[0], + reinterpret_cast(points), + npoints); + return ret; + } + static std::vector from(const P1 points[], size_t npoints) + { const P1* const ptrs[2] = { points, nullptr }; + return from(ptrs, npoints); + } + static std::vector from(std::vector points) + { return from(&points[0], points.size()); } +#endif + + static P1 mult_pippenger(const P1_Affine* const points[], size_t npoints, + const byte* const scalars[], size_t nbits) + { P1 ret; + size_t sz = blst_p1s_mult_pippenger_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + blst_p1s_mult_pippenger(ret, + reinterpret_cast(points), + npoints, scalars, nbits, scratch.get()); + return ret; + } +#ifndef SWIG + static P1 mult_pippenger(const P1_Affine points[], size_t npoints, + const byte* const scalars[], size_t nbits) + { const P1_Affine* const ptrs[2] = { points, nullptr }; + return mult_pippenger(ptrs, npoints, scalars, nbits); + } + static P1 mult_pippenger(const std::vector points, + const byte* const scalars[], size_t nbits) + { return mult_pippenger(&points[0], points.size(), scalars, nbits); } +#endif + + static P1 add(const P1_Affine* const points[], size_t npoints) + { P1 ret; + blst_p1s_add(ret, + reinterpret_cast(points), + npoints); + return ret; 
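
The P1_Affines helper above offers a one-shot Pippenger path (mult_pippenger) alongside the precomputed-table path. Below is a minimal usage sketch of the one-shot path, not part of blst itself: it assumes the scalars are packed as a single contiguous blob of (nbits+7)/8 little-endian bytes per point (the nullptr sentinel in the pointer array is what signals a contiguous layout), and the helper name and parameter names are illustrative only.

    // Minimal sketch (not part of blst): one-shot Pippenger multi-scalar
    // multiplication over G1 using the P1_Affines helper declared above.
    #include <cstdint>
    #include <vector>
    #include "blst.hpp"

    static blst::P1 msm_g1(const std::vector<blst::P1_Affine>& points,
                           const uint8_t* scalars, size_t nbits)
    {
        // A nullptr sentinel tells blst that the first pointer addresses a
        // contiguous array rather than a list of individual pointers.
        const uint8_t* const scalar_ptrs[2] = { scalars, nullptr };
        return blst::P1_Affines::mult_pippenger(points.data(), points.size(),
                                                scalar_ptrs, nbits);
    }
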
+ } +#ifndef SWIG + static P1 add(const P1_Affine points[], size_t npoints) + { const P1_Affine* const ptrs[2] = { points, nullptr }; + return add(ptrs, npoints); + } + static P1 add(const std::vector points) + { return add(&points[0], points.size()); } +#endif +}; + +class P2_Affine { +private: + blst_p2_affine point; + + P2_Affine(const blst_p2_affine *cptr) { point = *cptr; } +public: + P2_Affine() { memset(&point, 0, sizeof(point)); } +#ifndef SWIG + P2_Affine(const byte *in) + { BLST_ERROR err = blst_p2_deserialize(&point, in); + if (err != BLST_SUCCESS) + throw err; + } +#endif + P2_Affine(const byte *in, size_t len) + { if (len == 0 || len != (in[0]&0x80 ? 96 : 192)) + throw BLST_BAD_ENCODING; + BLST_ERROR err = blst_p2_deserialize(&point, in); + if (err != BLST_SUCCESS) + throw err; + } + P2_Affine(const P2& jacobian); + + P2_Affine dup() const { return *this; } + P2 to_jacobian() const; + void serialize(byte out[192]) const + { blst_p2_affine_serialize(out, &point); } + void compress(byte out[96]) const + { blst_p2_affine_compress(out, &point); } + bool on_curve() const { return blst_p2_affine_on_curve(&point); } + bool in_group() const { return blst_p2_affine_in_g2(&point); } + bool is_inf() const { return blst_p2_affine_is_inf(&point); } + bool is_equal(const P2_Affine& p) const + { return blst_p2_affine_is_equal(&point, &p.point); } + BLST_ERROR core_verify(const P1_Affine& pk, bool hash_or_encode, + const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) const; +#if __cplusplus >= 201703L + BLST_ERROR core_verify(const P1_Affine& pk, bool hash_or_encode, + const app__string_view msg, + const std::string& DST = "", + const app__string_view aug = None) const; +#endif + static P2_Affine generator() + { return P2_Affine(blst_p2_affine_generator()); } + +private: + friend class Pairing; + friend class P1_Affine; + friend class PT; + friend class P2; + friend class P2_Affines; + operator const blst_p2_affine*() const { return &point; } + operator blst_p2_affine*() { return &point; } +}; + +class P2 { +private: + blst_p2 point; + + P2(const blst_p2 *cptr) { point = *cptr; } +public: + P2() { memset(&point, 0, sizeof(point)); } + P2(const SecretKey& sk) { blst_sk_to_pk_in_g2(&point, &sk.key); } +#ifndef SWIG + P2(const byte *in) + { blst_p2_affine a; + BLST_ERROR err = blst_p2_deserialize(&a, in); + if (err != BLST_SUCCESS) + throw err; + blst_p2_from_affine(&point, &a); + } +#endif + P2(const byte *in, size_t len) + { if (len == 0 || len != (in[0]&0x80 ? 
96 : 192)) + throw BLST_BAD_ENCODING; + blst_p2_affine a; + BLST_ERROR err = blst_p2_deserialize(&a, in); + if (err != BLST_SUCCESS) + throw err; + blst_p2_from_affine(&point, &a); + } + P2(const P2_Affine& affine) { blst_p2_from_affine(&point, affine); } + + P2 dup() const { return *this; } + P2_Affine to_affine() const { return P2_Affine(*this); } + void serialize(byte out[192]) const { blst_p2_serialize(out, &point); } + void compress(byte out[96]) const { blst_p2_compress(out, &point); } + bool on_curve() const { return blst_p2_on_curve(&point); } + bool in_group() const { return blst_p2_in_g2(&point); } + bool is_inf() const { return blst_p2_is_inf(&point); } + bool is_equal(const P2& p) const + { return blst_p2_is_equal(&point, &p.point); } + void aggregate(const P2_Affine& in) + { if (blst_p2_affine_in_g2(in)) + blst_p2_add_or_double_affine(&point, &point, in); + else + throw BLST_POINT_NOT_IN_GROUP; + } + P2* sign_with(const SecretKey& sk) + { blst_sign_pk_in_g1(&point, &point, &sk.key); return this; } + P2* sign_with(const Scalar& scalar) + { blst_sign_pk_in_g1(&point, &point, scalar); return this; } + P2* hash_to(const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) + { blst_hash_to_g2(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), + aug, aug_len); + return this; + } + P2* encode_to(const byte* msg, size_t msg_len, + const std::string& DST = "", + const byte* aug = nullptr, size_t aug_len = 0) + { blst_encode_to_g2(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), + aug, aug_len); + return this; + } +#if __cplusplus >= 201703L + P2* hash_to(const app__string_view msg, const std::string& DST = "", + const app__string_view aug = None) + { return hash_to(C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); + } + P2* encode_to(const app__string_view msg, const std::string& DST = "", + const app__string_view aug = None) + { return encode_to(C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); + } +#endif + P2* mult(const byte* scalar, size_t nbits) + { blst_p2_mult(&point, &point, scalar, nbits); return this; } + P2* mult(const Scalar& scalar) + { blst_p2_mult(&point, &point, scalar, 255); return this; } + P2* cneg(bool flag) + { blst_p2_cneg(&point, flag); return this; } + P2* neg() + { blst_p2_cneg(&point, true); return this; } + P2* add(const P2& a) + { blst_p2_add_or_double(&point, &point, a); return this; } + P2* add(const P2_Affine &a) + { blst_p2_add_or_double_affine(&point, &point, a); return this; } + P2* dbl() + { blst_p2_double(&point, &point); return this; } +#ifndef SWIG + static P2 add(const P2& a, const P2& b) + { P2 ret; blst_p2_add_or_double(ret, a, b); return ret; } + static P2 add(const P2& a, const P2_Affine& b) + { P2 ret; blst_p2_add_or_double_affine(ret, a, b); return ret; } + static P2 dbl(const P2& a) + { P2 ret; blst_p2_double(ret, a); return ret; } +#endif + static P2 generator() + { return P2(blst_p2_generator()); } + +private: + friend class P2_Affine; + friend class P2_Affines; + operator const blst_p2*() const { return &point; } + operator blst_p2*() { return &point; } +}; + +class P2_Affines { +private: + struct p2_affine_no_init { + blst_p2_affine point; + p2_affine_no_init() { } + operator blst_p2_affine*() { return &point; } + operator const blst_p2_affine*() const { return &point; } + }; + + std::vector table; + size_t wbits, npoints; + +public: +#ifndef SWIG + P2_Affines() {} + P2_Affines(size_t wbits, const P2_Affine* const points[], 
size_t npoints) + { this->wbits = wbits; + this->npoints = npoints; + table.resize(npoints << (wbits-1)); + blst_p2s_mult_wbits_precompute(table[0], wbits, + reinterpret_cast(points), + npoints); + } + P2_Affines(size_t wbits, const P2_Affine points[], size_t npoints) + { const P2_Affine* const ptrs[2] = { points, nullptr }; + P2_Affines(wbits, ptrs, npoints); + } + P2_Affines(size_t wbits, const std::vector points) + { P2_Affines(wbits, &points[0], points.size()); } + + P2_Affines(size_t wbits, const P2* const points[], size_t npoints) + { size_t cap = npoints << (wbits-1); + + this->wbits = wbits; + this->npoints = npoints; + table.resize(cap); + blst_p2s_to_affine(table[cap-npoints], + reinterpret_cast(points), + npoints); + const blst_p2_affine* const ptrs[2] = { table[cap-npoints], nullptr }; + blst_p2s_mult_wbits_precompute(table[0], wbits, ptrs, npoints); + } + P2_Affines(size_t wbits, const P2 points[], size_t npoints) + { const P2* const ptrs[2] = { points, nullptr }; + P2_Affines(wbits, ptrs, npoints); + } + P2_Affines(size_t wbits, const std::vector points) + { P2_Affines(wbits, &points[0], points.size()); } + + P2_Affines(const P2* const points[], size_t npoints) + { this->wbits = 0; + this->npoints = npoints; + table.resize(npoints); + blst_p2s_to_affine(table[0], + reinterpret_cast(points), + npoints); + } + P2_Affines(const P2 points[], size_t npoints) + { const P2* const ptrs[2] = { points, nullptr }; + P2_Affines(ptrs, npoints); + } + P2_Affines(const std::vector points) + { P2_Affines(&points[0], points.size()); } + + P2 mult(const byte* const scalars[], size_t nbits) const + { P2 ret; + + if (wbits != 0) { + size_t sz = blst_p2s_mult_wbits_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + blst_p2s_mult_wbits(ret, table[0], wbits, npoints, + scalars, nbits, scratch.get()); + } else { + size_t sz = blst_p2s_mult_pippenger_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + const blst_p2_affine* const ptrs[2] = { table[0], nullptr }; + blst_p2s_mult_pippenger(ret, ptrs, npoints, + scalars, nbits, scratch.get()); + } + return ret; + } + + static std::vector from(const P2* const points[], size_t npoints) + { std::vector ret; + ret.resize(npoints); + blst_p2s_to_affine(ret[0], + reinterpret_cast(points), + npoints); + return ret; + } + static std::vector from(const P2 points[], size_t npoints) + { const P2* const ptrs[2] = { points, nullptr }; + return from(ptrs, npoints); + } + static std::vector from(std::vector points) + { return from(&points[0], points.size()); } +#endif + + static P2 mult_pippenger(const P2_Affine* const points[], size_t npoints, + const byte* const scalars[], size_t nbits) + { P2 ret; + size_t sz = blst_p2s_mult_pippenger_scratch_sizeof(npoints); + std::unique_ptr scratch{new limb_t[sz/sizeof(limb_t)]}; + blst_p2s_mult_pippenger(ret, + reinterpret_cast(points), + npoints, scalars, nbits, scratch.get()); + return ret; + } +#ifndef SWIG + static P2 mult_pippenger(const P2_Affine points[], size_t npoints, + const byte* const scalars[], size_t nbits) + { const P2_Affine* const ptrs[2] = { points, nullptr }; + return mult_pippenger(ptrs, npoints, scalars, nbits); + } + static P2 mult_pippenger(const std::vector points, + const byte* const scalars[], size_t nbits) + { return mult_pippenger(&points[0], points.size(), scalars, nbits); } +#endif + + static P2 add(const P2_Affine* const points[], size_t npoints) + { P2 ret; + blst_p2s_add(ret, + reinterpret_cast(points), + npoints); + return ret; 
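
P2_Affines mirrors the same idea for G2 and additionally retains the window table built by blst_p2s_mult_wbits_precompute, so the precompute cost can be paid once and amortized over repeated mult() calls. The sketch below illustrates that path under the same packed-scalar assumption as before; wbits = 4 and all names are illustrative, and the table grows as npoints << (wbits-1) entries.

    // Minimal sketch (not part of blst): precompute a window table once,
    // then reuse it for multi-scalar multiplications over G2.
    #include <cstdint>
    #include "blst.hpp"

    static blst::P2 msm_g2_precomputed(const blst::P2_Affine points[], size_t npoints,
                                       const uint8_t* scalars, size_t nbits)
    {
        const blst::P2_Affine* const point_ptrs[2] = { points, nullptr };
        blst::P2_Affines table(4 /* wbits, illustrative */, point_ptrs, npoints);
        const uint8_t* const scalar_ptrs[2] = { scalars, nullptr };
        return table.mult(scalar_ptrs, nbits);   // table is reusable across calls
    }
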
+ } +#ifndef SWIG + static P2 add(const P2_Affine points[], size_t npoints) + { const P2_Affine* const ptrs[2] = { points, nullptr }; + return add(ptrs, npoints); + } + static P2 add(const std::vector points) + { return add(&points[0], points.size()); } +#endif +}; + +inline P1_Affine::P1_Affine(const P1& jacobian) +{ blst_p1_to_affine(&point, jacobian); } +inline P2_Affine::P2_Affine(const P2& jacobian) +{ blst_p2_to_affine(&point, jacobian); } + +inline P1 P1_Affine::to_jacobian() const { P1 ret(*this); return ret; } +inline P2 P2_Affine::to_jacobian() const { P2 ret(*this); return ret; } + +inline P1 G1() { return P1::generator(); } +inline P2 G2() { return P2::generator(); } + +inline BLST_ERROR P1_Affine::core_verify(const P2_Affine& pk, + bool hash_or_encode, + const byte* msg, size_t msg_len, + const std::string& DST, + const byte* aug, size_t aug_len) const +{ return blst_core_verify_pk_in_g2(pk, &point, hash_or_encode, + msg, msg_len, + C_bytes(DST.data()), DST.size(), + aug, aug_len); +} +inline BLST_ERROR P2_Affine::core_verify(const P1_Affine& pk, + bool hash_or_encode, + const byte* msg, size_t msg_len, + const std::string& DST, + const byte* aug, size_t aug_len) const +{ return blst_core_verify_pk_in_g1(pk, &point, hash_or_encode, + msg, msg_len, + C_bytes(DST.data()), DST.size(), + aug, aug_len); +} +#if __cplusplus >= 201703L +inline BLST_ERROR P1_Affine::core_verify(const P2_Affine& pk, + bool hash_or_encode, + const app__string_view msg, + const std::string& DST, + const app__string_view aug) const +{ return core_verify(pk, hash_or_encode, C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); +} +inline BLST_ERROR P2_Affine::core_verify(const P1_Affine& pk, + bool hash_or_encode, + const app__string_view msg, + const std::string& DST, + const app__string_view aug) const +{ return core_verify(pk, hash_or_encode, C_bytes(msg.data()), msg.size(), DST, + C_bytes(aug.data()), aug.size()); +} +#endif + +class PT { +private: + blst_fp12 value; + + PT(const blst_fp12 *v) { value = *v; } +public: + PT(const P1_Affine& p) { blst_aggregated_in_g1(&value, p); } + PT(const P2_Affine& q) { blst_aggregated_in_g2(&value, q); } + PT(const P2_Affine& q, const P1_Affine& p) + { blst_miller_loop(&value, q, p); } + PT(const P1_Affine& p, const P2_Affine& q) : PT(q, p) {} + PT(const P2& q, const P1& p) + { blst_miller_loop(&value, P2_Affine(q), P1_Affine(p)); } + PT(const P1& p, const P2& q) : PT(q, p) {} + + PT dup() const { return *this; } + bool is_one() const { return blst_fp12_is_one(&value); } + bool is_equal(const PT& p) const + { return blst_fp12_is_equal(&value, p); } + PT* sqr() { blst_fp12_sqr(&value, &value); return this; } + PT* mul(const PT& p) { blst_fp12_mul(&value, &value, p); return this; } + PT* final_exp() { blst_final_exp(&value, &value); return this; } + bool in_group() const { return blst_fp12_in_group(&value); } + void to_bendian(byte out[48*12]) const + { blst_bendian_from_fp12(out, &value); } + + static bool finalverify(const PT& gt1, const PT& gt2) + { return blst_fp12_finalverify(gt1, gt2); } + static PT one() { return PT(blst_fp12_one()); } + +private: + friend class Pairing; + operator const blst_fp12*() const { return &value; } +}; + +class Pairing { +private: + operator blst_pairing*() + { return reinterpret_cast(this); } + operator const blst_pairing*() const + { return reinterpret_cast(this); } + + void init(bool hash_or_encode, const byte* DST, size_t DST_len) + { // Copy DST to heap, std::string can be volatile, especially in SWIG:-( + 
byte *dst = new byte[DST_len]; + memcpy(dst, DST, DST_len); + blst_pairing_init(*this, hash_or_encode, dst, DST_len); + } + +public: +#ifndef SWIG + void* operator new(size_t) + { return new uint64_t[blst_pairing_sizeof()/sizeof(uint64_t)]; } + void operator delete(void *ptr) + { delete[] static_cast(ptr); } + + Pairing(bool hash_or_encode, const std::string& DST) + { init(hash_or_encode, C_bytes(DST.data()), DST.size()); } +#if __cplusplus >= 201703L + Pairing(bool hash_or_encode, const app__string_view DST) + { init(hash_or_encode, C_bytes(DST.data()), DST.size()); } +#endif +#endif +#ifndef SWIGJAVA + Pairing(bool hash_or_encode, const byte* DST, size_t DST_len) + { init(hash_or_encode, DST, DST_len); } + ~Pairing() { delete[] blst_pairing_get_dst(*this); } +#endif + + BLST_ERROR aggregate(const P1_Affine* pk, const P2_Affine* sig, + const byte* msg, size_t msg_len, + const byte* aug = nullptr, size_t aug_len = 0) + { return blst_pairing_aggregate_pk_in_g1(*this, *pk, *sig, + msg, msg_len, aug, aug_len); + } + BLST_ERROR aggregate(const P2_Affine* pk, const P1_Affine* sig, + const byte* msg, size_t msg_len, + const byte* aug = nullptr, size_t aug_len = 0) + { return blst_pairing_aggregate_pk_in_g2(*this, *pk, *sig, + msg, msg_len, aug, aug_len); + } + BLST_ERROR mul_n_aggregate(const P1_Affine* pk, const P2_Affine* sig, + const byte* scalar, size_t nbits, + const byte* msg, size_t msg_len, + const byte* aug = nullptr, size_t aug_len = 0) + { return blst_pairing_mul_n_aggregate_pk_in_g1(*this, *pk, *sig, + scalar, nbits, msg, msg_len, aug, aug_len); + } + BLST_ERROR mul_n_aggregate(const P2_Affine* pk, const P1_Affine* sig, + const byte* scalar, size_t nbits, + const byte* msg, size_t msg_len, + const byte* aug = nullptr, size_t aug_len = 0) + { return blst_pairing_mul_n_aggregate_pk_in_g2(*this, *pk, *sig, + scalar, nbits, msg, msg_len, aug, aug_len); + } +#if __cplusplus >= 201703L + BLST_ERROR aggregate(const P1_Affine* pk, const P2_Affine* sig, + const app__string_view msg, + const app__string_view aug = None) + { return aggregate(pk, sig, C_bytes(msg.data()), msg.size(), + C_bytes(aug.data()), aug.size()); + } + BLST_ERROR aggregate(const P2_Affine* pk, const P1_Affine* sig, + const app__string_view msg, + const app__string_view aug = None) + { return aggregate(pk, sig, C_bytes(msg.data()), msg.size(), + C_bytes(aug.data()), aug.size()); + } + BLST_ERROR mul_n_aggregate(const P1_Affine* pk, const P2_Affine* sig, + const byte* scalar, size_t nbits, + const app__string_view msg, + const app__string_view aug = None) + { return mul_n_aggregate(pk, sig, scalar, nbits, + C_bytes(msg.data()), msg.size(), + C_bytes(aug.data()), aug.size()); + } + BLST_ERROR mul_n_aggregate(const P2_Affine* pk, const P1_Affine* sig, + const byte* scalar, size_t nbits, + const app__string_view msg, + const app__string_view aug = None) + { return mul_n_aggregate(pk, sig, scalar, nbits, + C_bytes(msg.data()), msg.size(), + C_bytes(aug.data()), aug.size()); + } +#endif + void commit() + { blst_pairing_commit(*this); } + BLST_ERROR merge(const Pairing* ctx) + { return blst_pairing_merge(*this, *ctx); } + bool finalverify(const PT* sig = nullptr) const + { return sig == nullptr ? 
blst_pairing_finalverify(*this, nullptr) + : blst_pairing_finalverify(*this, *sig); + } + void raw_aggregate(const P2_Affine* q, const P1_Affine* p) + { blst_pairing_raw_aggregate(*this, *q, *p); } + PT as_fp12() + { return PT(blst_pairing_as_fp12(*this)); } +}; + +} // namespace blst + +#endif diff --git a/src/blst/bindings/blst.swg b/src/blst/bindings/blst.swg new file mode 100644 index 0000000000..4cb9c30d28 --- /dev/null +++ b/src/blst/bindings/blst.swg @@ -0,0 +1,851 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +%module blst +%rename("%(strip:[blst_])s") ""; // prefix is redundant in named module + +%include "exception.i" +#ifdef __cplusplus +%include "std_string.i" +%typemap(out) SELF* OUTPUT = SWIGTYPE*; // to be overridden as required +#else +#warning consider using C++ interface +#endif +%include "stdint.i" + +%apply const char* { const byte*, const byte[ANY] } +%apply (const char *STRING, size_t LENGTH) { (const byte *STRING, + size_t LENGTH) } + +#if defined(SWIGPYTHON) + +// some sorcery to allow assignments as output, e.g. +// hash = blst.encode_to_g1(b"foo") +%typemap(in, numinputs=0) OBJECT *OUTPUT($1_basetype temp) %{ $1 = &temp; %} +%typemap(argout) OBJECT *OUTPUT { + PyObject *obj = SWIG_NewPointerObj(memcpy(malloc(sizeof($1_basetype)), + $1,sizeof($1_basetype)), + $descriptor, SWIG_POINTER_NEW); + $result = ($result==NULL) ? obj + : SWIG_Python_AppendOutput($result, obj); +} +%apply OBJECT *OUTPUT { + blst_p1 *out, blst_p1 *out_pk, blst_p1 *out_sig, + blst_p1_affine *out, blst_p1_affine *out_pk, blst_p1_affine *out_sig, + blst_p2 *out, blst_p2 *out_pk, blst_p2 *out_sig, + blst_p2_affine *out, blst_p2_affine *out_pk, blst_p2_affine *out_sig, + blst_scalar *out, blst_scalar *out_SK, + blst_fp12 *out +} + +// accept 'bytes' and 'bytearray' as inputs... 
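
Taken together, the classes declared in blst.hpp above (SecretKey, P1/P1_Affine, P2/P2_Affine, Pairing) support both one-shot verification through core_verify() and incremental aggregation through Pairing::aggregate/commit/finalverify, which is the same flow the C# proof-of-concept later in this diff exercises. The sketch below is illustrative only: it assumes SecretKey::keygen(IKM, IKM_len) as declared earlier in blst.hpp, and the DST string and helper name are made up.

    // Minimal end-to-end sketch (not part of blst): public key in G1,
    // signature in G2, verified both ways.
    #include <cstdint>
    #include <memory>
    #include <string>
    #include "blst.hpp"

    static bool sign_and_verify(const uint8_t ikm[32],
                                const uint8_t* msg, size_t msg_len)
    {
        const std::string DST = "EXAMPLE-APP-BLS-SIG";   // hypothetical DST

        blst::SecretKey sk;
        sk.keygen(ikm, 32);                  // IKM -> secret key
        blst::P1 pk(sk);                     // public key in G1

        blst::P2 sig;
        sig.hash_to(msg, msg_len, DST)->sign_with(sk);   // hash-to-G2, then sign

        // The verifier works on affine points.
        blst::P1_Affine pk_aff = pk.to_affine();
        blst::P2_Affine sig_aff = sig.to_affine();

        // One-shot check of a single (pk, msg, sig) triple.
        if (sig_aff.core_verify(pk_aff, true /* hash_or_encode */,
                                msg, msg_len, DST) != BLST_SUCCESS)
            return false;

        // Equivalent check through the aggregating Pairing context; Pairing
        // overloads operator new, so it is heap-allocated by design.
        std::unique_ptr<blst::Pairing> ctx{new blst::Pairing(true, DST)};
        if (ctx->aggregate(&pk_aff, &sig_aff, msg, msg_len) != BLST_SUCCESS)
            return false;
        ctx->commit();
        return ctx->finalverify();
    }
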
+%typemap(in) const byte* %{ + if ($input == Py_None) { + $1 = NULL; + } else if (PyBytes_Check($input)) { + char *buf; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); + + $1 = ($1_ltype)buf; + } else if (PyByteArray_Check($input)) { + $1 = ($1_ltype)PyByteArray_AsString($input); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'bytes' or 'bytearray'"); + } +%} +%typemap(freearg) const byte* "" + +%typemap(in) const byte[ANY] %{ + if (PyBytes_Check($input)) { + char *buf; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); + + if (nbytes != $1_dim0) + SWIG_exception_fail(SWIG_ValueError, "in method '$symname', " + "expecting $1_dim0 bytes"); + $1 = ($1_ltype)buf; + } else if (PyByteArray_Check($input)) { + if (PyByteArray_Size($input) != $1_dim0) + SWIG_exception_fail(SWIG_ValueError, "in method '$symname', " + "expecting $1_dim0 bytes"); + $1 = ($1_ltype)PyByteArray_AsString($input); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'bytes' or 'bytearray'"); + } +%} +%typemap(freearg) const byte[ANY] "" + +%typemap(in) (const byte *STRING, size_t LENGTH) %{ + if ($input == Py_None) { + $1 = NULL; + $2 = 0; + } else if (PyBytes_Check($input)) { + char *buf; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) + SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); + + $1 = ($1_ltype)buf; + $2 = nbytes; + } else if (PyByteArray_Check($input)) { + $1 = ($1_ltype)PyByteArray_AsString($input); + $2 = PyByteArray_Size($input); +#ifdef Py_USING_UNICODE + } else if (PyUnicode_Check($input)) { + char *buf; + Py_ssize_t nbytes; + PyObject *obj = PyUnicode_AsUTF8String($input); + + if (obj == NULL || PyBytes_AsStringAndSize(obj, &buf, &nbytes) < 0) + SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); + + $1 = ($1_ltype)alloca($2 = nbytes); + memcpy($1, buf, $2); + Py_DECREF(obj); +#endif + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'bytes' or 'bytearray'"); + } +%} + +// let users use Python 'int', 'bytes' and 'bytearray' as scalars +%typemap(in) (const byte* scalar, size_t nbits) %{ + if (PyBytes_Check($input)) { + char *scalar; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &scalar, &nbytes) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); + + $1 = ($1_ltype)scalar; + $2 = 8 * nbytes; + } else if (PyByteArray_Check($input)) { + $1 = ($1_ltype)PyByteArray_AsString($input); + $2 = 8 * PyByteArray_Size($input); + } else if (PyLong_Check($input)) { + size_t nbytes; + + $2 = _PyLong_NumBits($input); + $1 = ($1_ltype)alloca(nbytes = ($2 + 7)/8); + + if (_PyLong_AsByteArray((PyLongObject*)$input, $1, nbytes, 1, 0) < 0) + SWIG_exception_fail(SWIG_OverflowError, "in method '$symname'"); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'int', 'bytes' " + "or 'bytearray'"); + } +%} + +#ifdef __cplusplus +%typemap(in) (const POINT* points[], size_t npoints) + (std::unique_ptr<$*1_ltype[]> points, size_t _global_npoints) %{ + if (PyList_Check($input)) { + _global_npoints = PyList_Size($input); + points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[_global_npoints]); + PyObject* obj = PyList_GET_ITEM($input, 0); + // check the type of the 1st element + if (SWIG_ConvertPtr(obj, (void**)&points[0], 
$*1_descriptor, 0) != SWIG_OK) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'list' of '$*1_ltype'"); + for (size_t i = 1; i < _global_npoints; i++) { + obj = PyList_GET_ITEM($input, i); + points[i] = ($*1_ltype)SWIG_Python_GetSwigThis(obj)->ptr; + } + $1 = points.get(); + $2 = _global_npoints; + } else if (PyBytes_Check($input)) { + char *bytes; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &bytes, &nbytes) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); + + points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[2]); + points[0] = ($*1_ltype)bytes; + points[1] = nullptr; + $1 = points.get(); + $2 = _global_npoints = nbytes / sizeof(points[0][0]); + } else if (PyMemoryView_Check($input)) { // output from to_affine() + Py_buffer *buf = PyMemoryView_GET_BUFFER($input); + + if (!PyBytes_Check(buf->obj)) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'bytes' in " + "'memoryview' for points[]"); + points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[2]); + points[0] = ($*1_ltype)buf->buf; + points[1] = nullptr; + $1 = points.get(); + $2 = _global_npoints = buf->len / sizeof(points[0][0]); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', expecting " + "'list', 'bytes' or 'memoryview' " + "for points[]"); + } +%} +%apply (const POINT* points[], size_t npoints) { + (const blst::P1_Affine* const points[], size_t npoints), + (const blst::P2_Affine* const points[], size_t npoints), + (const blst::P1* const points[], size_t npoints), + (const blst::P2* const points[], size_t npoints) +} + +%typemap(in, numinputs=0) POINT points[] (PyObject *obj) "" +%typemap(check) POINT points[] { + char *bytes; + Py_ssize_t size = sizeof($1[0]) * _global_npoints; + + obj$argnum = PyBytes_FromStringAndSize(NULL, size); + if (obj$argnum == NULL) SWIG_fail; + PyBytes_AsStringAndSize(obj$argnum, &bytes, &size); + $1 = ($1_ltype)bytes; +} +%typemap(argout) POINT points[] %{ + $result = PyMemoryView_FromObject(obj$argnum); + if ($result != NULL) { + // .itemsize to return size of point, and len() - amount of points + PyMemoryView_GET_BUFFER($result)->itemsize = sizeof($1[0]); + PyMemoryView_GET_BUFFER($result)->shape[0] /= sizeof($1[0]); + } else { + Py_DECREF(obj$argnum); + } +%} +%apply POINT points[] { blst_p1_affine dst[], blst_p2_affine dst[] } + +%extend blst::P1_Affines { + static PyObject* as_memory(blst_p1_affine dst[], + const blst::P1* const points[], size_t npoints) + { blst_p1s_to_affine(dst, (const blst_p1 *const*)points, npoints); + return NULL; // ignored by 'argout' typemap above + } +} +%extend blst::P2_Affines { + static PyObject* as_memory(blst_p2_affine dst[], + const blst::P2* const points[], size_t npoints) + { blst_p2s_to_affine(dst, (const blst_p2 *const*)points, npoints); + return NULL; // ignored by 'argout' typemap above + } +} +%nodefault blst::P1_Affines; +%nodefault blst::P2_Affines; + +%typemap(in) (const byte* const scalars[], size_t nbits) + (std::unique_ptr bytes, byte *scalars[2]) %{ + if (PyList_Check($input)) { + if ((size_t)PyList_Size($input) != _global_npoints) + SWIG_exception_fail(SWIG_IndexError, "in method '$symname', 'list' " + "length mismatch for scalars[]"); + + PyObject *obj = PyList_GET_ITEM($input, 0); + if (PyLong_Check(obj)) { + $2 = _PyLong_NumBits(obj); + for (size_t i = 1; i < _global_npoints; i++) { + size_t nbits; + obj = PyList_GET_ITEM($input, i); + if (!PyLong_Check(obj) || _PyLong_Sign(obj) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method 
'$symname', " + "expecting all 'long's"); + nbits = _PyLong_NumBits(obj); + if (nbits > $2) $2 = nbits; + } + + size_t nbytes = ($2 + 7)/8; + bytes = std::unique_ptr(new byte[_global_npoints*nbytes]); + byte* scalar = bytes.get(); + for (size_t i = 0; i < _global_npoints; i++, scalar += nbytes) + _PyLong_AsByteArray((PyLongObject*)PyList_GET_ITEM($input, i), + scalar, nbytes, 1, 0); + + scalars[0] = bytes.get(); + scalars[1] = nullptr; + $1 = scalars; + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'list' of 'long's " + "for scalars[]"); + } + } else if (PyBytes_Check($input)) { + char *bytes; + Py_ssize_t nbytes; + + if (PyBytes_AsStringAndSize($input, &bytes, &nbytes) < 0) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); + + scalars[0] = ($*1_ltype)bytes; + scalars[1] = nullptr; + $1 = scalars; + $2 = 8 * (nbytes / _global_npoints); + } else if (PyByteArray_Check($input)) { + scalars[0] = ($*1_ltype)PyByteArray_AsString($input); + scalars[1] = nullptr; + $1 = scalars; + $2 = 8 * (PyByteArray_Size($input) / _global_npoints); + } else if (PyMemoryView_Check($input)) { + Py_buffer *buf = PyMemoryView_GET_BUFFER($input); + + if (!PyBytes_Check(buf->obj) && !PyByteArray_Check(buf->obj)) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting 'bytes' in " + "'memoryview' for points[]"); + scalars[0] = ($*1_ltype)buf->buf; + scalars[1] = nullptr; + $1 = scalars; + $2 = 8 * (buf->len / _global_npoints); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', expecting " + "'list', 'bytes', 'bytearray' " + "or 'memoryview' for scalars[]"); + } +%} + +%typemap(out) BLST_ERROR %{ + if ($1 != BLST_SUCCESS) { + SWIG_exception(SWIG_ValueError, BLST_ERROR_str[$1]); + SWIG_fail; + } + $result = SWIG_From_int($1); +%} + +// return |this| +%typemap(out) SELF* OUTPUT %{ (void)$1; Py_INCREF($result = swig_obj[0]); %} +#endif + +#elif defined(SWIGJAVA) + +%header %{ +#ifdef __cplusplus +# define JCALL(func, ...) jenv->func(__VA_ARGS__) +#else +# define JCALL(func, ...) 
(*jenv)->func(jenv, __VA_ARGS__) +#endif +%} + +%include "enums.swg" +%include "arrays_java.i" +%javaconst(1); + +#if SWIG_VERSION < 0x040000 +%apply (char *STRING, size_t LENGTH) { (const byte *STRING, size_t LENGTH) } +#endif + +%pragma(java) jniclassimports=%{ +import java.io.*; +import java.nio.file.*; +%} +%pragma(java) jniclasscode=%{ + final static String libName = System.mapLibraryName("$module"); + final static String resName = System.getProperty("os.name").replaceFirst(" .*","") + + "/" + System.getProperty("os.arch") + + "/" + libName; + static { + Class imClazz = $imclassname.class; + InputStream res = imClazz.getResourceAsStream( + System.getProperty(imClazz.getPackageName() + ".jniResource", resName)); + if (res == null) { + try { + System.loadLibrary("$module"); + } catch (UnsatisfiedLinkError e) { + String[] cmd = System.getProperty("sun.java.command").split("/"); + if (!"$imclassname".equals(cmd[cmd.length-1])) + // suppress exception if 'main' below is executed + throw new RuntimeException(e.getMessage()); + } + } else { + // unpack shared library into a temporary directory and load it + try { + Path tmpdir = Files.createTempDirectory("$module@"); + tmpdir.toFile().deleteOnExit(); + Path tmpdll = Paths.get(tmpdir.toString(), libName); + tmpdll.toFile().deleteOnExit(); + Files.copy(res, tmpdll, StandardCopyOption.REPLACE_EXISTING); + res.close(); + System.load(tmpdll.toString()); + } catch (IOException e) { + throw new RuntimeException(e.getMessage()); + } + } + } + public static void main(String argv[]) { + System.out.println(resName); + } +%} + +#ifdef __cplusplus +// Extensive sorcery to shift memory management to JVM GC. General idea is +// to use Java long[] as opaque storage for blst data. Methods that return +// new objects allocate suitably sized long[] arrays from JVM heap, +// references to which are then assigned to |swigCPtr| on the Java side. +// And when passed back to JNI, |swigCPtr|s are dereferenced with +// GetLongArrayElements... And no destructors! +%nodefaultdtor; +%typemap(javafinalize) SWIGTYPE "" +%typemap(javadestruct) SWIGTYPE "" + +%typemap(javabody) SWIGTYPE %{ + private transient long[] swigCPtr; + + protected $javaclassname(long[] cPtr) { swigCPtr = cPtr; } + + protected static long[] getCPtr($javaclassname obj) { + return obj != null ? obj.swigCPtr : null; + } + + public $javaclassname dup() { return new $javaclassname(swigCPtr.clone()); } +%} +%ignore dup; +%typemap(javaconstruct) SWIGTYPE { this($imcall); } +%typemap(jni) SWIGTYPE, SWIGTYPE&, SWIGTYPE* "jlongArray" +%typemap(jtype) SWIGTYPE, SWIGTYPE&, SWIGTYPE* "long[]" +%typemap(javaout) SWIGTYPE, SWIGTYPE&, SWIGTYPE* { + return new $javaclassname($jnicall); +} +%typemap(in) SWIGTYPE&, SWIGTYPE* %{ + $1 = ($1_ltype)JCALL(GetLongArrayElements, $input, 0); +%} +%typemap(in) const SWIGTYPE&, const SWIGTYPE* %{ + $1 = $input ? 
($1_ltype)JCALL(GetLongArrayElements, $input, 0) : NULL; +%} +%typemap(out) SWIGTYPE&, SWIGTYPE* %{ + if ($1 != $null) { + size_t sz = (sizeof($1_basetype) + sizeof(jlong) - 1)/sizeof(jlong); + $result = JCALL(NewLongArray, sz); + if ($result != $null) + JCALL(SetLongArrayRegion, $result, 0, sz, (const jlong *)$1); + } +%} +%typemap(out) SWIGTYPE { + size_t sz = (sizeof($1_basetype) + sizeof(jlong) - 1)/sizeof(jlong); + $result = JCALL(NewLongArray, sz); + if ($result != $null) + JCALL(SetLongArrayRegion, $result, 0, sz, (const jlong *)&$1); +} +%typemap(newfree) SWIGTYPE* "delete $1;" +%typemap(freearg) SWIGTYPE&, SWIGTYPE* %{ + JCALL(ReleaseLongArrayElements, $input, (jlong *)$1, 0); +%} +%typemap(freearg) const SWIGTYPE&, const SWIGTYPE* %{ + if ($input) JCALL(ReleaseLongArrayElements, $input, (jlong *)$1, JNI_ABORT); +%} +%typemap(freearg) const std::string& "" + +// I wish |jenv| was available in the constructor, so that NewLongArray +// could be called at once, without having to resort to matching +// %typemap(out)... +%extend blst::Pairing { + Pairing(bool hash_or_encode, const std::string& DST) + { size_t sz = blst_pairing_sizeof(); + size_t SZ = (sz + DST.size() + sizeof(jlong) - 1)/sizeof(jlong); + blst_pairing *ret = (blst_pairing *)malloc(SZ*sizeof(jlong)); + if (DST.size() != 0) { + byte *dst = (byte *)ret + sz; + memcpy(dst, DST.data(), DST.size()); + blst_pairing_init(ret, hash_or_encode, dst, DST.size()); + } else { + blst_pairing_init(ret, hash_or_encode, NULL, 0); + } + return (Pairing *)ret; + } +} +%typemap(out) blst::Pairing* { + size_t sz = blst_pairing_sizeof(); + size_t SZ = (sz + arg2->size() + sizeof(jlong) - 1)/sizeof(jlong); + $result = JCALL(NewLongArray, SZ); + if ($result != $null) + JCALL(SetLongArrayRegion, $result, 0, SZ, (const jlong *)$1); +} +%typemap(newfree) blst::Pairing* "free($1);" + +%typemap(javaout) SELF* OUTPUT { $jnicall; return this; } +%typemap(out) SELF* OUTPUT "(void)$1;" +%typemap(jni) SELF* OUTPUT "void" +%typemap(jtype) SELF* OUTPUT "void" +#endif + +%typemap(throws) BLST_ERROR %{ + SWIG_JavaThrowException(jenv, SWIG_JavaRuntimeException, + BLST_ERROR_str[$1]); +%} + +// handle input const byte[] more efficiently... +%apply signed char[] { const byte* } +%typemap(in) const byte* %{ + $1 = $input ? ($1_ltype)JCALL(GetByteArrayElements, $input, 0) : NULL; +%} +%typemap(argout) const byte* "" +%typemap(freearg) const byte* %{ + if ($input) JCALL(ReleaseByteArrayElements, $input, (jbyte *)$1, JNI_ABORT); +%} + +%apply const byte* { const byte[ANY] } +%typemap(in) const byte[ANY] { + size_t sz = JCALL(GetArrayLength, $input); + if (sz != $1_dim0) { + SWIG_JavaThrowException(jenv, SWIG_JavaIndexOutOfBoundsException, + "BLST_ERROR: input size mismatch"); + return $null; + } + $1 = ($1_ltype)JCALL(GetByteArrayElements, $input, 0); +} + +// let users use 'java.math.BigInteger' as scalars +%typemap(in) (const byte* scalar, size_t nbits) %{ + $2 = JCALL(GetArrayLength, $input); + $1 = ($1_ltype)alloca($2); + JCALL(GetByteArrayRegion, $input, 0, $2, (jbyte*)$1); + if (*(jbyte*)$1 < 0) { + SWIG_JavaThrowException(jenv, SWIG_JavaIllegalArgumentException, + "expecting unsigned value"); + return $null; + } + { // BigInteger.toByteArray() emits big-endian, flip the order... 
+ size_t i, j; + for(i=0, j=$2-1; i<$2/2; i++, j--) { + $*1_ltype t=$1[i]; $1[i]=$1[j]; $1[j]=t; + } + } + if ($1[$2-1] == 0) + $2--; + $2 *= 8; +%} +%typemap(jni) (const byte* scalar, size_t nbits) "jbyteArray" +%typemap(jtype) (const byte* scalar, size_t nbits) "byte[]" +%typemap(jstype) (const byte* scalar, size_t nbits) "java.math.BigInteger" +%typemap(javain) (const byte* scalar, size_t nbits) "$javainput.toByteArray()" + +#elif defined(SWIGJAVASCRIPT) && defined(SWIG_JAVASCRIPT_V8) + +%header %{ +#if V8_MAJOR_VERSION >= 8 +# define GetData() GetBackingStore()->Data() +#else +# define GetData() GetContents().Data() +#endif +%} + +%typemap(throws) BLST_ERROR %{ SWIG_V8_Raise(BLST_ERROR_str[$1]); SWIG_fail; %} + +%typemap(in) const byte* %{ + if ($input->IsArrayBufferView()) { + auto av = v8::Local::Cast($input); + auto buf = av->Buffer(); + $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); + } else if ($input->IsNull()) { + $1 = nullptr; + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting "); + } +%} +%typemap(argout) const byte* "" +%typemap(freearg) const byte* "" + +%apply const byte* { const byte[ANY] } +%typemap(in) const byte[ANY] %{ + if ($input->IsArrayBufferView()) { + auto av = v8::Local::Cast($input); + if (av->ByteLength() != $1_dim0) + SWIG_exception_fail(SWIG_IndexError, "in method '$symname', " + "expecting $1_dim0 bytes"); + auto buf = av->Buffer(); + $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting "); + } +%} + +// let users use JavaScript and as scalars +%typemap(in) (const byte* scalar, size_t nbits) %{ + if ($input->IsArrayBufferView()) { + auto av = v8::Local::Cast($input); + auto buf = av->Buffer(); + $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); + $2 = 8*av->ByteLength(); +#if V8_MAJOR_VERSION >=6 && V8_MINOR_VERSION >= 8 + } else if ($input->IsBigInt()) { + auto bi = v8::Local::Cast($input); + int sign, word_count = bi->WordCount(); + uint64_t* words = (uint64_t*)alloca($2 = word_count*sizeof(uint64_t)); + + bi->ToWordsArray(&sign, &word_count, words); + if (sign) + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting unsigned value"); + $1 = ($1_ltype)words; + $2 *= 8; + + const union { + long one; + char little; + } is_endian = { 1 }; + + if (!is_endian.little) { + byte* p = $1; + for (int i = 0; i < word_count; i++) { + uint64_t val = words[i]; + for (size_t j = 0; j < sizeof(val); j++, val >>= 8) + *p++ = (byte)val; + } + } +#endif + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting or "); + } +%} + +%typemap(in) (const byte *STRING, size_t LENGTH) %{ + if ($input->IsArrayBufferView()) { + auto av = v8::Local::Cast($input); + auto buf = av->Buffer(); + $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); + $2 = av->ByteLength(); + } else if ($input->IsString()) { + auto str = v8::Local::Cast($input); + $2 = SWIGV8_UTF8_LENGTH(str); + $1 = ($1_ltype)alloca($2); + SWIGV8_WRITE_UTF8(str, (char *)$1, $2); + } else if ($input->IsNull()) { + $1 = nullptr; + $2 = 0; + } else { + SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " + "expecting or "); + } +%} + +// return |this| +%typemap(out) SELF* OUTPUT %{ (void)$1; $result = args.Holder(); %} + +#elif defined(SWIGPERL) + +// let users use byte[] as scalars +%apply (const char *STRING, size_t LENGTH) { (const byte* scalar, size_t nbits) } +%typemap(check) (const byte* scalar, size_t nbits) %{ $2 *= 8; %} + +#ifdef __cplusplus 
+// return |this| +%typemap(out) SELF* OUTPUT %{ (void)$1; argvi++; %} +#endif + +#endif // SWIG + +// everybody has a way to bundle pointer and buffer size, but C:-( +%apply (const byte *STRING, size_t LENGTH) { + (const byte *msg, size_t msg_len), + (const byte *DST, size_t DST_len), + (const byte *aug, size_t aug_len), + (const byte *IKM, size_t IKM_len), + (const byte *info, size_t info_len), + (const byte *in, size_t len) +} + +// some sorcery to return byte[] from serialization methods +%typemap(in, numinputs=0) byte out[ANY] (byte temp[$1_dim0]) %{ $1 = temp; %} +%typemap(argout) byte out[ANY] { +#if defined(SWIGPYTHON) + PyObject *obj = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); + $result = ($result==NULL) ? obj + : SWIG_Python_AppendOutput($result, obj); +#elif defined(SWIGJAVA) + $result = JCALL(NewByteArray, $1_dim0); + if ($result != $null) { + JCALL(SetByteArrayRegion, $result, 0, $1_dim0, (const jbyte *)$1); + } +#elif defined(SWIGJAVASCRIPT) && defined(SWIG_JAVASCRIPT_V8) + auto ab = v8::ArrayBuffer::New(v8::Isolate::GetCurrent(), $1_dim0); + memcpy(ab->GetData(), $1, $1_dim0); + $result = v8::Uint8Array::New(ab, 0, $1_dim0); +#elif defined(SWIGPERL) + $result = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); argvi++; +#else // TODO: figure out more language-specific ways to return multi-values... + if ($result == NULL) + $result = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); +#endif +} +%typemap(freearg) byte out[ANY] "" +#ifdef SWIGJAVA +%typemap(jni) byte out[ANY] "jbyteArray" +%typemap(jtype) byte out[ANY] "byte[]" +%typemap(jstype) byte out[ANY] "byte[]" +%typemap(javaout) byte out[ANY] { return $jnicall; } +#endif +%apply byte out[ANY] { + void to_bendian, void blst_bendian_from_scalar, + void to_lendian, void blst_lendian_from_scalar, + void serialize, void blst_p1_serialize, void blst_p1_affine_serialize, + void blst_p2_serialize, void blst_p2_affine_serialize, + void compress, void blst_p1_compress, void blst_p1_affine_compress, + void blst_p2_compress, void blst_p2_affine_compress, + void blst_sk_to_pk2_in_g1, void blst_sign_pk2_in_g1, + void blst_sk_to_pk2_in_g2, void blst_sign_pk2_in_g2 +} + +#ifdef __cplusplus +%apply const std::string& { const std::string* } + +#pragma SWIG nowarn=509,516 + +#if !defined(SWIGPYTHON) +%ignore P1_Affines; +%ignore P2_Affines; +#endif + +%ignore nullptr; +%ignore None; +%ignore C_bytes; +%catches(BLST_ERROR) P1(const byte* in, size_t len); +%catches(BLST_ERROR) P1_Affine(const byte* in, size_t len); +%catches(BLST_ERROR) aggregate(const P1_Affine& in); + +%catches(BLST_ERROR) P2(const byte* in, size_t len); +%catches(BLST_ERROR) P2_Affine(const byte* in, size_t len); +%catches(BLST_ERROR) aggregate(const P2_Affine& in); + +%catches(BLST_ERROR) blst::Scalar::add; +%catches(BLST_ERROR) blst::Scalar::sub; +%catches(BLST_ERROR) blst::Scalar::mul; + +// methods returning |this| +%apply SELF* OUTPUT { + blst::P1* sign_with, blst::P2* sign_with, + blst::P1* hash_to, blst::P2* hash_to, + blst::P1* encode_to, blst::P2* encode_to, + blst::P1* mult, blst::P2* mult, + blst::P1* cneg, blst::P2* cneg, + blst::P1* neg, blst::P2* neg, + blst::P1* add, blst::P2* add, + blst::P1* dbl, blst::P2* dbl, + blst::PT* mul, blst::PT* sqr, + blst::PT* final_exp, + blst::Scalar* from_bendian, + blst::Scalar* from_lendian, + blst::Scalar* add, + blst::Scalar* sub, + blst::Scalar* mul, + blst::Scalar* inverse +} + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + 
BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +%include "blst.hpp" + +extern const blst::P1_Affine BLS12_381_G1; +extern const blst::P1_Affine BLS12_381_NEG_G1; +extern const blst::P2_Affine BLS12_381_G2; +extern const blst::P2_Affine BLS12_381_NEG_G2; + +#else +%ignore blst_fr; +%ignore blst_fp; +%ignore blst_fp2; +%ignore blst_fp6; +%ignore blst_scalar_from_uint32; +%ignore blst_scalar_from_uint64; +%ignore blst_uint32_from_scalar; +%ignore blst_uint64_from_scalar; +%ignore blst_pairing_init; +%ignore blst_pairing_get_dst; + +%include "blst.h" +%include "blst_aux.h" +%extend blst_pairing { + blst_pairing(bool hash_or_encode, const byte *DST DEFNULL, + size_t DST_len DEFNULL) + { void *ret = malloc(blst_pairing_sizeof()); + if (DST_len != 0) { + void *dst = malloc(DST_len); + memcpy(dst, DST, DST_len); + blst_pairing_init(ret, hash_or_encode, dst, DST_len); + } else { + blst_pairing_init(ret, hash_or_encode, NULL, 0); + } + return ret; + } + ~blst_pairing() + { void *dst = (void *)blst_pairing_get_dst($self); + if (dst != NULL) free(dst); + free($self); + } +} +#endif + +%begin %{ +#ifdef __cplusplus +# include +# include "blst.hpp" +using namespace blst; +#else +# include "blst.h" +#endif + +static const char *const BLST_ERROR_str [] = { + "BLST_ERROR: success", + "BLST_ERROR: bad point encoding", + "BLST_ERROR: point is not on curve", + "BLST_ERROR: point is not in group", + "BLST_ERROR: context type mismatch", + "BLST_ERROR: verify failed", + "BLST_ERROR: public key is infinite", +}; + +#define SWIG_PYTHON_STRICT_BYTE_CHAR + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include +#elif defined(_WIN32) +# include +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif +%} + +#if defined(SWIGPYTHON) || defined(SWIGPERL) +%include "cdata.i" +#endif + +#if SWIG_VERSION < 0x040100 && defined(SWIGJAVASCRIPT) +%wrapper %{ +#ifdef NODE_MODULE +# undef NODE_MODULE +# define NODE_MODULE NODE_MODULE_CONTEXT_AWARE +// actually error-prone and not exactly suitable for production, but +// sufficient for development purposes till SWIG 4.1.0 is released... +#endif +%} +#endif + +#if SWIG_VERSION < 0x040100 && defined(SWIGJAVA) +/* SWIG versions prior 4.1 were crossing the MinGW's ways on the path + * to JNI 'jlong' type */ +%begin %{ +#if defined(__MINGW32__) && defined(__int64) +# undef __int64 +#endif +%} +#endif diff --git a/src/blst/bindings/blst_aux.h b/src/blst/bindings/blst_aux.h new file mode 100644 index 0000000000..3de0850e33 --- /dev/null +++ b/src/blst/bindings/blst_aux.h @@ -0,0 +1,117 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. 
+ */ + +void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. + */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +#ifdef __BLST_RUST_BINDGEN__ +typedef struct {} blst_uniq; +#else +typedef struct blst_opaque blst_uniq; +#endif + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, 
const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(void); +size_t blst_p1_affine_sizeof(void); +size_t blst_p2_sizeof(void); +size_t blst_p2_affine_sizeof(void); +size_t blst_fp12_sizeof(void); + +/* + * Single-shot SHA-256 hash function. + */ +void blst_sha256(byte out[32], const byte *msg, size_t msg_len); +#endif diff --git a/src/blst/bindings/c#/poc.cs b/src/blst/bindings/c#/poc.cs new file mode 100644 index 0000000000..6c4b75e688 --- /dev/null +++ b/src/blst/bindings/c#/poc.cs @@ -0,0 +1,79 @@ +using System; +using System.Text; +using supranational; + +class PoC { + private static void Main(string[] args) + { + var msg = Encoding.UTF8.GetBytes("assertion"); + var DST = "MY-DST"; + + var SK = new blst.SecretKey(); + SK.keygen(Encoding.UTF8.GetBytes(new string('*', 32))); + + // generate public key and serialize it... + var pk_for_wire = new blst.P1(SK).serialize(); + + // sign |msg| and serialize the signature... + var sig_for_wire = new blst.P2().hash_to(msg, DST, pk_for_wire) + .sign_with(SK) + .serialize(); + + // now on "receiving" side, start with deserialization... + var _sig = new blst.P2_Affine(sig_for_wire); + var _pk = new blst.P1_Affine(pk_for_wire); + if (!_pk.in_group()) + throw new blst.Exception(blst.ERROR.POINT_NOT_IN_GROUP); + var ctx = new blst.Pairing(true, DST); + var err = ctx.aggregate(_pk, _sig, msg, pk_for_wire); + if (err != blst.ERROR.SUCCESS) + throw new blst.Exception(err); + ctx.commit(); + if (!ctx.finalverify()) + throw new blst.Exception(blst.ERROR.VERIFY_FAIL); + Console.WriteLine("OK"); + + // exercise .as_fp12 by performing equivalent of ctx.finalverify above + var C1 = new blst.PT(_sig); + var C2 = ctx.as_fp12(); + if (!blst.PT.finalverify(C1, C2)) + throw new blst.Exception(blst.ERROR.VERIFY_FAIL); + + // test integers as scalar multiplicands + var p = blst.G1(); + var q = p.dup().dbl().dbl().add(p); + if (!p.mult(5).is_equal(q)) + throw new ApplicationException("disaster"); + if (!blst.G1().mult(-5).is_equal(q.neg())) + throw new ApplicationException("disaster"); + + // low-order sanity check + var p11 = new blst.P1(fromHexString("80803f0d09fec09a95f2ee7495323c15c162270c7cceaffa8566e941c66bcf206e72955d58b3b32e564de3209d672ca5")); + if (p11.in_group()) + throw new ApplicationException("disaster"); + if (!p11.mult(11).is_inf()) + throw new ApplicationException("disaster"); + } + + private static int fromHexChar(char c) + { + if (c>='0' && c<='9') return c - '0'; + else if (c>='a' && c<='f') return c - 'a' + 10; + else if (c>='A' && c<='F') return c - 'A' + 10; + throw new ArgumentOutOfRangeException("non-hex character"); + } + + private static byte[] fromHexString(string str) + { + if (str.Length%2 != 0) + throw new ArgumentException("odd number of characters in hex string"); + + char[] hex = str.ToCharArray(); + byte[] ret = new byte[hex.Length/2]; + + for (int i=0; i + + + Exe + net6.0 + + + diff --git a/src/blst/bindings/c#/run.me b/src/blst/bindings/c#/run.me new file mode 100755 index 0000000000..a7a1f42bb5 --- /dev/null +++ b/src/blst/bindings/c#/run.me @@ -0,0 +1,825 @@ +#!/usr/bin/env python3 +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import os +import re +import sys +import glob +import subprocess + +top = """ +using System; +using System.Text; +using System.Numerics; +using System.Runtime.InteropServices; +using size_t = System.UIntPtr; + +#if NET5_0_OR_GREATER +using System.Runtime.Loader; +using System.Reflection; +using System.IO; +#endif + +namespace supranational { public static class blst { + +#if NET5_0_OR_GREATER +private static readonly string dll; + +static blst() +{ + if (String.IsNullOrEmpty(dll)) { + var name = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "blst.dll" + : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "libblst.dll.dylib" + : "libblst.dll.so"; + + var dir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + var arch = RuntimeInformation.ProcessArchitecture switch { + Architecture.X64 => "x64", + Architecture.Arm64 => "arm64", + _ => "unsupported" + }; + +#if NET8_0_OR_GREATER + // RuntimeInformation.RuntimeIdentifier changed between .NET 7 and 8 + // and only aligns to the nuget layout in 8+ + var rid = RuntimeInformation.RuntimeIdentifier; +#else + // Mimic pre-8 RuntimeInformation.RuntimeIdentifier as + // "win-x64", "linux-x64", "linux-arm64", "osx-x64", etc. + var os = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "win" + : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "osx" + : RuntimeInformation.IsOSPlatform(OSPlatform.FreeBSD) ? "freebsd" + : "linux"; + var rid = $"{os}-{arch}"; +#endif + + // first look for the file in the standard locations for a nuget installed native lib + dll = Path.Combine(dir, "runtimes", rid, "native", name); + + if (!File.Exists(dll)) + dll = Path.Combine(dir, arch, name); // try the original non-standard location + + if (!File.Exists(dll)) + dll = Path.Combine(Environment.CurrentDirectory, name); + + if (File.Exists(dll)) { + AssemblyLoadContext.Default.ResolvingUnmanagedDll += (asm, needs) => + (needs == "blst.dll" ? 
NativeLibrary.Load(dll) : IntPtr.Zero); + } + } +} +#endif + +public enum ERROR { + SUCCESS = 0, + BAD_ENCODING, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + AGGR_TYPE_MISMATCH, + VERIFY_FAIL, + PK_IS_INFINITY, + BAD_SCALAR, +} + +public class Exception : ApplicationException { + private readonly ERROR code; + + public Exception(ERROR err) { code = err; } + public override string Message + { get + { switch(code) { + case ERROR.BAD_ENCODING: return "bad encoding"; + case ERROR.POINT_NOT_ON_CURVE: return "point not on curve"; + case ERROR.POINT_NOT_IN_GROUP: return "point not in group"; + case ERROR.AGGR_TYPE_MISMATCH: return "aggregate type mismatch"; + case ERROR.VERIFY_FAIL: return "verify failure"; + case ERROR.PK_IS_INFINITY: return "public key is infinity"; + case ERROR.BAD_SCALAR: return "bad scalar"; + default: return null; + } + } + } +} + +public enum ByteOrder { + BigEndian, + LittleEndian, +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v3([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v4_5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] salt, size_t salt_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] salt, size_t salt_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_derive_master_eip2333([Out] byte[] key, + [In] byte[] IKM, size_t IKM_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_derive_child_eip2333([Out] byte[] key, + [In] byte[] master, uint child_index); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_bendian([Out] byte[] ret, [In] byte[] key); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_bendian_from_scalar([Out] byte[] ret, [In] byte[] key); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_check([In] byte[] key); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_lendian([Out] byte[] key, [In] byte[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_lendian_from_scalar([Out] byte[] key, [In] byte[] inp); + +public struct SecretKey { + internal byte[] key; + + //public SecretKey() { key = new byte[32]; } + public SecretKey(byte[] IKM, string info) + { key = new byte[32]; keygen(IKM, info); } + public SecretKey(byte[] inp, ByteOrder order=ByteOrder.BigEndian) + { key = new byte[32]; + switch(order) { + case ByteOrder.BigEndian: from_bendian(inp); break; + case ByteOrder.LittleEndian: from_lendian(inp); break; + } + } + + public void keygen(byte[] IKM, string info="") + { if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen(key, IKM, (size_t)IKM.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v3(byte[] IKM, string info="") + 
{ if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v3(key, IKM, (size_t)IKM.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v4_5(byte[] IKM, string salt, string info="") + { if (key == null) key = new byte[32]; + byte[] salt_bytes = Encoding.UTF8.GetBytes(salt); + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v4_5(key, IKM, (size_t)IKM.Length, + salt_bytes, (size_t)salt_bytes.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v5(byte[] IKM, byte[] salt, string info="") + { if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v5(key, IKM, (size_t)IKM.Length, + salt, (size_t)salt.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v5(byte[] IKM, string salt, string info="") + { keygen_v5(IKM, Encoding.UTF8.GetBytes(salt), info); } + public void derive_master_eip2333(byte[] IKM) + { if (key == null) key = new byte[32]; + blst_derive_master_eip2333(key, IKM, (size_t)IKM.Length); + } + public SecretKey(SecretKey master, uint child_index) + { key = new byte[32]; + blst_derive_child_eip2333(key, master.key, child_index); + } + + public void from_bendian(byte[] inp) + { if (inp.Length != 32) + throw new Exception(ERROR.BAD_ENCODING); + if (key == null) key = new byte[32]; + blst_scalar_from_bendian(key, inp); + if (!blst_sk_check(key)) + throw new Exception(ERROR.BAD_ENCODING); + } + public void from_lendian(byte[] inp) + { if (inp.Length != 32) + throw new Exception(ERROR.BAD_ENCODING); + if (key == null) key = new byte[32]; + blst_scalar_from_lendian(key, inp); + if (!blst_sk_check(key)) + throw new Exception(ERROR.BAD_ENCODING); + } + + public byte[] to_bendian() + { byte[] ret = new byte[32]; + blst_bendian_from_scalar(ret, key); + return ret; + } + public byte[] to_lendian() + { byte[] ret = new byte[32]; + blst_lendian_from_scalar(ret, key); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_be_bytes([Out] byte[] ret, [In] byte[] inp, + size_t inp_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_le_bytes([Out] byte[] ret, [In] byte[] inp, + size_t inp_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_add_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_sub_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_mul_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_sk_inverse([Out] byte[] ret, [In] byte[] a); + +public struct Scalar { + internal byte[] val; + + //public Scalar() { val = new byte[32]; } + public Scalar(byte[] inp, ByteOrder order=ByteOrder.BigEndian) + { val = new byte[32]; + switch(order) { + case ByteOrder.BigEndian: from_bendian(inp); break; + case ByteOrder.LittleEndian: from_lendian(inp); break; + } + } + private Scalar(bool _) { val = new byte[32]; } + private Scalar(Scalar orig) { val = (byte[])orig.val.Clone(); } + + public Scalar dup() { return new Scalar(this); } + + public void from_bendian(byte[] inp) + { if (val == null) val = new byte[32]; + 
blst_scalar_from_be_bytes(val, inp, (size_t)inp.Length); + } + public void from_lendian(byte[] inp) + { if (val == null) val = new byte[32]; + blst_scalar_from_le_bytes(val, inp, (size_t)inp.Length); + } + + public byte[] to_bendian() + { byte[] ret = new byte[32]; + blst_bendian_from_scalar(ret, val); + return ret; + } + public byte[] to_lendian() + { byte[] ret = new byte[32]; + blst_lendian_from_scalar(ret, val); + return ret; + } + + public Scalar add(SecretKey a) + { if (!blst_sk_add_n_check(val, val, a.key)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar add(Scalar a) + { if (!blst_sk_add_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar sub(Scalar a) + { if (!blst_sk_sub_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar mul(Scalar a) + { if (!blst_sk_mul_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar inverse() + { blst_sk_inverse(val, val); return this; } + + public static Scalar operator+(Scalar a, Scalar b) + { return a.dup().add(b); } + public static Scalar operator-(Scalar a, Scalar b) + { return a.dup().sub(b); } + public static Scalar operator*(Scalar a, Scalar b) + { return a.dup().mul(b); } + public static Scalar operator/(Scalar a, Scalar b) + { return b.dup().inverse().mul(a); } +} + +private const int P1_COMPRESSED_SZ = 384/8; +private const int P2_COMPRESSED_SZ = 2*P1_COMPRESSED_SZ; +""" +middle = """ +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p1_affine_sizeof(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_p1_deserialize([Out] long[] ret, [In] byte[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_affine_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_affine_compress([Out] byte[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_to_affine([Out] long[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_in_g1([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_p1_generator(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_core_verify_pk_in_g2([In] long[] pk, [In] long[] sig, + bool hash_or_encode, + [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); + +public struct P1_Affine { + internal readonly long[] point; + + private static readonly int sz = (int)blst_p1_affine_sizeof()/sizeof(long); + + //public P1_Affine() { point = new long[sz]; } + private P1_Affine(bool _) { point = new long[sz]; } + private P1_Affine(P1_Affine p) { point = (long[])p.point.Clone(); } + + public P1_Affine(byte[] inp) : this(true) + { int 
len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ + : 2*P1_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p1_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + } + public P1_Affine(P1 jacobian) : this(true) + { blst_p1_to_affine(point, jacobian.point); } + + public P1_Affine dup() { return new P1_Affine(this); } + public P1 to_jacobian() { return new P1(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; + blst_p1_affine_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P1_COMPRESSED_SZ]; + blst_p1_affine_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p1_affine_on_curve(point); } + public bool in_group() { return blst_p1_affine_in_g1(point); } + public bool is_inf() { return blst_p1_affine_is_inf(point); } + public bool is_equal(P1_Affine p) + { return blst_p1_affine_is_equal(point, p.point); } + + ERROR core_verify(P2_Affine pk, bool hash_or_encode, + byte[] msg, string DST = "", byte[] aug = null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + return blst_core_verify_pk_in_g2(pk.point, point, + hash_or_encode, + msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + + public static P1_Affine generator() + { var ret = new P1_Affine(true); + Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p1_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_compress([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_from_affine([Out] long[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_in_g1([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_sk_to_pk_in_g1([Out] long[] ret, [In] byte[] SK); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_encode_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_hash_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_sign_pk_in_g2([Out] long[] ret, [In] long[] hash, [In] byte[] SK); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_mult([Out] long[] ret, [In] long[] a, + [In] byte[] scalar, size_t nbits); +[DllImport("blst.dll", CallingConvention 
= CallingConvention.Cdecl)] +static extern void blst_p1_cneg([Out] long[] ret, bool cbit); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_add_or_double_affine([Out] long[] ret, [In] long[] a, + [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_double([Out] long[] ret, [In] long[] a); + +public struct P1 { + internal long[] point; + + private static readonly int sz = (int)blst_p1_sizeof()/sizeof(long); + + //public P1() { point = new long[sz]; } + private P1(bool _) { point = new long[sz]; } + private P1(P1 p) { point = (long[])p.point.Clone(); } + private long[] self() + { if (point==null) { point = new long[sz]; } return point; } + + public P1(SecretKey sk) : this(true) + { blst_sk_to_pk_in_g1(point, sk.key); } + public P1(byte[] inp) : this(true) + { int len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ + : 2*P1_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p1_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + blst_p1_from_affine(point, point); + } + public P1(P1_Affine affine) : this(true) + { blst_p1_from_affine(point, affine.point); } + + public P1 dup() { return new P1(this); } + public P1_Affine to_affine() { return new P1_Affine(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; + blst_p1_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P1_COMPRESSED_SZ]; + blst_p1_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p1_on_curve(point); } + public bool in_group() { return blst_p1_in_g1(point); } + public bool is_inf() { return blst_p1_is_inf(point); } + public bool is_equal(P1 p) { return blst_p1_is_equal(point, p.point); } + + public P1 hash_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_hash_to_g1(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + return this; + } + public P1 encode_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_encode_to_g1(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? 
aug.Length : 0)); + return this; + } + + public P1 sign_with(SecretKey sk) + { blst_sign_pk_in_g2(point, point, sk.key); return this; } + public P1 sign_with(Scalar scalar) + { blst_sign_pk_in_g2(point, point, scalar.val); return this; } + + public void aggregate(P1_Affine inp) + { if (blst_p1_affine_in_g1(inp.point)) + blst_p1_add_or_double_affine(point, point, inp.point); + else + throw new Exception(ERROR.POINT_NOT_IN_GROUP); + } + + public P1 mult(byte[] scalar) + { blst_p1_mult(point, point, scalar, (size_t)(scalar.Length*8)); + return this; + } + public P1 mult(Scalar scalar) + { blst_p1_mult(point, point, scalar.val, (size_t)255); + return this; + } + public P1 mult(BigInteger scalar) + { byte[] val; + if (scalar.Sign < 0) { + val = BigInteger.Negate(scalar).ToByteArray(); + blst_p1_cneg(point, true); + } else { + val = scalar.ToByteArray(); + } + int len = val.Length; + if (val[len-1]==0) len--; + blst_p1_mult(point, point, val, (size_t)(len*8)); + return this; + } + public P1 cneg(bool flag) { blst_p1_cneg(point, flag); return this; } + public P1 neg() { blst_p1_cneg(point, true); return this; } + public P1 add(P1 a) + { blst_p1_add_or_double(point, point, a.point); return this; } + public P1 add(P1_Affine a) + { blst_p1_add_or_double_affine(point, point, a.point); return this; } + public P1 dbl() + { blst_p1_double(point, point); return this; } + + public static P1 generator() + { var ret = new P1(true); + Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +public static P1 G1() { return P1.generator(); } + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_aggregated_in_g1([Out] long[] fp12, [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_aggregate_pk_in_g1([In, Out] long[] fp12, + [In] long[] pk, [In] long[] sig, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g1([In, Out] long[] fp12, + [In] long[] pk, [In] long[] sig, + [In] byte[] scalar, size_t nbits, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); +""" +bottom = """ +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_fp12_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_miller_loop([Out] long[] fp12, [In] long[] q, + [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_is_one([In] long[] fp12); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_is_equal([In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_fp12_sqr([Out] long[] ret, [In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_fp12_mul([Out] long[] ret, [In] long[] a, + [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_final_exp([Out] long[] ret, [In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_finalverify([In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_fp12_one(); +[DllImport("blst.dll", 
CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_in_group([In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_bendian_from_fp12([Out] byte[] ret, [In] long[] a); + +public struct PT { + internal readonly long[] fp12; + + private static readonly int sz = (int)blst_fp12_sizeof()/sizeof(long); + + internal PT(bool _) { fp12 = new long[sz]; } + private PT(PT orig) { fp12 = (long[])orig.fp12.Clone(); } + + public PT(P1_Affine p) : this(true) + { blst_aggregated_in_g1(fp12, p.point); } + public PT(P1 p) : this(true) + { blst_aggregated_in_g1(fp12, (new P1_Affine(p)).point); } + public PT(P2_Affine q) : this(true) + { blst_aggregated_in_g2(fp12, q.point); } + public PT(P2 q) : this(true) + { blst_aggregated_in_g2(fp12, (new P2_Affine(q)).point); } + public PT(P2_Affine q, P1_Affine p) : this(true) + { blst_miller_loop(fp12, q.point, p.point); } + public PT(P1_Affine p, P2_Affine q) : this(q, p) {} + public PT(P2 q, P1 p) : this(true) + { blst_miller_loop(fp12, (new P2_Affine(q)).point, + (new P1_Affine(p)).point); + } + public PT(P1 p, P2 q) : this(q, p) {} + + public PT dup() { return new PT(this); } + public bool is_one() { return blst_fp12_is_one(fp12); } + public bool is_equal(PT p) + { return blst_fp12_is_equal(fp12, p.fp12); } + public PT sqr() { blst_fp12_sqr(fp12, fp12); return this; } + public PT mul(PT p) { blst_fp12_mul(fp12, fp12, p.fp12); return this; } + public PT final_exp() { blst_final_exp(fp12, fp12); return this; } + public bool in_group() { return blst_fp12_in_group(fp12); } + public byte[] to_bendian() + { byte[] ret = new byte[12*P1_COMPRESSED_SZ]; + blst_bendian_from_fp12(ret, fp12); + return ret; + } + + public static bool finalverify(PT gt1, PT gt2) + { return blst_fp12_finalverify(gt1.fp12, gt2.fp12); } + + public static PT one() + { var ret = new PT(true); + Marshal.Copy(blst_fp12_one(), ret.fp12, 0, ret.fp12.Length); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_pairing_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_pairing_init([In, Out] long[] ctx, bool hash_or_encode, + [In] ref long dst, size_t dst_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_pairing_commit([In, Out] long[] ctx); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_merge([In, Out] long[] ctx, [In] long[] ctx1); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_pairing_finalverify([In] long[] ctx, [In] long[] sig); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_pairing_raw_aggregate([In, Out] long[] ctx, [In] long[] q, + [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_pairing_as_fp12([In] long[] ctx); + +public struct Pairing { + private readonly long[] ctx; + + private static readonly int sz = (int)blst_pairing_sizeof()/sizeof(long); + + public Pairing(bool hash_or_encode=false, string DST="") + { + byte[] dst = Encoding.UTF8.GetBytes(DST); + int dst_len = dst.Length; + int add_len = dst_len!=0 ? 
(dst_len+sizeof(long)-1)/sizeof(long) : 1; + Array.Resize(ref dst, add_len*sizeof(long)); + + ctx = new long[sz+add_len]; + + for (int i=0; i<add_len; i++) + ctx[sz+i] = BitConverter.ToInt64(dst, i*sizeof(long)); + + GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); + blst_pairing_init(ctx, hash_or_encode, ref ctx[sz], (size_t)dst_len); + h.Free(); + } + + public ERROR aggregate(P1_Affine pk, Nullable<P2_Affine> sig, + byte[] msg, byte[] aug=null) + { return blst_pairing_aggregate_pk_in_g1(ctx, pk.point, + sig.HasValue ? sig.Value.point : null, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR aggregate(P2_Affine pk, Nullable<P1_Affine> sig, + byte[] msg, byte[] aug=null) + { return blst_pairing_aggregate_pk_in_g2(ctx, pk.point, + sig.HasValue ? sig.Value.point : null, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR mul_n_aggregate(P2_Affine pk, P1_Affine sig, + byte[] scalar, int nbits, + byte[] msg, byte[] aug=null) + { return blst_pairing_mul_n_aggregate_pk_in_g2(ctx, pk.point, sig.point, + scalar, (size_t)nbits, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR mul_n_aggregate(P1_Affine pk, P2_Affine sig, + byte[] scalar, int nbits, + byte[] msg, byte[] aug=null) + { return blst_pairing_mul_n_aggregate_pk_in_g1(ctx, pk.point, sig.point, + scalar, (size_t)nbits, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + + public void commit() { blst_pairing_commit(ctx); } + public void merge(Pairing a) + { var err = blst_pairing_merge(ctx, a.ctx); + if (err != ERROR.SUCCESS) + throw new Exception(err); + } + public bool finalverify(PT sig=new PT()) + { return blst_pairing_finalverify(ctx, sig.fp12); } + + public void raw_aggregate(P2_Affine q, P1_Affine p) + { blst_pairing_raw_aggregate(ctx, q.point, p.point); } + public void raw_aggregate(P1_Affine p, P2_Affine q) + { raw_aggregate(q, p); } + public void raw_aggregate(P2 q, P1 p) + { blst_pairing_raw_aggregate(ctx, (new P2_Affine(q)).point, + (new P1_Affine(p)).point); + } + public void raw_aggregate(P1 p, P2 q) + { raw_aggregate(q, p); } + public PT as_fp12() + { var ret = new PT(true); + GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); + Marshal.Copy(blst_pairing_as_fp12(ctx), ret.fp12, 0, ret.fp12.Length); + h.Free(); + return ret; + } +} +}}""" + +here = re.split(r'[/\\](?=[^/\\]*$)', sys.argv[0]) +if len(here) > 1: + os.chdir(here[0]) + + +def xchg_1vs2(matchobj): + if matchobj.group(2) == '1': + return matchobj.group(1) + '2' + else: + return matchobj.group(1) + '1' + + +def newer(files): + if len(files) == 1: + return True + rh = files[-1] + if not os.path.exists(rh): + return True + for lh in files[:-1]: + if os.stat(lh).st_ctime > os.stat(rh).st_ctime: + return True + return False + + +fname = "supranational.blst.cs" +if newer([here[-1], fname]): + fd = open(fname, "w") + print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd) + print("// DO NOT EDIT THIS FILE!!!", file=fd) + print("// The file is auto-generated by " + here[-1], file=fd) + print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd) + print("\n\n", file=fd) + print(top, file=fd) + print(middle, file=fd) + print(re.sub(r'((? "x64", + Architecture.Arm64 => "arm64", + _ => "unsupported" + }; + +#if NET8_0_OR_GREATER + // RuntimeInformation.RuntimeIdentifier changed between .NET 7 and 8 + // and only aligns to the nuget layout in 8+ + var rid = RuntimeInformation.RuntimeIdentifier; +#else + // Mimic pre-8 RuntimeInformation.RuntimeIdentifier as + // "win-x64", "linux-x64", "linux-arm64", "osx-x64", etc. + var os = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "win" + : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ?
"osx" + : RuntimeInformation.IsOSPlatform(OSPlatform.FreeBSD) ? "freebsd" + : "linux"; + var rid = $"{os}-{arch}"; +#endif + + // first look for the file in the standard locations for a nuget installed native lib + dll = Path.Combine(dir, "runtimes", rid, "native", name); + + if (!File.Exists(dll)) + dll = Path.Combine(dir, arch, name); // try the original non-standard location + + if (!File.Exists(dll)) + dll = Path.Combine(Environment.CurrentDirectory, name); + + if (File.Exists(dll)) { + AssemblyLoadContext.Default.ResolvingUnmanagedDll += (asm, needs) => + (needs == "blst.dll" ? NativeLibrary.Load(dll) : IntPtr.Zero); + } + } +} +#endif + +public enum ERROR { + SUCCESS = 0, + BAD_ENCODING, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + AGGR_TYPE_MISMATCH, + VERIFY_FAIL, + PK_IS_INFINITY, + BAD_SCALAR, +} + +public class Exception : ApplicationException { + private readonly ERROR code; + + public Exception(ERROR err) { code = err; } + public override string Message + { get + { switch(code) { + case ERROR.BAD_ENCODING: return "bad encoding"; + case ERROR.POINT_NOT_ON_CURVE: return "point not on curve"; + case ERROR.POINT_NOT_IN_GROUP: return "point not in group"; + case ERROR.AGGR_TYPE_MISMATCH: return "aggregate type mismatch"; + case ERROR.VERIFY_FAIL: return "verify failure"; + case ERROR.PK_IS_INFINITY: return "public key is infinity"; + case ERROR.BAD_SCALAR: return "bad scalar"; + default: return null; + } + } + } +} + +public enum ByteOrder { + BigEndian, + LittleEndian, +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v3([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v4_5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] salt, size_t salt_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_keygen_v5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, + [In] byte[] salt, size_t salt_len, + [In] byte[] info, size_t info_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_derive_master_eip2333([Out] byte[] key, + [In] byte[] IKM, size_t IKM_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_derive_child_eip2333([Out] byte[] key, + [In] byte[] master, uint child_index); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_bendian([Out] byte[] ret, [In] byte[] key); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_bendian_from_scalar([Out] byte[] ret, [In] byte[] key); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_check([In] byte[] key); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_lendian([Out] byte[] key, [In] byte[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_lendian_from_scalar([Out] byte[] key, [In] byte[] inp); + +public struct SecretKey { + internal byte[] key; + + //public SecretKey() { key = new byte[32]; } + public 
SecretKey(byte[] IKM, string info) + { key = new byte[32]; keygen(IKM, info); } + public SecretKey(byte[] inp, ByteOrder order=ByteOrder.BigEndian) + { key = new byte[32]; + switch(order) { + case ByteOrder.BigEndian: from_bendian(inp); break; + case ByteOrder.LittleEndian: from_lendian(inp); break; + } + } + + public void keygen(byte[] IKM, string info="") + { if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen(key, IKM, (size_t)IKM.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v3(byte[] IKM, string info="") + { if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v3(key, IKM, (size_t)IKM.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v4_5(byte[] IKM, string salt, string info="") + { if (key == null) key = new byte[32]; + byte[] salt_bytes = Encoding.UTF8.GetBytes(salt); + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v4_5(key, IKM, (size_t)IKM.Length, + salt_bytes, (size_t)salt_bytes.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v5(byte[] IKM, byte[] salt, string info="") + { if (key == null) key = new byte[32]; + byte[] info_bytes = Encoding.UTF8.GetBytes(info); + blst_keygen_v5(key, IKM, (size_t)IKM.Length, + salt, (size_t)salt.Length, + info_bytes, (size_t)info_bytes.Length); + } + public void keygen_v5(byte[] IKM, string salt, string info="") + { keygen_v5(IKM, Encoding.UTF8.GetBytes(salt), info); } + public void derive_master_eip2333(byte[] IKM) + { if (key == null) key = new byte[32]; + blst_derive_master_eip2333(key, IKM, (size_t)IKM.Length); + } + public SecretKey(SecretKey master, uint child_index) + { key = new byte[32]; + blst_derive_child_eip2333(key, master.key, child_index); + } + + public void from_bendian(byte[] inp) + { if (inp.Length != 32) + throw new Exception(ERROR.BAD_ENCODING); + if (key == null) key = new byte[32]; + blst_scalar_from_bendian(key, inp); + if (!blst_sk_check(key)) + throw new Exception(ERROR.BAD_ENCODING); + } + public void from_lendian(byte[] inp) + { if (inp.Length != 32) + throw new Exception(ERROR.BAD_ENCODING); + if (key == null) key = new byte[32]; + blst_scalar_from_lendian(key, inp); + if (!blst_sk_check(key)) + throw new Exception(ERROR.BAD_ENCODING); + } + + public byte[] to_bendian() + { byte[] ret = new byte[32]; + blst_bendian_from_scalar(ret, key); + return ret; + } + public byte[] to_lendian() + { byte[] ret = new byte[32]; + blst_lendian_from_scalar(ret, key); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_be_bytes([Out] byte[] ret, [In] byte[] inp, + size_t inp_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_scalar_from_le_bytes([Out] byte[] ret, [In] byte[] inp, + size_t inp_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_add_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_sub_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_sk_mul_n_check([Out] byte[] ret, [In] byte[] a, + [In] byte[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_sk_inverse([Out] byte[] ret, [In] byte[] a); + 
+public struct Scalar { + internal byte[] val; + + //public Scalar() { val = new byte[32]; } + public Scalar(byte[] inp, ByteOrder order=ByteOrder.BigEndian) + { val = new byte[32]; + switch(order) { + case ByteOrder.BigEndian: from_bendian(inp); break; + case ByteOrder.LittleEndian: from_lendian(inp); break; + } + } + private Scalar(bool _) { val = new byte[32]; } + private Scalar(Scalar orig) { val = (byte[])orig.val.Clone(); } + + public Scalar dup() { return new Scalar(this); } + + public void from_bendian(byte[] inp) + { if (val == null) val = new byte[32]; + blst_scalar_from_be_bytes(val, inp, (size_t)inp.Length); + } + public void from_lendian(byte[] inp) + { if (val == null) val = new byte[32]; + blst_scalar_from_le_bytes(val, inp, (size_t)inp.Length); + } + + public byte[] to_bendian() + { byte[] ret = new byte[32]; + blst_bendian_from_scalar(ret, val); + return ret; + } + public byte[] to_lendian() + { byte[] ret = new byte[32]; + blst_lendian_from_scalar(ret, val); + return ret; + } + + public Scalar add(SecretKey a) + { if (!blst_sk_add_n_check(val, val, a.key)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar add(Scalar a) + { if (!blst_sk_add_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar sub(Scalar a) + { if (!blst_sk_sub_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar mul(Scalar a) + { if (!blst_sk_mul_n_check(val, val, a.val)) + throw new Exception(ERROR.BAD_SCALAR); + return this; + } + public Scalar inverse() + { blst_sk_inverse(val, val); return this; } + + public static Scalar operator+(Scalar a, Scalar b) + { return a.dup().add(b); } + public static Scalar operator-(Scalar a, Scalar b) + { return a.dup().sub(b); } + public static Scalar operator*(Scalar a, Scalar b) + { return a.dup().mul(b); } + public static Scalar operator/(Scalar a, Scalar b) + { return b.dup().inverse().mul(a); } +} + +private const int P1_COMPRESSED_SZ = 384/8; +private const int P2_COMPRESSED_SZ = 2*P1_COMPRESSED_SZ; + + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p1_affine_sizeof(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_p1_deserialize([Out] long[] ret, [In] byte[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_affine_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_affine_compress([Out] byte[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_to_affine([Out] long[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_in_g1([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_affine_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_p1_generator(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR 
blst_core_verify_pk_in_g2([In] long[] pk, [In] long[] sig, + bool hash_or_encode, + [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); + +public struct P1_Affine { + internal readonly long[] point; + + private static readonly int sz = (int)blst_p1_affine_sizeof()/sizeof(long); + + //public P1_Affine() { point = new long[sz]; } + private P1_Affine(bool _) { point = new long[sz]; } + private P1_Affine(P1_Affine p) { point = (long[])p.point.Clone(); } + + public P1_Affine(byte[] inp) : this(true) + { int len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ + : 2*P1_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p1_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + } + public P1_Affine(P1 jacobian) : this(true) + { blst_p1_to_affine(point, jacobian.point); } + + public P1_Affine dup() { return new P1_Affine(this); } + public P1 to_jacobian() { return new P1(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; + blst_p1_affine_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P1_COMPRESSED_SZ]; + blst_p1_affine_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p1_affine_on_curve(point); } + public bool in_group() { return blst_p1_affine_in_g1(point); } + public bool is_inf() { return blst_p1_affine_is_inf(point); } + public bool is_equal(P1_Affine p) + { return blst_p1_affine_is_equal(point, p.point); } + + ERROR core_verify(P2_Affine pk, bool hash_or_encode, + byte[] msg, string DST = "", byte[] aug = null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + return blst_core_verify_pk_in_g2(pk.point, point, + hash_or_encode, + msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? 
aug.Length : 0)); + } + + public static P1_Affine generator() + { var ret = new P1_Affine(true); + Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p1_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_compress([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_from_affine([Out] long[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_in_g1([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p1_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_sk_to_pk_in_g1([Out] long[] ret, [In] byte[] SK); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_encode_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_hash_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_sign_pk_in_g2([Out] long[] ret, [In] long[] hash, [In] byte[] SK); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_mult([Out] long[] ret, [In] long[] a, + [In] byte[] scalar, size_t nbits); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_cneg([Out] long[] ret, bool cbit); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p1_add_or_double_affine([Out] long[] ret, [In] long[] a, + [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p1_double([Out] long[] ret, [In] long[] a); + +public struct P1 { + internal long[] point; + + private static readonly int sz = (int)blst_p1_sizeof()/sizeof(long); + + //public P1() { point = new long[sz]; } + private P1(bool _) { point = new long[sz]; } + private P1(P1 p) { point = (long[])p.point.Clone(); } + private long[] self() + { if (point==null) { point = new long[sz]; } return point; } + + public P1(SecretKey sk) : this(true) + { blst_sk_to_pk_in_g1(point, sk.key); } + public P1(byte[] inp) : this(true) + { int len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? 
P1_COMPRESSED_SZ + : 2*P1_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p1_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + blst_p1_from_affine(point, point); + } + public P1(P1_Affine affine) : this(true) + { blst_p1_from_affine(point, affine.point); } + + public P1 dup() { return new P1(this); } + public P1_Affine to_affine() { return new P1_Affine(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; + blst_p1_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P1_COMPRESSED_SZ]; + blst_p1_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p1_on_curve(point); } + public bool in_group() { return blst_p1_in_g1(point); } + public bool is_inf() { return blst_p1_is_inf(point); } + public bool is_equal(P1 p) { return blst_p1_is_equal(point, p.point); } + + public P1 hash_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_hash_to_g1(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + return this; + } + public P1 encode_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_encode_to_g1(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + return this; + } + + public P1 sign_with(SecretKey sk) + { blst_sign_pk_in_g2(point, point, sk.key); return this; } + public P1 sign_with(Scalar scalar) + { blst_sign_pk_in_g2(point, point, scalar.val); return this; } + + public void aggregate(P1_Affine inp) + { if (blst_p1_affine_in_g1(inp.point)) + blst_p1_add_or_double_affine(point, point, inp.point); + else + throw new Exception(ERROR.POINT_NOT_IN_GROUP); + } + + public P1 mult(byte[] scalar) + { blst_p1_mult(point, point, scalar, (size_t)(scalar.Length*8)); + return this; + } + public P1 mult(Scalar scalar) + { blst_p1_mult(point, point, scalar.val, (size_t)255); + return this; + } + public P1 mult(BigInteger scalar) + { byte[] val; + if (scalar.Sign < 0) { + val = BigInteger.Negate(scalar).ToByteArray(); + blst_p1_cneg(point, true); + } else { + val = scalar.ToByteArray(); + } + int len = val.Length; + if (val[len-1]==0) len--; + blst_p1_mult(point, point, val, (size_t)(len*8)); + return this; + } + public P1 cneg(bool flag) { blst_p1_cneg(point, flag); return this; } + public P1 neg() { blst_p1_cneg(point, true); return this; } + public P1 add(P1 a) + { blst_p1_add_or_double(point, point, a.point); return this; } + public P1 add(P1_Affine a) + { blst_p1_add_or_double_affine(point, point, a.point); return this; } + public P1 dbl() + { blst_p1_double(point, point); return this; } + + public static P1 generator() + { var ret = new P1(true); + Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +public static P1 G1() { return P1.generator(); } + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_aggregated_in_g1([Out] long[] fp12, [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_aggregate_pk_in_g1([In, Out] long[] fp12, + [In] long[] pk, [In] long[] sig, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g1([In, Out] long[] fp12, + [In] long[] pk, 
[In] long[] sig, + [In] byte[] scalar, size_t nbits, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); + + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p2_affine_sizeof(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_p2_deserialize([Out] long[] ret, [In] byte[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_affine_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_affine_compress([Out] byte[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_to_affine([Out] long[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_affine_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_affine_in_g2([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_affine_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_affine_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_p2_generator(); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_core_verify_pk_in_g1([In] long[] pk, [In] long[] sig, + bool hash_or_encode, + [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); + +public struct P2_Affine { + internal readonly long[] point; + + private static readonly int sz = (int)blst_p2_affine_sizeof()/sizeof(long); + + //public P2_Affine() { point = new long[sz]; } + private P2_Affine(bool _) { point = new long[sz]; } + private P2_Affine(P2_Affine p) { point = (long[])p.point.Clone(); } + + public P2_Affine(byte[] inp) : this(true) + { int len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P2_COMPRESSED_SZ + : 2*P2_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p2_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + } + public P2_Affine(P2 jacobian) : this(true) + { blst_p2_to_affine(point, jacobian.point); } + + public P2_Affine dup() { return new P2_Affine(this); } + public P2 to_jacobian() { return new P2(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P2_COMPRESSED_SZ]; + blst_p2_affine_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P2_COMPRESSED_SZ]; + blst_p2_affine_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p2_affine_on_curve(point); } + public bool in_group() { return blst_p2_affine_in_g2(point); } + public bool is_inf() { return blst_p2_affine_is_inf(point); } + public bool is_equal(P2_Affine p) + { return blst_p2_affine_is_equal(point, p.point); } + + ERROR core_verify(P1_Affine pk, bool hash_or_encode, + byte[] msg, string DST = "", byte[] aug = null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + return blst_core_verify_pk_in_g1(pk.point, point, + hash_or_encode, + msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? 
aug.Length : 0)); + } + + public static P2_Affine generator() + { var ret = new P2_Affine(true); + Marshal.Copy(blst_p2_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_p2_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_serialize([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_compress([Out] byte[] ret, [In] long[] inp); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_from_affine([Out] long[] ret, [In] long[] inp); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_on_curve([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_in_g2([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_is_inf([In] long[] point); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_p2_is_equal([In] long[] a, [In] long[] b); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_sk_to_pk_in_g2([Out] long[] ret, [In] byte[] SK); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_encode_to_g2([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_hash_to_g2([Out] long[] ret, [In] byte[] msg, size_t msg_len, + [In] byte[] dst, size_t dst_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_sign_pk_in_g1([Out] long[] ret, [In] long[] hash, [In] byte[] SK); + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p2_mult([Out] long[] ret, [In] long[] a, + [In] byte[] scalar, size_t nbits); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_cneg([Out] long[] ret, bool cbit); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p2_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_p2_add_or_double_affine([Out] long[] ret, [In] long[] a, + [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_p2_double([Out] long[] ret, [In] long[] a); + +public struct P2 { + internal long[] point; + + private static readonly int sz = (int)blst_p2_sizeof()/sizeof(long); + + //public P2() { point = new long[sz]; } + private P2(bool _) { point = new long[sz]; } + private P2(P2 p) { point = (long[])p.point.Clone(); } + private long[] self() + { if (point==null) { point = new long[sz]; } return point; } + + public P2(SecretKey sk) : this(true) + { blst_sk_to_pk_in_g2(point, sk.key); } + public P2(byte[] inp) : this(true) + { int len = inp.Length; + if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? 
P2_COMPRESSED_SZ + : 2*P2_COMPRESSED_SZ)) + throw new Exception(ERROR.BAD_ENCODING); + ERROR err = blst_p2_deserialize(point, inp); + if (err != ERROR.SUCCESS) + throw new Exception(err); + blst_p2_from_affine(point, point); + } + public P2(P2_Affine affine) : this(true) + { blst_p2_from_affine(point, affine.point); } + + public P2 dup() { return new P2(this); } + public P2_Affine to_affine() { return new P2_Affine(this); } + public byte[] serialize() + { byte[] ret = new byte[2*P2_COMPRESSED_SZ]; + blst_p2_serialize(ret, point); + return ret; + } + public byte[] compress() + { byte[] ret = new byte[P2_COMPRESSED_SZ]; + blst_p2_compress(ret, point); + return ret; + } + + public bool on_curve() { return blst_p2_on_curve(point); } + public bool in_group() { return blst_p2_in_g2(point); } + public bool is_inf() { return blst_p2_is_inf(point); } + public bool is_equal(P2 p) { return blst_p2_is_equal(point, p.point); } + + public P2 hash_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_hash_to_g2(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + return this; + } + public P2 encode_to(byte[] msg, string DST="", byte[] aug=null) + { byte[] dst = Encoding.UTF8.GetBytes(DST); + blst_encode_to_g2(self(), msg, (size_t)msg.Length, + dst, (size_t)dst.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + return this; + } + + public P2 sign_with(SecretKey sk) + { blst_sign_pk_in_g1(point, point, sk.key); return this; } + public P2 sign_with(Scalar scalar) + { blst_sign_pk_in_g1(point, point, scalar.val); return this; } + + public void aggregate(P2_Affine inp) + { if (blst_p2_affine_in_g2(inp.point)) + blst_p2_add_or_double_affine(point, point, inp.point); + else + throw new Exception(ERROR.POINT_NOT_IN_GROUP); + } + + public P2 mult(byte[] scalar) + { blst_p2_mult(point, point, scalar, (size_t)(scalar.Length*8)); + return this; + } + public P2 mult(Scalar scalar) + { blst_p2_mult(point, point, scalar.val, (size_t)255); + return this; + } + public P2 mult(BigInteger scalar) + { byte[] val; + if (scalar.Sign < 0) { + val = BigInteger.Negate(scalar).ToByteArray(); + blst_p2_cneg(point, true); + } else { + val = scalar.ToByteArray(); + } + int len = val.Length; + if (val[len-1]==0) len--; + blst_p2_mult(point, point, val, (size_t)(len*8)); + return this; + } + public P2 cneg(bool flag) { blst_p2_cneg(point, flag); return this; } + public P2 neg() { blst_p2_cneg(point, true); return this; } + public P2 add(P2 a) + { blst_p2_add_or_double(point, point, a.point); return this; } + public P2 add(P2_Affine a) + { blst_p2_add_or_double_affine(point, point, a.point); return this; } + public P2 dbl() + { blst_p2_double(point, point); return this; } + + public static P2 generator() + { var ret = new P2(true); + Marshal.Copy(blst_p2_generator(), ret.point, 0, ret.point.Length); + return ret; + } +} + +public static P2 G2() { return P2.generator(); } + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_aggregated_in_g2([Out] long[] fp12, [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_aggregate_pk_in_g2([In, Out] long[] fp12, + [In] long[] pk, [In] long[] sig, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g2([In, Out] long[] fp12, + [In] long[] pk, 
[In] long[] sig, + [In] byte[] scalar, size_t nbits, + [In] byte[] msg, size_t msg_len, + [In] byte[] aug, size_t aug_len); + + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_fp12_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_miller_loop([Out] long[] fp12, [In] long[] q, + [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_is_one([In] long[] fp12); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_is_equal([In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_fp12_sqr([Out] long[] ret, [In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_fp12_mul([Out] long[] ret, [In] long[] a, + [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_final_exp([Out] long[] ret, [In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_finalverify([In] long[] a, [In] long[] b); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_fp12_one(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_fp12_in_group([In] long[] a); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_bendian_from_fp12([Out] byte[] ret, [In] long[] a); + +public struct PT { + internal readonly long[] fp12; + + private static readonly int sz = (int)blst_fp12_sizeof()/sizeof(long); + + internal PT(bool _) { fp12 = new long[sz]; } + private PT(PT orig) { fp12 = (long[])orig.fp12.Clone(); } + + public PT(P1_Affine p) : this(true) + { blst_aggregated_in_g1(fp12, p.point); } + public PT(P1 p) : this(true) + { blst_aggregated_in_g1(fp12, (new P1_Affine(p)).point); } + public PT(P2_Affine q) : this(true) + { blst_aggregated_in_g2(fp12, q.point); } + public PT(P2 q) : this(true) + { blst_aggregated_in_g2(fp12, (new P2_Affine(q)).point); } + public PT(P2_Affine q, P1_Affine p) : this(true) + { blst_miller_loop(fp12, q.point, p.point); } + public PT(P1_Affine p, P2_Affine q) : this(q, p) {} + public PT(P2 q, P1 p) : this(true) + { blst_miller_loop(fp12, (new P2_Affine(q)).point, + (new P1_Affine(p)).point); + } + public PT(P1 p, P2 q) : this(q, p) {} + + public PT dup() { return new PT(this); } + public bool is_one() { return blst_fp12_is_one(fp12); } + public bool is_equal(PT p) + { return blst_fp12_is_equal(fp12, p.fp12); } + public PT sqr() { blst_fp12_sqr(fp12, fp12); return this; } + public PT mul(PT p) { blst_fp12_mul(fp12, fp12, p.fp12); return this; } + public PT final_exp() { blst_final_exp(fp12, fp12); return this; } + public bool in_group() { return blst_fp12_in_group(fp12); } + public byte[] to_bendian() + { byte[] ret = new byte[12*P1_COMPRESSED_SZ]; + blst_bendian_from_fp12(ret, fp12); + return ret; + } + + public static bool finalverify(PT gt1, PT gt2) + { return blst_fp12_finalverify(gt1.fp12, gt2.fp12); } + + public static PT one() + { var ret = new PT(true); + Marshal.Copy(blst_fp12_one(), ret.fp12, 0, ret.fp12.Length); + return ret; + } +} + +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern size_t blst_pairing_sizeof(); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern 
+void blst_pairing_init([In, Out] long[] ctx, bool hash_or_encode, + [In] ref long dst, size_t dst_len); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern void blst_pairing_commit([In, Out] long[] ctx); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern ERROR blst_pairing_merge([In, Out] long[] ctx, [In] long[] ctx1); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern bool blst_pairing_finalverify([In] long[] ctx, [In] long[] sig); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern +void blst_pairing_raw_aggregate([In, Out] long[] ctx, [In] long[] q, + [In] long[] p); +[DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] +static extern IntPtr blst_pairing_as_fp12([In] long[] ctx); + +public struct Pairing { + private readonly long[] ctx; + + private static readonly int sz = (int)blst_pairing_sizeof()/sizeof(long); + + public Pairing(bool hash_or_encode=false, string DST="") + { + byte[] dst = Encoding.UTF8.GetBytes(DST); + int dst_len = dst.Length; + int add_len = dst_len!=0 ? (dst_len+sizeof(long)-1)/sizeof(long) : 1; + Array.Resize(ref dst, add_len*sizeof(long)); + + ctx = new long[sz+add_len]; + + for (int i=0; i sig, + byte[] msg, byte[] aug=null) + { return blst_pairing_aggregate_pk_in_g1(ctx, pk.point, + sig.HasValue ? sig.Value.point : null, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR aggregate(P2_Affine pk, Nullable sig, + byte[] msg, byte[] aug=null) + { return blst_pairing_aggregate_pk_in_g2(ctx, pk.point, + sig.HasValue ? sig.Value.point : null, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR mul_n_aggregate(P2_Affine pk, P1_Affine sig, + byte[] scalar, int nbits, + byte[] msg, byte[] aug=null) + { return blst_pairing_mul_n_aggregate_pk_in_g2(ctx, pk.point, sig.point, + scalar, (size_t)nbits, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? aug.Length : 0)); + } + public ERROR mul_n_aggregate(P1_Affine pk, P2_Affine sig, + byte[] scalar, int nbits, + byte[] msg, byte[] aug=null) + { return blst_pairing_mul_n_aggregate_pk_in_g1(ctx, pk.point, sig.point, + scalar, (size_t)nbits, + msg, (size_t)msg.Length, + aug, (size_t)(aug!=null ? 
aug.Length : 0)); + } + + public void commit() { blst_pairing_commit(ctx); } + public void merge(Pairing a) + { var err = blst_pairing_merge(ctx, a.ctx); + if (err != ERROR.SUCCESS) + throw new Exception(err); + } + public bool finalverify(PT sig=new PT()) + { return blst_pairing_finalverify(ctx, sig.fp12); } + + public void raw_aggregate(P2_Affine q, P1_Affine p) + { blst_pairing_raw_aggregate(ctx, q.point, p.point); } + public void raw_aggregate(P1_Affine p, P2_Affine q) + { raw_aggregate(q, p); } + public void raw_aggregate(P2 q, P1 p) + { blst_pairing_raw_aggregate(ctx, (new P2_Affine(q)).point, + (new P1_Affine(p)).point); + } + public void raw_aggregate(P1 p, P2 q) + { raw_aggregate(q, p); } + public PT as_fp12() + { var ret = new PT(true); + GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); + Marshal.Copy(blst_pairing_as_fp12(ctx), ret.fp12, 0, ret.fp12.Length); + h.Free(); + return ret; + } +} +}} diff --git a/src/blst/bindings/go/README.md b/src/blst/bindings/go/README.md new file mode 100644 index 0000000000..961853ed69 --- /dev/null +++ b/src/blst/bindings/go/README.md @@ -0,0 +1,80 @@ +# blst [![Lint Status](https://github.com/supranational/blst/workflows/golang-lint/badge.svg)](https://github.com/supranational/blst/actions/workflows/golang-lint.yml) + +The `blst` package provides a Go interface to the blst BLS12-381 signature library. + +## Build +The build process consists of two steps: code generation followed by compilation. + +``` +./generate.py # Optional - only required if making code changes +go build +go test +``` + +The generate.py script is used to generate both min-pk and min-sig variants of the binding from a common code base. It consumes the `*.tgo` files along with `blst_minpk_test.go` and produces `blst.go` and `blst_minsig_test.go`. The .tgo files can be treated as if they were .go files, including the use of gofmt and goimports. The generate script will filter out extra imports while processing and automatically run goimports on the final blst.go file. + +After running generate.py, `go build` and `go test` can be run as usual. Cgo will compile `cgo_server.c`, which includes the required C implementation files, and `cgo_assembly.S`, which includes appropriate pre-generated assembly code for the platform. To compile on Windows, one has to have MinGW gcc on the `%PATH%`. + +If the test or target application crashes with an "illegal instruction" exception [after copying to an older system], rebuild with the `CGO_CFLAGS` environment variable set to `-O2 -D__BLST_PORTABLE__`. Don't forget `-O2`! + +## Usage +There are two primary modes of operation that can be chosen based on type definitions in the application.
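+In the min-pk variant, public keys are points in G1 (48 bytes compressed) and signatures are points in G2 (96 bytes compressed); the min-sig variant swaps the two groups, trading larger public keys for smaller signatures.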
+ +For minimal-pubkey-size operations: +``` +type PublicKey = blst.P1Affine +type Signature = blst.P2Affine +type AggregateSignature = blst.P2Aggregate +type AggregatePublicKey = blst.P1Aggregate +``` + +For minimal-signature-size operations: +``` +type PublicKey = blst.P2Affine +type Signature = blst.P1Affine +type AggregateSignature = blst.P1Aggregate +type AggregatePublicKey = blst.P2Aggregate +``` + +TODO - structures and possibly methods + +A complete example for generating a key, signing a message, and verifying the message: +``` +package main + +import ( + "crypto/rand" + "fmt" + + blst "github.com/supranational/blst/bindings/go" +) + +type PublicKey = blst.P1Affine +type Signature = blst.P2Affine +type AggregateSignature = blst.P2Aggregate +type AggregatePublicKey = blst.P1Aggregate + +func main() { + var ikm [32]byte + _, _ = rand.Read(ikm[:]) + sk := blst.KeyGen(ikm[:]) + pk := new(PublicKey).From(sk) + + var dst = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_") + msg := []byte("hello foo") + sig := new(Signature).Sign(sk, msg, dst) + + if !sig.Verify(true, pk, true, msg, dst) { + fmt.Println("ERROR: Invalid!") + } else { + fmt.Println("Valid!") + } +} +``` + +See the tests for further examples of usage. + +If you're cross-compiling, you have to set `CC` environment variable to the target C cross-compiler and `CGO_ENABLED` to 1. For example, to compile the test program for ARM: +``` +env GOARCH=arm CC=arm-linux-gnueabi-gcc CGO_ENABLED=1 go test -c +``` diff --git a/src/blst/bindings/go/blst.go b/src/blst/bindings/go/blst.go new file mode 100644 index 0000000000..77f2241934 --- /dev/null +++ b/src/blst/bindings/go/blst.go @@ -0,0 +1,3580 @@ +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT MODIFY THIS FILE!! +// The file is generated from *.tgo by generate.py +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR}/.. 
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// // no-asm 64-bit platforms from https://go.dev/doc/install/source +// #cgo loong64 mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// +// #include "blst.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// +// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, +// const byte *DST, size_t DST_len) +// { if (DST != NULL) { +// byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); +// for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; +// DST = dst; +// } +// blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); +// } +// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) +// { *pt = *blst_pairing_as_fp12(ctx); } +// +// static void go_p1slice_to_affine(blst_p1_affine dst[], +// const blst_p1 points[], size_t npoints) +// { const blst_p1 *ppoints[2] = { points, NULL }; +// blst_p1s_to_affine(dst, ppoints, npoints); +// } +// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], +// size_t npoints) +// { const blst_p1_affine *ppoints[2] = { points, NULL }; +// blst_p1s_add(dst, ppoints, npoints); +// } +// static void go_p2slice_to_affine(blst_p2_affine dst[], +// const blst_p2 points[], size_t npoints) +// { const blst_p2 *ppoints[2] = { points, NULL }; +// blst_p2s_to_affine(dst, ppoints, npoints); +// } +// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], +// size_t npoints) +// { const blst_p2_affine *ppoints[2] = { points, NULL }; +// blst_p2s_add(dst, ppoints, npoints); +// } +// +// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p1 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p1_generator(); +// else if (affine) +// blst_p1_from_affine(m, p), p = m; +// blst_p1_mult(m, p, scalar, nbits); +// blst_p1_add_or_double(acc, acc, m); +// } +// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p2 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p2_generator(); +// else if (affine) +// blst_p2_from_affine(m, p), p = m; +// blst_p2_mult(m, p, scalar, nbits); +// blst_p2_add_or_double(acc, acc, m); +// } +// +// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) +// { blst_p1 minus_b; +// if (affine) +// blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); +// else +// minus_b = *(const blst_p1*)x; +// blst_p1_cneg(&minus_b, 1); +// blst_p1_add_or_double(a, a, &minus_b); +// } +// +// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) +// { blst_p2 minus_b; +// if (affine) +// blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); +// else +// minus_b = *(const blst_p2*)x; +// blst_p2_cneg(&minus_b, 1); +// blst_p2_add_or_double(a, a, &minus_b); +// } +// +// static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) +// { 
blst_scalar_from_bendian(ret, in); +// return blst_sk_check(ret); +// } +// static bool go_hash_to_scalar(blst_scalar *ret, +// const byte *msg, size_t msg_len, +// const byte *DST, size_t DST_len) +// { byte elem[48]; +// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); +// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); +// } +// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], +// const blst_p1_affine P[], +// size_t npoints, bool acc) +// { const blst_p2_affine *Qs[2] = { Q, NULL }; +// const blst_p1_affine *Ps[2] = { P, NULL }; +// if (acc) { +// blst_fp12 tmp; +// blst_miller_loop_n(&tmp, Qs, Ps, npoints); +// blst_fp12_mul(dst, dst, &tmp); +// } else { +// blst_miller_loop_n(dst, Qs, Ps, npoints); +// } +// } +// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) +// { size_t i; +// blst_fp12_mul(dst, &in[0], &in[1]); +// for (i = 2; i < n; i++) +// blst_fp12_mul(dst, dst, &in[i]); +// } +// static bool go_p1_affine_validate(const blst_p1_affine *p, bool infcheck) +// { if (infcheck && blst_p1_affine_is_inf(p)) +// return 0; +// return blst_p1_affine_in_g1(p); +// } +// static bool go_p2_affine_validate(const blst_p2_affine *p, bool infcheck) +// { if (infcheck && blst_p2_affine_is_inf(p)) +// return 0; +// return blst_p2_affine_in_g2(p); +// } +import "C" + +import ( + "fmt" + "math/bits" + "runtime" + "sync" + "sync/atomic" +) + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_FP_BYTES = 384 / 8 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []C.blst_pairing +type SecretKey = Scalar +type P1s []P1 +type P2s []P2 +type P1Affines []P1Affine +type P2Affines []P2Affine + +// +// Configuration +// + +var maxProcs = initMaxProcs() + +func initMaxProcs() int { + maxProcs := runtime.GOMAXPROCS(0) + var version float32 + _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) + if err != nil || version < 1.14 { + // be cooperative and leave one processor for the application + maxProcs -= 1 + } + if maxProcs <= 0 { + maxProcs = 1 + } + return maxProcs +} + +func SetMaxProcs(max int) { + if max <= 0 { + max = 1 + } + maxProcs = max +} + +func numThreads(maxThreads int) int { + numThreads := maxProcs + + // take into consideration the possility that application reduced + // GOMAXPROCS after |maxProcs| was initialized + numProcs := runtime.GOMAXPROCS(0) + if maxProcs > numProcs { + numThreads = numProcs + } + + if maxThreads > 0 && numThreads > maxThreads { + return maxThreads + } + return numThreads +} + +var cgo_pairingSizeOf = C.blst_pairing_sizeof() +var cgo_p1Generator = *C.blst_p1_generator() +var cgo_p2Generator = *C.blst_p2_generator() +var cgo_fp12One = *C.blst_fp12_one() + +// Secret key +func (sk *SecretKey) Zeroize() { + var zero SecretKey + *sk = zero +} + +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage 
collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func DeriveMasterEip2333(ikm []byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { + var sk SecretKey + C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +// Pairing +func pairingSizeOf(DST_len C.size_t) int { + return int((cgo_pairingSizeOf + DST_len + 7) / 8) +} + +func PairingCtx(hash_or_encode bool, DST []byte) Pairing { + DST_len := C.size_t(len(DST)) + ctx := make([]C.blst_pairing, pairingSizeOf(DST_len)) + C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), ptrOrNil(DST), DST_len) + return ctx +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit(&ctx[0]) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) +} + +func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { + C.blst_pairing_raw_aggregate(&ctx[0], q, p) +} + +func PairingAsFp12(ctx Pairing) *Fp12 { + var pt Fp12 + C.go_pairing_as_fp12(&pt, &ctx[0]) + return &pt +} + +func Fp12One() Fp12 { + return cgo_fp12One +} + +func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { + return bool(C.blst_fp12_finalverify(pt1, pt2)) +} + +func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { + var pt Fp12 + C.blst_miller_loop(&pt, q, p) + return &pt +} + +func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { + if len(qs) != len(ps) || len(qs) == 0 { + panic("inputs' lengths mismatch") + } + + nElems := uint32(len(qs)) + nThreads := uint32(maxProcs) + + if nThreads == 1 || nElems == 1 { + var pt Fp12 + C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) + return &pt + } + + stride := (nElems + nThreads - 1) / nThreads + if stride > 16 { + stride = 16 + } + + strides := (nElems + stride - 1) / stride + if nThreads > strides { + nThreads = strides + } + + msgsCh := make(chan Fp12, nThreads) + curElem := uint32(0) + + for tid := uint32(0); tid < nThreads; tid++ { + go func() { + acc := Fp12One() + first := true + for { + work := atomic.AddUint32(&curElem, stride) - stride + if work >= nElems { + break + } + n := nElems - work + if n > stride { + n = stride + } + C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), + C.bool(!first)) + first = false + } + msgsCh <- acc + }() + } + + var ret = make([]Fp12, nThreads) + for i := range ret { + ret[i] = <-msgsCh + } + + var pt Fp12 + C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) + return &pt +} + +func (pt *Fp12) MulAssign(p *Fp12) { + C.blst_fp12_mul(pt, pt, p) +} + +func (pt *Fp12) FinalExp() { + C.blst_final_exp(pt, pt) +} + +func (pt *Fp12) InGroup() bool { + return bool(C.blst_fp12_in_group(pt)) +} + +func (pt *Fp12) ToBendian() []byte { + var out [BLST_FP_BYTES * 12]byte + C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) + return out[:] +} + +func (pt1 *Fp12) Equals(pt2 *Fp12) bool { + return *pt1 == *pt2 +} + +func ptrOrNil(bytes []byte) *C.byte { + var ptr *C.byte + if len(bytes) > 0 { + ptr = (*C.byte)(&bytes[0]) + } + return ptr +} + +// +// MIN-PK +// + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +func (pk *P1Affine) KeyValidate() bool { + return bool(C.go_p1_affine_validate(pk, true)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. 
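+// For illustration only (an assumed caller-side sketch, not part of the
+// generated bindings): an application that imports this package as `blst` and
+// has just received a compressed signature in `raw` might reject bad input
+// early with
+//
+//     sig := new(blst.P2Affine)
+//     if sig.Uncompress(raw) == nil || !sig.SigValidate(true) {
+//         // malformed encoding, point at infinity, or not in the G2 subgroup
+//     }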
+func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { + return bool(C.go_p2_affine_validate(sig, C.bool(sigInfcheck))) +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, + pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } + return pks[i], nil + } + + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (_ *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, + pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. 
+ mutex.Lock() + mutex.Unlock() //nolint:staticcheck + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g2(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P2Affine { + return sig + } + pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (_ *P2Affine) MultipleAggregateVerify(sigs []*P2Affine, + sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, _ *P2Affine, _ *P1Affine, rand *Scalar) ( + *P2Affine, *P1Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, + rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P1Affine + var tempSig P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AggregateWithRandomness(pointsIf interface{}, + scalarsIf interface{}, nbits int, groupcheck bool) bool { + if groupcheck && !P2AffinesValidate(pointsIf) { + return false + } + agg.v = P2AffinesMult(pointsIf, scalarsIf, nbits) + return true +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { + if 
other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return new(P2Affine) + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) coreAggregate(getter aggGetterP2, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// MIN-SIG +// + +// +// PublicKey +// + +func (pk *P2Affine) From(s *Scalar) *P2Affine { + C.blst_sk_to_pk2_in_g2(nil, pk, s) + return pk +} + +func (pk *P2Affine) KeyValidate() bool { + return bool(C.go_p2_affine_validate(pk, true)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { + return bool(C.go_p1_affine_validate(sig, C.bool(sigInfcheck))) +} + +// +// Sign +// + +func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P1Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P1 + if useHash { + q = HashToG1(msg, dst, augSingle) + } else { + q = EncodeToG1(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g2(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. 
+// This enables point decompression (if needed) to happen in parallel. +type sigGetterP1 func() *P1Affine +type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) + +// Single verify with decompressed pk +func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P1Affine) AggregateVerify(sigGroupcheck bool, + pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P1Affine { + return sig + } + + pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } + return pks[i], nil + } + + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (_ *P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P1Affine { + sigP := new(P1Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, + pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = 
optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. + mutex.Lock() + mutex.Unlock() //nolint:staticcheck + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG2(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g1(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P1Affine { + return sig + } + pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + return int(C.blst_core_verify_pk_in_g2(pk, sig, C.bool(hash_or_encode), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P2Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? 
+ if n == 0 { + return false + } + + aggregator := new(P2Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (_ *P1Affine) MultipleAggregateVerify(sigs []*P1Affine, + sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, _ *P1Affine, _ *P2Affine, rand *Scalar) ( + *P1Affine, *P2Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, + rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. 
+ msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P2Affine + var tempSig P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P1 +// + +type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine +type P1Aggregate struct { + v *P1 +} + +// Aggregate uncompressed elements +func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P1Aggregate) AggregateWithRandomness(pointsIf interface{}, + scalarsIf interface{}, nbits int, groupcheck bool) bool { + if groupcheck && !P1AffinesValidate(pointsIf) { + return false + } + agg.v = P1AffinesMult(pointsIf, scalarsIf, nbits) + return true +} + +// Aggregate compressed elements +func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P1Affine) *P1Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p1_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p1_affine_in_g1(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P1) + C.blst_p1_from_affine(agg.v, elmt) + } else { + C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P1Aggregate) ToAffine() *P1Affine { + if agg.v == nil { + return new(P1Affine) + } + return agg.v.ToAffine() +} + +func (agg *P1Aggregate) coreAggregate(getter aggGetterP1, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... 
+ numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P1 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P1 + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p1_affine_in_g1(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p1_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p1_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. +// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} + +func (p1 *P1Affine) InG1() bool { + return bool(C.blst_p1_affine_in_g1(p1)) +} + +func (_ *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { + // Allocate space for all of the resulting points. 
Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P1Affine, n) + pointsPtrs := make([]*P1Affine, n) + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) + return p1 +} + +func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { + ret := *p1 + return ret.MultAssign(scalarIf, optional...) 
+} + +func (p1 *P1) AddAssign(pointIf interface{}) *P1 { + switch val := pointIf.(type) { + case *P1: + C.blst_p1_add_or_double(p1, p1, val) + case *P1Affine: + C.blst_p1_add_or_double_affine(p1, p1, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p1 +} + +func (p1 *P1) Add(pointIf interface{}) *P1 { + ret := *p1 + return ret.AddAssign(pointIf) +} + +func (p1 *P1) SubAssign(pointIf interface{}) *P1 { + var x *Fp + var affine C.bool + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p1_sub_assign(p1, x, affine) + return p1 +} + +func (p1 *P1) Sub(pointIf interface{}) *P1 { + ret := *p1 + return ret.SubAssign(pointIf) +} + +func P1Generator() *P1 { + return &cgo_p1Generator +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P1 { + var x *Fp + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +func (p *P1) FromAffine(pa *P1Affine) { + C.blst_p1_from_affine(p, pa) +} + +// Hash +func HashToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_hash_to_g1(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_encode_to_g1(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P1sToAffine(points []*P1, optional ...int) P1Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P1Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P1s) ToAffine(optional ...P1Affines) P1Affines { + npoints := len(points) + var ret P1Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P1Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 
1 + go func(out *P1Affine, inp *P1, delta int) { + C.go_p1slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P1Affines) Add() *P1 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P1 + C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P1, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P1Affine, delta int) { + var ret P1 + C.go_p1slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p1_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P1s) Add() *P1 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := numThreads(0) + + if numThreads < 2 { + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P1Affine{nil, nil} + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[0] + case []P1Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[0].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + if npoints < 32 
{ + if numThreads > npoints { + numThreads = npoints + } + + curItem := uint32(0) + msgs := make(chan P1, numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + var acc P1 + + for { + workItem := int(atomic.AddUint32(&curItem, 1) - 1) + if workItem >= npoints { + break + } + + var point *P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[workItem] + case []P1Affine: + point = &val[workItem] + case P1Affines: + point = &val[workItem] + } + + var scalar *C.byte + switch val := scalarsIf.(type) { + case []byte: + scalar = (*C.byte)(&val[workItem*nbytes]) + case [][]byte: + scalar = scalars[workItem] + case []Scalar: + if nbits > 248 { + scalar = &val[workItem].b[0] + } else { + scalar = scalars[workItem] + } + case []*Scalar: + scalar = scalars[workItem] + } + + C.go_p1_mult_n_acc(&acc, &point.x, true, + scalar, C.size_t(nbits)) + } + + msgs <- acc + }() + } + + ret := <-msgs + for tid := 1; tid < numThreads; tid++ { + point := <-msgs + C.blst_p1_add_or_double(&ret, &ret, &point) + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P1 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[x] + case []P1Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[x].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p1s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P1 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + 
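+ // fold each completed tile of the current "row" into |ret|; once the row
+ // is exhausted, the outer loop doubles |ret| |window| times to shift it
+ // into position for the next, less significant row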
C.blst_p1_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p1_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { + return P1AffinesMult(points, scalarsIf, nbits) +} + +func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +// +// Group-check +// + +func P1AffinesValidate(pointsIf interface{}) bool { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := numThreads(npoints) + + if numThreads < 2 { + for i := 0; i < npoints; i++ { + var point *P1Affine + + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[i] + case []P1Affine: + point = &val[i] + case P1Affines: + point = &val[i] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p1_affine_validate(point, true) { + return false + } + } + + return true + } + + valid := int32(1) + curItem := uint32(0) + + var wg sync.WaitGroup + wg.Add(numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) != 0 { + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(npoints) { + break + } + + var point *P1Affine + + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[work] + case []P1Affine: + point = &val[work] + case P1Affines: + point = &val[work] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p1_affine_validate(point, true) { + atomic.StoreInt32(&valid, 0) + break + } + } + + wg.Done() + }() + } + + wg.Wait() + + return atomic.LoadInt32(&valid) != 0 +} + +func (points P1Affines) Validate() bool { + return P1AffinesValidate(points) +} +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. 
+// + +// P2 Serdes +func (p2 *P2Affine) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { + if len(in) != BLST_P2_SERIALIZE_BYTES { + return nil + } + if C.blst_p2_deserialize(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} +func (p2 *P2Affine) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { + if len(in) != BLST_P2_COMPRESS_BYTES { + return nil + } + if C.blst_p2_uncompress(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} + +func (p2 *P2Affine) InG2() bool { + return bool(C.blst_p2_affine_in_g2(p2)) +} + +func (_ *P2Affine) BatchUncompress(in [][]byte) []*P2Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P2Affine, n) + pointsPtrs := make([]*P2Affine, n) + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p2 *P2) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_serialize((*C.byte)(&out[0]), p2) + return out[:] +} +func (p2 *P2) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p2_mult(p2, p2, scalar, C.size_t(nbits)) + return p2 +} + +func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { + ret := *p2 + return ret.MultAssign(scalarIf, optional...) 
+} + +func (p2 *P2) AddAssign(pointIf interface{}) *P2 { + switch val := pointIf.(type) { + case *P2: + C.blst_p2_add_or_double(p2, p2, val) + case *P2Affine: + C.blst_p2_add_or_double_affine(p2, p2, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p2 +} + +func (p2 *P2) Add(pointIf interface{}) *P2 { + ret := *p2 + return ret.AddAssign(pointIf) +} + +func (p2 *P2) SubAssign(pointIf interface{}) *P2 { + var x *Fp2 + var affine C.bool + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p2_sub_assign(p2, x, affine) + return p2 +} + +func (p2 *P2) Sub(pointIf interface{}) *P2 { + ret := *p2 + return ret.SubAssign(pointIf) +} + +func P2Generator() *P2 { + return &cgo_p2Generator +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P2 { + var x *Fp2 + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p2_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P2) ToAffine() *P2Affine { + var pa P2Affine + C.blst_p2_to_affine(&pa, p) + return &pa +} + +func (p *P2) FromAffine(pa *P2Affine) { + C.blst_p2_from_affine(p, pa) +} + +// Hash +func HashToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_hash_to_g2(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +func EncodeToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_encode_to_g2(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P2sToAffine(points []*P2, optional ...int) P2Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P2Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P2s) ToAffine(optional ...P2Affines) P2Affines { + npoints := len(points) + var ret P2Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P2Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p2slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem 
-= 1 + go func(out *P2Affine, inp *P2, delta int) { + C.go_p2slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P2Affines) Add() *P2 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P2 + C.go_p2slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P2, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P2Affine, delta int) { + var ret P2 + C.go_p2slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p2_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P2s) Add() *P2 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { + var npoints int + switch val := pointsIf.(type) { + case []*P2Affine: + npoints = len(val) + case []P2Affine: + npoints = len(val) + case P2Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := numThreads(0) + + if numThreads < 2 { + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P2Affine{nil, nil} + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[0] + case []P2Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[0].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + if npoints < 
32 { + if numThreads > npoints { + numThreads = npoints + } + + curItem := uint32(0) + msgs := make(chan P2, numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + var acc P2 + + for { + workItem := int(atomic.AddUint32(&curItem, 1) - 1) + if workItem >= npoints { + break + } + + var point *P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + point = val[workItem] + case []P2Affine: + point = &val[workItem] + case P2Affines: + point = &val[workItem] + } + + var scalar *C.byte + switch val := scalarsIf.(type) { + case []byte: + scalar = (*C.byte)(&val[workItem*nbytes]) + case [][]byte: + scalar = scalars[workItem] + case []Scalar: + if nbits > 248 { + scalar = &val[workItem].b[0] + } else { + scalar = scalars[workItem] + } + case []*Scalar: + scalar = scalars[workItem] + } + + C.go_p2_mult_n_acc(&acc, &point.x, true, + scalar, C.size_t(nbits)) + } + + msgs <- acc + }() + } + + ret := <-msgs + for tid := 1; tid < numThreads; tid++ { + point := <-msgs + C.blst_p2_add_or_double(&ret, &ret, &point) + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P2 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[x] + case []P2Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[x].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p2s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P2 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + 
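+ // same row-by-row accumulation as in the G1 case: add the finished tiles,
+ // then let the outer loop double |ret| |window| times before descending
+ // to the next row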
C.blst_p2_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p2_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P2Affines) Mult(scalarsIf interface{}, nbits int) *P2 { + return P2AffinesMult(points, scalarsIf, nbits) +} + +func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +// +// Group-check +// + +func P2AffinesValidate(pointsIf interface{}) bool { + var npoints int + switch val := pointsIf.(type) { + case []*P2Affine: + npoints = len(val) + case []P2Affine: + npoints = len(val) + case P2Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := numThreads(npoints) + + if numThreads < 2 { + for i := 0; i < npoints; i++ { + var point *P2Affine + + switch val := pointsIf.(type) { + case []*P2Affine: + point = val[i] + case []P2Affine: + point = &val[i] + case P2Affines: + point = &val[i] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p2_affine_validate(point, true) { + return false + } + } + + return true + } + + valid := int32(1) + curItem := uint32(0) + + var wg sync.WaitGroup + wg.Add(numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) != 0 { + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(npoints) { + break + } + + var point *P2Affine + + switch val := pointsIf.(type) { + case []*P2Affine: + point = val[work] + case []P2Affine: + point = &val[work] + case P2Affines: + point = &val[work] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p2_affine_validate(point, true) { + atomic.StoreInt32(&valid, 0) + break + } + } + + wg.Done() + }() + } + + wg.Wait() + + return atomic.LoadInt32(&valid) != 0 +} + +func (points P2Affines) Validate() bool { + return P2AffinesValidate(points) +} + +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +// These methods are inefficient because of cgo call overhead. For this +// reason they should be used primarily for prototyping with a goal to +// formulate interfaces that would process multiple scalars per cgo call. 
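+//
+// A minimal usage sketch (illustrative only; aBytes and bBytes are
+// hypothetical 32-byte big-endian scalar encodings):
+//
+//   var a, b Scalar
+//   if a.Deserialize(aBytes) == nil || b.Deserialize(bBytes) == nil {
+//       // not canonical scalar encodings
+//   }
+//   sum, ok1 := a.Add(&b)  // ok1 is false if libblst's validity check fails
+//   prod, ok2 := a.Mul(&b) // every operation is a separate cgo call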
+func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_mul_n_check(a, a, b)) +} + +func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_mul_n_check(&ret, a, b)) +} + +func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_add_n_check(a, a, b)) +} + +func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_add_n_check(&ret, a, b)) +} + +func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_sub_n_check(a, a, b)) +} + +func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_sub_n_check(&ret, a, b)) +} + +func (a *Scalar) Inverse() *Scalar { + var ret Scalar + C.blst_sk_inverse(&ret, a) + return &ret +} + +// +// Serialization/Deserialization. +// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES || + !C.go_scalar_from_bendian(s, (*C.byte)(&in[0])) { + return nil + } + return s +} + +func (s *Scalar) Valid() bool { + return bool(C.blst_sk_check(s)) +} + +func (s *Scalar) HashTo(msg []byte, dst []byte) bool { + ret := HashToScalar(msg, dst) + if ret != nil { + *s = *ret + return true + } + return false +} + +func HashToScalar(msg []byte, dst []byte) *Scalar { + var ret Scalar + + if C.go_hash_to_scalar(&ret, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst))) { + return &ret + } + + return nil +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromLEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_le_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromLEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_lendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromBEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_be_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromBEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_bendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr, name) +} + +func (p *P1Affine) Print(name string) { + fmt.Printf("%s:\n", name) + arr := p.x.ToBEndian() + PrintBytes(arr, " x") + arr = p.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", 
name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2Affine) Print(name string) { + fmt.Printf("%s:\n", name) + p.x.Print(" x") + p.y.Print(" y") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +// +// Equality +// + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + return *s1 == *s2 +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + return *e1 == *e2 +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return *e1 == *e2 +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P1) Equals(e2 *P1) bool { + return bool(C.blst_p1_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} + +func (e1 *P2) Equals(e2 *P2) bool { + return bool(C.blst_p2_is_equal(e1, e2)) +} + +// private thunk for testing + +func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { + ret := make([]byte, len_in_bytes) + + C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst))) + return ret +} + +func breakdown(nbits, window, ncpus int) (nx int, ny int, wnd int) { + + if nbits > window*ncpus { //nolint:nestif + nx = 1 + wnd = bits.Len(uint(ncpus) / 4) + if (window + wnd) > 18 { + wnd = window - wnd + } else { + wnd = (nbits/window + ncpus - 1) / ncpus + if (nbits/(window+1)+ncpus-1)/ncpus < wnd { + wnd = window + 1 + } else { + wnd = window + } + } + } else { + nx = 2 + wnd = window - 2 + for (nbits/wnd+1)*nx < ncpus { + nx += 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + nx -= 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + ny = nbits/wnd + 1 + wnd = nbits/ny + 1 + + return nx, ny, wnd +} + +func pippenger_window_size(npoints int) int { + wbits := bits.Len(uint(npoints)) + + if wbits > 13 { + return wbits - 4 + } + if wbits > 5 { + return wbits - 3 + } + return 2 +} diff --git a/src/blst/bindings/go/blst.tgo b/src/blst/bindings/go/blst.tgo new file mode 100644 index 0000000000..947089c34f --- /dev/null +++ b/src/blst/bindings/go/blst.tgo @@ -0,0 +1,466 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR}/.. 
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset
+// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx
+// // no-asm 64-bit platforms from https://go.dev/doc/install/source
+// #cgo loong64 mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__
+//
+// #include "blst.h"
+//
+// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__))
+// # include <signal.h>
+// # include <unistd.h>
+// static void handler(int signum)
+// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, "
+//                        "consult /bindings/go/README.md.\n", 70);
+//   _exit(128+SIGILL);
+//   (void)n;
+// }
+// __attribute__((constructor)) static void blst_cgo_init()
+// { blst_fp temp = { 0 };
+//   struct sigaction act = { handler }, oact;
+//   sigaction(SIGILL, &act, &oact);
+//   blst_fp_sqr(&temp, &temp);
+//   sigaction(SIGILL, &oact, NULL);
+// }
+// #endif
+//
+// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode,
+//                             const byte *DST, size_t DST_len)
+// { if (DST != NULL) {
+//       byte *dst = (byte*)new_ctx + blst_pairing_sizeof();
+//       for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i];
+//       DST = dst;
+//   }
+//   blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len);
+// }
+// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx)
+// { *pt = *blst_pairing_as_fp12(ctx); }
+//
+// static void go_p1slice_to_affine(blst_p1_affine dst[],
+//                                  const blst_p1 points[], size_t npoints)
+// { const blst_p1 *ppoints[2] = { points, NULL };
+//   blst_p1s_to_affine(dst, ppoints, npoints);
+// }
+// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[],
+//                            size_t npoints)
+// { const blst_p1_affine *ppoints[2] = { points, NULL };
+//   blst_p1s_add(dst, ppoints, npoints);
+// }
+// static void go_p2slice_to_affine(blst_p2_affine dst[],
+//                                  const blst_p2 points[], size_t npoints)
+// { const blst_p2 *ppoints[2] = { points, NULL };
+//   blst_p2s_to_affine(dst, ppoints, npoints);
+// }
+// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[],
+//                            size_t npoints)
+// { const blst_p2_affine *ppoints[2] = { points, NULL };
+//   blst_p2s_add(dst, ppoints, npoints);
+// }
+//
+// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine,
+//                              const byte *scalar, size_t nbits)
+// { blst_p1 m[1];
+//   const void *p = x;
+//   if (p == NULL)
+//       p = blst_p1_generator();
+//   else if (affine)
+//       blst_p1_from_affine(m, p), p = m;
+//   blst_p1_mult(m, p, scalar, nbits);
+//   blst_p1_add_or_double(acc, acc, m);
+// }
+// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine,
+//                              const byte *scalar, size_t nbits)
+// { blst_p2 m[1];
+//   const void *p = x;
+//   if (p == NULL)
+//       p = blst_p2_generator();
+//   else if (affine)
+//       blst_p2_from_affine(m, p), p = m;
+//   blst_p2_mult(m, p, scalar, nbits);
+//   blst_p2_add_or_double(acc, acc, m);
+// }
+//
+// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine)
+// { blst_p1 minus_b;
+//   if (affine)
+//       blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x);
+//   else
+//       minus_b = *(const blst_p1*)x;
+//   blst_p1_cneg(&minus_b, 1);
+//   blst_p1_add_or_double(a, a, &minus_b);
+// }
+//
+// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine)
+// { blst_p2 minus_b;
+//   if (affine)
+//       blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x);
+//   else
+//       minus_b = *(const blst_p2*)x;
+//   blst_p2_cneg(&minus_b, 1);
+//   blst_p2_add_or_double(a, a, &minus_b);
+// }
+//
+// static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in)
+// {
blst_scalar_from_bendian(ret, in); +// return blst_sk_check(ret); +// } +// static bool go_hash_to_scalar(blst_scalar *ret, +// const byte *msg, size_t msg_len, +// const byte *DST, size_t DST_len) +// { byte elem[48]; +// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); +// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); +// } +// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], +// const blst_p1_affine P[], +// size_t npoints, bool acc) +// { const blst_p2_affine *Qs[2] = { Q, NULL }; +// const blst_p1_affine *Ps[2] = { P, NULL }; +// if (acc) { +// blst_fp12 tmp; +// blst_miller_loop_n(&tmp, Qs, Ps, npoints); +// blst_fp12_mul(dst, dst, &tmp); +// } else { +// blst_miller_loop_n(dst, Qs, Ps, npoints); +// } +// } +// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) +// { size_t i; +// blst_fp12_mul(dst, &in[0], &in[1]); +// for (i = 2; i < n; i++) +// blst_fp12_mul(dst, dst, &in[i]); +// } +// static bool go_p1_affine_validate(const blst_p1_affine *p, bool infcheck) +// { if (infcheck && blst_p1_affine_is_inf(p)) +// return 0; +// return blst_p1_affine_in_g1(p); +// } +// static bool go_p2_affine_validate(const blst_p2_affine *p, bool infcheck) +// { if (infcheck && blst_p2_affine_is_inf(p)) +// return 0; +// return blst_p2_affine_in_g2(p); +// } +import "C" + +import "runtime" + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_FP_BYTES = 384 / 8 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []C.blst_pairing +type SecretKey = Scalar +type P1s []P1 +type P2s []P2 +type P1Affines []P1Affine +type P2Affines []P2Affine + +// +// Configuration +// + +var maxProcs = initMaxProcs() + +func initMaxProcs() int { + maxProcs := runtime.GOMAXPROCS(0) + var version float32 + _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) + if err != nil || version < 1.14 { + // be cooperative and leave one processor for the application + maxProcs -= 1 + } + if maxProcs <= 0 { + maxProcs = 1 + } + return maxProcs +} + +func SetMaxProcs(max int) { + if max <= 0 { + max = 1 + } + maxProcs = max +} + +func numThreads(maxThreads int) int { + numThreads := maxProcs + + // take into consideration the possility that application reduced + // GOMAXPROCS after |maxProcs| was initialized + numProcs := runtime.GOMAXPROCS(0) + if maxProcs > numProcs { + numThreads = numProcs + } + + if maxThreads > 0 && numThreads > maxThreads { + return maxThreads + } + return numThreads +} + +var cgo_pairingSizeOf = C.blst_pairing_sizeof() +var cgo_p1Generator = *C.blst_p1_generator() +var cgo_p2Generator = *C.blst_p2_generator() +var cgo_fp12One = *C.blst_fp12_one() + +// +// Secret key +// +func (sk *SecretKey) Zeroize() { + var zero SecretKey + *sk = zero +} + +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, 
but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + if len(optional) > 0 { + info = optional[0] + } + C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + ptrOrNil(info), C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func DeriveMasterEip2333(ikm []byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { + var sk SecretKey + C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
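+ // (applications that need prompt erasure should call Zeroize on the
+ // derived key themselves once it is no longer needed, rather than rely
+ // on this finalizer)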
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +// +// Pairing +// +func pairingSizeOf(DST_len C.size_t) int { + return int((cgo_pairingSizeOf + DST_len + 7) / 8) +} + +func PairingCtx(hash_or_encode bool, DST []byte) Pairing { + DST_len := C.size_t(len(DST)) + ctx := make([]C.blst_pairing, pairingSizeOf(DST_len)) + C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), ptrOrNil(DST), DST_len) + return ctx +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit(&ctx[0]) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) +} + +func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { + C.blst_pairing_raw_aggregate(&ctx[0], q, p) +} + +func PairingAsFp12(ctx Pairing) *Fp12 { + var pt Fp12 + C.go_pairing_as_fp12(&pt, &ctx[0]) + return &pt +} + +func Fp12One() Fp12 { + return cgo_fp12One +} + +func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { + return bool(C.blst_fp12_finalverify(pt1, pt2)) +} + +func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { + var pt Fp12 + C.blst_miller_loop(&pt, q, p) + return &pt +} + +func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { + if len(qs) != len(ps) || len(qs) == 0 { + panic("inputs' lengths mismatch") + } + + nElems := uint32(len(qs)) + nThreads := uint32(maxProcs) + + if nThreads == 1 || nElems == 1 { + var pt Fp12 + C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) + return &pt + } + + stride := (nElems + nThreads - 1) / nThreads + if stride > 16 { + stride = 16 + } + + strides := (nElems + stride - 1) / stride + if nThreads > strides { + nThreads = strides + } + + msgsCh := make(chan Fp12, nThreads) + curElem := uint32(0) + + for tid := uint32(0); tid < nThreads; tid++ { + go func() { + acc := Fp12One() + first := true + for { + work := atomic.AddUint32(&curElem, stride) - stride + if work >= nElems { + break + } + n := nElems - work + if n > stride { + n = stride + } + C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), + C.bool(!first)) + first = false + } + msgsCh <- acc + }() + } + + var ret = make([]Fp12, nThreads); + for i := range(ret) { + ret[i] = <- msgsCh + } + + var pt Fp12 + C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) + return &pt +} + +func (pt *Fp12) MulAssign(p *Fp12) { + C.blst_fp12_mul(pt, pt, p) +} + +func (pt *Fp12) FinalExp() { + C.blst_final_exp(pt, pt) +} + +func (pt *Fp12) InGroup() bool { + return bool(C.blst_fp12_in_group(pt)) +} + +func (pt *Fp12) ToBendian() []byte { + var out [BLST_FP_BYTES*12]byte + C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) + return out[:] +} + +func (pt1 *Fp12) Equals(pt2 *Fp12) bool { + return *pt1 == *pt2 +} + +func ptrOrNil(bytes []byte) *C.byte { + var ptr *C.byte + if len(bytes) > 0 { + ptr = (*C.byte)(&bytes[0]) + } + return ptr +} diff --git a/src/blst/bindings/go/blst_htoc_test.go b/src/blst/bindings/go/blst_htoc_test.go new file mode 100644 index 0000000000..9cf6b2417b --- /dev/null +++ b/src/blst/bindings/go/blst_htoc_test.go @@ -0,0 +1,227 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +import ( + "bytes" + "encoding/hex" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "strconv" + "strings" + "testing" +) + +func decodeP1(m map[string]interface{}) *P1Affine { + x, err := hex.DecodeString(m["x"].(string)[2:]) + if err != nil { + fmt.Println(err) + return nil + } + y, err := hex.DecodeString(m["y"].(string)[2:]) + if err != nil { + fmt.Println(err) + return nil + } + var p1 P1Affine + p1.x.FromBEndian(x) + p1.y.FromBEndian(y) + return &p1 +} + +func jsonG1HashToCurve(t *testing.T, fname string) { + t.Helper() + vfile, err := os.Open(fname) + if err != nil { + t.Skipf("%.16s... not found", fname) + } + defer vfile.Close() + buf, err := ioutil.ReadAll(vfile) + if err != nil { + t.Errorf(err.Error()) + } + + var vectors map[string]interface{} + err = json.Unmarshal(buf, &vectors) + if err != nil { + t.Errorf(err.Error()) + } + + dst := []byte(vectors["dst"].(string)) + hash_or_encode := vectors["randomOracle"].(bool) + + vectorsArr, ok := vectors["vectors"].([]interface{}) + if !ok { + t.Errorf("Could not cast vectors to an array") + } + + for _, v := range vectorsArr { + testMap, ok := v.(map[string]interface{}) + if !ok { + t.Errorf("Could not cast vector to map") + } + + msg := []byte(testMap["msg"].(string)) + p1Expected := decodeP1(testMap["P"].(map[string]interface{})) + var p1Hashed *P1Affine + if hash_or_encode { + p1Hashed = HashToG1(msg, dst).ToAffine() + } else { + p1Hashed = EncodeToG1(msg, dst).ToAffine() + } + + if !p1Hashed.Equals(p1Expected) { + t.Errorf("hashed != expected") + } + } +} + +func TestG1HashToCurve(t *testing.T) { + t.Parallel() + jsonG1HashToCurve(t, "../vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json") + jsonG1HashToCurve(t, "../vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json") +} + +func decodeP2(m map[string]interface{}) *P2Affine { + xArr := strings.Split(m["x"].(string), ",") + x0, err := hex.DecodeString(xArr[0][2:]) + if err != nil { + fmt.Println(err) + return nil + } + x1, err := hex.DecodeString(xArr[1][2:]) + if err != nil { + fmt.Println(err) + return nil + } + yArr := strings.Split(m["y"].(string), ",") + y0, err := hex.DecodeString(yArr[0][2:]) + if err != nil { + fmt.Println(err) + return nil + } + y1, err := hex.DecodeString(yArr[1][2:]) + if err != nil { + fmt.Println(err) + return nil + } + var p2 P2Affine + p2.x.fp[0].FromBEndian(x0) + p2.x.fp[1].FromBEndian(x1) + p2.y.fp[0].FromBEndian(y0) + p2.y.fp[1].FromBEndian(y1) + return &p2 +} + +func jsonG2HashToCurve(t *testing.T, fname string) { + t.Helper() + vfile, err := os.Open(fname) + if err != nil { + t.Skipf("%.16s... 
not found", fname) + } + defer vfile.Close() + buf, err := ioutil.ReadAll(vfile) + if err != nil { + t.Errorf(err.Error()) + } + + var vectors map[string]interface{} + err = json.Unmarshal(buf, &vectors) + if err != nil { + t.Errorf(err.Error()) + } + + dst := []byte(vectors["dst"].(string)) + hash_or_encode := vectors["randomOracle"].(bool) + + vectorsArr, ok := vectors["vectors"].([]interface{}) + if !ok { + t.Errorf("Could not cast vectors to an array") + } + + for _, v := range vectorsArr { + testMap, ok := v.(map[string]interface{}) + if !ok { + t.Errorf("Could not cast vector to map") + } + + msg := []byte(testMap["msg"].(string)) + p2Expected := decodeP2(testMap["P"].(map[string]interface{})) + var p2Hashed *P2Affine + if hash_or_encode { + p2Hashed = HashToG2(msg, dst).ToAffine() + } else { + p2Hashed = EncodeToG2(msg, dst).ToAffine() + } + + if !p2Hashed.Equals(p2Expected) { + t.Errorf("hashed != expected") + } + } +} + +func TestG2HashToCurve(t *testing.T) { + t.Parallel() + jsonG2HashToCurve(t, "../vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json") + jsonG2HashToCurve(t, "../vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json") +} + +func jsonExpandMessageXmd(t *testing.T, fname string) { + t.Helper() + vfile, err := os.Open(fname) + if err != nil { + t.Skipf("%.16s... not found", fname) + } + defer vfile.Close() + buf, err := ioutil.ReadAll(vfile) + if err != nil { + t.Errorf(err.Error()) + } + + var vectors map[string]interface{} + err = json.Unmarshal(buf, &vectors) + if err != nil { + t.Errorf(err.Error()) + } + + DST := []byte(vectors["DST"].(string)) + + tests, ok := vectors["tests"].([]interface{}) + if !ok { + t.Errorf("Could not cast 'tests' to an array") + } + + for _, v := range tests { + test, ok := v.(map[string]interface{}) + if !ok { + t.Errorf("Could not map 'tests[]' element") + } + + len_in_bytes, err := strconv.ParseInt(test["len_in_bytes"].(string), 0, 0) + if err != nil { + t.Errorf(err.Error()) + } + msg := []byte(test["msg"].(string)) + expected, err := hex.DecodeString(test["uniform_bytes"].(string)) + if err != nil { + t.Errorf(err.Error()) + } + + hashed := expandMessageXmd(msg, DST, int(len_in_bytes)) + if !bytes.Equal(hashed, expected) { + t.Errorf("hashed != expected") + } + } +} + +func TestExpandMessageXmd(t *testing.T) { + t.Parallel() + jsonExpandMessageXmd(t, "../vectors/hash_to_curve/expand_message_xmd_SHA256_256.json") + jsonExpandMessageXmd(t, "../vectors/hash_to_curve/expand_message_xmd_SHA256_38.json") +} diff --git a/src/blst/bindings/go/blst_miller_loop_test.go b/src/blst/bindings/go/blst_miller_loop_test.go new file mode 100644 index 0000000000..1d636e3388 --- /dev/null +++ b/src/blst/bindings/go/blst_miller_loop_test.go @@ -0,0 +1,38 @@ +package blst + +import ( + "crypto/rand" + "testing" +) + +func TestMillerLoopN(t *testing.T) { + t.Parallel() + const npoints = 97 + scalars := make([]byte, npoints*8) + _, err := rand.Read(scalars) + if err != nil { + t.Errorf(err.Error()) + return + } + + p1s := make([]P1, npoints) + p2s := make([]P2, npoints) + g1 := P1Generator() + g2 := P2Generator() + for i := range p1s { + p1s[i] = *g1.Mult(scalars[i*8:i*8+4], 32) + p2s[i] = *g2.Mult(scalars[i*8+4:i*8+8], 32) + } + + ps := P1s(p1s).ToAffine() + qs := P2s(p2s).ToAffine() + + naive := Fp12One() + for i := range p1s { + naive.MulAssign(Fp12MillerLoop(&qs[i], &ps[i])) + } + + if !naive.Equals(Fp12MillerLoopN(qs, ps)) { + t.Errorf("failed self-consistency Fp12MillerLoopN test") + } +} diff --git a/src/blst/bindings/go/blst_minpk.tgo 
b/src/blst/bindings/go/blst_minpk.tgo new file mode 100644 index 0000000000..7d329bbc23 --- /dev/null +++ b/src/blst/bindings/go/blst_minpk.tgo @@ -0,0 +1,602 @@ + +import ( + "runtime" + "sync" + "sync/atomic" +) + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +func (pk *P1Affine) KeyValidate() bool { + return bool(C.go_p1_affine_validate(pk, true)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { + return bool(C.go_p2_affine_validate(sig, C.bool(sigInfcheck))) +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, + pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
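+ // augs, when supplied through the variadic options, must provide one
+ // augmentation blob per public key; each blob is hashed together with
+ // the corresponding message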
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } + return pks[i], nil + } + + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (_ *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0] & 0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0] & 0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, + pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. 
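+ // (the caller locks |mutex| before spawning the workers and releases it
+ // only after blst_aggregated_in_g2 below has run, so this Lock/Unlock
+ // pair simply parks this worker until the signature's miller loop is done)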
+ mutex.Lock() + mutex.Unlock() //nolint:staticcheck + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g2(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P2Affine { + return sig + } + pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (_ *P2Affine) MultipleAggregateVerify(sigs []*P2Affine, + sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, _ *P2Affine, _ *P1Affine, rand *Scalar) ( + *P2Affine, *P1Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, + rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P1Affine + var tempSig P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AggregateWithRandomness(pointsIf interface{}, + scalarsIf interface{}, nbits int, groupcheck bool) bool { + if groupcheck && !P2AffinesValidate(pointsIf) { + return false + } + agg.v = P2AffinesMult(pointsIf, scalarsIf, nbits) + return true +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.coreAggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { + if 
other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return new(P2Affine) + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) coreAggregate(getter aggGetterP2, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} diff --git a/src/blst/bindings/go/blst_minpk_test.go b/src/blst/bindings/go/blst_minpk_test.go new file mode 100644 index 0000000000..44c55b2968 --- /dev/null +++ b/src/blst/bindings/go/blst_minpk_test.go @@ -0,0 +1,722 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +import ( + "crypto/rand" + "fmt" + "runtime" + "testing" +) + +// Min PK. +type PublicKeyMinPk = P1Affine +type SignatureMinPk = P2Affine +type AggregateSignatureMinPk = P2Aggregate +type AggregatePublicKeyMinPk = P1Aggregate + +// Names in this file must be unique to support min-sig so we can't use 'dst' +// here. 
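+// (generate.py rewrites this file into blst_minsig_test.go by swapping the
+// MinPk/MinSig and P1/P2 names; both files end up in the same package, so a
+// shared name such as 'dst' would be declared twice.)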
+var dstMinPk = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_") + +func init() { + // Use all cores when testing and benchmarking + SetMaxProcs(runtime.GOMAXPROCS(0)) +} + +func TestInfinityMinPk(t *testing.T) { + t.Parallel() + var infComp [48]byte + infComp[0] |= 0xc0 + new(PublicKeyMinPk).Uncompress(infComp[:]) +} + +func TestSerdesMinPk(t *testing.T) { + t.Parallel() + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + defer sk.Zeroize() + + // Serialize/deserialize sk + sk2 := new(SecretKey).Deserialize(sk.Serialize()) + defer sk2.Zeroize() + if !sk.Equals(sk2) { + t.Errorf("sk2 != sk") + } + + // Negative test equals + sk.b[0]++ + if sk.Equals(sk2) { + t.Errorf("sk2 == sk") + } + + // pk + pk := new(PublicKeyMinPk).From(sk) + + // Compress/decompress sk + pk2 := new(PublicKeyMinPk).Uncompress(pk.Compress()) + if !pk.Equals(pk2) { + t.Errorf("pk2 != pk") + } + + // Serialize/deserialize sk + pk3 := new(PublicKeyMinPk).Deserialize(pk.Serialize()) + if !pk.Equals(pk3) { + t.Errorf("pk3 != pk") + } + + // Negative test equals + // pk.x.l[0] = pk.x.l[0] + 1 + // if pk.Equals(pk2) { + // t.Errorf("pk2 == pk") + // } +} + +func TestSignVerifyMinPk(t *testing.T) { + t.Parallel() + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk0 := KeyGen(ikm[:]) + ikm[0]++ + sk1 := KeyGen(ikm[:]) + + // pk + pk0 := new(PublicKeyMinPk).From(sk0) + pk1 := new(PublicKeyMinPk).From(sk1) + + // Sign + msg0 := []byte("hello foo") + msg1 := []byte("hello bar!") + sig0 := new(SignatureMinPk).Sign(sk0, msg0, dstMinPk) + sig1 := new(SignatureMinPk).Sign(sk1, msg1, dstMinPk) + + // Verify + if !sig0.Verify(true, pk0, false, msg0, dstMinPk) { + t.Errorf("verify sig0") + } + if !sig1.Verify(true, pk1, false, msg1, dstMinPk) { + t.Errorf("verify sig1") + } + if !new(SignatureMinPk).VerifyCompressed(sig1.Compress(), true, + pk1.Compress(), false, + msg1, dstMinPk) { + t.Errorf("verify sig1") + } + // Batch verify + if !sig0.AggregateVerify(true, []*PublicKeyMinPk{pk0}, false, + []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0") + } + // Verify compressed inputs + if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Compress(), true, + [][]byte{pk0.Compress()}, + false, + []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0 compressed") + } + + // Verify serialized inputs + if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Compress(), true, + [][]byte{pk0.Serialize()}, + false, + []Message{msg0}, dstMinPk) { + t.Errorf("aggregate verify sig0 serialized") + } + + // Compressed with empty pk + var emptyPk []byte + if new(SignatureMinPk).VerifyCompressed(sig0.Compress(), true, + emptyPk, false, msg0, dstMinPk) { + t.Errorf("verify sig compressed inputs") + } + // Wrong message + if sig0.Verify(true, pk0, false, msg1, dstMinPk) { + t.Errorf("Expected Verify to return false") + } + // Wrong key + if sig0.Verify(true, pk1, false, msg0, dstMinPk) { + t.Errorf("Expected Verify to return false") + } + // Wrong sig + if sig1.Verify(true, pk0, false, msg0, dstMinPk) { + t.Errorf("Expected Verify to return false") + } +} + +func TestSignVerifyAugMinPk(t *testing.T) { + t.Parallel() + sk := genRandomKeyMinPk() + pk := 
new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + aug := []byte("augmentation") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, aug) + if !sig.Verify(true, pk, false, msg, dstMinPk, aug) { + t.Errorf("verify sig") + } + aug2 := []byte("augmentation2") + if sig.Verify(true, pk, false, msg, dstMinPk, aug2) { + t.Errorf("verify sig, wrong augmentation") + } + if sig.Verify(true, pk, false, msg, dstMinPk) { + t.Errorf("verify sig, no augmentation") + } + // TODO: augmentation with aggregate verify +} + +func TestSignVerifyEncodeMinPk(t *testing.T) { + t.Parallel() + sk := genRandomKeyMinPk() + pk := new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, false) + if !sig.Verify(true, pk, false, msg, dstMinPk, false) { + t.Errorf("verify sig") + } + if sig.Verify(true, pk, false, msg, dstMinPk) { + t.Errorf("verify sig expected fail, wrong hashing engine") + } + if sig.Verify(true, pk, false, msg, dstMinPk, 0) { + t.Errorf("verify sig expected fail, illegal argument") + } +} + +func TestSignVerifyAggregateMinPk(t *testing.T) { + t.Parallel() + for size := 1; size < 20; size++ { + sks, msgs, _, pubks, _, err := + generateBatchTestDataUncompressedMinPk(size) + if err { + t.Errorf("Error generating test data") + return + } + + // All signers sign the same message + sigs := make([]*SignatureMinPk, 0) + for i := 0; i < size; i++ { + sigs = append(sigs, new(SignatureMinPk).Sign(sks[i], msgs[0], + dstMinPk)) + } + agProj := new(AggregateSignatureMinPk) + if !agProj.Aggregate(sigs, false) { + t.Errorf("Aggregate unexpectedly returned nil") + return + } + agSig := agProj.ToAffine() + + if !agSig.FastAggregateVerify(false, pubks, msgs[0], dstMinPk) { + t.Errorf("failed to verify size %d", size) + } + + // Negative test + if agSig.FastAggregateVerify(false, pubks, msgs[0][1:], dstMinPk) { + t.Errorf("failed to not verify size %d", size) + } + + // Test compressed signature aggregation + compSigs := make([][]byte, size) + for i := 0; i < size; i++ { + compSigs[i] = sigs[i].Compress() + } + agProj = new(AggregateSignatureMinPk) + if !agProj.AggregateCompressed(compSigs, false) { + t.Errorf("AggregateCompressed unexpectedly returned nil") + return + } + agSig = agProj.ToAffine() + if !agSig.FastAggregateVerify(false, pubks, msgs[0], dstMinPk) { + t.Errorf("failed to verify size %d", size) + } + + // Negative test + if agSig.FastAggregateVerify(false, pubks, msgs[0][1:], dstMinPk) { + t.Errorf("failed to not verify size %d", size) + } + } +} + +func TestSignMultipleVerifyAggregateMinPk(t *testing.T) { + t.Parallel() + msgCount := 5 + for size := 1; size < 20; size++ { + msgs := make([]Message, 0) + sks := make([]*SecretKey, 0) + pks := make([]*PublicKeyMinPk, 0) + + // Generate messages + for i := 0; i < msgCount; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! 
%d %d", i, size)) + msgs = append(msgs, msg) + } + + // Generate keypairs + for i := 0; i < size; i++ { + priv := genRandomKeyMinPk() + sks = append(sks, priv) + pks = append(pks, new(PublicKeyMinPk).From(priv)) + } + + // All signers sign each message + aggSigs := make([]*SignatureMinPk, 0) + aggPks := make([]*PublicKeyMinPk, 0) + for i := 0; i < msgCount; i++ { + sigsToAgg := make([]*SignatureMinPk, 0) + pksToAgg := make([]*PublicKeyMinPk, 0) + for j := 0; j < size; j++ { + sigsToAgg = append(sigsToAgg, + new(SignatureMinPk).Sign(sks[j], msgs[i], + dstMinPk)) + pksToAgg = append(pksToAgg, pks[j]) + } + + agSig := new(AggregateSignatureMinPk) + if !agSig.Aggregate(sigsToAgg, true) { + t.Errorf("failed to aggregate") + } + afSig := agSig.ToAffine() + agPk := new(AggregatePublicKeyMinPk) + agPk.Aggregate(pksToAgg, false) + afPk := agPk.ToAffine() + aggSigs = append(aggSigs, afSig) + aggPks = append(aggPks, afPk) + + // Verify aggregated signature and pk + if !afSig.Verify(false, afPk, false, msgs[i], dstMinPk) { + t.Errorf("failed to verify single aggregate size %d", size) + } + + } + + randFn := func(s *Scalar) { + var rbytes [BLST_SCALAR_BYTES]byte + _, err := rand.Read(rbytes[:]) + if err != nil { + t.Errorf(err.Error()) + } + s.FromBEndian(rbytes[:]) + } + + // Verify + randBits := 64 + if !new(SignatureMinPk).MultipleAggregateVerify(aggSigs, true, + aggPks, false, + msgs, dstMinPk, + randFn, randBits) { + t.Errorf("failed to verify multiple aggregate size %d", size) + } + + // Negative test + if new(SignatureMinPk).MultipleAggregateVerify(aggSigs, true, + aggPks, false, + msgs, dstMinPk[1:], + randFn, randBits) { + t.Errorf("failed to not verify multiple aggregate size %d", size) + } + } +} + +func TestBatchUncompressMinPk(t *testing.T) { + t.Parallel() + size := 128 + var points []*P2Affine + var compPoints [][]byte + + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! 
%d", i)) + p2 := HashToG2(msg, dstMinPk).ToAffine() + points = append(points, p2) + compPoints = append(compPoints, p2.Compress()) + } + uncompPoints := new(SignatureMinPk).BatchUncompress(compPoints) + if uncompPoints == nil { + t.Errorf("BatchUncompress returned nil size %d", size) + } + for i := 0; i < size; i++ { + if !points[i].Equals(uncompPoints[i]) { + t.Errorf("Uncompressed point does not equal initial point %d", i) + } + } +} + +func BenchmarkCoreSignMinPk(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + defer sk.Zeroize() + msg := []byte("hello foo") + for i := 0; i < b.N; i++ { + new(SignatureMinPk).Sign(sk, msg, dstMinPk) + } +} + +func BenchmarkCoreVerifyMinPk(b *testing.B) { + var ikm = [...]byte{ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, + 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, + 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} + + sk := KeyGen(ikm[:]) + defer sk.Zeroize() + pk := new(PublicKeyMinPk).From(sk) + msg := []byte("hello foo") + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk) + + // Verify + for i := 0; i < b.N; i++ { + if !sig.Verify(true, pk, false, msg, dstMinPk) { + b.Fatal("verify sig") + } + } +} + +func BenchmarkCoreVerifyAggregateMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + b.Helper() + msgs, _, pubks, agsig, err := generateBatchTestDataMinPk(size) + if err { + b.Fatal("Error generating test data") + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !new(SignatureMinPk).AggregateVerifyCompressed(agsig, true, + pubks, false, + msgs, dstMinPk) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkVerifyAggregateUncompressedMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + b.Helper() + _, msgs, _, pubks, agsig, err := + generateBatchTestDataUncompressedMinPk(size) + if err { + b.Fatal("Error generating test data") + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !agsig.AggregateVerify(true, pubks, false, msgs, dstMinPk) { + b.Fatal("failed to verify") + } + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func BenchmarkCoreAggregateMinPk(b *testing.B) { + run := func(size int) func(b *testing.B) { + return func(b *testing.B) { + b.Helper() + _, sigs, _, _, err := generateBatchTestDataMinPk(size) + if err { + b.Fatal("Error generating test data") + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + var agg AggregateSignatureMinPk + agg.AggregateCompressed(sigs, true) + } + } + } + + b.Run("1", run(1)) + b.Run("10", run(10)) + b.Run("50", run(50)) + b.Run("100", run(100)) + b.Run("300", run(300)) + b.Run("1000", run(1000)) + b.Run("4000", run(4000)) +} + +func genRandomKeyMinPk() *SecretKey { + // Generate 32 bytes of randomness + var ikm [32]byte + _, err := rand.Read(ikm[:]) + + if err != nil { + return nil + } + return KeyGen(ikm[:]) +} + +func generateBatchTestDataMinPk(size int) (msgs []Message, + sigs [][]byte, pubks [][]byte, agsig []byte, err 
bool) { + err = false + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinPk() + sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk). + Compress()) + pubks = append(pubks, new(PublicKeyMinPk).From(priv).Compress()) + } + agProj := new(AggregateSignatureMinPk) + if !agProj.AggregateCompressed(sigs, true) { + fmt.Println("AggregateCompressed unexpectedly returned nil") + err = true + return //nolint:revive + } + agAff := agProj.ToAffine() + if agAff == nil { + fmt.Println("ToAffine unexpectedly returned nil") + err = true + return //nolint:revive + } + agsig = agAff.Compress() + return //nolint:revive +} + +func generateBatchTestDataUncompressedMinPk(size int) (sks []*SecretKey, + msgs []Message, sigs []*SignatureMinPk, //nolint:unparam + pubks []*PublicKeyMinPk, agsig *SignatureMinPk, err bool) { + err = false + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + msgs = append(msgs, msg) + priv := genRandomKeyMinPk() + sks = append(sks, priv) + sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk)) + pubks = append(pubks, new(PublicKeyMinPk).From(priv)) + } + agProj := new(AggregateSignatureMinPk) + if !agProj.Aggregate(sigs, true) { + fmt.Println("Aggregate unexpectedly returned nil") + err = true + return //nolint:revive + } + agsig = agProj.ToAffine() + return //nolint:revive +} + +func BenchmarkBatchUncompressMinPk(b *testing.B) { + size := 128 + var compPoints [][]byte + + for i := 0; i < size; i++ { + msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) + p2 := HashToG2(msg, dstMinPk).ToAffine() + compPoints = append(compPoints, p2.Compress()) + } + b.Run("Single", func(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + var tmp SignatureMinPk + for i := 0; i < b.N; i++ { + for j := 0; j < size; j++ { + if tmp.Uncompress(compPoints[j]) == nil { + b.Fatal("could not uncompress point") + } + } + } + }) + b.Run("Batch", func(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + var tmp SignatureMinPk + for i := 0; i < b.N; i++ { + if tmp.BatchUncompress(compPoints) == nil { + b.Fatal("could not batch uncompress points") + } + } + }) +} + +func TestSignVerifyAggregateValidatesInfinitePubkeyMinPk(t *testing.T) { + t.Parallel() + size := 20 + sks, msgs, _, pubks, _, err := + generateBatchTestDataUncompressedMinPk(size) + if err { + t.Errorf("Error generating test data") + return + } + + // All signers sign the same message + sigs := make([]*SignatureMinPk, size) + for i := range sigs { + sigs[i] = new(SignatureMinPk).Sign(sks[i], msgs[i], dstMinPk) + } + + // Single message: Infinite pubkeys and signature + zeroKey := new(PublicKeyMinPk) + zeroSig := new(SignatureMinPk) + agProj := new(AggregateSignatureMinPk) + if !agProj.Aggregate([]*SignatureMinPk{zeroSig}, false) { + t.Errorf("Aggregate unexpectedly returned nil") + return + } + agSig := agProj.ToAffine() + + if agSig.AggregateVerify(false, []*PublicKeyMinPk{zeroKey}, false, + [][]byte{msgs[0]}, dstMinPk) { + t.Errorf("failed to NOT verify signature") + } + + // Replace firstkey with infinite pubkey. 
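+	// The remaining pairs still match, but verification must reject any
+	// aggregate that contains the identity (infinity) public key.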
+ pubks[0] = zeroKey + sigs[0] = zeroSig + agProj = new(AggregateSignatureMinPk) + if !agProj.Aggregate(sigs, false) { + t.Errorf("Aggregate unexpectedly returned nil") + return + } + agSig = agProj.ToAffine() + + if agSig.AggregateVerify(false, pubks, false, msgs, dstMinPk) { + t.Errorf("failed to NOT verify signature") + } +} + +func TestEmptyMessageMinPk(t *testing.T) { + t.Parallel() + msg := []byte("") + var sk_bytes = []byte {99, 64, 58, 175, 15, 139, 113, 184, 37, 222, 127, + 204, 233, 209, 34, 8, 61, 27, 85, 251, 68, 31, 255, 214, 8, 189, 190, 71, + 198, 16, 210, 91}; + sk := new(SecretKey).Deserialize(sk_bytes) + pk := new(PublicKeyMinPk).From(sk) + sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk) + if !new(SignatureMinPk).VerifyCompressed(sig.Compress(), true, + pk.Compress(), false, msg, dstMinPk) { + t.Errorf("failed to verify empty message") + } +} + +func TestEmptySignatureMinPk(t *testing.T) { + t.Parallel() + msg := []byte("message") + var sk_bytes = []byte {99, 64, 58, 175, 15, 139, 113, 184, 37, 222, 127, + 204, 233, 209, 34, 8, 61, 27, 85, 251, 68, 31, 255, 214, 8, 189, 190, 71, + 198, 16, 210, 91}; + sk := new(SecretKey).Deserialize(sk_bytes) + pk := new(PublicKeyMinPk).From(sk) + var emptySig []byte + if new(SignatureMinPk).VerifyCompressed(emptySig, true, pk.Compress(), false, msg, dstMinPk) { + t.Errorf("failed to NOT verify empty signature") + } +} + +func TestMultiScalarP1(t *testing.T) { + t.Parallel() + const npoints = 1027 + scalars := make([]byte, npoints*16) + _, err := rand.Read(scalars) + if err != nil { + t.Errorf(err.Error()) + return + } + points := make([]P1, npoints) + refs := make([]P1, npoints) + generator := P1Generator() + for i := range points { + points[i] = *generator.Mult(scalars[i*4:(i+1)*4]) + refs[i] = *points[i].Mult(scalars[i*16:(i+1)*16], 128) + if i < 27 { + ref := P1s(refs[:i+1]).Add() + ret := P1s(points[:i+1]).Mult(scalars, 128) + if !ref.Equals(ret) { + t.Errorf("failed self-consistency multi-scalar test") + } + } + } + ref := P1s(refs).Add() + ret := P1s(points).Mult(scalars, 128) + if !ref.Equals(ret) { + t.Errorf("failed self-consistency multi-scalar test") + } +} + +func BenchmarkMultiScalarP1(b *testing.B) { + const npoints = 200000 + scalars := make([]byte, npoints*32) + _, err := rand.Read(scalars) + if err != nil { + b.Fatal(err.Error()) + } + temp := make([]P1, npoints) + generator := P1Generator() + for i := range temp { + temp[i] = *generator.Mult(scalars[i*4:(i+1)*4]) + } + points := P1s(temp).ToAffine() + run := func(points []P1Affine) func(b *testing.B) { + return func(b *testing.B) { + b.Helper() + for i:=0; i window*ncpus { //nolint:nestif + nx = 1 + wnd = bits.Len(uint(ncpus)/4) + if (window + wnd) > 18 { + wnd = window - wnd + } else { + wnd = (nbits / window + ncpus - 1) / ncpus; + if (nbits / (window + 1) + ncpus - 1) / ncpus < wnd { + wnd = window + 1; + } else { + wnd = window; + } + } + } else { + nx = 2 + wnd = window-2 + for (nbits/wnd+1)*nx < ncpus { + nx += 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + nx -= 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + ny = nbits/wnd + 1 + wnd = nbits/ny + 1 + + return nx, ny, wnd +} + +func pippenger_window_size(npoints int) int { + wbits := bits.Len(uint(npoints)) + + if wbits > 13 { + return wbits - 4 + } + if wbits > 5 { + return wbits - 3 + } + return 2 +} diff --git a/src/blst/bindings/go/blst_px.tgo b/src/blst/bindings/go/blst_px.tgo new file mode 100644 index 0000000000..ebe2ac3c28 --- /dev/null +++ b/src/blst/bindings/go/blst_px.tgo @@ -0,0 +1,804 @@ +func 
PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(aug), C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. +// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} + +func (p1 *P1Affine) InG1() bool { + return bool(C.blst_p1_affine_in_g1(p1)) +} + +func (_ *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P1Affine, n) + pointsPtrs := make([]*P1Affine, n) + + numThreads := numThreads(n) + + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if ! 
<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val)*8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) + return p1 +} + +func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { + ret := *p1 + return ret.MultAssign(scalarIf, optional...) +} + +func (p1 *P1) AddAssign(pointIf interface{}) *P1 { + switch val := pointIf.(type) { + case *P1: + C.blst_p1_add_or_double(p1, p1, val) + case *P1Affine: + C.blst_p1_add_or_double_affine(p1, p1, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p1 +} + +func (p1 *P1) Add(pointIf interface{}) *P1 { + ret := *p1 + return ret.AddAssign(pointIf) +} + +func (p1 *P1) SubAssign(pointIf interface{}) *P1 { + var x *Fp + var affine C.bool + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p1_sub_assign(p1, x, affine) + return p1 +} + +func (p1 *P1) Sub(pointIf interface{}) *P1 { + ret := *p1 + return ret.SubAssign(pointIf) +} + +func P1Generator() *P1 { + return &cgo_p1Generator +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// group generator point" +func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P1 { + var x *Fp + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val)*8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +func (p *P1) FromAffine(pa *P1Affine) { + C.blst_p1_from_affine(p, pa) +} + +// +// Hash +// +func HashToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_hash_to_g1(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + var aug []byte + if len(optional) > 0 { + aug = optional[0] + } + + C.blst_encode_to_g1(&q, ptrOrNil(msg), C.size_t(len(msg)), + ptrOrNil(dst), C.size_t(len(dst)), + ptrOrNil(aug), C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P1sToAffine(points 
[]*P1, optional ...int) P1Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P1Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P1s) ToAffine(optional ...P1Affines) P1Affines { + npoints := len(points) + var ret P1Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P1Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices + 1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P1Affine, inp *P1, delta int) { + C.go_p1slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P1Affines) Add() *P1 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P1 + C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices + 1, npoints%nslices + + msgs := make(chan P1, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P1Affine, delta int) { + var ret P1 + C.go_p1slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <- msgs + for i := 1; i < nslices; i++ { + msg := <- msgs + C.blst_p1_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P1s) Add() *P1 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits+7)/8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T",val)) + } + + numThreads := numThreads(0) + + if numThreads < 2 { + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints)))/8 + scratch := make([]uint64, sz) + + pointsBySlice := 
[2]*P1Affine{nil, nil} + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[0] + case []P1Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[0].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range(scalars) { + scalars[i] = nil + } + + return &ret + } + + if npoints < 32 { + if numThreads > npoints { + numThreads = npoints + } + + curItem := uint32(0) + msgs := make(chan P1, numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + var acc P1 + + for { + workItem := int(atomic.AddUint32(&curItem, 1) - 1) + if workItem >= npoints { + break + } + + var point *P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[workItem] + case []P1Affine: + point = &val[workItem] + case P1Affines: + point = &val[workItem] + } + + var scalar *C.byte + switch val := scalarsIf.(type) { + case []byte: + scalar = (*C.byte)(&val[workItem*nbytes]) + case [][]byte: + scalar = scalars[workItem] + case []Scalar: + if nbits > 248 { + scalar = &val[workItem].b[0] + } else { + scalar = scalars[workItem] + } + case []*Scalar: + scalar = scalars[workItem] + } + + C.go_p1_mult_n_acc(&acc, &point.x, true, + scalar, C.size_t(nbits)) + } + + msgs <- acc + }() + } + + ret := <-msgs + for tid := 1; tid < numThreads; tid++ { + point := <- msgs + C.blst_p1_add_or_double(&ret, &ret, &point); + } + + for i := range(scalars) { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0))/8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { x, dx, y, dy int + point P1 }, nx*ny) + + dx := npoints/nx + y := window*(ny-1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total*dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz << uint(window-1)) + pointsBySlice := [2]*P1Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[x] + case []P1Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case 
P1Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = &val[x].b[0] + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p1s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)); + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P1 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <- msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p1_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p1_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range(scalars) { + scalars[i] = nil + } + + return &ret +} + +func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { + return P1AffinesMult(points, scalarsIf, nbits) +} + +func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +// +// Group-check +// + +func P1AffinesValidate(pointsIf interface{}) bool { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := numThreads(npoints) + + if numThreads < 2 { + for i := 0; i < npoints; i++ { + var point *P1Affine + + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[i] + case []P1Affine: + point = &val[i] + case P1Affines: + point = &val[i] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p1_affine_validate(point, true) { + return false + } + } + + return true + } + + valid := int32(1) + curItem := uint32(0) + + var wg sync.WaitGroup + wg.Add(numThreads) + + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) != 0 { + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(npoints) { + break + } + + var point *P1Affine + + switch val := pointsIf.(type) { + case []*P1Affine: + point = val[work] + case []P1Affine: + point = &val[work] + case P1Affines: + point = &val[work] + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + if !C.go_p1_affine_validate(point, true) { + atomic.StoreInt32(&valid, 0) + break + } + } + + wg.Done() + }() + } + + wg.Wait() + + return atomic.LoadInt32(&valid) != 0 +} + +func (points P1Affines) Validate() bool { + return P1AffinesValidate(points) +} diff --git a/src/blst/bindings/go/cgo_assembly.S b/src/blst/bindings/go/cgo_assembly.S new file mode 100644 index 0000000000..87d9d37240 --- /dev/null +++ b/src/blst/bindings/go/cgo_assembly.S @@ -0,0 +1 @@ +#include "assembly.S" diff 
--git a/src/blst/bindings/go/cgo_server.c b/src/blst/bindings/go/cgo_server.c new file mode 100644 index 0000000000..eac8d202d3 --- /dev/null +++ b/src/blst/bindings/go/cgo_server.c @@ -0,0 +1 @@ +#include "server.c" diff --git a/src/blst/bindings/go/generate.py b/src/blst/bindings/go/generate.py new file mode 100755 index 0000000000..d40fad2109 --- /dev/null +++ b/src/blst/bindings/go/generate.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +import os +import sys +import re +import subprocess + +here = re.split(r'/(?=[^/]*$)', sys.argv[0]) +if len(here) > 1: + os.chdir(here[0]) + +for dir in re.split(r':', os.getenv("GOPATH")): + goimports = dir + "/bin/goimports" + if os.path.isfile(goimports) and os.access(goimports, os.X_OK): + break + goimports = None + +if goimports is None: + version = subprocess.check_output(["go", "version"]).decode('ascii') + v = re.search(r'version go([0-9]+\.[0-9]+)', version) + if not v: + raise OSError(2, "unparseable output from 'go version'") + if float(v.group(1)) < 1.17: + advice = "'go get golang.org/x/tools/cmd/goimports'" + else: + advice = "'go install golang.org/x/tools/cmd/goimports@latest'" + print("'goimports' is not found on $GOPATH, install with", file=sys.stderr) + print(advice, file=sys.stderr) + sys.exit(1) + +outFile = 'blst.go' + + +def concatFile(fout, fin, removeImports): + for line in fin: + if removeImports and 'import' in line: + while ')' not in line: + line = fin.readline() + continue + print(line, file=fout, end='') + + +def remap(fout, fin, mapping, dont_touch, removeImports): + for line in fin: + if removeImports and 'import' in line: + while ')' not in line: + line = fin.readline() + continue + for (a, b) in dont_touch: + line = line.replace(a, b) + + for (a, b) in mapping: + line = line.replace(a, a+"_tmp") + line = line.replace(b, b+"_tmp") + line = line.replace(a+"_tmp", b) + line = line.replace(b+"_tmp", a) + + for (a, b) in dont_touch: + line = line.replace(b, a) + print(line, file=fout, end='') + +fout = open(outFile, "w") + +print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) +print("// DO NOT MODIFY THIS FILE!!", file=fout) +print("// The file is generated from *.tgo by " + here[-1], file=fout) +print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) + +fin = open('blst.tgo', "r") +concatFile(fout, fin, False) +fin.close() + +# min-pk +print("//", file=fout) +print("// MIN-PK", file=fout) +print("//", file=fout) + +fin = open('blst_minpk.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +# These are strings that overlap with the mapping names but we don't +# actually want to change. The second value should be a unique string. 
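+# For example, 'Fp12' contains both 'p1' and 'Fp', so the swaps below would
+# otherwise corrupt it; the placeholder shields it for the duration of a pass.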
+dont_touch = (('Fp12', 'foo1234'),) + +# We're going to swap these names to get from min-pk to min-sig +mapping = [('P1', 'P2'), + ('p1', 'p2'), + ('Fp', 'Fp2'), + ('G1', 'G2'), + ('g1', 'g2') + ] + +# min-sig +print("//", file=fout) +print("// MIN-SIG", file=fout) +print("//", file=fout) + +with open('blst_minpk.tgo', "r") as fin: + remap(fout, fin, mapping, dont_touch, True) + +# serdes and other functions +fin = open('blst_px.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +with open('blst_px.tgo', "r") as fin: + remap(fout, fin, mapping, dont_touch, True) + +# final code +fin = open('blst_misc.tgo', "r") +concatFile(fout, fin, True) +fin.close() + +fout.close() + +# Use goimports to generate the import list +os.system(goimports + " -w blst.go") + +# Generate min-sig tests +fout = open('blst_minsig_test.go', "w") +print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) +print("// DO NOT EDIT THIS FILE!!", file=fout) +print("// The file is generated from blst_minpk_test.go by " + here[-1], file=fout) +print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) + +mapping.append(('MinPk', 'MinSig')) + +with open('blst_minpk_test.go', "r") as fin: + remap(fout, fin, mapping, dont_touch, False) +fout.close() diff --git a/src/blst/bindings/go/rb_tree.go b/src/blst/bindings/go/rb_tree.go new file mode 100644 index 0000000000..4904471558 --- /dev/null +++ b/src/blst/bindings/go/rb_tree.go @@ -0,0 +1,149 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Reimplement rb_tree.c, because C.call overhead is too high in + * comparison to tree insertion subroutine. + */ + +package blst + +import "bytes" + +/* + * Red-black tree tailored for uniqueness test. Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. + */ + +const red, black bool = true, false + +type node struct { + leafs [2]*node + data *[]byte + colour bool +} + +type rbTree struct { + root *node + nnodes uint + nodes []node +} + +func (tree *rbTree) insert(data *[]byte) bool { + var nodes [64]*node /* visited nodes */ + var dirs [64]byte /* taken directions */ + var k uint /* walked distance */ + + for p := tree.root; p != nil; k++ { + cmp := bytes.Compare(*data, *p.data) + + if cmp == 0 { + return false /* already in tree, no insertion */ + } + + /* record the step */ + nodes[k] = p + if cmp > 0 { + dirs[k] = 1 + } else { + dirs[k] = 0 + } + p = p.leafs[dirs[k]] + } + + /* allocate new node */ + z := &tree.nodes[tree.nnodes]; tree.nnodes++ + z.data = data + z.colour = red + + /* graft |z| */ + if k > 0 { + nodes[k-1].leafs[dirs[k-1]] = z + } else { + tree.root = z + } + + /* re-balance |tree| */ + for k >= 2 /* && IS_RED(y = nodes[k-1]) */ { + y := nodes[k-1] + if y.colour == black { //nolint:gosimple + break + } + + ydir := dirs[k-2] + x := nodes[k-2] /* |z|'s grandparent */ + s := x.leafs[ydir^1] /* |z|'s uncle */ + + if s != nil && s.colour == red { //nolint:gosimple,revive + x.colour = red + y.colour = black + s.colour = black + k -= 2 + } else { + if dirs[k-1] != ydir { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + t := y + y = y.leafs[ydir^1] + t.leafs[ydir^1] = y.leafs[ydir] + y.leafs[ydir] = t + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? 
s + */ + x.leafs[ydir] = y.leafs[ydir^1] + y.leafs[ydir^1] = x + + x.colour = red + y.colour = black + + if k > 2 { + nodes[k-3].leafs[dirs[k-3]] = y + } else { + tree.root = y + } + + break + } + } + + tree.root.colour = black + + return true +} + +func Uniq(msgs []Message) bool { + n := len(msgs) + + if n == 1 { + return true + } else if n == 2 { + return !bytes.Equal(msgs[0], msgs[1]) + } + + var tree rbTree + tree.nodes = make([]node, n) + + for i := 0; i < n; i++ { + if !tree.insert(&msgs[i]) { + return false + } + } + + return true +} diff --git a/src/blst/bindings/rust/Cargo.toml b/src/blst/bindings/rust/Cargo.toml new file mode 100644 index 0000000000..f52a69979a --- /dev/null +++ b/src/blst/bindings/rust/Cargo.toml @@ -0,0 +1,76 @@ +[package] +name = "blst" +version = "0.3.13" +authors = ["sean-sn "] +edition = "2018" +license = "Apache-2.0" +description = "Bindings for blst BLS12-381 library" +repository = "https://github.com/supranational/blst" +readme = "README.md" +categories = ["cryptography"] +keywords = ["crypto", "bls", "signature", "asm", "wasm"] +include = [ + "**/*.rs", + "/Cargo.toml", + "/README.md", + "/rustfmt.toml", + "/blst/src/*.c", + "/blst/src/*.h*", + "/blst/build/**", + "/blst/bindings/blst.h", + "/blst/bindings/blst_aux.h", + "/blst/bindings/blst.hpp", +] +links = "blst" + +[features] +# By default, compile with ADX extension if the host supports it. +# Binary can be executed on systems similar to the host. +default = [] +# Compile in portable mode, without ISA extensions. +# Binary can be executed on all systems. +portable = [] +# Enable ADX even if the host CPU doesn't support it. +# Binary can be executed on Broadwell+ and Ryzen+ systems. +force-adx = [] +# Suppress multi-threading. +# Engaged on wasm32 target architecture automatically. +no-threads = [] +# Add support for serializing SecretKey, not suitable for production. +serde-secret = ["serde"] + +[build-dependencies] +cc = "1.0" +[target.'cfg(target_env = "msvc")'.build-dependencies] +glob = "0.3" + +[dependencies] +zeroize = { version = "^1.1", features = ["zeroize_derive"] } +serde = { version = "1.0.152", optional = true } + +[target.'cfg(not(any(target_arch="wasm32", target_os="none", target_os="unknown", target_os="uefi")))'.dependencies] +threadpool = "^1.8.1" + +[dev-dependencies] +rand = "0.8" +rand_chacha = "0.3" +rmp-serde = "1.1.1" +# Uncomment if you want to execute the test suite with Rust 1.56 through 1.64. +#csv = "=1.1.6" +#byteorder = "=1.4.3" +#regex = "=1.7.3" +#rayon = "=1.6.1" +#rayon-core = "=1.10.1" + +[target.'cfg(any(unix, windows))'.dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "blst_benches" +harness = false + +[profile.release] +#opt-level = 3 + +[badges] +maintenance = { status = "actively-developed" } diff --git a/src/blst/bindings/rust/README.md b/src/blst/bindings/rust/README.md new file mode 100644 index 0000000000..b640270517 --- /dev/null +++ b/src/blst/bindings/rust/README.md @@ -0,0 +1,70 @@ +# blst [![Crates.io](https://img.shields.io/crates/v/blst.svg)](https://crates.io/crates/blst) + +The `blst` crate provides a rust interface to the blst BLS12-381 signature library. + +## Build +[bindgen](https://github.com/rust-lang/rust-bindgen) is used to generate FFI bindings to blst.h. Then [build.rs](https://github.com/supranational/blst/blob/master/bindings/rust/build.rs) invokes C compiler to compile everything into libblst.a within the rust target build area. On Linux it's possible to choose compiler by setting `CC` environment variable. 
+ +Everything can be built and run with the typical cargo commands: + +``` +cargo test +cargo bench +``` + +If the target application crashes with an "illegal instruction" exception [after copying to an older system], activate `portable` feature when building blst. Conversely, if you compile on an older Intel system, but will execute the binary on a newer one, consider instead activating `force-adx` feature. Though keep in mind that [cc](https://crates.io/crates/cc) passes the value of `CFLAGS` environment variable to the C compiler, and if set to contain specific flags, it can interfere with feature selection. `-D__BLST_PORTABLE__` and `-D__ADX__` are the said features' equivalents. + +To compile for WebAssembly, your clang has to recognize `--target=wasm32`. Alternatively you can build your project with `CC` environment variable set to `emcc`, the [Emscripten compiler](https://emscripten.org), and `AR` set to `emar`, naturally, with both commands available on your `PATH`. + +While `cargo test`'s dependencies happen to require at least Rust 1.65, the library by itself can be compiled with earlier compiler versions. Though in order to use Rust version prior 1.56 you would need to pin`zeroize` to "=1.3.0" and `zeroize_derive` to "=1.3.3" in **your** project Cargo.toml. Even `cc` might require pinning to "=1.0.79". And if you find yourself with Rust 1.56 through 1.64 as the only option and want to execute `cargo test` you'd need to pin some of `[dev-dependencies]` versions in **this** project's Cargo.toml by uncommenting following lines: + +``` +csv = "=1.1.6" +byteorder = "=1.4.3" +regex = "=1.7.3" +rayon = "=1.6.1" +rayon-core = "=1.10.1" +``` + +## Usage +There are two primary modes of operation that can be chosen based on declaration path: + +For minimal-pubkey-size operations: +```rust +use blst::min_pk::*; +``` + +For minimal-signature-size operations: +```rust +use blst::min_sig::*; +``` + +There are five structs with inherent implementations that provide the BLS12-381 signature functionality. +``` +SecretKey +PublicKey +AggregatePublicKey +Signature +AggregateSignature +``` + +A simple example for generating a key, signing a message, and verifying the message: +```rust +use blst::min_pk::SecretKey; + +let mut rng = rand::thread_rng(); +let mut ikm = [0u8; 32]; +rng.fill_bytes(&mut ikm); + +let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); +let pk = sk.sk_to_pk(); + +let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; +let msg = b"blst is such a blast"; +let sig = sk.sign(msg, dst, &[]); + +let err = sig.verify(true, msg, dst, &[], &pk, true); +assert_eq!(err, blst::BLST_ERROR::BLST_SUCCESS); +``` + +See the tests in src/lib.rs and benchmarks in benches/blst_benches.rs for further examples of usage. diff --git a/src/blst/bindings/rust/benches/blst_benches.rs b/src/blst/bindings/rust/benches/blst_benches.rs new file mode 100644 index 0000000000..c0936b3557 --- /dev/null +++ b/src/blst/bindings/rust/benches/blst_benches.rs @@ -0,0 +1,478 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+use blst::*;
+
+// Benchmark min_pk
+use blst::min_pk::*;
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use rand::{RngCore, SeedableRng};
+use rand_chacha::ChaCha20Rng;
+
+struct BenchData {
+    sk: SecretKey,
+    pk: PublicKey,
+    msg: Vec<u8>,
+    dst: Vec<u8>,
+    sig: Signature,
+}
+
+fn gen_bench_data(rng: &mut rand_chacha::ChaCha20Rng) -> BenchData {
+    let msg_len = (rng.next_u64() & 0x3F) + 1;
+    let mut msg = vec![0u8; msg_len as usize];
+    rng.fill_bytes(&mut msg);
+
+    gen_bench_data_for_msg(rng, &msg)
+}
+
+fn gen_bench_data_for_msg(
+    rng: &mut rand_chacha::ChaCha20Rng,
+    msg: &Vec<u8>,
+) -> BenchData {
+    let mut ikm = [0u8; 32];
+    rng.fill_bytes(&mut ikm);
+
+    let sk = SecretKey::key_gen(&ikm, &[]).unwrap();
+    let pk = sk.sk_to_pk();
+    let dst = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"
+        .as_bytes()
+        .to_owned();
+
+    let sig = sk.sign(&msg, &dst, &[]);
+
+    let bd = BenchData {
+        sk,
+        pk,
+        dst,
+        msg: msg.clone(),
+        sig,
+    };
+    bd
+}
+
+fn bench_verify_multi_aggregate(c: &mut Criterion) {
+    let mut group = c.benchmark_group("verify_multi_aggregate");
+
+    let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_";
+    let mut ikm = [0u8; 32];
+
+    let seed = [0u8; 32];
+    let mut rng = ChaCha20Rng::from_seed(seed);
+
+    let num_sigs = vec![8, 16, 32, 64, 128];
+    let pks_per_sig = 3;
+
+    for n in num_sigs.iter() {
+        let mut msgs: Vec<Vec<u8>> = vec![vec![]; *n];
+        let mut sigs: Vec<Signature> = Vec::with_capacity(*n);
+        let mut pks: Vec<PublicKey> = Vec::with_capacity(*n);
+        let mut rands: Vec<blst_scalar> = Vec::with_capacity(*n);
+
+        for i in 0..*n {
+            // Create public keys
+            rng.fill_bytes(&mut ikm);
+            let sks_i: Vec<_> = (0..pks_per_sig)
+                .map(|_| {
+                    ikm[0] += 1;
+                    SecretKey::key_gen(&ikm, &[]).unwrap()
+                })
+                .collect();
+            let pks_i =
+                sks_i.iter().map(|sk| sk.sk_to_pk()).collect::<Vec<_>>();
+            let pks_refs_i: Vec<&PublicKey> =
+                pks_i.iter().map(|pk| pk).collect();
+
+            // Create random message for pks to all sign
+            let msg_len = (rng.next_u64() & 0x3F) + 1;
+            msgs[i] = vec![0u8; msg_len as usize];
+            rng.fill_bytes(&mut msgs[i]);
+
+            // Generate signature for each key pair
+            let sigs_i = sks_i
+                .iter()
+                .map(|sk| sk.sign(&msgs[i], dst, &[]))
+                .collect::<Vec<_>>();
+
+            // Aggregate signature
+            let sig_refs_i =
+                sigs_i.iter().map(|s| s).collect::<Vec<&Signature>>();
+            let agg_i = match AggregateSignature::aggregate(&sig_refs_i, false)
+            {
+                Ok(agg_i) => agg_i,
+                Err(err) => panic!("aggregate failure: {:?}", err),
+            };
+            sigs.push(agg_i.to_signature());
+
+            // aggregate public keys and push into vec
+            let agg_pk_i =
+                match AggregatePublicKey::aggregate(&pks_refs_i, false) {
+                    Ok(agg_pk_i) => agg_pk_i,
+                    Err(err) => panic!("aggregate failure: {:?}", err),
+                };
+            pks.push(agg_pk_i.to_public_key());
+
+            // create random values
+            let mut vals = [0u64; 4];
+            vals[0] = rng.next_u64();
+            let mut rand_i = std::mem::MaybeUninit::<blst_scalar>::uninit();
+            unsafe {
+                blst_scalar_from_uint64(rand_i.as_mut_ptr(), vals.as_ptr());
+                rands.push(rand_i.assume_init());
+            }
+        }
+
+        let msgs_refs: Vec<&[u8]> = msgs.iter().map(|m| m.as_slice()).collect();
+        let sig_refs = sigs.iter().map(|s| s).collect::<Vec<&Signature>>();
+        let pks_refs: Vec<&PublicKey> = pks.iter().map(|pk| pk).collect();
+
+        let agg_ver = (sig_refs, pks_refs, msgs_refs, dst, rands);
+
+        group.bench_with_input(
+            BenchmarkId::new("verify_multi_aggregate", n),
+            &agg_ver,
+            |b, (s, p, m, d, r)| {
+                b.iter(|| {
+                    let result =
+                        Signature::verify_multiple_aggregate_signatures(
+                            &m, *d, &p, false, &s, false, &r, 64,
+                        );
+                    assert_eq!(result, BLST_ERROR::BLST_SUCCESS);
}); + }, + ); + } + + group.finish(); +} + +fn bench_fast_aggregate_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("fast_aggregate_verify"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let msg_len = (rng.next_u64() & 0x3F) + 1; + let mut msg = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msg); + + let sizes = vec![8, 16, 32, 64, 128]; + + let bds: Vec<_> = (0..sizes[sizes.len() - 1]) + .map(|_| gen_bench_data_for_msg(&mut rng, &msg)) + .collect(); + + for size in sizes.iter() { + let pks_refs = bds + .iter() + .take(*size) + .map(|s| &s.pk) + .collect::>(); + + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + let agg = match AggregateSignature::aggregate(&sig_refs, false) { + Ok(agg) => agg, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + let agg_sig = agg.to_signature(); + + let agg_pks = match AggregatePublicKey::aggregate(&pks_refs, false) { + Ok(agg_pks) => agg_pks, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + let agg_pk = agg_pks.to_public_key(); + + let agg_ver = (agg_sig, pks_refs, &bds[0].msg, &bds[0].dst); + let agg_pre_ver = (agg_sig, agg_pk, &bds[0].msg, &bds[0].dst); + + group.bench_with_input( + BenchmarkId::new("fast_aggregate_verify", size), + &agg_ver, + |b, (a, p, m, d)| { + b.iter(|| { + let result = a.fast_aggregate_verify(true, &m, &d, &p); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("fast_aggregate_verify_preagg", size), + &agg_pre_ver, + |b, (a, p, m, d)| { + b.iter(|| { + let result = a + .fast_aggregate_verify_pre_aggregated(true, &m, &d, &p); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + }); + }, + ); + } + + group.finish(); +} + +fn bench_aggregate_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("aggregate_verify"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sizes = vec![8, 16, 32, 64, 128]; + // [10, 50, 100, 300, 1000, 4000]; + + let bds: Vec<_> = (0..sizes[sizes.len() - 1]) + .map(|_| gen_bench_data(&mut rng)) + .collect(); + + for size in sizes.iter() { + let msgs_refs = bds + .iter() + .take(*size) + .map(|s| s.msg.as_slice()) + .collect::>(); + + let pks_refs = bds + .iter() + .take(*size) + .map(|s| &s.pk) + .collect::>(); + + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + let agg = match AggregateSignature::aggregate(&sig_refs, false) { + Ok(agg) => agg, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + let agg_sig = agg.to_signature(); + let agg_ver = (agg_sig, pks_refs, msgs_refs, &bds[0].dst); + + group.bench_with_input( + BenchmarkId::new("aggregate_verify", size), + &agg_ver, + |b, (a, p, m, d)| { + b.iter(|| { + let result = a.aggregate_verify(true, &m, &d, &p, false); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + }); + }, + ); + } + + group.finish(); +} + +fn bench_aggregate(c: &mut Criterion) { + let mut group = c.benchmark_group("aggregate"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sizes: [usize; 6] = [10, 50, 100, 300, 1000, 4000]; + + let bds: Vec<_> = (0..4000).map(|_| gen_bench_data(&mut rng)).collect(); + + for size in sizes.iter() { + let sig_refs = bds + .iter() + .take(*size) + .map(|s| &s.sig) + .collect::>(); + + group.bench_with_input( + BenchmarkId::new("aggregate_signature", size), + &sig_refs, + |b, s| { + b.iter(|| AggregateSignature::aggregate(&s, false)); + }, + ); + + let pks_refs = bds + .iter() + 
.take(*size) + .map(|s| &s.pk) + .collect::>(); + + group.bench_with_input( + BenchmarkId::new("aggregate_public_key", size), + &pks_refs, + |b, p| { + b.iter(|| AggregatePublicKey::aggregate(&p, false)); + }, + ); + } + + group.finish(); +} + +fn bench_single_message(c: &mut Criterion) { + let mut group = c.benchmark_group("single_message"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + let bd = gen_bench_data(&mut rng); + + group.bench_function("sign", |b| { + b.iter(|| bd.sk.sign(&bd.msg, &bd.dst, &[])) + }); + + group.bench_function("verify", |b| { + b.iter(|| bd.sig.verify(true, &bd.msg, &bd.dst, &[], &bd.pk, false)) + }); + + group.finish(); +} + +fn bench_serdes(c: &mut Criterion) { + let mut group = c.benchmark_group("serdes"); + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + let bd = gen_bench_data(&mut rng); + + let sk = bd.sk; + let sk_ser = sk.serialize(); + + let pk = bd.pk; + let pk_comp = pk.compress(); + let pk_ser = pk.serialize(); + + let sig = bd.sig; + let sig_comp = sig.compress(); + let sig_ser = sig.serialize(); + + let mut pk_jac = std::mem::MaybeUninit::::uninit(); + let mut sig_jac = std::mem::MaybeUninit::::uninit(); + + let mut p1_comp = [0; 48]; + let mut p2_comp = [0; 96]; + let mut p1_ser = [0; 96]; + let mut p2_ser = [0; 192]; + + unsafe { + let mut junk = [0u8; 32]; + rng.fill_bytes(&mut junk); + blst_encode_to_g1( + pk_jac.as_mut_ptr(), + junk.as_ptr(), + junk.len(), + "junk".as_ptr(), + 4, + std::ptr::null(), + 0, + ); + blst_encode_to_g2( + sig_jac.as_mut_ptr(), + junk.as_ptr(), + junk.len(), + "junk".as_ptr(), + 4, + std::ptr::null(), + 0, + ); + } + + group.bench_function("secret_key_serialize", |b| b.iter(|| sk.serialize())); + + group.bench_function("secret_key_deserialize", |b| { + b.iter(|| SecretKey::deserialize(&sk_ser)); + }); + + group.bench_function("public_key_serialize", |b| b.iter(|| pk.serialize())); + + group.bench_function("public_key_compress", |b| b.iter(|| pk.compress())); + + group.bench_function("public_key_uncompress", |b| { + b.iter(|| PublicKey::uncompress(&pk_comp)) + }); + + group.bench_function("public_key_deserialize", |b| { + b.iter(|| PublicKey::deserialize(&pk_ser)); + }); + + group.bench_function("signature_serialize", |b| b.iter(|| sig.serialize())); + + group.bench_function("signature_compress", |b| b.iter(|| sig.compress())); + + group.bench_function("signature_uncompress", |b| { + b.iter(|| Signature::uncompress(&sig_comp)) + }); + + group.bench_function("signature_deserialize", |b| { + b.iter(|| Signature::deserialize(&sig_ser)) + }); + + group.bench_function("p1_serialize", |b| { + b.iter(|| unsafe { + blst_p1_serialize(p1_ser.as_mut_ptr(), pk_jac.as_ptr()) + }) + }); + + group.bench_function("p1_compress", |b| { + b.iter(|| unsafe { + blst_p1_compress(p1_comp.as_mut_ptr(), pk_jac.as_ptr()) + }) + }); + + group.bench_function("p2_serialize", |b| { + b.iter(|| unsafe { + blst_p2_serialize(p2_ser.as_mut_ptr(), sig_jac.as_ptr()) + }) + }); + + group.bench_function("p2_compress", |b| { + b.iter(|| unsafe { + blst_p2_compress(p2_comp.as_mut_ptr(), sig_jac.as_ptr()) + }) + }); + + group.finish(); +} + +fn bench_keys(c: &mut Criterion) { + let mut group = c.benchmark_group("keys"); + let ikm: [u8; 32] = [ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, + 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, + 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99, + ]; + let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); + let pk = 
sk.sk_to_pk(); + let pk_comp = pk.compress(); + + group.bench_function("key_gen", |b| { + b.iter(|| SecretKey::key_gen(&ikm, &[])) + }); + + group.bench_function("sk_to_pk", |b| { + b.iter(|| sk.sk_to_pk()); + }); + + group.bench_function("key_validate", |b| { + b.iter(|| PublicKey::key_validate(&pk_comp)); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_verify_multi_aggregate, + bench_fast_aggregate_verify, + bench_aggregate_verify, + bench_aggregate, + bench_single_message, + bench_serdes, + bench_keys +); +criterion_main!(benches); diff --git a/src/blst/bindings/rust/build.rs b/src/blst/bindings/rust/build.rs new file mode 100644 index 0000000000..d823057fe5 --- /dev/null +++ b/src/blst/bindings/rust/build.rs @@ -0,0 +1,247 @@ +#![allow(unused_imports)] + +extern crate cc; + +use std::env; +use std::path::{Path, PathBuf}; + +fn assembly( + file_vec: &mut Vec, + base_dir: &Path, + _arch: &str, + _is_msvc: bool, +) { + #[cfg(target_env = "msvc")] + if _is_msvc { + let sfx = match _arch { + "x86_64" => "x86_64", + "aarch64" => "armv8", + _ => "unknown", + }; + let files = + glob::glob(&format!("{}/win64/*-{}.asm", base_dir.display(), sfx)) + .expect("unable to collect assembly files"); + for file in files { + file_vec.push(file.unwrap()); + } + return; + } + + file_vec.push(base_dir.join("assembly.S")); +} + +fn main() { + if env::var("CARGO_FEATURE_SERDE_SECRET").is_ok() { + println!( + "cargo:warning=blst: non-production feature serde-secret enabled" + ); + } + + // account for cross-compilation [by examining environment variables] + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let target_family = env::var("CARGO_CFG_TARGET_FAMILY").unwrap_or_default(); + + let target_no_std = target_os.eq("none") + || (target_os.eq("unknown") && target_arch.eq("wasm32")) + || target_os.eq("uefi") + || env::var("BLST_TEST_NO_STD").is_ok(); + + if !target_no_std { + println!("cargo:rustc-cfg=feature=\"std\""); + if target_arch.eq("wasm32") || target_os.eq("unknown") { + println!("cargo:rustc-cfg=feature=\"no-threads\""); + } + } + println!("cargo:rerun-if-env-changed=BLST_TEST_NO_STD"); + + /* + * Use pre-built libblst.a if there is one. This is primarily + * for trouble-shooting purposes. Idea is that libblst.a can be + * compiled with flags independent from cargo defaults, e.g. + * '../../build.sh -O1 ...'. + */ + if Path::new("libblst.a").exists() { + println!("cargo:rustc-link-search=."); + println!("cargo:rustc-link-lib=blst"); + println!("cargo:rerun-if-changed=libblst.a"); + return; + } + + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + let mut blst_base_dir = manifest_dir.join("blst"); + if !blst_base_dir.exists() { + // Reach out to ../.., which is the root of the blst repo. + // Use an absolute path to avoid issues with relative paths + // being treated as strings by `cc` and getting concatenated + // in ways that reach out of the OUT_DIR. + blst_base_dir = manifest_dir + .parent() + .and_then(|dir| dir.parent()) + .expect("can't access parent of parent of current directory") + .into(); + } + println!("Using blst source directory {}", blst_base_dir.display()); + + // Set CC environment variable to choose alternative C compiler. + // Optimization level depends on whether or not --release is passed + // or implied. 
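+    // For example, an invocation along the lines of
+    //     CC=clang cargo build --release
+    // selects clang for the C sources and the optimized profile.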
+ + if target_os.eq("uefi") && env::var("CC").is_err() { + match std::process::Command::new("clang") + .arg("--version") + .output() + { + Ok(_) => env::set_var("CC", "clang"), + Err(_) => { /* no clang in sight, just ignore the error */ } + } + } + + if target_env.eq("sgx") && env::var("CC").is_err() { + match std::process::Command::new("clang") + .arg("--version") + .output() + { + Ok(out) => { + let version = String::from_utf8(out.stdout) + .unwrap_or("unintelligible".to_string()); + if let Some(x) = version.find("clang version ") { + let x = x + 14; + let y = version[x..].find('.').unwrap_or(0); + if version[x..x + y].parse::().unwrap_or(0) >= 11 { + env::set_var("CC", "clang"); + } + } + } + Err(_) => { /* no clang in sight, just ignore the error */ } + } + } + + if target_env.eq("msvc") + && env::var("CARGO_CFG_TARGET_POINTER_WIDTH").unwrap().eq("32") + && env::var("CC").is_err() + { + match std::process::Command::new("clang-cl") + .args(["-m32", "--version"]) + .output() + { + Ok(out) => { + if String::from_utf8(out.stdout) + .unwrap_or("unintelligible".to_string()) + .contains("Target: i386-pc-windows-msvc") + { + env::set_var("CC", "clang-cl"); + } + } + Err(_) => { /* no clang-cl in sight, just ignore the error */ } + } + } + + let mut cc = cc::Build::new(); + + let c_src_dir = blst_base_dir.join("src"); + println!("cargo:rerun-if-changed={}", c_src_dir.display()); + let mut file_vec = vec![c_src_dir.join("server.c")]; + + if target_arch.eq("x86_64") || target_arch.eq("aarch64") { + let asm_dir = blst_base_dir.join("build"); + println!("cargo:rerun-if-changed={}", asm_dir.display()); + assembly( + &mut file_vec, + &asm_dir, + &target_arch, + cc.get_compiler().is_like_msvc(), + ); + } else { + cc.define("__BLST_NO_ASM__", None); + } + match (cfg!(feature = "portable"), cfg!(feature = "force-adx")) { + (true, false) => { + if target_arch.eq("x86_64") && target_env.eq("sgx") { + panic!("'portable' is not supported on SGX target"); + } + println!("Compiling in portable mode without ISA extensions"); + cc.define("__BLST_PORTABLE__", None); + } + (false, true) => { + if target_arch.eq("x86_64") { + println!("Enabling ADX support via `force-adx` feature"); + cc.define("__ADX__", None); + } else { + println!("`force-adx` is ignored for non-x86_64 targets"); + } + } + (false, false) => { + if target_arch.eq("x86_64") { + if target_env.eq("sgx") { + println!("Enabling ADX for Intel SGX target"); + cc.define("__ADX__", None); + } else if env::var("CARGO_ENCODED_RUSTFLAGS") + .unwrap_or_default() + .contains("target-cpu=") + { + // If target-cpu is specified on the rustc command line, + // then obey the resulting target-features. 
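+                    // For instance, building with something like
+                    //     RUSTFLAGS="-C target-cpu=native" cargo build --release
+                    // takes this branch, and the reported target features then
+                    // choose between the portable and ADX paths below.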
+ let feat_list = env::var("CARGO_CFG_TARGET_FEATURE") + .unwrap_or_default(); + let features: Vec<_> = feat_list.split(',').collect(); + if !features.contains(&"ssse3") { + println!( + "Compiling in portable mode without ISA extensions" + ); + cc.define("__BLST_PORTABLE__", None); + } else if features.contains(&"adx") { + println!( + "Enabling ADX because it was set as target-feature" + ); + cc.define("__ADX__", None); + } + } else { + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("adx") { + println!( + "Enabling ADX because it was detected on the host" + ); + cc.define("__ADX__", None); + } + } + } + } + (true, true) => panic!( + "Cannot compile with both `portable` and `force-adx` features" + ), + } + if target_env.eq("msvc") && cc.get_compiler().is_like_msvc() { + cc.flag("-Zl"); + } + cc.flag_if_supported("-mno-avx") // avoid costly transitions + .flag_if_supported("-fno-builtin") + .flag_if_supported("-Wno-unused-function") + .flag_if_supported("-Wno-unused-command-line-argument"); + if target_arch.eq("wasm32") || target_family.is_empty() { + cc.flag("-ffreestanding"); + } + if target_arch.eq("wasm32") || target_no_std { + cc.define("SCRATCH_LIMIT", "(45 * 1024)"); + } + if target_env.eq("sgx") { + cc.flag_if_supported("-mlvi-hardening"); + cc.define("__SGX_LVI_HARDENING__", None); + cc.define("__BLST_NO_CPUID__", None); + cc.define("__ELF__", None); + cc.define("SCRATCH_LIMIT", "(45 * 1024)"); + } + if !cfg!(debug_assertions) { + cc.opt_level(2); + } + cc.files(&file_vec).compile("blst"); + + // pass some DEP_BLST_* variables to dependents + println!( + "cargo:BINDINGS={}", + blst_base_dir.join("bindings").to_string_lossy() + ); + println!("cargo:C_SRC={}", c_src_dir.to_string_lossy()); +} diff --git a/src/blst/bindings/rust/publish.sh b/src/blst/bindings/rust/publish.sh new file mode 100755 index 0000000000..a307efa802 --- /dev/null +++ b/src/blst/bindings/rust/publish.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +if [ ! -d blst ]; then + trap '[ -h blst ] && rm -f blst' 0 2 + ln -s ../.. 
blst +fi + +# --allow-dirty because the temporary blst symbolic link is not committed +cargo +stable publish --allow-dirty "$@" diff --git a/src/blst/bindings/rust/rustfmt.toml b/src/blst/bindings/rust/rustfmt.toml new file mode 100644 index 0000000000..df99c69198 --- /dev/null +++ b/src/blst/bindings/rust/rustfmt.toml @@ -0,0 +1 @@ +max_width = 80 diff --git a/src/blst/bindings/rust/src/bindings.rs b/src/blst/bindings/rust/src/bindings.rs new file mode 100644 index 0000000000..f72753c38e --- /dev/null +++ b/src/blst/bindings/rust/src/bindings.rs @@ -0,0 +1,1409 @@ +/* automatically generated by rust-bindgen 0.65.1 */ + +#[repr(u32)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub enum BLST_ERROR { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING = 1, + BLST_POINT_NOT_ON_CURVE = 2, + BLST_POINT_NOT_IN_GROUP = 3, + BLST_AGGR_TYPE_MISMATCH = 4, + BLST_VERIFY_FAIL = 5, + BLST_PK_IS_INFINITY = 6, + BLST_BAD_SCALAR = 7, +} +pub type byte = u8; +pub type limb_t = u64; +#[repr(C)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Zeroize)] +#[zeroize(drop)] +pub struct blst_scalar { + pub b: [byte; 32usize], +} +#[test] +fn bindgen_test_layout_blst_scalar() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(blst_scalar)) + ); + assert_eq!( + ::core::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(blst_scalar)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).b) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_scalar), + "::", + stringify!(b) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct blst_fr { + pub l: [limb_t; 4usize], +} +#[test] +fn bindgen_test_layout_blst_fr() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(blst_fr)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_fr)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).l) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_fr), + "::", + stringify!(l) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct blst_fp { + pub l: [limb_t; 6usize], +} +#[test] +fn bindgen_test_layout_blst_fp() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 48usize, + concat!("Size of: ", stringify!(blst_fp)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_fp)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).l) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_fp), + "::", + stringify!(l) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct blst_fp2 { + pub fp: [blst_fp; 2usize], +} +#[test] +fn bindgen_test_layout_blst_fp2() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 96usize, + concat!("Size of: ", stringify!(blst_fp2)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_fp2)) + ); + assert_eq!( + unsafe { 
::core::ptr::addr_of!((*ptr).fp) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_fp2), + "::", + stringify!(fp) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct blst_fp6 { + pub fp2: [blst_fp2; 3usize], +} +#[test] +fn bindgen_test_layout_blst_fp6() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 288usize, + concat!("Size of: ", stringify!(blst_fp6)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_fp6)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).fp2) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_fp6), + "::", + stringify!(fp2) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Eq)] +pub struct blst_fp12 { + pub fp6: [blst_fp6; 2usize], +} +#[test] +fn bindgen_test_layout_blst_fp12() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 576usize, + concat!("Size of: ", stringify!(blst_fp12)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_fp12)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).fp6) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_fp12), + "::", + stringify!(fp6) + ) + ); +} +extern "C" { + pub fn blst_scalar_from_uint32(out: *mut blst_scalar, a: *const u32); +} +extern "C" { + pub fn blst_uint32_from_scalar(out: *mut u32, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_from_uint64(out: *mut blst_scalar, a: *const u64); +} +extern "C" { + pub fn blst_uint64_from_scalar(out: *mut u64, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_from_bendian(out: *mut blst_scalar, a: *const byte); +} +extern "C" { + pub fn blst_bendian_from_scalar(out: *mut byte, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_from_lendian(out: *mut blst_scalar, a: *const byte); +} +extern "C" { + pub fn blst_lendian_from_scalar(out: *mut byte, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_fr_check(a: *const blst_scalar) -> bool; +} +extern "C" { + pub fn blst_sk_check(a: *const blst_scalar) -> bool; +} +extern "C" { + pub fn blst_sk_add_n_check( + out: *mut blst_scalar, + a: *const blst_scalar, + b: *const blst_scalar, + ) -> bool; +} +extern "C" { + pub fn blst_sk_sub_n_check( + out: *mut blst_scalar, + a: *const blst_scalar, + b: *const blst_scalar, + ) -> bool; +} +extern "C" { + pub fn blst_sk_mul_n_check( + out: *mut blst_scalar, + a: *const blst_scalar, + b: *const blst_scalar, + ) -> bool; +} +extern "C" { + pub fn blst_sk_inverse(out: *mut blst_scalar, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_from_le_bytes(out: *mut blst_scalar, in_: *const byte, len: usize) -> bool; +} +extern "C" { + pub fn blst_scalar_from_be_bytes(out: *mut blst_scalar, in_: *const byte, len: usize) -> bool; +} +extern "C" { + pub fn blst_fr_add(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); +} +extern "C" { + pub fn blst_fr_sub(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); +} +extern "C" { + pub fn blst_fr_mul_by_3(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_lshift(ret: *mut blst_fr, a: *const blst_fr, count: usize); +} +extern "C" { + pub fn blst_fr_rshift(ret: *mut blst_fr, a: *const blst_fr, 
count: usize); +} +extern "C" { + pub fn blst_fr_mul(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); +} +extern "C" { + pub fn blst_fr_sqr(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_cneg(ret: *mut blst_fr, a: *const blst_fr, flag: bool); +} +extern "C" { + pub fn blst_fr_eucl_inverse(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_inverse(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_from_uint64(ret: *mut blst_fr, a: *const u64); +} +extern "C" { + pub fn blst_uint64_from_fr(ret: *mut u64, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_from_scalar(ret: *mut blst_fr, a: *const blst_scalar); +} +extern "C" { + pub fn blst_scalar_from_fr(ret: *mut blst_scalar, a: *const blst_fr); +} +extern "C" { + pub fn blst_fp_add(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); +} +extern "C" { + pub fn blst_fp_sub(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); +} +extern "C" { + pub fn blst_fp_mul_by_3(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_mul_by_8(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_lshift(ret: *mut blst_fp, a: *const blst_fp, count: usize); +} +extern "C" { + pub fn blst_fp_mul(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); +} +extern "C" { + pub fn blst_fp_sqr(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_cneg(ret: *mut blst_fp, a: *const blst_fp, flag: bool); +} +extern "C" { + pub fn blst_fp_eucl_inverse(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_inverse(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_sqrt(ret: *mut blst_fp, a: *const blst_fp) -> bool; +} +extern "C" { + pub fn blst_fp_from_uint32(ret: *mut blst_fp, a: *const u32); +} +extern "C" { + pub fn blst_uint32_from_fp(ret: *mut u32, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_from_uint64(ret: *mut blst_fp, a: *const u64); +} +extern "C" { + pub fn blst_uint64_from_fp(ret: *mut u64, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_from_bendian(ret: *mut blst_fp, a: *const byte); +} +extern "C" { + pub fn blst_bendian_from_fp(ret: *mut byte, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_from_lendian(ret: *mut blst_fp, a: *const byte); +} +extern "C" { + pub fn blst_lendian_from_fp(ret: *mut byte, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp2_add(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_sub(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_mul_by_3(ret: *mut blst_fp2, a: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_mul_by_8(ret: *mut blst_fp2, a: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_lshift(ret: *mut blst_fp2, a: *const blst_fp2, count: usize); +} +extern "C" { + pub fn blst_fp2_mul(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_sqr(ret: *mut blst_fp2, a: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_cneg(ret: *mut blst_fp2, a: *const blst_fp2, flag: bool); +} +extern "C" { + pub fn blst_fp2_eucl_inverse(ret: *mut blst_fp2, a: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_inverse(ret: *mut blst_fp2, a: *const blst_fp2); +} +extern "C" { + pub fn blst_fp2_sqrt(ret: *mut blst_fp2, a: *const blst_fp2) -> bool; +} +extern "C" { + pub fn blst_fp12_sqr(ret: *mut blst_fp12, a: *const blst_fp12); +} +extern "C" { + pub fn blst_fp12_cyclotomic_sqr(ret: *mut 
blst_fp12, a: *const blst_fp12); +} +extern "C" { + pub fn blst_fp12_mul(ret: *mut blst_fp12, a: *const blst_fp12, b: *const blst_fp12); +} +extern "C" { + pub fn blst_fp12_mul_by_xy00z0( + ret: *mut blst_fp12, + a: *const blst_fp12, + xy00z0: *const blst_fp6, + ); +} +extern "C" { + pub fn blst_fp12_conjugate(a: *mut blst_fp12); +} +extern "C" { + pub fn blst_fp12_inverse(ret: *mut blst_fp12, a: *const blst_fp12); +} +extern "C" { + pub fn blst_fp12_frobenius_map(ret: *mut blst_fp12, a: *const blst_fp12, n: usize); +} +extern "C" { + pub fn blst_fp12_is_equal(a: *const blst_fp12, b: *const blst_fp12) -> bool; +} +extern "C" { + pub fn blst_fp12_is_one(a: *const blst_fp12) -> bool; +} +extern "C" { + pub fn blst_fp12_in_group(a: *const blst_fp12) -> bool; +} +extern "C" { + pub fn blst_fp12_one() -> *const blst_fp12; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, Eq)] +pub struct blst_p1 { + pub x: blst_fp, + pub y: blst_fp, + pub z: blst_fp, +} +#[test] +fn bindgen_test_layout_blst_p1() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 144usize, + concat!("Size of: ", stringify!(blst_p1)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_p1)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_p1), + "::", + stringify!(x) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, + 48usize, + concat!( + "Offset of field: ", + stringify!(blst_p1), + "::", + stringify!(y) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).z) as usize - ptr as usize }, + 96usize, + concat!( + "Offset of field: ", + stringify!(blst_p1), + "::", + stringify!(z) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, Eq)] +pub struct blst_p1_affine { + pub x: blst_fp, + pub y: blst_fp, +} +#[test] +fn bindgen_test_layout_blst_p1_affine() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 96usize, + concat!("Size of: ", stringify!(blst_p1_affine)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_p1_affine)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_p1_affine), + "::", + stringify!(x) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, + 48usize, + concat!( + "Offset of field: ", + stringify!(blst_p1_affine), + "::", + stringify!(y) + ) + ); +} +extern "C" { + pub fn blst_p1_add(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1); +} +extern "C" { + pub fn blst_p1_add_or_double(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1); +} +extern "C" { + pub fn blst_p1_add_affine(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1_affine); +} +extern "C" { + pub fn blst_p1_add_or_double_affine( + out: *mut blst_p1, + a: *const blst_p1, + b: *const blst_p1_affine, + ); +} +extern "C" { + pub fn blst_p1_double(out: *mut blst_p1, a: *const blst_p1); +} +extern "C" { + pub fn blst_p1_mult(out: *mut blst_p1, p: *const blst_p1, scalar: *const byte, nbits: usize); +} +extern "C" { + pub fn blst_p1_cneg(p: *mut blst_p1, cbit: bool); +} +extern "C" { + pub fn blst_p1_to_affine(out: *mut blst_p1_affine, in_: 
*const blst_p1); +} +extern "C" { + pub fn blst_p1_from_affine(out: *mut blst_p1, in_: *const blst_p1_affine); +} +extern "C" { + pub fn blst_p1_on_curve(p: *const blst_p1) -> bool; +} +extern "C" { + pub fn blst_p1_in_g1(p: *const blst_p1) -> bool; +} +extern "C" { + pub fn blst_p1_is_equal(a: *const blst_p1, b: *const blst_p1) -> bool; +} +extern "C" { + pub fn blst_p1_is_inf(a: *const blst_p1) -> bool; +} +extern "C" { + pub fn blst_p1_generator() -> *const blst_p1; +} +extern "C" { + pub fn blst_p1_affine_on_curve(p: *const blst_p1_affine) -> bool; +} +extern "C" { + pub fn blst_p1_affine_in_g1(p: *const blst_p1_affine) -> bool; +} +extern "C" { + pub fn blst_p1_affine_is_equal(a: *const blst_p1_affine, b: *const blst_p1_affine) -> bool; +} +extern "C" { + pub fn blst_p1_affine_is_inf(a: *const blst_p1_affine) -> bool; +} +extern "C" { + pub fn blst_p1_affine_generator() -> *const blst_p1_affine; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, Eq)] +pub struct blst_p2 { + pub x: blst_fp2, + pub y: blst_fp2, + pub z: blst_fp2, +} +#[test] +fn bindgen_test_layout_blst_p2() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 288usize, + concat!("Size of: ", stringify!(blst_p2)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_p2)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_p2), + "::", + stringify!(x) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, + 96usize, + concat!( + "Offset of field: ", + stringify!(blst_p2), + "::", + stringify!(y) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).z) as usize - ptr as usize }, + 192usize, + concat!( + "Offset of field: ", + stringify!(blst_p2), + "::", + stringify!(z) + ) + ); +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, Eq)] +pub struct blst_p2_affine { + pub x: blst_fp2, + pub y: blst_fp2, +} +#[test] +fn bindgen_test_layout_blst_p2_affine() { + const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::core::mem::size_of::(), + 192usize, + concat!("Size of: ", stringify!(blst_p2_affine)) + ); + assert_eq!( + ::core::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(blst_p2_affine)) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(blst_p2_affine), + "::", + stringify!(x) + ) + ); + assert_eq!( + unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, + 96usize, + concat!( + "Offset of field: ", + stringify!(blst_p2_affine), + "::", + stringify!(y) + ) + ); +} +extern "C" { + pub fn blst_p2_add(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2); +} +extern "C" { + pub fn blst_p2_add_or_double(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2); +} +extern "C" { + pub fn blst_p2_add_affine(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2_affine); +} +extern "C" { + pub fn blst_p2_add_or_double_affine( + out: *mut blst_p2, + a: *const blst_p2, + b: *const blst_p2_affine, + ); +} +extern "C" { + pub fn blst_p2_double(out: *mut blst_p2, a: *const blst_p2); +} +extern "C" { + pub fn blst_p2_mult(out: *mut blst_p2, p: *const blst_p2, scalar: *const byte, nbits: usize); +} +extern "C" { + pub fn blst_p2_cneg(p: *mut 
blst_p2, cbit: bool); +} +extern "C" { + pub fn blst_p2_to_affine(out: *mut blst_p2_affine, in_: *const blst_p2); +} +extern "C" { + pub fn blst_p2_from_affine(out: *mut blst_p2, in_: *const blst_p2_affine); +} +extern "C" { + pub fn blst_p2_on_curve(p: *const blst_p2) -> bool; +} +extern "C" { + pub fn blst_p2_in_g2(p: *const blst_p2) -> bool; +} +extern "C" { + pub fn blst_p2_is_equal(a: *const blst_p2, b: *const blst_p2) -> bool; +} +extern "C" { + pub fn blst_p2_is_inf(a: *const blst_p2) -> bool; +} +extern "C" { + pub fn blst_p2_generator() -> *const blst_p2; +} +extern "C" { + pub fn blst_p2_affine_on_curve(p: *const blst_p2_affine) -> bool; +} +extern "C" { + pub fn blst_p2_affine_in_g2(p: *const blst_p2_affine) -> bool; +} +extern "C" { + pub fn blst_p2_affine_is_equal(a: *const blst_p2_affine, b: *const blst_p2_affine) -> bool; +} +extern "C" { + pub fn blst_p2_affine_is_inf(a: *const blst_p2_affine) -> bool; +} +extern "C" { + pub fn blst_p2_affine_generator() -> *const blst_p2_affine; +} +extern "C" { + pub fn blst_p1s_to_affine( + dst: *mut blst_p1_affine, + points: *const *const blst_p1, + npoints: usize, + ); +} +extern "C" { + pub fn blst_p1s_add(ret: *mut blst_p1, points: *const *const blst_p1_affine, npoints: usize); +} +extern "C" { + pub fn blst_p1s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p1s_mult_wbits_precompute( + table: *mut blst_p1_affine, + wbits: usize, + points: *const *const blst_p1_affine, + npoints: usize, + ); +} +extern "C" { + pub fn blst_p1s_mult_wbits_scratch_sizeof(npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p1s_mult_wbits( + ret: *mut blst_p1, + table: *const blst_p1_affine, + wbits: usize, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + ); +} +extern "C" { + pub fn blst_p1s_mult_pippenger_scratch_sizeof(npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p1s_mult_pippenger( + ret: *mut blst_p1, + points: *const *const blst_p1_affine, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + ); +} +extern "C" { + pub fn blst_p1s_tile_pippenger( + ret: *mut blst_p1, + points: *const *const blst_p1_affine, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + bit0: usize, + window: usize, + ); +} +extern "C" { + pub fn blst_p2s_to_affine( + dst: *mut blst_p2_affine, + points: *const *const blst_p2, + npoints: usize, + ); +} +extern "C" { + pub fn blst_p2s_add(ret: *mut blst_p2, points: *const *const blst_p2_affine, npoints: usize); +} +extern "C" { + pub fn blst_p2s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p2s_mult_wbits_precompute( + table: *mut blst_p2_affine, + wbits: usize, + points: *const *const blst_p2_affine, + npoints: usize, + ); +} +extern "C" { + pub fn blst_p2s_mult_wbits_scratch_sizeof(npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p2s_mult_wbits( + ret: *mut blst_p2, + table: *const blst_p2_affine, + wbits: usize, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + ); +} +extern "C" { + pub fn blst_p2s_mult_pippenger_scratch_sizeof(npoints: usize) -> usize; +} +extern "C" { + pub fn blst_p2s_mult_pippenger( + ret: *mut blst_p2, + points: *const *const blst_p2_affine, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + ); +} +extern "C" { + pub fn blst_p2s_tile_pippenger( + ret: *mut blst_p2, + points: *const 
*const blst_p2_affine, + npoints: usize, + scalars: *const *const byte, + nbits: usize, + scratch: *mut limb_t, + bit0: usize, + window: usize, + ); +} +extern "C" { + pub fn blst_map_to_g1(out: *mut blst_p1, u: *const blst_fp, v: *const blst_fp); +} +extern "C" { + pub fn blst_map_to_g2(out: *mut blst_p2, u: *const blst_fp2, v: *const blst_fp2); +} +extern "C" { + pub fn blst_encode_to_g1( + out: *mut blst_p1, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ); +} +extern "C" { + pub fn blst_hash_to_g1( + out: *mut blst_p1, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ); +} +extern "C" { + pub fn blst_encode_to_g2( + out: *mut blst_p2, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ); +} +extern "C" { + pub fn blst_hash_to_g2( + out: *mut blst_p2, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ); +} +extern "C" { + pub fn blst_p1_serialize(out: *mut byte, in_: *const blst_p1); +} +extern "C" { + pub fn blst_p1_compress(out: *mut byte, in_: *const blst_p1); +} +extern "C" { + pub fn blst_p1_affine_serialize(out: *mut byte, in_: *const blst_p1_affine); +} +extern "C" { + pub fn blst_p1_affine_compress(out: *mut byte, in_: *const blst_p1_affine); +} +extern "C" { + pub fn blst_p1_uncompress(out: *mut blst_p1_affine, in_: *const byte) -> BLST_ERROR; +} +extern "C" { + pub fn blst_p1_deserialize(out: *mut blst_p1_affine, in_: *const byte) -> BLST_ERROR; +} +extern "C" { + pub fn blst_p2_serialize(out: *mut byte, in_: *const blst_p2); +} +extern "C" { + pub fn blst_p2_compress(out: *mut byte, in_: *const blst_p2); +} +extern "C" { + pub fn blst_p2_affine_serialize(out: *mut byte, in_: *const blst_p2_affine); +} +extern "C" { + pub fn blst_p2_affine_compress(out: *mut byte, in_: *const blst_p2_affine); +} +extern "C" { + pub fn blst_p2_uncompress(out: *mut blst_p2_affine, in_: *const byte) -> BLST_ERROR; +} +extern "C" { + pub fn blst_p2_deserialize(out: *mut blst_p2_affine, in_: *const byte) -> BLST_ERROR; +} +extern "C" { + pub fn blst_keygen( + out_SK: *mut blst_scalar, + IKM: *const byte, + IKM_len: usize, + info: *const byte, + info_len: usize, + ); +} +extern "C" { + pub fn blst_sk_to_pk_in_g1(out_pk: *mut blst_p1, SK: *const blst_scalar); +} +extern "C" { + pub fn blst_sign_pk_in_g1(out_sig: *mut blst_p2, hash: *const blst_p2, SK: *const blst_scalar); +} +extern "C" { + pub fn blst_sk_to_pk_in_g2(out_pk: *mut blst_p2, SK: *const blst_scalar); +} +extern "C" { + pub fn blst_sign_pk_in_g2(out_sig: *mut blst_p1, hash: *const blst_p1, SK: *const blst_scalar); +} +extern "C" { + pub fn blst_miller_loop( + ret: *mut blst_fp12, + Q: *const blst_p2_affine, + P: *const blst_p1_affine, + ); +} +extern "C" { + pub fn blst_miller_loop_n( + ret: *mut blst_fp12, + Qs: *const *const blst_p2_affine, + Ps: *const *const blst_p1_affine, + n: usize, + ); +} +extern "C" { + pub fn blst_final_exp(ret: *mut blst_fp12, f: *const blst_fp12); +} +extern "C" { + pub fn blst_precompute_lines(Qlines: *mut blst_fp6, Q: *const blst_p2_affine); +} +extern "C" { + pub fn blst_miller_loop_lines( + ret: *mut blst_fp12, + Qlines: *const blst_fp6, + P: *const blst_p1_affine, + ); +} +extern "C" { + pub fn blst_fp12_finalverify(gt1: *const blst_fp12, gt2: *const blst_fp12) -> bool; +} +#[repr(C)] +#[repr(align(1))] +#[derive(Debug, Default)] +pub struct 
blst_pairing { + pub _bindgen_opaque_blob: [u8; 0usize], +} +#[test] +fn bindgen_test_layout_blst_pairing() { + assert_eq!( + ::core::mem::size_of::(), + 0usize, + concat!("Size of: ", stringify!(blst_pairing)) + ); + assert_eq!( + ::core::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(blst_pairing)) + ); +} +extern "C" { + pub fn blst_pairing_sizeof() -> usize; +} +extern "C" { + pub fn blst_pairing_init( + new_ctx: *mut blst_pairing, + hash_or_encode: bool, + DST: *const byte, + DST_len: usize, + ); +} +extern "C" { + pub fn blst_pairing_get_dst(ctx: *const blst_pairing) -> *const byte; +} +extern "C" { + pub fn blst_pairing_commit(ctx: *mut blst_pairing); +} +extern "C" { + pub fn blst_pairing_aggregate_pk_in_g2( + ctx: *mut blst_pairing, + PK: *const blst_p2_affine, + signature: *const blst_p1_affine, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_chk_n_aggr_pk_in_g2( + ctx: *mut blst_pairing, + PK: *const blst_p2_affine, + pk_grpchk: bool, + signature: *const blst_p1_affine, + sig_grpchk: bool, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_mul_n_aggregate_pk_in_g2( + ctx: *mut blst_pairing, + PK: *const blst_p2_affine, + sig: *const blst_p1_affine, + scalar: *const byte, + nbits: usize, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_chk_n_mul_n_aggr_pk_in_g2( + ctx: *mut blst_pairing, + PK: *const blst_p2_affine, + pk_grpchk: bool, + sig: *const blst_p1_affine, + sig_grpchk: bool, + scalar: *const byte, + nbits: usize, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_aggregate_pk_in_g1( + ctx: *mut blst_pairing, + PK: *const blst_p1_affine, + signature: *const blst_p2_affine, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_chk_n_aggr_pk_in_g1( + ctx: *mut blst_pairing, + PK: *const blst_p1_affine, + pk_grpchk: bool, + signature: *const blst_p2_affine, + sig_grpchk: bool, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_mul_n_aggregate_pk_in_g1( + ctx: *mut blst_pairing, + PK: *const blst_p1_affine, + sig: *const blst_p2_affine, + scalar: *const byte, + nbits: usize, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_chk_n_mul_n_aggr_pk_in_g1( + ctx: *mut blst_pairing, + PK: *const blst_p1_affine, + pk_grpchk: bool, + sig: *const blst_p2_affine, + sig_grpchk: bool, + scalar: *const byte, + nbits: usize, + msg: *const byte, + msg_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_merge(ctx: *mut blst_pairing, ctx1: *const blst_pairing) -> BLST_ERROR; +} +extern "C" { + pub fn blst_pairing_finalverify(ctx: *const blst_pairing, gtsig: *const blst_fp12) -> bool; +} +extern "C" { + pub fn blst_aggregate_in_g1( + out: *mut blst_p1, + in_: *const blst_p1, + zwire: *const byte, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_aggregate_in_g2( + out: *mut blst_p2, + in_: *const blst_p2, + zwire: *const byte, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_aggregated_in_g1(out: *mut blst_fp12, signature: *const blst_p1_affine); 
+} +extern "C" { + pub fn blst_aggregated_in_g2(out: *mut blst_fp12, signature: *const blst_p2_affine); +} +extern "C" { + pub fn blst_core_verify_pk_in_g1( + pk: *const blst_p1_affine, + signature: *const blst_p2_affine, + hash_or_encode: bool, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub fn blst_core_verify_pk_in_g2( + pk: *const blst_p2_affine, + signature: *const blst_p1_affine, + hash_or_encode: bool, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + aug: *const byte, + aug_len: usize, + ) -> BLST_ERROR; +} +extern "C" { + pub static BLS12_381_G1: blst_p1_affine; +} +extern "C" { + pub static BLS12_381_NEG_G1: blst_p1_affine; +} +extern "C" { + pub static BLS12_381_G2: blst_p2_affine; +} +extern "C" { + pub static BLS12_381_NEG_G2: blst_p2_affine; +} +extern "C" { + pub fn blst_fr_ct_bfly(x0: *mut blst_fr, x1: *mut blst_fr, twiddle: *const blst_fr); +} +extern "C" { + pub fn blst_fr_gs_bfly(x0: *mut blst_fr, x1: *mut blst_fr, twiddle: *const blst_fr); +} +extern "C" { + pub fn blst_fr_to(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fr_from(ret: *mut blst_fr, a: *const blst_fr); +} +extern "C" { + pub fn blst_fp_to(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_from(ret: *mut blst_fp, a: *const blst_fp); +} +extern "C" { + pub fn blst_fp_is_square(a: *const blst_fp) -> bool; +} +extern "C" { + pub fn blst_fp2_is_square(a: *const blst_fp2) -> bool; +} +extern "C" { + pub fn blst_p1_from_jacobian(out: *mut blst_p1, in_: *const blst_p1); +} +extern "C" { + pub fn blst_p2_from_jacobian(out: *mut blst_p2, in_: *const blst_p2); +} +extern "C" { + pub fn blst_sk_to_pk2_in_g1( + out: *mut byte, + out_pk: *mut blst_p1_affine, + SK: *const blst_scalar, + ); +} +extern "C" { + pub fn blst_sign_pk2_in_g1( + out: *mut byte, + out_sig: *mut blst_p2_affine, + hash: *const blst_p2, + SK: *const blst_scalar, + ); +} +extern "C" { + pub fn blst_sk_to_pk2_in_g2( + out: *mut byte, + out_pk: *mut blst_p2_affine, + SK: *const blst_scalar, + ); +} +extern "C" { + pub fn blst_sign_pk2_in_g2( + out: *mut byte, + out_sig: *mut blst_p1_affine, + hash: *const blst_p1, + SK: *const blst_scalar, + ); +} +#[repr(C)] +#[repr(align(1))] +#[derive(Debug, Default)] +pub struct blst_uniq { + pub _bindgen_opaque_blob: [u8; 0usize], +} +#[test] +fn bindgen_test_layout_blst_uniq() { + assert_eq!( + ::core::mem::size_of::(), + 0usize, + concat!("Size of: ", stringify!(blst_uniq)) + ); + assert_eq!( + ::core::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(blst_uniq)) + ); +} +extern "C" { + pub fn blst_uniq_sizeof(n_nodes: usize) -> usize; +} +extern "C" { + pub fn blst_uniq_init(tree: *mut blst_uniq); +} +extern "C" { + pub fn blst_uniq_test(tree: *mut blst_uniq, msg: *const byte, len: usize) -> bool; +} +extern "C" { + pub fn blst_expand_message_xmd( + out: *mut byte, + out_len: usize, + msg: *const byte, + msg_len: usize, + DST: *const byte, + DST_len: usize, + ); +} +extern "C" { + pub fn blst_p1_unchecked_mult( + out: *mut blst_p1, + p: *const blst_p1, + scalar: *const byte, + nbits: usize, + ); +} +extern "C" { + pub fn blst_p2_unchecked_mult( + out: *mut blst_p2, + p: *const blst_p2, + scalar: *const byte, + nbits: usize, + ); +} +extern "C" { + pub fn blst_pairing_raw_aggregate( + ctx: *mut blst_pairing, + q: *const blst_p2_affine, + p: *const blst_p1_affine, + ); +} +extern "C" { + pub fn 
blst_pairing_as_fp12(ctx: *mut blst_pairing) -> *mut blst_fp12; +} +extern "C" { + pub fn blst_bendian_from_fp12(out: *mut byte, a: *const blst_fp12); +} +extern "C" { + pub fn blst_keygen_v3( + out_SK: *mut blst_scalar, + IKM: *const byte, + IKM_len: usize, + info: *const byte, + info_len: usize, + ); +} +extern "C" { + pub fn blst_keygen_v4_5( + out_SK: *mut blst_scalar, + IKM: *const byte, + IKM_len: usize, + salt: *const byte, + salt_len: usize, + info: *const byte, + info_len: usize, + ); +} +extern "C" { + pub fn blst_keygen_v5( + out_SK: *mut blst_scalar, + IKM: *const byte, + IKM_len: usize, + salt: *const byte, + salt_len: usize, + info: *const byte, + info_len: usize, + ); +} +extern "C" { + pub fn blst_derive_master_eip2333(out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize); +} +extern "C" { + pub fn blst_derive_child_eip2333( + out_SK: *mut blst_scalar, + SK: *const blst_scalar, + child_index: u32, + ); +} +extern "C" { + pub fn blst_scalar_from_hexascii(out: *mut blst_scalar, hex: *const byte); +} +extern "C" { + pub fn blst_fr_from_hexascii(ret: *mut blst_fr, hex: *const byte); +} +extern "C" { + pub fn blst_fp_from_hexascii(ret: *mut blst_fp, hex: *const byte); +} +extern "C" { + pub fn blst_p1_sizeof() -> usize; +} +extern "C" { + pub fn blst_p1_affine_sizeof() -> usize; +} +extern "C" { + pub fn blst_p2_sizeof() -> usize; +} +extern "C" { + pub fn blst_p2_affine_sizeof() -> usize; +} +extern "C" { + pub fn blst_fp12_sizeof() -> usize; +} +extern "C" { + pub fn blst_sha256(out: *mut byte, msg: *const byte, msg_len: usize); +} +#[test] +fn bindgen_test_normal_types() { + // from "Rust for Rustaceans" by Jon Gjengset + fn is_normal() {} + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); + is_normal::(); +} diff --git a/src/blst/bindings/rust/src/lib.rs b/src/blst/bindings/rust/src/lib.rs new file mode 100644 index 0000000000..a91c4b7482 --- /dev/null +++ b/src/blst/bindings/rust/src/lib.rs @@ -0,0 +1,2281 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#![cfg_attr(not(feature = "std"), no_std)] +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(unexpected_cfgs)] + +extern crate alloc; + +use alloc::boxed::Box; +use alloc::vec; +use alloc::vec::Vec; +use core::any::Any; +use core::mem::{transmute, MaybeUninit}; +use core::ptr; +use zeroize::Zeroize; + +#[cfg(feature = "std")] +use std::sync::{atomic::*, mpsc::channel, Arc}; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +#[cfg(feature = "std")] +trait ThreadPoolExt { + fn joined_execute<'any, F>(&self, job: F) + where + F: FnOnce() + Send + 'any; +} + +#[cfg(all(not(feature = "no-threads"), feature = "std"))] +mod mt { + use super::*; + use std::sync::{Mutex, Once}; + use threadpool::ThreadPool; + + pub fn da_pool() -> ThreadPool { + static INIT: Once = Once::new(); + static mut POOL: *const Mutex = ptr::null(); + + INIT.call_once(|| { + let pool = Mutex::new(ThreadPool::default()); + unsafe { POOL = transmute::, *const _>(Box::new(pool)) }; + }); + unsafe { (*POOL).lock().unwrap().clone() } + } + + type Thunk<'any> = Box; + + impl ThreadPoolExt for ThreadPool { + fn joined_execute<'scope, F>(&self, job: F) + where + F: FnOnce() + Send + 'scope, + { + // Bypass 'lifetime limitations by brute force. It works, + // because we explicitly join the threads... + self.execute(unsafe { + transmute::, Thunk<'static>>(Box::new(job)) + }) + } + } +} + +#[cfg(all(feature = "no-threads", feature = "std"))] +mod mt { + use super::*; + + pub struct EmptyPool {} + + pub fn da_pool() -> EmptyPool { + EmptyPool {} + } + + impl EmptyPool { + pub fn max_count(&self) -> usize { + 1 + } + } + + impl ThreadPoolExt for EmptyPool { + fn joined_execute<'scope, F>(&self, job: F) + where + F: FnOnce() + Send + 'scope, + { + job() + } + } +} + +include!("bindings.rs"); + +impl PartialEq for blst_p1 { + fn eq(&self, other: &Self) -> bool { + unsafe { blst_p1_is_equal(self, other) } + } +} + +impl PartialEq for blst_p1_affine { + fn eq(&self, other: &Self) -> bool { + unsafe { blst_p1_affine_is_equal(self, other) } + } +} + +impl PartialEq for blst_p2 { + fn eq(&self, other: &Self) -> bool { + unsafe { blst_p2_is_equal(self, other) } + } +} + +impl PartialEq for blst_p2_affine { + fn eq(&self, other: &Self) -> bool { + unsafe { blst_p2_affine_is_equal(self, other) } + } +} + +impl Default for blst_fp12 { + fn default() -> Self { + unsafe { *blst_fp12_one() } + } +} + +impl PartialEq for blst_fp12 { + fn eq(&self, other: &Self) -> bool { + unsafe { blst_fp12_is_equal(self, other) } + } +} + +impl core::ops::Mul for blst_fp12 { + type Output = Self; + + fn mul(self, other: Self) -> Self { + let mut out = MaybeUninit::::uninit(); + unsafe { + blst_fp12_mul(out.as_mut_ptr(), &self, &other); + out.assume_init() + } + } +} + +impl core::ops::MulAssign for blst_fp12 { + fn mul_assign(&mut self, other: Self) { + unsafe { blst_fp12_mul(self, self, &other) } + } +} + +impl blst_fp12 { + pub fn miller_loop(q: &blst_p2_affine, p: &blst_p1_affine) -> Self { + let mut out = MaybeUninit::::uninit(); + unsafe { + blst_miller_loop(out.as_mut_ptr(), q, p); + out.assume_init() + } + } + + #[cfg(not(feature = "std"))] + pub fn miller_loop_n(q: &[blst_p2_affine], p: &[blst_p1_affine]) -> Self { + let n_elems = q.len(); + if n_elems != p.len() || n_elems == 0 { + panic!("inputs' lengths mismatch"); + } + let qs: [*const _; 2] = [&q[0], ptr::null()]; + let ps: [*const _; 2] = [&p[0], ptr::null()]; + 
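+        // [&q[0], null] / [&p[0], null] use blst's pointer-vector convention:
+        // a pointer to a contiguous array terminated by a null entry, so the
+        // single slice supplies all `n_elems` points to blst_miller_loop_n.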
let mut out = MaybeUninit::::uninit(); + unsafe { + blst_miller_loop_n(out.as_mut_ptr(), &qs[0], &ps[0], n_elems); + out.assume_init() + } + } + + #[cfg(feature = "std")] + pub fn miller_loop_n(q: &[blst_p2_affine], p: &[blst_p1_affine]) -> Self { + let n_elems = q.len(); + if n_elems != p.len() || n_elems == 0 { + panic!("inputs' lengths mismatch"); + } + + let pool = mt::da_pool(); + + let mut n_workers = pool.max_count(); + if n_workers == 1 { + let qs: [*const _; 2] = [&q[0], ptr::null()]; + let ps: [*const _; 2] = [&p[0], ptr::null()]; + let mut out = MaybeUninit::::uninit(); + unsafe { + blst_miller_loop_n(out.as_mut_ptr(), &qs[0], &ps[0], n_elems); + return out.assume_init(); + } + } + + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + + let stride = core::cmp::min((n_elems + n_workers - 1) / n_workers, 16); + n_workers = core::cmp::min((n_elems + stride - 1) / stride, n_workers); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + + pool.joined_execute(move || { + let mut acc = blst_fp12::default(); + let mut tmp = MaybeUninit::::uninit(); + let mut qs: [*const _; 2] = [ptr::null(), ptr::null()]; + let mut ps: [*const _; 2] = [ptr::null(), ptr::null()]; + + loop { + let work = counter.fetch_add(stride, Ordering::Relaxed); + if work >= n_elems { + break; + } + let n = core::cmp::min(n_elems - work, stride); + qs[0] = &q[work]; + ps[0] = &p[work]; + unsafe { + blst_miller_loop_n(tmp.as_mut_ptr(), &qs[0], &ps[0], n); + acc *= tmp.assume_init(); + } + } + + tx.send(acc).expect("disaster"); + }); + } + + let mut acc = rx.recv().unwrap(); + for _ in 1..n_workers { + acc *= rx.recv().unwrap(); + } + + acc + } + + pub fn final_exp(&self) -> Self { + let mut out = MaybeUninit::::uninit(); + unsafe { + blst_final_exp(out.as_mut_ptr(), self); + out.assume_init() + } + } + + pub fn in_group(&self) -> bool { + unsafe { blst_fp12_in_group(self) } + } + + pub fn finalverify(a: &Self, b: &Self) -> bool { + unsafe { blst_fp12_finalverify(a, b) } + } + + pub fn to_bendian(&self) -> [u8; 48 * 12] { + let mut out = MaybeUninit::<[u8; 48 * 12]>::uninit(); + unsafe { + blst_bendian_from_fp12(out.as_mut_ptr() as *mut u8, self); + out.assume_init() + } + } +} + +impl blst_scalar { + pub fn hash_to(msg: &[u8], dst: &[u8]) -> Option { + unsafe { + let mut out = ::default(); + let mut elem = [0u8; 48]; + blst_expand_message_xmd( + elem.as_mut_ptr(), + elem.len(), + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + ); + if blst_scalar_from_be_bytes(&mut out, elem.as_ptr(), elem.len()) { + Some(out) + } else { + None + } + } + } +} + +#[derive(Debug)] +pub struct Pairing { + v: Box<[u64]>, +} + +impl Pairing { + pub fn new(hash_or_encode: bool, dst: &[u8]) -> Self { + let v: Vec = vec![0; unsafe { blst_pairing_sizeof() } / 8]; + let mut obj = Self { + v: v.into_boxed_slice(), + }; + obj.init(hash_or_encode, dst); + obj + } + + pub fn init(&mut self, hash_or_encode: bool, dst: &[u8]) { + unsafe { + blst_pairing_init( + self.ctx(), + hash_or_encode, + dst.as_ptr(), + dst.len(), + ) + } + } + fn ctx(&mut self) -> *mut blst_pairing { + self.v.as_mut_ptr() as *mut blst_pairing + } + fn const_ctx(&self) -> *const blst_pairing { + self.v.as_ptr() as *const blst_pairing + } + + pub fn aggregate( + &mut self, + pk: &dyn Any, + pk_validate: bool, + sig: &dyn Any, + sig_groupcheck: bool, + msg: &[u8], + aug: &[u8], + ) -> BLST_ERROR { + if pk.is::() { + unsafe { + blst_pairing_chk_n_aggr_pk_in_g1( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => 
pk, + None => ptr::null(), + }, + pk_validate, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + sig_groupcheck, + msg.as_ptr(), + msg.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else if pk.is::() { + unsafe { + blst_pairing_chk_n_aggr_pk_in_g2( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => pk, + None => ptr::null(), + }, + pk_validate, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + sig_groupcheck, + msg.as_ptr(), + msg.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else { + panic!("whaaaa?") + } + } + + #[allow(clippy::too_many_arguments)] + pub fn mul_n_aggregate( + &mut self, + pk: &dyn Any, + pk_validate: bool, + sig: &dyn Any, + sig_groupcheck: bool, + scalar: &[u8], + nbits: usize, + msg: &[u8], + aug: &[u8], + ) -> BLST_ERROR { + if pk.is::() { + unsafe { + blst_pairing_chk_n_mul_n_aggr_pk_in_g1( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => pk, + None => ptr::null(), + }, + pk_validate, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + sig_groupcheck, + scalar.as_ptr(), + nbits, + msg.as_ptr(), + msg.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else if pk.is::() { + unsafe { + blst_pairing_chk_n_mul_n_aggr_pk_in_g2( + self.ctx(), + match pk.downcast_ref::() { + Some(pk) => pk, + None => ptr::null(), + }, + pk_validate, + match sig.downcast_ref::() { + Some(sig) => sig, + None => ptr::null(), + }, + sig_groupcheck, + scalar.as_ptr(), + nbits, + msg.as_ptr(), + msg.len(), + aug.as_ptr(), + aug.len(), + ) + } + } else { + panic!("whaaaa?") + } + } + + pub fn aggregated(gtsig: &mut blst_fp12, sig: &dyn Any) { + if sig.is::() { + unsafe { + blst_aggregated_in_g1( + gtsig, + sig.downcast_ref::().unwrap(), + ) + } + } else if sig.is::() { + unsafe { + blst_aggregated_in_g2( + gtsig, + sig.downcast_ref::().unwrap(), + ) + } + } else { + panic!("whaaaa?") + } + } + + pub fn commit(&mut self) { + unsafe { blst_pairing_commit(self.ctx()) } + } + + pub fn merge(&mut self, ctx1: &Self) -> BLST_ERROR { + unsafe { blst_pairing_merge(self.ctx(), ctx1.const_ctx()) } + } + + pub fn finalverify(&self, gtsig: Option<&blst_fp12>) -> bool { + unsafe { + blst_pairing_finalverify( + self.const_ctx(), + match gtsig { + Some(gtsig) => gtsig, + None => ptr::null(), + }, + ) + } + } + + pub fn raw_aggregate(&mut self, q: &blst_p2_affine, p: &blst_p1_affine) { + unsafe { blst_pairing_raw_aggregate(self.ctx(), q, p) } + } + + pub fn as_fp12(&mut self) -> blst_fp12 { + unsafe { *blst_pairing_as_fp12(self.ctx()) } + } +} + +pub fn uniq(msgs: &[&[u8]]) -> bool { + let n_elems = msgs.len(); + + if n_elems == 1 { + return true; + } else if n_elems == 2 { + return msgs[0] != msgs[1]; + } + + let mut v: Vec = vec![0; unsafe { blst_uniq_sizeof(n_elems) } / 8]; + let ctx = v.as_mut_ptr() as *mut blst_uniq; + + unsafe { blst_uniq_init(ctx) }; + + for msg in msgs.iter() { + if !unsafe { blst_uniq_test(ctx, msg.as_ptr(), msg.len()) } { + return false; + } + } + + true +} + +#[cfg(feature = "std")] +pub fn print_bytes(bytes: &[u8], name: &str) { + print!("{} ", name); + for b in bytes.iter() { + print!("{:02x}", b); + } + println!(); +} + +macro_rules! 
sig_variant_impl { + ( + $name:expr, + $pk:ty, + $pk_aff:ty, + $sig:ty, + $sig_aff:ty, + $sk_to_pk:ident, + $hash_or_encode:expr, + $hash_or_encode_to:ident, + $sign:ident, + $pk_eq:ident, + $sig_eq:ident, + $verify:ident, + $pk_in_group:ident, + $pk_to_aff:ident, + $pk_from_aff:ident, + $pk_ser:ident, + $pk_comp:ident, + $pk_deser:ident, + $pk_uncomp:ident, + $pk_comp_size:expr, + $pk_ser_size:expr, + $sig_in_group:ident, + $sig_to_aff:ident, + $sig_from_aff:ident, + $sig_ser:ident, + $sig_comp:ident, + $sig_deser:ident, + $sig_uncomp:ident, + $sig_comp_size:expr, + $sig_ser_size:expr, + $pk_add_or_dbl:ident, + $pk_add_or_dbl_aff:ident, + $sig_add_or_dbl:ident, + $sig_add_or_dbl_aff:ident, + $pk_is_inf:ident, + $sig_is_inf:ident, + $sig_aggr_in_group:ident, + ) => { + /// Secret Key + #[derive(Default, Debug, Clone, Zeroize)] + #[zeroize(drop)] + pub struct SecretKey { + value: blst_scalar, + } + + impl SecretKey { + /// Deterministically generate a secret key from key material + pub fn key_gen( + ikm: &[u8], + key_info: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = SecretKey::default(); + unsafe { + blst_keygen( + &mut sk.value, + ikm.as_ptr(), + ikm.len(), + key_info.as_ptr(), + key_info.len(), + ); + } + Ok(sk) + } + + pub fn key_gen_v3( + ikm: &[u8], + key_info: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = SecretKey::default(); + unsafe { + blst_keygen_v3( + &mut sk.value, + ikm.as_ptr(), + ikm.len(), + key_info.as_ptr(), + key_info.len(), + ); + } + Ok(sk) + } + + pub fn key_gen_v4_5( + ikm: &[u8], + salt: &[u8], + info: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = SecretKey::default(); + unsafe { + blst_keygen_v4_5( + &mut sk.value, + ikm.as_ptr(), + ikm.len(), + salt.as_ptr(), + salt.len(), + info.as_ptr(), + info.len(), + ); + } + Ok(sk) + } + + pub fn key_gen_v5( + ikm: &[u8], + salt: &[u8], + info: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = SecretKey::default(); + unsafe { + blst_keygen_v5( + &mut sk.value, + ikm.as_ptr(), + ikm.len(), + salt.as_ptr(), + salt.len(), + info.as_ptr(), + info.len(), + ); + } + Ok(sk) + } + + pub fn derive_master_eip2333( + ikm: &[u8], + ) -> Result { + if ikm.len() < 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + let mut sk = SecretKey::default(); + unsafe { + blst_derive_master_eip2333( + &mut sk.value, + ikm.as_ptr(), + ikm.len(), + ); + } + Ok(sk) + } + + pub fn derive_child_eip2333(&self, child_index: u32) -> Self { + let mut sk = SecretKey::default(); + unsafe { + blst_derive_child_eip2333( + &mut sk.value, + &self.value, + child_index, + ); + } + sk + } + + // sk_to_pk + pub fn sk_to_pk(&self) -> PublicKey { + // TODO - would the user like the serialized/compressed pk as well? + let mut pk_aff = PublicKey::default(); + //let mut pk_ser = [0u8; $pk_ser_size]; + + unsafe { + $sk_to_pk( + //pk_ser.as_mut_ptr(), + ptr::null_mut(), + &mut pk_aff.point, + &self.value, + ); + } + pk_aff + } + + // Sign + pub fn sign( + &self, + msg: &[u8], + dst: &[u8], + aug: &[u8], + ) -> Signature { + // TODO - would the user like the serialized/compressed sig as well? 
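+                // Editor's note (descriptive, not in the original patch): the
+                // message is first hashed/encoded to a point in the signature
+                // group via $hash_or_encode_to, and that point is then
+                // multiplied by the secret scalar via $sign to yield the
+                // affine signature returned below.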
+ let mut q = <$sig>::default(); + let mut sig_aff = <$sig_aff>::default(); + //let mut sig_ser = [0u8; $sig_ser_size]; + unsafe { + $hash_or_encode_to( + &mut q, + msg.as_ptr(), + msg.len(), + dst.as_ptr(), + dst.len(), + aug.as_ptr(), + aug.len(), + ); + $sign(ptr::null_mut(), &mut sig_aff, &q, &self.value); + } + Signature { point: sig_aff } + } + + // TODO - formally speaking application is entitled to have + // ultimate control over secret key storage, which means that + // corresponding serialization/deserialization subroutines + // should accept reference to where to store the result, as + // opposite to returning one. + + // serialize + pub fn serialize(&self) -> [u8; 32] { + let mut sk_out = [0; 32]; + unsafe { + blst_bendian_from_scalar(sk_out.as_mut_ptr(), &self.value); + } + sk_out + } + + // deserialize + pub fn deserialize(sk_in: &[u8]) -> Result { + let mut sk = blst_scalar::default(); + if sk_in.len() != 32 { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + unsafe { + blst_scalar_from_bendian(&mut sk, sk_in.as_ptr()); + if !blst_sk_check(&sk) { + return Err(BLST_ERROR::BLST_BAD_ENCODING); + } + } + Ok(Self { value: sk }) + } + + pub fn to_bytes(&self) -> [u8; 32] { + SecretKey::serialize(&self) + } + + pub fn from_bytes(sk_in: &[u8]) -> Result { + SecretKey::deserialize(sk_in) + } + } + + #[cfg(feature = "serde-secret")] + impl Serialize for SecretKey { + fn serialize( + &self, + ser: S, + ) -> Result { + let bytes = zeroize::Zeroizing::new(self.serialize()); + ser.serialize_bytes(bytes.as_ref()) + } + } + + #[cfg(feature = "serde-secret")] + impl<'de> Deserialize<'de> for SecretKey { + fn deserialize>( + deser: D, + ) -> Result { + let bytes: &[u8] = Deserialize::deserialize(deser)?; + Self::deserialize(bytes).map_err(|e| { + ::custom(format!("{:?}", e)) + }) + } + } + + #[repr(transparent)] + #[derive(Default, Debug, Clone, Copy)] + pub struct PublicKey { + point: $pk_aff, + } + + impl PublicKey { + // Core operations + + // key_validate + pub fn validate(&self) -> Result<(), BLST_ERROR> { + unsafe { + if $pk_is_inf(&self.point) { + return Err(BLST_ERROR::BLST_PK_IS_INFINITY); + } + if !$pk_in_group(&self.point) { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + } + Ok(()) + } + + pub fn key_validate(key: &[u8]) -> Result { + let pk = PublicKey::from_bytes(key)?; + pk.validate()?; + Ok(pk) + } + + pub fn from_aggregate(agg_pk: &AggregatePublicKey) -> Self { + let mut pk_aff = <$pk_aff>::default(); + unsafe { + $pk_to_aff(&mut pk_aff, &agg_pk.point); + } + Self { point: pk_aff } + } + + // Serdes + + pub fn compress(&self) -> [u8; $pk_comp_size] { + let mut pk_comp = [0u8; $pk_comp_size]; + unsafe { + $pk_comp(pk_comp.as_mut_ptr(), &self.point); + } + pk_comp + } + + pub fn serialize(&self) -> [u8; $pk_ser_size] { + let mut pk_out = [0u8; $pk_ser_size]; + unsafe { + $pk_ser(pk_out.as_mut_ptr(), &self.point); + } + pk_out + } + + pub fn uncompress(pk_comp: &[u8]) -> Result { + if pk_comp.len() == $pk_comp_size && (pk_comp[0] & 0x80) != 0 { + let mut pk = <$pk_aff>::default(); + let err = unsafe { $pk_uncomp(&mut pk, pk_comp.as_ptr()) }; + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { point: pk }) + } else { + Err(BLST_ERROR::BLST_BAD_ENCODING) + } + } + + pub fn deserialize(pk_in: &[u8]) -> Result { + if (pk_in.len() == $pk_ser_size && (pk_in[0] & 0x80) == 0) + || (pk_in.len() == $pk_comp_size && (pk_in[0] & 0x80) != 0) + { + let mut pk = <$pk_aff>::default(); + let err = unsafe { $pk_deser(&mut pk, pk_in.as_ptr()) }; + if err != 
BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { point: pk }) + } else { + Err(BLST_ERROR::BLST_BAD_ENCODING) + } + } + + pub fn from_bytes(pk_in: &[u8]) -> Result { + PublicKey::deserialize(pk_in) + } + + pub fn to_bytes(&self) -> [u8; $pk_comp_size] { + self.compress() + } + } + + // Trait for equality comparisons which are equivalence relations. + // + // This means, that in addition to a == b and a != b being strict + // inverses, the equality must be reflexive, symmetric and transitive. + impl Eq for PublicKey {} + + impl PartialEq for PublicKey { + fn eq(&self, other: &Self) -> bool { + unsafe { $pk_eq(&self.point, &other.point) } + } + } + + #[cfg(feature = "serde")] + impl Serialize for PublicKey { + fn serialize( + &self, + ser: S, + ) -> Result { + ser.serialize_bytes(&self.serialize()) + } + } + + #[cfg(feature = "serde")] + impl<'de> Deserialize<'de> for PublicKey { + fn deserialize>( + deser: D, + ) -> Result { + let bytes: &[u8] = Deserialize::deserialize(deser)?; + Self::deserialize(&bytes).map_err(|e| { + ::custom(format!("{:?}", e)) + }) + } + } + + #[repr(transparent)] + #[derive(Debug, Clone, Copy)] + pub struct AggregatePublicKey { + point: $pk, + } + + impl AggregatePublicKey { + pub fn from_public_key(pk: &PublicKey) -> Self { + let mut agg_pk = <$pk>::default(); + unsafe { + $pk_from_aff(&mut agg_pk, &pk.point); + } + Self { point: agg_pk } + } + + pub fn to_public_key(&self) -> PublicKey { + let mut pk = <$pk_aff>::default(); + unsafe { + $pk_to_aff(&mut pk, &self.point); + } + PublicKey { point: pk } + } + + // Aggregate + pub fn aggregate( + pks: &[&PublicKey], + pks_validate: bool, + ) -> Result { + if pks.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + if pks_validate { + pks[0].validate()?; + } + let mut agg_pk = AggregatePublicKey::from_public_key(pks[0]); + for s in pks.iter().skip(1) { + if pks_validate { + s.validate()?; + } + unsafe { + $pk_add_or_dbl_aff( + &mut agg_pk.point, + &agg_pk.point, + &s.point, + ); + } + } + Ok(agg_pk) + } + + pub fn aggregate_with_randomness( + pks: &[PublicKey], + randomness: &[u8], + nbits: usize, + pks_groupcheck: bool, + ) -> Result { + if pks.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + if pks_groupcheck { + pks.validate()?; + } + Ok(pks.mult(randomness, nbits)) + } + + pub fn aggregate_serialized( + pks: &[&[u8]], + pks_validate: bool, + ) -> Result { + // TODO - threading + if pks.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + let mut pk = if pks_validate { + PublicKey::key_validate(pks[0])? + } else { + PublicKey::from_bytes(pks[0])? + }; + let mut agg_pk = AggregatePublicKey::from_public_key(&pk); + for s in pks.iter().skip(1) { + pk = if pks_validate { + PublicKey::key_validate(s)? + } else { + PublicKey::from_bytes(s)? 
+ }; + unsafe { + $pk_add_or_dbl_aff( + &mut agg_pk.point, + &agg_pk.point, + &pk.point, + ); + } + } + Ok(agg_pk) + } + + pub fn add_aggregate(&mut self, agg_pk: &AggregatePublicKey) { + unsafe { + $pk_add_or_dbl(&mut self.point, &self.point, &agg_pk.point); + } + } + + pub fn add_public_key( + &mut self, + pk: &PublicKey, + pk_validate: bool, + ) -> Result<(), BLST_ERROR> { + if pk_validate { + pk.validate()?; + } + unsafe { + $pk_add_or_dbl_aff(&mut self.point, &self.point, &pk.point); + } + Ok(()) + } + } + + #[repr(transparent)] + #[derive(Debug, Clone, Copy)] + pub struct Signature { + point: $sig_aff, + } + + impl Signature { + // sig_infcheck, check for infinity, is a way to avoid going + // into resource-consuming verification. Passing 'false' is + // always cryptographically safe, but application might want + // to guard against obviously bogus individual[!] signatures. + pub fn validate( + &self, + sig_infcheck: bool, + ) -> Result<(), BLST_ERROR> { + unsafe { + if sig_infcheck && $sig_is_inf(&self.point) { + return Err(BLST_ERROR::BLST_PK_IS_INFINITY); + } + if !$sig_in_group(&self.point) { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + } + Ok(()) + } + + pub fn sig_validate( + sig: &[u8], + sig_infcheck: bool, + ) -> Result { + let sig = Signature::from_bytes(sig)?; + sig.validate(sig_infcheck)?; + Ok(sig) + } + + pub fn verify( + &self, + sig_groupcheck: bool, + msg: &[u8], + dst: &[u8], + aug: &[u8], + pk: &PublicKey, + pk_validate: bool, + ) -> BLST_ERROR { + let aug_msg = [aug, msg].concat(); + self.aggregate_verify( + sig_groupcheck, + &[aug_msg.as_slice()], + dst, + &[pk], + pk_validate, + ) + } + + #[cfg(not(feature = "std"))] + pub fn aggregate_verify( + &self, + sig_groupcheck: bool, + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + pks_validate: bool, + ) -> BLST_ERROR { + let n_elems = pks.len(); + if n_elems == 0 || msgs.len() != n_elems { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + + let mut pairing = Pairing::new($hash_or_encode, dst); + + let err = pairing.aggregate( + &pks[0].point, + pks_validate, + &self.point, + sig_groupcheck, + &msgs[0], + &[], + ); + if err != BLST_ERROR::BLST_SUCCESS { + return err; + } + + for i in 1..n_elems { + let err = pairing.aggregate( + &pks[i].point, + pks_validate, + &unsafe { ptr::null::<$sig_aff>().as_ref() }, + false, + &msgs[i], + &[], + ); + if err != BLST_ERROR::BLST_SUCCESS { + return err; + } + } + + pairing.commit(); + + if pairing.finalverify(None) { + BLST_ERROR::BLST_SUCCESS + } else { + BLST_ERROR::BLST_VERIFY_FAIL + } + } + + #[cfg(feature = "std")] + pub fn aggregate_verify( + &self, + sig_groupcheck: bool, + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + pks_validate: bool, + ) -> BLST_ERROR { + let n_elems = pks.len(); + if n_elems == 0 || msgs.len() != n_elems { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + + // TODO - check msg uniqueness? 
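+            // Editor's sketch (not in the original patch): such a check could
+            // reuse the `uniq` helper defined above, e.g.
+            //     if !uniq(msgs) { return BLST_ERROR::BLST_VERIFY_FAIL; }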
+ + let pool = mt::da_pool(); + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + let valid = Arc::new(AtomicBool::new(true)); + + let n_workers = core::cmp::min(pool.max_count(), n_elems); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + let valid = valid.clone(); + + pool.joined_execute(move || { + let mut pairing = Pairing::new($hash_or_encode, dst); + + while valid.load(Ordering::Relaxed) { + let work = counter.fetch_add(1, Ordering::Relaxed); + if work >= n_elems { + break; + } + if pairing.aggregate( + &pks[work].point, + pks_validate, + &unsafe { ptr::null::<$sig_aff>().as_ref() }, + false, + &msgs[work], + &[], + ) != BLST_ERROR::BLST_SUCCESS + { + valid.store(false, Ordering::Relaxed); + break; + } + } + if valid.load(Ordering::Relaxed) { + pairing.commit(); + } + tx.send(pairing).expect("disaster"); + }); + } + + if sig_groupcheck && valid.load(Ordering::Relaxed) { + match self.validate(false) { + Err(_err) => valid.store(false, Ordering::Relaxed), + _ => (), + } + } + + let mut gtsig = blst_fp12::default(); + if valid.load(Ordering::Relaxed) { + Pairing::aggregated(&mut gtsig, &self.point); + } + + let mut acc = rx.recv().unwrap(); + for _ in 1..n_workers { + acc.merge(&rx.recv().unwrap()); + } + + if valid.load(Ordering::Relaxed) + && acc.finalverify(Some(>sig)) + { + BLST_ERROR::BLST_SUCCESS + } else { + BLST_ERROR::BLST_VERIFY_FAIL + } + } + + // pks are assumed to be verified for proof of possession, + // which implies that they are already group-checked + pub fn fast_aggregate_verify( + &self, + sig_groupcheck: bool, + msg: &[u8], + dst: &[u8], + pks: &[&PublicKey], + ) -> BLST_ERROR { + let agg_pk = match AggregatePublicKey::aggregate(pks, false) { + Ok(agg_sig) => agg_sig, + Err(err) => return err, + }; + let pk = agg_pk.to_public_key(); + self.aggregate_verify( + sig_groupcheck, + &[msg], + dst, + &[&pk], + false, + ) + } + + pub fn fast_aggregate_verify_pre_aggregated( + &self, + sig_groupcheck: bool, + msg: &[u8], + dst: &[u8], + pk: &PublicKey, + ) -> BLST_ERROR { + self.aggregate_verify(sig_groupcheck, &[msg], dst, &[pk], false) + } + + // https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407 + #[cfg(feature = "std")] + #[allow(clippy::too_many_arguments)] + pub fn verify_multiple_aggregate_signatures( + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + pks_validate: bool, + sigs: &[&Signature], + sigs_groupcheck: bool, + rands: &[blst_scalar], + rand_bits: usize, + ) -> BLST_ERROR { + let n_elems = pks.len(); + if n_elems == 0 + || msgs.len() != n_elems + || sigs.len() != n_elems + || rands.len() != n_elems + { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + + // TODO - check msg uniqueness? + + let pool = mt::da_pool(); + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + let valid = Arc::new(AtomicBool::new(true)); + + let n_workers = core::cmp::min(pool.max_count(), n_elems); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + let valid = valid.clone(); + + pool.joined_execute(move || { + let mut pairing = Pairing::new($hash_or_encode, dst); + + // TODO - engage multi-point mul-n-add for larger + // amount of inputs... 
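+                    // Editor's note (descriptive, not in the original patch):
+                    // each worker drains indices from the shared atomic
+                    // counter, folds them into its thread-local Pairing via
+                    // mul_n_aggregate, commits, and sends the context back to
+                    // be merged by the caller.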
+ while valid.load(Ordering::Relaxed) { + let work = counter.fetch_add(1, Ordering::Relaxed); + if work >= n_elems { + break; + } + + if pairing.mul_n_aggregate( + &pks[work].point, + pks_validate, + &sigs[work].point, + sigs_groupcheck, + &rands[work].b, + rand_bits, + msgs[work], + &[], + ) != BLST_ERROR::BLST_SUCCESS + { + valid.store(false, Ordering::Relaxed); + break; + } + } + if valid.load(Ordering::Relaxed) { + pairing.commit(); + } + tx.send(pairing).expect("disaster"); + }); + } + + let mut acc = rx.recv().unwrap(); + for _ in 1..n_workers { + acc.merge(&rx.recv().unwrap()); + } + + if valid.load(Ordering::Relaxed) && acc.finalverify(None) { + BLST_ERROR::BLST_SUCCESS + } else { + BLST_ERROR::BLST_VERIFY_FAIL + } + } + + #[cfg(not(feature = "std"))] + #[allow(clippy::too_many_arguments)] + pub fn verify_multiple_aggregate_signatures( + msgs: &[&[u8]], + dst: &[u8], + pks: &[&PublicKey], + pks_validate: bool, + sigs: &[&Signature], + sigs_groupcheck: bool, + rands: &[blst_scalar], + rand_bits: usize, + ) -> BLST_ERROR { + let n_elems = pks.len(); + if n_elems == 0 + || msgs.len() != n_elems + || sigs.len() != n_elems + || rands.len() != n_elems + { + return BLST_ERROR::BLST_VERIFY_FAIL; + } + + // TODO - check msg uniqueness? + + let mut pairing = Pairing::new($hash_or_encode, dst); + + for i in 0..n_elems { + let err = pairing.mul_n_aggregate( + &pks[i].point, + pks_validate, + &sigs[i].point, + sigs_groupcheck, + &rands[i].b, + rand_bits, + msgs[i], + &[], + ); + if err != BLST_ERROR::BLST_SUCCESS { + return err; + } + } + + pairing.commit(); + + if pairing.finalverify(None) { + BLST_ERROR::BLST_SUCCESS + } else { + BLST_ERROR::BLST_VERIFY_FAIL + } + } + + pub fn from_aggregate(agg_sig: &AggregateSignature) -> Self { + let mut sig_aff = <$sig_aff>::default(); + unsafe { + $sig_to_aff(&mut sig_aff, &agg_sig.point); + } + Self { point: sig_aff } + } + + pub fn compress(&self) -> [u8; $sig_comp_size] { + let mut sig_comp = [0; $sig_comp_size]; + unsafe { + $sig_comp(sig_comp.as_mut_ptr(), &self.point); + } + sig_comp + } + + pub fn serialize(&self) -> [u8; $sig_ser_size] { + let mut sig_out = [0; $sig_ser_size]; + unsafe { + $sig_ser(sig_out.as_mut_ptr(), &self.point); + } + sig_out + } + + pub fn uncompress(sig_comp: &[u8]) -> Result { + if sig_comp.len() == $sig_comp_size && (sig_comp[0] & 0x80) != 0 + { + let mut sig = <$sig_aff>::default(); + let err = + unsafe { $sig_uncomp(&mut sig, sig_comp.as_ptr()) }; + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { point: sig }) + } else { + Err(BLST_ERROR::BLST_BAD_ENCODING) + } + } + + pub fn deserialize(sig_in: &[u8]) -> Result { + if (sig_in.len() == $sig_ser_size && (sig_in[0] & 0x80) == 0) + || (sig_in.len() == $sig_comp_size + && (sig_in[0] & 0x80) != 0) + { + let mut sig = <$sig_aff>::default(); + let err = unsafe { $sig_deser(&mut sig, sig_in.as_ptr()) }; + if err != BLST_ERROR::BLST_SUCCESS { + return Err(err); + } + Ok(Self { point: sig }) + } else { + Err(BLST_ERROR::BLST_BAD_ENCODING) + } + } + + pub fn from_bytes(sig_in: &[u8]) -> Result { + Signature::deserialize(sig_in) + } + + pub fn to_bytes(&self) -> [u8; $sig_comp_size] { + self.compress() + } + + pub fn subgroup_check(&self) -> bool { + unsafe { $sig_in_group(&self.point) } + } + } + + // Trait for equality comparisons which are equivalence relations. + // + // This means, that in addition to a == b and a != b being strict + // inverses, the equality must be reflexive, symmetric and transitive. 
+ impl Eq for Signature {} + + impl PartialEq for Signature { + fn eq(&self, other: &Self) -> bool { + unsafe { $sig_eq(&self.point, &other.point) } + } + } + + #[cfg(feature = "serde")] + impl Serialize for Signature { + fn serialize( + &self, + ser: S, + ) -> Result { + ser.serialize_bytes(&self.serialize()) + } + } + + #[cfg(feature = "serde")] + impl<'de> Deserialize<'de> for Signature { + fn deserialize>( + deser: D, + ) -> Result { + let bytes: &[u8] = Deserialize::deserialize(deser)?; + Self::deserialize(&bytes).map_err(|e| { + ::custom(format!("{:?}", e)) + }) + } + } + + #[repr(transparent)] + #[derive(Debug, Clone, Copy)] + pub struct AggregateSignature { + point: $sig, + } + + impl AggregateSignature { + pub fn validate(&self) -> Result<(), BLST_ERROR> { + unsafe { + if !$sig_aggr_in_group(&self.point) { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + } + Ok(()) + } + + pub fn from_signature(sig: &Signature) -> Self { + let mut agg_sig = <$sig>::default(); + unsafe { + $sig_from_aff(&mut agg_sig, &sig.point); + } + Self { point: agg_sig } + } + + pub fn to_signature(&self) -> Signature { + let mut sig = <$sig_aff>::default(); + unsafe { + $sig_to_aff(&mut sig, &self.point); + } + Signature { point: sig } + } + + // Aggregate + pub fn aggregate( + sigs: &[&Signature], + sigs_groupcheck: bool, + ) -> Result { + if sigs.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + if sigs_groupcheck { + // We can't actually judge if input is individual or + // aggregated signature, so we can't enforce infinity + // check. + sigs[0].validate(false)?; + } + let mut agg_sig = AggregateSignature::from_signature(sigs[0]); + for s in sigs.iter().skip(1) { + if sigs_groupcheck { + s.validate(false)?; + } + unsafe { + $sig_add_or_dbl_aff( + &mut agg_sig.point, + &agg_sig.point, + &s.point, + ); + } + } + Ok(agg_sig) + } + + pub fn aggregate_with_randomness( + sigs: &[Signature], + randomness: &[u8], + nbits: usize, + sigs_groupcheck: bool, + ) -> Result { + if sigs.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + if sigs_groupcheck { + sigs.validate()?; + } + Ok(sigs.mult(randomness, nbits)) + } + + pub fn aggregate_serialized( + sigs: &[&[u8]], + sigs_groupcheck: bool, + ) -> Result { + // TODO - threading + if sigs.len() == 0 { + return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); + } + let mut sig = if sigs_groupcheck { + Signature::sig_validate(sigs[0], false)? + } else { + Signature::from_bytes(sigs[0])? + }; + let mut agg_sig = AggregateSignature::from_signature(&sig); + for s in sigs.iter().skip(1) { + sig = if sigs_groupcheck { + Signature::sig_validate(s, false)? + } else { + Signature::from_bytes(s)? 
+ }; + unsafe { + $sig_add_or_dbl_aff( + &mut agg_sig.point, + &agg_sig.point, + &sig.point, + ); + } + } + Ok(agg_sig) + } + + pub fn add_aggregate(&mut self, agg_sig: &AggregateSignature) { + unsafe { + $sig_add_or_dbl( + &mut self.point, + &self.point, + &agg_sig.point, + ); + } + } + + pub fn add_signature( + &mut self, + sig: &Signature, + sig_groupcheck: bool, + ) -> Result<(), BLST_ERROR> { + if sig_groupcheck { + sig.validate(false)?; + } + unsafe { + $sig_add_or_dbl_aff( + &mut self.point, + &self.point, + &sig.point, + ); + } + Ok(()) + } + + pub fn subgroup_check(&self) -> bool { + unsafe { $sig_aggr_in_group(&self.point) } + } + } + + impl MultiPoint for [PublicKey] { + type Output = AggregatePublicKey; + + fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output { + Self::Output { + point: unsafe { transmute::<&[_], &[$pk_aff]>(self) } + .mult(scalars, nbits), + } + } + + fn add(&self) -> Self::Output { + Self::Output { + point: unsafe { transmute::<&[_], &[$pk_aff]>(self) } + .add(), + } + } + + fn validate(&self) -> Result<(), BLST_ERROR> { + unsafe { transmute::<&[_], &[$pk_aff]>(self) }.validate() + } + } + + impl MultiPoint for [Signature] { + type Output = AggregateSignature; + + fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output { + Self::Output { + point: unsafe { transmute::<&[_], &[$sig_aff]>(self) } + .mult(scalars, nbits), + } + } + + fn add(&self) -> Self::Output { + Self::Output { + point: unsafe { transmute::<&[_], &[$sig_aff]>(self) } + .add(), + } + } + + fn validate(&self) -> Result<(), BLST_ERROR> { + unsafe { transmute::<&[_], &[$sig_aff]>(self) }.validate() + } + } + + #[cfg(test)] + mod tests { + use super::*; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + // Testing only - do not use for production + pub fn gen_random_key( + rng: &mut rand_chacha::ChaCha20Rng, + ) -> SecretKey { + let mut ikm = [0u8; 32]; + rng.fill_bytes(&mut ikm); + + let mut sk = ::default(); + unsafe { + blst_keygen(&mut sk, ikm.as_ptr(), 32, ptr::null(), 0); + } + SecretKey { value: sk } + } + + #[test] + fn test_sign_n_verify() { + let ikm: [u8; 32] = [ + 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, + 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, + 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, + 0x48, 0x99, + ]; + + let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); + let pk = sk.sk_to_pk(); + + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; + let msg = b"hello foo"; + let sig = sk.sign(msg, dst, &[]); + + let err = sig.verify(true, msg, dst, &[], &pk, true); + assert_eq!(err, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_aggregate() { + let num_msgs = 10; + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sks: Vec<_> = + (0..num_msgs).map(|_| gen_random_key(&mut rng)).collect(); + let pks = + sks.iter().map(|sk| sk.sk_to_pk()).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + let pks_rev: Vec<&PublicKey> = + pks.iter().rev().map(|pk| pk).collect(); + + let pk_comp = pks[0].compress(); + let pk_uncomp = PublicKey::uncompress(&pk_comp); + assert_eq!(pk_uncomp.is_ok(), true); + + let mut msgs: Vec> = vec![vec![]; num_msgs]; + for i in 0..num_msgs { + let msg_len = (rng.next_u64() & 0x3F) + 1; + msgs[i] = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msgs[i]); + } + + let msgs_refs: Vec<&[u8]> = + msgs.iter().map(|m| m.as_slice()).collect(); + + let sigs = sks 
+ .iter() + .zip(msgs.iter()) + .map(|(sk, m)| (sk.sign(m, dst, &[]))) + .collect::>(); + + let mut errs = sigs + .iter() + .zip(msgs.iter()) + .zip(pks.iter()) + .map(|((s, m), pk)| (s.verify(true, m, dst, &[], pk, true))) + .collect::>(); + assert_eq!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); + + // Swap message/public key pairs to create bad signature + errs = sigs + .iter() + .zip(msgs.iter()) + .zip(pks.iter().rev()) + .map(|((s, m), pk)| (s.verify(true, m, dst, &[], pk, true))) + .collect::>(); + assert_ne!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); + + let sig_refs = + sigs.iter().map(|s| s).collect::>(); + let agg = match AggregateSignature::aggregate(&sig_refs, true) { + Ok(agg) => agg, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + + let agg_sig = agg.to_signature(); + let mut result = agg_sig + .aggregate_verify(false, &msgs_refs, dst, &pks_refs, false); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // Swap message/public key pairs to create bad signature + result = agg_sig + .aggregate_verify(false, &msgs_refs, dst, &pks_rev, false); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_multiple_agg_sigs() { + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; + let num_pks_per_sig = 10; + let num_sigs = 10; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let mut msgs: Vec> = vec![vec![]; num_sigs]; + let mut sigs: Vec = Vec::with_capacity(num_sigs); + let mut pks: Vec = Vec::with_capacity(num_sigs); + let mut rands: Vec = Vec::with_capacity(num_sigs); + for i in 0..num_sigs { + // Create public keys + let sks_i: Vec<_> = (0..num_pks_per_sig) + .map(|_| gen_random_key(&mut rng)) + .collect(); + + let pks_i = sks_i + .iter() + .map(|sk| sk.sk_to_pk()) + .collect::>(); + let pks_refs_i: Vec<&PublicKey> = + pks_i.iter().map(|pk| pk).collect(); + + // Create random message for pks to all sign + let msg_len = (rng.next_u64() & 0x3F) + 1; + msgs[i] = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msgs[i]); + + // Generate signature for each key pair + let sigs_i = sks_i + .iter() + .map(|sk| sk.sign(&msgs[i], dst, &[])) + .collect::>(); + + // Test each current single signature + let errs = sigs_i + .iter() + .zip(pks_i.iter()) + .map(|(s, pk)| { + (s.verify(true, &msgs[i], dst, &[], pk, true)) + }) + .collect::>(); + assert_eq!( + errs, + vec![BLST_ERROR::BLST_SUCCESS; num_pks_per_sig] + ); + + let sig_refs_i = + sigs_i.iter().map(|s| s).collect::>(); + let agg_i = + match AggregateSignature::aggregate(&sig_refs_i, false) + { + Ok(agg_i) => agg_i, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + + // Test current aggregate signature + sigs.push(agg_i.to_signature()); + let mut result = sigs[i].fast_aggregate_verify( + false, + &msgs[i], + dst, + &pks_refs_i, + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // negative test + if i != 0 { + result = sigs[i - 1].fast_aggregate_verify( + false, + &msgs[i], + dst, + &pks_refs_i, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + // aggregate public keys and push into vec + let agg_pk_i = + match AggregatePublicKey::aggregate(&pks_refs_i, false) + { + Ok(agg_pk_i) => agg_pk_i, + Err(err) => panic!("aggregate failure: {:?}", err), + }; + pks.push(agg_pk_i.to_public_key()); + + // Test current aggregate signature with aggregated pks + result = sigs[i].fast_aggregate_verify_pre_aggregated( + false, &msgs[i], dst, &pks[i], + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // negative test + if i != 0 { + result = 
sigs[i - 1] + .fast_aggregate_verify_pre_aggregated( + false, &msgs[i], dst, &pks[i], + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + // create random values + let mut vals = [0u64; 4]; + vals[0] = rng.next_u64(); + while vals[0] == 0 { + // Reject zero as it is used for multiplication. + vals[0] = rng.next_u64(); + } + let mut rand_i = MaybeUninit::::uninit(); + unsafe { + blst_scalar_from_uint64( + rand_i.as_mut_ptr(), + vals.as_ptr(), + ); + rands.push(rand_i.assume_init()); + } + } + + let msgs_refs: Vec<&[u8]> = + msgs.iter().map(|m| m.as_slice()).collect(); + let sig_refs = + sigs.iter().map(|s| s).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + + let msgs_rev: Vec<&[u8]> = + msgs.iter().rev().map(|m| m.as_slice()).collect(); + let sig_rev = + sigs.iter().rev().map(|s| s).collect::>(); + let pks_rev: Vec<&PublicKey> = + pks.iter().rev().map(|pk| pk).collect(); + + let mut result = + Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_refs, false, &sig_refs, true, + &rands, 64, + ); + assert_eq!(result, BLST_ERROR::BLST_SUCCESS); + + // negative tests (use reverse msgs, pks, and sigs) + result = Signature::verify_multiple_aggregate_signatures( + &msgs_rev, dst, &pks_refs, false, &sig_refs, true, &rands, + 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + + result = Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_rev, false, &sig_refs, true, &rands, + 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + + result = Signature::verify_multiple_aggregate_signatures( + &msgs_refs, dst, &pks_refs, false, &sig_rev, true, &rands, + 64, + ); + assert_ne!(result, BLST_ERROR::BLST_SUCCESS); + } + + #[test] + fn test_serialization() { + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + let sk = gen_random_key(&mut rng); + let sk2 = gen_random_key(&mut rng); + + let pk = sk.sk_to_pk(); + let pk_comp = pk.compress(); + let pk_ser = pk.serialize(); + + let pk_uncomp = PublicKey::uncompress(&pk_comp); + assert_eq!(pk_uncomp.is_ok(), true); + assert_eq!(pk_uncomp.unwrap(), pk); + + let pk_deser = PublicKey::deserialize(&pk_ser); + assert_eq!(pk_deser.is_ok(), true); + assert_eq!(pk_deser.unwrap(), pk); + + let pk2 = sk2.sk_to_pk(); + let pk_comp2 = pk2.compress(); + let pk_ser2 = pk2.serialize(); + + let pk_uncomp2 = PublicKey::uncompress(&pk_comp2); + assert_eq!(pk_uncomp2.is_ok(), true); + assert_eq!(pk_uncomp2.unwrap(), pk2); + + let pk_deser2 = PublicKey::deserialize(&pk_ser2); + assert_eq!(pk_deser2.is_ok(), true); + assert_eq!(pk_deser2.unwrap(), pk2); + + assert_ne!(pk, pk2); + assert_ne!(pk_uncomp.unwrap(), pk2); + assert_ne!(pk_deser.unwrap(), pk2); + assert_ne!(pk_uncomp2.unwrap(), pk); + assert_ne!(pk_deser2.unwrap(), pk); + } + + #[cfg(feature = "serde")] + #[test] + fn test_serde() { + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + // generate a sk, pk, and sig, and make sure it signs + let sk = gen_random_key(&mut rng); + let pk = sk.sk_to_pk(); + let sig = sk.sign(b"asdf", b"qwer", b"zxcv"); + assert_eq!( + sig.verify(true, b"asdf", b"qwer", b"zxcv", &pk, true), + BLST_ERROR::BLST_SUCCESS + ); + + // roundtrip through serde + let pk_ser = + rmp_serde::encode::to_vec_named(&pk).expect("ser pk"); + let sig_ser = + rmp_serde::encode::to_vec_named(&sig).expect("ser sig"); + let pk_des: PublicKey = + rmp_serde::decode::from_slice(&pk_ser).expect("des pk"); + let sig_des: Signature = + rmp_serde::decode::from_slice(&sig_ser).expect("des 
sig"); + + // check that we got back the right things + assert_eq!(pk, pk_des); + assert_eq!(sig, sig_des); + assert_eq!( + sig.verify(true, b"asdf", b"qwer", b"zxcv", &pk_des, true), + BLST_ERROR::BLST_SUCCESS + ); + assert_eq!( + sig_des.verify(true, b"asdf", b"qwer", b"zxcv", &pk, true), + BLST_ERROR::BLST_SUCCESS + ); + assert_eq!(sk.sign(b"asdf", b"qwer", b"zxcv"), sig_des); + + #[cfg(feature = "serde-secret")] + if true { + let sk_ser = + rmp_serde::encode::to_vec_named(&sk).expect("ser sk"); + let sk_des: SecretKey = + rmp_serde::decode::from_slice(&sk_ser).expect("des sk"); + // BLS signatures are deterministic, so this establishes + // that sk == sk_des + assert_eq!(sk_des.sign(b"asdf", b"qwer", b"zxcv"), sig); + } + } + + #[test] + fn test_multi_point() { + let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; + let num_pks = 13; + + let seed = [0u8; 32]; + let mut rng = ChaCha20Rng::from_seed(seed); + + // Create public keys + let sks: Vec<_> = + (0..num_pks).map(|_| gen_random_key(&mut rng)).collect(); + + let pks = + sks.iter().map(|sk| sk.sk_to_pk()).collect::>(); + let pks_refs: Vec<&PublicKey> = + pks.iter().map(|pk| pk).collect(); + + // Create random message for pks to all sign + let msg_len = (rng.next_u64() & 0x3F) + 1; + let mut msg = vec![0u8; msg_len as usize]; + rng.fill_bytes(&mut msg); + + // Generate signature for each key pair + let sigs = sks + .iter() + .map(|sk| sk.sign(&msg, dst, &[])) + .collect::>(); + let sigs_refs: Vec<&Signature> = + sigs.iter().map(|s| s).collect(); + + // create random values + let mut rands: Vec = Vec::with_capacity(8 * num_pks); + for _ in 0..num_pks { + let mut r = rng.next_u64(); + while r == 0 { + // Reject zero as it is used for multiplication. + r = rng.next_u64(); + } + rands.extend_from_slice(&r.to_le_bytes()); + } + + // Sanity test each current single signature + let errs = sigs + .iter() + .zip(pks.iter()) + .map(|(s, pk)| (s.verify(true, &msg, dst, &[], pk, true))) + .collect::>(); + assert_eq!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_pks]); + + // sanity test aggregated signature + let agg_pk = AggregatePublicKey::aggregate(&pks_refs, false) + .unwrap() + .to_public_key(); + let agg_sig = AggregateSignature::aggregate(&sigs_refs, false) + .unwrap() + .to_signature(); + let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); + assert_eq!(err, BLST_ERROR::BLST_SUCCESS); + + // test multi-point aggregation using add + let agg_pk = pks.add().to_public_key(); + let agg_sig = sigs.add().to_signature(); + let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); + assert_eq!(err, BLST_ERROR::BLST_SUCCESS); + + // test multi-point aggregation using mult + let agg_pk = pks.mult(&rands, 64).to_public_key(); + let agg_sig = sigs.mult(&rands, 64).to_signature(); + let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); + assert_eq!(err, BLST_ERROR::BLST_SUCCESS); + } + } + }; +} + +pub mod min_pk { + use super::*; + + sig_variant_impl!( + "MinPk", + blst_p1, + blst_p1_affine, + blst_p2, + blst_p2_affine, + blst_sk_to_pk2_in_g1, + true, + blst_hash_to_g2, + blst_sign_pk2_in_g1, + blst_p1_affine_is_equal, + blst_p2_affine_is_equal, + blst_core_verify_pk_in_g1, + blst_p1_affine_in_g1, + blst_p1_to_affine, + blst_p1_from_affine, + blst_p1_affine_serialize, + blst_p1_affine_compress, + blst_p1_deserialize, + blst_p1_uncompress, + 48, + 96, + blst_p2_affine_in_g2, + blst_p2_to_affine, + blst_p2_from_affine, + blst_p2_affine_serialize, + blst_p2_affine_compress, + blst_p2_deserialize, + blst_p2_uncompress, + 96, + 
192, + blst_p1_add_or_double, + blst_p1_add_or_double_affine, + blst_p2_add_or_double, + blst_p2_add_or_double_affine, + blst_p1_affine_is_inf, + blst_p2_affine_is_inf, + blst_p2_in_g2, + ); +} + +pub mod min_sig { + use super::*; + + sig_variant_impl!( + "MinSig", + blst_p2, + blst_p2_affine, + blst_p1, + blst_p1_affine, + blst_sk_to_pk2_in_g2, + true, + blst_hash_to_g1, + blst_sign_pk2_in_g2, + blst_p2_affine_is_equal, + blst_p1_affine_is_equal, + blst_core_verify_pk_in_g2, + blst_p2_affine_in_g2, + blst_p2_to_affine, + blst_p2_from_affine, + blst_p2_affine_serialize, + blst_p2_affine_compress, + blst_p2_deserialize, + blst_p2_uncompress, + 96, + 192, + blst_p1_affine_in_g1, + blst_p1_to_affine, + blst_p1_from_affine, + blst_p1_affine_serialize, + blst_p1_affine_compress, + blst_p1_deserialize, + blst_p1_uncompress, + 48, + 96, + blst_p2_add_or_double, + blst_p2_add_or_double_affine, + blst_p1_add_or_double, + blst_p1_add_or_double_affine, + blst_p2_affine_is_inf, + blst_p1_affine_is_inf, + blst_p1_in_g1, + ); +} + +pub trait MultiPoint { + type Output; + + fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output; + fn add(&self) -> Self::Output; + fn validate(&self) -> Result<(), BLST_ERROR> { + Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP) + } +} + +#[cfg(feature = "std")] +include!("pippenger.rs"); + +#[cfg(not(feature = "std"))] +include!("pippenger-no_std.rs"); + +#[cfg(test)] +mod fp12_test { + use super::*; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + #[test] + fn miller_loop_n() { + const npoints: usize = 97; + const nbits: usize = 64; + const nbytes: usize = (nbits + 7) / 8; + + let mut scalars = Box::new([0u8; nbytes * npoints]); + ChaCha20Rng::from_entropy().fill_bytes(scalars.as_mut()); + + let mut p1s: Vec = Vec::with_capacity(npoints); + let mut p2s: Vec = Vec::with_capacity(npoints); + + unsafe { + p1s.set_len(npoints); + p2s.set_len(npoints); + + for i in 0..npoints { + blst_p1_mult( + &mut p1s[i], + blst_p1_generator(), + &scalars[i * nbytes], + 32, + ); + blst_p2_mult( + &mut p2s[i], + blst_p2_generator(), + &scalars[i * nbytes + 4], + 32, + ); + } + } + + let ps = p1_affines::from(&p1s); + let qs = p2_affines::from(&p2s); + + let mut naive = blst_fp12::default(); + for i in 0..npoints { + naive *= blst_fp12::miller_loop(&qs[i], &ps[i]); + } + + assert_eq!( + naive, + blst_fp12::miller_loop_n(qs.as_slice(), ps.as_slice()) + ); + } +} + +#[cfg(test)] +mod sk_test { + use super::*; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + #[test] + fn inverse() { + let mut bytes = [0u8; 64]; + ChaCha20Rng::from_entropy().fill_bytes(bytes.as_mut()); + + let mut sk = blst_scalar::default(); + let mut p1 = blst_p1::default(); + let mut p2 = blst_p2::default(); + + unsafe { + blst_scalar_from_be_bytes(&mut sk, bytes.as_ptr(), bytes.len()); + + blst_p1_mult(&mut p1, blst_p1_generator(), sk.b.as_ptr(), 255); + blst_sk_inverse(&mut sk, &sk); + blst_p1_mult(&mut p1, &p1, sk.b.as_ptr(), 255); + + blst_p2_mult(&mut p2, blst_p2_generator(), sk.b.as_ptr(), 255); + blst_sk_inverse(&mut sk, &sk); + blst_p2_mult(&mut p2, &p2, sk.b.as_ptr(), 255); + } + + assert_eq!(p1, unsafe { *blst_p1_generator() }); + assert_eq!(p2, unsafe { *blst_p2_generator() }); + } +} diff --git a/src/blst/bindings/rust/src/pippenger-no_std.rs b/src/blst/bindings/rust/src/pippenger-no_std.rs new file mode 100644 index 0000000000..10f48bece6 --- /dev/null +++ b/src/blst/bindings/rust/src/pippenger-no_std.rs @@ -0,0 +1,179 @@ +// Copyright Supranational LLC +// Licensed 
under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +use core::ops::{Index, IndexMut}; +use core::slice::SliceIndex; + +macro_rules! pippenger_mult_impl { + ( + $points:ident, + $point:ty, + $point_affine:ty, + $to_affines:ident, + $scratch_sizeof:ident, + $multi_scalar_mult:ident, + $tile_mult:ident, + $add_or_double:ident, + $double:ident, + $test_mod:ident, + $generator:ident, + $mult:ident, + $add:ident, + $is_inf:ident, + $in_group:ident, + ) => { + pub struct $points { + points: Vec<$point_affine>, + } + + impl> Index for $points { + type Output = I::Output; + + #[inline] + fn index(&self, i: I) -> &Self::Output { + &self.points[i] + } + } + impl> IndexMut for $points { + #[inline] + fn index_mut(&mut self, i: I) -> &mut Self::Output { + &mut self.points[i] + } + } + + impl $points { + #[inline] + pub fn as_slice(&self) -> &[$point_affine] { + self.points.as_slice() + } + + pub fn from(points: &[$point]) -> Self { + let npoints = points.len(); + let mut ret = Self { + points: Vec::with_capacity(npoints), + }; + #[allow(clippy::uninit_vec)] + unsafe { ret.points.set_len(npoints) }; + + let p: [*const $point; 2] = [&points[0], ptr::null()]; + unsafe { $to_affines(&mut ret.points[0], &p[0], npoints) }; + ret + } + + #[inline] + pub fn mult(&self, scalars: &[u8], nbits: usize) -> $point { + self.as_slice().mult(scalars, nbits) + } + + #[inline] + pub fn add(&self) -> $point { + self.as_slice().add() + } + } + + impl MultiPoint for [$point_affine] { + type Output = $point; + + fn mult(&self, scalars: &[u8], nbits: usize) -> $point { + let npoints = self.len(); + let nbytes = (nbits + 7) / 8; + + if scalars.len() < nbytes * npoints { + panic!("scalars length mismatch"); + } + + let p: [*const $point_affine; 2] = [&self[0], ptr::null()]; + let s: [*const u8; 2] = [&scalars[0], ptr::null()]; + + let mut ret = <$point>::default(); + unsafe { + let mut scratch: Vec = + Vec::with_capacity($scratch_sizeof(npoints) / 8); + #[allow(clippy::uninit_vec)] + scratch.set_len(scratch.capacity()); + $multi_scalar_mult( + &mut ret, + &p[0], + npoints, + &s[0], + nbits, + &mut scratch[0], + ); + } + ret + } + + fn add(&self) -> $point { + let npoints = self.len(); + + let p: [*const _; 2] = [&self[0], ptr::null()]; + let mut ret = <$point>::default(); + unsafe { $add(&mut ret, &p[0], npoints) }; + + ret + } + + fn validate(&self) -> Result<(), BLST_ERROR> { + for i in 0..self.len() { + if unsafe { $is_inf(&self[i]) } { + return Err(BLST_ERROR::BLST_PK_IS_INFINITY); + } + if !unsafe { $in_group(&self[i]) } { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + } + Ok(()) + } + } + + #[cfg(test)] + pippenger_test_mod!( + $test_mod, + $points, + $point, + $add_or_double, + $generator, + $mult, + ); + }; +} + +#[cfg(test)] +include!("pippenger-test_mod.rs"); + +pippenger_mult_impl!( + p1_affines, + blst_p1, + blst_p1_affine, + blst_p1s_to_affine, + blst_p1s_mult_pippenger_scratch_sizeof, + blst_p1s_mult_pippenger, + blst_p1s_tile_pippenger, + blst_p1_add_or_double, + blst_p1_double, + p1_multi_point, + blst_p1_generator, + blst_p1_mult, + blst_p1s_add, + blst_p1_affine_is_inf, + blst_p1_affine_in_g1, +); + +pippenger_mult_impl!( + p2_affines, + blst_p2, + blst_p2_affine, + blst_p2s_to_affine, + blst_p2s_mult_pippenger_scratch_sizeof, + blst_p2s_mult_pippenger, + blst_p2s_tile_pippenger, + blst_p2_add_or_double, + blst_p2_double, + p2_multi_point, + blst_p2_generator, + blst_p2_mult, + blst_p2s_add, + blst_p2_affine_is_inf, + blst_p2_affine_in_g2, +); 
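Editor's note (illustrative only, not part of the patch): a minimal sketch of how the multi-point API generated above is typically driven, assuming the bindings are consumed as the `blst` crate and using the crate-level names `blst_p1`, `blst_p1_generator`, `blst_p1_mult` and the `p1_affines` wrapper produced by `pippenger_mult_impl!`. It mirrors the pattern of the tests in this patch: derive some points, batch-convert them to affine, then run a Pippenger multi-scalar multiplication over 64-bit scalars.

use blst::{blst_p1, blst_p1_generator, blst_p1_mult, p1_affines};

fn msm_sketch(scalars: &[u8]) -> blst_p1 {
    const NBITS: usize = 64;
    const NBYTES: usize = NBITS / 8;
    let npoints = scalars.len() / NBYTES;
    assert!(npoints > 0, "expected at least one 8-byte scalar");

    // Toy setup: derive `npoints` points from the generator, reading the
    // first 32 bits of each scalar, as the tests above do.
    let mut points = vec![blst_p1::default(); npoints];
    for i in 0..npoints {
        unsafe {
            blst_p1_mult(
                &mut points[i],
                blst_p1_generator(),
                &scalars[i * NBYTES],
                32,
            );
        }
    }

    let affine = p1_affines::from(&points); // batch projective-to-affine
    affine.mult(scalars, NBITS)             // Pippenger multi-scalar multiplication
}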
diff --git a/src/blst/bindings/rust/src/pippenger-test_mod.rs b/src/blst/bindings/rust/src/pippenger-test_mod.rs new file mode 100644 index 0000000000..4874a12ee7 --- /dev/null +++ b/src/blst/bindings/rust/src/pippenger-test_mod.rs @@ -0,0 +1,85 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +macro_rules! pippenger_test_mod { + ( + $test_mod:ident, + $points:ident, + $point:ty, + $add_or_double:ident, + $generator:ident, + $mult:ident, + ) => { + mod $test_mod { + use super::*; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + #[test] + fn test_mult() { + const npoints: usize = 2000; + const nbits: usize = 160; + const nbytes: usize = (nbits + 7) / 8; + + let mut scalars = Box::new([0u8; nbytes * npoints]); + ChaCha20Rng::from_seed([0u8; 32]).fill_bytes(scalars.as_mut()); + + let mut points: Vec<$point> = Vec::with_capacity(npoints); + unsafe { points.set_len(points.capacity()) }; + + let mut naive = <$point>::default(); + for i in 0..npoints { + unsafe { + let mut t = <$point>::default(); + $mult( + &mut points[i], + $generator(), + &scalars[i * nbytes], + core::cmp::min(32, nbits), + ); + $mult(&mut t, &points[i], &scalars[i * nbytes], nbits); + $add_or_double(&mut naive, &naive, &t); + } + if i < 27 { + let points = $points::from(&points[0..i + 1]); + assert_eq!(naive, points.mult(scalars.as_ref(), nbits)); + } + } + + let points = $points::from(&points); + + assert_eq!(naive, points.mult(scalars.as_ref(), nbits)); + } + + #[test] + fn test_add() { + const npoints: usize = 2000; + const nbits: usize = 32; + const nbytes: usize = (nbits + 7) / 8; + + let mut scalars = Box::new([0u8; nbytes * npoints]); + ChaCha20Rng::from_seed([0u8; 32]).fill_bytes(scalars.as_mut()); + + let mut points: Vec<$point> = Vec::with_capacity(npoints); + unsafe { points.set_len(points.capacity()) }; + + let mut naive = <$point>::default(); + for i in 0..npoints { + unsafe { + $mult( + &mut points[i], + $generator(), + &scalars[i * nbytes], + 32, + ); + $add_or_double(&mut naive, &naive, &points[i]); + } + } + + let points = $points::from(&points); + assert_eq!(naive, points.add()); + } + } + }; +} diff --git a/src/blst/bindings/rust/src/pippenger.rs b/src/blst/bindings/rust/src/pippenger.rs new file mode 100644 index 0000000000..a3a9f50c08 --- /dev/null +++ b/src/blst/bindings/rust/src/pippenger.rs @@ -0,0 +1,550 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +use core::num::Wrapping; +use core::ops::{Index, IndexMut}; +use core::slice::SliceIndex; +use std::sync::Barrier; + +struct tile { + x: usize, + dx: usize, + y: usize, + dy: usize, +} + +// Minimalist core::cell::Cell stand-in, but with Sync marker, which +// makes it possible to pass it to multiple threads. It works, because +// *here* each Cell is written only once and by just one thread. +#[repr(transparent)] +struct Cell { + value: T, +} +unsafe impl Sync for Cell {} +impl Cell { + pub fn as_ptr(&self) -> *mut T { + &self.value as *const T as *mut T + } +} + +macro_rules! 
pippenger_mult_impl { + ( + $points:ident, + $point:ty, + $point_affine:ty, + $to_affines:ident, + $scratch_sizeof:ident, + $multi_scalar_mult:ident, + $tile_mult:ident, + $add_or_double:ident, + $double:ident, + $test_mod:ident, + $generator:ident, + $mult:ident, + $add:ident, + $is_inf:ident, + $in_group:ident, + $from_affine:ident, + ) => { + pub struct $points { + points: Vec<$point_affine>, + } + + impl> Index for $points { + type Output = I::Output; + + #[inline] + fn index(&self, i: I) -> &Self::Output { + &self.points[i] + } + } + impl> IndexMut for $points { + #[inline] + fn index_mut(&mut self, i: I) -> &mut Self::Output { + &mut self.points[i] + } + } + + impl $points { + #[inline] + pub fn as_slice(&self) -> &[$point_affine] { + self.points.as_slice() + } + + pub fn from(points: &[$point]) -> Self { + let npoints = points.len(); + let mut ret = Self { + points: Vec::with_capacity(npoints), + }; + unsafe { ret.points.set_len(npoints) }; + + let pool = mt::da_pool(); + let ncpus = pool.max_count(); + if ncpus < 2 || npoints < 768 { + let p: [*const $point; 2] = [&points[0], ptr::null()]; + unsafe { $to_affines(&mut ret.points[0], &p[0], npoints) }; + return ret; + } + + let mut nslices = (npoints + 511) / 512; + nslices = core::cmp::min(nslices, ncpus); + let wg = Arc::new((Barrier::new(2), AtomicUsize::new(nslices))); + + let (mut delta, mut rem) = + (npoints / nslices + 1, Wrapping(npoints % nslices)); + let mut x = 0usize; + while x < npoints { + let out = &mut ret.points[x]; + let inp = &points[x]; + + delta -= (rem == Wrapping(0)) as usize; + rem -= Wrapping(1); + x += delta; + + let wg = wg.clone(); + pool.joined_execute(move || { + let p: [*const $point; 2] = [inp, ptr::null()]; + unsafe { $to_affines(out, &p[0], delta) }; + if wg.1.fetch_sub(1, Ordering::AcqRel) == 1 { + wg.0.wait(); + } + }); + } + wg.0.wait(); + + ret + } + + #[inline] + pub fn mult(&self, scalars: &[u8], nbits: usize) -> $point { + self.as_slice().mult(scalars, nbits) + } + + #[inline] + pub fn add(&self) -> $point { + self.as_slice().add() + } + } + + impl MultiPoint for [$point_affine] { + type Output = $point; + + fn mult(&self, scalars: &[u8], nbits: usize) -> $point { + let npoints = self.len(); + let nbytes = (nbits + 7) / 8; + + if scalars.len() < nbytes * npoints { + panic!("scalars length mismatch"); + } + + let pool = mt::da_pool(); + let ncpus = pool.max_count(); + if ncpus < 2 { + let p: [*const $point_affine; 2] = [&self[0], ptr::null()]; + let s: [*const u8; 2] = [&scalars[0], ptr::null()]; + + unsafe { + let mut scratch: Vec = + Vec::with_capacity($scratch_sizeof(npoints) / 8); + #[allow(clippy::uninit_vec)] + scratch.set_len(scratch.capacity()); + let mut ret = <$point>::default(); + $multi_scalar_mult( + &mut ret, + &p[0], + npoints, + &s[0], + nbits, + &mut scratch[0], + ); + return ret; + } + } + + if npoints < 32 { + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + let n_workers = core::cmp::min(ncpus, npoints); + + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + + pool.joined_execute(move || { + let mut acc = <$point>::default(); + let mut tmp = <$point>::default(); + let mut first = true; + + loop { + let work = + counter.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + + unsafe { + $from_affine(&mut tmp, &self[work]); + let scalar = &scalars[nbytes * work]; + if first { + $mult(&mut acc, &tmp, scalar, nbits); + first = false; + } else { + $mult(&mut tmp, &tmp, scalar, nbits); + $add_or_double(&mut 
acc, &acc, &tmp); + } + } + } + + tx.send(acc).expect("disaster"); + }); + } + + let mut ret = rx.recv().expect("disaster"); + for _ in 1..n_workers { + let p = rx.recv().expect("disaster"); + unsafe { $add_or_double(&mut ret, &ret, &p) }; + } + + return ret; + } + + let (nx, ny, window) = + breakdown(nbits, pippenger_window_size(npoints), ncpus); + + // |grid[]| holds "coordinates" and place for result + let mut grid: Vec<(tile, Cell<$point>)> = + Vec::with_capacity(nx * ny); + #[allow(clippy::uninit_vec)] + unsafe { grid.set_len(grid.capacity()) }; + let dx = npoints / nx; + let mut y = window * (ny - 1); + let mut total = 0usize; + + while total < nx { + grid[total].0.x = total * dx; + grid[total].0.dx = dx; + grid[total].0.y = y; + grid[total].0.dy = nbits - y; + total += 1; + } + grid[total - 1].0.dx = npoints - grid[total - 1].0.x; + while y != 0 { + y -= window; + for i in 0..nx { + grid[total].0.x = grid[i].0.x; + grid[total].0.dx = grid[i].0.dx; + grid[total].0.y = y; + grid[total].0.dy = window; + total += 1; + } + } + let grid = &grid[..]; + + let points = &self[..]; + let sz = unsafe { $scratch_sizeof(0) / 8 }; + + let mut row_sync: Vec = Vec::with_capacity(ny); + row_sync.resize_with(ny, Default::default); + let row_sync = Arc::new(row_sync); + let counter = Arc::new(AtomicUsize::new(0)); + let (tx, rx) = channel(); + let n_workers = core::cmp::min(ncpus, total); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + let row_sync = row_sync.clone(); + + pool.joined_execute(move || { + let mut scratch = vec![0u64; sz << (window - 1)]; + let mut p: [*const $point_affine; 2] = + [ptr::null(), ptr::null()]; + let mut s: [*const u8; 2] = [ptr::null(), ptr::null()]; + + loop { + let work = counter.fetch_add(1, Ordering::Relaxed); + if work >= total { + break; + } + let x = grid[work].0.x; + let y = grid[work].0.y; + + p[0] = &points[x]; + s[0] = &scalars[x * nbytes]; + unsafe { + $tile_mult( + grid[work].1.as_ptr(), + &p[0], + grid[work].0.dx, + &s[0], + nbits, + &mut scratch[0], + y, + window, + ); + } + if row_sync[y / window] + .fetch_add(1, Ordering::AcqRel) + == nx - 1 + { + tx.send(y).expect("disaster"); + } + } + }); + } + + let mut ret = <$point>::default(); + let mut rows = vec![false; ny]; + let mut row = 0usize; + for _ in 0..ny { + let mut y = rx.recv().unwrap(); + rows[y / window] = true; + while grid[row].0.y == y { + while row < total && grid[row].0.y == y { + unsafe { + $add_or_double( + &mut ret, + &ret, + grid[row].1.as_ptr(), + ); + } + row += 1; + } + if y == 0 { + break; + } + for _ in 0..window { + unsafe { $double(&mut ret, &ret) }; + } + y -= window; + if !rows[y / window] { + break; + } + } + } + ret + } + + fn add(&self) -> $point { + let npoints = self.len(); + + let pool = mt::da_pool(); + let ncpus = pool.max_count(); + if ncpus < 2 || npoints < 384 { + let p: [*const _; 2] = [&self[0], ptr::null()]; + let mut ret = <$point>::default(); + unsafe { $add(&mut ret, &p[0], npoints) }; + return ret; + } + + let (tx, rx) = channel(); + let counter = Arc::new(AtomicUsize::new(0)); + let nchunks = (npoints + 255) / 256; + let chunk = npoints / nchunks + 1; + + let n_workers = core::cmp::min(ncpus, nchunks); + for _ in 0..n_workers { + let tx = tx.clone(); + let counter = counter.clone(); + + pool.joined_execute(move || { + let mut acc = <$point>::default(); + let mut chunk = chunk; + let mut p: [*const _; 2] = [ptr::null(), ptr::null()]; + + loop { + let work = + counter.fetch_add(chunk, Ordering::Relaxed); + if work >= npoints { + 
break; + } + p[0] = &self[work]; + if work + chunk > npoints { + chunk = npoints - work; + } + unsafe { + let mut t = MaybeUninit::<$point>::uninit(); + $add(t.as_mut_ptr(), &p[0], chunk); + $add_or_double(&mut acc, &acc, t.as_ptr()); + }; + } + tx.send(acc).expect("disaster"); + }); + } + + let mut ret = rx.recv().unwrap(); + for _ in 1..n_workers { + unsafe { + $add_or_double(&mut ret, &ret, &rx.recv().unwrap()) + }; + } + + ret + } + + fn validate(&self) -> Result<(), BLST_ERROR> { + fn check(point: &$point_affine) -> Result<(), BLST_ERROR> { + if unsafe { $is_inf(point) } { + return Err(BLST_ERROR::BLST_PK_IS_INFINITY); + } + if !unsafe { $in_group(point) } { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + Ok(()) + } + + let npoints = self.len(); + + let pool = mt::da_pool(); + let n_workers = core::cmp::min(npoints, pool.max_count()); + if n_workers < 2 { + for i in 0..npoints { + check(&self[i])? + } + return Ok(()) + } + + let counter = Arc::new(AtomicUsize::new(0)); + let valid = Arc::new(AtomicBool::new(true)); + let wg = + Arc::new((Barrier::new(2), AtomicUsize::new(n_workers))); + + for _ in 0..n_workers { + let counter = counter.clone(); + let valid = valid.clone(); + let wg = wg.clone(); + + pool.joined_execute(move || { + while valid.load(Ordering::Relaxed) { + let work = counter.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + + if check(&self[work]).is_err() { + valid.store(false, Ordering::Relaxed); + break; + } + } + + if wg.1.fetch_sub(1, Ordering::AcqRel) == 1 { + wg.0.wait(); + } + }); + } + + wg.0.wait(); + + if valid.load(Ordering::Relaxed) { + return Ok(()); + } else { + return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); + } + } + } + + #[cfg(test)] + pippenger_test_mod!( + $test_mod, + $points, + $point, + $add_or_double, + $generator, + $mult, + ); + }; +} + +#[cfg(test)] +include!("pippenger-test_mod.rs"); + +pippenger_mult_impl!( + p1_affines, + blst_p1, + blst_p1_affine, + blst_p1s_to_affine, + blst_p1s_mult_pippenger_scratch_sizeof, + blst_p1s_mult_pippenger, + blst_p1s_tile_pippenger, + blst_p1_add_or_double, + blst_p1_double, + p1_multi_point, + blst_p1_generator, + blst_p1_mult, + blst_p1s_add, + blst_p1_affine_is_inf, + blst_p1_affine_in_g1, + blst_p1_from_affine, +); + +pippenger_mult_impl!( + p2_affines, + blst_p2, + blst_p2_affine, + blst_p2s_to_affine, + blst_p2s_mult_pippenger_scratch_sizeof, + blst_p2s_mult_pippenger, + blst_p2s_tile_pippenger, + blst_p2_add_or_double, + blst_p2_double, + p2_multi_point, + blst_p2_generator, + blst_p2_mult, + blst_p2s_add, + blst_p2_affine_is_inf, + blst_p2_affine_in_g2, + blst_p2_from_affine, +); + +fn num_bits(l: usize) -> usize { + 8 * core::mem::size_of_val(&l) - l.leading_zeros() as usize +} + +fn breakdown( + nbits: usize, + window: usize, + ncpus: usize, +) -> (usize, usize, usize) { + let mut nx: usize; + let mut wnd: usize; + + if nbits > window * ncpus { + nx = 1; + wnd = num_bits(ncpus / 4); + if (window + wnd) > 18 { + wnd = window - wnd; + } else { + wnd = (nbits / window + ncpus - 1) / ncpus; + if (nbits / (window + 1) + ncpus - 1) / ncpus < wnd { + wnd = window + 1; + } else { + wnd = window; + } + } + } else { + nx = 2; + wnd = window - 2; + while (nbits / wnd + 1) * nx < ncpus { + nx += 1; + wnd = window - num_bits(3 * nx / 2); + } + nx -= 1; + wnd = window - num_bits(3 * nx / 2); + } + let ny = nbits / wnd + 1; + wnd = nbits / ny + 1; + + (nx, ny, wnd) +} + +fn pippenger_window_size(npoints: usize) -> usize { + let wbits = num_bits(npoints); + + if wbits > 13 { + return 
wbits - 4; + } + if wbits > 5 { + return wbits - 3; + } + 2 +} diff --git a/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json b/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json new file mode 100644 index 0000000000..cc3d17a350 --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json @@ -0,0 +1,90 @@ +{ + "L": "0x40", + "Z": "0xb", + "ciphersuite": "BLS12381G1_XMD:SHA-256_SSWU_NU_", + "curve": "BLS12-381 G1", + "dst": "QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_NU_", + "expand": "XMD", + "field": { + "m": "0x1", + "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" + }, + "hash": "sha256", + "k": "0x80", + "map": { + "name": "SSWU" + }, + "randomOracle": false, + "vectors": [ + { + "P": { + "x": "0x184bb665c37ff561a89ec2122dd343f20e0f4cbcaec84e3c3052ea81d1834e192c426074b02ed3dca4e7676ce4ce48ba", + "y": "0x04407b8d35af4dacc809927071fc0405218f1401a6d15af775810e4e460064bcc9468beeba82fdc751be70476c888bf3" + }, + "Q": { + "x": "0x11398d3b324810a1b093f8e35aa8571cced95858207e7f49c4fd74656096d61d8a2f9a23cdb18a4dd11cd1d66f41f709", + "y": "0x19316b6fb2ba7717355d5d66a361899057e1e84a6823039efc7beccefe09d023fb2713b1c415fcf278eb0c39a89b4f72" + }, + "msg": "", + "u": [ + "0x156c8a6a2c184569d69a76be144b5cdc5141d2d2ca4fe341f011e25e3969c55ad9e9b9ce2eb833c81a908e5fa4ac5f03" + ] + }, + { + "P": { + "x": "0x009769f3ab59bfd551d53a5f846b9984c59b97d6842b20a2c565baa167945e3d026a3755b6345df8ec7e6acb6868ae6d", + "y": "0x1532c00cf61aa3d0ce3e5aa20c3b531a2abd2c770a790a2613818303c6b830ffc0ecf6c357af3317b9575c567f11cd2c" + }, + "Q": { + "x": "0x1998321bc27ff6d71df3051b5aec12ff47363d81a5e9d2dff55f444f6ca7e7d6af45c56fd029c58237c266ef5cda5254", + "y": "0x034d274476c6307ae584f951c82e7ea85b84f72d28f4d6471732356121af8d62a49bc263e8eb913a6cf6f125995514ee" + }, + "msg": "abc", + "u": [ + "0x147e1ed29f06e4c5079b9d14fc89d2820d32419b990c1c7bb7dbea2a36a045124b31ffbde7c99329c05c559af1c6cc82" + ] + }, + { + "P": { + "x": "0x1974dbb8e6b5d20b84df7e625e2fbfecb2cdb5f77d5eae5fb2955e5ce7313cae8364bc2fff520a6c25619739c6bdcb6a", + "y": "0x15f9897e11c6441eaa676de141c8d83c37aab8667173cbe1dfd6de74d11861b961dccebcd9d289ac633455dfcc7013a3" + }, + "Q": { + "x": "0x17d502fa43bd6a4cad2859049a0c3ecefd60240d129be65da271a4c03a9c38fa78163b9d2a919d2beb57df7d609b4919", + "y": "0x109019902ae93a8732abecf2ff7fecd2e4e305eb91f41c9c3267f16b6c19de138c7272947f25512745da6c466cdfd1ac" + }, + "msg": "abcdef0123456789", + "u": [ + "0x04090815ad598a06897dd89bcda860f25837d54e897298ce31e6947378134d3761dc59a572154963e8c954919ecfa82d" + ] + }, + { + "P": { + "x": "0x0a7a047c4a8397b3446450642c2ac64d7239b61872c9ae7a59707a8f4f950f101e766afe58223b3bff3a19a7f754027c", + "y": "0x1383aebba1e4327ccff7cf9912bda0dbc77de048b71ef8c8a81111d71dc33c5e3aa6edee9cf6f5fe525d50cc50b77cc9" + }, + "Q": { + "x": "0x112eb92dd2b3aa9cd38b08de4bef603f2f9fb0ca226030626a9a2e47ad1e9847fe0a5ed13766c339e38f514bba143b21", + "y": "0x17542ce2f8d0a54f2c5ba8c4b14e10b22d5bcd7bae2af3c965c8c872b571058c720eac448276c99967ded2bf124490e1" + }, + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "u": [ + "0x08dccd088ca55b8bfbc96fb50bb25c592faa867a8bb78d4e94a8cc2c92306190244532e91feba2b7fed977e3c3bb5a1f" + ] + }, + { + "P": { + "x": "0x0e7a16a975904f131682edbb03d9560d3e48214c9986bd50417a77108d13dc957500edf96462a3d01e62dc6cd468ef11", + "y": 
"0x0ae89e677711d05c30a48d6d75e76ca9fb70fe06c6dd6ff988683d89ccde29ac7d46c53bb97a59b1901abf1db66052db" + }, + "Q": { + "x": "0x1775d400a1bacc1c39c355da7e96d2d1c97baa9430c4a3476881f8521c09a01f921f592607961efc99c4cd46bd78ca19", + "y": "0x1109b5d59f65964315de65a7a143e86eabc053104ed289cf480949317a5685fad7254ff8e7fe6d24d3104e5d55ad6370" + }, + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "u": [ + "0x0dd824886d2123a96447f6c56e3a3fa992fbfefdba17b6673f9f630ff19e4d326529db37e1c1be43f905bf9202e0278d" + ] + } + ] +} diff --git a/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json b/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json new file mode 100644 index 0000000000..46c7574f05 --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json @@ -0,0 +1,115 @@ +{ + "L": "0x40", + "Z": "0xb", + "ciphersuite": "BLS12381G1_XMD:SHA-256_SSWU_RO_", + "curve": "BLS12-381 G1", + "dst": "QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_", + "expand": "XMD", + "field": { + "m": "0x1", + "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" + }, + "hash": "sha256", + "k": "0x80", + "map": { + "name": "SSWU" + }, + "randomOracle": true, + "vectors": [ + { + "P": { + "x": "0x052926add2207b76ca4fa57a8734416c8dc95e24501772c814278700eed6d1e4e8cf62d9c09db0fac349612b759e79a1", + "y": "0x08ba738453bfed09cb546dbb0783dbb3a5f1f566ed67bb6be0e8c67e2e81a4cc68ee29813bb7994998f3eae0c9c6a265" + }, + "Q0": { + "x": "0x11a3cce7e1d90975990066b2f2643b9540fa40d6137780df4e753a8054d07580db3b7f1f03396333d4a359d1fe3766fe", + "y": "0x0eeaf6d794e479e270da10fdaf768db4c96b650a74518fc67b04b03927754bac66f3ac720404f339ecdcc028afa091b7" + }, + "Q1": { + "x": "0x160003aaf1632b13396dbad518effa00fff532f604de1a7fc2082ff4cb0afa2d63b2c32da1bef2bf6c5ca62dc6b72f9c", + "y": "0x0d8bb2d14e20cf9f6036152ed386d79189415b6d015a20133acb4e019139b94e9c146aaad5817f866c95d609a361735e" + }, + "msg": "", + "u": [ + "0x0ba14bd907ad64a016293ee7c2d276b8eae71f25a4b941eece7b0d89f17f75cb3ae5438a614fb61d6835ad59f29c564f", + "0x019b9bd7979f12657976de2884c7cce192b82c177c80e0ec604436a7f538d231552f0d96d9f7babe5fa3b19b3ff25ac9" + ] + }, + { + "P": { + "x": "0x03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", + "y": "0x0b9c15f3fe6e5cf4211f346271d7b01c8f3b28be689c8429c85b67af215533311f0b8dfaaa154fa6b88176c229f2885d" + }, + "Q0": { + "x": "0x125435adce8e1cbd1c803e7123f45392dc6e326d292499c2c45c5865985fd74fe8f042ecdeeec5ecac80680d04317d80", + "y": "0x0e8828948c989126595ee30e4f7c931cbd6f4570735624fd25aef2fa41d3f79cfb4b4ee7b7e55a8ce013af2a5ba20bf2" + }, + "Q1": { + "x": "0x11def93719829ecda3b46aa8c31fc3ac9c34b428982b898369608e4f042babee6c77ab9218aad5c87ba785481eff8ae4", + "y": "0x0007c9cef122ccf2efd233d6eb9bfc680aa276652b0661f4f820a653cec1db7ff69899f8e52b8e92b025a12c822a6ce6" + }, + "msg": "abc", + "u": [ + "0x0d921c33f2bad966478a03ca35d05719bdf92d347557ea166e5bba579eea9b83e9afa5c088573c2281410369fbd32951", + 
"0x003574a00b109ada2f26a37a91f9d1e740dffd8d69ec0c35e1e9f4652c7dba61123e9dd2e76c655d956e2b3462611139" + ] + }, + { + "P": { + "x": "0x11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", + "y": "0x03a87ae2caf14e8ee52e51fa2ed8eefe80f02457004ba4d486d6aa1f517c0889501dc7413753f9599b099ebcbbd2d709" + }, + "Q0": { + "x": "0x08834484878c217682f6d09a4b51444802fdba3d7f2df9903a0ddadb92130ebbfa807fffa0eabf257d7b48272410afff", + "y": "0x0b318f7ecf77f45a0f038e62d7098221d2dbbca2a394164e2e3fe953dc714ac2cde412d8f2d7f0c03b259e6795a2508e" + }, + "Q1": { + "x": "0x158418ed6b27e2549f05531a8281b5822b31c3bf3144277fbb977f8d6e2694fedceb7011b3c2b192f23e2a44b2bd106e", + "y": "0x1879074f344471fac5f839e2b4920789643c075792bec5af4282c73f7941cda5aa77b00085eb10e206171b9787c4169f" + }, + "msg": "abcdef0123456789", + "u": [ + "0x062d1865eb80ebfa73dcfc45db1ad4266b9f3a93219976a3790ab8d52d3e5f1e62f3b01795e36834b17b70e7b76246d4", + "0x0cdc3e2f271f29c4ff75020857ce6c5d36008c9b48385ea2f2bf6f96f428a3deb798aa033cd482d1cdc8b30178b08e3a" + ] + }, + { + "P": { + "x": "0x15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", + "y": "0x1807a1d50c29f430b8cafc4f8638dfeeadf51211e1602a5f184443076715f91bb90a48ba1e370edce6ae1062f5e6dd38" + }, + "Q0": { + "x": "0x0cbd7f84ad2c99643fea7a7ac8f52d63d66cefa06d9a56148e58b984b3dd25e1f41ff47154543343949c64f88d48a710", + "y": "0x052c00e4ed52d000d94881a5638ae9274d3efc8bc77bc0e5c650de04a000b2c334a9e80b85282a00f3148dfdface0865" + }, + "Q1": { + "x": "0x06493fb68f0d513af08be0372f849436a787e7b701ae31cb964d968021d6ba6bd7d26a38aaa5a68e8c21a6b17dc8b579", + "y": "0x02e98f2ccf5802b05ffaac7c20018bc0c0b2fd580216c4aa2275d2909dc0c92d0d0bdc979226adeb57a29933536b6bb4" + }, + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "u": [ + "0x010476f6a060453c0b1ad0b628f3e57c23039ee16eea5e71bb87c3b5419b1255dc0e5883322e563b84a29543823c0e86", + "0x0b1a912064fb0554b180e07af7e787f1f883a0470759c03c1b6509eb8ce980d1670305ae7b928226bb58fdc0a419f46e" + ] + }, + { + "P": { + "x": "0x082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", + "y": "0x05b84ae5a942248eea39e1d91030458c40153f3b654ab7872d779ad1e942856a20c438e8d99bc8abfbf74729ce1f7ac8" + }, + "Q0": { + "x": "0x0cf97e6dbd0947857f3e578231d07b309c622ade08f2c08b32ff372bd90db19467b2563cc997d4407968d4ac80e154f8", + "y": "0x127f0cddf2613058101a5701f4cb9d0861fd6c2a1b8e0afe194fccf586a3201a53874a2761a9ab6d7220c68661a35ab3" + }, + "Q1": { + "x": "0x092f1acfa62b05f95884c6791fba989bbe58044ee6355d100973bf9553ade52b47929264e6ae770fb264582d8dce512a", + "y": "0x028e6d0169a72cfedb737be45db6c401d3adfb12c58c619c82b93a5dfcccef12290de530b0480575ddc8397cda0bbebf" + }, + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "u": [ + "0x0a8ffa7447f6be1c5a2ea4b959c9454b431e29ccc0802bc052413a9c5b4f9aac67a93431bd480d15be1e057c8a08e8c6", + 
"0x05d487032f602c90fa7625dbafe0f4a49ef4a6b0b33d7bb349ff4cf5410d297fd6241876e3e77b651cfc8191e40a68b7" + ] + } + ] +} diff --git a/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json b/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json new file mode 100644 index 0000000000..7695cfc2b1 --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json @@ -0,0 +1,90 @@ +{ + "L": "0x40", + "Z": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9,0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa", + "ciphersuite": "BLS12381G2_XMD:SHA-256_SSWU_NU_", + "curve": "BLS12-381 G2", + "dst": "QUUX-V01-CS02-with-BLS12381G2_XMD:SHA-256_SSWU_NU_", + "expand": "XMD", + "field": { + "m": "0x2", + "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" + }, + "hash": "sha256", + "k": "0x80", + "map": { + "name": "SSWU" + }, + "randomOracle": false, + "vectors": [ + { + "P": { + "x": "0x00e7f4568a82b4b7dc1f14c6aaa055edf51502319c723c4dc2688c7fe5944c213f510328082396515734b6612c4e7bb7,0x126b855e9e69b1f691f816e48ac6977664d24d99f8724868a184186469ddfd4617367e94527d4b74fc86413483afb35b", + "y": "0x0caead0fd7b6176c01436833c79d305c78be307da5f6af6c133c47311def6ff1e0babf57a0fb5539fce7ee12407b0a42,0x1498aadcf7ae2b345243e281ae076df6de84455d766ab6fcdaad71fab60abb2e8b980a440043cd305db09d283c895e3d" + }, + "Q": { + "x": "0x18ed3794ad43c781816c523776188deafba67ab773189b8f18c49bc7aa841cd81525171f7a5203b2a340579192403bef,0x0727d90785d179e7b5732c8a34b660335fed03b913710b60903cf4954b651ed3466dc3728e21855ae822d4a0f1d06587", + "y": "0x00764a5cf6c5f61c52c838523460eb2168b5a5b43705e19cb612e006f29b717897facfd15dd1c8874c915f6d53d0342d,0x19290bb9797c12c1d275817aa2605ebe42275b66860f0e4d04487ebc2e47c50b36edd86c685a60c20a2bd584a82b011a" + }, + "msg": "", + "u": [ + "0x07355d25caf6e7f2f0cb2812ca0e513bd026ed09dda65b177500fa31714e09ea0ded3a078b526bed3307f804d4b93b04,0x02829ce3c021339ccb5caf3e187f6370e1e2a311dec9b75363117063ab2015603ff52c3d3b98f19c2f65575e99e8b78c" + ] + }, + { + "P": { + "x": "0x108ed59fd9fae381abfd1d6bce2fd2fa220990f0f837fa30e0f27914ed6e1454db0d1ee957b219f61da6ff8be0d6441f,0x0296238ea82c6d4adb3c838ee3cb2346049c90b96d602d7bb1b469b905c9228be25c627bffee872def773d5b2a2eb57d", + "y": "0x033f90f6057aadacae7963b0a0b379dd46750c1c94a6357c99b65f63b79e321ff50fe3053330911c56b6ceea08fee656,0x153606c417e59fb331b7ae6bce4fbf7c5190c33ce9402b5ebe2b70e44fca614f3f1382a3625ed5493843d0b0a652fc3f" + }, + "Q": { + "x": "0x0f40e1d5025ecef0d850aa0bb7bbeceab21a3d4e85e6bee857805b09693051f5b25428c6be343edba5f14317fcc30143,0x02e0d261f2b9fee88b82804ec83db330caa75fbb12719cfa71ccce1c532dc4e1e79b0a6a281ed8d3817524286c8bc04c", + "y": "0x0cf4a4adc5c66da0bca4caddc6a57ecd97c8252d7526a8ff478e0dfed816c4d321b5c3039c6683ae9b1e6a3a38c9c0ae,0x11cad1646bb3768c04be2ab2bbe1f80263b7ff6f8f9488f5bc3b6850e5a3e97e20acc583613c69cf3d2bfe8489744ebb" + }, + "msg": "abc", + "u": [ + "0x138879a9559e24cecee8697b8b4ad32cced053138ab913b99872772dc753a2967ed50aabc907937aefb2439ba06cc50c,0x0a1ae7999ea9bab1dcc9ef8887a6cb6e8f1e22566015428d220b7eec90ffa70ad1f624018a9ad11e78d588bd3617f9f2" + ] + }, + { + "P": { + "x": "0x038af300ef34c7759a6caaa4e69363cafeed218a1f207e93b2c70d91a1263d375d6730bd6b6509dcac3ba5b567e85bf3,0x0da75be60fb6aa0e9e3143e40c42796edf15685cafe0279afd2a67c3dff1c82341f17effd402e4f1af240ea90f4b659b", + "y": 
"0x19b148cbdf163cf0894f29660d2e7bfb2b68e37d54cc83fd4e6e62c020eaa48709302ef8e746736c0e19342cc1ce3df4,0x0492f4fed741b073e5a82580f7c663f9b79e036b70ab3e51162359cec4e77c78086fe879b65ca7a47d34374c8315ac5e" + }, + "Q": { + "x": "0x13a9d4a738a85c9f917c7be36b240915434b58679980010499b9ae8d7a1bf7fbe617a15b3cd6060093f40d18e0f19456,0x16fa88754e7670366a859d6f6899ad765bf5a177abedb2740aacc9252c43f90cd0421373fbd5b2b76bb8f5c4886b5d37", + "y": "0x0a7fa7d82c46797039398253e8765a4194100b330dfed6d7fbb46d6fbf01e222088779ac336e3675c7a7a0ee05bbb6e3,0x0c6ee170ab766d11fa9457cef53253f2628010b2cffc102b3b28351eb9df6c281d3cfc78e9934769d661b72a5265338d" + }, + "msg": "abcdef0123456789", + "u": [ + "0x18c16fe362b7dbdfa102e42bdfd3e2f4e6191d479437a59db4eb716986bf08ee1f42634db66bde97d6c16bbfd342b3b8,0x0e37812ce1b146d998d5f92bdd5ada2a31bfd63dfe18311aa91637b5f279dd045763166aa1615e46a50d8d8f475f184e" + ] + }, + { + "P": { + "x": "0x0c5ae723be00e6c3f0efe184fdc0702b64588fe77dda152ab13099a3bacd3876767fa7bbad6d6fd90b3642e902b208f9,0x12c8c05c1d5fc7bfa847f4d7d81e294e66b9a78bc9953990c358945e1f042eedafce608b67fdd3ab0cb2e6e263b9b1ad", + "y": "0x04e77ddb3ede41b5ec4396b7421dd916efc68a358a0d7425bddd253547f2fb4830522358491827265dfc5bcc1928a569,0x11c624c56dbe154d759d021eec60fab3d8b852395a89de497e48504366feedd4662d023af447d66926a28076813dd646" + }, + "Q": { + "x": "0x0a08b2f639855dfdeaaed972702b109e2241a54de198b2b4cd12ad9f88fa419a6086a58d91fc805de812ea29bee427c2,0x04a7442e4cb8b42ef0f41dac9ee74e65ecad3ce0851f0746dc47568b0e7a8134121ed09ba054509232c49148aef62cda", + "y": "0x05d60b1f04212b2c87607458f71d770f43973511c260f0540eef3a565f42c7ce59aa1cea684bb2a7bcab84acd2f36c8c,0x1017aa5747ba15505ece266a86b0ca9c712f41a254b76ca04094ca442ce45ecd224bd5544cd16685d0d1b9d156dd0531" + }, + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "u": [ + "0x08d4a0997b9d52fecf99427abb721f0fa779479963315fe21c6445250de7183e3f63bfdf86570da8929489e421d4ee95,0x16cb4ccad91ec95aab070f22043916cd6a59c4ca94097f7f510043d48515526dc8eaaea27e586f09151ae613688d5a89" + ] + }, + { + "P": { + "x": "0x0ea4e7c33d43e17cc516a72f76437c4bf81d8f4eac69ac355d3bf9b71b8138d55dc10fd458be115afa798b55dac34be1,0x1565c2f625032d232f13121d3cfb476f45275c303a037faa255f9da62000c2c864ea881e2bcddd111edc4a3c0da3e88d", + "y": "0x043b6f5fe4e52c839148dc66f2b3751e69a0f6ebb3d056d6465d50d4108543ecd956e10fa1640dfd9bc0030cc2558d28,0x0f8991d2a1ad662e7b6f58ab787947f1fa607fce12dde171bc17903b012091b657e15333e11701edcf5b63ba2a561247" + }, + "Q": { + "x": "0x19592c812d5a50c5601062faba14c7d670711745311c879de1235a0a11c75aab61327bf2d1725db07ec4d6996a682886,0x0eef4fa41ddc17ed47baf447a2c498548f3c72a02381313d13bef916e240b61ce125539090d62d9fbb14a900bf1b8e90", + "y": "0x1260d6e0987eae96af9ebe551e08de22b37791d53f4db9e0d59da736e66699735793e853e26362531fe4adf99c1883e3,0x0dbace5df0a4ac4ac2f45d8fdf8aee45484576fdd6efc4f98ab9b9f4112309e628255e183022d98ea5ed6e47ca00306c" + }, + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "u": [ + 
"0x03f80ce4ff0ca2f576d797a3660e3f65b274285c054feccc3215c879e2c0589d376e83ede13f93c32f05da0f68fd6a10,0x006488a837c5413746d868d1efb7232724da10eca410b07d8b505b9363bdccf0a1fc0029bad07d65b15ccfe6dd25e20d" + ] + } + ] +} diff --git a/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json b/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json new file mode 100644 index 0000000000..5807ee6f6b --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json @@ -0,0 +1,115 @@ +{ + "L": "0x40", + "Z": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9,0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa", + "ciphersuite": "BLS12381G2_XMD:SHA-256_SSWU_RO_", + "curve": "BLS12-381 G2", + "dst": "QUUX-V01-CS02-with-BLS12381G2_XMD:SHA-256_SSWU_RO_", + "expand": "XMD", + "field": { + "m": "0x2", + "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" + }, + "hash": "sha256", + "k": "0x80", + "map": { + "name": "SSWU" + }, + "randomOracle": true, + "vectors": [ + { + "P": { + "x": "0x0141ebfbdca40eb85b87142e130ab689c673cf60f1a3e98d69335266f30d9b8d4ac44c1038e9dcdd5393faf5c41fb78a,0x05cb8437535e20ecffaef7752baddf98034139c38452458baeefab379ba13dff5bf5dd71b72418717047f5b0f37da03d", + "y": "0x0503921d7f6a12805e72940b963c0cf3471c7b2a524950ca195d11062ee75ec076daf2d4bc358c4b190c0c98064fdd92,0x12424ac32561493f3fe3c260708a12b7c620e7be00099a974e259ddc7d1f6395c3c811cdd19f1e8dbf3e9ecfdcbab8d6" + }, + "Q0": { + "x": "0x019ad3fc9c72425a998d7ab1ea0e646a1f6093444fc6965f1cad5a3195a7b1e099c050d57f45e3fa191cc6d75ed7458c,0x171c88b0b0efb5eb2b88913a9e74fe111a4f68867b59db252ce5868af4d1254bfab77ebde5d61cd1a86fb2fe4a5a1c1d", + "y": "0x0ba10604e62bdd9eeeb4156652066167b72c8d743b050fb4c1016c31b505129374f76e03fa127d6a156213576910fef3,0x0eb22c7a543d3d376e9716a49b72e79a89c9bfe9feee8533ed931cbb5373dde1fbcd7411d8052e02693654f71e15410a" + }, + "Q1": { + "x": "0x113d2b9cd4bd98aee53470b27abc658d91b47a78a51584f3d4b950677cfb8a3e99c24222c406128c91296ef6b45608be,0x13855912321c5cb793e9d1e88f6f8d342d49c0b0dbac613ee9e17e3c0b3c97dfbb5a49cc3fb45102fdbaf65e0efe2632", + "y": "0x0fd3def0b7574a1d801be44fde617162aa2e89da47f464317d9bb5abc3a7071763ce74180883ad7ad9a723a9afafcdca,0x056f617902b3c0d0f78a9a8cbda43a26b65f602f8786540b9469b060db7b38417915b413ca65f875c130bebfaa59790c" + }, + "msg": "", + "u": [ + "0x03dbc2cce174e91ba93cbb08f26b917f98194a2ea08d1cce75b2b9cc9f21689d80bd79b594a613d0a68eb807dfdc1cf8,0x05a2acec64114845711a54199ea339abd125ba38253b70a92c876df10598bd1986b739cad67961eb94f7076511b3b39a", + "0x02f99798e8a5acdeed60d7e18e9120521ba1f47ec090984662846bc825de191b5b7641148c0dbc237726a334473eee94,0x145a81e418d4010cc027a68f14391b30074e89e60ee7a22f87217b2f6eb0c4b94c9115b436e6fa4607e95a98de30a435" + ] + }, + { + "P": { + "x": "0x02c2d18e033b960562aae3cab37a27ce00d80ccd5ba4b7fe0e7a210245129dbec7780ccc7954725f4168aff2787776e6,0x139cddbccdc5e91b9623efd38c49f81a6f83f175e80b06fc374de9eb4b41dfe4ca3a230ed250fbe3a2acf73a41177fd8", + "y": "0x1787327b68159716a37440985269cf584bcb1e621d3a7202be6ea05c4cfe244aeb197642555a0645fb87bf7466b2ba48,0x00aa65dae3c8d732d10ecd2c50f8a1baf3001578f71c694e03866e9f3d49ac1e1ce70dd94a733534f106d4cec0eddd16" + }, + "Q0": { + "x": 
"0x12b2e525281b5f4d2276954e84ac4f42cf4e13b6ac4228624e17760faf94ce5706d53f0ca1952f1c5ef75239aeed55ad,0x05d8a724db78e570e34100c0bc4a5fa84ad5839359b40398151f37cff5a51de945c563463c9efbdda569850ee5a53e77", + "y": "0x02eacdc556d0bdb5d18d22f23dcb086dd106cad713777c7e6407943edbe0b3d1efe391eedf11e977fac55f9b94f2489c,0x04bbe48bfd5814648d0b9e30f0717b34015d45a861425fabc1ee06fdfce36384ae2c808185e693ae97dcde118f34de41" + }, + "Q1": { + "x": "0x19f18cc5ec0c2f055e47c802acc3b0e40c337256a208001dde14b25afced146f37ea3d3ce16834c78175b3ed61f3c537,0x15b0dadc256a258b4c68ea43605dffa6d312eef215c19e6474b3e101d33b661dfee43b51abbf96fee68fc6043ac56a58", + "y": "0x05e47c1781286e61c7ade887512bd9c2cb9f640d3be9cf87ea0bad24bd0ebfe946497b48a581ab6c7d4ca74b5147287f,0x19f98db2f4a1fcdf56a9ced7b320ea9deecf57c8e59236b0dc21f6ee7229aa9705ce9ac7fe7a31c72edca0d92370c096" + }, + "msg": "abc", + "u": [ + "0x15f7c0aa8f6b296ab5ff9c2c7581ade64f4ee6f1bf18f55179ff44a2cf355fa53dd2a2158c5ecb17d7c52f63e7195771,0x01c8067bf4c0ba709aa8b9abc3d1cef589a4758e09ef53732d670fd8739a7274e111ba2fcaa71b3d33df2a3a0c8529dd", + "0x187111d5e088b6b9acfdfad078c4dacf72dcd17ca17c82be35e79f8c372a693f60a033b461d81b025864a0ad051a06e4,0x08b852331c96ed983e497ebc6dee9b75e373d923b729194af8e72a051ea586f3538a6ebb1e80881a082fa2b24df9f566" + ] + }, + { + "P": { + "x": "0x121982811d2491fde9ba7ed31ef9ca474f0e1501297f68c298e9f4c0028add35aea8bb83d53c08cfc007c1e005723cd0,0x190d119345b94fbd15497bcba94ecf7db2cbfd1e1fe7da034d26cbba169fb3968288b3fafb265f9ebd380512a71c3f2c", + "y": "0x05571a0f8d3c08d094576981f4a3b8eda0a8e771fcdcc8ecceaf1356a6acf17574518acb506e435b639353c2e14827c8,0x0bb5e7572275c567462d91807de765611490205a941a5a6af3b1691bfe596c31225d3aabdf15faff860cb4ef17c7c3be" + }, + "Q0": { + "x": "0x0f48f1ea1318ddb713697708f7327781fb39718971d72a9245b9731faaca4dbaa7cca433d6c434a820c28b18e20ea208,0x06051467c8f85da5ba2540974758f7a1e0239a5981de441fdd87680a995649c211054869c50edbac1f3a86c561ba3162", + "y": "0x168b3d6df80069dbbedb714d41b32961ad064c227355e1ce5fac8e105de5e49d77f0c64867f3834848f152497eb76333,0x134e0e8331cee8cb12f9c2d0742714ed9eee78a84d634c9a95f6a7391b37125ed48bfc6e90bf3546e99930ff67cc97bc" + }, + "Q1": { + "x": "0x004fd03968cd1c99a0dd84551f44c206c84dcbdb78076c5bfee24e89a92c8508b52b88b68a92258403cbe1ea2da3495f,0x1674338ea298281b636b2eb0fe593008d03171195fd6dcd4531e8a1ed1f02a72da238a17a635de307d7d24aa2d969a47", + "y": "0x0dc7fa13fff6b12558419e0a1e94bfc3cfaf67238009991c5f24ee94b632c3d09e27eca329989aee348a67b50d5e236c,0x169585e164c131103d85324f2d7747b23b91d66ae5d947c449c8194a347969fc6bbd967729768da485ba71868df8aed2" + }, + "msg": "abcdef0123456789", + "u": [ + "0x0313d9325081b415bfd4e5364efaef392ecf69b087496973b229303e1816d2080971470f7da112c4eb43053130b785e1,0x062f84cb21ed89406890c051a0e8b9cf6c575cf6e8e18ecf63ba86826b0ae02548d83b483b79e48512b82a6c0686df8f", + "0x1739123845406baa7be5c5dc74492051b6d42504de008c635f3535bb831d478a341420e67dcc7b46b2e8cba5379cca97,0x01897665d9cb5db16a27657760bbea7951f67ad68f8d55f7113f24ba6ddd82caef240a9bfa627972279974894701d975" + ] + }, + { + "P": { + "x": "0x19a84dd7248a1066f737cc34502ee5555bd3c19f2ecdb3c7d9e24dc65d4e25e50d83f0f77105e955d78f4762d33c17da,0x0934aba516a52d8ae479939a91998299c76d39cc0c035cd18813bec433f587e2d7a4fef038260eef0cef4d02aae3eb91", + "y": "0x14f81cd421617428bc3b9fe25afbb751d934a00493524bc4e065635b0555084dd54679df1536101b2c979c0152d09192,0x09bcccfa036b4847c9950780733633f13619994394c23ff0b32fa6b795844f4a0673e20282d07bc69641cee04f5e5662" + }, + "Q0": { + "x": 
"0x09eccbc53df677f0e5814e3f86e41e146422834854a224bf5a83a50e4cc0a77bfc56718e8166ad180f53526ea9194b57,0x0c3633943f91daee715277bd644fba585168a72f96ded64fc5a384cce4ec884a4c3c30f08e09cd2129335dc8f67840ec", + "y": "0x0eb6186a0457d5b12d132902d4468bfeb7315d83320b6c32f1c875f344efcba979952b4aa418589cb01af712f98cc555,0x119e3cf167e69eb16c1c7830e8df88856d48be12e3ff0a40791a5cd2f7221311d4bf13b1847f371f467357b3f3c0b4c7" + }, + "Q1": { + "x": "0x0eb3aabc1ddfce17ff18455fcc7167d15ce6b60ddc9eb9b59f8d40ab49420d35558686293d046fc1e42f864b7f60e381,0x198bdfb19d7441ebcca61e8ff774b29d17da16547d2c10c273227a635cacea3f16826322ae85717630f0867539b5ed8b", + "y": "0x0aaf1dee3adf3ed4c80e481c09b57ea4c705e1b8d25b897f0ceeec3990748716575f92abff22a1c8f4582aff7b872d52,0x0d058d9061ed27d4259848a06c96c5ca68921a5d269b078650c882cb3c2bd424a8702b7a6ee4e0ead9982baf6843e924" + }, + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "u": [ + "0x025820cefc7d06fd38de7d8e370e0da8a52498be9b53cba9927b2ef5c6de1e12e12f188bbc7bc923864883c57e49e253,0x034147b77ce337a52e5948f66db0bab47a8d038e712123bb381899b6ab5ad20f02805601e6104c29df18c254b8618c7b", + "0x0930315cae1f9a6017c3f0c8f2314baa130e1cf13f6532bff0a8a1790cd70af918088c3db94bda214e896e1543629795,0x10c4df2cacf67ea3cb3108b00d4cbd0b3968031ebc8eac4b1ebcefe84d6b715fde66bef0219951ece29d1facc8a520ef" + ] + }, + { + "P": { + "x": "0x01a6ba2f9a11fa5598b2d8ace0fbe0a0eacb65deceb476fbbcb64fd24557c2f4b18ecfc5663e54ae16a84f5ab7f62534,0x11fca2ff525572795a801eed17eb12785887c7b63fb77a42be46ce4a34131d71f7a73e95fee3f812aea3de78b4d01569", + "y": "0x0b6798718c8aed24bc19cb27f866f1c9effcdbf92397ad6448b5c9db90d2b9da6cbabf48adc1adf59a1a28344e79d57e,0x03a47f8e6d1763ba0cad63d6114c0accbef65707825a511b251a660a9b3994249ae4e63fac38b23da0c398689ee2ab52" + }, + "Q0": { + "x": "0x17cadf8d04a1a170f8347d42856526a24cc466cb2ddfd506cff01191666b7f944e31244d662c904de5440516a2b09004,0x0d13ba91f2a8b0051cf3279ea0ee63a9f19bc9cb8bfcc7d78b3cbd8cc4fc43ba726774b28038213acf2b0095391c523e", + "y": "0x17ef19497d6d9246fa94d35575c0f8d06ee02f21a284dbeaa78768cb1e25abd564e3381de87bda26acd04f41181610c5,0x12c3c913ba4ed03c24f0721a81a6be7430f2971ffca8fd1729aafe496bb725807531b44b34b59b3ae5495e5a2dcbd5c8" + }, + "Q1": { + "x": "0x16ec57b7fe04c71dfe34fb5ad84dbce5a2dbbd6ee085f1d8cd17f45e8868976fc3c51ad9eeda682c7869024d24579bfd,0x13103f7aace1ae1420d208a537f7d3a9679c287208026e4e3439ab8cd534c12856284d95e27f5e1f33eec2ce656533b0", + "y": "0x0958b2c4c2c10fcef5a6c59b9e92c4a67b0fae3e2e0f1b6b5edad9c940b8f3524ba9ebbc3f2ceb3cfe377655b3163bd7,0x0ccb594ed8bd14ca64ed9cb4e0aba221be540f25dd0d6ba15a4a4be5d67bcf35df7853b2d8dad3ba245f1ea3697f66aa" + }, + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "u": [ + "0x190b513da3e66fc9a3587b78c76d1d132b1152174d0b83e3c1114066392579a45824c5fa17649ab89299ddd4bda54935,0x12ab625b0fe0ebd1367fe9fac57bb1168891846039b4216b9d94007b674de2d79126870e88aeef54b2ec717a887dcf39", + 
"0x0e6a42010cf435fb5bacc156a585e1ea3294cc81d0ceb81924d95040298380b164f702275892cedd81b62de3aba3f6b5,0x117d9a0defc57a33ed208428cb84e54c85a6840e7648480ae428838989d25d97a0af8e3255be62b25c2a85630d2dddd8" + ] + } + ] +} diff --git a/src/blst/bindings/vectors/hash_to_curve/README b/src/blst/bindings/vectors/hash_to_curve/README new file mode 100644 index 0000000000..6e36d550e4 --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/README @@ -0,0 +1,3 @@ +These files are downloaded from https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve/tree/master/poc/vectors, commit 6d40f98. + +Note the file names cannot have ":" in them as this is incompatible with Windows. diff --git a/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_256.json b/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_256.json new file mode 100644 index 0000000000..b5e2aa3f40 --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_256.json @@ -0,0 +1,78 @@ +{ + "DST": "QUUX-V01-CS02-with-expander-SHA256-128-long-DST-1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", + "hash": "SHA256", + "k": 128, + "name": "expand_message_xmd", + "tests": [ + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x20", + "msg": "", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "e8dc0c8b686b7ef2074086fbdd2f30e3f8bfbd3bdf177f73f04b97ce618a3ed3" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x20", + "msg": "abc", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "52dbf4f36cf560fca57dedec2ad924ee9c266341d8f3d6afe5171733b16bbb12" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x20", + "msg": "abcdef0123456789", + "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "35387dcf22618f3728e6c686490f8b431f76550b0b2c61cbc1ce7001536f4521" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x20", + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "01b637612bb18e840028be900a833a74414140dde0c4754c198532c3a0ba42bc" + }, 
+ { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x20", + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "20cce7033cabc5460743180be6fa8aac5a103f56d481cf369a8accc0c374431b" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x80", + "msg": "", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "14604d85432c68b757e485c8894db3117992fc57e0e136f71ad987f789a0abc287c47876978e2388a02af86b1e8d1342e5ce4f7aaa07a87321e691f6fba7e0072eecc1218aebb89fb14a0662322d5edbd873f0eb35260145cd4e64f748c5dfe60567e126604bcab1a3ee2dc0778102ae8a5cfd1429ebc0fa6bf1a53c36f55dfc" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x80", + "msg": "abc", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "1a30a5e36fbdb87077552b9d18b9f0aee16e80181d5b951d0471d55b66684914aef87dbb3626eaabf5ded8cd0686567e503853e5c84c259ba0efc37f71c839da2129fe81afdaec7fbdc0ccd4c794727a17c0d20ff0ea55e1389d6982d1241cb8d165762dbc39fb0cee4474d2cbbd468a835ae5b2f20e4f959f56ab24cd6fe267" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x80", + "msg": "abcdef0123456789", + "msg_prime": 
"0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "d2ecef3635d2397f34a9f86438d772db19ffe9924e28a1caf6f1c8f15603d4028f40891044e5c7e39ebb9b31339979ff33a4249206f67d4a1e7c765410bcd249ad78d407e303675918f20f26ce6d7027ed3774512ef5b00d816e51bfcc96c3539601fa48ef1c07e494bdc37054ba96ecb9dbd666417e3de289d4f424f502a982" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x80", + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": "ed6e8c036df90111410431431a232d41a32c86e296c05d426e5f44e75b9a50d335b2412bc6c91e0a6dc131de09c43110d9180d0a70f0d6289cb4e43b05f7ee5e9b3f42a1fad0f31bac6a625b3b5c50e3a83316783b649e5ecc9d3b1d9471cb5024b7ccf40d41d1751a04ca0356548bc6e703fca02ab521b505e8e45600508d32" + }, + { + "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "len_in_bytes": "0x80", + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", + "uniform_bytes": 
"78b53f2413f3c688f07732c10e5ced29a17c6a16f717179ffbe38d92d6c9ec296502eb9889af83a1928cd162e845b0d3c5424e83280fed3d10cffb2f8431f14e7a23f4c68819d40617589e4c41169d0b56e0e3535be1fd71fbb08bb70c5b5ffed953d6c14bf7618b35fc1f4c4b30538236b4b08c9fbf90462447a8ada60be495" + } + ] +} diff --git a/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_38.json b/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_38.json new file mode 100644 index 0000000000..e6a8c7481e --- /dev/null +++ b/src/blst/bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_38.json @@ -0,0 +1,78 @@ +{ + "DST": "QUUX-V01-CS02-with-expander-SHA256-128", + "hash": "SHA256", + "k": 128, + "name": "expand_message_xmd", + "tests": [ + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x20", + "msg": "", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "68a985b87eb6b46952128911f2a4412bbc302a9d759667f87f7a21d803f07235" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x20", + "msg": "abc", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "d8ccab23b5985ccea865c6c97b6e5b8350e794e603b4b97902f53a8a0d605615" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x20", + "msg": "abcdef0123456789", + "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "eff31487c770a893cfb36f912fbfcbff40d5661771ca4b2cb4eafe524333f5c1" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x20", + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "b23a1d2b4d97b2ef7785562a7e8bac7eed54ed6e97e29aa51bfe3f12ddad1ff9" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x20", + "msg": 
"a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "4623227bcc01293b8c130bf771da8c298dede7383243dc0993d2d94823958c4c" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x80", + "msg": "", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "af84c27ccfd45d41914fdff5df25293e221afc53d8ad2ac06d5e3e29485dadbee0d121587713a3e0dd4d5e69e93eb7cd4f5df4cd103e188cf60cb02edc3edf18eda8576c412b18ffb658e3dd6ec849469b979d444cf7b26911a08e63cf31f9dcc541708d3491184472c2c29bb749d4286b004ceb5ee6b9a7fa5b646c993f0ced" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x80", + "msg": "abc", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "abba86a6129e366fc877aab32fc4ffc70120d8996c88aee2fe4b32d6c7b6437a647e6c3163d40b76a73cf6a5674ef1d890f95b664ee0afa5359a5c4e07985635bbecbac65d747d3d2da7ec2b8221b17b0ca9dc8a1ac1c07ea6a1e60583e2cb00058e77b7b72a298425cd1b941ad4ec65e8afc50303a22c0f99b0509b4c895f40" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x80", + "msg": "abcdef0123456789", + "msg_prime": 
"0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "ef904a29bffc4cf9ee82832451c946ac3c8f8058ae97d8d629831a74c6572bd9ebd0df635cd1f208e2038e760c4994984ce73f0d55ea9f22af83ba4734569d4bc95e18350f740c07eef653cbb9f87910d833751825f0ebefa1abe5420bb52be14cf489b37fe1a72f7de2d10be453b2c9d9eb20c7e3f6edc5a60629178d9478df" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x80", + "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": "80be107d0884f0d881bb460322f0443d38bd222db8bd0b0a5312a6fedb49c1bbd88fd75d8b9a09486c60123dfa1d73c1cc3169761b17476d3c6b7cbbd727acd0e2c942f4dd96ae3da5de368d26b32286e32de7e5a8cb2949f866a0b80c58116b29fa7fabb3ea7d520ee603e0c25bcaf0b9a5e92ec6a1fe4e0391d1cdbce8c68a" + }, + { + "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "len_in_bytes": "0x80", + "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", + "uniform_bytes": 
"546aff5444b5b79aa6148bd81728704c32decb73a3ba76e9e75885cad9def1d06d6792f8a7d12794e90efed817d96920d728896a4510864370c207f99bd4a608ea121700ef01ed879745ee3e4ceef777eda6d9e5e38b90c86ea6fb0b36504ba4a45d22e86f6db5dd43d98a294bebb9125d5b794e9d2a81181066eb954966a487" + } + ] +} diff --git a/src/blst/blst_logo_small.png b/src/blst/blst_logo_small.png new file mode 100644 index 0000000000..0993ac3d6f Binary files /dev/null and b/src/blst/blst_logo_small.png differ diff --git a/src/blst/build.bat b/src/blst/build.bat new file mode 100755 index 0000000000..088c2a7cee --- /dev/null +++ b/src/blst/build.bat @@ -0,0 +1,53 @@ +@echo off +SETLOCAL +set PATH=%windir%\system32;%PATH% &:: override msys if there is one on %PATH% +set TOP=%~dp0 +cl /nologo /c /O2 /Zi /Fdblst.pdb /W4 /MT /Zl %TOP%src\server.c || EXIT /B +cl 2>&1 | find "for ARM64" > nul: +IF ERRORLEVEL 1 ( + set arm64=no + FOR %%F IN (%TOP%build\win64\*-x86_64.asm) DO ( + ml64 /nologo /c /Cp /Cx /Zi %%F || EXIT /B + ) +) ELSE ( + set arm64=yes + FOR %%F IN (%TOP%build\win64\*-armv8.asm) DO ( + armasm64 -nologo %%F || EXIT /B + ) +) +SETLOCAL ENABLEDELAYEDEXPANSION +set static=/out:blst.lib +set shared= +set arm64ec= +FOR %%O IN (%*) DO ( + set opt=%%O + IF "!opt!" == "-shared" ( + IF [!shared!] EQU [] set shared=/out:blst.dll + ) ELSE IF "!opt:~0,5!" == "/out:" ( + IF "!opt:~-4!" == ".dll" (set shared=!opt!) ELSE (set static=!opt!) + ) ELSE IF "!opt!" == "-arm64x" ( + set arm64x=%arm64% + ) +) +IF [%shared%] NEQ [] ( + cl /nologo /c /O2 /Oi- /MD %TOP%build\win64\dll.c || EXIT /B + IF [%arm64x%] NEQ [yes] ( + link /nologo /debug /dll /entry:DllMain /incremental:no %shared% ^ + /def:%TOP%build\win64\blst.def *.obj kernel32.lib && del *.obj + ) ELSE ( + lib /nologo /out:blst_arm64.lib *.obj && del *.obj || EXIT /B + cl /nologo /arm64EC /c /O2 /Zi /Fdblst.pdb /W4 /MT /Zl %TOP%src\server.c || EXIT /B + FOR %%F IN (%TOP%build\win64\*-armv8.asm) DO ( + armasm64 -nologo -machine arm64ec -nowarn %%F || EXIT /B + ) + cl /nologo /arm64EC /c /O2 /Oi- /MD %TOP%build\win64\dll.c || EXIT /B + link /nologo /machine:arm64x /dll /noentry %shared% ^ + /def:%TOP%build\win64\blst.def *.obj ^ + /defArm64Native:%TOP%build\win64\blst.def blst_arm64.lib ^ + kernel32.lib && del *.obj blst_arm64.lib + ) +) ELSE ( + lib /nologo %static% *.obj && del *.obj +) +ENDLOCAL +EXIT /B diff --git a/src/blst/build.sh b/src/blst/build.sh new file mode 100755 index 0000000000..3e9b4b66f6 --- /dev/null +++ b/src/blst/build.sh @@ -0,0 +1,117 @@ +#!/bin/sh -e +# +# The script allows to override 'CC', 'CFLAGS' and 'flavour' at command +# line, as well as specify additional compiler flags. For example to +# compile for x32: +# +# /some/where/build.sh flavour=elf32 -mx32 +# +# To cross-compile for mingw/Windows: +# +# /some/where/build.sh flavour=mingw64 CC=x86_64-w64-mingw32-gcc +# +# In addition script recognizes -shared flag and creates shared library +# alongside libblst.a. 
+# +# To cross-compile for WebAssembly with Emscripten SDK: +# +# /some/where/build.sh CROSS_COMPILE=em + +[ -d /usr/xpg4/bin ] && PATH=/usr/xpg4/bin:$PATH # Solaris + +TOP=`dirname $0` + +# if -Werror stands in the way, bypass with -Wno-error on command line, +# or suppress specific one with -Wno- +CFLAGS=${CFLAGS:--O2 -fno-builtin -fPIC -Wall -Wextra -Werror} +PERL=${PERL:-perl} +unset cflags shared dll + +case `uname -s` in + Darwin) flavour=macosx + if [ "`sysctl -n hw.optional.adx 2>/dev/null`" = "1" ]; then + cflags="-D__ADX__" + fi + ;; + CYGWIN*) flavour=mingw64;; + MINGW*) flavour=mingw64;; + *) flavour=elf;; +esac + +while [ "x$1" != "x" ]; do + case $1 in + -shared) shared=1;; + -dll) shared=1; dll=".dll";; + -m*) CFLAGS="$CFLAGS $1";; + -*target*) CFLAGS="$CFLAGS $1";; + -*) cflags="$cflags $1";; + *=*) eval "$1";; + esac + shift +done + +if [ "x$CC" = "x" ]; then + CC=gcc + which ${CROSS_COMPILE}cc >/dev/null 2>&1 && CC=cc +fi +if which ${CROSS_COMPILE}${CC} >/dev/null 2>&1; then + CC=${CROSS_COMPILE}${CC} +fi +if [ "x$CROSS_COMPILE" = "x" ]; then + CROSS_COMPILE=`echo $CC | + awk '{ print substr($1,0,match($1,"-(g?cc|clang)$")) }' 2>/dev/null` + # fix up android prefix... + CROSS_COMPILE=`echo $CROSS_COMPILE | + awk '{ off=match($1,"-android[0-9]+-"); + if (off) { printf "%sandroid-\n",substr($1,0,off) } + else { print $1 } }'` +fi + +predefs=`(${CC} ${CFLAGS} -dM -E -x c /dev/null || true) 2>/dev/null` + +if [ -z "${CROSS_COMPILE}${AR}" ] && echo ${predefs} | grep -q clang; then + search_dirs=`${CC} -print-search-dirs | awk -F= '/^programs:/{print$2}' | \ + (sed -E -e 's/([a-z]):\\\/\/\1\//gi' -e 'y/\\\;/\/:/' 2>/dev/null || true)` + if [ -n "$search_dirs" ] && \ + env PATH="$search_dirs:$PATH" which llvm-ar > /dev/null 2>&1; then + PATH="$search_dirs:$PATH" + AR=llvm-ar + fi +fi +AR=${AR:-${CROSS_COMPILE}ar} + +if echo ${predefs} | grep -q x86_64; then + if (grep -q -e '^flags.*\badx\b' /proc/cpuinfo) 2>/dev/null; then + cflags="-D__ADX__ $cflags" + fi +fi +if echo ${predefs} | grep -q __AVX__; then + cflags="$cflags -mno-avx" # avoid costly transitions +fi +if echo ${predefs} | grep -q 'x86_64\|aarch64'; then :; else + cflags="$cflags -D__BLST_NO_ASM__" +fi + +CFLAGS="$CFLAGS $cflags" +TMPDIR=${TMPDIR:-/tmp} + +rm -f libblst.a +trap '[ $? -ne 0 ] && rm -f libblst.a; rm -f *.o ${TMPDIR}/*.blst.$$' 0 + +(set -x; ${CC} ${CFLAGS} -c ${TOP}/src/server.c) +(set -x; ${CC} ${CFLAGS} -c ${TOP}/build/assembly.S) +(set -x; ${AR} rc libblst.a *.o) + +if [ $shared ]; then + case $flavour in + macosx) (set -x; ${CC} -dynamiclib -o libblst$dll.dylib \ + -all_load libblst.a ${CFLAGS}); exit 0;; + mingw*) sharedlib="blst.dll ${TOP}/build/win64/blst.def" + CFLAGS="${CFLAGS} --entry=DllMain ${TOP}/build/win64/dll.c" + CFLAGS="${CFLAGS} -nostdlib -lgcc";; + *) sharedlib=libblst$dll.so;; + esac + (set -x; ${CC} -shared -o $sharedlib \ + -Wl,--whole-archive,libblst.a,--no-whole-archive ${CFLAGS} \ + -Wl,-Bsymbolic) +fi diff --git a/src/blst/src/aggregate.c b/src/blst/src/aggregate.c new file mode 100644 index 0000000000..ca78876aca --- /dev/null +++ b/src/blst/src/aggregate.c @@ -0,0 +1,673 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx, hash_or_encode, DST); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk[0], hash_or_encode, DST); + * blst_pairing_init(pk[1], hash_or_encode, DST); + * ... + * start threads each processing an N/nthreads slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); + * ... + * blst_pairing_commit(pkx); + * ... + * meanwhile in main thread + * blst_fp12 gtsig; + * blst_aggregated_in_g2(&gtsig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk[0], pk[1]); + * blst_pairing_merge(pk[0], pk[2]); + * ... + * blst_pairing_finalverify(pk[0], gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int ctrl; + unsigned int nelems; + const void *DST; + size_t DST_len; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, + AGGR_MIN_SIG = 1, + AGGR_MIN_PK = 2, + AGGR_SIGN_SET = 0x10, + AGGR_GT_SET = 0x20, + AGGR_HASH_OR_ENCODE = 0x40 }; +#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) + +static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; + +size_t blst_pairing_sizeof(void) +{ return sizeof_pairing; } + +void blst_pairing_init(PAIRING *ctx, int hash_or_encode, + const void *DST, size_t DST_len) +{ + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature verification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately... + */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is.
+ */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is 
to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + POINTonE1 pk[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, 
GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
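Editor's note, an illustrative summary rather than part of the upstream comment: because conjugation acts as inversion on the cyclotomic subgroup that the final exponentiation lands in, PAIRING_FinalVerify effectively checks, for the minimal-signature-size case,

    final_exp( ML(G2_generator, S)^(-1) * prod_i ML(PK_i, H(msg_i)) ) == 1

i.e. the usual BLS relation prod_i e(H(msg_i), PK_i) == e(S, G2_generator), and symmetrically for the minimal-pubkey-size case. The infinite-signature branch below substitutes the identity for the signature term, so verification can only succeed if the accumulated Miller products already map to one under the final exponentiation -- the "VERY special" relation referred to above.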
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
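Editor's sketch, not part of aggregate.c: the single-processor usage pattern documented at the top of this file, written against the public blst.h prototypes (the blst_pairing_* / blst_p1_affine / blst_p2_affine names are assumed to correspond to the internal entry points above). It verifies one aggregated minimal-pubkey-size signature over n distinct messages; error handling and group checks are kept minimal.

    #include <stdlib.h>
    #include "blst.h"

    /* Returns 1 if agg_sig is a valid aggregate over (pks[i], msgs[i]). */
    static int verify_aggregate_min_pk(const blst_p1_affine *const pks[],
                                       const byte *const msgs[],
                                       const size_t msg_lens[], size_t n,
                                       const blst_p2_affine *agg_sig,
                                       const byte *DST, size_t DST_len)
    {
        blst_pairing *ctx = malloc(blst_pairing_sizeof());
        int ok = 0;

        if (ctx == NULL || n == 0)
            goto out;

        blst_pairing_init(ctx, 1 /* hash-to-curve, not encode */, DST, DST_len);

        for (size_t i = 0; i < n; i++) {
            /* the aggregated signature is fed exactly once, NULL afterwards */
            if (blst_pairing_aggregate_pk_in_g1(ctx, pks[i],
                                                i == 0 ? agg_sig : NULL,
                                                msgs[i], msg_lens[i],
                                                NULL, 0) != BLST_SUCCESS)
                goto out;
        }

        blst_pairing_commit(ctx);
        ok = (int)blst_pairing_finalverify(ctx, NULL);
    out:
        free(ctx);
        return ok;
    }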
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/src/blst/src/asm/add_mod_256-armv8.pl b/src/blst/src/asm/add_mod_256-armv8.pl new file mode 100755 index 0000000000..34d9145261 --- /dev/null +++ b/src/blst/src/asm/add_mod_256-armv8.pl @@ -0,0 +1,412 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..7)); +@a=map("x$_",(8..11)); +@b=map("x$_",(12..15)); +@t=map("x$_",(16,17,1..3)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + adds @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + adcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + + adds @a[0],@b[0],@b[0] + ldp @mod[0],@mod[1],[$b_ptr] + adcs @a[1],@b[1],@b[1] + ldp @mod[2],@mod[3],[$b_ptr,#16] + adcs @a[2],@b[2],@b[2] + adcs @a[3],@b[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs 
@t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_lshift_mod_256: + adds @a[0],@a[0],@a[0] + sub $b_ptr,$b_ptr,#1 + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adc @t[4],xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + + cbnz $b_ptr,.Loop_lshift_mod_256 + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_rshift: + adds @b[0],@a[0],@mod[0] + sub $b_ptr,$b_ptr,#1 + adcs @b[1],@a[1],@mod[1] + adcs @b[2],@a[2],@mod[2] + adcs @b[3],@a[3],@mod[3] + adc @t[4],xzr,xzr + tst @a[0],#1 + + csel @b[0],@b[0],@a[0],ne + csel @b[1],@b[1],@a[1],ne + csel @b[2],@b[2],@a[2],ne + csel @b[3],@b[3],@a[3],ne + csel @t[4],@t[4],xzr,ne + + extr @a[0],@b[1],@b[0],#1 + extr @a[1],@b[2],@b[1],#1 + extr @a[2],@b[3],@b[2],#1 + extr @a[3],@t[4],@b[3],#1 + + cbnz $b_ptr,.Loop_rshift + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @b[0],@mod[0],@a[0] + ldp @mod[2],@mod[3],[$n_ptr,#16] + orr @mod[0],@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr @mod[1],@a[2],@a[3] + sbcs @b[2],@mod[2],@a[2] + orr @t[4],@mod[0],@mod[1] + sbc @b[3],@mod[3],@a[3] + + cmp @t[4],#0 + csetm @t[4],ne + ands $b_ptr,$b_ptr,@t[4] + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@b[3],eq + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + stp @a[0],@a[1],[$r_ptr] + adc @a[3],@a[3],@mod[3] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev 
@a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + subs xzr,@a[0],@mod[0] + sbcs xzr,@a[1],@mod[1] + orr @a[0],@a[0],@a[1] + sbcs xzr,@a[2],@mod[2] + orr @a[0],@a[0],@a[2] + sbcs xzr,@a[3],@mod[3] + orr @a[0],@a[0],@a[3] + sbc $a_ptr,xzr,xzr + + cmp @a[0],#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,$a_ptr + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + adds @a[0],@a[0],@b[0] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[1],@a[1],@b[1] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + adc @a[3],@a[3],@mod[3] + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ + +print $code; + +close STDOUT; diff --git a/src/blst/src/asm/add_mod_256-x86_64.pl b/src/blst/src/asm/add_mod_256-x86_64.pl new file mode 100755 index 0000000000..6b605a8af3 --- /dev/null +++ b/src/blst/src/asm/add_mod_256-x86_64.pl @@ -0,0 +1,574 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
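Editor's reference model, not part of the blst sources: what the add_mod_256 routine in the ARM file above (and in the x86_64 file that follows) computes, written as portable C over four 64-bit limbs. It assumes the GCC/Clang unsigned __int128 extension and inputs that are already fully reduced (a, b < p), so a single trial subtraction suffices; the assembly performs the same add / trial-subtract / branch-free select, just with carry flags instead of 128-bit temporaries.

    #include <stdint.h>

    static void add_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                                const uint64_t b[4], const uint64_t p[4])
    {
        uint64_t sum[4], red[4], carry = 0, borrow = 0, mask;

        for (int i = 0; i < 4; i++) {     /* sum = a + b, propagate the carry */
            unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
            sum[i] = (uint64_t)t;
            carry  = (uint64_t)(t >> 64);
        }

        for (int i = 0; i < 4; i++) {     /* red = sum - p, propagate the borrow */
            unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
            red[i] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;
        }

        /* keep the unreduced sum only when subtracting p borrowed past the
         * carry bit, i.e. a + b < p; this mirrors the csel/cmovc selects */
        mask = (uint64_t)0 - (uint64_t)(borrow > carry);
        for (int i = 0; i < 4; i++)
            ret[i] = (sum[i] & mask) | (red[i] & ~mask);
    }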
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[4] +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" 
+.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +######################################################################## +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,\@function,2,"unwind" +.align 32 +check_mod_256: +.cfi_startproc +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($r_ptr), %rax + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + + mov %rax, @acc[0] # see if it's zero + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + + sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
+ sbb 8*1($a_ptr), @acc[1] + sbb 8*2($a_ptr), @acc[2] + sbb 8*3($a_ptr), @acc[3] + sbb $a_ptr, $a_ptr + + mov \$1, %rdx + cmp \$0, %rax + cmovne %rdx, %rax + and $a_ptr, %rax +.cfi_epilogue + ret +.cfi_endproc +.size check_mod_256,.-check_mod_256 + +######################################################################## +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,\@function,4,"unwind" +.align 32 +add_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + +######################################################################## +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,\@function,4,"unwind" +.align 32 +sub_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/add_mod_384-armv8.pl b/src/blst/src/asm/add_mod_384-armv8.pl new file mode 100755 index 0000000000..9a555fca79 --- /dev/null +++ b/src/blst/src/asm/add_mod_384-armv8.pl @@ -0,0 +1,937 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
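Editor's reference model, not part of the blst sources: check_mod_256 above (both the ARM and x86_64 versions) returns non-zero exactly when the 256-bit value is non-zero and strictly below the supplied modulus. A portable C equivalent, again assuming unsigned __int128:

    #include <stdint.h>

    static int check_mod_256_ref(const uint64_t a[4], const uint64_t p[4])
    {
        uint64_t nonzero = 0, borrow = 0;

        for (int i = 0; i < 4; i++) {
            unsigned __int128 t = (unsigned __int128)a[i] - p[i] - borrow;
            nonzero |= a[i];                       /* is a zero overall? */
            borrow = (uint64_t)(t >> 64) & 1;      /* did a - p borrow so far? */
        }

        return (nonzero != 0) & (borrow != 0);     /* 1 iff 0 < a < p */
    }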
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..9)); +@a=map("x$_",(10..15)); +@b=map("x$_",(16,17,19..22)); +$carry=$n_ptr; + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + +__add_mod_384_ab_are_loaded: + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adcs @a[4],@a[4],@b[4] + adcs @a[5],@a[5],@b[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + + stp @a[0],@a[1],[$r_ptr] + cadd $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + cadd $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __add_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_rshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __rshift_mod_384 + cbnz $b_ptr,.Loop_rshift_mod_384 + + ldr c30,[csp,#__SIZEOF_POINTER__] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx @b[5],@a[0],#0,#1 + and @b[0],@b[5],@mod[0] + and @b[1],@b[5],@mod[1] + adds @a[0],@a[0],@b[0] + and @b[2],@b[5],@mod[2] + adcs @a[1],@a[1],@b[1] + and @b[3],@b[5],@mod[3] + adcs @a[2],@a[2],@b[2] + and @b[4],@b[5],@mod[4] + adcs @a[3],@a[3],@b[3] + and @b[5],@b[5],@mod[5] + adcs @a[4],@a[4],@b[4] + extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 + adcs @a[5],@a[5],@b[5] + extr @a[1],@a[2],@a[1],#1 + adc @b[5],xzr,xzr + extr @a[2],@a[3],@a[2],#1 + extr @a[3],@a[4],@a[3],#1 + extr @a[4],@a[5],@a[4],#1 + extr @a[5],@b[5],@a[5],#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __rshift_mod_384 + + ldr c30,[csp,#__SIZEOF_POINTER__] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_lshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __lshift_mod_384 + cbnz $b_ptr,.Loop_lshift_mod_384 + + ldr c30,[csp,#__SIZEOF_POINTER__] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr,#48] + ldp @b[2],@b[3],[$a_ptr,#64] + ldp @b[4],@b[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @mod[2],@mod[3],[$n_ptr,#16] + + subs @b[0],@mod[0],@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @mod[4],@mod[5],[$n_ptr,#32] + orr $carry,@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr $carry,$carry,@a[2] + sbcs @b[2],@mod[2],@a[2] + orr $carry,$carry,@a[3] + sbcs @b[3],@mod[3],@a[3] + orr $carry,$carry,@a[4] + sbcs @b[4],@mod[4],@a[4] + orr $carry,$carry,@a[5] + sbc @b[5],@mod[5],@a[5] + + cmp $carry,#0 + csetm $carry,ne + ands $b_ptr,$b_ptr,$carry + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + csel @a[3],@a[3],@b[3],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[4],@a[4],@b[4],eq + stp @a[2],@a[3],[$r_ptr,#16] + csel @a[5],@a[5],@b[5],eq + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + sbcs @a[2],@a[2],@b[2] + sbcs @a[3],@a[3],@b[3] + sbcs @a[4],@a[4],@b[4] + sbcs @a[5],@a[5],@b[5] + sbc $carry,xzr,xzr + + and @b[0],@mod[0],$carry + and @b[1],@mod[1],$carry + adds @a[0],@a[0],@b[0] + and @b[2],@mod[2],$carry + adcs @a[1],@a[1],@b[1] + and @b[3],@mod[3],$carry + adcs @a[2],@a[2],@b[2] + and @b[4],@mod[4],$carry + adcs @a[3],@a[3],@b[3] + and @b[5],@mod[5],$carry + adcs @a[4],@a[4],@b[4] + adc @a[5],@a[5],@b[5] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + + stp @a[0],@a[1],[$r_ptr] + cadd $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + cadd $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __sub_mod_384 + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + cadd $b_ptr,$a_ptr,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr c30,[csp,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $carry,$carry,xzr + + mvn $carry,$carry + and $carry,$carry,#2 + orr $r_ptr,$r_ptr,$carry + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + ldp @a[0],@a[1],[$r_ptr,#48] + ldp @a[2],@a[3],[$r_ptr,#64] + ldp @a[4],@a[5],[$r_ptr,#80] + + mvn 
@b[0],@b[0] + and @b[0],@b[0],#2 + orr $b_ptr,$b_ptr,@b[0] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $r_ptr,$r_ptr,@b[0] + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +if (1) { +sub vec_select { +my $sz = shift; +my @v=map("v$_",(0..5,16..21)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,%function +.align 5 +vec_select_$sz: + dup v6.2d, $n_ptr + ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 +___ +for($i=0; $i<$sz-48; $i+=48) { +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 + bit @v[1].16b, @v[4].16b, v6.16b + ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 +___ + @v = @v[6..11,0..5]; +} +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + bit @v[1].16b, @v[4].16b, v6.16b + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end, $step) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add $end, $end, $inp + sub $end, $end, #1 + mov $step, #64 + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + prfm pldl1keep, [$inp] + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $end; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [$inp], #16 + lsr $len, $len, #4 + sub $len, $len, #1 + cbz $len, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [$inp], #16 + orr v0.16b, v0.16b, v1.16b + sub $len, $len, #1 + cbnz $len, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + umov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ 
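Editor's reference model, not part of the blst sources: vec_is_zero_16x above ORs every 16-byte block into a single accumulator and only compares against zero at the end, so the running time does not depend on where (or whether) a non-zero byte occurs. A plain C equivalent, assuming num is a non-zero multiple of 16 and 64-bit-aligned data as in the callers:

    #include <stddef.h>
    #include <stdint.h>

    static int vec_is_zero_16x_ref(const void *ptr, size_t num)
    {
        const uint64_t *p = (const uint64_t *)ptr;
        uint64_t acc = 0;

        for (size_t i = 0; i < num / sizeof(uint64_t); i++)
            acc |= p[i];                 /* no data-dependent early exit */

        return acc == 0;
    }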
+} +{ +my ($inp1, $inp2, $len) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [$inp1], #16 + ld1 {v1.2d}, [$inp2], #16 + lsr $len, $len, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub $len, $len, #1 + cbz $len, .Loop_is_equal_done + ld1 {v1.2d}, [$inp1], #16 + ld1 {v2.2d}, [$inp2], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + umov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} + +print $code; + +close STDOUT; diff --git a/src/blst/src/asm/add_mod_384-x86_64.pl b/src/blst/src/asm/add_mod_384-x86_64.pl new file mode 100755 index 0000000000..47e5f1bc85 --- /dev/null +++ b/src/blst/src/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1566 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], 
@acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,\@function,4,"unwind" +.align 32 +rshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_rshift_mod_384: + call __rshift_mod_384 + dec %edx + jnz .Loop_rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,\@abi-omnipotent +.align 32 +__rshift_mod_384: + mov \$1, @acc[11] + mov 8*0($n_ptr), @acc[6] + and @acc[0], @acc[11] + mov 8*1($n_ptr), @acc[7] + neg @acc[11] + mov 8*2($n_ptr), @acc[8] + and @acc[11], @acc[6] + mov 8*3($n_ptr), @acc[9] + and @acc[11], @acc[7] + mov 8*4($n_ptr), @acc[10] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], @acc[10] + adc @acc[5], @acc[11] + sbb @acc[5], @acc[5] + + shr \$1, @acc[6] + mov @acc[7], @acc[0] + shr \$1, @acc[7] + mov @acc[8], @acc[1] + shr \$1, @acc[8] + mov @acc[9], @acc[2] + shr \$1, @acc[9] + mov @acc[10], @acc[3] + shr \$1, @acc[10] + mov @acc[11], @acc[4] + shr \$1, @acc[11] + shl \$63, @acc[0] + shl \$63, @acc[1] + or @acc[6], @acc[0] + shl \$63, @acc[2] + or @acc[7], @acc[1] + shl 
\$63, @acc[3] + or @acc[8], @acc[2] + shl \$63, @acc[4] + or @acc[9], @acc[3] + shl \$63, @acc[5] + or @acc[10], @acc[4] + or @acc[11], @acc[5] + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[6] +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,\@function,3,"unwind" +.align 32 +div_by_2_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov $b_org, $n_ptr + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + call __rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], 
@acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call 
__lshift_mod_384 + + mov (%rsp), $b_org +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? 
-1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 
8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc 
@acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*6($r_ptr), @acc[0] # sgn0(a->im) + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), %rax # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + mov 8*0(%rax), @acc[0] + mov 8*1(%rax), @acc[1] + mov 8*2(%rax), @acc[2] + mov 8*3(%rax), @acc[3] + mov 8*4(%rax), @acc[4] + mov 8*5(%rax), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + 
sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? "%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +if (1) { +my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") + : ("%rdi", "%rsi", "%rdx", "%ecx"); + +sub vec_select { +my $sz = shift; +my $half = $sz/2; +my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,\@abi-omnipotent +.align 32 +vec_select_$sz: + movd $select, %xmm5 + pxor %xmm4,%xmm4 + pshufd \$0,%xmm5,%xmm5 # broadcast +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + movdqu ($inp1),$xmm0 + lea $half($inp1),$inp1 + pcmpeqd %xmm4,%xmm5 + movdqu ($inp2),$xmm1 + lea $half($inp2),$inp2 + pcmpeqd %xmm5,%xmm4 + lea $half($out),$out +___ +for($i=0; $i<$sz-16; $i+=16) { +$code.=<<___; + pand %xmm4,$xmm0 + movdqu $i+16-$half($inp1),$xmm2 + pand %xmm5,$xmm1 + movdqu $i+16-$half($inp2),$xmm3 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) +___ + ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); +} +$code.=<<___; + pand %xmm4,$xmm0 + pand %xmm5,$xmm1 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end) = $win64 ? 
("%rcx", "%rdx") : ("%rdi", "%rsi"); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,\@abi-omnipotent +.align 32 +vec_prefetch: + leaq -1($inp,$end), $end + mov \$64, %rax + xor %r8, %r8 +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + prefetchnta ($inp) + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $win64 ? "%edx" : "%esi"; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,\@abi-omnipotent +.align 32 +vec_is_zero_16x: + shr \$4, $len +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + movdqu ($inp), %xmm0 + lea 16($inp), $inp + +.Loop_is_zero: + dec $len + jz .Loop_is_zero_done + movdqu ($inp), %xmm1 + lea 16($inp), $inp + por %xmm1, %xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ +} +{ +my ($inp1, $inp2, $len) = $win64 ? ("%rcx", "%rdx", "%r8d") + : ("%rdi", "%rsi", "%edx"); +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,\@abi-omnipotent +.align 32 +vec_is_equal_16x: + shr \$4, $len +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + movdqu ($inp1), %xmm0 + movdqu ($inp2), %xmm1 + sub $inp1, $inp2 + lea 16($inp1), $inp1 + pxor %xmm1, %xmm0 + +.Loop_is_equal: + dec $len + jz .Loop_is_equal_done + movdqu ($inp1), %xmm1 + movdqu ($inp1,$inp2), %xmm2 + lea 16($inp1), $inp1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} +print $code; +close STDOUT; diff --git a/src/blst/src/asm/add_mod_384x384-x86_64.pl b/src/blst/src/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 0000000000..64ca97e941 --- /dev/null +++ b/src/blst/src/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,250 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" 
+.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/arm-xlate.pl b/src/blst/src/asm/arm-xlate.pl new file mode 100755 index 0000000000..252b8b4d68 --- /dev/null +++ b/src/blst/src/asm/arm-xlate.pl @@ -0,0 +1,479 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ARM assembler distiller/adapter by \@dot-asm. + +use strict; + +################################################################ +# Recognized "flavour"-s are: +# +# linux[32|64] GNU assembler, effectively pass-through +# ios[32|64] global symbols' decorations, PIC tweaks, etc. +# win[32|64] Visual Studio armasm-specific directives +# coff[32|64] e.g. clang --target=arm-windows ... +# cheri64 L64P128 platform +# +my $flavour = shift; + $flavour = "linux" if (!$flavour or $flavour eq "void"); + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; +my $in_proc; # used with 'windows' flavour + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch +my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu + +my $rodata = sub { + SWITCH: for ($flavour) { + /linux|cheri/ && return ".section\t.rodata"; + /ios/ && return ".section\t__TEXT,__const"; + /coff/ && return ".section\t.rdata,\"dr\""; + /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; + last; + } +}; + +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } +} if ($flavour !~ /linux|cheri/); + +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0\n"; + $ret .= ".previous"; + $name = "_$name"; + } elsif ($flavour =~ /ios64/) { + $name = "_$name"; + $ret = ".comm\t$name,@args[1]"; + } elsif ($flavour =~ /win/) { + $ret = "\tCOMMON\t|$name|,@args[1]"; + } elsif ($flavour =~ /coff/) { + $ret = ".comm\t$name,@args[1]"; + } else { + $ret = ".comm\t".join(',',@args); + } + + $$global = $name; + $ret; +}; + +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; last; }; + /win/ && do { $ret = ""; last; }; + } + + $ret = ".globl $name" if (!defined($ret)); + $$global = $name; + $ret; +}; +my $global = $globl; + +my $extern = sub { + &$globl(@_); + if ($flavour =~ /win/) { + return "\tEXTERN\t@_"; + } + return; # return nothing +}; + +my $type = sub { + my $arg = join(',',@_); + my $ret; + + SWITCH: for ($flavour) { + /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = "#ifdef __thumb2__\n" . + ".thumb_func $1\n" . + "#endif"; + } + last; + }; + /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { + my $type = "[DATA]"; + if ($2 eq "function") { + $in_proc = $1; + $type = "[FUNC]"; + } + $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" + : ""; + } + last; + }; + /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = ".def $1;\n". + ".type 32;\n". + ".endef"; + } + last; + }; + } + return $ret; +} if ($flavour !~ /linux|cheri/); + +my $size = sub { + if ($in_proc && $flavour =~ /win/) { + $in_proc = undef; + return "\tENDP"; + } +} if ($flavour !~ /linux|cheri/); + +my $inst = sub { + if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } + else { ".long\t".join(',',@_); } +} if ($flavour !~ /linux|cheri/); + +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { if ($flavour =~ /win/) { + "\tDCB\t$line,0\n\tALIGN\t4"; + } else { + ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; + } + } else { ""; } +}; + +my $align = sub { + "\tALIGN\t".2**@_[0]; +} if ($flavour =~ /win/); + $align = sub { + ".p2align\t".@_[0]; +} if ($flavour =~ /coff/); + +my $byte = sub { + "\tDCB\t".join(',',@_); +} if ($flavour =~ /win/); + +my $short = sub { + "\tDCWU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $word = sub { + "\tDCDU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $long = $word if ($flavour =~ /win/); + +my $quad = sub { + "\tDCQU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $skip = sub { + "\tSPACE\t".shift; +} if ($flavour =~ /win/); + +my $code = sub { + "\tCODE@_[0]"; +} if ($flavour =~ /win/); + +my $thumb = sub { # .thumb should appear prior .text in source + "# define ARM THUMB\n" . + "\tTHUMB"; +} if ($flavour =~ /win/); + +my $text = sub { + "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); +} if ($flavour =~ /win/); + +my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax + +my $rva = sub { + # .rva directive comes in handy only on 32-bit Windows, i.e. it can + # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. + # However! Corresponding compilers don't seem to bet on PIC, which + # raises the question why would assembler programmer have to jump + # through the hoops? But just in case, it would go as following: + # + # ldr r1,.LOPENSSL_armcap + # ldr r2,.LOPENSSL_armcap+4 + # adr r0,.LOPENSSL_armcap + # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas + # sub r0,r0,r1 ; r0 is image base now + # ldr r0,[r0,r2] + # ... + #.LOPENSSL_armcap: + # .rva .LOPENSSL_armcap ; self-reference + # .rva OPENSSL_armcap_P ; real target + # + # Non-position-independent [and ISA-neutral] alternative is so much + # simpler: + # + # ldr r0,.LOPENSSL_armcap + # ldr r0,[r0] + # ... + #.LOPENSSL_armcap: + # .long OPENSSL_armcap_P + # + "\tDCDU\t@_[0]\n\tRELOC\t2" +} if ($flavour =~ /win(?!64)/); + +################################################################ +# some broken instructions in Visual Studio armasm[64]... + +my $it = sub {} if ($flavour =~ /win32/); # omit 'it' + +my $ext = sub { + "\text8\t".join(',',@_); +} if ($flavour =~ /win64/); + +my $csel = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsel$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +my $csetm = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsetm$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +# ... then conditional branch instructions are also broken, but +# maintaining all the variants is tedious, so I kludge-fix it +# elsewhere... 
+ +################################################################ +# CHERI-specific synthetic instructions +my $alignd = sub { + my ($args,$comment) = split(m|\s*//|,shift); + $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; + my @regs = split(m|,\s*|,$args); + + "\talignd\t".join(',',@regs); +}; + +my $scvalue = sub { + my ($args,$comment) = split(m|\s*//|,shift); + $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; + my @regs = split(m|,\s*|,$args); + @regs[2] =~ s/\bc([0-9])\b/x$1/; + + "\tscvalue\t".join(',',@regs); +}; + +my $cadd = sub { + my ($args,$comment) = split(m|\s*//|,shift); + if ($flavour =~ /cheri/) { + $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; + } else { + $args =~ s/\bc([0-9]+)\b/x$1/g; + } + my @regs = split(m|,\s*|,$args); + @regs[2] =~ s/c([0-9])/x$1/; + + "\tadd\t".join(',',@regs); +}; + +my $csub = sub { + my ($args,$comment) = split(m|\s*//|,shift); + if ($flavour =~ /cheri/) { + $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; + } else { + $args =~ s/\bc([0-9]+)\b/x$1/g; + } + my @regs = split(m|,\s*|,$args); + @regs[2] =~ s/c([0-9])/x$1/; + + "\tsub\t".join(',',@regs); +}; + +my $cmov = sub { + my $args = shift; + if ($flavour =~ /cheri/) { + $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; + } else { + $args =~ s/\bc([0-9]+)\b/x$1/g; + } + + "\tmov\t".$args; +}; + +my $adr = sub { + my $args = shift; + $args =~ s/\bx([0-9]+)\b/c$1/g; + + "\tadr\t".$args; +} if ($flavour =~ /cheri/); + +################################################################ +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + +my $paciasp = sub { + ($flavour =~ /linux|cheri/) ? "\t.inst\t0xd503233f" + : &$inst(0xd503233f); +}; + +my $autiasp = sub { + ($flavour =~ /linux|cheri/) ? "\t.inst\t0xd50323bf" + : &$inst(0xd50323bf); +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + if ($flavour =~ /cheri/) { + $line =~ s/\[\s*(?:x([0-9]+)|(sp))\s*(,?.*)\]/[c$1$2$3]/; + } else { + $line =~ s/\bc([0-9]+)\b/x$1/g; + $line =~ s/\bcsp\b/sp/g; + } + + if ($flavour =~ /win/) { + # adjust alignment hints, "[rN,:32]" -> "[rN@32]" + $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; + # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" + $line =~ s/\.(L\w{2,})/|\$$1|/g; + # omit "#:lo12:" on win64 + $line =~ s/#:lo12://; + } elsif ($flavour =~ /coff(?!64)/) { + $line =~ s/\.L(\w{2,})/(\$ML$1)/g; + } elsif ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + + if ($flavour =~ /64/) { + # "vX.Md[N]" -> "vX.d[N] + $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; + } + + return $line; +} + +if ($flavour =~ /win(32|64)/) { + print<<___; + GBLA __SIZEOF_POINTER__ +__SIZEOF_POINTER__ SETA $1/8 +___ +} + +while(my $line=<>) { + + if ($flavour =~ /win/) { + if ($line =~ m/^#\s*(ifdef|ifndef|else|endif)\b(.*)/) { + my ($op, $arg) = ($1, $2); + $op = "if :def:" if ($op eq "ifdef"); + $op = "if :lnot::def:" if ($op eq "ifndef"); + print " ".$op.$arg."\n"; + next; + } + $line =~ s|//.*||; + } + + # fix up 
assembler-specific commentary delimiter + $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); + + if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + $label = ($GLOBALS{$label} or $label); + if ($flavour =~ /win/) { + $label =~ s|^\.L(?=\w)|\$L|; + printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); + } else { + $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); + printf "%s:", $label; + } + } + } + + if ($line !~ m/^[#@;]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + if ($flavour =~ /win64/) { + # "b.cond" -> "bcond", kludge-fix:-( + $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; + } + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print "\tEND\n" if ($flavour =~ /win/); + +close STDOUT; diff --git a/src/blst/src/asm/ct_inverse_mod_256-armv8.pl b/src/blst/src/asm/ct_inverse_mod_256-armv8.pl new file mode 100755 index 0000000000..88b036f71c --- /dev/null +++ b/src/blst/src/asm/ct_inverse_mod_256-armv8.pl @@ -0,0 +1,610 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - +# on Cortex-A57. 
+# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(4..11)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); +my $cnt = $n_ptr; +my @t = map("x$_",(19..26)); +my ($a_lo, $b_lo) = @acc[3,7]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + paciasp + stp c29, c30, [csp,#-10*__SIZEOF_POINTER__]! + add c29, csp, #0 + stp c19, c20, [csp,#2*__SIZEOF_POINTER__] + stp c21, c22, [csp,#4*__SIZEOF_POINTER__] + stp c23, c24, [csp,#6*__SIZEOF_POINTER__] + stp c25, c26, [csp,#8*__SIZEOF_POINTER__] + sub csp, csp, #$frame + + ldp @acc[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + +#ifdef __CHERI_PURE_CAPABILITY__ + cadd $in_ptr, csp, #16+511 + alignd $in_ptr, $in_ptr, #9 +#else + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+#endif + str c0, [csp] // offload out_ptr + + ldp @acc[4], @acc[5], [$n_ptr,#8*0] + ldp @acc[6], @acc[7], [$n_ptr,#8*2] + + stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| + stp @acc[6], @acc[7], [$in_ptr,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_256_n_shift_by_31 + str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr @acc[4], [$in_ptr,#8*8] // |u| + ldr @acc[5], [$in_ptr,#8*13] // |v| + madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| + madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| + str @acc[0], [$out_ptr,#8*4] + asr @acc[1], @acc[0], #63 // sign extension + stp @acc[1], @acc[1], [$out_ptr,#8*5] + stp @acc[1], @acc[1], [$out_ptr,#8*7] + + madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| + madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| + str @acc[0], [$out_ptr,#8*9] + asr @acc[1], @acc[0], #63 // sign extension + stp @acc[1], @acc[1], [$out_ptr,#8*10] + stp @acc[1], @acc[1], [$out_ptr,#8*12] +___ +for($i=2; $i<15; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc @t[3], @t[3], @t[4] + str @t[3], [$out_ptr,#8*4] + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + cadd $out_ptr, $out_ptr, #8*5 // pointer to destination |v| + bl __smul_256x63 +___ +$code.=<<___ if ($i>7); + bl __smul_512x63_tail +___ +$code.=<<___ if ($i<=7); + adc @t[3], @t[3], @t[4] + stp @t[3], @t[3], [$out_ptr,#8*4] + stp @t[3], @t[3], [$out_ptr,#8*6] +___ +} +$code.=<<___; + ////////////////////////////////////////// two[!] 
last iterations + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + mov $cnt, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + ldr $b_lo, [$in_ptr,#8*4] + bl __inner_loop_62_256 + + mov $f_, $f1 + mov $g_, $g1 + ldr c0, [csp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr c30, [c29,#__SIZEOF_POINTER__] + + smulh @t[1], @acc[3], $g_ // figure out top-most limb + ldp @acc[4], @acc[5], [$nx_ptr,#8*0] + adc @t[4], @t[4], @t[6] + ldp @acc[6], @acc[7], [$nx_ptr,#8*2] + + add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 + asr @t[0], @t[1], #63 // sign as mask + + and @t[4], @acc[4], @t[0] // add mod<<256 conditionally + and @t[5], @acc[5], @t[0] + adds @acc[0], @acc[0], @t[4] + and @t[6], @acc[6], @t[0] + adcs @acc[1], @acc[1], @t[5] + and @t[7], @acc[7], @t[0] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @t[3], @t[7] + adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 + + neg @t[0], @t[1] + orr @t[1], @t[1], @t[0] // excess bit or sign as mask + asr @t[0], @t[0], #63 // excess bit as mask + + and @acc[4], @acc[4], @t[1] // mask |mod| + and @acc[5], @acc[5], @t[1] + and @acc[6], @acc[6], @t[1] + and @acc[7], @acc[7], @t[1] + + eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| + eor @acc[5], @acc[5], @t[0] + adds @acc[4], @acc[4], @t[0], lsr#63 + eor @acc[6], @acc[6], @t[0] + adcs @acc[5], @acc[5], xzr + eor @acc[7], @acc[7], @t[0] + adcs @acc[6], @acc[6], xzr + adc @acc[7], @acc[7], xzr + + adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adc @acc[3], @acc[3], @acc[7] + stp @acc[2], @acc[3], [$out_ptr,#8*6] + + add csp, csp, #$frame + ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] + ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] + ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] + ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] + ldr c29, [csp],#10*__SIZEOF_POINTER__ + autiasp + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*8+8*5*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldr @t[3+$j], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @t[3+$j], @t[3+$j], $f1 + umulh @t[0], @acc[0], $f_ + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $f_ + adcs @t[3+$j], @t[3+$j], xzr + umulh @t[2], @acc[2], $f_ +___ +$code.=<<___ if ($j!=0); + adc $g1, xzr, xzr // used in __smul_512x63_tail +___ +$code.=<<___; + mul @acc[0], @acc[0], $f_ + cmp $f_, #0 + mul @acc[1], @acc[1], $f_ + csel @t[3+$j], @t[3+$j], xzr, ne + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @t[5+$j], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[5+$j], @t[5+$j], @t[2] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], 
@t[7], xzr + + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @t[5], @t[5], @t[6] + stp @acc[2], @t[5], [$out_ptr,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh @t[5], @acc[3], $f_ + ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| + adc @t[7], @t[7], xzr + ldr @acc[3], [$in_ptr,#8*20] + and @t[3], @t[3], $f_ + + umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain + + sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain + asr @t[6], @t[5], #63 + + eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| + eor @acc[2], @acc[2], $f1 + adds @acc[1], @acc[1], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + umulh @t[0], @t[4], $g_ + adc @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $g_ + add @acc[7], @acc[7], @t[7] + umulh @t[2], @acc[2], $g_ + + mul @acc[0], @t[4], $g_ + mul @acc[1], @acc[1], $g_ + adds @acc[0], @acc[0], @acc[7] + mul @acc[2], @acc[2], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @t[3], @acc[3], $g_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[3], @t[3], @t[2] + adc @t[4], xzr, xzr // used in the final step + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adcs @t[3], @t[3], @t[6] // carry is used in the final step + stp @acc[2], @t[3], [$out_ptr,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*4*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) + + eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) + sub @t[6], @t[6], @t[5] + eor @acc[1], @acc[1], @t[5] + adds @acc[0], @acc[0], @t[5], lsr#63 + eor @acc[2], @acc[2], @t[5] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[5] + umulh @t[0], @acc[0], @t[6] + adcs @acc[2], @acc[2], xzr + umulh @t[1], @acc[1], @t[6] + adc @acc[3], @acc[3], xzr + umulh @t[2], @acc[2], @t[6] + and @t[5], @t[5], @t[6] + umulh @t[3+$j], @acc[3], @t[6] + neg @t[5], @t[5] + + mul @acc[0], @acc[0], @t[6] + mul @acc[1], @acc[1], @t[6] + mul @acc[2], @acc[2], @t[6] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], @t[1] + adcs @acc[3], @acc[3], @t[2] + adc @t[3+$j], @t[3+$j], @t[5] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + adcs @acc[3], @acc[3], @acc[7] + adc @acc[4], @t[3], @t[4] + + extr @acc[0], @acc[1], @acc[0], #31 + extr @acc[1], @acc[2], @acc[1], #31 + extr @acc[2], @acc[3], @acc[2], #31 + asr @t[4], @acc[4], #63 // result's sign as mask + extr @acc[3], @acc[4], @acc[3], #31 + + eor @acc[0], @acc[0], @t[4] // ensure the result is positive + eor @acc[1], @acc[1], @t[4] + adds @acc[0], @acc[0], @t[4], lsr#63 + eor @acc[2], @acc[2], @t[4] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[4] + adcs @acc[2], @acc[2], xzr + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adc @acc[3], @acc[3], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + + eor $f0, $f0, @t[4] // adjust |f/g| accordingly + eor $g0, $g0, @t[4] + sub $f0, $f0, @t[4] + sub $g0, $g0, 
@t[4] + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +___ + +{ +my @a = @acc[0..3]; +my @b = @acc[4..7]; +my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); + +$code.=<<___; +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*6] + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*4] + +.Lab_approximation_31_256_loaded: + orr @t[0], @a[3], @b[3] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[1], ne + orr @t[0], @a[3], @b[3] // and ones before top-most, ... + csel @b[2], @b[2], @b[1], ne + + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[0], ne + orr @t[0], @a[3], @b[3] // and one more, ... + csel @b[2], @b[2], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + neg @t[1], @t[0] + + lslv @a[3], @a[3], @t[0] // align high limbs to the left + lslv @b[3], @b[3], @t[0] + lsrv @a[2], @a[2], @t[1] + lsrv @b[2], @b[2], @t[1] + and @a[2], @a[2], @t[1], asr#6 + and @b[2], @b[2], @t[1], asr#6 + orr $a_lo, @a[3], @a[2] + orr $b_lo, @b[3], @b[2] + + bfxil $a_lo, @a[0], #0, #31 + bfxil $b_lo, @b[0], #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov $cnt, #31 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $fg1 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + cbnz $cnt, .Loop_31_256 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov @t[1], $g0 + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $f1, @t[3] + and @t[1], $g1, @t[3] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // 
|g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +foreach(split("\n",$code)) { + s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; + print $_,"\n"; +} +close STDOUT; diff --git a/src/blst/src/asm/ct_inverse_mod_256-x86_64.pl b/src/blst/src/asm/ct_inverse_mod_256-x86_64.pl new file mode 100755 index 0000000000..977a9d8f2a --- /dev/null +++ b/src/blst/src/asm/ct_inverse_mod_256-x86_64.pl @@ -0,0 +1,844 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. +# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15)); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edx"; + +$frame = 8*6+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 +.type ct_inverse_mod_256,\@function,4,"unwind" +.align 32 +ct_inverse_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
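+ # scratch layout: |a| at 8*0, |b| at 8*4, |u| at 8*8 and |v| at 8*13,
+ # kept in two halves 256 bytes apart; iterations alternate source and
+ # destination halves with the flip-flop "xor" instructions below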
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + + mov 8*0($n_ptr), @acc[4] # load modulus + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + mov 8*3($n_ptr), @acc[7] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + + mov @acc[4], 8*4(%rax) # copy modulus to |b| + mov @acc[5], 8*5(%rax) + mov @acc[6], 8*6(%rax) + mov @acc[7], 8*7(%rax) + mov %rax, $in_ptr + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*0(%rsp) # corrected |f0| + #mov $g0, 8*1(%rsp) # corrected |g0| + mov $f0, 8*8($out_ptr) # initialize |u| with |f0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + mov $f0, 8*9($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*8($in_ptr), @acc[0] # |u| + mov 8*13($in_ptr), @acc[4] # |v| + mov @acc[0], @acc[1] + imulq 8*0(%rsp), @acc[0] # |u|*|f0| + mov @acc[4], @acc[5] + imulq 8*1(%rsp), @acc[4] # |v|*|g0| + add @acc[4], @acc[0] + mov @acc[0], 8*4($out_ptr) # destination |u| + sar \$63, @acc[0] # sign extension + mov @acc[0], 8*5($out_ptr) + mov @acc[0], 8*6($out_ptr) + mov @acc[0], 8*7($out_ptr) + mov @acc[0], 8*8($out_ptr) + lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + imulq $f0, @acc[1] # |u|*|f1| + imulq $g0, @acc[5] # |v|*|g1| + add @acc[5], @acc[1] + mov @acc[1], 8*9($out_ptr) # destination |v| + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + mov @acc[1], 8*12($out_ptr) + mov @acc[1], 8*13($out_ptr) +___ +for($i=2; $i<15; $i++) { +my $smul_512x63 = $i>8 ? 
"__smulq_512x63" + : "__smulq_256x63"; +$code.=<<___; + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + mov $f0, 8*2(%rsp) # corrected |f1| + mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*0(%rsp), $f0 # |f0| + mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*4($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_256x63 + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*5($out_ptr),$out_ptr # pointer to destination |v| + call $smul_512x63 +___ +$code.=<<___ if ($i==8); + sar \$63, %rbp # sign extension + mov %rbp, 8*5($out_ptr) + mov %rbp, 8*6($out_ptr) + mov %rbp, 8*7($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$47, $cnt # 31 + 512 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*4($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + #mov $f1, 8*2(%rsp) + #mov $g1, 8*3(%rsp) + + #mov 8*0(%rsp), $f0 # |f0| + #mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_256x63 + + #mov 8*2(%rsp), $f0 # |f1| + #mov 8*3(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original |out_ptr| + call __smulq_512x63 + adc %rbp, %rdx # the excess limb of the result + + mov 8*5(%rsp), $in_ptr # original |nx_ptr| + mov %rdx, %rax + sar \$63, %rdx # result's sign as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + add @acc[0], @acc[4] # conditionally add |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + adc \$0, %rax + + mov %rax, %rdx + neg %rax + or %rax, %rdx # excess bit or sign as mask + sar \$63, %rax # excess bit as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + xor %rax, @acc[0] # conditionally negate |modulus| + xor %rcx, %rcx + xor %rax, @acc[1] + sub %rax, %rcx + xor %rax, @acc[2] + xor %rax, %rdx + add %rcx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, %rdx + + add @acc[0], @acc[4] # final adjustment for |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + + mov @acc[4], 8*4($out_ptr) # store absolute value + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx 
+.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +$code.=<<___; +.type __smulq_512x63,\@abi-omnipotent +.align 32 +__smulq_512x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), %rbp # sign limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, %rbp + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, %rbp + + mulq %rbx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov @acc[$i], 8*$i($out_ptr) + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, %rbp + neg %rbp + mulq %rbx + add %rax, @acc[3] + adc %rdx, %rbp + mov @acc[3], 8*3($out_ptr) + + mov 8*5($in_ptr), @acc[0] # load |v| + mov 8*6($in_ptr), @acc[1] + mov 8*7($in_ptr), @acc[2] + mov 8*8($in_ptr), @acc[3] + mov 8*9($in_ptr), @acc[4] + mov 8*10($in_ptr), @acc[5] + mov 8*11($in_ptr), @acc[6] + mov 8*12($in_ptr), @acc[7] + + mov $g0, $f0 + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $g0 # conditionally negate |g0| + add %rax, $g0 + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + + mulq $g0 + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<7; $i++) { +$code.=<<___; + mulq $g0 + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + imulq $g0 + add %rax, @acc[7] + adc \$0, %rdx # used in the final step + + mov %rbp, %rbx + sar \$63, %rbp # sign extension + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc %rbx, @acc[4] + adc %rbp, @acc[5] + adc %rbp, @acc[6] + adc %rbp, @acc[7] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 
8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,\@abi-omnipotent +.align 32 +__smulq_256x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*5*$j; +my @acc=@acc; @acc=@acc[4..7] if($j); +my $top="%rbp"; $top=$g0 if($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), $top # sign/excess limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| (or |v|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, $top + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, $top + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, $top + neg $top + mulq %rbx + add %rax, @acc[3] + adc %rdx, $top +___ +$code.=<<___ if ($j==0); + mov $g0, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] # accumulate |u|*|f0| + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc %rcx, %rbp + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov %rbp, 8*4($out_ptr) + + ret +.size __smulq_256x63,.-__smulq_256x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
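+#
+# As a rough illustration (Python-style pseudocode in the spirit of the
+# $python_ref above, with names of our own choosing), __smulq_256_n_shift_by_31
+# below effectively computes:
+#
+#   def smul_n_shift(a, b, f, g, k=31):
+#       r = (a*f + b*g) >> k    # signed sum of products, then the shift
+#       if r < 0:               # return |r| and negate f, g to match
+#           return -r, -f, -g
+#       return r, f, g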
+{ +$code.=<<___; +.type __smulq_256_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulq_256_n_shift_by_31: + mov $f0, 8*0($out_ptr) # offload |f0| + mov $g0, 8*1($out_ptr) # offload |g0| + mov $f0, %rbp +___ +for($j=0; $j<2; $j++) { +my $k = 8*4*$j; +my @acc=@acc; @acc=@acc[4..7] if ($j); +my $f0="%rbp"; $f0=$g0 if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| (or |g0|) + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |a| (or |b|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + and %rbx, $f0 + neg $f0 + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + mulq %rbx + add %rax, @acc[3] + adc %rdx, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc $g0, %rbp + + mov 8*0($out_ptr), $f0 # restore original |f0| + mov 8*1($out_ptr), $g0 # restore original |g0| + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, %rbp, @acc[3] + + sar \$63, %rbp # sign as mask + xor %rax, %rax + sub %rbp, %rax # sign as bit + + xor %rbp, @acc[0] # conditionally negate the result + xor %rbp, @acc[1] + xor %rbp, @acc[2] + xor %rbp, @acc[3] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + + xor %rbp, $f0 # conditionally negate |f0| + xor %rbp, $g0 # conditionally negate |g0| + add %rax, $f0 + add %rax, $g0 + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +___ +} + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31_256,\@abi-omnipotent +.align 32 +__ab_approximation_31_256: + mov 8*3($in_ptr), @a[2] # load |a| in reverse order + mov 8*7($in_ptr), @b[2] # load |b| in reverse order + mov 8*2($in_ptr), @a[1] + mov 8*6($in_ptr), @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*5($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*4($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + not %rax + and %rax, @a[2] + and %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31_256 + + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +___ +} +$code.=<<___; +.type __inner_loop_31_256,\@abi-omnipotent +.align 32 # comment and punish Coffee Lake by up to 40% +__inner_loop_31_256: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31_256: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31_256 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,\@abi-omnipotent +.align 32 +__inner_loop_62_256: + mov $cnt, %r15d + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov $f0, $g1 # |g1|=1 + mov $f0, %r14 + +.Loop_62_256: + xor $t0, $t0 + test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test %r14, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, %r15d + jnz .Loop_62_256 + + ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/ct_inverse_mod_384-armv8.pl b/src/blst/src/asm/ct_inverse_mod_384-armv8.pl new file mode 100755 index 0000000000..6b363e5d82 --- /dev/null +++ b/src/blst/src/asm/ct_inverse_mod_384-armv8.pl @@ -0,0 +1,640 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(3..14)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); +my $cnt = $n_ptr; +my @t = map("x$_",(22..28,2)); +my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; + +$frame = 32+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + paciasp + stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! + add c29, csp, #0 + stp c19, c20, [csp,#2*__SIZEOF_POINTER__] + stp c21, c22, [csp,#4*__SIZEOF_POINTER__] + stp c23, c24, [csp,#6*__SIZEOF_POINTER__] + stp c25, c26, [csp,#8*__SIZEOF_POINTER__] + stp c27, c28, [csp,#10*__SIZEOF_POINTER__] + sub csp, csp, #$frame + + ldp @t[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + ldp @acc[4], @acc[5], [$in_ptr,#8*4] + +#ifdef __CHERI_PURE_CAPABILITY__ + cadd $in_ptr, csp, #32+511 + alignd $in_ptr, $in_ptr, #9 +#else + add $in_ptr, sp, #32+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+#endif + stp c0, c3, [csp] // offload out_ptr, nx_ptr + + ldp @acc[6], @acc[7], [$n_ptr,#8*0] + ldp @acc[8], @acc[9], [$n_ptr,#8*2] + ldp @acc[10], @acc[11], [$n_ptr,#8*4] + + stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] + stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*8] + stp @acc[10], @acc[11], [$in_ptr,#8*10] + + ////////////////////////////////////////// first iteration + mov $cnt, #62 + bl .Lab_approximation_62_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_383_n_shift_by_62 + str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr @acc[4], [$in_ptr,#8*12] // |u| + ldr @acc[5], [$in_ptr,#8*18] // |v| + mul @acc[0], $f_, @acc[4] // |u|*|f0| + smulh @acc[1], $f_, @acc[4] + mul @acc[2], $g_, @acc[5] // |v|*|g0| + smulh @acc[3], $g_, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + asr @acc[2], @acc[1], #63 // sign extension + stp @acc[2], @acc[2], [$out_ptr,#8*8] + stp @acc[2], @acc[2], [$out_ptr,#8*10] + + mul @acc[0], $f0, @acc[4] // |u|*|f1| + smulh @acc[1], $f0, @acc[4] + mul @acc[2], $g0, @acc[5] // |v|*|g1| + smulh @acc[3], $g0, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*12] + asr @acc[2], @acc[1], #63 // sign extension + stp @acc[2], @acc[2], [$out_ptr,#8*14] + stp @acc[2], @acc[2], [$out_ptr,#8*16] +___ +for($i=2; $i<11; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |v| + bl __smul_383x63 +___ +$code.=<<___ if ($i>5); + bl __smul_767x63_tail +___ +$code.=<<___ if ($i==5); + asr @t[5], @t[5], #63 // sign extension + stp @t[5], @t[5], [$out_ptr,#8*6] + stp @t[5], @t[5], [$out_ptr,#8*8] + stp @t[5], @t[5], [$out_ptr,#8*10] +___ +} +$code.=<<___; + ////////////////////////////////////////// iteration before 
last + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + mov $cnt, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load + ldp $b_lo, $b_hi, [$in_ptr,#8*6] + bl __inner_loop_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + str $a_lo, [$out_ptr,#8*0] + str $b_lo, [$out_ptr,#8*6] + + mov $f_, $f0 // exact |f0| + mov $g_, $g0 // exact |g0| + mov $f0, $f1 + mov $g0, $g1 + cadd $out_ptr, $out_ptr, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov $f_, $f0 // exact |f1| + mov $g_, $g0 // exact |g1| + cadd $out_ptr, $out_ptr, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + mov $cnt, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + eor $a_hi, $a_hi, $a_hi + ldr $b_lo, [$in_ptr,#8*6] + eor $b_hi, $b_hi, $b_hi + bl __inner_loop_62 + + mov $f_, $f1 + mov $g_, $g1 + ldp c0, c15, [csp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr c30, [c29,#__SIZEOF_POINTER__] + + asr @t[0], @acc[5], #63 // sign as mask + ldp @acc[6], @acc[7], [$f0,#8*0] + ldp @acc[8], @acc[9], [$f0,#8*2] + ldp @acc[10], @acc[11], [$f0,#8*4] + + and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally + and @acc[7], @acc[7], @t[0] + adds @acc[0], @acc[0], @acc[6] + and @acc[8], @acc[8], @t[0] + adcs @acc[1], @acc[1], @acc[7] + and @acc[9], @acc[9], @t[0] + adcs @acc[2], @acc[2], @acc[8] + and @acc[10], @acc[10], @t[0] + adcs @acc[3], @acc[3], @acc[9] + and @acc[11], @acc[11], @t[0] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @acc[11] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + add csp, csp, #$frame + ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] + ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] + ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] + ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] + ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] + ldr c29, [csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
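+//
+// In terms of the $python_ref above (illustrative pseudocode only): one
+// __smul_383x63 call computes the low 384 bits of |u|*|f_| + |v|*|g_| and
+// hands its carry to __smul_767x63_tail, which completes the upper half
+// once |v| has grown past 384 bits.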
+.type __smul_383x63, %function +.align 5 +__smul_383x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*12+8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $f_ + eor @acc[5], @acc[5], $f1 + umulh @t[1], @acc[1], $f_ + adcs @acc[4], @acc[4], xzr + umulh @t[2], @acc[2], $f_ + adcs @acc[5], @acc[5], xzr + umulh @t[3], @acc[3], $f_ +___ +$code.=<<___ if ($j); + adc $g1, xzr, xzr // used in __smul_767x63_tail +___ +$code.=<<___; + umulh @t[4], @acc[4], $f_ + mul @acc[0], @acc[0], $f_ + mul @acc[1], @acc[1], $f_ + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $f_ + adcs @acc[3], @acc[3], @t[2] + mul @t[5+$j],@acc[5], $f_ + adcs @acc[4], @acc[4], @t[3] + adcs @t[5+$j],@t[5+$j],@t[4] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adcs @t[5], @t[5], @t[6] + stp @acc[4], @t[5], [$out_ptr,#8*4] + adc @t[6], @t[7], xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh @t[5], @acc[5], $f_ + ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| + umulh @acc[11],@acc[11], $g_ + ldp @acc[2], @acc[3], [$in_ptr,#8*26] + ldp @acc[4], @acc[5], [$in_ptr,#8*28] + + eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| + eor @acc[1], @acc[1], $f1 + eor @acc[2], @acc[2], $f1 + adds @acc[0], @acc[0], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[5], @acc[5], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $g_ + adcs @acc[4], @acc[4], xzr + umulh @t[1], @acc[1], $g_ + adc @acc[5], @acc[5], xzr + + umulh @t[2], @acc[2], $g_ + add @acc[11], @acc[11], @t[6] + umulh @t[3], @acc[3], $g_ + asr @t[6], @t[5], #63 + umulh @t[4], @acc[4], $g_ + mul @acc[0], @acc[0], $g_ + mul @acc[1], @acc[1], $g_ + mul @acc[2], @acc[2], $g_ + adds @acc[0], @acc[0], @acc[11] + mul @acc[3], @acc[3], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @acc[4], @acc[4], $g_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[5], @acc[5], $g_ + adcs @acc[3], @acc[3], @t[2] + adcs @acc[4], @acc[4], @t[3] + adc @acc[5], @acc[5], @t[4] + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @acc[3], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @t[6] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @t[6] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, 
%function +.align 5 +__smul_383_n_shift_by_62: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) + sub @t[7], @t[7], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], @t[7] + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], @t[7] + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], @t[7] + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], @t[7] + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], @t[7] + smulh @t[5+$j], @acc[5], @t[7] + mul @acc[0], @acc[0], @t[7] + mul @acc[1], @acc[1], @t[7] + mul @acc[2], @acc[2], @t[7] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[7] + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], @t[7] + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], @t[7] + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], xzr +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #62 + extr @acc[1], @acc[2], @acc[1], #62 + extr @acc[2], @acc[3], @acc[2], #62 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #62 + extr @acc[4], @acc[5], @acc[4], #62 + extr @acc[5], @acc[6], @acc[5], #62 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + eor $f0, $f0, @t[6] + eor $g0, $g0, @t[6] + sub $f0, $f0, @t[6] + sub $g0, $g0, @t[6] + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; + +$code.=<<___; +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp @a[4], @a[5], [$in_ptr,#8*4] + ldp @b[4], @b[5], [$in_ptr,#8*10] + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*8] + +.Lab_approximation_62_loaded: + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*6] + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... 
+ csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[1], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr @a[5], @a[5], @a[4] + orr @b[5], @b[5], @b[4] + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62: + sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + subs @t[2], $b_lo, $a_lo // |b_|-|a_| + and @t[0], $b_lo, @t[6] + sbc @t[3], $b_hi, $a_hi + and @t[1], $b_hi, @t[6] + subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + sbcs @t[5], $a_hi, @t[1] + mov @t[1], $g0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $b_hi, $b_hi, $a_hi, hs + csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $a_hi, @t[5], @t[3], hs + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + extr $a_lo, $a_hi, $a_lo, #1 + lsr $a_hi, $a_hi, #1 + and @t[0], $f1, @t[6] + and @t[1], $g1, @t[6] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/ct_is_square_mod_384-armv8.pl b/src/blst/src/asm/ct_is_square_mod_384-armv8.pl new file mode 100755 index 0000000000..493a3c3cab --- /dev/null +++ b/src/blst/src/asm/ct_is_square_mod_384-armv8.pl @@ -0,0 +1,411 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific Legendre symbol addition chain... 
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); +my @acc=map("x$_",(3..14)); +my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); +my @t = map("x$_",(21..28)); +my ($a_, $b_) = @acc[5,11]; + +$frame = 2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + paciasp + stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! + add c29, csp, #0 + stp c19, c20, [csp,#2*__SIZEOF_POINTER__] + stp c21, c22, [csp,#4*__SIZEOF_POINTER__] + stp c23, c24, [csp,#6*__SIZEOF_POINTER__] + stp c25, c26, [csp,#8*__SIZEOF_POINTER__] + stp c27, c28, [csp,#10*__SIZEOF_POINTER__] + sub csp, csp, #$frame + + ldp @acc[0], @acc[1], [x0,#8*0] // load input + ldp @acc[2], @acc[3], [x0,#8*2] + ldp @acc[4], @acc[5], [x0,#8*4] + + add $in_ptr, sp, #255 // find closest 256-byte-aligned spot + and $in_ptr, $in_ptr, #-256 // in the frame... 
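+ // scratch layout: modulus copied to |b| at 8*0, input to |a| at 8*6,
+ // in two halves 128 bytes apart; the loop flip-flops between the halves
+ // with the pointer "eor"s below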
+#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + + ldp @acc[6], @acc[7], [x1,#8*0] // load modulus + ldp @acc[8], @acc[9], [x1,#8*2] + ldp @acc[10], @acc[11], [x1,#8*4] + + stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*8] + stp @acc[4], @acc[5], [$in_ptr,#8*10] + stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*2] + stp @acc[10], @acc[11], [$in_ptr,#8*4] + + eor $L, $L, $L // init the Legendre symbol + mov $cnt, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub $cnt, $cnt, #1 + + eor $out_ptr, $in_ptr, #128 // pointer to dst |b| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $out_ptr, csp, $out_ptr +#endif + bl __smul_384_n_shift_by_30 + + mov $f1, $f0 // |f0| + mov $g1, $g0 // |g0| + cadd $out_ptr, $out_ptr, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp @acc[6], @acc[7], [$out_ptr,#-8*6] + eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| +#ifdef __CHERI_PURE_CAPABILITY__ + scvalue $in_ptr, csp, $in_ptr +#endif + and @t[6], @t[6], @acc[6] // if |a| was negative, + add $L, $L, @t[6], lsr#1 // adjust |L| + + cbnz $cnt, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr $a_, [$in_ptr,#8*6] // and loaded + //ldr $b_, [$in_ptr,#8*0] + mov $cnt, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr c30, [c29,#__SIZEOF_POINTER__] + + and x0, $L, #1 + eor x0, x0, #1 + + add csp, csp, #$frame + ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] + ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] + ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] + ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] + ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] + ldr c29, [csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +my $fx = $g1; $fx = $f1 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) + asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) + sub $fx, $fx, @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], $fx + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $fx + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], $fx + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], $fx + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], $fx + and @t[7], $fx, @t[6] + umulh @t[5+$j], @acc[5], $fx + neg @t[7], @t[7] + mul @acc[0], @acc[0], $fx + mul @acc[1], @acc[1], $fx + mul @acc[2], @acc[2], $fx + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $fx + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $fx + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], $fx + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], @t[7] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + 
adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #30 + extr @acc[1], @acc[2], @acc[1], #30 + extr @acc[2], @acc[3], @acc[2], #30 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #30 + extr @acc[4], @acc[5], @acc[4], #30 + extr @acc[5], @acc[6], @acc[5], #30 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; +my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); + +$code.=<<___; +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers + ldp @b[2], @b[3], [$in_ptr,#8*2] + + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... + csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] // and one more, ... 
+ csel @b[4], @b[4], @b[1], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[0], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr $a_, @a[5], @a[4] + orr $b_, @b[5], @b[4] + + bfxil $a_, @a[0], #0, #32 + bfxil $b_, @b[0], #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov $cnt, #30 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 + mov @t[0], $fg1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + csel $L, $L, @t[4], hs + lsr $a_, $a_, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + add $t[2], $b_, #2 + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + + cbnz $cnt, .Loop_30 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove the bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_30,.-__inner_loop_30 +___ +} + +{ +my ($a_, $b_) = (@acc[0], @acc[6]); +$code.=<<___; +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $L, $L, @t[4], hs + add $t[2], $b_, #2 + lsr $a_, $a_, #1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz $cnt, .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/ct_is_square_mod_384-x86_64.pl b/src/blst/src/asm/ct_is_square_mod_384-x86_64.pl new file mode 100755 index 0000000000..b2b42cbbcd --- /dev/null +++ b/src/blst/src/asm/ct_is_square_mod_384-x86_64.pl @@ -0,0 +1,498 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific Legendre symbol addition chain... 
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); +my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); +my @acc=map("%r$_",(8..15)); +my $L = "%rbp"; + +$frame = 8*3+2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 +.type ct_is_square_mod_384,\@function,2,"unwind" +.align 32 +ct_is_square_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot + and \$-256, %rax # in the frame... 
+ +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0(%rdi), @acc[0] # load input + mov 8*1(%rdi), @acc[1] + mov 8*2(%rdi), @acc[2] + mov 8*3(%rdi), @acc[3] + mov 8*4(%rdi), @acc[4] + mov 8*5(%rdi), @acc[5] + + mov 8*0(%rsi), @acc[6] # load modulus + mov 8*1(%rsi), @acc[7] + mov 8*2(%rsi), %rbx + mov 8*3(%rsi), %rcx + mov 8*4(%rsi), %rdx + mov 8*5(%rsi), %rdi + mov %rax, $in_ptr # pointer to source |a|b| + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov %rbx, 8*8(%rax) + mov %rcx, 8*9(%rax) + mov %rdx, 8*10(%rax) + mov %rdi, 8*11(%rax) + + xor $L, $L # initialize the Legendre symbol + mov \$24, %ecx # 24 is 768/30-1 + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + mov %ecx, 8*2(%rsp) # offload loop counter + + call __ab_approximation_30 + mov $f0, 8*0(%rsp) # offload |f0| and |g0| + mov $g0, 8*1(%rsp) + + mov \$128+8*6, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |b| + call __smulq_384_n_shift_by_30 + + mov 8*0(%rsp), $f1 # pop |f0| and |g0| + mov 8*1(%rsp), $g1 + lea -8*6($out_ptr),$out_ptr # pointer to destination |a| + call __smulq_384_n_shift_by_30 + + mov 8*2(%rsp), %ecx # re-load loop counter + xor \$128, $in_ptr # flip-flop pointer to source |a|b| + + and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| + shr \$1, @acc[6] + add @acc[6], $L + + sub \$1, %ecx + jnz .Loop_is_square + + ################################# last iteration + #call __ab_approximation_30 # |a| and |b| are exact, just load + #mov 8*0($in_ptr), @acc[0] # |a_| + mov 8*6($in_ptr), @acc[1] # |b_| + call __inner_loop_48 # 48 is 768%30+30 + + mov \$1, %rax + and $L, %rax + xor \$1, %rax # return value + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,\@abi-omnipotent +.align 32 +__smulq_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, %rbx # |f1| (or |g1|) + sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) + xor %rax, %rax + sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) + + xor %rdx, %rbx # conditionally negate |f1| (or |g1|) + add %rax, %rbx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov %rdx, @acc[6+$j] + and %rbx, @acc[6+$j] + mulq %rbx # |a|*|f1| (or |b|*|g1|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + neg @acc[6+$j] + mulq %rbx + add %rax, @acc[5] + adc %rdx, @acc[6+$j] +___ +$code.=<<___ if ($j==0); + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov $g1, %rdx 
+ + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc @acc[7], @acc[6] + + shrd \$30, @acc[1], @acc[0] + shrd \$30, @acc[2], @acc[1] + shrd \$30, @acc[3], @acc[2] + shrd \$30, @acc[4], @acc[3] + shrd \$30, @acc[5], @acc[4] + shrd \$30, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor %rbx, %rbx + sub @acc[6], %rbx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add %rbx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +___ +{ +my ($a_, $b_) = @acc[0..1]; +my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); +my $cnt = "%edi"; +{ +my @a = @acc[0..5]; +my @b = (@a[1..3], $t4, $t5, $g0); + +$code.=<<___; +.type __ab_approximation_30,\@abi-omnipotent +.align 32 +__ab_approximation_30: + mov 8*11($in_ptr), @b[5] # load |b| in reverse order + mov 8*10($in_ptr), @b[4] + mov 8*9($in_ptr), @b[3] + + mov @a[5], %rax + or @b[5], %rax # check top-most limbs, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[3], @a[4] + mov 8*8($in_ptr), @b[2] + cmovz @b[3], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... ones before top-most, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[2], @a[4] + mov 8*7($in_ptr), @b[1] + cmovz @b[2], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[1], @a[4] + mov 8*6($in_ptr), @b[0] + cmovz @b[1], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... 
+ cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[0], @a[4] + cmovz @b[0], @b[4] + + mov @a[5], %rax + or @b[5], %rax + bsr %rax, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[5] + cmovz @b[0], @b[5] + cmovz %rax, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[4], @a[5] # align second limb to the left + shldq %cl, @b[4], @b[5] + + mov \$0xFFFFFFFF00000000, %rax + mov @a[0]d, ${a_}d + mov @b[0]d, ${b_}d + and %rax, @a[5] + and %rax, @b[5] + or @a[5], ${a_} + or @b[5], ${b_} + + jmp __inner_loop_30 + + ret +.size __ab_approximation_30,.-__ab_approximation_30 +___ +} +$code.=<<___; +.type __inner_loop_30,\@abi-omnipotent +.align 32 +__inner_loop_30: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF + mov \$30, $cnt + +.Loop_30: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax # pre-"negate" |L| + mov $fg0, $t2 + mov $fg1, $t3 + mov $L, $t4 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + cmovz $t4, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 + sub $bias, $fg1 + + sub \$1, $cnt + jnz .Loop_30 + + shr \$32, $bias + mov %ebx, %eax # $fg0 -> $f0 + shr \$32, $g0 + mov %ecx, %edx # $fg1 -> $f1 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret # __SGX_LVI_HARDENING_CLOBBER__=$a_ +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,\@abi-omnipotent +.align 32 +__inner_loop_48: + mov \$48, $cnt # 48 is 768%30+30 + +.Loop_48: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax + mov $L, $t2 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 + + sub \$1, $cnt + jnz .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/ctq_inverse_mod_384-x86_64.pl b/src/blst/src/asm/ctq_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000000..89a12508ec --- /dev/null +++ b/src/blst/src/asm/ctq_inverse_mod_384-x86_64.pl @@ -0,0 +1,896 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific FLT addition chain... 
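The reference below batches the 766 divsteps this implementation performs into full 62-step rounds (each round is one __ab_approximation_62 plus the shift/update subroutines), with the leftover steps handled by one short final round. A small arithmetic sketch of that schedule, restating nothing beyond the counts already visible in the code:

    k = 62
    full_rounds = 766 // k        # 12 rounds of 62 divsteps each
    tail_steps  = 766 %  k        # 22 steps, run by the final __inner_loop_62 call
    assert full_rounds * k + tail_steps == 766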
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$code.=<<___ if ($flavour =~ /masm/); +.extern ct_inverse_mod_383\$1 +___ + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.comm __blst_platform_cap,4 +.text + +.globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 +.type ct_inverse_mod_383,\@function,4,"unwind" +.align 32 +ct_inverse_mod_383: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz ct_inverse_mod_383\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr # pointer to source |a|b|1|0| + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<11; $i++) { +my $smul_767x63 = $i>5 ? 
"__smulq_767x63" + : "__smulq_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==5); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# iteration before last + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + mov 8*1($in_ptr), @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + mov 8*7($in_ptr), @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + mov @acc[0], 8*0($out_ptr) + mov @acc[2], 8*6($out_ptr) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*12($out_ptr),$out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call __smulq_767x63 + + ################################# last iteration + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$22, $cnt # 766 % 62 + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulq_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc 
@acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +___ +######################################################################## +# see corresponding commentary in ctx_inverse_mod_384-x86_64... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulq_767x63,\@abi-omnipotent +.align 32 +__smulq_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor $f0, $fx # conditionally negate |f0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] + mov @acc[$i], 8*$i($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + mov @acc[5], 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + mov $f0, $fx # overrides in_ptr + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $fx # conditionally negate |g0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + xor $f0, @acc[8] + xor $f0, @acc[9] + xor $f0, @acc[10] + xor $f0, @acc[11] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulq $fx # |v|*|g0| + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<11; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} 
+$code.=<<___; + mov 8*1(%rsp), %rdx # out_ptr + imulq $fx, %rax + mov 8*2(%rsp), $in_ptr # restore original in_ptr + add @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulq_767x63,.-__smulq_767x63 +___ +} +$code.=<<___; +.type __smulq_383x63,\@abi-omnipotent +.align 32 +__smulq_383x63: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |u| (or |v|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| (or |v|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx, %rax + add %rax, @acc[$i] + + lea 8*6($in_ptr), $in_ptr # pointer to |v| + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx, %rax + add %rax, @acc[$i] + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_383x63,.-__smulq_383x63 +___ +{ +$code.=<<___; +.type __smulq_383_n_shift_by_62,\@abi-omnipotent +.align 32 +__smulq_383_n_shift_by_62: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| (or |g0|) + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add 
@acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |a|*|f0| (or |b|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov %rdx, @acc[6] + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$62, @acc[1], @acc[0] + shrd \$62, @acc[2], @acc[1] + shrd \$62, @acc[3], @acc[2] + shrd \$62, @acc[4], @acc[3] + shrd \$62, @acc[5], @acc[4] + shrd \$62, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_62,\@abi-omnipotent +.align 32 +__ab_approximation_62: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*2($in_ptr), @a[0] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*0($in_ptr), @a[0] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + jmp __inner_loop_62 + + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62,\@abi-omnipotent +.align 8 +.long 0 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + mov $in_ptr, 8(%rsp) + +.Loop_62: + xor $t0, $t0 + xor $t1, $t1 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t2 + mov $b_hi, $t3 + cmovnz $b_lo, $t0 + cmovnz $b_hi, $t1 + sub $a_lo, $t2 # |b_|-|a_| + sbb $a_hi, $t3 + mov $a_lo, $t4 + mov $a_hi, $t5 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + sbb $t1, $a_hi + cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t3, $a_hi + cmovc $t4, $b_lo # |b_| = |a_| + cmovc $t5, $b_hi + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shrd \$1, $a_hi, $a_lo + shr \$1, $a_hi + test \$1, $t4 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + mov 8(%rsp), $in_ptr + ret # __SGX_LVI_HARDENING_CLOBBER__=$t0 +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/ctx_inverse_mod_384-x86_64.pl b/src/blst/src/asm/ctx_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000000..4e1b2d4c98 --- /dev/null +++ b/src/blst/src/asm/ctx_inverse_mod_384-x86_64.pl @@ -0,0 +1,1007 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >4x better than +# modulus-specific FLT addition chain... 
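This variant keeps the per-round |f| and |g| factors packed into a single 64-bit register with a bias, which is where the 0x7FFFFFFF80000000 and 0x800000007FFFFFFF constants in __inner_loop_31 further down come from. A small sketch of that encoding, using a hypothetical pack/unpack pair for illustration only:

    BIAS = 0x7FFFFFFF

    def pack_fg(f, g):
        # keep both signed factors biased so they stay non-negative in the register
        return ((g + BIAS) << 32) | (f + BIAS)

    def unpack_fg(fg):
        return (fg & 0xFFFFFFFF) - BIAS, (fg >> 32) - BIAS

    assert pack_fg(1, 0) == 0x7FFFFFFF80000000   # the |f0|=1, |g0|=0 constant
    assert pack_fg(0, 1) == 0x800000007FFFFFFF   # the |f1|=0, |g1|=1 constant
    assert unpack_fg(pack_fg(-3, 5)) == (-3, 5)

At the end of the inner loop the code subtracts the same 0x7FFFFFFF bias from each 32-bit half to recover the signed |f0|, |g0|, |f1|, |g1| values.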
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulx_383_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulx_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$code.=<<___ if ($flavour =~ /masm/); +.globl ct_inverse_mod_383\$1 +___ + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ctx_inverse_mod_383 +.hidden ctx_inverse_mod_383 +.type ctx_inverse_mod_383,\@function,4,"unwind" +.align 32 +ctx_inverse_mod_383: +.cfi_startproc +ct_inverse_mod_383\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<23; $i++) { +my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" + : "__smulx_191_n_shift_by_31"; +my $smul_767x63 = $i>11 ? 
"__smulx_767x63" + : "__smulx_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call $smul_n_shift + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call $smul_n_shift + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulx_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==11); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$53, $cnt # 31 + 766 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __tail_loop_53 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulx_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulx_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulx_767x63,\@abi-omnipotent +.align 32 +__smulx_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, %rax + sar \$63, %rax # |f0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor %rax, $f0 # conditionally negate |f0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |u| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |u|*|f0| + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + mov $g0, %rax + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + sar \$63, %rax # |g0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |g0|'s sign as bit + + xor %rax, $f0 # conditionally negate |g0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |v| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor %rax, @acc[5] + xor %rax, @acc[6] + xor %rax, @acc[7] + xor %rax, @acc[8] + xor %rax, @acc[9] + xor %rax, @acc[10] + xor %rax, @acc[11] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulx @acc[0], @acc[0], %rax # |v|*|g0| + mulx @acc[1], @acc[1], $fx + add %rax, @acc[1] +___ +for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { +$code.=<<___; + mulx 
@acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + mulx @acc[11], @acc[11], $fx + mov 8*1(%rsp), %rdx # out_ptr + mov 8*2(%rsp), $in_ptr # restore original in_ptr + adc @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulx_767x63,.-__smulx_767x63 +___ +} +$code.=<<___; +.type __smulx_383x63,\@abi-omnipotent +.align 32 +__smulx_383x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $fx, $f0 # conditionally negate |f0| + add %rax, $f0 + + xor $fx, @acc[0] # conditionally negate |u| (or |v|) + xor $fx, @acc[1] + xor $fx, @acc[2] + xor $fx, @acc[3] + xor $fx, @acc[4] + xor $fx, @acc[5] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) + mulx @acc[1], @acc[1], %rax + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + mulx @acc[$i], @acc[$i], %rax + mov $g0, $f0 + adc $fx, @acc[$i] + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + mulx @acc[$i], @acc[$i], %rax + adc $fx, @acc[$i] + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulx_383x63,.-__smulx_383x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
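In terms of the Python reference at the top of the file, one call to such a subroutine performs the following update on a multi-limb value, folding the sign of the shifted result back into the single-limb factors; this is a compact model of the step, not a drop-in replacement for the assembly:

    def smul_n_shift(a, b, f, g, k=31):
        # t = (a*f + b*g) >> k, with |t| returned and the sign pushed into f, g
        t = (a*f + b*g) >> k
        if t < 0:
            t, f, g = -t, -f, -g
        return t, f, g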
+{ +$code.=<<___; +.type __smulx_383_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_383_n_shift_by_31: + mov $f0, @acc[8] + xor @acc[6], @acc[6] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc %rdx, @acc[6] + + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) +___ +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), %rax + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, @acc[4], @acc[3] + shrd \$31, %rax, @acc[4] + shrd \$31, @acc[6], %rax + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +___ +} { +$code.=<<___; +.type __smulx_191_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_191_n_shift_by_31: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +my @acc=@acc; + @acc=@acc[3..5] if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor @acc[2], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[2] + add $fx, @acc[1] + adc \$0, @acc[2] 
+ imulq %rdx + add %rax, @acc[2] + adc \$0, %rdx +___ +$code.=<<___ if ($j==0); + mov %rdx, @acc[6] + mov $g0, %rdx +___ +} +$code.=<<___; + add @acc[0], @acc[3] + adc @acc[1], @acc[4] + adc @acc[2], @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[4], @acc[3] + shrd \$31, @acc[5], @acc[4] + shrd \$31, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[3] # conditionally negate the result + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[3], 8*0($out_ptr) + mov @acc[4], 8*1($out_ptr) + mov @acc[5], 8*2($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31,\@abi-omnipotent +.align 32 +__ab_approximation_31: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*2($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*1($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + andn @a[2], %rax, @a[2] + andn @b[2], %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31 + + ret +.size __ab_approximation_31,.-__ab_approximation_31 +___ +} +$code.=<<___; +.type __inner_loop_31,\@abi-omnipotent +.align 32 +__inner_loop_31: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo +.size __inner_loop_31,.-__inner_loop_31 + +.type __tail_loop_53,\@abi-omnipotent +.align 32 +__tail_loop_53: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + +.Loop_53: + xor $t0, $t0 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test \$1, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_53 + + ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo +.size __tail_loop_53,.-__tail_loop_53 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/div3w-armv8.pl b/src/blst/src/asm/div3w-armv8.pl new file mode 100755 index 0000000000..4a30457c05 --- /dev/null +++ b/src/blst/src/asm/div3w-armv8.pl @@ -0,0 +1,125 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +{ +my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); +my @div = map("x$_",(3..4)); +my @acc = map("x$_",(5..7)); +my @t = map("x$_",(8..11)); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp @div[0],@div[1],[$divisor] + + mul @acc[0],@div[0],$quot // divisor[0:1} * quotient + umulh @acc[1],@div[0],$quot + mul @t[3], @div[1],$quot + umulh @acc[2],@div[1],$quot + + ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend + ldr @t[2],[$div_rem,#16] + + adds @acc[1],@acc[1],@t[3] + adc @acc[2],@acc[2],xzr + + subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient + sbcs @t[1],@t[1],@acc[1] + sbcs @t[2],@t[2],@acc[2] + sbc @acc[0],xzr,xzr // borrow -> mask + + add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... + and @div[0],@div[0],@acc[0] + and @div[1],@div[1],@acc[0] + adds @t[0],@t[0],@div[0] // ... and add divisor + adc @t[1],@t[1],@div[1] + + stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder + str $quot,[$div_rem,#16] // and one limb of the quotient + + mov x0,$quot // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr @div[0],[$divisor] + ldr @t[0],[$div_rem] // load 1 limb of the dividend + + mul @acc[0],@div[0],$quot // divisor * quotient + + sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient + + stp @t[0],$quot,[$div_rem] // save remainder and quotient + + mov x0,$quot // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/div3w-x86_64.pl b/src/blst/src/asm/div3w-x86_64.pl new file mode 100755 index 0000000000..dbbcff0822 --- /dev/null +++ b/src/blst/src/asm/div3w-x86_64.pl @@ -0,0 +1,205 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$c_ref=<<'___'; +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) +{ + llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; + llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; + limb_t Q = 0, mask; + size_t i; + + for (i = 0; i < LIMB_BITS; i++) { + Q <<= 1; + mask = (R >= D); + Q |= mask; + R -= (D & ((llimb_t)0 - mask)); + D >>= 1; + } + + mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ + + Q <<= 1; + Q |= (R >= D); + + return (Q | mask); +} +___ + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,\@function,3,"unwind" +.align 32 +div_3_limbs: +.cfi_startproc +.cfi_end_prologue +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov (%rdi),%r8 # load R.lo + mov 8(%rdi),%r9 # load R.hi + xor %rax,%rax # Q = 0 + mov \$64,%ecx # loop counter + +.Loop: + mov %r8,%r10 # put aside R + sub %rsi,%r8 # R -= D + mov %r9,%r11 + sbb %rdx,%r9 + lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit + mov %rdx,%rdi + cmovc %r10,%r8 # restore R if R - D borrowed + cmovc %r11,%r9 + sbb \$0,%rax # subtract speculative bit + shl \$63,%rdi + shr \$1,%rsi + shr \$1,%rdx + or %rdi,%rsi # D >>= 1 + sub \$1,%ecx + jnz .Loop + + lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit + sar \$63,%rax # top bit -> mask + + sub %rsi,%r8 # R -= D + sbb %rdx,%r9 + sbb \$0,%rcx # subtract speculative bit + + or %rcx,%rax # all ones if overflow + +.cfi_epilogue + ret +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +___ +######################################################################## +# Calculate remainder and adjust the quotient, which can be off-by-one. +# Then save quotient in limb next to top limb of the remainder. There is +# place, because the remainder/next-iteration-dividend gets shorter by +# one limb. 
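+# For illustration only, a rough C model of what quot_rem_128 below is
+# expected to compute (limb_t/llimb_t/LIMB_BITS as in the reference code
+# above; the helper name is ours and not part of the build):
+#
+# limb_t quot_rem_128_ref(limb_t *div_rem, const limb_t *divisor, limb_t quot)
+# {
+#     llimb_t lo = (llimb_t)divisor[0] * quot;
+#     llimb_t hi = (llimb_t)divisor[1] * quot + (lo >> LIMB_BITS);
+#     limb_t prod[3] = { (limb_t)lo, (limb_t)hi, (limb_t)(hi >> LIMB_BITS) };
+#     limb_t rem[3], borrow = 0, mask;
+#     size_t i;
+#
+#     for (i = 0; i < 3; i++) {   /* dividend - divisor*quotient */
+#         llimb_t t = (llimb_t)div_rem[i] - prod[i] - borrow;
+#         rem[i] = (limb_t)t;
+#         borrow = (limb_t)(t >> LIMB_BITS) & 1;
+#     }
+#
+#     mask = (limb_t)0 - borrow;  /* borrow means quotient was one too big */
+#     quot += mask;               /* ... so decrement it ... */
+#     lo = (llimb_t)rem[0] + (divisor[0] & mask);
+#     div_rem[0] = (limb_t)lo;    /* ... and add the divisor back */
+#     div_rem[1] = rem[1] + (divisor[1] & mask) + (limb_t)(lo >> LIMB_BITS);
+#     div_rem[2] = quot;          /* quotient goes next to the remainder */
+#
+#     return quot;
+# }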
+{ +my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); +my @acc = ("%r8", "%r9", "%rdx"); +my @tmp = ("%r10", "%r11", "%rax"); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,\@function,3,"unwind" +.align 32 +quot_rem_128: +.cfi_startproc +.cfi_end_prologue +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov %rdx, %rax + mov %rdx, $quotient + + mulq 0($divisor) # divisor[0:1] * quotient + mov %rax, @acc[0] + mov $quotient, %rax + mov %rdx, @acc[1] + + mulq 8($divisor) + add %rax, @acc[1] + adc \$0, %rdx # %rdx is @acc[2] + + mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend + mov 8($div_rem), @tmp[1] + mov 16($div_rem), @tmp[2] + + sub @acc[0], @tmp[0] # dividend - divisor * quotient + sbb @acc[1], @tmp[1] + sbb @acc[2], @tmp[2] + sbb @acc[0], @acc[0] # borrow -> mask + + add @acc[0], $quotient # if borrowed, adjust the quotient ... + mov @acc[0], @acc[1] + and 0($divisor), @acc[0] + and 8($divisor), @acc[1] + add @acc[0], @tmp[0] # ... and add divisor + adc @acc[1], @tmp[1] + + mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... + mov @tmp[1], 8($div_rem) + mov $quotient, 16($div_rem) # ... and 1 limb of the quotient + + mov $quotient, %rax # return adjusted quotient + +.cfi_epilogue + ret +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + +######################################################################## +# Unlike 128-bit case above, quotient is exact. As result just one limb +# of the dividend is sufficient to calculate the remainder... + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,\@function,3,"unwind" +.align 32 +quot_rem_64: +.cfi_startproc +.cfi_end_prologue +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov %rdx, %rax # return quotient + imulq 0($divisor), %rdx # divisor[0] * quotient + + mov 0($div_rem), @tmp[0] # load 1 limb of the dividend + + sub %rdx, @tmp[0] # dividend - divisor * quotient + + mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... + mov %rax, 8($div_rem) # ... and 1 limb of the quotient + +.cfi_epilogue + ret +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/mul_mont_256-armv8.pl b/src/blst/src/asm/mul_mont_256-armv8.pl new file mode 100755 index 0000000000..aa14a34fb6 --- /dev/null +++ b/src/blst/src/asm/mul_mont_256-armv8.pl @@ -0,0 +1,409 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod=map("x$_",(5..8)); +$bi="x9"; +@a=map("x$_",(10..13)); +@tmp=map("x$_",(14..17)); +@acc=map("x$_",(19..24)); +$m0=$n_ptr; + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! 
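+	// Word-by-word Montgomery multiplication: each of the four outer
+	// iterations multiplies |a| by one limb of |b| and folds in one
+	// reduction step (the multiple of the modulus chosen via n0), with
+	// a single conditional subtraction of the modulus at the very end.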
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + mul @acc[0],@a[0],$bi + ldp @mod[0],@mod[1],[$n_ptr] + mul @acc[1],@a[1],$bi + ldp @mod[2],@mod[3],[$n_ptr,#16] + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],xzr, @tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adc @acc[4],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adc @acc[4],@acc[4],xzr + + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldr c29,[csp],#8*__SIZEOF_POINTER__ + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +___ +{ +my @acc = (@a,@acc[0..3]); +my @a = @mod; + +$code.=<<___; +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + paciasp + stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
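+	// Squaring: compute the off-diagonal products once, double them,
+	// add the diagonal a[i]*a[i] terms (see the diagram below), then
+	// Montgomery-reduce the 512-bit result via __mul_by_1_mont_256 and
+	// fold in its upper half.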
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mov $n0,$n_ptr + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul @acc[1],@a[1],@a[0] // a[1]*a[0] + umulh @tmp[1],@a[1],@a[0] + mul @acc[2],@a[2],@a[0] // a[2]*a[0] + umulh @tmp[2],@a[2],@a[0] + mul @acc[3],@a[3],@a[0] // a[3]*a[0] + umulh @acc[4],@a[3],@a[0] + + adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication + mul @tmp[0],@a[2],@a[1] // a[2]*a[1] + umulh @tmp[1],@a[2],@a[1] + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@a[3],@a[1] // a[3]*a[1] + umulh @tmp[3],@a[3],@a[1] + adc @acc[4],@acc[4],xzr // can't overflow + + mul @acc[5],@a[3],@a[2] // a[3]*a[2] + umulh @acc[6],@a[3],@a[2] + + adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication + mul @acc[0],@a[0],@a[0] // a[0]*a[0] + adc @tmp[2],@tmp[3],xzr // can't overflow + + adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication + umulh @a[0],@a[0],@a[0] + adcs @acc[4],@acc[4],@tmp[1] + mul @tmp[1],@a[1],@a[1] // a[1]*a[1] + adcs @acc[5],@acc[5],@tmp[2] + umulh @a[1],@a[1],@a[1] + adc @acc[6],@acc[6],xzr // can't overflow + + adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 + mul @tmp[2],@a[2],@a[2] // a[2]*a[2] + adcs @acc[2],@acc[2],@acc[2] + umulh @a[2],@a[2],@a[2] + adcs @acc[3],@acc[3],@acc[3] + mul @tmp[3],@a[3],@a[3] // a[3]*a[3] + adcs @acc[4],@acc[4],@acc[4] + umulh @a[3],@a[3],@a[3] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adc @acc[7],xzr,xzr + + adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] + adcs @acc[2],@acc[2],@tmp[1] + adcs @acc[3],@acc[3],@a[1] + adcs @acc[4],@acc[4],@tmp[2] + adcs @acc[5],@acc[5],@a[2] + adcs @acc[6],@acc[6],@tmp[3] + adc @acc[7],@acc[7],@a[3] + + bl __mul_by_1_mont_256 + ldr c30,[c29,#__SIZEOF_POINTER__] + + adds @acc[0],@acc[0],@acc[4] // accumulate upper half + adcs @acc[1],@acc[1],@acc[5] + adcs @acc[2],@acc[2],@acc[6] + adcs @acc[3],@acc[3],@acc[7] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldr c29,[csp],#6*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +} +{ +my @a = (@a, $bi); + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + paciasp + stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! 
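+	// Conversion out of the Montgomery domain: one multiplication by 1
+	// with full reduction (__mul_by_1_mont_256), followed by a final
+	// conditional subtraction of the modulus.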
+ add c29,csp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr c30,[c29,#__SIZEOF_POINTER__] + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr c29,[csp],#2*__SIZEOF_POINTER__ + autiasp + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + paciasp + stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! + add c29,csp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp @tmp[0],@tmp[1],[$a_ptr,#32] + ldp @tmp[2],@tmp[3],[$a_ptr,#48] + + adds @a[0],@a[0],@tmp[0] + adcs @a[1],@a[1],@tmp[1] + adcs @a[2],@a[2],@tmp[2] + adcs @a[3],@a[3],@tmp[3] + adc @a[4],xzr,xzr + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + sbcs xzr, @a[4],xzr + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr c29,[csp],#2*__SIZEOF_POINTER__ + autiasp + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul $m0,$n0,@a[0] + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + mul $m0,$n0,@a[0] + adc @a[3],@a[4],@tmp[3] +___ +} +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + adc @a[3],@a[4],@tmp[3] + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 +___ +} + +print $code; + +close STDOUT; diff --git a/src/blst/src/asm/mul_mont_384-armv8.pl b/src/blst/src/asm/mul_mont_384-armv8.pl new file mode 100755 index 0000000000..425b3a28b7 --- /dev/null +++ b/src/blst/src/asm/mul_mont_384-armv8.pl @@ -0,0 +1,2015 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod = map("x$_",(5..10)); +@a = map("x$_",(11..16)); +$bi = "x17"; +@acc = map("x$_",(19..25)); +@tmp = map("x$_",(26..28,0,1,3)); + +$code.=<<___; +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + paciasp + stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384x384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldr c29,[csp],#8*__SIZEOF_POINTER__ + autiasp + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + adcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + adcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + adcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + stp @a[0],@a[1],[$r_ptr,#48] + csel @a[4],@a[4],@acc[4],lo + stp @a[2],@a[3],[$r_ptr,#64] + csel @a[5],@a[5],@acc[5],lo + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + paciasp + stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! 
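+	// 768-bit modular subtraction: subtract limb-wise and, if the
+	// subtraction borrowed, add the 384-bit modulus back into the
+	// upper half of the result (see __sub_mod_384x384 below).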
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384x384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldr c29,[csp],#8*__SIZEOF_POINTER__ + autiasp + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + sbcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + sbcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + sbcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[5],@a[5],@acc[5],lo + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + 
adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + sub csp,csp,#288 // space for 3 768-bit vectors + + cmov @tmp[0],$r_ptr // save r_ptr + cmov @tmp[1],$a_ptr // save b_ptr + cmov @tmp[2],$b_ptr // save b_ptr + + cadd $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + cadd $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) + cadd $b_ptr,$b_ptr,#48 + cadd $r_ptr,sp,#96 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + csub $b_ptr,$a_ptr,#48 + cadd $r_ptr,sp,#240 + bl __add_mod_384 + + cadd $a_ptr,@tmp[2],#0 + cadd $b_ptr,@tmp[2],#48 + cadd $r_ptr,sp,#192 // t2 + bl __add_mod_384 + + cadd $a_ptr,$r_ptr,#0 + cadd $b_ptr,$r_ptr,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + cmov $a_ptr,$r_ptr + cadd $b_ptr,sp,#0 + bl __sub_mod_384x384 + + cadd $b_ptr,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + cadd $a_ptr,sp,#0 + cadd $b_ptr,sp,#96 + cadd $r_ptr,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + cadd $a_ptr,sp,#0 // ret->re = redc(t0) + cadd $r_ptr,@tmp[0],#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + cadd $a_ptr,sp,#192 // ret->im = redc(t2) + cadd $r_ptr,$r_ptr,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + add csp,csp,#288 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
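+	// Complex squaring in Fp2: ret->im = 2*a->re*a->im and
+	// ret->re = (a->re + a->im)*(a->re - a->im), computed with just
+	// two Montgomery multiplications.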
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there + sub csp,csp,#96 // space for 2 384-bit vectors + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + cadd $b_ptr,$a_ptr,#48 + cadd $r_ptr,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + cadd $r_ptr,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds @a[0],@a[0],@a[0] // add with itself + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @acc[0],@a[0],@acc[0],lo + csel @acc[1],@a[1],@acc[1],lo + csel @acc[2],@a[2],@acc[2],lo + ldp @a[0],@a[1],[sp] + csel @acc[3],@a[3],@acc[3],lo + ldr $bi, [sp,#48] + csel @acc[4],@a[4],@acc[4],lo + ldp @a[2],@a[3],[sp,#16] + csel @acc[5],@a[5],@acc[5],lo + ldp @a[4],@a[5],[sp,#32] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + cadd $b_ptr,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr c30,[c29,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add csp,csp,#96 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
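+	// 384-bit Montgomery multiplication: six outer iterations, each
+	// multiplying |a| by one limb of |b| and interleaving one reduction
+	// step, followed by a single conditional subtraction of the modulus.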
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __mul_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 + mov $bi,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + adc $n0,$bi,xzr + ldr $bi,[$b_ptr,8*$i] + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],$n0,xzr + ldr $n0,[x29,#12*__SIZEOF_POINTER__] + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adcs @acc[6],@acc[6],xzr + adc $bi,xzr,xzr + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adcs @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 + adc $bi,$bi,xzr +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs 
@acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + ldp c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr + adc $bi,$bi,xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adcs @acc[4],@acc[5],@tmp[4] + adcs @acc[5],@acc[6],@tmp[5] + adc @acc[6],$bi,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs @tmp[4],@acc[4],@mod[4] + sbcs @tmp[5],@acc[5],@mod[5] + sbcs xzr, @acc[6],xzr + + csel @a[0],@acc[0],@tmp[0],lo + csel @a[1],@acc[1],@tmp[1],lo + csel @a[2],@acc[2],@tmp[2],lo + csel @a[3],@acc[3],@tmp[3],lo + csel @a[4],@acc[4],@tmp[4],lo + csel @a[5],@acc[5],@tmp[5],lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + sub csp,csp,#96 // space for 768-bit vector + cmov $n0,$n_ptr // adjust for missing b_ptr + + cmov $n_ptr,$r_ptr // save r_ptr + cmov $r_ptr,sp + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + cmov $a_ptr,sp + cmov $r_ptr,$n_ptr // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + add csp,csp,#96 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
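+	// Repeated squaring: perform |count| (x2) modular squarings of |a|
+	// in the Montgomery domain, then one Montgomery multiplication by
+	// the value pointed to by x5.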
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there + sub csp,csp,#96 // space for 768-bit vector + cmov $bi,x5 // save b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + cmov $r_ptr,sp +.Loop_sqr_383: + bl __sqr_384 + sub $b_ptr,$b_ptr,#1 // counter + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + cmov $a_ptr,sp + bl __mul_by_1_mont_384 + + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // just accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + cbnz $b_ptr,.Loop_sqr_383 + + cmov $b_ptr,$bi + ldr $bi,[$bi] + bl __mul_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add csp,csp,#96 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ +my @acc=(@acc,@tmp[0..2]); + +$code.=<<___; +.type __sqr_384,%function +.align 5 +__sqr_384: + mul @acc[0],@a[1],@a[0] + mul @acc[1],@a[2],@a[0] + mul @acc[2],@a[3],@a[0] + mul @acc[3],@a[4],@a[0] + mul @acc[4],@a[5],@a[0] + + umulh @mod[1],@a[1],@a[0] + umulh @mod[2],@a[2],@a[0] + umulh @mod[3],@a[3],@a[0] + umulh @mod[4],@a[4],@a[0] + adds @acc[1],@acc[1],@mod[1] + umulh @mod[5],@a[5],@a[0] + adcs @acc[2],@acc[2],@mod[2] + mul @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + mul @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + mul @mod[4],@a[4],@a[1] + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],@a[1] + + adds @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],@a[1] + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],@a[1] + adc @acc[6],xzr,xzr + + mul @mod[0],@a[0],@a[0] + adds @acc[3],@acc[3],@mod[2] + umulh @a[0], @a[0],@a[0] + adcs @acc[4],@acc[4],@mod[3] + mul @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + mul @mod[4],@a[4],@a[2] + adc @acc[6],@acc[6],@mod[5] + mul @mod[5],@a[5],@a[2] + + adds @acc[4],@acc[4],@mod[3] + umulh @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + umulh @mod[4],@a[4],@a[2] + adcs @acc[6],@acc[6],@mod[5] + umulh @mod[5],@a[5],@a[2] + adc @acc[7],xzr,xzr + + mul @mod[1],@a[1],@a[1] + adds @acc[5],@acc[5],@mod[3] + umulh @a[1], @a[1],@a[1] + adcs @acc[6],@acc[6],@mod[4] + mul @mod[4],@a[4],@a[3] + adc @acc[7],@acc[7],@mod[5] + mul @mod[5],@a[5],@a[3] + + adds @acc[6],@acc[6],@mod[4] + umulh @mod[4],@a[4],@a[3] + adcs @acc[7],@acc[7],@mod[5] + umulh @mod[5],@a[5],@a[3] + adc @acc[8],xzr,xzr + mul @mod[2],@a[2],@a[2] + adds @acc[7],@acc[7],@mod[4] + umulh @a[2], @a[2],@a[2] + adc @acc[8],@acc[8],@mod[5] + mul @mod[3],@a[3],@a[3] + + mul @mod[5],@a[5],@a[4] + umulh @a[3], @a[3],@a[3] + adds @acc[8],@acc[8],@mod[5] + umulh @mod[5],@a[5],@a[4] + mul @mod[4],@a[4],@a[4] + adc @acc[9],@mod[5],xzr + + adds 
@acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + adcs @acc[2],@acc[2],@acc[2] + adcs @acc[3],@acc[3],@acc[3] + adcs @acc[4],@acc[4],@acc[4] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adcs @acc[7],@acc[7],@acc[7] + umulh @a[4], @a[4],@a[4] + adcs @acc[8],@acc[8],@acc[8] + mul @mod[5],@a[5],@a[5] + adcs @acc[9],@acc[9],@acc[9] + umulh @a[5], @a[5],@a[5] + adc $a_ptr,xzr,xzr + + adds @acc[0],@acc[0],@a[0] + adcs @acc[1],@acc[1],@mod[1] + adcs @acc[2],@acc[2],@a[1] + adcs @acc[3],@acc[3],@mod[2] + adcs @acc[4],@acc[4],@a[2] + adcs @acc[5],@acc[5],@mod[3] + adcs @acc[6],@acc[6],@a[3] + stp @mod[0],@acc[0],[$r_ptr] + adcs @acc[7],@acc[7],@mod[4] + stp @acc[1],@acc[2],[$r_ptr,#16] + adcs @acc[8],@acc[8],@a[4] + stp @acc[3],@acc[4],[$r_ptr,#32] + adcs @acc[9],@acc[9],@mod[5] + stp @acc[5],@acc[6],[$r_ptr,#48] + adc @a[5],@a[5],$a_ptr + stp @acc[7],@acc[8],[$r_ptr,#64] + stp @acc[9],@a[5],[$r_ptr,#80] + + ret +.size __sqr_384,.-__sqr_384 +___ +} +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mul @tmp[0],$n0,@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + mul @tmp[0],$n0,@a[0] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +} +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs 
@acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + + bl __mul_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + + umulh @mod[0],@a[0],$bi + umulh @mod[1],@a[1],$bi + umulh @mod[2],@a[2],$bi + umulh @mod[3],@a[3],$bi + umulh @mod[4],@a[4],$bi + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,8*1] + + str @acc[0],[$r_ptr] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],$bi +___ +for ($i=1;$i<5;$i++) { +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,#8*($i+1)] + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],@acc[6],@mod[5] + mul @mod[5],@a[5],$bi +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + adcs @acc[1],@acc[2],@mod[1] + adcs @acc[2],@acc[3],@mod[2] + adcs @acc[3],@acc[4],@mod[3] + adcs @acc[4],@acc[5],@mod[4] + adc @acc[5],@acc[6],@mod[5] + + stp @acc[0],@acc[1],[$r_ptr,#48] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + 
paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + sub csp,csp,#96 // space for two 384-bit vectors + + ldp @a[0],@a[1],[$a_ptr] + cmov @tmp[0],$r_ptr // save r_ptr + ldp @acc[0],@acc[1],[$a_ptr,#48] + cmov @tmp[1],$a_ptr // save a_ptr + ldp @a[2],@a[3],[$a_ptr,#16] + cmov @tmp[2],$b_ptr // save b_ptr + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @a[4],@a[5],[$a_ptr,#32] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + ldp @a[0],@a[1],[$b_ptr] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[0],@acc[1],[$b_ptr,#48] + adcs @mod[3],$a[3],@acc[3] + ldp @a[2],@a[3],[$b_ptr,#16] + adcs @mod[4],$a[4],@acc[4] + ldp @acc[2],@acc[3],[$b_ptr,#64] + adc @mod[5],$a[5],@acc[5] + ldp @a[4],@a[5],[$b_ptr,#32] + + stp @mod[0],@mod[1],[sp] + adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + stp @mod[2],@mod[3],[sp,#16] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + stp @mod[4],@mod[5],[sp,#32] + adcs @mod[4],$a[4],@acc[4] + stp @mod[0],@mod[1],[sp,#48] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[sp,#64] + stp @mod[4],@mod[5],[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + cadd $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) + cadd $b_ptr,sp,#48 + cadd $r_ptr,@tmp[0],#96 + bl __mul_384 + + cadd $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) + cadd $b_ptr,@tmp[2],#48 + cadd $r_ptr,sp,#0 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + cadd $a_ptr,@tmp[0],#96 // ret->im -= tx + cadd $b_ptr,sp,#0 + cadd $r_ptr,@tmp[0],#96 + bl __sub_mod_384x384 + + cadd $b_ptr,@tmp[0],#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + cadd $a_ptr,@tmp[0],#0 // ret->re -= tx + cadd $b_ptr,sp,#0 + cadd $r_ptr,@tmp[0],#0 + bl __sub_mod_384x384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + add csp,csp,#96 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
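+	// Complex squaring without modular reduction of the products:
+	// ret->re = (a->re + a->im)*(a->re - a->im) and ret->im = 2*a->re*a->im,
+	// both produced as full 768-bit values.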
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$a_ptr] + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @a[2],@a[3],[$a_ptr,#16] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[2],@acc[3],[$a_ptr,#64] + adcs @mod[1],$a[1],@acc[1] + ldp @a[4],@a[5],[$a_ptr,#32] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[3],$a[3],@acc[3] + stp @mod[0],@mod[1],[$r_ptr] + adcs @mod[4],$a[4],@acc[4] + ldp @mod[0],@mod[1],[$b_ptr] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[$r_ptr,#16] + + subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im + ldp @mod[2],@mod[3],[$b_ptr,#16] + sbcs @a[1],$a[1],@acc[1] + stp @mod[4],@mod[5],[$r_ptr,#32] + sbcs @a[2],$a[2],@acc[2] + ldp @mod[4],@mod[5],[$b_ptr,#32] + sbcs @a[3],$a[3],@acc[3] + sbcs @a[4],$a[4],@acc[4] + sbcs @a[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],@acc[6] + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],@acc[6] + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],@acc[6] + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],@acc[6] + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + cmov $n0,$a_ptr // save a_ptr + cadd $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) + cadd $b_ptr,$r_ptr,#48 + bl __mul_384 + + cadd $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) + cadd $b_ptr,$n0,#48 + cadd $r_ptr,$r_ptr,#96 + bl __mul_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + adds @a[0],@a[0],@a[0] // add with itself + ldp @a[4],@a[5],[$r_ptr,#32] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adcs @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + stp @a[0],@a[1],[$r_ptr] + adcs @acc[2],@acc[2],@acc[2] + stp @a[2],@a[3],[$r_ptr,#16] + adcs @acc[3],@acc[3],@acc[3] + stp @a[4],@a[5],[$r_ptr,#32] + adcs @acc[4],@acc[4],@acc[4] + stp @acc[0],@acc[1],[$r_ptr,#48] + adc @acc[5],@acc[5],@acc[5] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
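+	// Like sqr_mont_384x, but using __mul_mont_383_nonred (no final
+	// conditional subtraction) and compensating afterwards for the sign
+	// of a->re - a->im, which is kept as a borrow mask on the stack.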
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there + sub csp,csp,#112 // space for two 384-bit vectors + word + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp $bi,@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @mod[0],$a[0],$bi // t0 = a->re + a->im + adcs @mod[1],$a[1],@acc[1] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + adcs @mod[4],$a[4],@acc[4] + adc @mod[5],$a[5],@acc[5] + + subs @acc[0],$a[0],$bi // t1 = a->re - a->im + sbcs @acc[1],$a[1],@acc[1] + sbcs @acc[2],$a[2],@acc[2] + sbcs @acc[3],$a[3],@acc[3] + sbcs @acc[4],$a[4],@acc[4] + sbcs @acc[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr // borrow flag as mask + + stp @mod[0],@mod[1],[sp] + stp @mod[2],@mod[3],[sp,#16] + stp @mod[4],@mod[5],[sp,#32] + stp @acc[0],@acc[1],[sp,#48] + stp @acc[2],@acc[3],[sp,#64] + stp @acc[4],@acc[5],[sp,#80] + str @acc[6],[sp,#96] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + cadd $b_ptr,$a_ptr,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds @acc[0],@a[0],@a[0] // add with itself + adcs @acc[1],@a[1],@a[1] + adcs @acc[2],@a[2],@a[2] + adcs @acc[3],@a[3],@a[3] + adcs @acc[4],@a[4],@a[4] + adc @acc[5],@a[5],@a[5] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + ldp @a[0],@a[1],[sp] + ldr $bi,[sp,#48] + ldp @a[2],@a[3],[sp,#16] + ldp @a[4],@a[5],[sp,#32] + + cadd $b_ptr,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr c30,[c29,#__SIZEOF_POINTER__] + + ldr @acc[6],[sp,#96] // account for sign from a->re - a->im + ldp @acc[0],@acc[1],[sp] + ldp @acc[2],@acc[3],[sp,#16] + ldp @acc[4],@acc[5],[sp,#32] + + and @acc[0],@acc[0],@acc[6] + and @acc[1],@acc[1],@acc[6] + and @acc[2],@acc[2],@acc[6] + and @acc[3],@acc[3],@acc[6] + and @acc[4],@acc[4],@acc[6] + and @acc[5],@acc[5],@acc[6] + + subs @a[0],@a[0],@acc[0] + sbcs @a[1],@a[1],@acc[1] + sbcs @a[2],@a[2],@acc[2] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + and @acc[2],@mod[2],@acc[6] + and @acc[3],@mod[3],@acc[6] + and @acc[4],@mod[4],@acc[6] + and @acc[5],@mod[5],@acc[6] + + adds @a[0],@a[0],@acc[0] + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add csp,csp,#112 + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh 
@tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + + ldr $n0,[x29,#12*__SIZEOF_POINTER__] + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adc @acc[6],@acc[6],xzr + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + ldp c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr + + adds @a[0],@acc[1],@tmp[0] + adcs @a[1],@acc[2],@tmp[1] + adcs @a[2],@acc[3],@tmp[2] + adcs @a[3],@acc[4],@tmp[3] + adcs @a[4],@acc[5],@tmp[4] + adcs @a[5],@acc[6],@tmp[5] + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
+ add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + cmov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + paciasp + stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! + add c29,csp,#0 + stp c19,c20,[csp,#2*__SIZEOF_POINTER__] + stp c21,c22,[csp,#4*__SIZEOF_POINTER__] + stp c23,c24,[csp,#6*__SIZEOF_POINTER__] + stp c25,c26,[csp,#8*__SIZEOF_POINTER__] + stp c27,c28,[csp,#10*__SIZEOF_POINTER__] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + cmov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + cadd $a_ptr,$a_ptr,#48 + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $b_ptr,$b_ptr,$bi + + bl __mul_by_1_mont_384 + ldr c30,[c29,#__SIZEOF_POINTER__] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] + ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] + ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] + ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] + ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] + ldr c29,[csp],#16*__SIZEOF_POINTER__ + autiasp + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ + +if (0) { +my @b = ($bi, @mod[0..4]); +my @comba = @acc[4..6]; + +$code.=<<___; +.type __mul_384_comba,%function +.align 5 +__mul_384_comba: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @b[4],@b[5],[$b_ptr,#32] + + mul @comba[0],@a[0],@b[0] + umulh @comba[1],@a[0],@b[0] + mul @acc[0],@a[1],@b[0] + umulh @acc[1],@a[1],@b[0] + str @comba[0],[$r_ptr] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[0],@b[1] + umulh @acc[3],@a[0],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],xzr, @acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[2],@b[0] + umulh @acc[1],@a[2],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#8] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[1],@b[1] + umulh @acc[3],@a[1],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[0],@b[2] + umulh @acc[1],@a[0],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[3],@b[0] + umulh @acc[3],@a[3],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#16] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[2],@b[1] + umulh @acc[1],@a[2],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[1],@b[2] + umulh @acc[3],@a[1],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[0],@b[3] + umulh @acc[1],@a[0],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[4],@b[0] + umulh @acc[3],@a[4],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#24] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[3],@b[1] + umulh @acc[1],@a[3],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[2],@b[2] + umulh @acc[3],@a[2],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[3] + umulh @acc[1],@a[1],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[4] + umulh @acc[3],@a[0],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[0] + umulh @acc[1],@a[5],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#32] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[1] + umulh @acc[3],@a[4],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul 
@acc[0],@a[3],@b[2] + umulh @acc[1],@a[3],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[3] + umulh @acc[3],@a[2],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[4] + umulh @acc[1],@a[1],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[5] + umulh @acc[3],@a[0],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[1] + umulh @acc[1],@a[5],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#40] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[2] + umulh @acc[3],@a[4],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[3] + umulh @acc[1],@a[3],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[4] + umulh @acc[3],@a[2],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[5] + umulh @acc[1],@a[1],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[2] + umulh @acc[3],@a[5],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#48] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[3] + umulh @acc[1],@a[4],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[4] + umulh @acc[3],@a[3],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[2],@b[5] + umulh @acc[1],@a[2],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[3] + umulh @acc[3],@a[5],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#56] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[4] + umulh @acc[1],@a[4],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[5] + umulh @acc[3],@a[3],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[4] + umulh @acc[1],@a[5],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#64] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[5] + umulh @acc[3],@a[4],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[5],@b[5] + umulh @acc[1],@a[5],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#72] +___ + push(@comba,shift(@comba)); +$code.=<<___; + adds @comba[0],@comba[0],@acc[0] + adc @comba[1],@comba[1],@acc[1] + stp @comba[0],@comba[1],[$r_ptr,#80] + + ret +.size __mul_384_comba,.-__mul_384_comba +___ +} +print $code; + +close STDOUT; diff --git a/src/blst/src/asm/mulq_mont_256-x86_64.pl 
b/src/blst/src/asm/mulq_mont_256-x86_64.pl new file mode 100755 index 0000000000..6c1392f20d --- /dev/null +++ b/src/blst/src/asm/mulq_mont_256-x86_64.pl @@ -0,0 +1,537 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$code.=<<___ if ($flavour =~ /masm/); +.extern mul_mont_sparse_256\$1 +.extern sqr_mont_sparse_256\$1 +.extern from_mont_256\$1 +.extern redc_mont_256\$1 +___ + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits +my @acc=map("%r$_",(9..15)); + +{ ############################################################## mulq +my ($hi, $a0) = ("%rbp", $r_ptr); + +$code.=<<___; +.comm __blst_platform_cap,4 +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,\@function,5,"unwind" +.align 32 +mul_mont_sparse_256: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz mul_mont_sparse_256\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), @acc[3] + mov 8*3($a_ptr), $hi + mov $b_org, $b_ptr # evacuate from %rdx + + mov %rax, @acc[6] + mulq @acc[4] # a[0]*b[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqr_mont_sparse_256: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_mont_sparse_256\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), %rax + mov $n_ptr, $n0 + mov 8*1($a_ptr), @acc[5] + mov $b_org, $n_ptr + mov 8*2($a_ptr), @acc[3] + lea ($a_ptr), $b_ptr + mov 8*3($a_ptr), $hi + + mov %rax, @acc[6] + mulq %rax # a[0]*a[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 
32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulq_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulq_mont_sparse_256: + mulq @acc[5] # a[1]*b[0] + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[3] # a[2]*b[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq $hi # a[3]*b[0] + add %rax, @acc[3] + mov 8($b_ptr), %rax + adc \$0, %rdx + xor @acc[5], @acc[5] + mov %rdx, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], $a0 + imulq $n0, @acc[0] + + ################################# Multiply by b[$i] + mov %rax, @acc[6] + mulq 8*0($a_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*1($a_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($a_ptr) + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($a_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc %rdx, @acc[5] # can't overflow + xor @acc[6], @acc[6] + + ################################# reduction + mulq 8*0($n_ptr) + add %rax, $a0 # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $a0 + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $a0, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rax + mov 8(%rsp), $a_ptr # restore $r_ptr + + ################################# last reduction + mov %rax, @acc[6] + mulq 8*0($n_ptr) + add %rax, @acc[0] # guaranteed to be zero + mov @acc[6], %rax + adc %rdx, @acc[0] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + add @acc[0], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + mov @acc[2], $b_ptr + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional subtraction of modulus + + mov @acc[3], @acc[0] + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rax, @acc[1] + cmovc $b_ptr, @acc[2] + cmovc @acc[0], @acc[3] + mov @acc[1], 8*0($a_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($a_ptr) + mov @acc[3], 8*2($a_ptr) + mov @acc[4], 8*3($a_ptr) + + ret +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,\@function,4,"unwind" +.align 32 +from_mont_256: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz from_mont_256\$1 
+#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], @acc[1] + mov @acc[6], @acc[2] + mov @acc[0], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + sbb 8*3($n_ptr), @acc[0] + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,\@function,4,"unwind" +.align 32 +redc_mont_256: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz redc_mont_256\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[6] + mov @acc[5], @acc[1] + adc 8*7($a_ptr), @acc[0] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[6], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + mov @acc[0], @acc[3] + sbb 8*3($n_ptr), @acc[0] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulq_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, 
$hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/mulq_mont_384-x86_64.pl b/src/blst/src/asm/mulq_mont_384-x86_64.pl new file mode 100755 index 0000000000..34e7df1963 --- /dev/null +++ b/src/blst/src/asm/mulq_mont_384-x86_64.pl @@ -0,0 +1,2754 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$code.=<<___ if ($flavour =~ /masm/); +.extern mul_mont_384x\$1 +.extern sqr_mont_384x\$1 +.extern mul_382x\$1 +.extern sqr_382x\$1 +.extern mul_384\$1 +.extern sqr_384\$1 +.extern redc_mont_384\$1 +.extern from_mont_384\$1 +.extern sgn0_pty_mont_384\$1 +.extern sgn0_pty_mont_384x\$1 +.extern mul_mont_384\$1 +.extern sqr_mont_384\$1 +.extern sqr_n_mul_mont_384\$1 +.extern sqr_n_mul_mont_383\$1 +.extern sqr_mont_382x\$1 +___ + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.comm __blst_platform_cap,4 +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... 
+.type __subq_mod_384x384,\@abi-omnipotent +.align 32 +__subq_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __subq_mod_384x384,.-__subq_mod_384x384 + +.type __addq_mod_384,\@abi-omnipotent +.align 32 +__addq_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __addq_mod_384,.-__addq_mod_384 + +.type __subq_mod_384,\@abi-omnipotent +.align 32 +__subq_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__subq_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc 
@acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __subq_mod_384,.-__subq_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... +{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,\@function,5,"unwind" +.align 32 +mul_mont_384x: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz mul_mont_384x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __addq_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __addq_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __subq_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __subq_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __subq_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type 
sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_mont_384x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $a_ptr, 8*2(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __addq_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __subq_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz mul_382x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 
8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __subq_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __subq_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __subq_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_382x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 
8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __subq_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz mul_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, 
@acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, @acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc +#ifdef 
__BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz redc_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz from_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi 
+ + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redq_tail_mont_384,\@abi-omnipotent +.align 32 +__redq_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redq_tail_mont_384,.-__redq_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sgn0_pty_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + 
or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz mul_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*3, %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + + call __mulq_mont_384 + + mov 24(%rsp),%r15 +.cfi_restore %r15 + mov 32(%rsp),%r14 +.cfi_restore %r14 + mov 40(%rsp),%r13 +.cfi_restore %r13 + mov 48(%rsp),%r12 +.cfi_restore %r12 + mov 56(%rsp),%rbx +.cfi_restore %rbx + mov 64(%rsp),%rbp +.cfi_restore %rbp + lea 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 
@acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + mov 8*2(%rsp), $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 
8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc +#ifdef __BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
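+	# sqr_n_mul computes ret = a^(2^count) * b with every value kept in
+	# the Montgomery domain; it is the building block used by
+	# addition-chain exponentiation (inversion, square roots). Below is
+	# a minimal C sketch of the intended semantics -- illustrative only,
+	# with prototypes assumed to mirror the argument counts of the
+	# entry points in this file rather than quoted from blst's headers:
+	#
+	#	void sqr_n_mul_mont_384(uint64_t ret[6], const uint64_t a[6],
+	#	                        size_t count, const uint64_t mod[6],
+	#	                        uint64_t n0, const uint64_t b[6])
+	#	{
+	#	    uint64_t t[6];
+	#
+	#	    sqr_mont_384(t, a, mod, n0);	/* t = a^2     */
+	#	    while (--count)
+	#	        sqr_mont_384(t, t, mod, n0);	/* t = t^2     */
+	#	    mul_mont_384(ret, t, b, mod, n0);	/* ret = t * b */
+	#	}
+	#
+	# The _383 variant above omits the conditional subtraction inside
+	# .Loop_sqr_383, the idea being that for moduli of at most 383 bits
+	# a not-fully-reduced intermediate still fits in 384 bits and is
+	# absorbed by the next Montgomery step, which is what the ~5%
+	# remark refers to.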
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc +#ifdef 
__BLST_PORTABLE__ + testl \$1, __blst_platform_cap(%rip) + jnz sqr_mont_382x\$1 +#endif + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], 
@acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/mulx_mont_256-x86_64.pl b/src/blst/src/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 0000000000..44666783c2 --- /dev/null +++ b/src/blst/src/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,506 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Or in other words not larger than 2^256-2^192-1. +# In general Montgomery multiplication algorithm can handle one of the +# inputs being non-reduced and capped by 1<re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __addx_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __addx_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __subx_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __subx_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __subx_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 
8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc +sqr_mont_384x\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __addx_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __subx_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 
8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: +.cfi_startproc +mul_382x\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __subx_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __subx_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __subx_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type 
sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc +sqr_382x\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __subx_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc +mul_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + 
mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, @acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], $hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc +sqr_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add 
$hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, @acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ +($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov 
@acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + mov 8(%rsp), $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc +redc_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_by_1_mont_384 
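+	# __mulx_by_1_mont_384 performs the six word-by-word Montgomery
+	# reduction steps on the low half of a[0..11], and
+	# __redx_tail_mont_384 then folds in the high half and applies the
+	# final conditional subtraction. A minimal portable C sketch of the
+	# same reduction -- illustrative only; the helper name, the use of
+	# unsigned __int128 and the input bound a < m * 2^384 are
+	# assumptions made here:
+	#
+	#	#include <stdint.h>
+	#	#include <stddef.h>
+	#
+	#	/* ret = a * 2^-384 mod m, with n0 = -1/m[0] mod 2^64 */
+	#	static void redc_384_ref(uint64_t ret[6], const uint64_t a[12],
+	#	                         const uint64_t m[6], uint64_t n0)
+	#	{
+	#	    uint64_t t[13], s[6], k, c, b = 0;
+	#	    size_t i, j;
+	#
+	#	    for (i = 0; i < 12; i++) t[i] = a[i];
+	#	    t[12] = 0;
+	#	    for (i = 0; i < 6; i++) {		/* clear limbs t[0..5] */
+	#	        k = t[i] * n0;
+	#	        for (c = 0, j = 0; j < 6; j++) {
+	#	            unsigned __int128 p = (unsigned __int128)k * m[j] + t[i+j] + c;
+	#	            t[i+j] = (uint64_t)p;
+	#	            c = (uint64_t)(p >> 64);
+	#	        }
+	#	        for (j = i + 6; c != 0 && j < 13; j++)
+	#	            c = (t[j] += c) < c;	/* ripple the carry up */
+	#	    }
+	#	    for (j = 0; j < 6; j++) {		/* t[6..12] < 2*m here */
+	#	        uint64_t d = t[6+j] - m[j] - b;
+	#	        b = b ? d >= t[6+j] : d > t[6+j];
+	#	        s[j] = d;
+	#	    }
+	#	    b = (t[12] - b) >> 63;		/* set when the subtraction borrowed */
+	#	    for (j = 0; j < 6; j++)
+	#	        ret[j] = b ? t[6+j] : s[j];
+	#	}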
+ call __redx_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc +from_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redx_tail_mont_384,\@abi-omnipotent +.align 32 +__redx_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional 
acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redx_tail_mont_384,.-__redx_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc +sgn0_pty_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc +sgn0_pty_mont_384x\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov 
@acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc +mul_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret # __SGX_LVI_HARDENING_CLOBBER__=%rsi +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc +sqr_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), %rdx + 
mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc +sqr_n_mul_mont_384\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc +sqr_n_mul_mont_383\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # 
in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], @acc[0], @acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret # 
__SGX_LVI_HARDENING_CLOBBER__=%rsi +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives ~10% better performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc +sqr_mont_382x\$1: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + 
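+	# This is the second half of complex squaring,
+	# (re + im*i)^2 = (re+im)*(re-im) + 2*re*im*i: the previous
+	# __mulx_mont_383_nonred call produced re*im (doubled above into
+	# ret->im), and this one multiplies t0 = re+im by t1 = re-im to give
+	# ret->re, with the borrow mask saved earlier used afterwards to
+	# correct the result when re-im wrapped.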
mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/src/blst/src/asm/sha256-armv8.pl b/src/blst/src/asm/sha256-armv8.pl new file mode 100755 index 0000000000..93545038a6 --- /dev/null +++ b/src/blst/src/asm/sha256-armv8.pl @@ -0,0 +1,547 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for ARMv8. +# +# This module is stripped of scalar code paths, with rationale that all +# known processors are NEON-capable. +# +# See original module at CRYPTOGAMS for further details. 
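+#
+# As a reading aid (not part of the original module), the rotation constants
+# below encode the FIPS 180-4 round functions; a minimal Perl sketch, with
+# helper names of our choosing:
+#
+#	sub ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
+#	sub Sigma0 { my $x=shift; ror32($x, 2)^ror32($x,13)^ror32($x,22) }
+#	sub Sigma1 { my $x=shift; ror32($x, 6)^ror32($x,11)^ror32($x,25) }
+#	sub sigma0 { my $x=shift; ror32($x, 7)^ror32($x,18)^($x>>3) }
+#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10) }
+#
+# with Ch(e,f,g) = (e&f)^(~e&g) and Maj(a,b,c) = (a&b)^(a&c)^(b&c).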
+ +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$BITS=256; +$SZ=4; +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; +$reg_t="w"; +$pre="blst_"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +$code.=<<___; +.comm __blst_platform_cap,4 +.text + +.align 6 +.type .LK$BITS,%object +.LK$BITS: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.globl ${pre}sha256_block_armv8 +.type ${pre}sha256_block_armv8,%function +.align 6 +${pre}sha256_block_armv8: +.Lv8_entry: + stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! 
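+	// The c29/c30/csp spellings and __SIZEOF_POINTER__ exist, as far as
+	// we can tell, so the same source also serves CHERI/Morello
+	// capability builds; on plain AArch64 arm-xlate.pl renders them as
+	// x29/x30/sp with 8-byte slots, i.e. an ordinary frame push.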
+ add c29,csp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr c29,[csp],#2*__SIZEOF_POINTER__ + ret +.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 
($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. 
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +.globl ${pre}sha256_block_data_order +.type ${pre}sha256_block_data_order,%function +.align 4 +${pre}sha256_block_data_order: + adrp c16,__blst_platform_cap + ldr w16,[c16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + + stp c29, c30, [csp, #-2*__SIZEOF_POINTER__]! + mov c29, csp + sub csp,csp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + cmov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + csub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + csub $Xfer,$Xfer,#64 + bne .L_00_48 + + csub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #-64 + csel $Xfer, $Xfer, xzr, eq + cadd $inp,$inp,$Xfer // avoid SEGV + cmov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + cmov $Xfer,sp + b.ne .L_00_48 + + ldr c29,[c29] + add csp,csp,#16*4+2*__SIZEOF_POINTER__ + ret +.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order +___ +} + +{ +my ($out,$inp,$len) = map("x$_",(0..2)); + +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,%function +.align 4 +${pre}sha256_emit: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[$out,#4] + lsr x4,x4,#32 + str w5,[$out,#12] + lsr x5,x5,#32 + str w6,[$out,#20] + lsr x6,x6,#32 + str w7,[$out,#28] + lsr x7,x7,#32 + str w4,[$out,#0] + str w5,[$out,#8] + str w6,[$out,#16] + str w7,[$out,#24] + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,%function +.align 4 +${pre}sha256_bcopy: +.Loop_bcopy: + ldrb w3,[$inp],#1 + sub $len,$len,#1 + strb w3,[$out],#1 + cbnz $len,.Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,%function +.align 4 +${pre}sha256_hcopy: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] + stp x4,x5,[$out] + stp x6,x7,[$out,#16] + ret 
+.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/src/blst/src/asm/sha256-portable-x86_64.pl b/src/blst/src/asm/sha256-portable-x86_64.pl new file mode 100755 index 0000000000..91f75c7431 --- /dev/null +++ b/src/blst/src/asm/sha256-portable-x86_64.pl @@ -0,0 +1,342 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# Scalar-only version with minor twist minimizing 'lea' instructions. + +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add `$SZ*$i`($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 +___ +$code.=<<___ if ($i==31); 
+ lea `16*$SZ`($Tbl),$Tbl # round+=16 +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.comm __blst_platform_cap,4 +.text + +.globl $func +.type $func,\@function,3,"unwind" +.align 16 +$func: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl \$2,__blst_platform_cap(%rip) + jnz .L${func}\$2 +#endif + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_alloca $framesz +.cfi_def_cfa %rsp +.cfi_end_prologue + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0x19,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + lea $framesz+6*8(%rsp),%r11 +.cfi_def_cfa %r11,8 + mov $framesz(%rsp),%r15 + mov -40(%r11),%r14 + mov -32(%r11),%r13 + mov -24(%r11),%r12 + mov -16(%r11),%rbx + mov -8(%r11),%rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size $func,.-$func + +#ifndef __BLST_PORTABLE__ +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 
0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ +{ +my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +#endif +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + print $_,"\n"; +} +close STDOUT; diff --git a/src/blst/src/asm/sha256-x86_64.pl b/src/blst/src/asm/sha256-x86_64.pl new file mode 100755 index 0000000000..4a8d138e6f --- /dev/null +++ b/src/blst/src/asm/sha256-x86_64.pl @@ -0,0 +1,807 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# rationale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all contemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
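+#
+# Dispatch note: unless built with __SGX_LVI_HARDENING__, the main
+# sha256_block_data_order entry below tests bit 1 of __blst_platform_cap at
+# run time and, when it is set, branches into the SHA-extension routine via
+# its "\$2"-suffixed local label; otherwise it falls through to the SSSE3
+# path.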
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.comm __blst_platform_cap,4 + +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +.text +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
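+# sha256rnds2 performs two rounds per issue on the ABEF/CDGH halves of the
+# state, taking the message-plus-constant words from the low lanes of $Wi
+# (pshufd 0x0e exposes the upper pair for the next two rounds), while
+# sha256msg1/sha256msg2 do the sigma0/sigma1 message-schedule updates and
+# the palignr/pshufd shuffles keep the state in the layout these
+# instructions expect.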
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl ${pre}sha256_block_data_order_shaext +.hidden ${pre}sha256_block_data_order_shaext +.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +${pre}sha256_block_data_order_shaext: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.L${func}\$2: +___ +$code.=<<___ if ($win64); + sub \$0x50,%rsp +.cfi_alloca 0x50 + movaps %xmm6,-0x50(%rbp) + movaps %xmm7,-0x40(%rbp) + movaps %xmm8,-0x30(%rbp) + movaps %xmm9,-0x20(%rbp) + movaps %xmm10,-0x10(%rbp) +.cfi_offset %xmm6-%xmm10,-0x60 +___ +$code.=<<___; +.cfi_end_prologue +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
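+	# the pshufd below moves $Wi's upper W+K pair into the low lanes for
+	# the next sha256rnds2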
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x50(%rbp),%xmm6 + movaps -0x40(%rbp),%xmm7 + movaps -0x30(%rbp),%xmm8 + movaps -0x20(%rbp),%xmm9 + movaps -0x10(%rbp),%xmm10 + mov %rbp,%rsp +___ +$code.=<<___; +.cfi_def_cfa_register %rsp + pop %rbp +.cfi_pop %rbp +.cfi_epilogue + ret +.cfi_endproc +.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="-64(%rbp)"; +my $_inp="-56(%rbp)"; +my $_end="-48(%rbp)"; +my $framesz=3*8+$win64*16*4; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifndef __SGX_LVI_HARDENING__ + testl \$2,__blst_platform_cap(%rip) + jnz .L${func}\$2 +#endif + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_alloca $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + #mov $inp,$_inp # save inp, 2nd arg + mov %rdx,$_end # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x80(%rbp) + movaps %xmm7,-0x70(%rbp) + movaps %xmm8,-0x60(%rbp) + movaps %xmm9,-0x50(%rbp) +.cfi_offset %xmm6-%xmm9,-0x90 +___ +$code.=<<___; +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps -0x80(%rbp),%xmm6 + movaps -0x70(%rbp),%xmm7 + movaps -0x60(%rbp),%xmm8 + movaps -0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov -40(%rbp),%r15 + mov -32(%rbp),%r14 + mov -24(%rbp),%r13 + mov -16(%rbp),%r12 + mov -8(%rbp),%rbx + mov %rbp,%rsp +.cfi_def_cfa_register %rsp + pop %rbp +.cfi_pop %rbp +.cfi_epilogue + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: +#ifdef __SGX_LVI_HARDENING__ + lfence +#endif + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/src/blst/src/asm/x86_64-xlate.pl b/src/blst/src/asm/x86_64-xlate.pl new file mode 100755 index 0000000000..b55932707f --- /dev/null +++ b/src/blst/src/asm/x86_64-xlate.pl @@ -0,0 +1,1973 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. 
If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} +my $colon= $masm ? "::" : ":"; + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; +my $ret_clobber; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
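+            # movz carries two size suffixes (source and destination,
+            # e.g. movzbq), so it can't go through the generic
+            # 3+1-letter split below; only the source size is kept here
+            # and the destination size is re-attached in out() from the
+            # register operand.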
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { + # pass through + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + my $reg = $ret_clobber || "rdx"; + $ret_clobber = undef; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . "\n#ifdef __SGX_LVI_HARDENING__\n". + " popq %$reg\n" . + " lfence\n" . + " jmpq *%$reg\n" . + " ud2\n" . + "#else\n" . + " .byte 0xf3,0xc3\n" . + "#endif"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + my $reg = $ret_clobber || "rdx"; + $ret_clobber = undef; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "\nifdef __SGX_LVI_HARDENING__\n". + " pop $reg\n" . + " lfence\n" . + " jmp $reg\n" . + " ud2\n" . + "else\n" . + " DB\t0F3h,0C3h\n" . + "endif"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
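+            # The substitutions below fold the immediate into a plain
+            # decimal number: hexadecimal literals are expanded with
+            # oct() and constant products are evaluated, so an operand
+            # like $0x40*4 is emitted as $256.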
+ $value =~ s/(?{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,\s]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/(?:,\s*)/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
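+        # unpack("l",pack("L",$x)) reinterprets an unsigned 32-bit value
+        # as signed, e.g. 4294967168 (0xffffff80) is printed as -128.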
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w\$]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^(\w+\$\w*)/$decor\1/ if ($flavour eq "macosx"); + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub win64_args { + my $narg = $current_function->{narg} // 6; + return undef if ($narg < 0); + my $arg5 = 4*8 - cfi_directive::cfa_rsp(); + my $arg6 = $arg5 + 8; + my $args; + if ($gas) { + $args .= " movq %rcx,%rdi\n" if ($narg>0); + $args .= " movq %rdx,%rsi\n" if ($narg>1); + $args .= " movq %r8,%rdx\n" if ($narg>2); + $args .= " movq 
%r9,%rcx\n" if ($narg>3); + $args .= " movq $arg5(%rsp),%r8\n" if ($narg>4); + $args .= " movq $arg6(%rsp),%r9\n" if ($narg>5); + } else { + $args .= " mov rdi,rcx\n" if ($narg>0); + $args .= " mov rsi,rdx\n" if ($narg>1); + $args .= " mov rdx,r8\n" if ($narg>2); + $args .= " mov rcx,r9\n" if ($narg>3); + $args .= " mov r8,QWORD$PTR\[$arg5+rsp\]\n" if ($narg>4); + $args .= " mov r9,QWORD$PTR\[$arg6+rsp\]\n" if ($narg>5); + } + $current_function->{narg} = -1; + $args; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($current_function->{name} eq $self->{value}) { + $current_function->{pc} = 0; + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64) { + if ($current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ? "%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + } elsif ($current_function->{unwind}) { + $func .= " movq %rsp,%r11\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + } + } + } elsif ($win64 && $current_function->{abi} eq "svr4" + && $current_function->{pc} >= 0) { + $func = win64_args().$func; + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + my $func; + if ($win64 && $current_function->{abi} eq "svr4" + && $current_function->{pc} >= 0) { + $func = win64_args(); + } + $func .= $self->{value} . $colon; + $func; + } else { + $current_function->{pc} = 0; + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + $func .= " DB 243,15,30,250\n"; # endbranch + if ($current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}${colon}\n"; + } elsif ($current_function->{unwind}) { + $func .= " mov r11,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}${colon}\n"; + } + $func; + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; + if ($flavour eq "macosx" and $self->{value} !~ /\.L/) { + $self->{value} =~ s/(\w+\$\w*)/$decor\1/g; + } + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. 
Besides naturally expected for this + # script platform-specific filtering function, this module adds + # four auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - .cfi_alloca to annotate stack pointer adjustments, which + # translates to .cfi_adjust_cfa_offset as needed; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". 
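+    # As a worked example, the directive shown above,
+    #
+    #       .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # is lowered by cfa_expression() below to DW_OP_breg7 (0x70 plus
+    # the %rsp index from the table that follows) with sleb128(40)=0x28,
+    # DW_OP_deref (0x06), and DW_OP_plus_uconst (0x23) with
+    # uleb128(8)=0x08; wrapped in DW_CFA_def_cfa_expression (0x0f) and
+    # a 5-byte length this is emitted as
+    #
+    #       .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08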
+ my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + sub cfa_rsp { return $cfa_rsp // -8; } + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... + last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. 
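+    # xdata() below encodes each Win64 UNWIND_CODE entry as a pair of
+    # bytes - the instruction's offset within the prologue (left at 0
+    # here, because the codes are synthesized from the final .cfi state)
+    # and info<<4|op - optionally followed by a 16- or 32-bit operand
+    # such as a scaled save offset.  Operation codes used are
+    # 0 UWOP_PUSH_NONVOL, 1 UWOP_ALLOC_LARGE, 2 UWOP_ALLOC_SMALL,
+    # 3 UWOP_SET_FPREG, 4 UWOP_SAVE_NONVOL, 5 UWOP_SAVE_NONVOL_FAR,
+    # 8 UWOP_SAVE_XMM128 and 9 UWOP_SAVE_XMM128_FAR, with register
+    # numbers taken from the table that follows.  For instance a 40-byte
+    # stack allocation becomes UWOP_ALLOC_SMALL with info=(40-8)/8=4,
+    # i.e. the pair 0x00,0x42.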
+ my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub savereg { + my ($key, $offset) = @_; + + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my $fp_info = 0; + + # allocate stack frame + if ($cfa_rsp < -8) { + my $offset = -8 - $cfa_rsp; + if ($cfa_reg ne "%rsp" && $saved_regs{$cfa_reg} == -16) { + $fp_info = $WIN64_reg_idx{$cfa_reg}; + push @dat, [0,$fp_info<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + $offset -= 8; + } + if ($offset <= 128) { + my $alloc = ($offset - 8) >> 3; + push @dat, [0,$alloc<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + + # save frame pointer [if not pushed already] + if ($cfa_reg ne "%rsp" && $fp_info == 0) { + $fp_info = $WIN64_reg_idx{$cfa_reg}; + if (defined(my $offset = $saved_regs{$cfa_reg})) { + $offset -= $cfa_rsp; + savereg($cfa_reg, $offset); + } + } + + # set up frame pointer + if ($fp_info) { + push @dat, [0,($fp_info<<4)|3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + my $fp_off = $cfa_off - $cfa_rsp; + ($fp_off > 240 or $fp_off&0xf) and die "invalid FP offset $fp_off"; + $fp_info |= $fp_off&-16; + } + + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + savereg($key, $offset); + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # keep objdump happy, pad to 4*n and add a 32-bit zero + unshift @dat, [(0)x(((-$len)&3)+4)]; + $len += $#{@dat[0]}+1; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. 
+ die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /alloca/ && do { $dir = undef; + my $val = 1*eval($$line); + $cfa_rsp -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_off -= $val; + $dir = "adjust_cfa_offset"; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*(?:,\s*(.+))?/) { + $cfa_reg = $1; + if ($cfa_reg eq "%rsp" && !defined($2)) { + $cfa_off = $cfa_rsp; + $$line .= ",".(-$cfa_rsp); + } else { + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)(?:-%xmm(\d+))?\s*,\s*(.+)/) { + my ($reg, $off, $xmmlast) = ($1, 1*eval($3), $2); + if ($reg !~ /%xmm(\d+)/) { + $saved_regs{$reg} = $off; + } else { + $dir = undef; + $xmmlast //= $1; + for (my $i=$1; $i<=$xmmlast; $i++) { + $saved_regs{"%xmm$i"} = $off; + $off += 16; + } + } + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? ($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $ret = ""; + if ($current_function->{abi} eq "svr4") { + $ret .= label::win64_args(); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? 
+ $saved_regs{"%rsi"} = 8; + } + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:"; + if ($current_function->{unwind} eq "%rbp") { + if ($current_function->{abi} eq "svr4") { + push @xdata_seg, + ".byte 1,4,6,0x05", # 6 unwind codes, %rbp is FP + ".byte 4,0x74,2,0", # %rdi at 16(%rsp) + ".byte 4,0x64,3,0", # %rsi at 24(%rsp) + ".byte 4,0x53", # mov %rsp, %rbp + ".byte 1,0x50", # push %rbp + ".long 0,0" # pad to keep objdump happy + ; + } else { + push @xdata_seg, + ".byte 1,4,2,0x05", # 2 unwind codes, %rbp is FP + ".byte 4,0x53", # mov %rsp, %rbp + ".byte 1,0x50", # push %rbp + ".long 0,0" # pad to keep objdump happy + ; + } + } else { + if ($current_function->{abi} eq "svr4") { + push @xdata_seg, + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0xb3", # set frame pointer + ".byte 0,0", # padding + ".long 0,0" # pad to keep objdump happy + ; + } else { + push @xdata_seg, + ".byte 1,0,1,0x0b", # 1 unwind code, %r11 is FP + ".byte 0,0xb3", # set frame pointer + ".byte 0,0", # padding + ".long 0,0" # pad to keep objdump happy + ; + } + } + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret .= "${decor}SEH_body_${fname}${colon}\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $cfa_rsp = $cfa_off; + $ret = "${decor}SEH_epilogue_${fname}${colon}\n"; + if ($current_function->{abi} eq "svr4") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . 
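+  # Everything that is neither an instruction nor a label funnels
+  # through here.  cfi_directive gets the first shot, then .type is
+  # used to record per-function metadata (ABI, argument count, the
+  # optional "unwind" tag) that drives the synthetic Win64 prologues
+  # and epilogues, and the remaining directives are rewritten for the
+  # target assembler: e.g. .align 64 becomes ".p2align 6" for non-ELF
+  # gas and "ALIGN 64" for MASM/NASM, while .rodata is mapped to
+  # __TEXT,__const on macOS and to .rdata on Windows.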
+ sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern|\.comm/ + && do { $$line =~ s/([_a-z][_a-z0-9\$]*)/$prefix\1/gi; + $globals{$1} = $prefix.$1 if ($1); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + $current_function->{pc} = -1; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + $current_function->{pc} = -1; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . + "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + if (!$elf && $current_segment eq ".rodata") { + if ($flavour eq "macosx") { $self->{value} = ".section\t__TEXT,__const"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.rdata"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? 
"ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + $$line = ".rdata" if ($$line eq ".rodata"); + my %align = ( p=>4, x=>8, r=>256); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([pxr])data/) { + $v.=" rdata align=$align{$1}"; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([pxr])data/) { + $v.=" READONLY"; + $v.=" ALIGN($align{$1})" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}${colon}\n"; + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... 
+ +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... 
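+#
+# vprotd/vprotq below are the XOP rotate-by-immediate instructions
+# (XOP map 8); rxb() builds their 3-byte 0x8f escape.  As an example,
+# "vprotq \$32,%xmm0,%xmm1", which rotates every 64-bit lane of xmm0 by
+# 32 bits, is hand-assembled to the bytes 8f e8 78 c3 c8 20 (hex).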
+ +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +my $preproc_prefix = "#"; + +if ($nasm) { + $preproc_prefix = "%"; + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + $preproc_prefix = ""; + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + if ($line =~ m/^#\s*(if|elif|else|endif)(.*)/) { # pass through preproc + if ($win64 && $current_function->{abi} eq "svr4" + && $current_function->{narg} >= 0) { + print label::win64_args(); + } + print $preproc_prefix,$1,$2,"\n"; + next; + } + + if ($line =~ m|#\s*__SGX_LVI_HARDENING_CLOBBER__=(?:%?(r\w+))|) { + $ret_clobber = $1; + } + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($win64 && $current_function->{abi} eq "svr4" + && $current_function->{narg} >= 0) { + my $pc = $current_function->{pc}; + my $op = $opcode->{op}; + my $a0 = @args[0]->{value} if ($#args>=0); + if (!$current_function->{unwind} + || $pc == 0 && !($op eq "push" && $a0 eq "rbp") + || $pc == 1 && !($op eq "mov" && $a0 eq "rsp" + && @args[1]->{value} eq "rbp" + && ($current_function->{unwind} = "%rbp")) + || $pc > 1) { + print label::win64_args(); + } + } + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + + ++$current_function->{pc} if (defined($current_function)); + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64 && $#pdata_seg>1); +map { process($_) } @xdata_seg if ($win64 && $#xdata_seg>1); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +#ifndef __SGX_LVI_HARDENING__ +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +#endif +___ +} + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. 
+# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... +# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. 
In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. 
If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably the most significant limitation is that the +# frame pointer has to be at 16*n distance from the stack pointer at the +# exit from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as a surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as a temporary frame pointer(*). There are +# two exceptions to this rule. 1) One can set up a non-volatile register +# or %r11 as a frame pointer, but it must be last instruction in the +# prologue. 2) One can use 'push %rbp' as first instruction immediately +# followed by 'mov %rsp,%rbp' to use %rbp as "legacy" frame pointer. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior to the +# .cfi_epilogue directive. If it's a variable-frame subroutine, and a +# non-volatile register was used as a frame pointer, then the last +# instruction prior to the directive has to restore its original value. +# This means that final stack pointer adjustment would have to be +# pushed past the directive. Normally this would render the epilogue +# non-unwindable, so special care has to be taken. To resolve the +# dilemma, copy the frame pointer to a volatile register in advance. +# To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 
16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# An example of "legacy" frame pointer: +# +# .type legacy_frame_pointer,\@function,3,"unwind" # mind extra tag! +# legacy_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# mov %rsp,%rbp +# .cfi_def_cfa_register %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_alloca 40 +# .cfi_end_prologue # %rsp-%rbp has to be 16*n +# and \$-64,%rsp +# ... +# mov -8(%rbp),%rbx +# mov %rbp,%rsp +# .cfi_def_cfa_regiser %rsp +# pop %rbp # recognized by Windows +# .cfi_pop %rbp +# .cfi_epilogue +# ret +# .cfi_endproc +# .size legacy_frame_pointer,.-legacy_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/src/blst/src/blst_t.hpp b/src/blst/src/blst_t.hpp new file mode 100644 index 0000000000..2fe03aa72e --- /dev/null +++ b/src/blst/src/blst_t.hpp @@ -0,0 +1,624 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __BLST_T_HPP__ +#define __BLST_T_HPP__ + +/* + * These templates, blst_384_t and blst_256_t, allow to instantiate slim + * C++ shims to blst assembly with arbitrary moduli. Well, not literally + * arbitrary, as there are limitations. Most notably blst_384_t can not + * actually accommodate 384-bit moduli, only 383 and narrower. This is + * because of ct_inverse_mod_383's limitation. Though if you abstain + * from the reciprocal() method, even 384-bit modulus would work. As for + * blst_256_t, modulus has to be not larger than 2^256-2^192-1. 
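+ *
+ * Values are kept in Montgomery form throughout: an element a is stored
+ * as a*R mod P with R = 2^384 (resp. 2^256), to() enters the domain by
+ * Montgomery-multiplying with RR = R^2 mod P, from() and to_scalar()
+ * leave it, and M0 is the customary -1/P Montgomery constant reduced
+ * modulo the limb width, consumed by the mul_mont_*/redc_mont_*
+ * assembly.  The 'align' constructor flag (see vec_left_align() below)
+ * shifts a narrower modulus up so that its most significant bit lands
+ * at the top of the vector, which is how reciprocal() passes the
+ * modulus to ct_inverse_mod_383.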
+ */ + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +extern "C" { +#include "vect.h" +} +#include "bytes.h" + +#undef launder // avoid conflict with C++ >=17 + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + +static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t N) +{ + const unsigned int nbits = sizeof(inp[0])*8; + const unsigned int align = (0 - N) % nbits; + size_t n = (N + nbits - 1) / nbits; + + if (align) { + limb_t top = inp[n-1] << align; + + while (--n) { + limb_t next = inp[n-1]; + out[n] = top | next >> (nbits-align); + top = next << align; + } + out[0] = top; + } else { + for (size_t i = 0; i < n; i++) + out[i] = inp[i]; + } +} + +template +class blst_384_t { +private: + vec384 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + + static const size_t n = sizeof(vec384)/sizeof(limb_t); +public: + static const size_t nbits = N; + static constexpr size_t bit_length() { return N; } + static const unsigned int degree = 1; + typedef byte pow_t[384/8]; + typedef blst_384_t mem_t; + + inline blst_384_t() {} + inline blst_384_t(const vec384 p, bool align = false) + { + if (align) + vec_left_align(val, p, N); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_384_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_384_t(int a) : blst_384_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_384((limb_t *)scalar, val, MOD, M0); + } else { + vec384 out; + from_mont_384(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_384_t& one() + { return *reinterpret_cast(ONE); } + + static inline blst_384_t one(bool or_zero) + { + blst_384_t ret; + limb_t mask = ~((limb_t)0 - or_zero); + for (size_t i = 0; i < n; i++) + ret[i] = ONE[i] & mask; + return ret; + } + + inline blst_384_t& to() + { mul_mont_384(val, RR, val, MOD, M0); return *this; } + inline blst_384_t& from() + { from_mont_384(val, val, MOD, M0); return *this; } + + inline void store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_384_t& operator+=(const blst_384_t& b) + { add_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + add_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& operator<<=(unsigned l) + { lshift_mod_384(val, val, l, MOD); return *this; } + friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) + { + blst_384_t ret; + lshift_mod_384(ret, a, l, MOD); + return ret; + } + + inline blst_384_t& operator>>=(unsigned r) + { rshift_mod_384(val, val, r, MOD); return *this; } + friend inline blst_384_t operator>>(const blst_384_t& a, unsigned r) + { + blst_384_t ret; + rshift_mod_384(ret, a, r, MOD); + return ret; + } + + inline blst_384_t& operator-=(const blst_384_t& b) + { sub_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + sub_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& cneg(bool flag) + { 
cneg_mod_384(val, val, flag, MOD); return *this; } + friend inline blst_384_t cneg(const blst_384_t& a, bool flag) + { + blst_384_t ret; + cneg_mod_384(ret, a, flag, MOD); + return ret; + } + friend inline blst_384_t operator-(const blst_384_t& a) + { + blst_384_t ret; + cneg_mod_384(ret, a, true, MOD); + return ret; + } + + inline blst_384_t& operator*=(const blst_384_t& a) + { + if (this == &a) sqr_mont_384(val, val, MOD, M0); + else mul_mont_384(val, val, a, MOD, M0); + return *this; + } + friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + if (&a == &b) sqr_mont_384(ret, a, MOD, M0); + else mul_mont_384(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence! + friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_384_t ret; + sqr_mont_384(ret, a, MOD, M0); + return ret; + } else { + blst_384_t ret = a, sqr = a; + if ((p&1) == 0) { + do { + sqr_mont_384(sqr, sqr, MOD, M0); + p >>= 1; + } while ((p&1) == 0); + ret = sqr; + } + for (p >>= 1; p; p >>= 1) { + sqr_mont_384(sqr, sqr, MOD, M0); + if (p&1) + mul_mont_384(ret, ret, sqr, MOD, M0); + } + return ret; + } + } + inline blst_384_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_384(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_384_t operator()(unsigned p) + { return *this^p; } + friend inline blst_384_t sqr(const blst_384_t& a) + { return a^2; } + + inline bool is_one() const + { return vec_is_equal(val, ONE, sizeof(val)); } + + inline int is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + friend inline blst_384_t czero(const blst_384_t& a, int set_z) + { blst_384_t ret; + const vec384 zero = { 0 }; + vec_select(ret, zero, a, sizeof(ret), set_z); + return ret; + } + + static inline blst_384_t csel(const blst_384_t& a, const blst_384_t& b, + int sel_a) + { blst_384_t ret; + vec_select(ret, a, b, sizeof(ret), sel_a); + return ret; + } + + blst_384_t reciprocal() const + { + static const blst_384_t MODx{MOD, true}; + static const blst_384_t RRx4 = *reinterpret_cast(RR)<<2; + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, val, MOD, MODx); + redc_mont_384(temp.r[0], temp.x, MOD, M0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) + { return a * b.reciprocal(); } + inline blst_384_t& operator/=(const blst_384_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_384_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) + { return vec_is_equal(a, b, sizeof(vec384)); } + friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) + { return !vec_is_equal(a, b, sizeof(vec384)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_384_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str = buf; + + be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for 
(size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; + +template +class blst_256_t { + vec256 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + + static const size_t n = sizeof(vec256)/sizeof(limb_t); +public: + static const size_t nbits = N; + static constexpr size_t bit_length() { return N; } + static const unsigned int degree = 1; + typedef byte pow_t[256/8]; + typedef blst_256_t mem_t; + + inline blst_256_t() {} + inline blst_256_t(const vec256 p, bool align = false) + { + if (align) + vec_left_align(val, p, N); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_256_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_256((limb_t *)scalar, val, MOD, M0); + } else { + vec256 out; + from_mont_256(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_256_t& one() + { return *reinterpret_cast(ONE); } + + static inline blst_256_t one(bool or_zero) + { + blst_256_t ret; + limb_t mask = ~((limb_t)0 - or_zero); + for (size_t i = 0; i < n; i++) + ret[i] = ONE[i] & mask; + return ret; + } + + inline blst_256_t& to() + { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } + inline blst_256_t& to(const uint64_t a[2*n]) + { + mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); + add_mod_256(val, val, (const limb_t*)a, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + + return *this; + } + blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) + { + vec_zero(val, sizeof(val)); + + vec256 digit; + size_t rem = (n - 1) % 32 + 1; + n -= rem; + + if (le) { + limbs_from_le_bytes(val, bytes += n, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + while (n) { + limbs_from_le_bytes(digit, bytes -= 32, 32); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + n -= 32; + } + } else { + limbs_from_be_bytes(val, bytes, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += rem; + while (n) { + limbs_from_be_bytes(digit, bytes, 32); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += 32; + n -= 32; + } + } + + return *this; + } + + inline blst_256_t& from() + { from_mont_256(val, val, MOD, M0); return *this; } + inline blst_256_t& from(const uint64_t a[2*n]) + { + redc_mont_256(val, (const limb_t*)a, MOD, M0); + mul_mont_sparse_256(val, RR, val, MOD, M0); + + return *this; + } + inline blst_256_t& from(const unsigned char *bytes, size_t n, bool le = false) + { + if (n > 64) + return to(bytes, n, le).from(); + + if (n > 32) { + vec512 temp{0}; + if (le) limbs_from_le_bytes(temp, bytes, n); + else limbs_from_be_bytes(temp, bytes, n); + redc_mont_256(val, temp, MOD, M0); + mul_mont_sparse_256(val, RR, val, MOD, M0); + } else { + vec_zero(val, sizeof(val)); + if (le) limbs_from_le_bytes(val, bytes, n); + else limbs_from_be_bytes(val, bytes, n); + mul_mont_sparse_256(val, ONE, val, MOD, M0); + } + + return *this; + } + + inline void 
store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_256_t& operator+=(const blst_256_t& b) + { add_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + add_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& operator<<=(unsigned l) + { lshift_mod_256(val, val, l, MOD); return *this; } + friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) + { + blst_256_t ret; + lshift_mod_256(ret, a, l, MOD); + return ret; + } + + inline blst_256_t& operator>>=(unsigned r) + { rshift_mod_256(val, val, r, MOD); return *this; } + friend inline blst_256_t operator>>(blst_256_t a, unsigned r) + { + blst_256_t ret; + rshift_mod_256(ret, a, r, MOD); + return ret; + } + + inline blst_256_t& operator-=(const blst_256_t& b) + { sub_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + sub_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& cneg(bool flag) + { cneg_mod_256(val, val, flag, MOD); return *this; } + friend inline blst_256_t cneg(const blst_256_t& a, bool flag) + { + blst_256_t ret; + cneg_mod_256(ret, a, flag, MOD); + return ret; + } + friend inline blst_256_t operator-(const blst_256_t& a) + { + blst_256_t ret; + cneg_mod_256(ret, a, true, MOD); + return ret; + } + + inline blst_256_t& operator*=(const blst_256_t& a) + { + if (this == &a) sqr_mont_sparse_256(val, val, MOD, M0); + else mul_mont_sparse_256(val, val, a, MOD, M0); + return *this; + } + friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); + else mul_mont_sparse_256(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence!
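An editorial note, not part of the patch: the built-in '^' binds more loosely than '*', '+' and '==', so the result of this exponentiation has to be parenthesized at call sites. A one-line illustration, reusing the hypothetical fp_t instantiation from the earlier aside:

    fp_t three_x_squared(const fp_t& x)
    {   return fp_t(3) * (x^2);   /* without the parentheses this parses as
                                     (fp_t(3) * x) ^ 2, i.e. 9*x^2 */
    }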
+ friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_256_t ret; + sqr_mont_sparse_256(ret, a, MOD, M0); + return ret; + } else { + blst_256_t ret = a, sqr = a; + if ((p&1) == 0) { + do { + sqr_mont_sparse_256(sqr, sqr, MOD, M0); + p >>= 1; + } while ((p&1) == 0); + ret = sqr; + } + for (p >>= 1; p; p >>= 1) { + sqr_mont_sparse_256(sqr, sqr, MOD, M0); + if (p&1) + mul_mont_sparse_256(ret, ret, sqr, MOD, M0); + } + return ret; + } + } + inline blst_256_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_sparse_256(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_256_t operator()(unsigned p) + { return *this^p; } + friend inline blst_256_t sqr(const blst_256_t& a) + { return a^2; } + + inline bool is_one() const + { return vec_is_equal(val, ONE, sizeof(val)); } + + inline int is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + friend inline blst_256_t czero(const blst_256_t& a, int set_z) + { blst_256_t ret; + const vec256 zero = { 0 }; + vec_select(ret, zero, a, sizeof(ret), set_z); + return ret; + } + + static inline blst_256_t csel(const blst_256_t& a, const blst_256_t& b, + int sel_a) + { blst_256_t ret; + vec_select(ret, a, b, sizeof(ret), sel_a); + return ret; + } + + blst_256_t reciprocal() const + { + static const blst_256_t MODx{MOD, true}; + union { vec512 x; vec256 r[2]; } temp; + + ct_inverse_mod_256(temp.x, val, MOD, MODx); + redc_mont_256(temp.r[0], temp.x, MOD, M0); + mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_256_t operator/(int one, const blst_256_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) + { return a * b.reciprocal(); } + inline blst_256_t& operator/=(const blst_256_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_256_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) + { return vec_is_equal(a, b, sizeof(vec256)); } + friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) + { return !vec_is_equal(a, b, sizeof(vec256)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_256_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str=buf; + + be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for (size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; +#endif diff --git a/src/blst/src/bulk_addition.c b/src/blst/src/bulk_addition.c new file mode 100644 index 0000000000..4d36f405b6 --- /dev/null +++ b/src/blst/src/bulk_addition.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. + * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. + * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. 
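An editorial sketch, not part of the patch, of the batch-inversion idea that the macros below interleave with the head/tail steps: accumulate running products, invert once, then walk backwards peeling one factor off per element. It reuses the hypothetical fp_t instantiation from the earlier aside and assumes all inputs are non-zero.

    #include <vector>

    static void batch_invert(std::vector<fp_t>& v)
    {
        std::vector<fp_t> prefix(v.size());
        fp_t acc = fp_t::one();

        for (size_t i = 0; i < v.size(); i++) { /* forward pass: prefix products */
            prefix[i] = acc;
            acc *= v[i];
        }

        acc = 1 / acc;                          /* the only field inversion */

        for (size_t i = v.size(); i--;) {       /* backward pass: recover 1/v[i] */
            fp_t inv_i = acc * prefix[i];
            acc *= v[i];                        /* drop v[i] from the running inverse */
            v[i] = inv_i;
        }
    }

Three extra multiplications per element replace one inversion per element, which is where the amortization quoted above comes from.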
+ */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; i> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restrict'-ed storage... 
+ */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + limb_t limb; + size_t i, j, r; + + if ((uptr_t)out == (uptr_t)in && is_endian.little) + return; + + r = n % sizeof(limb_t); + n /= sizeof(limb_t); + + for(i = 0; i < n; i++) { + for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) + *out++ = (unsigned char)limb; + } + if (r) { + for (limb = in[i], j = 0; j < r; j++, limb >>= 8) + *out++ = (unsigned char)limb; + } +} + +static inline char hex_from_nibble(unsigned char nibble) +{ + int mask = (9 - (nibble &= 0xf)) >> 31; + return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); +} + +static unsigned char nibble_from_hex(char c) +{ + int mask, ret; + + mask = (('a'-c-1) & (c-1-'f')) >> 31; + ret = (10 + c - 'a') & mask; + mask = (('A'-c-1) & (c-1-'F')) >> 31; + ret |= (10 + c - 'A') & mask; + mask = (('0'-c-1) & (c-1-'9')) >> 31; + ret |= (c - '0') & mask; + mask = ((ret-1) & ~mask) >> 31; + ret |= 16 & mask; + + return (unsigned char)ret; +} + +static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) +{ + size_t len; + unsigned char b = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + bytes_zero(ret, sz); + + while(len--) { + b <<= 4; + b |= nibble_from_hex(*hex++); + if (len % 2 == 0) + ret[len / 2] = b; + } +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble_from_hex(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +#endif diff --git a/src/blst/src/client_min_pk.c b/src/blst/src/client_min_pk.c new file mode 100644 index 0000000000..0fcf563f50 --- /dev/null +++ b/src/blst/src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/src/blst/src/client_min_sig.c b/src/blst/src/client_min_sig.c new file mode 100644 index 0000000000..8e4663daed --- /dev/null +++ b/src/blst/src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/src/blst/src/consts.c b/src/blst/src/consts.c new file mode 100644 index 0000000000..021c878a25 --- /dev/null +++ b/src/blst/src/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/src/blst/src/consts.h b/src/blst/src/consts.h new file mode 100644 index 0000000000..cb391b817d --- /dev/null +++ b/src/blst/src/consts.h @@ -0,0 +1,30 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +#define ZERO_384 (BLS12_381_Rx.p2[1]) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/src/blst/src/cpuid.c b/src/blst/src/cpuid.c new file mode 100644 index 0000000000..82317043ce --- /dev/null +++ b/src/blst/src/cpuid.c @@ -0,0 +1,114 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32) +__attribute__((visibility("hidden"))) +#endif +int __blst_platform_cap = 0; + +#if defined(__x86_64__) || defined(__x86_64) || (defined(_M_X64) && !defined(_M_ARM64EC)) + +# if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C) +static void __cpuidex(int info[4], int func, int sub) +{ + int eax, ebx, ecx, edx; + + __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(sub)); + + info[0] = eax; + info[1] = ebx; + info[2] = ecx; + info[3] = edx; +} +# else +# include +# endif + +# if defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +# endif +static int __blst_cpuid(void) +{ + int info[4], cap = 0; + + __cpuidex(info, 0, 0); + if (info[0] > 6) { + __cpuidex(info, 7, 0); + cap |= (info[1]>>19) & 1; /* ADX */ + cap |= (info[1]>>28) & 2; /* SHA */ + } + + __blst_platform_cap = cap; + + return 0; +} + +# if defined(_MSC_VER) && !defined(__clang__) +# pragma section(".CRT$XCU",read) +__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; +# elif defined(__SUNPRO_C) +# pragma init(__blst_cpuid) +# endif + +#elif defined(__aarch64__) || defined(__aarch64) || defined(_M_ARM64) + +# if defined(__linux__) && (defined(__GNUC__) || defined(__clang__)) +extern unsigned long getauxval(unsigned long type) __attribute__ ((weak)); + +__attribute__((constructor)) +static int __blst_cpuid(void) +{ + int cap = 0; + + if (getauxval) { + unsigned long hwcap_ce = getauxval(16); + cap = (hwcap_ce>>6) & 1; /* SHA256 */ + } + + __blst_platform_cap = cap; + + return 0; +} +# elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__)) +__attribute__((constructor)) +static int __blst_cpuid() +{ + __blst_platform_cap = 1; /* SHA256 */ + return 0; +} +# elif defined(__FreeBSD__) && __FreeBSD__ >= 12 +# include +__attribute__((constructor)) +static int __blst_cpuid() +{ + unsigned long cap; + + if (elf_aux_info(AT_HWCAP, &cap, sizeof(cap)) == 0) + __blst_platform_cap = (cap & HWCAP_SHA2) != 0; + + return 0; +} +# elif defined(_WIN64) +int IsProcessorFeaturePresent(int); + +# if defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +# endif +static int __blst_cpuid(void) +{ + __blst_platform_cap = IsProcessorFeaturePresent(30); /* AES, SHA1, SHA2 */ + + return 0; +} + +# if defined(_MSC_VER) && !defined(__clang__) +# pragma section(".CRT$XCU",read) +__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; +# endif +# endif + +#endif diff --git a/src/blst/src/e1.c b/src/blst/src/e1.c new file mode 100644 index 0000000000..f8a7be7bc1 --- /dev/null +++ b/src/blst/src/e1.c @@ -0,0 +1,564 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 2); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 4); } + +static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i < sizeof(val.s);) { + val.s[i++] = scalar[j] & mask; + mask = (size_t)0 - ((i - top) >> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } + +size_t blst_p1_sizeof(void) +{ return sizeof(POINTonE1); } + +size_t blst_p1_affine_sizeof(void) +{ return sizeof(POINTonE1_affine); } diff --git a/src/blst/src/e2.c b/src/blst/src/e2.c new file mode 100644 index 0000000000..77f8064bce --- /dev/null +++ b/src/blst/src/e2.c @@ -0,0 +1,638 @@
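An editorial sketch, not part of the patch, exercising the E1 entry points defined above: multiply the generator by a scalar and round-trip the result through the 48-byte compressed encoding. It repeats the prototypes that normally come from the public blst.h binding (not shown in this excerpt) and assumes the internal headers can be consumed from C++ the same way blst_t.hpp wraps vect.h.

    #include <cstddef>
    extern "C" {
    #include "point.h"      /* POINTonE1, POINTonE1_affine and the vect.h types */
    #include "errors.h"     /* BLST_ERROR, BLST_SUCCESS */

    void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a,
                      const byte *scalar, size_t nbits);
    const POINTonE1 *blst_p1_generator(void);
    void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a);
    void blst_p1_compress(unsigned char out[48], const POINTonE1 *in);
    BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]);
    int blst_p1_affine_is_equal(const POINTonE1_affine *a, const POINTonE1_affine *b);
    }

    bool compress_roundtrip(const byte scalar_le[32])
    {
        POINTonE1 P;
        POINTonE1_affine A, B;
        unsigned char buf[48];

        blst_p1_mult(&P, blst_p1_generator(), scalar_le, 255); /* w4/GLV dispatch above */
        blst_p1_to_affine(&A, &P);

        blst_p1_compress(buf, &P);
        if (blst_p1_uncompress(&B, buf) != BLST_SUCCESS)
            return false;

        return blst_p1_affine_is_equal(&A, &B) != 0;
    }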
+/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + 
sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + lshift_fp(out[1], out[1], 2); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return 
POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } + +size_t blst_p2_sizeof(void) +{ return sizeof(POINTonE2); } + +size_t blst_p2_affine_sizeof(void) +{ return sizeof(POINTonE2_affine); } diff --git a/src/blst/src/ec_mult.h b/src/blst/src/ec_mult.h new file mode 100644 index 0000000000..24c151baae --- /dev/null +++ b/src/blst/src/ec_mult.h @@ -0,0 +1,316 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
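It assembles a value from the four bytes starting at |off|/8 and shifts out the |off|%8 low bits, so at least 32-7 = 25 bits remain.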
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + launder(mask); + + wval = (wval + 1) >> 1; + wval = (wval ^ mask) - mask; + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling algorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static bool_t ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1< 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ret_is_inf = ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \ + ptype##_dadd(sum, ret, row, NULL); \ + ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \ + sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \ + ret_is_inf |= sum_is_inf; \ + row_is_inf |= sum_is_inf; \ + ptype##_ccopy(ret, row, ret_is_inf); \ + ret_is_inf &= row_is_inf; \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \ + ptype##_dadd(sum, ret, row, NULL); \ + ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \ + sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \ + ret_is_inf |= sum_is_inf; \ + row_is_inf |= sum_is_inf; \ + ptype##_ccopy(ret, row, ret_is_inf); \ + ret_is_inf &= row_is_inf; \ + } \ +\ + vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype sum[1], row[1]; \ + bool_t sum_is_inf, row_is_inf, ret_is_inf; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ret_is_inf = ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + row_is_inf = ptype##_gather_booth_w##SZ(row, table, wval); \ + if (bits > 0) ptype##_add(sum, ret, row); \ + else ptype##_dadd(sum, ret, row, NULL); \ + ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \ + sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \ + ret_is_inf |= sum_is_inf; \ + row_is_inf |= sum_is_inf; \ + ptype##_ccopy(ret, row, ret_is_inf); \ + ret_is_inf &= row_is_inf; \ + } \ +\ + vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. 
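(Z==1 eliminates the Z2^2 and Z2^3 products, bringing the best-case addition cost from 11M+5S down to 7M+4S.)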
And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... + */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/src/blst/src/ec_ops.h b/src/blst/src/ec_ops.h new file mode 100644 index 0000000000..0d531f816e --- /dev/null +++ b/src/blst/src/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
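+ *
+ * For example, POINT_DADD_IMPL(POINTonE2, 384x, fp2) expands to
+ * POINTonE2_dadd(), the routine behind blst_p2_add_or_double(), with
+ * vec##bits resolving to vec384x and the *_##field calls to the fp2 helpers.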
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
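+ *
+ * For example, POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2,
+ * BLS12_381_Rx.p2) yields POINTonE2_dadd_affine() used by
+ * blst_p2_add_or_double_affine(); the trailing vec_select calls return
+ * |p2| promoted with Z = |one| when |p1| is at infinity, and |p1| when
+ * |p2| is.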
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
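+ *
+ * Unlike ptype##_dadd above, this formula is not unified: P+P gives H==0
+ * and r==0, so the result degenerates to Z3==0, i.e. infinity; it is only
+ * safe when the two inputs are known to be distinct.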
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
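+ *
+ * |one| supplies the Z coordinate of the result when |p1| is at infinity,
+ * so the affine |p2| is returned in valid Jacobian form; the POINTonE2
+ * instantiation passes BLS12_381_Rx.p2 for it.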
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
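+ *
+ * The bracketed [-a*B] and [+a*ZZ] terms drop out because this is the a==0
+ * specialization, hence the _A0 suffix; mul_by_4b_##suffix4b multiplies by
+ * 4*b of the curve equation, e.g. mul_by_4b_onE2 in the POINTonE2
+ * instantiation.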
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
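+ *
+ * When |s| is at infinity the scalar was order-1 and the true result is
+ * -|p|, so the final vec_select/ptype##_cneg pair falls back to |p|'s
+ * coordinates and flips the sign.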
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
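+ *
+ * The step numbers in the comments below appear out of order because they
+ * keep the paper's algorithm-9 numbering; the operations were reordered so
+ * that every read of |p1| precedes the first write to |p3|, which is what
+ * makes p3 == p1 aliasing safe.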
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
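+ *
+ * (The 4M term counts the U1, S1, U2, S2 products that are always computed
+ * before the add-or-double decision; the doubling branch itself then costs
+ * another 6M+3S on top of them.)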
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
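+ *
+ * The |subtract| flag folds point negation into the same call: S2 is
+ * conditionally negated up front, and ZZZ3 on the doubling and infinity
+ * paths, which is what the "even subtractions" twist above refers to.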
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/src/blst/src/errors.h b/src/blst/src/errors.h new file mode 100644 index 0000000000..425daeb486 --- /dev/null +++ b/src/blst/src/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/src/blst/src/exp.c b/src/blst/src/exp.c new file mode 100644 index 0000000000..55c5c5a787 --- /dev/null +++ b/src/blst/src/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/src/blst/src/exports.c b/src/blst/src/exports.c new file mode 100644 index 0000000000..1ca4d4757f --- /dev/null +++ b/src/blst/src/exports.c @@ -0,0 +1,583 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" +#include "bytes.h" + +/* + * BLS12-381-specific Fr shortcuts to assembly. 
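+ *
+ * These work modulo BLS12_381_r, the group order; the multiplicative ones
+ * expect and return operands in Montgomery form, with blst_fr_to() and
+ * blst_fr_from() converting a canonical value to and from that domain.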
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); + sub_mod_256(x1, x0, x2, BLS12_381_r); + add_mod_256(x0, x0, x2, BLS12_381_r); +} + +void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + sub_mod_256(x2, x0, x1, BLS12_381_r); + add_mod_256(x0, x0, x1, BLS12_381_r); + mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); +} + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 t[2]; + const union { + long one; + char little; + } is_endian = { 1 }; + bool_t is_zero; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(t[0], a, sizeof(pow256)); + limbs_from_le_bytes(t[1], b, sizeof(pow256)); + a = (const byte *)t[0]; + b = (const byte *)t[1]; + } + mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); + mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); + le_bytes_from_limbs(ret, t[0], sizeof(pow256)); + is_zero = vec_is_zero(t[0], sizeof(vec256)); + vec_zero(t, sizeof(t)); + + return (int)(is_zero^1); +} + +void blst_sk_inverse(pow256 
ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specific Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const 
vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specific Fp2 shortcuts to assembly. + */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deserialization. + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + 
+ for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + + n -= rem; + limbs_from_le_bytes(t.out, bytes += n, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n) { + limbs_from_le_bytes(t.digit, bytes -= 32, 32); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + n -= 32; + } + + from_mont_256(t.out, t.out, BLS12_381_r, r0); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(&t, sizeof(t)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + + limbs_from_be_bytes(t.out, bytes, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n -= rem) { + limbs_from_be_bytes(t.digit, bytes += rem, 32); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + rem = 32; + } + + from_mont_256(t.out, t.out, BLS12_381_r, r0); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(&t, sizeof(t)); + + return (int)(ret^1); +} + +/* + * Single-short SHA-256 hash function. + */ +#include "sha256.h" + +void blst_sha256(unsigned char md[32], const void *msg, size_t len) +{ + SHA256_CTX ctx; + + sha256_init(&ctx); + sha256_update(&ctx, msg, len); + sha256_final(md, &ctx); +} + +/* + * Test facilitator. + */ +void blst_scalar_from_hexascii(pow256 ret, const char *hex) +{ bytes_from_hexascii(ret, sizeof(pow256), hex); } + +void blst_fr_from_hexascii(vec256 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec256), hex); + mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fp_from_hexascii(vec384 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/src/blst/src/fields.h b/src/blst/src/fields.h new file mode 100644 index 0000000000..4b2323d2cc --- /dev/null +++ b/src/blst/src/fields.h @@ -0,0 +1,116 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +/* + * BLS12-381-specific Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specific Fp2 shortcuts to assembly. + */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, 
const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/src/blst/src/fp12_tower.c b/src/blst/src/fp12_tower.c new file mode 100644 index 0000000000..d6c0b124eb --- /dev/null +++ b/src/blst/src/fp12_tower.c @@ -0,0 +1,789 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! 
|ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + 
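    /*
     * The lines below are the "widened" Fp2x2 pattern described at the top
     * of this file in action: several unreduced 768-bit intermediates are
     * combined and only one Montgomery reduction is paid per output
     * coefficient. A minimal sketch of the idiom (illustrative only; |t| and
     * |u| are scratch vec768x):
     *
     *     mul_fp2x2(t, a, b);     full-width product, no reduction yet
     *     sqr_fp2x2(u, c);        another unreduced product
     *     add_fp2x2(t, t, u);     accumulate at double width
     *     redc_fp2x2(ret, t);     single reduction back to vec384x
     */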
sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, 
rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... 
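     * (|ret| is allowed to alias |a| or |b|, so ret[0] must not be written
     * while a[0] and b[0] are still needed below; the deferred addition is
     * performed at the very end of the function instead.)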
*/ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, 
a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), 
TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specific Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } + +void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) +{ + size_t i, j; + vec384 out; + + for (i = 0; i < 3; i++) { + for (j = 0; j < 2; j++) { + from_fp(out, a[j][i][0]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + from_fp(out, a[j][i][1]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + } + } +} + +size_t blst_fp12_sizeof(void) +{ return sizeof(vec384fp12); } diff --git a/src/blst/src/hash_to_field.c b/src/blst/src/hash_to_field.c new file mode 100644 index 0000000000..6816ea8b92 --- /dev/null +++ b/src/blst/src/hash_to_field.c @@ -0,0 +1,177 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
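                         * (For reference, the chaining implemented below
                         * follows the expand_message_xmd construction:
                         * b_0 = H(Z_pad || aug || msg ||
                         * I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime),
                         * b_1 = H(b_0 || I2OSP(1, 1) || DST_prime), and for
                         * i > 1 each block is b_i = H(strxor(b_0, b_(i-1)) ||
                         * I2OSP(i, 1) || DST_prime), i.e. the vec_xor() and
                         * the ++b_i.c[32] counter bump in the loop below.)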
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/src/blst/src/keygen.c b/src/blst/src/keygen.c new file mode 100644 index 0000000000..9b62f16b53 --- /dev/null +++ b/src/blst/src/keygen.c @@ -0,0 +1,319 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "bytes.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, +#ifndef __BLST_HKDF_TESTMODE__ + int IKM_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (IKM_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); + } +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, +#ifndef __BLST_HKDF_TESTMODE__ + int info_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (info_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; + } +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +static void keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len, + int version) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; + + if (IKM_len < 32 || (version > 4 && salt == NULL)) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + if (salt == NULL) { + salt = salt_prime; + salt_len = 20; + } + + if (version == 4) { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + while (1) { + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, 1, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, 1, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. 
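         * The net effect of redc_mont_256 (a multiplication by 2^-256 mod r)
         * followed by the Montgomery multiplication by rRR (2^512 mod r)
         * below is therefore a plain reduction of the 384-bit OKM modulo r,
         * i.e. exactly SK = OS2IP(OKM) mod r, with the result left outside
         * the Montgomery domain.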
+ */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + + if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) + break; + + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } + +void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } + +void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } + +void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } + +/* + * https://eips.ethereum.org/EIPS/eip-2333 + */ +void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) +{ keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } + +static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, + unsigned int index) +{ + size_t i; + struct { + HMAC_SHA256_CTX ctx; + SHA256_CTX ret; + unsigned char PRK[32], IKM[32]; + unsigned char lamport[255][32]; + } scratch; + + /* salt = I2OSP(index, 4) */ + unsigned char salt[4] = { (unsigned char)(index>>24), + (unsigned char)(index>>16), + (unsigned char)(index>>8), + (unsigned char)(index) }; + + /* IKM = I2OSP(parent_SK, 32) */ + for (i = 0; i < 32; i++) + scratch.IKM[i] = parent_SK[31-i]; + + /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ + scratch.ctx.ctx.buf[63] = 0; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_init(&scratch.ret); + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + + /* not_IKM = flip_bits(IKM) */ + for (i = 0; i< 32; i++) + scratch.IKM[i] = ~scratch.IKM[i]; + + /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + 
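        /*
         * Each 32-byte chunk is hashed with a single compression-function
         * call: the SHA-256 padding (the 0x80 byte and the 256-bit length)
         * was pre-set in ctx.buf just above the loop, so only the message
         * bytes have to be copied in before sha256_block_data_order().
         * Functionally equivalent to blst_sha256(scratch.lamport[i],
         * scratch.lamport[i], 32), just without the generic update/final
         * overhead.
         */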
sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + sha256_final(PK, &scratch.ret); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, + unsigned int child_index) +{ + parent_SK_to_lamport_PK(SK, parent_SK, child_index); + keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); +} +#endif diff --git a/src/blst/src/map_to_g1.c b/src/blst/src/map_to_g1.c new file mode 100644 index 0000000000..6613d68bb2 --- /dev/null +++ b/src/blst/src/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... 
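     * The k_(i,j) constants below are the coefficients of the degree-11
     * isogeny from the isogenous curve E1' (y^2 = x^3 + A'*x + B') back to
     * E1, as tabulated in the hash-to-curve specification. The map is
     * evaluated projectively: the input point is Jacobian (X, Y, Z) with
     * x' = X/Z^2, so each coefficient is pre-multiplied by the matching
     * power of Z^2 (the Zz_powers table) and no field inversion is needed.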
+ */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
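    /*
     * (The remaining entries continue the same square-and-multiply ladder
     * down to ZZ^15. Once the table is filled, map_fp_times_Zz() scales
     * every isogeny coefficient by its matching power of Z^2 and map_fp()
     * evaluates the polynomial by Horner's rule, e.g. for the x numerator:
     *     xn = ((k_11*X + k_10*ZZ)*X + k_9*ZZ^2)*X + ... + k_0*ZZ^11
     * which equals ZZ^11 times x_num(x'), since X = x'*ZZ.)
     */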
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
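+    /* infinity (all-zero input) keeps Z = 0, otherwise Z = 1 in Montgomery form */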
return (int)POINTonE1_in_G1(&P); +} diff --git a/src/blst/src/map_to_g2.c b/src/blst/src/map_to_g2.c new file mode 100644 index 0000000000..90fd86e9d3 --- /dev/null +++ b/src/blst/src/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/src/blst/src/multi_scalar.c b/src/blst/src/multi_scalar.c new file mode 100644 index 0000000000..55ab822771 --- /dev/null +++ b/src/blst/src/multi_scalar.c @@ -0,0 +1,427 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = strideZ, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ \ + if (npoints == 1) { \ + prefix##_from_affine(ret, points[0]); \ + prefix##_mult(ret, ret, scalars[0], nbits); \ + return; \ + } \ + if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT) { \ + ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ + ptype##s_precompute_wbits(table, 4, points, npoints); \ + ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ + return; \ + } \ + ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ +} + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/src/blst/src/no_asm.h b/src/blst/src/no_asm.h new file mode 100644 index 0000000000..be7bf47e19 --- /dev/null +++ b/src/blst/src/no_asm.h @@ -0,0 +1,1345 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) +# error "unsupported compiler" +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +#if !defined(__clang__) && !defined(__builtin_assume) +# if defined(__GNUC__) && __GNUC__>=5 +# define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() +# elif defined(_MSC_VER) +# define __builtin_assume(condition) __assume(condition) +# else +# define __builtin_assume(condition) (void)(condition) +# endif +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return 
add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) +{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? 
prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... + */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + __builtin_assume(count != 0); + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... 
*/ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#if defined(__GNUC__) || defined(__clang__) +# define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) +#else +# define MSB(x) ((x) >> (LIMB_T_BITS-1)) +#endif + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. + */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] 
were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + __builtin_assume(n != 0); + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); + carry = a_[i+1]; + ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + __builtin_assume(n != 0); + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 
- MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. 
The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? */ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ +#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 
0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; iZ); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, 
Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
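+ * The line returned by line_add/line_dbl is sparse: only three of the six
+ * fp2 coefficients are non-zero (the "xy00z0" layout consumed by
+ * mul_by_xy00z0_fp12), and two of them still have to be scaled by the
+ * affine coordinates of P. Precomputing -2*P->X and 2*P->Y once per point
+ * lets line_by_Px2 finish each line evaluation with four fp
+ * multiplications; the extra constant factors this introduces lie in the
+ * base field and are eliminated by the final exponentiation, so they do
+ * not affect the pairing value.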
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
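+ * When the same Q is paired against many different P's, the 68 Q-dependent
+ * line functions can be computed once and then replayed here per P. A
+ * minimal sketch, assuming affine, group-checked inputs and the blst_*
+ * wrappers exported further below (blst_final_exp turns the Miller-loop
+ * output into the actual pairing value):
+ *
+ *   vec384fp6  lines[68];
+ *   vec384fp12 f;
+ *   blst_precompute_lines(lines, Q);       -- depends on Q only
+ *   blst_miller_loop_lines(f, lines, P);   -- per-P work
+ *   blst_final_exp(f, f);                  -- f = e(P, Q)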
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +#ifndef MILLER_LOOP_N_MAX +# define MILLER_LOOP_N_MAX 16 +#endif + +void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], + const POINTonE1_affine *const Ps[], + size_t n) +{ /* ~10KB of stack storage */ + POINTonE2 T[MILLER_LOOP_N_MAX]; + POINTonE2_affine Q[MILLER_LOOP_N_MAX]; + POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; + const POINTonE2_affine *Qptr = NULL; + const POINTonE1_affine *Pptr = NULL; + size_t i, j; + + for (i = 0, j = 0; j < n; j++) { + Qptr = *Qs ? *Qs++ : Qptr+1; + Pptr = *Ps ? *Ps++ : Pptr+1; + + /* Move common expression from line evaluation to line_by_Px2. 
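+ * Note the Qptr/Pptr convention a few lines above: Qs[] and Ps[] are
+ * arrays of pointers, and once a NULL entry is reached the walk continues
+ * with the element that follows the previously used one in memory, so a
+ * contiguous array can be passed as { &q[0], NULL }. A minimal
+ * multi-pairing sketch, assuming n affine, group-checked points q[] and p[]:
+ *
+ *   const POINTonE2_affine *const Qs[2] = { q, NULL };
+ *   const POINTonE1_affine *const Ps[2] = { p, NULL };
+ *   vec384fp12 f;
+ *   blst_miller_loop_n(f, Qs, Ps, n);      -- n Miller loops, multiplied together
+ *   blst_final_exp(f, f);                  -- product of the n pairings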
*/ + add_fp(Px2[i].X, Pptr->X, Pptr->X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); + + vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); + vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + + if (++i == MILLER_LOOP_N_MAX || j == n-1) { + vec384fp12 tmp; + vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? out : tmp; + + /* first step is ret = 1^2*line, which is just ret = line */ + start_dbl_n(ret, T, Px2, i); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, i, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, i, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, i, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, i, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, i, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ + + if (j >= MILLER_LOOP_N_MAX) + mul_fp12(out, out, ret); + + i = 0; + } + } +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/src/blst/src/pentaroot-addchain.h b/src/blst/src/pentaroot-addchain.h new file mode 100644 index 0000000000..5bdd9ddf7f --- /dev/null +++ b/src/blst/src/pentaroot-addchain.h @@ -0,0 +1,333 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which + * yields 5th root of the base. 
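+ * The macro below is a straight-line addition-chain exponentiation. The
+ * including file is expected to define sqr(), mul() and sqr_n_mul(), where
+ * sqr_n_mul(out, a, n, b) computes out = a^(2^n) * b (see the fallback
+ * implementation in pentaroot.c later in this patch). The commented-out
+ * sqr() lines merely document the running exponent, in hex, after each
+ * implied squaring; the squarings themselves happen inside sqr_n_mul().
+ * Since 5 times this exponent is 1 modulo BLS12_381_r-1, pentaroot
+ * followed by pentapow round-trips any non-zero scalar, e.g. (a sketch
+ * using the functions from pentaroot.c):
+ *
+ *   vec256 x, root, back;
+ *   blst_fr_pentaroot(root, x);    -- root = x^(1/5)
+ *   blst_fr_pentapow(back, root);  -- back = root^5 == x for non-zero x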
+ * + * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805' + * https://github.com/kwantam/addchain + * # Bos-Coster (win=4) : 307 (15) + * # Bos-Coster (win=10) : 307 (18) + * # Yacobi : 319 (16) + * # Bos-Coster (win=2) : 319 ( 5) + * # Bos-Coster (win=5) : 306 (19) <<< + * # Bos-Coster (win=7) : 311 (22) + * # Bos-Coster (win=9) : 313 (20) + * # Bos-Coster (win=3) : 314 ( 9) + * # Bos-Coster (win=6) : 309 (21) + * # Bos-Coster (win=8) : 309 (23) + * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5) + */ + +#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \ +ptype t[19]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[7], t[1]); /* 1: 2 */\ +sqr(t[0], t[7]); /* 2: 4 */\ +sqr(t[2], t[0]); /* 3: 8 */\ +mul(t[10], t[2], t[1]); /* 4: 9 */\ +mul(t[3], t[10], t[7]); /* 5: b */\ +mul(t[1], t[10], t[0]); /* 6: d */\ +mul(t[5], t[3], t[0]); /* 7: f */\ +mul(t[9], t[10], t[2]); /* 8: 11 */\ +mul(t[4], t[3], t[2]); /* 9: 13 */\ +mul(t[15], t[5], t[2]); /* 10: 17 */\ +mul(t[8], t[15], t[2]); /* 11: 1f */\ +mul(t[13], t[8], t[7]); /* 12: 21 */\ +mul(t[14], t[8], t[0]); /* 13: 23 */\ +mul(t[12], t[13], t[0]); /* 14: 25 */\ +mul(t[6], t[8], t[2]); /* 15: 27 */\ +mul(t[11], t[14], t[2]); /* 16: 2b */\ +sqr(t[0], t[15]); /* 17: 2e */\ +mul(t[18], t[6], t[2]); /* 18: 2f */\ +mul(t[2], t[11], t[2]); /* 19: 33 */\ +mul(t[16], t[2], t[7]); /* 20: 35 */\ +mul(t[7], t[0], t[3]); /* 21: 39 */\ +mul(t[17], t[0], t[5]); /* 22: 3d */\ +/* sqr(t[0], t[0]); */ /* 23: 5c */\ +/* sqr(t[0], t[0]); */ /* 24: b8 */\ +/* sqr(t[0], t[0]); */ /* 25: 170 */\ +/* sqr(t[0], t[0]); */ /* 26: 2e0 */\ +/* sqr(t[0], t[0]); */ /* 27: 5c0 */\ +/* sqr(t[0], t[0]); */ /* 28: b80 */\ +/* sqr(t[0], t[0]); */ /* 29: 1700 */\ +sqr_n_mul(t[0], t[0], 7, t[18]); /* 30: 172f */\ +/* sqr(t[0], t[0]); */ /* 31: 2e5e */\ +/* sqr(t[0], t[0]); */ /* 32: 5cbc */\ +/* sqr(t[0], t[0]); */ /* 33: b978 */\ +/* sqr(t[0], t[0]); */ /* 34: 172f0 */\ +/* sqr(t[0], t[0]); */ /* 35: 2e5e0 */\ +/* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ +/* sqr(t[0], t[0]); */ /* 38: b97c2 */\ +/* sqr(t[0], t[0]); */ /* 39: 172f84 */\ +/* sqr(t[0], t[0]); */ /* 40: 2e5f08 */\ +/* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ +/* sqr(t[0], t[0]); */ /* 42: b97c20 */\ +/* sqr(t[0], t[0]); */ /* 43: 172f840 */\ +sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ +/* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ +/* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ +/* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ +/* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ +/* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ +/* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ +sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ +/* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ +/* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ +/* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ +/* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ +/* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ +/* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ +/* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ +/* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ +/* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ +/* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ +/* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ +sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ +/* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ +/* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ +/* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ +/* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ +/* sqr(t[0], 
t[0]); */ /* 69: 5cbe1f75bae0 */\ +/* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ +/* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ +/* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ +/* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ +/* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ +/* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ +/* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ +/* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ +/* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ +/* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ +/* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ +sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ +/* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ +/* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ +/* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ +/* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ +/* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ +/* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ +/* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ +/* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ +/* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ +/* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ +/* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ +/* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ +/* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ +/* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ +/* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ +/* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ +/* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ +/* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ +/* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ +/* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ +/* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ +/* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 */\ +/* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ +/* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ +/* sqr(t[0], t[0]); */ /* 110: 5cbe1f75bae46439c2940 */\ +/* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ +/* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ +sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ +/* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ +/* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ +/* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ +/* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ +/* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ +/* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ +/* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ +/* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ +/* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ +/* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ +/* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ +/* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ +/* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ +/* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ +/* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ +/* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ 
+/* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ +/* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ +/* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ +/* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ +/* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ +/* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ +/* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ +/* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ +/* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ +/* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ +/* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ +/* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ +/* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ +/* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ +/* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ +/* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ +/* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ +/* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ +/* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ +/* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ +/* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ +/* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ +/* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ +/* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ +/* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ +/* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ +/* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ +/* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ +sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 172f87dd6eb9190e70a52b34ceb9f8011 */\ +/* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ +/* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ +/* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ +/* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ +/* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ +/* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ +/* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ +/* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ +/* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ +/* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ +/* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ +/* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ +/* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ +/* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ +/* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ +/* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ +/* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ +/* 
sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ +/* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ +/* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ +/* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ +/* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ +/* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ +sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ +/* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ +/* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ +/* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ +/* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ +/* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ +/* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ +/* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ +/* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ +/* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ +/* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ +/* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ +/* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ +/* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ +/* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ +/* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ +/* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ +/* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ +/* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ +/* sqr(t[0], t[0]); */ /* 210: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ +/* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ +/* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ +/* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ +/* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ +/* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ +/* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ +/* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ +/* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ +/* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ +/* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ +/* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ +/* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ +/* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ +/* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ +/* sqr(t[0], t[0]); */ /* 227: 
2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ +/* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ +sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ +/* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ +/* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ +/* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ +/* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ +/* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ +/* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ +/* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ +/* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ +/* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ +/* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ +/* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ +/* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ +/* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ +/* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ +/* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ +/* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ +/* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ +/* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ +/* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ +/* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ +/* sqr(t[0], t[0]); */ /* 252: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ +/* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ +/* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ +/* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ +/* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ +/* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ +/* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ +/* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ +/* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ +/* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ +/* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ +/* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ +/* sqr(t[0], 
t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ +/* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ +/* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ +/* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ +/* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ +/* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ +/* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ +/* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ +/* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ +/* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ +/* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ +/* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ +/* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ +/* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ +/* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ +/* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ +/* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ +/* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ +/* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ +/* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ +/* sqr(t[0], t[0]); */ /* 290: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ +/* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ +/* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ +/* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ +/* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ +/* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ +/* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ +/* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ +/* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ +/* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ +/* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ +/* sqr(t[0], t[0]); */ /* 303: 
172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ +/* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ +sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ +} while(0) diff --git a/src/blst/src/pentaroot.c b/src/blst/src/pentaroot.c new file mode 100644 index 0000000000..71f334df50 --- /dev/null +++ b/src/blst/src/pentaroot.c @@ -0,0 +1,76 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +static inline void sqr_fr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +#ifdef __OPTIMIZE_SIZE__ +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ + static const byte pow[] = { + TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), + TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) + }; + size_t pow_bits = 254; + vec256 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_fr(ret, ret); + if (is_bit_set(pow, pow_bits)) + mul_fr(ret, ret, inp); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +} +#else +# if 0 +/* + * "255"-bit variant omits full reductions at the ends of squarings, + * not implemented yet[?]. + */ +static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b); } +# else +static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ + do { + sqr_fr(out, a); + a = out; + } while (--count); + mul_fr(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fr(ret,a) +# define mul(ret,a,b) mul_fr(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fr(ret,a,n,b) + +# include "pentaroot-addchain.h" +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ PENTAROOT_MOD_BLS12_381_r(out, inp, vec256); } +# undef PENTAROOT_MOD_BLS12_381_r + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +void blst_fr_pentapow(vec256 out, const vec256 inp) +{ + vec256 tmp; + + sqr_fr(tmp, inp); + sqr_fr(tmp, tmp); + mul_fr(out, tmp, inp); +} diff --git a/src/blst/src/point.h b/src/blst/src/point.h new file mode 100644 index 0000000000..0aa7379671 --- /dev/null +++ b/src/blst/src/point.h @@ -0,0 +1,62 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" +#include "bytes.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/src/blst/src/rb_tree.c b/src/blst/src/rb_tree.c new file mode 100644 index 0000000000..207becdad1 --- /dev/null +++ b/src/blst/src/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +/* + * Red-black tree tailored for uniqueness test. Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. 
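+ * A minimal usage sketch, using the helpers defined at the bottom of this
+ * file and assuming |n| messages msg[i] of length len[i] (the tree stores
+ * pointers, so the message buffers must outlive it):
+ *
+ *   struct rb_tree *tree = malloc(blst_uniq_sizeof(n));
+ *   blst_uniq_init(tree);
+ *   for (i = 0; i < n; i++)
+ *       if (!blst_uniq_test(tree, msg[i], len[i]))
+ *           break;                 -- duplicate message detected
+ *   free(tree);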
+ */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/src/blst/src/recip-addchain.h b/src/blst/src/recip-addchain.h new file mode 100644 index 0000000000..e4e436a3f0 --- /dev/null +++ b/src/blst/src/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base. 
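+ * For prime p and a != 0, a^(p-2) == a^(-1) mod p by Fermat's little
+ * theorem, so this fixed chain yields a constant-time field inversion.
+ * The macro is presumably consumed the same way as the pentaroot chain
+ * earlier in this patch: the including file defines sqr(), mul() and
+ * sqr_n_mul(out, a, n, b) == a^(2^n) * b, then invokes
+ * RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) for the 384-bit field type;
+ * the commented-out sqr() lines only track the running exponent. (The
+ * consumer itself is not part of this hunk; the wiring described here is
+ * by analogy with pentaroot.c.)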
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/src/blst/src/recip.c b/src/blst/src/recip.c new file mode 100644 index 0000000000..e0c700635e --- /dev/null +++ b/src/blst/src/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. 
+ */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ 
reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/src/blst/src/server.c b/src/blst/src/server.c new file mode 100644 index 0000000000..810f4ab417 --- /dev/null +++ b/src/blst/src/server.c @@ -0,0 +1,32 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" +#ifndef __BLST_CGO__ +# include "rb_tree.c" +#endif +#ifdef BLST_FR_PENTAROOT +# include "pentaroot.c" +#endif +#ifndef __BLST_NO_CPUID__ +# include "cpuid.c" +#endif diff --git a/src/blst/src/sha256.h b/src/blst/src/sha256.h new file mode 100644 index 0000000000..77ddb6dc84 --- /dev/null +++ b/src/blst/src/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. 
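+ *
+ * Typical streaming usage of the helpers defined below (illustrative
+ * only; |msg| and |msg_len| stand for the caller's input buffer, and
+ * sha256_update may be called any number of times, with partial
+ * blocks buffered in the context):
+ *
+ *   SHA256_CTX ctx;
+ *   unsigned char md[32];
+ *
+ *   sha256_init(&ctx);
+ *   sha256_update(&ctx, msg, msg_len);
+ *   sha256_final(md, &ctx);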
+ */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/src/blst/src/sqrt-addchain.h b/src/blst/src/sqrt-addchain.h new file mode 100644 index 0000000000..4e7f0beb6b --- /dev/null +++ b/src/blst/src/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). 
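+ *
+ * For reference: BLS12_381_P == 3 (mod 4), so for a non-zero square x
+ * one has sqrt(x) = x^((P+1)/4), and therefore
+ * ret = x^((P-3)/4) = x^((P+1)/4)/x = 1/sqrt(x).
+ * Hence x*ret recovers sqrt(x), while x*ret^2 = x^((P-1)/2) equals 1
+ * exactly when x is a non-zero square (Euler's criterion).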
+ * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/src/blst/src/sqrt.c b/src/blst/src/sqrt.c new file mode 100644 index 0000000000..cf149fd112 --- /dev/null +++ b/src/blst/src/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/src/blst/src/vect.c b/src/blst/src/vect.c new file mode 100644 index 0000000000..1834a48fad --- /dev/null +++ b/src/blst/src/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/src/blst/src/vect.h b/src/blst/src/vect.h new file mode 100644 index 0000000000..19640b117f --- /dev/null +++ b/src/blst/src/vect.h @@ -0,0 +1,433 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include <stddef.h> + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Boolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g.
-march=broadwell */ && !defined(__BLST_PORTABLE__)\ + && !defined(__BLST_NO_ASM__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, 
const vec384 mod, + const vec384 modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define launder(var) __asm__ __volatile__("" : "+r"(var)) +#else +# define launder(var) +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ + bool_t ret = (v[i/8] >> (i%8)) & 1; + launder(ret); + return ret; +} + +static inline bool_t byte_is_zero(unsigned char c) +{ + limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask; + size_t i; + + launder(cbit); + mask = (limb_t)0 - cbit; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? 
a : b */ +void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ + launder(sel_a); +#ifndef __BLST_NO_ASM__ + if (num == 32) vec_select_32(ret, a, b, sel_a); + else if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi; + volatile limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ + limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_zero_16x(const void *a, size_t num); + if ((num & 15) == 0) + return vec_is_zero_16x(a, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_equal_16x(const void *a, const void *b, size_t num); + if ((num & 15) == 0) + return vec_is_equal_16x(a, b, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" : : "r"(ret) : "memory"); +#endif +} + +static inline void vec_czero(void *ret, size_t num, bool_t cbit) +{ + limb_t *rp = (limb_t *)ret; + size_t i; + limb_t mask; + + launder(cbit); + mask = (limb_t)0 - (cbit^1); + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] &= mask; +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. 
+ * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +#elif defined(_MSC_VER) +# pragma warning(disable: 4127 4189) +#endif + +#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0 +# include <stdlib.h> +#endif + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include <alloca.h> +#elif defined(_WIN32) +# include <malloc.h> +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */