This is an automated email from the git hooks/post-receive script.
dgoulet pushed a commit to branch main in repository tor.
commit 95445f49f1be510e16b4cfc7d5312f69bf46d9da Author: David Goulet dgoulet@torproject.org AuthorDate: Mon Jun 27 14:46:21 2022 -0400
ext: Add Equi-X library
Signed-off-by: David Goulet dgoulet@torproject.org --- .gitignore | 1 + Makefile.am | 10 +- configure.ac | 2 + src/ext/equix/.gitmodules | 3 + src/ext/equix/CMakeLists.txt | 82 ++++ src/ext/equix/LICENSE | 165 +++++++ src/ext/equix/README.md | 77 +++ src/ext/equix/build.sh | 7 + src/ext/equix/devlog.md | 178 +++++++ src/ext/equix/hashx/.gitignore | 9 + src/ext/equix/hashx/CMakeLists.txt | 99 ++++ src/ext/equix/hashx/LICENSE | 165 +++++++ src/ext/equix/hashx/README.md | 135 ++++++ src/ext/equix/hashx/include/hashx.h | 140 ++++++ src/ext/equix/hashx/src/bench.c | 135 ++++++ src/ext/equix/hashx/src/blake2.c | 467 +++++++++++++++++++ src/ext/equix/hashx/src/blake2.h | 73 +++ src/ext/equix/hashx/src/compiler.c | 18 + src/ext/equix/hashx/src/compiler.h | 41 ++ src/ext/equix/hashx/src/compiler_a64.c | 154 ++++++ src/ext/equix/hashx/src/compiler_x86.c | 151 ++++++ src/ext/equix/hashx/src/context.c | 81 ++++ src/ext/equix/hashx/src/context.h | 45 ++ src/ext/equix/hashx/src/force_inline.h | 9 + src/ext/equix/hashx/src/hashx.c | 134 ++++++ src/ext/equix/hashx/src/hashx_endian.h | 103 +++++ src/ext/equix/hashx/src/hashx_thread.c | 27 ++ src/ext/equix/hashx/src/hashx_thread.h | 27 ++ src/ext/equix/hashx/src/hashx_time.c | 35 ++ src/ext/equix/hashx/src/hashx_time.h | 9 + src/ext/equix/hashx/src/instruction.h | 31 ++ src/ext/equix/hashx/src/program.c | 771 +++++++++++++++++++++++++++++++ src/ext/equix/hashx/src/program.h | 48 ++ src/ext/equix/hashx/src/program_exec.c | 152 ++++++ src/ext/equix/hashx/src/siphash.c | 66 +++ src/ext/equix/hashx/src/siphash.h | 35 ++ src/ext/equix/hashx/src/siphash_rng.c | 31 ++ src/ext/equix/hashx/src/siphash_rng.h | 30 ++ src/ext/equix/hashx/src/test_utils.h | 60 +++ src/ext/equix/hashx/src/tests.c | 219 +++++++++ src/ext/equix/hashx/src/unreachable.h | 9 + src/ext/equix/hashx/src/virtual_memory.c | 126 +++++ src/ext/equix/hashx/src/virtual_memory.h | 19 + src/ext/equix/include/equix.h | 145 ++++++ src/ext/equix/src/bench.c | 175 +++++++ src/ext/equix/src/context.c | 57 +++ src/ext/equix/src/context.h | 18 + src/ext/equix/src/equix.c | 96 ++++ src/ext/equix/src/solver.c | 270 +++++++++++ src/ext/equix/src/solver.h | 30 ++ src/ext/equix/src/solver_heap.h | 108 +++++ src/ext/equix/src/tests.c | 124 +++++ 52 files changed, 5200 insertions(+), 2 deletions(-)
diff --git a/.gitignore b/.gitignore index 94988ed982..737ab72bf5 100644 --- a/.gitignore +++ b/.gitignore @@ -152,6 +152,7 @@ core.* # /src/ext/ /src/ext/ed25519/ref10/libed25519_ref10.lib /src/ext/ed25519/donna/libed25519_donna.lib +/src/ext/equix/build /src/ext/keccak-tiny/libkeccak-tiny.lib
# /src/app diff --git a/Makefile.am b/Makefile.am index c7b8b16d35..28a1c8e57d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -124,6 +124,10 @@ TOR_CRYPTO_TESTING_LIBS = \ $(LIBDONNA) endif
+EQUIX_LIBS = \ + src/ext/equix/build/libequix.a \ + src/ext/equix/build/hashx/libhashx.a + # All static libraries used to link tor. TOR_INTERNAL_LIBS = \ src/core/libtor-app.a \ @@ -132,7 +136,8 @@ TOR_INTERNAL_LIBS = \ $(TOR_CRYPTO_LIBS) \ $(TOR_UTIL_LIBS) \ src/trunnel/libor-trunnel.a \ - src/lib/libtor-trace.a + src/lib/libtor-trace.a \ + $(EQUIX_LIBS)
libtor.a: $(TOR_INTERNAL_LIBS) $(AM_V_AR) export AR="$(AR)"; \ @@ -152,7 +157,8 @@ TOR_INTERNAL_TESTING_LIBS = \ $(TOR_CRYPTO_TESTING_LIBS) \ $(TOR_UTIL_TESTING_LIBS) \ src/trunnel/libor-trunnel-testing.a \ - src/lib/libtor-trace.a + src/lib/libtor-trace.a \ + $(EQUIX_LIBS)
src/test/libtor-testing.a: $(TOR_INTERNAL_TESTING_LIBS) $(AM_V_AR) export AR="$(AR)"; \ diff --git a/configure.ac b/configure.ac index f2e5f72e97..c0b531c81d 100644 --- a/configure.ac +++ b/configure.ac @@ -31,6 +31,8 @@ tor_incr_n_warnings() { tor_ac_n_warnings=`expr $tor_ac_n_warnings + 1` }
+AC_CONFIG_COMMANDS([equix], [./src/ext/equix/build.sh]) + m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_CONFIG_HEADERS([orconfig.h])
diff --git a/src/ext/equix/.gitmodules b/src/ext/equix/.gitmodules new file mode 100644 index 0000000000..6f1277ee9f --- /dev/null +++ b/src/ext/equix/.gitmodules @@ -0,0 +1,3 @@ +[submodule "hashx"] + path = hashx + url = https://github.com/tevador/hashx diff --git a/src/ext/equix/CMakeLists.txt b/src/ext/equix/CMakeLists.txt new file mode 100644 index 0000000000..032989d804 --- /dev/null +++ b/src/ext/equix/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2020 tevador tevador@gmail.com +# See LICENSE for licensing information + +cmake_minimum_required(VERSION 2.8.8) + +set(EQUIX_VERSION 1) +set(EQUIX_VERSION_MINOR 0) +set(EQUIX_VERSION_PATCH 0) +set(EQUIX_VERSION_STR "${EQUIX_VERSION}.${EQUIX_VERSION_MINOR}.${EQUIX_VERSION_PATCH}") + +project(equix) + +add_definitions(-DHASHX_SIZE=8) + +add_subdirectory("hashx") + +set(equix_sources +src/context.c +src/equix.c +src/solver.c) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + message(STATUS "Setting default build type: ${CMAKE_BUILD_TYPE}") +endif() + +add_library(equix SHARED ${equix_sources}) +set_property(TARGET equix PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET equix PROPERTY PUBLIC_HEADER include/equix.h) +include_directories(equix + include/ + hashx/include/ + hashx/src/) +target_compile_definitions(equix PRIVATE HASHX_STATIC) +target_compile_definitions(equix PRIVATE EQUIX_SHARED) +target_link_libraries(equix + PRIVATE hashx_static) +set_target_properties(equix PROPERTIES VERSION ${EQUIX_VERSION_STR} + SOVERSION ${EQUIX_VERSION}) + +add_library(equix_static STATIC ${equix_sources}) +set_property(TARGET equix_static PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(equix_static PROPERTIES OUTPUT_NAME equix) +target_compile_definitions(equix_static PRIVATE HASHX_STATIC) +target_compile_definitions(equix_static PRIVATE EQUIX_STATIC) +include_directories(equix_static + include/ + hashx/include/ + hashx/src/) +target_link_libraries(equix_static + PRIVATE hashx_static) + +include(GNUInstallDirs) +install(TARGETS equix equix_static + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +add_executable(equix-tests + src/tests.c) +include_directories(equix-tests + include/) +target_compile_definitions(equix-tests PRIVATE EQUIX_STATIC) +target_link_libraries(equix-tests + PRIVATE equix_static) + +if(NOT Threads_FOUND AND UNIX AND NOT APPLE) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads) +endif() + +add_executable(equix-bench + src/bench.c + hashx/src/hashx_thread.c + hashx/src/hashx_time.c) +include_directories(equix-bench + include/ + hashx/src/) +target_compile_definitions(equix-bench PRIVATE EQUIX_STATIC) +target_link_libraries(equix-bench + PRIVATE equix_static + PRIVATE ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/ext/equix/LICENSE b/src/ext/equix/LICENSE new file mode 100644 index 0000000000..0a041280bd --- /dev/null +++ b/src/ext/equix/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. https://fsf.org/ + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/src/ext/equix/README.md b/src/ext/equix/README.md new file mode 100644 index 0000000000..d3e8c93125 --- /dev/null +++ b/src/ext/equix/README.md @@ -0,0 +1,77 @@ +# Equi-X + +Equi-X is a CPU-friendly [client puzzle](https://en.wikipedia.org/wiki/Client_Puzzle_Protocol) +with fast verification and small solution size (16 bytes). It is based on Equihash(60,3) with +two major changes: + +1. Blake2b hash function is replaced with [HashX](https://github.com/tevador/hashx). +2. XOR is replaced with modular addition. + +An Equi-X solution for nonce `X` is a set of eight 16-bit indices <code>i<sub>0</sub>, ..., i<sub>7</sub></code> such that: + +<code>H<sub>X</sub>(i<sub>0</sub>) + H<sub>X</sub>(i<sub>1</sub>) + H<sub>X</sub>(i<sub>2</sub>) + H<sub>X</sub>(i<sub>3</sub>) + H<sub>X</sub>(i<sub>4</sub>) + H<sub>X</sub>(i<sub>5</sub>) + H<sub>X</sub>(i<sub>6</sub>) + H<sub>X</sub>(i<sub>7</sub>) = 0 (mod 2<sup>60</sup>)</code> + +where <code>H<sub>X</sub></code> is a HashX function generated for nonce `X`. Equi-X is therefore a variant of the [subset sum problem](https://en.wikipedia.org/wiki/Subset_sum_problem). Each nonce value provides 2 solutions on average. + +Equi-X also has additional requirements that prove that the solution was found using the Wagner's algorithm. See the [Equihash paper](https://eprint.iacr.org/2015/946.pdf) for details. + +### Example solution + +``` +H(0x6c31) = 0xcfa5375c0a7f5d7 \ + (+) = 0xa73d9777f110000 \ +H(0x8803) = 0xd798601be690a29 / | + (+) = 0xefdaadb00000000 \ +H(0x80c2) = 0xcabd8974bbee8d5 \ | | + (+) = 0x489d16380ef0000 / | +H(0xa1db) = 0x7ddf8cc3530172b / | + (+) = 0 +H(0x6592) = 0x348a96fd685dcba \ | + (+) = 0x357120e8ffb8000 \ | +H(0x76b7) = 0x00e689eb975a346 / | | + (+) = 0x102552500000000 / +H(0x74a6) = 0xacccc4ad2d06bcd \ | + (+) = 0xdab431670048000 / +H(0xe259) = 0x2de76cb9d341433 / +``` + +## Performance + +|Algorithm |n |k |memory |solution size|verification <sup>1</sup>|CPU perf. <sup>2</sup>|GPU perf. <sup>3</sup>| +|----------|---|---|-------|-------------|------------|-----------|----------| +|**Equi-X**|60 |3 |1.8 MiB|16 bytes |~50 μs |2400 Sol/s| ? | +|Zcash |200|9 |144 MiB|1344 bytes |>150 μs |30 Sol/s |~400 Sol/s <sup>4</sup>| +|BTG |144|5 |2.5 GiB|100 bytes |~10 μs |1 Sol/s |~45 Sol/s <sup>5</sup>| + +1. Using AMD Ryzen 1700 with 1 thread. +1. Using AMD Ryzen 1700 with 16 threads. +1. Using NVIDIA GTX 1660 Ti. +1. Estimated from http://www.zcashbenchmarks.info/ (GTX 1070) +1. Estimated from https://miniz.ch/features/ + +## Build + +``` +git clone --recursive https://github.com/tevador/equix.git +cd equix +mkdir build +cd build +cmake .. +make +``` +``` +./equix-tests +./equix-bench --help +``` + +## Design notes + +See [devlog.md](devlog.md) + +## Donations + +You can support the development of Equi-X by sending XMR to this address: + +``` +85GkKXSD8EQ22EMC2ZKF64S6L6Lcm8Gr23VbAKB1zg6FUW81sUEmrvvRPoM3GCpUZSC9azCLdeityW2N3CVsV4CAC3p1evV +``` \ No newline at end of file diff --git a/src/ext/equix/build.sh b/src/ext/equix/build.sh new file mode 100755 index 0000000000..753e3a138e --- /dev/null +++ b/src/ext/equix/build.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd ./src/ext/equix +mkdir build +cd build +cmake .. +make diff --git a/src/ext/equix/devlog.md b/src/ext/equix/devlog.md new file mode 100644 index 0000000000..fbd9da346e --- /dev/null +++ b/src/ext/equix/devlog.md @@ -0,0 +1,178 @@ +# DoS protection for onion services: from RandomX to Equi-X + +In early May 2020, I was contacted by two Tor developers to help them with their denial-of-service (DoS) mitigation proposal. Onion services have been plagued by DoS for many years and there are no easy solutions due to the anonymous nature of the Tor network. The vulnerability stems from the fact that the service has to perform a series of expensive operations for each connecting client, so an attacker can simply flood the service with connection requests, which are comparatively cheap [...] + +## Proof of work + +One of the strategies that can be used against such attacks are client puzzles, which require the client to prove that they performed certain amount of computational work before the request is accepted. This is not a new idea – using a client puzzle to combat email spam was first proposed in 1997 [[1](https://cypherpunks.venona.com/date/1997/03/msg00774.html)] and there have been proposals to add a client puzzle to the TLS protocol that we all use every day to browse the web [[2](https:/ [...] + +The goal of the current Tor proposal [[3](https://lists.torproject.org/pipermail/tor-dev/2020-April/014215.html)] is to integrate some kind of client puzzle as part of the protocol used to connect to onion services. The client would download the hidden service descriptor, find that the service is protected by a client puzzle, calculate the solution and send it as part of the connection request. The service would give the request a priority value based on the "difficulty" of the puzzle solution. + +The main issue is to find a suitable puzzle. There are three main requirements that the algorithm must meet: + +1. The solution proof must be smaller than about 200 bytes. +2. Solution verification must be fast. +3. GPUs and FPGAs should not provide a large advantage for solving the puzzle. + +The first requirement is due to the limited space in the message that the client sends to the hidden service to negotiate a connection. This doesn't pose a big problem in practice. + +The second requirement is needed to prevent the client puzzle from becoming a DoS vector on its own. The service must be able to quickly verify if a proof provided by a client is valid or not, otherwise the attacker could just flood the service with invalid proofs. + +The third requirement is the most important one. Users of the Tor browser are equipped with an ordinary general-purpose CPU and an attacker must not be able to use similarly priced hardware that offers a much higher performance for solving the puzzle – otherwise it would be easy for an attacker to simulate many legitimate clients connecting to the service. + +It should be noted that the puzzle does not need to be resistant to speed-up by specialized fixed-function hardware (ASIC). This is called "ASIC resistance" and there are two main reasons why it's not strictly required in this case: + +1. The cost to produce a specialized chip for a given algorithm exceeds USD $1 million for the 28 nm process [[4](https://www.electronicdesign.com/technologies/embedded-revolution/article/21...)]. This is in contrast with the current state when bringing down an onion service requires very little resources [[5](https://www.reddit.com/r/darknet/comments/b3qvbq/this_ddos_is_massive_its_go...)]. +2. If an ASIC for the client puzzle is developed, the algorithm can be quickly changed with a simple patch, rendering the attacker's investment useless. This is in contrast with cryptocurrencies, which have complex consensus protocols and can take months or years to change their proof of work algorithm [[6](https://ethereum-magicians.org/t/eip-progpow-a-programmatic-proof-of-work/27...)]. + +Therefore the client puzzle algorithm must be CPU-friendly and minimize the advantage of programmable hardware such as GPUs and FPGAs. + +## RandomX + +In 2019, we developed a new CPU-friendly proof of work algorithm called RandomX to be used by Monero [[7](https://web.getmonero.org/resources/moneropedia/randomx.html)]. It has been audited and field tested, which makes it a good candidate to be considered for a Tor client puzzle. + +However, during the development of RandomX, we focused mainly on the "CPU-friendliness" of the algorithm to minimize the advantage of specialized hardware. Other aspects such as verification performance and memory requirements were secondary and only had upper limits. The result is that RandomX has quite high solution verification demands – over 2 GiB of memory and around 2 milliseconds to verify a hash. Alternatively, it is possible to use only 256 MiB of memory, but then verification t [...] + +So I began slimming down RandomX to make it less demanding for verification. I halved the memory requirements and reduced the number of rounds from 8 to 2, which is the minimum that can be considered safe from optimizations. The result is RandomX-Tor [[8](https://github.com/tevador/RandomX/tree/tor-pow)], which takes around 0.5 ms to verify with slightly over 1 GiB of memory. GPUs perform very poorly when running RandomX-Tor – similar to full RandomX. + +However, I see two main problems with RandomX-Tor: + +1. 2000 verifications per second still may not be fast enough to prevent flooding attacks on the service, especially since each connection request to an onion service requires very little network bandwidth (512 bytes). +1. The 1 GB memory requirement is too high. Onion services would have to allocate over 2 GB just to verify puzzles because two different puzzles may be active at a given moment to allow for network synchronization delays. + +So I decided to continue my quest and look for a better puzzle. + +## Alternatives + +I briefly tested Argon2 [[9](https://github.com/p-h-c/phc-winner-argon2)] and Yespower [[10](https://www.openwall.com/yespower/)], which are password hashing algorithms claiming to be "GPU-resistant". However, both have slower verification performance than RandomX-Tor and both run faster on GPUs, so they are not suitable. See Appendix for details. + +Then I researched Equihash, which is an asymmetric client puzzle offering very fast solution verification [[11](https://eprint.iacr.org/2015/946.pdf)]. The main problem with Equihash is that GPUs perform up to 100x faster than CPUs, so it is unusable as a puzzle for CPU-based clients. + +## HashX + +I was ready to give up when I remembered SuperscalarHash, which is a part of RandomX that's used only to initialize the DAG [[12](https://github.com/tevador/RandomX/blob/master/doc/specs.md#6-superscalarhas...)]. It could be considered a lightweight version of RandomX with only integer operations and no memory accesses. I began refactoring the code with the goal of creating a fast GPU-resistant hash function. The main idea is that each instance of the hash function would use a completely d [...] + +I named the new algorithm "HashX" and in the course of the next week, I made several other changes compared to the old SuperscalarHash: + +* improved code generator that can fully saturate the simulated 3-way superscalar pipeline +* removed reciprocal multiplication, which was sometimes causing front-end stalls +* implemented a more optimized JIT compiler for x86-64 +* added input-dependent branches to further hinder GPU performance + +I also made a few tweaks to the hash finalizer to pass SMHasher [[13](https://github.com/rurban/smhasher)]. The result is a CPU-friendly algorithm suitable to be used as a partial hash inversion client puzzle. + +## Equi-X + +I still felt a bit uneasy about HashX using no memory. This means that logic-only FPGAs could be a viable option to run HashX. Additionally, introducing the right amount of memory-hardness could affect GPUs more than CPUs. I remembered Equihash, which is a memory-hard algorithm, but still allows for fast solution verification. I could simply replace the Blake2b hash in the default implementation with HashX. Each solution attempt would then use a different randomly generated HashX instance. + +Equihash has two main parameters that determine the difficulty of the generalized birthday problem: `n` is the hash size in bits and <code>2<sup>k</sup></code> is the number of hashes that need to XOR to zero. The memory hardness is proportional to <code>2<sup>n/(k+1)</sup></code>. To get fast verification and a small proof size, `k` should be as low as possible. I chose `k=3`, which means the goal would be to find preimages to 8 hashes that XOR to zero. + +I first tried with `n=96`, which would require about 2 GB of memory for an efficient solver. However, the problem is that each HashX instance would be used for <code>2<sup>25</sup></code> hashes, which could make it feasible for GPUs to compile an optimized kernel for each new solution attempt. Additionally, GPUs are much better at the sorting phase of Equihash than CPUs due to their higher memory bandwidth. + +After some more benchmarking, I estimated that <code>2<sup>16</sup></code> hashes per solution attempt would be the sweet spot for CPUs because one solution attempt would take roughly 6-8 ms, the HashX compilation overhead would still be under 1% and the expected memory usage would be under 2 MiB. This can fit entirely into the CPU cache, which is the only case when CPUs can compete with GPUs in memory bandwidth. The optimal Equihash parameters are then `n=60` and `k=3`. I call this algo [...] + +However, when testing Equi-X, I discovered a fatal flaw: the generalized birthday problem, as used by Equihash, requires a collision resistant hash function. While HashX is preimage-resistant, it is not collision resistant. Some small fraction of HashX instances produce a significant number of hash collisions. With a 2<sup>k</sup>-XOR, finding 2<sup>k-1</sup> pairs of colliding hashes automatically produces a valid solution because equal values XOR to zero. But it gets worse. If the numb [...] + +Some modifications of HashX reduced the number of collisions, but I could not completely eliminate them, so I needed another solution. I began researching the generalized birthday problem and I found this paragraph in the seminal paper by David Wagner [[14](https://people.eecs.berkeley.edu/~daw/papers/genbday.html)]: + +![genbday](https://i.imgur.com/I7TAfTQ.png) + +The advantage of 2<sup>k</sup>-SUM compared to 2<sup>k</sup>-XOR is that identical hashes no longer produce valid solutions (except for `0x000000000000000` and `0x800000000000000`, but these are very rare). There are two additional advantages: + +1. While XOR and addition have the same performance in CPUs, XOR is much faster in custom hardware. This means, for example, that an FPGA-based solver will have to use slightly more resources to calculate the modular additions. +1. With 2<sup>k</sup>-SUM, we can eliminate checking for duplicate indices, which simplifies both the solver and the verifier [[15](https://github.com/tevador/equix/issues/1)]. + +With this modification, an Equi-X solution for nonce `X` is a set of eight 16-bit indices <code>i<sub>0</sub>, ..., i<sub>7</sub></code> such that: + +<code>H<sub>X</sub>(i<sub>0</sub>) + H<sub>X</sub>(i<sub>1</sub>) + H<sub>X</sub>(i<sub>2</sub>) + H<sub>X</sub>(i<sub>3</sub>) + H<sub>X</sub>(i<sub>4</sub>) + H<sub>X</sub>(i<sub>5</sub>) + H<sub>X</sub>(i<sub>6</sub>) + H<sub>X</sub>(i<sub>7</sub>) = 0 (mod 2<sup>60</sup>)</code> + +where <code>H<sub>X</sub></code> is a HashX function generated for nonce `X`. + +Example of a solution: + +``` +H(0x6c31) = 0xcfa5375c0a7f5d7 \ + (+) = 0xa73d9777f110000 \ +H(0x8803) = 0xd798601be690a29 / | + (+) = 0xefdaadb00000000 \ +H(0x80c2) = 0xcabd8974bbee8d5 \ | | + (+) = 0x489d16380ef0000 / | +H(0xa1db) = 0x7ddf8cc3530172b / | + (+) = 0 +H(0x6592) = 0x348a96fd685dcba \ | + (+) = 0x357120e8ffb8000 \ | +H(0x76b7) = 0x00e689eb975a346 / | | + (+) = 0x102552500000000 / +H(0x74a6) = 0xacccc4ad2d06bcd \ | + (+) = 0xdab431670048000 / +H(0xe259) = 0x2de76cb9d341433 / +``` + +The following table compares Equi-X with the most common variants of Equihash: + +|Algorithm |n |k |memory|solution size|verification <sup>1</sup>|CPU perf. <sup>2</sup>|GPU perf. <sup>3</sup>| +|----------|---|---|-------|-------------|------------|-----------|----------| +|**Equi-X**|60 |3 |1.8 MiB|16 bytes |~50 μs |2400 Sol/s| ? | +|Zcash |200|9 |144 MiB|1344 bytes |>150 μs |30 Sol/s |~400 Sol/s <sup>4</sup>| +|BTG |144|5 |2.5 GiB|100 bytes |~10 μs |1 Sol/s |~45 Sol/s <sup>5</sup>| + +1. Using AMD Ryzen 1700 with 1 thread. +1. Using AMD Ryzen 1700 with 16 threads. +1. Using NVIDIA GTX 1660 Ti. +1. Estimated from http://www.zcashbenchmarks.info/ (GTX 1070) +1. Estimated from https://miniz.ch/features/ + +## References + +[1] https://cypherpunks.venona.com/date/1997/03/msg00774.html + +[2] https://tools.ietf.org/html/draft-nygren-tls-client-puzzles-02 + +[3] https://lists.torproject.org/pipermail/tor-dev/2020-April/014215.html + +[4] https://www.electronicdesign.com/technologies/embedded-revolution/article/21... + +[5] https://www.reddit.com/r/darknet/comments/b3qvbq/this_ddos_is_massive_its_go... + +[6] https://ethereum-magicians.org/t/eip-progpow-a-programmatic-proof-of-work/27... + +[7] https://web.getmonero.org/resources/moneropedia/randomx.html + +[8] https://github.com/tevador/RandomX/tree/tor-pow + +[9] https://github.com/p-h-c/phc-winner-argon2 + +[10] https://www.openwall.com/yespower/ + +[11] https://eprint.iacr.org/2015/946.pdf + +[12] https://github.com/tevador/RandomX/blob/master/doc/specs.md#6-superscalarhas... + +[13] https://github.com/rurban/smhasher + +[14] https://people.eecs.berkeley.edu/~daw/papers/genbday.html + +[15] https://github.com/tevador/equix/issues/1 + +## Appendix: Comparison of GPU-resistant client puzzles + +|Algorithm|GPU speed <sup>6</sup>|Solver memory|Verifier memory|Verif./sec. <sup>11</sup>|Solution size <sup>12</sup>| +|------------|---------|-------------|---------------|---------------|-------------| +|**Equi-X** <sup>1</sup>|<50% <sup>7</sup>|1.81 MiB|< 20 KiB|19600| 32 bytes| +|**HashX** <sup>2</sup>|<50% <sup>7</sup>|< 20 KiB|< 20 KiB|19700|16 bytes| +|RandomX-Tor <sup>3</sup>|10% <sup>8</sup>| 1041 MiB|1041 MiB|2200|16 bytes| +|Argon2 <sup>4</sup>|~300% <sup>9</sup>|2.00 MiB|2.00 MiB|1020|16 bytes| +|Yespower <sup>5</sup>|~200% <sup>10</sup>|2.00 MiB|2.00 MiB|780|16 bytes| + +1. https://github.com/tevador/equix +1. https://github.com/tevador/hashx +1. https://github.com/tevador/RandomX/tree/tor-pow +1. https://github.com/p-h-c/phc-winner-argon2 +1. https://www.openwall.com/yespower/ +1. Performance of NVIDIA GTX 1660 Ti compared to AMD Ryzen 1700 +1. No GPU implementation exists; upper bound based on RandomX performance +1. Benchmarked using https://github.com/SChernykh/RandomX_CUDA +1. Estimated from https://github.com/WebDollar/argon2-gpu +1. Estimated from https://nlpool.nl/bench?algo=yescrypt +1. Benchmarked on AMD Ryzen 1700 @ 3.3 GHz using 1 thread +1. Including a 16-byte nonce. diff --git a/src/ext/equix/hashx/.gitignore b/src/ext/equix/hashx/.gitignore new file mode 100644 index 0000000000..ec94c2c699 --- /dev/null +++ b/src/ext/equix/hashx/.gitignore @@ -0,0 +1,9 @@ +bin/ +obj/ +*.user +*.suo +.vs +x64/ +Release/ +Debug/ +build/ diff --git a/src/ext/equix/hashx/CMakeLists.txt b/src/ext/equix/hashx/CMakeLists.txt new file mode 100644 index 0000000000..5eb694d5dd --- /dev/null +++ b/src/ext/equix/hashx/CMakeLists.txt @@ -0,0 +1,99 @@ +# Copyright (c) 2020 tevador tevador@gmail.com +# See LICENSE for licensing information + +cmake_minimum_required(VERSION 2.8.8) + +set(HASHX_VERSION 1) +set(HASHX_VERSION_MINOR 0) +set(HASHX_VERSION_PATCH 0) +set(HASHX_VERSION_STR "${HASHX_VERSION}.${HASHX_VERSION_MINOR}.${HASHX_VERSION_PATCH}") + +project(hashx) + +set(hashx_sources +src/blake2.c +src/compiler.c +src/compiler_a64.c +src/compiler_x86.c +src/context.c +src/hashx.c +src/program.c +src/program_exec.c +src/siphash.c +src/siphash_rng.c +src/virtual_memory.c) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + message(STATUS "Setting default build type: ${CMAKE_BUILD_TYPE}") +endif() + +option(HASHX_BLOCK_MODE "Hash function for block mode" OFF) + +if(HASHX_BLOCK_MODE) + add_definitions(-DHASHX_BLOCK_MODE) +endif() + +set(HASHX_SIZE CACHE STRING "Hash function output size in bytes") + +if(HASHX_SIZE) + if(HASHX_SIZE GREATER 32) + message(SEND_ERROR "The maximum hash size is 32 bytes") + else() + add_definitions(-DHASHX_SIZE=${HASHX_SIZE}) + endif() +endif() + +set(HASHX_SALT CACHE STRING "Implementation-specific salt value") + +if(HASHX_SALT) + string(LENGTH ${HASHX_SALT} HASHX_SALT_LENGTH) + if(HASHX_SALT_LENGTH GREATER 15) + message(SEND_ERROR "The maximum salt length is 15 characters") + else() + add_definitions(-DHASHX_SALT=${HASHX_SALT}) + endif() +endif() + +add_library(hashx SHARED ${hashx_sources}) +set_property(TARGET hashx PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET hashx PROPERTY PUBLIC_HEADER include/hashx.h) +include_directories(hashx + include/) +target_compile_definitions(hashx PRIVATE HASHX_SHARED) +set_target_properties(hashx PROPERTIES VERSION ${HASHX_VERSION_STR} + SOVERSION ${HASHX_VERSION}) + +add_library(hashx_static STATIC ${hashx_sources}) +set_property(TARGET hashx_static PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hashx_static PROPERTIES OUTPUT_NAME hashx) + +include(GNUInstallDirs) +install(TARGETS hashx hashx_static + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +add_executable(hashx-tests + src/tests.c) +include_directories(hashx-tests + include/) +target_compile_definitions(hashx-tests PRIVATE HASHX_STATIC) +target_link_libraries(hashx-tests + PRIVATE hashx_static) + +if(NOT Threads_FOUND AND UNIX AND NOT APPLE) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads) +endif() + +add_executable(hashx-bench + src/bench.c + src/hashx_thread.c + src/hashx_time.c) +include_directories(hashx-bench + include/) +target_compile_definitions(hashx-bench PRIVATE HASHX_STATIC) +target_link_libraries(hashx-bench + PRIVATE hashx_static + PRIVATE ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/ext/equix/hashx/LICENSE b/src/ext/equix/hashx/LICENSE new file mode 100644 index 0000000000..0a041280bd --- /dev/null +++ b/src/ext/equix/hashx/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. https://fsf.org/ + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/src/ext/equix/hashx/README.md b/src/ext/equix/hashx/README.md new file mode 100644 index 0000000000..1d8ea47652 --- /dev/null +++ b/src/ext/equix/hashx/README.md @@ -0,0 +1,135 @@ +# HashX + +HashX is an algorithm designed for client puzzles and proof-of-work schemes. +While traditional cryptographic hash functions use a fixed one-way compression +function, each HashX instance represents a unique pseudorandomly generated +one-way function. + +HashX functions are generated as a carefully crafted sequence of integer +operations to fully saturate a 3-way superscalar CPU pipeline (modeled after +the Intel Ivy Bridge architecture). Extra care is taken to avoid optimizations +and to ensure that each function takes exactly the same number of CPU cycles +(currently 512 instructions over 192 cycles). + +## API + +The API consists of 4 functions and is documented in the public header file +[hashx.h](include/hashx.h). + +Example of usage: + +```c +#include <hashx.h> +#include <stdio.h> + +int main() { + char seed[] = "this is a seed that will generate a hash function"; + char hash[HASHX_SIZE]; + hashx_ctx* ctx = hashx_alloc(HASHX_COMPILED); + if (ctx == HASHX_NOTSUPP) + ctx = hashx_alloc(HASHX_INTERPRETED); + if (ctx == NULL) + return 1; + if (!hashx_make(ctx, seed, sizeof(seed))) /* generate a hash function */ + return 1; + hashx_exec(ctx, 123456789, hash); /* calculate the hash of a nonce value */ + hashx_free(ctx); + for (unsigned i = 0; i < HASHX_SIZE; ++i) + printf("%02x", hash[i] & 0xff); + printf("\n"); + return 0; +} +``` + +## Build + +A C99-compatible compiler and `cmake` are required. + +``` +git clone https://github.com/tevador/hashx.git +cd hashx +mkdir build +cd build +cmake .. [-DHASHX_BLOCK_MODE=ON] [-DHASHX_SIZE=<1-32>] [-DHASHX_SALT="my custom hash"] +make +``` + +### Block mode (default: off) + +Because HashX is meant to be used in proof-of-work schemes and client puzzles, +the input is a 64-bit counter value. If you need to hash arbitrary data, build +with `-DHASHX_BLOCK_MODE=ON`. This will change the API to accept `const void*, size_t` instead of `uint64_t`. +However, it is strongly recommended to use the counter mode, which is almost twice faster for short inputs. + +### Hash size (default: 32) + +The default hash output size is 32 bytes (256 bits). If you want to reduce the output +size, build with, for example, `-DHASHX_SIZE=20`. Output sizes in the range of 1-32 bytes +are supported. Shorter output sizes are formed by simply truncating the full 256-bit hash. + +### Generator salt (default: "HashX v1") + +An implementation-specific salt value may be specified when building, for example: `-DHASHX_SALT="my custom hash"`. +This value is used as a salt when generating hash instances. The maximum supported +salt size is 15 characters. + +## Performance + +HashX was designed for fast verification. Generating a hash function from seed +takes about 50 μs and a 64-bit nonce can be hashed in under 100 ns (in compiled +mode) or in about 1-2 μs (in interpreted mode). + +A benchmark executable is included: +``` +./hashx-bench --seeds 500 +``` + +## Security + +HashX should provide strong preimage resistance. No other security guarantees are made. About + 99% of HashX instances pass [SMHasher](https://github.com/tevador/smhasher), + but using HashX as a generic hash function is not recommended. + +Known vulnerabilities that should not affect intended use cases: + +1. HashX is not collision resistant. Around 0.2% of seeds produce "weak" hash functions for +which hash collisions are plentiful. +2. Secret values should not be used as inputs to HashX because the generated instructions +include data-dependent branches by design. + +## Protocols based on HashX + +Here are two examples of how HashX can be used in practice: + +### Interactive client puzzle + +Client puzzles are protocols designed to protect server resources from abuse. +A client requesting a resource from a server may be asked to solve a puzzle +before the request is accepted. + +One of the first proposed client puzzles is [Hashcash](https://en.wikipedia.org/wiki/Hashcash), +which requires the client to find a partial SHA-1 hash inversion. However, +because of the static nature of cryptographic hash functions, an attacker can +offload hashing to a GPU or FPGA to gain a significant advantage over legitimate +clients equipped only with a CPU. + +In a HashX-based interactive client puzzle, the server sends each client +a 256-bit challenge used to generate a unique HashX function. The client then +has to find a 64-bit nonce value such that the resulting hash has a predefined +number of leading zeroes. An attacker cannot easily parallelize the workload +because each request would require a new GPU kernel or FPGA bistream. + +### Non-interactive proof-of-work + +In the absence of a central authority handing out challenges (for example in +a cryptocurrency), the client takes some public information `T` (for example +a block template) and combines it with a chosen 64-bit nonce `N1`. +The resulting string `X = T||N1` is then used to generate a HashX function +<code>H<sub>X</sub></code>. The client then tries to find a 16-bit nonce `N2` +such that <code>r = H<sub>X</sub>(N2)</code> meets the difficulty target of +the protocol. If no `N2` value is successful, the client increments `N1` and +tries with a different hash function. + +In this protocol, each HashX function provides only 65536 attempts before it +must be discarded. This limits the parallelization advantage of GPUs and FPGAs. +A CPU core will be able to test about 200 different hash functions per second. diff --git a/src/ext/equix/hashx/include/hashx.h b/src/ext/equix/hashx/include/hashx.h new file mode 100644 index 0000000000..c95fd295ef --- /dev/null +++ b/src/ext/equix/hashx/include/hashx.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +/* + * HashX is an algorithm designed for client puzzles and proof-of-work schemes. + * While traditional cryptographic hash functions use a fixed one-way + * compression function, each HashX instance represents a unique pseudorandomly + * generated one-way function. + * + * Example of usage: + * + #include <hashx.h> + #include <stdio.h> + + int main() { + char seed[] = "this is a seed that will generate a hash function"; + char hash[HASHX_SIZE]; + hashx_ctx* ctx = hashx_alloc(HASHX_COMPILED); + if (ctx == HASHX_NOTSUPP) + ctx = hashx_alloc(HASHX_INTERPRETED); + if (ctx == NULL) + return 1; + if (!hashx_make(ctx, seed, sizeof(seed))) + return 1; + hashx_exec(ctx, 123456789, hash); + hashx_free(ctx); + for (unsigned i = 0; i < HASHX_SIZE; ++i) + printf("%02x", hash[i] & 0xff); + printf("\n"); + return 0; + } + * + */ + +#ifndef HASHX_H +#define HASHX_H + +#include <stdint.h> +#include <stddef.h> + +/* + * Input of the hash function. + * + * Counter mode (default): a 64-bit unsigned integer + * Block mode: pointer to a buffer and the number of bytes to be hashed +*/ +#ifndef HASHX_BLOCK_MODE +#define HASHX_INPUT uint64_t input +#else +#define HASHX_INPUT const void* input, size_t size +#endif + +/* The default (and maximum) hash size is 32 bytes */ +#ifndef HASHX_SIZE +#define HASHX_SIZE 32 +#endif + +/* Opaque struct representing a HashX instance */ +typedef struct hashx_ctx hashx_ctx; + +/* Type of hash function */ +typedef enum hashx_type { + HASHX_INTERPRETED, + HASHX_COMPILED +} hashx_type; + +/* Sentinel value used to indicate unsupported type */ +#define HASHX_NOTSUPP ((hashx_ctx*)-1) + +#if defined(_WIN32) || defined(__CYGWIN__) +#define HASHX_WIN +#endif + +/* Shared/static library definitions */ +#ifdef HASHX_WIN + #ifdef HASHX_SHARED + #define HASHX_API __declspec(dllexport) + #elif !defined(HASHX_STATIC) + #define HASHX_API __declspec(dllimport) + #else + #define HASHX_API + #endif + #define HASHX_PRIVATE +#else + #ifdef HASHX_SHARED + #define HASHX_API __attribute__ ((visibility ("default"))) + #else + #define HASHX_API __attribute__ ((visibility ("hidden"))) + #endif + #define HASHX_PRIVATE __attribute__ ((visibility ("hidden"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allocate a HashX instance. + * + * @param type is the type of instance to be created. + * + * @return pointer to a new HashX instance. Returns NULL on memory allocation + * failure and HASHX_NOTSUPP if the requested type is not supported. +*/ +HASHX_API hashx_ctx* hashx_alloc(hashx_type type); + +/* + * Create a new HashX function from seed. + * + * @param ctx is pointer to a HashX instance. + * @param seed is a pointer to the seed value. + * @param size is the size of the seed. + * + * @return 1 on success, 0 on failure. +*/ +HASHX_API int hashx_make(hashx_ctx* ctx, const void* seed, size_t size); + +/* + * Execute the HashX function. + * + * @param ctx is pointer to a HashX instance. A HashX function must have + * been previously created by calling hashx_make. + * @param HASHX_INPUT is the input to be hashed (see definition above). + * @param output is a pointer to the result buffer. HASHX_SIZE bytes will be + * written. + s*/ +HASHX_API void hashx_exec(const hashx_ctx* ctx, HASHX_INPUT, void* output); + +/* + * Free a HashX instance. + * + * @param ctx is pointer to a HashX instance. +*/ +HASHX_API void hashx_free(hashx_ctx* ctx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/ext/equix/hashx/src/bench.c b/src/ext/equix/hashx/src/bench.c new file mode 100644 index 0000000000..fbcd41a064 --- /dev/null +++ b/src/ext/equix/hashx/src/bench.c @@ -0,0 +1,135 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "test_utils.h" +#include "hashx_thread.h" +#include "hashx_endian.h" +#include "hashx_time.h" +#include <limits.h> +#include <inttypes.h> + +typedef struct worker_job { + int id; + hashx_thread thread; + hashx_ctx* ctx; + int64_t total_hashes; + uint64_t best_hash; + uint64_t threshold; + int start; + int step; + int end; + int nonces; +} worker_job; + +static hashx_thread_retval worker(void* args) { + worker_job* job = (worker_job*)args; + job->total_hashes = 0; + job->best_hash = UINT64_MAX; + for (int seed = job->start; seed < job->end; seed += job->step) { + if (!hashx_make(job->ctx, &seed, sizeof(seed))) { + continue; + } + for (int nonce = 0; nonce < job->nonces; ++nonce) { + uint8_t hash[HASHX_SIZE] = { 0 }; +#ifndef HASHX_BLOCK_MODE + hashx_exec(job->ctx, nonce, hash); +#else + hashx_exec(job->ctx, &nonce, sizeof(nonce), hash); +#endif + uint64_t hashval = load64(hash); + if (hashval < job->best_hash) { + job->best_hash = hashval; + } + if (hashval < job->threshold) { + printf("[thread %2i] Hash (%5i, %5i) below threshold:" + " ...%02x%02x%02x%02x%02x%02x%02x%02x\n", + job->id, + seed, + nonce, + hash[0], + hash[1], + hash[2], + hash[3], + hash[4], + hash[5], + hash[6], + hash[7]); + } + } + job->total_hashes += job->nonces; + } + return HASHX_THREAD_SUCCESS; +} + +int main(int argc, char** argv) { + int nonces, seeds, start, diff, threads; + bool interpret; + read_int_option("--diff", argc, argv, &diff, INT_MAX); + read_int_option("--start", argc, argv, &start, 0); + read_int_option("--seeds", argc, argv, &seeds, 500); + read_int_option("--nonces", argc, argv, &nonces, 65536); + read_int_option("--threads", argc, argv, &threads, 1); + read_option("--interpret", argc, argv, &interpret); + hashx_type flags = HASHX_INTERPRETED; + if (!interpret) { + flags = HASHX_COMPILED; + } + uint64_t best_hash = UINT64_MAX; + uint64_t diff_ex = (uint64_t)diff * 1000ULL; + uint64_t threshold = UINT64_MAX / diff_ex; + int seeds_end = seeds + start; + int64_t total_hashes = 0; + printf("Interpret: %i, Target diff.: %" PRIu64 ", Threads: %i\n", interpret, diff_ex, threads); + printf("Testing seeds %i-%i with %i nonces each ...\n", start, seeds_end - 1, nonces); + double time_start, time_end; + worker_job* jobs = malloc(sizeof(worker_job) * threads); + if (jobs == NULL) { + printf("Error: memory allocation failure\n"); + return 1; + } + for (int thd = 0; thd < threads; ++thd) { + jobs[thd].ctx = hashx_alloc(flags); + if (jobs[thd].ctx == NULL) { + printf("Error: memory allocation failure\n"); + return 1; + } + if (jobs[thd].ctx == HASHX_NOTSUPP) { + printf("Error: not supported. Try with --interpret\n"); + return 1; + } + jobs[thd].id = thd; + jobs[thd].start = start + thd; + jobs[thd].step = threads; + jobs[thd].end = seeds_end; + jobs[thd].nonces = nonces; + jobs[thd].threshold = threshold; + } + time_start = hashx_time(); + if (threads > 1) { + for (int thd = 0; thd < threads; ++thd) { + jobs[thd].thread = hashx_thread_create(&worker, &jobs[thd]); + } + for (int thd = 0; thd < threads; ++thd) { + hashx_thread_join(jobs[thd].thread); + } + } + else { + worker(jobs); + } + time_end = hashx_time(); + for (int thd = 0; thd < threads; ++thd) { + total_hashes += jobs[thd].total_hashes; + if (jobs[thd].best_hash < best_hash) { + best_hash = jobs[thd].best_hash; + } + } + double elapsed = time_end - time_start; + printf("Total hashes: %" PRIi64 "\n", total_hashes); + printf("%f hashes/sec.\n", total_hashes / elapsed); + printf("%f seeds/sec.\n", seeds / elapsed); + printf("Best hash: ..."); + output_hex((char*)&best_hash, sizeof(best_hash)); + printf(" (diff: %" PRIu64 ")\n", UINT64_MAX / best_hash); + free(jobs); + return 0; +} diff --git a/src/ext/equix/hashx/src/blake2.c b/src/ext/equix/hashx/src/blake2.c new file mode 100644 index 0000000000..f353cb3064 --- /dev/null +++ b/src/ext/equix/hashx/src/blake2.c @@ -0,0 +1,467 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +/* Original code from Argon2 reference source code package used under CC0 + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "blake2.h" +#include "hashx_endian.h" + +static const uint64_t blake2b_IV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; + +#define BLAKE2_SIGMA_0_0 0 +#define BLAKE2_SIGMA_0_1 1 +#define BLAKE2_SIGMA_0_2 2 +#define BLAKE2_SIGMA_0_3 3 +#define BLAKE2_SIGMA_0_4 4 +#define BLAKE2_SIGMA_0_5 5 +#define BLAKE2_SIGMA_0_6 6 +#define BLAKE2_SIGMA_0_7 7 +#define BLAKE2_SIGMA_0_8 8 +#define BLAKE2_SIGMA_0_9 9 +#define BLAKE2_SIGMA_0_10 10 +#define BLAKE2_SIGMA_0_11 11 +#define BLAKE2_SIGMA_0_12 12 +#define BLAKE2_SIGMA_0_13 13 +#define BLAKE2_SIGMA_0_14 14 +#define BLAKE2_SIGMA_0_15 15 + +#define BLAKE2_SIGMA_1_0 14 +#define BLAKE2_SIGMA_1_1 10 +#define BLAKE2_SIGMA_1_2 4 +#define BLAKE2_SIGMA_1_3 8 +#define BLAKE2_SIGMA_1_4 9 +#define BLAKE2_SIGMA_1_5 15 +#define BLAKE2_SIGMA_1_6 13 +#define BLAKE2_SIGMA_1_7 6 +#define BLAKE2_SIGMA_1_8 1 +#define BLAKE2_SIGMA_1_9 12 +#define BLAKE2_SIGMA_1_10 0 +#define BLAKE2_SIGMA_1_11 2 +#define BLAKE2_SIGMA_1_12 11 +#define BLAKE2_SIGMA_1_13 7 +#define BLAKE2_SIGMA_1_14 5 +#define BLAKE2_SIGMA_1_15 3 + +#define BLAKE2_SIGMA_2_0 11 +#define BLAKE2_SIGMA_2_1 8 +#define BLAKE2_SIGMA_2_2 12 +#define BLAKE2_SIGMA_2_3 0 +#define BLAKE2_SIGMA_2_4 5 +#define BLAKE2_SIGMA_2_5 2 +#define BLAKE2_SIGMA_2_6 15 +#define BLAKE2_SIGMA_2_7 13 +#define BLAKE2_SIGMA_2_8 10 +#define BLAKE2_SIGMA_2_9 14 +#define BLAKE2_SIGMA_2_10 3 +#define BLAKE2_SIGMA_2_11 6 +#define BLAKE2_SIGMA_2_12 7 +#define BLAKE2_SIGMA_2_13 1 +#define BLAKE2_SIGMA_2_14 9 +#define BLAKE2_SIGMA_2_15 4 + +#define BLAKE2_SIGMA_3_0 7 +#define BLAKE2_SIGMA_3_1 9 +#define BLAKE2_SIGMA_3_2 3 +#define BLAKE2_SIGMA_3_3 1 +#define BLAKE2_SIGMA_3_4 13 +#define BLAKE2_SIGMA_3_5 12 +#define BLAKE2_SIGMA_3_6 11 +#define BLAKE2_SIGMA_3_7 14 +#define BLAKE2_SIGMA_3_8 2 +#define BLAKE2_SIGMA_3_9 6 +#define BLAKE2_SIGMA_3_10 5 +#define BLAKE2_SIGMA_3_11 10 +#define BLAKE2_SIGMA_3_12 4 +#define BLAKE2_SIGMA_3_13 0 +#define BLAKE2_SIGMA_3_14 15 +#define BLAKE2_SIGMA_3_15 8 + +#define BLAKE2_SIGMA_4_0 9 +#define BLAKE2_SIGMA_4_1 0 +#define BLAKE2_SIGMA_4_2 5 +#define BLAKE2_SIGMA_4_3 7 +#define BLAKE2_SIGMA_4_4 2 +#define BLAKE2_SIGMA_4_5 4 +#define BLAKE2_SIGMA_4_6 10 +#define BLAKE2_SIGMA_4_7 15 +#define BLAKE2_SIGMA_4_8 14 +#define BLAKE2_SIGMA_4_9 1 +#define BLAKE2_SIGMA_4_10 11 +#define BLAKE2_SIGMA_4_11 12 +#define BLAKE2_SIGMA_4_12 6 +#define BLAKE2_SIGMA_4_13 8 +#define BLAKE2_SIGMA_4_14 3 +#define BLAKE2_SIGMA_4_15 13 + +#define BLAKE2_SIGMA_5_0 2 +#define BLAKE2_SIGMA_5_1 12 +#define BLAKE2_SIGMA_5_2 6 +#define BLAKE2_SIGMA_5_3 10 +#define BLAKE2_SIGMA_5_4 0 +#define BLAKE2_SIGMA_5_5 11 +#define BLAKE2_SIGMA_5_6 8 +#define BLAKE2_SIGMA_5_7 3 +#define BLAKE2_SIGMA_5_8 4 +#define BLAKE2_SIGMA_5_9 13 +#define BLAKE2_SIGMA_5_10 7 +#define BLAKE2_SIGMA_5_11 5 +#define BLAKE2_SIGMA_5_12 15 +#define BLAKE2_SIGMA_5_13 14 +#define BLAKE2_SIGMA_5_14 1 +#define BLAKE2_SIGMA_5_15 9 + +#define BLAKE2_SIGMA_6_0 12 +#define BLAKE2_SIGMA_6_1 5 +#define BLAKE2_SIGMA_6_2 1 +#define BLAKE2_SIGMA_6_3 15 +#define BLAKE2_SIGMA_6_4 14 +#define BLAKE2_SIGMA_6_5 13 +#define BLAKE2_SIGMA_6_6 4 +#define BLAKE2_SIGMA_6_7 10 +#define BLAKE2_SIGMA_6_8 0 +#define BLAKE2_SIGMA_6_9 7 +#define BLAKE2_SIGMA_6_10 6 +#define BLAKE2_SIGMA_6_11 3 +#define BLAKE2_SIGMA_6_12 9 +#define BLAKE2_SIGMA_6_13 2 +#define BLAKE2_SIGMA_6_14 8 +#define BLAKE2_SIGMA_6_15 11 + +#define BLAKE2_SIGMA_7_0 13 +#define BLAKE2_SIGMA_7_1 11 +#define BLAKE2_SIGMA_7_2 7 +#define BLAKE2_SIGMA_7_3 14 +#define BLAKE2_SIGMA_7_4 12 +#define BLAKE2_SIGMA_7_5 1 +#define BLAKE2_SIGMA_7_6 3 +#define BLAKE2_SIGMA_7_7 9 +#define BLAKE2_SIGMA_7_8 5 +#define BLAKE2_SIGMA_7_9 0 +#define BLAKE2_SIGMA_7_10 15 +#define BLAKE2_SIGMA_7_11 4 +#define BLAKE2_SIGMA_7_12 8 +#define BLAKE2_SIGMA_7_13 6 +#define BLAKE2_SIGMA_7_14 2 +#define BLAKE2_SIGMA_7_15 10 + +#define BLAKE2_SIGMA_8_0 6 +#define BLAKE2_SIGMA_8_1 15 +#define BLAKE2_SIGMA_8_2 14 +#define BLAKE2_SIGMA_8_3 9 +#define BLAKE2_SIGMA_8_4 11 +#define BLAKE2_SIGMA_8_5 3 +#define BLAKE2_SIGMA_8_6 0 +#define BLAKE2_SIGMA_8_7 8 +#define BLAKE2_SIGMA_8_8 12 +#define BLAKE2_SIGMA_8_9 2 +#define BLAKE2_SIGMA_8_10 13 +#define BLAKE2_SIGMA_8_11 7 +#define BLAKE2_SIGMA_8_12 1 +#define BLAKE2_SIGMA_8_13 4 +#define BLAKE2_SIGMA_8_14 10 +#define BLAKE2_SIGMA_8_15 5 + +#define BLAKE2_SIGMA_9_0 10 +#define BLAKE2_SIGMA_9_1 2 +#define BLAKE2_SIGMA_9_2 8 +#define BLAKE2_SIGMA_9_3 4 +#define BLAKE2_SIGMA_9_4 7 +#define BLAKE2_SIGMA_9_5 6 +#define BLAKE2_SIGMA_9_6 1 +#define BLAKE2_SIGMA_9_7 5 +#define BLAKE2_SIGMA_9_8 15 +#define BLAKE2_SIGMA_9_9 11 +#define BLAKE2_SIGMA_9_10 9 +#define BLAKE2_SIGMA_9_11 14 +#define BLAKE2_SIGMA_9_12 3 +#define BLAKE2_SIGMA_9_13 12 +#define BLAKE2_SIGMA_9_14 13 +#define BLAKE2_SIGMA_9_15 0 + +#define BLAKE2_SIGMA_10_0 0 +#define BLAKE2_SIGMA_10_1 1 +#define BLAKE2_SIGMA_10_2 2 +#define BLAKE2_SIGMA_10_3 3 +#define BLAKE2_SIGMA_10_4 4 +#define BLAKE2_SIGMA_10_5 5 +#define BLAKE2_SIGMA_10_6 6 +#define BLAKE2_SIGMA_10_7 7 +#define BLAKE2_SIGMA_10_8 8 +#define BLAKE2_SIGMA_10_9 9 +#define BLAKE2_SIGMA_10_10 10 +#define BLAKE2_SIGMA_10_11 11 +#define BLAKE2_SIGMA_10_12 12 +#define BLAKE2_SIGMA_10_13 13 +#define BLAKE2_SIGMA_10_14 14 +#define BLAKE2_SIGMA_10_15 15 + +#define BLAKE2_SIGMA_11_0 14 +#define BLAKE2_SIGMA_11_1 10 +#define BLAKE2_SIGMA_11_2 4 +#define BLAKE2_SIGMA_11_3 8 +#define BLAKE2_SIGMA_11_4 9 +#define BLAKE2_SIGMA_11_5 15 +#define BLAKE2_SIGMA_11_6 13 +#define BLAKE2_SIGMA_11_7 6 +#define BLAKE2_SIGMA_11_8 1 +#define BLAKE2_SIGMA_11_9 12 +#define BLAKE2_SIGMA_11_10 0 +#define BLAKE2_SIGMA_11_11 2 +#define BLAKE2_SIGMA_11_12 11 +#define BLAKE2_SIGMA_11_13 7 +#define BLAKE2_SIGMA_11_14 5 +#define BLAKE2_SIGMA_11_15 3 + +static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { + return (w >> c) | (w << (64 - c)); +} + +static FORCE_INLINE void blake2b_set_lastblock(blake2b_state* S) { + S->f[0] = (uint64_t)-1; +} + +static FORCE_INLINE void blake2b_increment_counter(blake2b_state* S, + uint64_t inc) { + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +} + +static FORCE_INLINE void blake2b_invalidate_state(blake2b_state* S) { + //clear_internal_memory(S, sizeof(*S)); /* wipe */ + blake2b_set_lastblock(S); /* invalidate for further use */ +} + +static FORCE_INLINE void blake2b_init0(blake2b_state* S) { + memset(S, 0, sizeof(*S)); + memcpy(S->h, blake2b_IV, sizeof(S->h)); +} + +int hashx_blake2b_init_param(blake2b_state* S, const blake2b_param* P) { + const unsigned char* p = (const unsigned char*)P; + unsigned int i; + + if (NULL == P || NULL == S) { + return -1; + } + + blake2b_init0(S); + /* IV XOR Parameter Block */ + for (i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P->digest_length; + return 0; +} + +#define SIGMA(r, k) BLAKE2_SIGMA_ ## r ## _ ## k + +#define G(r, i, j, a, b, c, d) \ + do { \ + a = a + b + m[SIGMA(r, i)]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[SIGMA(r, j)]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define ROUND_INNER(r) \ + do { \ + G(r, 0, 1, v[0], v[4], v[8], v[12]); \ + G(r, 2, 3, v[1], v[5], v[9], v[13]); \ + G(r, 4, 5, v[2], v[6], v[10], v[14]); \ + G(r, 6, 7, v[3], v[7], v[11], v[15]); \ + G(r, 8, 9, v[0], v[5], v[10], v[15]); \ + G(r, 10, 11, v[1], v[6], v[11], v[12]); \ + G(r, 12, 13, v[2], v[7], v[8], v[13]); \ + G(r, 14, 15, v[3], v[4], v[9], v[14]); \ + } while ((void)0, 0) + +#define ROUND(r) ROUND_INNER(r) + +static void blake2b_compress(blake2b_state* S, const uint8_t* block) { + uint64_t m[16]; + uint64_t v[16]; + unsigned int i; + + for (i = 0; i < 16; ++i) { + m[i] = load64(block + i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = S->h[i]; + } + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ S->t[0]; + v[13] = blake2b_IV[5] ^ S->t[1]; + v[14] = blake2b_IV[6] ^ S->f[0]; + v[15] = blake2b_IV[7] ^ S->f[1]; + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + + for (i = 0; i < 8; ++i) { + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + } +} + +static void blake2b_compress_4r(blake2b_state* S, const uint8_t* block) { + uint64_t m[16]; + uint64_t v[16]; + unsigned int i; + + for (i = 0; i < 16; ++i) { + m[i] = load64(block + i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = S->h[i]; + } + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ S->t[0]; + v[13] = blake2b_IV[5] ^ S->t[1]; + v[14] = blake2b_IV[6] ^ S->f[0]; + v[15] = blake2b_IV[7] ^ S->f[1]; + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + + for (i = 0; i < 8; ++i) { + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + } +} + +int hashx_blake2b_update(blake2b_state* S, const void* in, size_t inlen) { + const uint8_t* pin = (const uint8_t*)in; + + if (inlen == 0) { + return 0; + } + + /* Sanity check */ + if (S == NULL || in == NULL) { + return -1; + } + + /* Is this a reused state? */ + if (S->f[0] != 0) { + return -1; + } + + if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { + /* Complete current block */ + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + memcpy(&S->buf[left], pin, fill); + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen = 0; + inlen -= fill; + pin += fill; + /* Avoid buffer copies when possible */ + while (inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, pin); + inlen -= BLAKE2B_BLOCKBYTES; + pin += BLAKE2B_BLOCKBYTES; + } + } + memcpy(&S->buf[S->buflen], pin, inlen); + S->buflen += (unsigned int)inlen; + return 0; +} + +int hashx_blake2b_final(blake2b_state* S, void* out, size_t outlen) { + uint8_t buffer[BLAKE2B_OUTBYTES] = { 0 }; + unsigned int i; + + /* Sanity checks */ + if (S == NULL || out == NULL || outlen < S->outlen) { + return -1; + } + + /* Is this a reused state? */ + if (S->f[0] != 0) { + return -1; + } + + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); + } + + memcpy(out, buffer, S->outlen); + + return 0; +} + +/* 4-round version of Blake2b */ +void hashx_blake2b_4r(const blake2b_param* params, const void* in, + size_t inlen, void* out) { + + blake2b_state state; + const uint8_t* p = (const uint8_t*)params; + + blake2b_init0(&state); + /* IV XOR Parameter Block */ + for (unsigned i = 0; i < 8; ++i) { + state.h[i] ^= load64(&p[i * sizeof(state.h[i])]); + } + //state.outlen = blake_params.digest_length; + + const uint8_t* pin = (const uint8_t*)in; + + while (inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(&state, BLAKE2B_BLOCKBYTES); + blake2b_compress_4r(&state, pin); + inlen -= BLAKE2B_BLOCKBYTES; + pin += BLAKE2B_BLOCKBYTES; + } + + memcpy(state.buf, pin, inlen); + blake2b_increment_counter(&state, inlen); + blake2b_set_lastblock(&state); + blake2b_compress_4r(&state, state.buf); + + /* Output hash */ + memcpy(out, state.h, sizeof(state.h)); +} diff --git a/src/ext/equix/hashx/src/blake2.h b/src/ext/equix/hashx/src/blake2.h new file mode 100644 index 0000000000..ba535b21fb --- /dev/null +++ b/src/ext/equix/hashx/src/blake2.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef PORTABLE_BLAKE2_H +#define PORTABLE_BLAKE2_H + +#include <stdint.h> +#include <limits.h> +#include <stddef.h> +#include <hashx.h> + +#if defined(__cplusplus) +extern "C" { +#endif + +enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 +}; + +#pragma pack(push, 1) +typedef struct blake2b_param { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint64_t node_offset; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ +} blake2b_param; +#pragma pack(pop) + +typedef struct blake2b_state { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + unsigned buflen; + unsigned outlen; + uint8_t last_node; +} blake2b_state; + +/* Ensure param structs have not been wrongly padded */ +/* Poor man's static_assert */ +enum { + blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), + blake2_size_check_2 = + 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) +}; + +HASHX_PRIVATE int hashx_blake2b_init_param(blake2b_state* S, const blake2b_param* P); +HASHX_PRIVATE int hashx_blake2b_update(blake2b_state* S, const void* in, size_t inlen); +HASHX_PRIVATE int hashx_blake2b_final(blake2b_state* S, void* out, size_t outlen); +HASHX_PRIVATE void hashx_blake2b_4r(const blake2b_param* P, const void* in, size_t inlen, void* out); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/ext/equix/hashx/src/compiler.c b/src/ext/equix/hashx/src/compiler.c new file mode 100644 index 0000000000..f180bf2d25 --- /dev/null +++ b/src/ext/equix/hashx/src/compiler.c @@ -0,0 +1,18 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdbool.h> + +#include "compiler.h" +#include "virtual_memory.h" +#include "program.h" +#include "context.h" + +bool hashx_compiler_init(hashx_ctx* ctx) { + ctx->code = hashx_vm_alloc(COMP_CODE_SIZE); + return ctx->code != NULL; +} + +void hashx_compiler_destroy(hashx_ctx* ctx) { + hashx_vm_free(ctx->code, COMP_CODE_SIZE); +} diff --git a/src/ext/equix/hashx/src/compiler.h b/src/ext/equix/hashx/src/compiler.h new file mode 100644 index 0000000000..ef0201be02 --- /dev/null +++ b/src/ext/equix/hashx/src/compiler.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef COMPILER_H +#define COMPILER_H + +#include <stdint.h> +#include <stdbool.h> +#include <hashx.h> +#include "virtual_memory.h" +#include "program.h" + +HASHX_PRIVATE void hashx_compile_x86(const hashx_program* program, uint8_t* code); + +HASHX_PRIVATE void hashx_compile_a64(const hashx_program* program, uint8_t* code); + +#if defined(_M_X64) || defined(__x86_64__) +#define HASHX_COMPILER 1 +#define HASHX_COMPILER_X86 +#define hashx_compile hashx_compile_x86 +#elif defined(__aarch64__) +#define HASHX_COMPILER 1 +#define HASHX_COMPILER_A64 +#define hashx_compile hashx_compile_a64 +#else +#define HASHX_COMPILER 0 +#define hashx_compile +#endif + +HASHX_PRIVATE bool hashx_compiler_init(hashx_ctx* compiler); +HASHX_PRIVATE void hashx_compiler_destroy(hashx_ctx* compiler); + +#define COMP_PAGE_SIZE 4096 +#define COMP_RESERVE_SIZE 1024 +#define COMP_AVG_INSTR_SIZE 5 +#define COMP_CODE_SIZE \ + ALIGN_SIZE( \ + HASHX_PROGRAM_MAX_SIZE * COMP_AVG_INSTR_SIZE + COMP_RESERVE_SIZE, \ + COMP_PAGE_SIZE) + +#endif diff --git a/src/ext/equix/hashx/src/compiler_a64.c b/src/ext/equix/hashx/src/compiler_a64.c new file mode 100644 index 0000000000..48f743b988 --- /dev/null +++ b/src/ext/equix/hashx/src/compiler_a64.c @@ -0,0 +1,154 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <string.h> +#include <assert.h> + +#include "compiler.h" +#include "program.h" +#include "virtual_memory.h" +#include "unreachable.h" + +#define EMIT(p,x) do { \ + memcpy(p, x, sizeof(x)); \ + p += sizeof(x); \ + } while (0) +#define EMIT_U32(p,x) *((uint32_t*)(p)) = x; p += sizeof(uint32_t) +#define EMIT_IMM32(p,x) \ + EMIT_U32(p, 0x9280000c | \ + ((x <= INT32_MAX) << 30) | \ + (((x <= INT32_MAX) ? (x & 0xFFFF) : (~x & 0xFFFF)) << 5)); \ + EMIT_U32(p, 0xf2a0000c | \ + (((x >> 16) & 0xFFFF) << 5)); + +#ifdef HASHX_COMPILER_A64 + +static const uint8_t a64_prologue[] = { + 0x07, 0x1c, 0x40, 0xf9, /* ldr x7, [x0, #56] */ + 0x06, 0x18, 0x40, 0xf9, /* ldr x6, [x0, #48] */ + 0x05, 0x14, 0x40, 0xf9, /* ldr x5, [x0, #40] */ + 0x04, 0x10, 0x40, 0xf9, /* ldr x4, [x0, #32] */ + 0x03, 0x0c, 0x40, 0xf9, /* ldr x3, [x0, #24] */ + 0x02, 0x08, 0x40, 0xf9, /* ldr x2, [x0, #16] */ + 0x01, 0x04, 0x40, 0xf9, /* ldr x1, [x0, #8] */ + 0xe8, 0x03, 0x00, 0xaa, /* mov x8, x0 */ + 0x00, 0x00, 0x40, 0xf9, /* ldr x0, [x0] */ + 0xe9, 0x03, 0x1f, 0x2a, /* mov w9, wzr */ +}; + +static const uint8_t a64_epilogue[] = { + 0x00, 0x01, 0x00, 0xf9, /* str x0, [x8] */ + 0x01, 0x05, 0x00, 0xf9, /* str x1, [x8, #8] */ + 0x02, 0x09, 0x00, 0xf9, /* str x2, [x8, #16] */ + 0x03, 0x0d, 0x00, 0xf9, /* str x3, [x8, #24] */ + 0x04, 0x11, 0x00, 0xf9, /* str x4, [x8, #32] */ + 0x05, 0x15, 0x00, 0xf9, /* str x5, [x8, #40] */ + 0x06, 0x19, 0x00, 0xf9, /* str x6, [x8, #48] */ + 0x07, 0x1d, 0x00, 0xf9, /* str x7, [x8, #56] */ + 0xc0, 0x03, 0x5f, 0xd6, /* ret */ +}; + +void hashx_compile_a64(const hashx_program* program, uint8_t* code) { + hashx_vm_rw(code, COMP_CODE_SIZE); + uint8_t* pos = code; + uint8_t* target = NULL; + int creg = -1; + EMIT(pos, a64_prologue); + for (int i = 0; i < program->code_size; ++i) { + const instruction* instr = &program->code[i]; + switch (instr->opcode) + { + case INSTR_UMULH_R: + EMIT_U32(pos, 0x9bc07c00 | + (instr->src << 16) | + (instr->dst << 5) | + (instr->dst)); + if (target != NULL) { + creg = instr->dst; + } + break; + case INSTR_SMULH_R: + EMIT_U32(pos, 0x9b407c00 | + (instr->src << 16) | + (instr->dst << 5) | + (instr->dst)); + if (target != NULL) { + creg = instr->dst; + } + break; + case INSTR_MUL_R: + assert(creg != instr->dst); + EMIT_U32(pos, 0x9b007c00 | + (instr->src << 16) | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_SUB_R: + assert(creg != instr->dst); + EMIT_U32(pos, 0xcb000000 | + (instr->src << 16) | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_XOR_R: + assert(creg != instr->dst); + EMIT_U32(pos, 0xca000000 | + (instr->src << 16) | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_ADD_RS: + assert(creg != instr->dst); + EMIT_U32(pos, 0x8b000000 | + (instr->src << 16) | + (instr->imm32 << 10) | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_ROR_C: + assert(creg != instr->dst); + EMIT_U32(pos, 0x93c00000 | + (instr->dst << 16) | + (instr->imm32 << 10) | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_ADD_C: + assert(creg != instr->dst); + EMIT_IMM32(pos, instr->imm32); + EMIT_U32(pos, 0x8b0c0000 | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_XOR_C: + assert(creg != instr->dst); + EMIT_IMM32(pos, instr->imm32); + EMIT_U32(pos, 0xca0c0000 | + (instr->dst << 5) | + (instr->dst)); + break; + case INSTR_TARGET: + target = pos; + break; + case INSTR_BRANCH: + EMIT_IMM32(pos, instr->imm32); + EMIT_U32(pos, 0x2a00012b | (creg << 16)); + EMIT_U32(pos, 0x6a0c017f); + EMIT_U32(pos, 0x5a891129); + EMIT_U32(pos, 0x54000000 | + ((((uint32_t)(target - pos)) >> 2) & 0x7FFFF) << 5); + target = NULL; + creg = -1; + break; + default: + UNREACHABLE; + } + } + EMIT(pos, a64_epilogue); + hashx_vm_rx(code, COMP_CODE_SIZE); +#ifdef __GNUC__ + __builtin___clear_cache(code, pos); +#endif +} + +#endif diff --git a/src/ext/equix/hashx/src/compiler_x86.c b/src/ext/equix/hashx/src/compiler_x86.c new file mode 100644 index 0000000000..0a1d9efd8f --- /dev/null +++ b/src/ext/equix/hashx/src/compiler_x86.c @@ -0,0 +1,151 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <string.h> + +#include "compiler.h" +#include "program.h" +#include "virtual_memory.h" +#include "unreachable.h" + +#if defined(_WIN32) || defined(__CYGWIN__) +#define WINABI +#endif + +#define EMIT(p,x) do { \ + memcpy(p, x, sizeof(x)); \ + p += sizeof(x); \ + } while (0) +#define EMIT_BYTE(p,x) *((p)++) = x +#define EMIT_U16(p,x) *((uint16_t*)(p)) = x; p += sizeof(uint16_t) +#define EMIT_U32(p,x) *((uint32_t*)(p)) = x; p += sizeof(uint32_t) +#define EMIT_U64(p,x) *((uint64_t*)(p)) = x; p += sizeof(uint64_t) + +#define GEN_SIB(scale, index, base) ((scale << 6) | (index << 3) | base) + +#ifdef HASHX_COMPILER_X86 + +static const uint8_t x86_prologue[] = { +#ifndef WINABI + 0x48, 0x89, 0xF9, /* mov rcx, rdi */ + 0x48, 0x83, 0xEC, 0x20, /* sub rsp, 32 */ + 0x4C, 0x89, 0x24, 0x24, /* mov qword ptr [rsp+0], r12 */ + 0x4C, 0x89, 0x6C, 0x24, 0x08, /* mov qword ptr [rsp+8], r13 */ + 0x4C, 0x89, 0x74, 0x24, 0x10, /* mov qword ptr [rsp+16], r14 */ + 0x4C, 0x89, 0x7C, 0x24, 0x18, /* mov qword ptr [rsp+24], r15 */ +#else + 0x4C, 0x89, 0x64, 0x24, 0x08, /* mov qword ptr [rsp+8], r12 */ + 0x4C, 0x89, 0x6C, 0x24, 0x10, /* mov qword ptr [rsp+16], r13 */ + 0x4C, 0x89, 0x74, 0x24, 0x18, /* mov qword ptr [rsp+24], r14 */ + 0x4C, 0x89, 0x7C, 0x24, 0x20, /* mov qword ptr [rsp+32], r15 */ + 0x48, 0x83, 0xEC, 0x10, /* sub rsp, 16 */ + 0x48, 0x89, 0x34, 0x24, /* mov qword ptr [rsp+0], rsi */ + 0x48, 0x89, 0x7C, 0x24, 0x08, /* mov qword ptr [rsp+8], rdi */ +#endif + 0x31, 0xF6, /* xor esi, esi */ + 0x8D, 0x7E, 0xFF, /* lea edi, [rsi-1] */ + 0x4C, 0x8B, 0x01, /* mov r8, qword ptr [rcx+0] */ + 0x4C, 0x8B, 0x49, 0x08, /* mov r9, qword ptr [rcx+8] */ + 0x4C, 0x8B, 0x51, 0x10, /* mov r10, qword ptr [rcx+16] */ + 0x4C, 0x8B, 0x59, 0x18, /* mov r11, qword ptr [rcx+24] */ + 0x4C, 0x8B, 0x61, 0x20, /* mov r12, qword ptr [rcx+32] */ + 0x4C, 0x8B, 0x69, 0x28, /* mov r13, qword ptr [rcx+40] */ + 0x4C, 0x8B, 0x71, 0x30, /* mov r14, qword ptr [rcx+48] */ + 0x4C, 0x8B, 0x79, 0x38 /* mov r15, qword ptr [rcx+56] */ +}; + +static const uint8_t x86_epilogue[] = { + 0x4C, 0x89, 0x01, /* mov qword ptr [rcx+0], r8 */ + 0x4C, 0x89, 0x49, 0x08, /* mov qword ptr [rcx+8], r9 */ + 0x4C, 0x89, 0x51, 0x10, /* mov qword ptr [rcx+16], r10 */ + 0x4C, 0x89, 0x59, 0x18, /* mov qword ptr [rcx+24], r11 */ + 0x4C, 0x89, 0x61, 0x20, /* mov qword ptr [rcx+32], r12 */ + 0x4C, 0x89, 0x69, 0x28, /* mov qword ptr [rcx+40], r13 */ + 0x4C, 0x89, 0x71, 0x30, /* mov qword ptr [rcx+48], r14 */ + 0x4C, 0x89, 0x79, 0x38, /* mov qword ptr [rcx+56], r15 */ +#ifndef WINABI + 0x4C, 0x8B, 0x24, 0x24, /* mov r12, qword ptr [rsp+0] */ + 0x4C, 0x8B, 0x6C, 0x24, 0x08, /* mov r13, qword ptr [rsp+8] */ + 0x4C, 0x8B, 0x74, 0x24, 0x10, /* mov r14, qword ptr [rsp+16] */ + 0x4C, 0x8B, 0x7C, 0x24, 0x18, /* mov r15, qword ptr [rsp+24] */ + 0x48, 0x83, 0xC4, 0x20, /* add rsp, 32 */ +#else + 0x48, 0x8B, 0x34, 0x24, /* mov rsi, qword ptr [rsp+0] */ + 0x48, 0x8B, 0x7C, 0x24, 0x08, /* mov rdi, qword ptr [rsp+8] */ + 0x48, 0x83, 0xC4, 0x10, /* add rsp, 16 */ + 0x4C, 0x8B, 0x64, 0x24, 0x08, /* mov r12, qword ptr [rsp+8] */ + 0x4C, 0x8B, 0x6C, 0x24, 0x10, /* mov r13, qword ptr [rsp+16] */ + 0x4C, 0x8B, 0x74, 0x24, 0x18, /* mov r14, qword ptr [rsp+24] */ + 0x4C, 0x8B, 0x7C, 0x24, 0x20, /* mov r15, qword ptr [rsp+32] */ +#endif + 0xC3 /* ret */ +}; + +void hashx_compile_x86(const hashx_program* program, uint8_t* code) { + hashx_vm_rw(code, COMP_CODE_SIZE); + uint8_t* pos = code; + uint8_t* target = NULL; + EMIT(pos, x86_prologue); + for (int i = 0; i < program->code_size; ++i) { + const instruction* instr = &program->code[i]; + switch (instr->opcode) + { + case INSTR_UMULH_R: + EMIT_U64(pos, 0x8b4ce0f749c08b49 | + (((uint64_t)instr->src) << 40) | + (((uint64_t)instr->dst) << 16)); + EMIT_BYTE(pos, 0xc2 + 8 * instr->dst); + break; + case INSTR_SMULH_R: + EMIT_U64(pos, 0x8b4ce8f749c08b49 | + (((uint64_t)instr->src) << 40) | + (((uint64_t)instr->dst) << 16)); + EMIT_BYTE(pos, 0xc2 + 8 * instr->dst); + break; + case INSTR_MUL_R: + EMIT_U32(pos, 0xc0af0f4d | (instr->dst << 27) | (instr->src << 24)); + break; + case INSTR_SUB_R: + EMIT_U16(pos, 0x2b4d); + EMIT_BYTE(pos, 0xc0 | (instr->dst << 3) | instr->src); + break; + case INSTR_XOR_R: + EMIT_U16(pos, 0x334d); + EMIT_BYTE(pos, 0xc0 | (instr->dst << 3) | instr->src); + break; + case INSTR_ADD_RS: + EMIT_U32(pos, 0x00048d4f | + (instr->dst << 19) | + GEN_SIB(instr->imm32, instr->src, instr->dst) << 24); + break; + case INSTR_ROR_C: + EMIT_U32(pos, 0x00c8c149 | (instr->dst << 16) | (instr->imm32 << 24)); + break; + case INSTR_ADD_C: + EMIT_U16(pos, 0x8149); + EMIT_BYTE(pos, 0xc0 | instr->dst); + EMIT_U32(pos, instr->imm32); + break; + case INSTR_XOR_C: + EMIT_U16(pos, 0x8149); + EMIT_BYTE(pos, 0xf0 | instr->dst); + EMIT_U32(pos, instr->imm32); + break; + case INSTR_TARGET: + target = pos; /* +2 */ + EMIT_U32(pos, 0x440fff85); + EMIT_BYTE(pos, 0xf7); + break; + case INSTR_BRANCH: + EMIT_U64(pos, ((uint64_t)instr->imm32) << 32 | 0xc2f7f209); + EMIT_U16(pos, ((target - pos) << 8) | 0x74); + break; + default: + UNREACHABLE; + } + } + EMIT(pos, x86_epilogue); + hashx_vm_rx(code, COMP_CODE_SIZE); +} + +#endif diff --git a/src/ext/equix/hashx/src/context.c b/src/ext/equix/hashx/src/context.c new file mode 100644 index 0000000000..de2144d46c --- /dev/null +++ b/src/ext/equix/hashx/src/context.c @@ -0,0 +1,81 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdlib.h> +#include <string.h> + +#include <hashx.h> +#include "context.h" +#include "compiler.h" +#include "program.h" + +#define STRINGIZE_INNER(x) #x +#define STRINGIZE(x) STRINGIZE_INNER(x) + +/* Salt used when generating hash functions. Useful for domain separation. */ +#ifndef HASHX_SALT +#define HASHX_SALT HashX v1 +#endif + +/* Blake2b params used to generate program keys */ +const blake2b_param hashx_blake2_params = { + .digest_length = 64, + .key_length = 0, + .fanout = 1, + .depth = 1, + .leaf_length = 0, + .node_offset = 0, + .node_depth = 0, + .inner_length = 0, + .reserved = { 0 }, + .salt = STRINGIZE(HASHX_SALT), + .personal = { 0 } +}; + +hashx_ctx* hashx_alloc(hashx_type type) { + if (!HASHX_COMPILER && (type & HASHX_COMPILED)) { + return HASHX_NOTSUPP; + } + hashx_ctx* ctx = malloc(sizeof(hashx_ctx)); + if (ctx == NULL) { + goto failure; + } + ctx->code = NULL; + if (type & HASHX_COMPILED) { + if (!hashx_compiler_init(ctx)) { + goto failure; + } + ctx->type = HASHX_COMPILED; + } + else { + ctx->program = malloc(sizeof(hashx_program)); + if (ctx->program == NULL) { + goto failure; + } + ctx->type = HASHX_INTERPRETED; + } +#ifdef HASHX_BLOCK_MODE + memcpy(&ctx->params, &hashx_blake2_params, 32); +#endif +#ifndef NDEBUG + ctx->has_program = false; +#endif + return ctx; +failure: + hashx_free(ctx); + return NULL; +} + +void hashx_free(hashx_ctx* ctx) { + if (ctx != NULL && ctx != HASHX_NOTSUPP) { + if (ctx->code != NULL) { + if (ctx->type & HASHX_COMPILED) { + hashx_compiler_destroy(ctx); + } + else { + free(ctx->program); + } + } + free(ctx); + } +} diff --git a/src/ext/equix/hashx/src/context.h b/src/ext/equix/hashx/src/context.h new file mode 100644 index 0000000000..40736397f8 --- /dev/null +++ b/src/ext/equix/hashx/src/context.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef CONTEXT_H +#define CONTEXT_H + +#include <stdbool.h> + +#include "hashx.h" +#include "blake2.h" +#include "siphash.h" + +typedef void program_func(uint64_t r[8]); + +#ifdef __cplusplus +extern "C" { +#endif + +HASHX_PRIVATE extern const blake2b_param hashx_blake2_params; + +#ifdef __cplusplus +} +#endif + +typedef struct hashx_program hashx_program; + +/* HashX context. */ +typedef struct hashx_ctx { + union { + uint8_t* code; + program_func* func; + hashx_program* program; + }; + hashx_type type; +#ifndef HASHX_BLOCK_MODE + siphash_state keys; +#else + blake2b_param params; +#endif +#ifndef NDEBUG + bool has_program; +#endif +} hashx_ctx; + +#endif diff --git a/src/ext/equix/hashx/src/force_inline.h b/src/ext/equix/hashx/src/force_inline.h new file mode 100644 index 0000000000..d9af3f32e8 --- /dev/null +++ b/src/ext/equix/hashx/src/force_inline.h @@ -0,0 +1,9 @@ +#ifndef FORCE_INLINE +#if defined(_MSC_VER) +#define FORCE_INLINE __inline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif +#endif \ No newline at end of file diff --git a/src/ext/equix/hashx/src/hashx.c b/src/ext/equix/hashx/src/hashx.c new file mode 100644 index 0000000000..1f5715dce8 --- /dev/null +++ b/src/ext/equix/hashx/src/hashx.c @@ -0,0 +1,134 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include <hashx.h> +#include "blake2.h" +#include "hashx_endian.h" +#include "program.h" +#include "context.h" +#include "compiler.h" + +#if HASHX_SIZE > 32 +#error HASHX_SIZE cannot be more than 32 +#endif + +#ifndef HASHX_BLOCK_MODE +#define HASHX_INPUT_ARGS input +#else +#define HASHX_INPUT_ARGS input, size +#endif + +static int initialize_program(hashx_ctx* ctx, hashx_program* program, + siphash_state keys[2]) { + + if (!hashx_program_generate(&keys[0], program)) { + return 0; + } +#ifndef HASHX_BLOCK_MODE + memcpy(&ctx->keys, &keys[1], 32); +#else + memcpy(&ctx->params.salt, &keys[1], 32); +#endif +#ifndef NDEBUG + ctx->has_program = true; +#endif + return 1; +} + +int hashx_make(hashx_ctx* ctx, const void* seed, size_t size) { + assert(ctx != NULL && ctx != HASHX_NOTSUPP); + assert(seed != NULL || size == 0); + siphash_state keys[2]; + blake2b_state hash_state; + hashx_blake2b_init_param(&hash_state, &hashx_blake2_params); + hashx_blake2b_update(&hash_state, seed, size); + hashx_blake2b_final(&hash_state, &keys, sizeof(keys)); + if (ctx->type & HASHX_COMPILED) { + hashx_program program; + if (!initialize_program(ctx, &program, keys)) { + return 0; + } + hashx_compile(&program, ctx->code); + return 1; + } + return initialize_program(ctx, ctx->program, keys); +} + +void hashx_exec(const hashx_ctx* ctx, HASHX_INPUT, void* output) { + assert(ctx != NULL && ctx != HASHX_NOTSUPP); + assert(output != NULL); + assert(ctx->has_program); + uint64_t r[8]; +#ifndef HASHX_BLOCK_MODE + hashx_siphash24_ctr_state512(&ctx->keys, input, r); +#else + hashx_blake2b_4r(&ctx->params, input, size, r); +#endif + + if (ctx->type & HASHX_COMPILED) { + ctx->func(r); + } + else { + hashx_program_execute(ctx->program, r); + } + + /* Hash finalization to remove bias toward 0 caused by multiplications */ +#ifndef HASHX_BLOCK_MODE + r[0] += ctx->keys.v0; + r[1] += ctx->keys.v1; + r[6] += ctx->keys.v2; + r[7] += ctx->keys.v3; +#else + const uint8_t* p = (const uint8_t*)&ctx->params; + r[0] ^= load64(&p[8 * 0]); + r[1] ^= load64(&p[8 * 1]); + r[2] ^= load64(&p[8 * 2]); + r[3] ^= load64(&p[8 * 3]); + r[4] ^= load64(&p[8 * 4]); + r[5] ^= load64(&p[8 * 5]); + r[6] ^= load64(&p[8 * 6]); + r[7] ^= load64(&p[8 * 7]); +#endif + /* 1 SipRound per 4 registers is enough to pass SMHasher. */ + SIPROUND(r[0], r[1], r[2], r[3]); + SIPROUND(r[4], r[5], r[6], r[7]); + + /* output */ +#if HASHX_SIZE > 0 + /* optimized output for hash sizes that are multiples of 8 */ +#if HASHX_SIZE % 8 == 0 + uint8_t* temp_out = (uint8_t*)output; +#if HASHX_SIZE >= 8 + store64(temp_out + 0, r[0] ^ r[4]); +#endif +#if HASHX_SIZE >= 16 + store64(temp_out + 8, r[1] ^ r[5]); +#endif +#if HASHX_SIZE >= 24 + store64(temp_out + 16, r[2] ^ r[6]); +#endif +#if HASHX_SIZE >= 32 + store64(temp_out + 24, r[3] ^ r[7]); +#endif +#else /* any output size */ + uint8_t temp_out[32]; +#if HASHX_SIZE > 0 + store64(temp_out + 0, r[0] ^ r[4]); +#endif +#if HASHX_SIZE > 8 + store64(temp_out + 8, r[1] ^ r[5]); +#endif +#if HASHX_SIZE > 16 + store64(temp_out + 16, r[2] ^ r[6]); +#endif +#if HASHX_SIZE > 24 + store64(temp_out + 24, r[3] ^ r[7]); +#endif + memcpy(output, temp_out, HASHX_SIZE); +#endif +#endif +} diff --git a/src/ext/equix/hashx/src/hashx_endian.h b/src/ext/equix/hashx/src/hashx_endian.h new file mode 100644 index 0000000000..23537cfba5 --- /dev/null +++ b/src/ext/equix/hashx/src/hashx_endian.h @@ -0,0 +1,103 @@ +#ifndef ENDIAN_H +#define ENDIAN_H + +#include <stdint.h> +#include <string.h> +#include "force_inline.h" + +/* Argon2 Team - Begin Code */ +/* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. +*/ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif +/* Argon2 Team - End Code */ + +static FORCE_INLINE uint32_t load32(const void* src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t* p = (const uint8_t*)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static FORCE_INLINE uint64_t load64_native(const void* src) { + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +} + +static FORCE_INLINE uint64_t load64(const void* src) { +#if defined(NATIVE_LITTLE_ENDIAN) + return load64_native(src); +#else + const uint8_t* p = (const uint8_t*)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static FORCE_INLINE void store32(void* dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t* p = (uint8_t*)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static FORCE_INLINE void store64_native(void* dst, uint64_t w) { + memcpy(dst, &w, sizeof w); +} + +static FORCE_INLINE void store64(void* dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + store64_native(dst, w); +#else + uint8_t* p = (uint8_t*)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} +#endif diff --git a/src/ext/equix/hashx/src/hashx_thread.c b/src/ext/equix/hashx/src/hashx_thread.c new file mode 100644 index 0000000000..6618378e66 --- /dev/null +++ b/src/ext/equix/hashx/src/hashx_thread.c @@ -0,0 +1,27 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "hashx_thread.h" + +hashx_thread hashx_thread_create(hashx_thread_func* func, void* args) { +#ifdef HASHX_WIN + return CreateThread(NULL, 0, func, args, 0, NULL); +#else + hashx_thread thread; + if (pthread_create(&thread, NULL, func, args) != 0) + { + thread = 0; + } + return thread; +#endif +} + +void hashx_thread_join(hashx_thread thread) { +#ifdef HASHX_WIN + WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); +#else + void* retval; + pthread_join(thread, &retval); +#endif +} diff --git a/src/ext/equix/hashx/src/hashx_thread.h b/src/ext/equix/hashx/src/hashx_thread.h new file mode 100644 index 0000000000..01556626d8 --- /dev/null +++ b/src/ext/equix/hashx/src/hashx_thread.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef HASHX_THREAD_H +#define HASHX_THREAD_H + +#include <hashx.h> + +#ifdef HASHX_WIN +#include <Windows.h> +typedef HANDLE hashx_thread; +typedef DWORD hashx_thread_retval; +#define HASHX_THREAD_SUCCESS 0 +#else +#include <pthread.h> +typedef pthread_t hashx_thread; +typedef void* hashx_thread_retval; +#define HASHX_THREAD_SUCCESS NULL +#endif + +typedef hashx_thread_retval hashx_thread_func(void* args); + +hashx_thread hashx_thread_create(hashx_thread_func* func, void* args); + +void hashx_thread_join(hashx_thread thread); + +#endif diff --git a/src/ext/equix/hashx/src/hashx_time.c b/src/ext/equix/hashx/src/hashx_time.c new file mode 100644 index 0000000000..d3eaaed10e --- /dev/null +++ b/src/ext/equix/hashx/src/hashx_time.c @@ -0,0 +1,35 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "hashx_time.h" +#include <hashx.h> + +#if defined(HASHX_WIN) +#include <windows.h> +#else +#include <sys/time.h> +#endif + +double hashx_time() { +#ifdef HASHX_WIN + static double freq = 0; + if (freq == 0) { + LARGE_INTEGER freq_long; + if (!QueryPerformanceFrequency(&freq_long)) { + return 0; + } + freq = freq_long.QuadPart; + } + LARGE_INTEGER time; + if (!QueryPerformanceCounter(&time)) { + return 0; + } + return time.QuadPart / freq; +#else + struct timeval time; + if (gettimeofday(&time, NULL) != 0) { + return 0; + } + return (double)time.tv_sec + (double)time.tv_usec * 1.0e-6; +#endif +} diff --git a/src/ext/equix/hashx/src/hashx_time.h b/src/ext/equix/hashx/src/hashx_time.h new file mode 100644 index 0000000000..da1ca746fc --- /dev/null +++ b/src/ext/equix/hashx/src/hashx_time.h @@ -0,0 +1,9 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef HASHX_TIME_H +#define HASHX_TIME_H + +double hashx_time(void); + +#endif diff --git a/src/ext/equix/hashx/src/instruction.h b/src/ext/equix/hashx/src/instruction.h new file mode 100644 index 0000000000..f17582ffea --- /dev/null +++ b/src/ext/equix/hashx/src/instruction.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef INSTRUCTION_H +#define INSTRUCTION_H + +#include <stdint.h> + +typedef enum instr_type { + INSTR_UMULH_R, /* unsigned high multiplication by a register */ + INSTR_SMULH_R, /* signed high multiplication by a register */ + INSTR_MUL_R, /* multiplication by a register */ + INSTR_SUB_R, /* subtraction of a register */ + INSTR_XOR_R, /* xor with a register */ + INSTR_ADD_RS, /* addition of a shifted register */ + INSTR_ROR_C, /* rotation by a constant */ + INSTR_ADD_C, /* addition of a constant */ + INSTR_XOR_C, /* xor with a constant */ + INSTR_TARGET, /* branch instruction target */ + INSTR_BRANCH, /* conditional branch */ +} instr_type; + +typedef struct instruction { + instr_type opcode; + int src; + int dst; + uint32_t imm32; + uint32_t op_par; +} instruction; + +#endif diff --git a/src/ext/equix/hashx/src/program.c b/src/ext/equix/hashx/src/program.c new file mode 100644 index 0000000000..7f4b0cef0a --- /dev/null +++ b/src/ext/equix/hashx/src/program.c @@ -0,0 +1,771 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "program.h" +#include "unreachable.h" +#include "siphash_rng.h" + +/* instructions are generated until this CPU cycle */ +#define TARGET_CYCLE 192 + +/* requirements for the program to be acceptable */ +#define REQUIREMENT_SIZE 512 +#define REQUIREMENT_MUL_COUNT 192 +#define REQUIREMENT_LATENCY 195 + +/* R5 (x86 = r13) register cannot be used as the destination of INSTR_ADD_RS */ +#define REGISTER_NEEDS_DISPLACEMENT 5 + +#define PORT_MAP_SIZE (TARGET_CYCLE + 4) +#define NUM_PORTS 3 +#define MAX_RETRIES 1 +#define LOG2_BRANCH_PROB 4 +#define BRANCH_MASK 0x80000000 + +#define TRACE false +#define TRACE_PRINT(...) do { if (TRACE) printf(__VA_ARGS__); } while (false) + +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* If the instruction is a multiplication. */ +static inline bool is_mul(instr_type type) { + return type <= INSTR_MUL_R; +} + +/* If the instruction is a 64x64->128 bit multiplication. */ +static inline bool is_wide_mul(instr_type type) { + return type < INSTR_MUL_R; +} + +/* Ivy Bridge integer execution ports: P0, P1, P5 */ +typedef enum execution_port { + PORT_NONE = 0, + PORT_P0 = 1, + PORT_P1 = 2, + PORT_P5 = 4, + PORT_P01 = PORT_P0 | PORT_P1, + PORT_P05 = PORT_P0 | PORT_P5, + PORT_P015 = PORT_P0 | PORT_P1 | PORT_P5 +} execution_port; + +static const char* execution_port_names[] = { + "PORT_NONE", "PORT_P0", "PORT_P1", "PORT_P01", "PORT_P5", "PORT_P05", "PORT_P15", "PORT_P015" +}; + +typedef struct instr_template { + instr_type type; /* instruction type */ + const char* x86_asm; /* x86 assembly */ + int x86_size; /* x86 code size */ + int latency; /* latency in cycles */ + execution_port uop1; /* ex. ports for the 1st uop */ + execution_port uop2; /* ex. ports for the 2nd uop */ + uint32_t immediate_mask; /* mask for imm32 */ + instr_type group; /* instruction group */ + bool imm_can_be_0; /* if imm32 can be zero */ + bool distinct_dst; /* if dst and src must be distinct */ + bool op_par_src; /* operation parameter is equal to src */ + bool has_src; /* if the instruction has a source operand */ + bool has_dst; /* if the instr. has a destination operand */ +} instr_template; + +typedef struct register_info { + int latency; /* cycle when the register value will be ready */ + instr_type last_op; /* last op applied to the register */ + int last_op_par; /* parameter of the last op (-1 = constant) */ +} register_info; + +typedef struct program_item { + const instr_template** templates; + uint32_t mask0; + uint32_t mask1; + bool duplicates; +} program_item; + +typedef struct generator_ctx { + int cycle; + int sub_cycle; + int mul_count; + bool chain_mul; + int latency; + siphash_rng gen; + register_info registers[8]; + execution_port ports[PORT_MAP_SIZE][NUM_PORTS]; +} generator_ctx; + +const static instr_template tpl_umulh_r = { + .type = INSTR_UMULH_R, + .x86_asm = "mul r", + .x86_size = 9, /* mov, mul, mov */ + .latency = 4, + .uop1 = PORT_P1, + .uop2 = PORT_P5, + .immediate_mask = 0, + .group = INSTR_UMULH_R, + .imm_can_be_0 = false, + .distinct_dst = false, + .op_par_src = false, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_smulh_r = { + .type = INSTR_SMULH_R, + .x86_asm = "imul r", + .x86_size = 9, /* mov, mul, mov */ + .latency = 4, + .uop1 = PORT_P1, + .uop2 = PORT_P5, + .immediate_mask = 0, + .group = INSTR_SMULH_R, + .imm_can_be_0 = false, + .distinct_dst = false, + .op_par_src = false, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_mul_r = { + .type = INSTR_MUL_R, + .x86_asm = "imul r,r", + .x86_size = 4, + .latency = 3, + .uop1 = PORT_P1, + .uop2 = PORT_NONE, + .immediate_mask = 0, + .group = INSTR_MUL_R, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = true, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_sub_r = { + .type = INSTR_SUB_R, + .x86_asm = "sub r,r", + .x86_size = 3, + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_NONE, + .immediate_mask = 0, + .group = INSTR_ADD_RS, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = true, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_xor_r = { + .type = INSTR_XOR_R, + .x86_asm = "xor r,r", + .x86_size = 3, + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_NONE, + .immediate_mask = 0, + .group = INSTR_XOR_R, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = true, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_add_rs = { + .type = INSTR_ADD_RS, + .x86_asm = "lea r,r+r*s", + .x86_size = 4, + .latency = 1, + .uop1 = PORT_P01, + .uop2 = PORT_NONE, + .immediate_mask = 3, + .group = INSTR_ADD_RS, + .imm_can_be_0 = true, + .distinct_dst = true, + .op_par_src = true, + .has_src = true, + .has_dst = true, +}; + +const static instr_template tpl_ror_c = { + .type = INSTR_ROR_C, + .x86_asm = "ror r,i", + .x86_size = 4, + .latency = 1, + .uop1 = PORT_P05, + .uop2 = PORT_NONE, + .immediate_mask = 63, + .group = INSTR_ROR_C, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = false, + .has_src = false, + .has_dst = true, +}; + +const static instr_template tpl_add_c = { + .type = INSTR_ADD_C, + .x86_asm = "add r,i", + .x86_size = 7, + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_NONE, + .immediate_mask = UINT32_MAX, + .group = INSTR_ADD_C, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = false, + .has_src = false, + .has_dst = true, +}; + +const static instr_template tpl_xor_c = { + .type = INSTR_XOR_C, + .x86_asm = "xor r,i", + .x86_size = 7, + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_NONE, + .immediate_mask = UINT32_MAX, + .group = INSTR_XOR_C, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = false, + .has_src = false, + .has_dst = true, +}; + + +const static instr_template tpl_target = { + .type = INSTR_TARGET, + .x86_asm = "cmovz esi, edi", + .x86_size = 5, /* test, cmovz */ + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_P015, + .immediate_mask = 0, + .group = INSTR_TARGET, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = false, + .has_src = false, + .has_dst = false, +}; + +const static instr_template tpl_branch = { + .type = INSTR_BRANCH, + .x86_asm = "jz target", + .x86_size = 10, /* or, test, jz */ + .latency = 1, + .uop1 = PORT_P015, + .uop2 = PORT_P015, + .immediate_mask = BRANCH_MASK, + .group = INSTR_BRANCH, + .imm_can_be_0 = false, + .distinct_dst = true, + .op_par_src = false, + .has_src = false, + .has_dst = false, +}; + +const static instr_template* instr_lookup[] = { + &tpl_ror_c, + &tpl_xor_c, + &tpl_add_c, + &tpl_add_c, + &tpl_sub_r, + &tpl_xor_r, + &tpl_xor_c, + &tpl_add_rs, +}; + +const static instr_template* wide_mul_lookup[] = { + &tpl_smulh_r, + &tpl_umulh_r +}; + +const static instr_template* mul_lookup = &tpl_mul_r; +const static instr_template* target_lookup = &tpl_target; +const static instr_template* branch_lookup = &tpl_branch; + +const static program_item item_mul = { + .templates = &mul_lookup, + .mask0 = 0, + .mask1 = 0, + .duplicates = true +}; + +const static program_item item_target = { + .templates = &target_lookup, + .mask0 = 0, + .mask1 = 0, + .duplicates = true +}; + +const static program_item item_branch = { + .templates = &branch_lookup, + .mask0 = 0, + .mask1 = 0, + .duplicates = true +}; + +const static program_item item_wide_mul = { + .templates = wide_mul_lookup, + .mask0 = 1, + .mask1 = 1, + .duplicates = true +}; + +const static program_item item_any = { + .templates = instr_lookup, + .mask0 = 7, + .mask1 = 3, /* instructions that don't need a src register */ + .duplicates = false +}; + +const static program_item* program_layout[] = { + &item_mul, + &item_target, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_wide_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_mul, + &item_branch, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_wide_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, + &item_mul, + &item_any, + &item_any, +}; + +static const instr_template* select_template(generator_ctx* ctx, instr_type last_instr, int attempt) { + const program_item* item = program_layout[ctx->sub_cycle % 36]; + const instr_template* tpl; + do { + int index = item->mask0 ? hashx_siphash_rng_u8(&ctx->gen) & (attempt > 0 ? item->mask1 : item->mask0) : 0; + tpl = item->templates[index]; + } while (!item->duplicates && tpl->group == last_instr); + return tpl; +} + +static uint32_t branch_mask(siphash_rng* gen) { + uint32_t mask = 0; + int popcnt = 0; + while (popcnt < LOG2_BRANCH_PROB) { + int bit = hashx_siphash_rng_u8(gen) % 32; + uint32_t bitmask = 1U << bit; + if (!(mask & bitmask)) { + mask |= bitmask; + popcnt++; + } + } + return mask; +} + +static void instr_from_template(const instr_template* tpl, siphash_rng* gen, instruction* instr) { + instr->opcode = tpl->type; + if (tpl->immediate_mask) { + if (tpl->immediate_mask == BRANCH_MASK) { + instr->imm32 = branch_mask(gen); + } + else do { + instr->imm32 = hashx_siphash_rng_u32(gen) & tpl->immediate_mask; + } while (instr->imm32 == 0 && !tpl->imm_can_be_0); + } + if (!tpl->op_par_src) { + if (tpl->distinct_dst) { + instr->op_par = UINT32_MAX; + } + else { + instr->op_par = hashx_siphash_rng_u32(gen); + } + } + if (!tpl->has_src) { + instr->src = -1; + } + if (!tpl->has_dst) { + instr->dst = -1; + } +} + +static bool select_register(int available_regs[8], int regs_count, siphash_rng* gen, int* reg_out) { + if (regs_count == 0) + return false; + + int index; + + if (regs_count > 1) { + index = hashx_siphash_rng_u32(gen) % regs_count; + } + else { + index = 0; + } + *reg_out = available_regs[index]; + return true; +} + +static bool select_destination(const instr_template* tpl, instruction* instr, generator_ctx* ctx, int cycle) { + int available_regs[8]; + int regs_count = 0; + /* Conditions for the destination register: + // * value must be ready at the required cycle + // * cannot be the same as the source register unless the instruction allows it + // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * register cannot be multiplied twice in a row unless chain_mul is true + // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) + // * either the last instruction applied to the register or its source must be different than this instruction + // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" + // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) */ + for (int i = 0; i < 8; ++i) { + bool available = ctx->registers[i].latency <= cycle; + available &= ((!tpl->distinct_dst) | (i != instr->src)); + available &= (ctx->chain_mul | (tpl->group != INSTR_MUL_R) | (ctx->registers[i].last_op != INSTR_MUL_R)); + available &= ((ctx->registers[i].last_op != tpl->group) | (ctx->registers[i].last_op_par != instr->op_par)); + available &= ((instr->opcode != INSTR_ADD_RS) | (i != REGISTER_NEEDS_DISPLACEMENT)); + available_regs[regs_count] = available ? i : 0; + regs_count += available; + } + return select_register(available_regs, regs_count, &ctx->gen, &instr->dst); +} + +static bool select_source(const instr_template* tpl, instruction* instr, generator_ctx* ctx, int cycle) { + int available_regs[8]; + int regs_count = 0; + /* all registers that are ready at the cycle */ + for (int i = 0; i < 8; ++i) { + if (ctx->registers[i].latency <= cycle) + available_regs[regs_count++] = i; + } + /* if there are only 2 available registers for ADD_RS and one of them is r5, select it as the source because it cannot be the destination */ + if (regs_count == 2 && instr->opcode == INSTR_ADD_RS) { + if (available_regs[0] == REGISTER_NEEDS_DISPLACEMENT || available_regs[1] == REGISTER_NEEDS_DISPLACEMENT) { + instr->op_par = instr->src = REGISTER_NEEDS_DISPLACEMENT; + return true; + } + } + if (select_register(available_regs, regs_count, &ctx->gen, &instr->src)) { + if (tpl->op_par_src) + instr->op_par = instr->src; + return true; + } + return false; +} + +static int schedule_uop(execution_port uop, generator_ctx* ctx, int cycle, bool commit) { + /* The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload + port P1 (multiplication) by instructions that can go to any port. */ + for (; cycle < PORT_MAP_SIZE; ++cycle) { + if ((uop & PORT_P5) && !ctx->ports[cycle][2]) { + if (commit) { + ctx->ports[cycle][2] = uop; + } + TRACE_PRINT("%s scheduled to port P5 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit); + return cycle; + } + if ((uop & PORT_P0) && !ctx->ports[cycle][0]) { + if (commit) { + ctx->ports[cycle][0] = uop; + } + TRACE_PRINT("%s scheduled to port P0 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit); + return cycle; + } + if ((uop & PORT_P1) != 0 && !ctx->ports[cycle][1]) { + if (commit) { + ctx->ports[cycle][1] = uop; + } + TRACE_PRINT("%s scheduled to port P1 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit); + return cycle; + } + } + return -1; +} + +static int schedule_instr(const instr_template* tpl, generator_ctx* ctx, bool commit) { + if (tpl->uop2 == PORT_NONE) { + /* this instruction has only one uOP */ + return schedule_uop(tpl->uop1, ctx, ctx->cycle, commit); + } + else { + /* instructions with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle */ + for (int cycle = ctx->cycle; cycle < PORT_MAP_SIZE; ++cycle) { + + int cycle1 = schedule_uop(tpl->uop1, ctx, cycle, false); + int cycle2 = schedule_uop(tpl->uop2, ctx, cycle, false); + + if (cycle1 >= 0 && cycle1 == cycle2) { + if (commit) { + schedule_uop(tpl->uop1, ctx, cycle, true); + schedule_uop(tpl->uop2, ctx, cycle, true); + } + return cycle1; + } + } + } + + return -1; +} + +static void print_registers(const generator_ctx* ctx) { + for (int i = 0; i < 8; ++i) { + printf(" R%i = %i\n", i, ctx->registers[i].latency); + } +} + +bool hashx_program_generate(const siphash_state* key, hashx_program* program) { + generator_ctx ctx = { + .cycle = 0, + .sub_cycle = 0, /* 3 sub-cycles = 1 cycle */ + .mul_count = 0, + .chain_mul = false, + .latency = 0, + .ports = { 0 } + }; + hashx_siphash_rng_init(&ctx.gen, key); + for (int i = 0; i < 8; ++i) { + ctx.registers[i].last_op = -1; + ctx.registers[i].latency = 0; + ctx.registers[i].last_op_par = -1; + } + program->code_size = 0; + + int attempt = 0; + instr_type last_instr = -1; +#ifdef HASHX_PROGRAM_STATS + program->x86_size = 0; +#endif + + while (program->code_size < HASHX_PROGRAM_MAX_SIZE) { + instruction* instr = &program->code[program->code_size]; + TRACE_PRINT("CYCLE: %i/%i\n", ctx.sub_cycle, ctx.cycle); + + /* select an instruction template */ + const instr_template* tpl = select_template(&ctx, last_instr, attempt); + last_instr = tpl->group; + + TRACE_PRINT("Template: %s\n", tpl->x86_asm); + + instr_from_template(tpl, &ctx.gen, instr); + + /* calculate the earliest cycle when this instruction (all of its uOPs) can be scheduled for execution */ + int scheduleCycle = schedule_instr(tpl, &ctx, false); + if (scheduleCycle < 0) { + TRACE_PRINT("Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle); + /* __debugbreak(); */ + break; + } + + ctx.chain_mul = attempt > 0; + + /* find a source register (if applicable) that will be ready when this instruction executes */ + if (tpl->has_src) { + if (!select_source(tpl, instr, &ctx, scheduleCycle)) { + TRACE_PRINT("; src STALL (attempt %i)\n", attempt); + if (attempt++ < MAX_RETRIES) { + continue; + } + if (TRACE) { + printf("; select_source FAILED at cycle %i\n", ctx.cycle); + print_registers(&ctx); + /* __debugbreak(); */ + } + ctx.sub_cycle += 3; + ctx.cycle = ctx.sub_cycle / 3; + attempt = 0; + continue; + } + TRACE_PRINT("; src = r%i\n", instr->src); + } + + /* find a destination register that will be ready when this instruction executes */ + if (tpl->has_dst) { + if (!select_destination(tpl, instr, &ctx, scheduleCycle)) { + TRACE_PRINT("; dst STALL (attempt %i)\n", attempt); + if (attempt++ < MAX_RETRIES) { + continue; + } + if (TRACE) { + printf("; select_destination FAILED at cycle %i\n", ctx.cycle); + print_registers(&ctx); + /* __debugbreak(); */ + } + ctx.sub_cycle += 3; + ctx.cycle = ctx.sub_cycle / 3; + attempt = 0; + continue; + } + TRACE_PRINT("; dst = r%i\n", instr->dst); + } + attempt = 0; + + /* recalculate when the instruction can be scheduled for execution based on operand availability */ + scheduleCycle = schedule_instr(tpl, &ctx, true); + + if (scheduleCycle < 0) { + TRACE_PRINT("Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle); + break; + } + + /* terminating condition */ + if (scheduleCycle >= TARGET_CYCLE) { + break; + } + + if (tpl->has_dst) { + register_info* ri = &ctx.registers[instr->dst]; + int retireCycle = scheduleCycle + tpl->latency; + ri->latency = retireCycle; + ri->last_op = tpl->group; + ri->last_op_par = instr->op_par; + ctx.latency = MAX(retireCycle, ctx.latency); + TRACE_PRINT("; RETIRED at cycle %i\n", retireCycle); + } + + program->code_size++; +#ifdef HASHX_PROGRAM_STATS + program->x86_size += tpl->x86_size; +#endif + + ctx.mul_count += is_mul(instr->opcode); + + ++ctx.sub_cycle; + ctx.sub_cycle += (tpl->uop2 != PORT_NONE); + ctx.cycle = ctx.sub_cycle / 3; + } + +#ifdef HASHX_PROGRAM_STATS + memset(program->asic_latencies, 0, sizeof(program->asic_latencies)); + + program->counter = ctx.gen.counter; + program->wide_mul_count = 0; + program->mul_count = ctx.mul_count; + + /* Calculate ASIC latency: + Assumes 1 cycle latency for all operations and unlimited parallelization. */ + for (int i = 0; i < program->code_size; ++i) { + instruction* instr = &program->code[i]; + if (instr->dst < 0) + continue; + int last_dst = program->asic_latencies[instr->dst] + 1; + int lat_src = instr->dst != instr->src ? program->asic_latencies[instr->src] + 1 : 0; + program->asic_latencies[instr->dst] = MAX(last_dst, lat_src); + program->wide_mul_count += is_wide_mul(instr->opcode); + } + + program->asic_latency = 0; + program->cpu_latency = 0; + for (int i = 0; i < 8; ++i) { + program->asic_latency = MAX(program->asic_latency, program->asic_latencies[i]); + program->cpu_latencies[i] = ctx.registers[i].latency; + program->cpu_latency = MAX(program->cpu_latency, program->cpu_latencies[i]); + } + + program->ipc = program->code_size / (double)program->cpu_latency; + program->branch_count = 0; + memset(program->branches, 0, sizeof(program->branches)); + + if (TRACE) { + printf("; ALU port utilization:\n"); + printf("; (* = in use, _ = idle)\n"); + for (int i = 0; i < PORT_MAP_SIZE; ++i) { + printf("; %3i ", i); + for (int j = 0; j < NUM_PORTS; ++j) { + printf("%c", (ctx.ports[i][j] ? '*' : '_')); + } + printf("\n"); + } + } +#endif + + /* reject programs that don't meet the uniform complexity requirements */ + /* this happens in less than 1 seed out of 10000 */ + return + (program->code_size == REQUIREMENT_SIZE) & + (ctx.mul_count == REQUIREMENT_MUL_COUNT) & + (ctx.latency == REQUIREMENT_LATENCY - 1); /* cycles are numbered from 0 */ +} + +static const char* x86_reg_map[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; + +void hashx_program_asm_x86(const hashx_program* program) { + int target = 0; + for (unsigned i = 0; i < program->code_size; ++i) { + const instruction* instr = &program->code[i]; + switch (instr->opcode) + { + case INSTR_SUB_R: + printf("sub %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]); + break; + case INSTR_XOR_R: + printf("xor %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]); + break; + case INSTR_ADD_RS: + printf("lea %s, [%s+%s*%u]\n", x86_reg_map[instr->dst], x86_reg_map[instr->dst], x86_reg_map[instr->src], 1 << instr->imm32); + break; + case INSTR_MUL_R: + printf("imul %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]); + break; + case INSTR_ROR_C: + printf("ror %s, %u\n", x86_reg_map[instr->dst], instr->imm32); + break; + case INSTR_ADD_C: + printf("add %s, %i\n", x86_reg_map[instr->dst], instr->imm32); + break; + case INSTR_XOR_C: + printf("xor %s, %i\n", x86_reg_map[instr->dst], instr->imm32); + break; + case INSTR_UMULH_R: + printf("mov rax, %s\n", x86_reg_map[instr->dst]); + printf("mul %s\n", x86_reg_map[instr->src]); + printf("mov %s, rdx\n", x86_reg_map[instr->dst]); + break; + case INSTR_SMULH_R: + printf("mov rax, %s\n", x86_reg_map[instr->dst]); + printf("imul %s\n", x86_reg_map[instr->src]); + printf("mov %s, rdx\n", x86_reg_map[instr->dst]); + break; + case INSTR_TARGET: + printf("test edi, edi\n"); + printf("target_%i: cmovz esi, edi\n", i); + target = i; + break; + case INSTR_BRANCH: + printf("or edx, esi\n"); + printf("test edx, %i\n", instr->imm32); + printf("jz target_%i\n", target); + break; + default: + UNREACHABLE; + } + } +} diff --git a/src/ext/equix/hashx/src/program.h b/src/ext/equix/hashx/src/program.h new file mode 100644 index 0000000000..096cc4ee0a --- /dev/null +++ b/src/ext/equix/hashx/src/program.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef PROGRAM_H +#define PROGRAM_H + +#include <stdint.h> +#include <stdbool.h> +#include <hashx.h> +#include "instruction.h" +#include "siphash.h" +#include "blake2.h" + +#define HASHX_PROGRAM_MAX_SIZE 512 + +typedef struct hashx_program { + instruction code[HASHX_PROGRAM_MAX_SIZE]; + size_t code_size; +#ifdef HASHX_PROGRAM_STATS + unsigned counter; + double ipc; + int x86_size; + int cpu_latency; + int asic_latency; + int mul_count; + int wide_mul_count; + int cpu_latencies[8]; + int asic_latencies[8]; + int branch_count; + int branches[16]; +#endif +} hashx_program; + +#ifdef __cplusplus +extern "C" { +#endif + +HASHX_PRIVATE bool hashx_program_generate(const siphash_state* key, hashx_program* program); + +HASHX_PRIVATE void hashx_program_execute(const hashx_program* program, uint64_t r[8]); + +HASHX_PRIVATE void hashx_program_asm_x86(const hashx_program* program); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/ext/equix/hashx/src/program_exec.c b/src/ext/equix/hashx/src/program_exec.c new file mode 100644 index 0000000000..f8b991e01e --- /dev/null +++ b/src/ext/equix/hashx/src/program_exec.c @@ -0,0 +1,152 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "program.h" +#include "force_inline.h" +#include "unreachable.h" +#include "siphash.h" +#include "hashx_endian.h" + +#if defined(__SIZEOF_INT128__) +typedef unsigned __int128 uint128_t; +typedef __int128 int128_t; +static FORCE_INLINE uint64_t umulh(uint64_t a, uint64_t b) { + return ((uint128_t)a * b) >> 64; + } +static FORCE_INLINE int64_t smulh(int64_t a, int64_t b) { + return ((int128_t)a * b) >> 64; +} +#define HAVE_UMULH +#define HAVE_SMULH +#endif + +#if defined(_MSC_VER) +#pragma warning (disable : 4146) /* unary minus applied to unsigned type */ +#define HAS_VALUE(X) X ## 0 +#define EVAL_DEFINE(X) HAS_VALUE(X) +#include <intrin.h> +#include <stdlib.h> + +static FORCE_INLINE uint64_t rotr64(uint64_t x, unsigned int c) { + return _rotr64(x, c); +} + +#define HAVE_ROTR + +#if EVAL_DEFINE(__MACHINEARM64_X64(1)) +static FORCE_INLINE uint64_t umulh(uint64_t a, uint64_t b) { + return __umulh(a, b); +} +#define HAVE_UMULH +#endif + +#if EVAL_DEFINE(__MACHINEX64(1)) +static FORCE_INLINE int64_t smulh(int64_t a, int64_t b) { + int64_t hi; + _mul128(a, b, &hi); + return hi; +} +#define HAVE_SMULH +#endif + +#endif + +#ifndef HAVE_ROTR +static FORCE_INLINE uint64_t rotr64(uint64_t a, unsigned int b) { + return (a >> b) | (a << (64 - b)); +} +#define HAVE_ROTR +#endif + +#ifndef HAVE_UMULH +#define LO(x) ((x)&0xffffffff) +#define HI(x) ((x)>>32) +uint64_t umulh(uint64_t a, uint64_t b) { + uint64_t ah = HI(a), al = LO(a); + uint64_t bh = HI(b), bl = LO(b); + uint64_t x00 = al * bl; + uint64_t x01 = al * bh; + uint64_t x10 = ah * bl; + uint64_t x11 = ah * bh; + uint64_t m1 = LO(x10) + LO(x01) + HI(x00); + uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1); + uint64_t m3 = HI(x11) + HI(m2); + + return (m3 << 32) + LO(m2); +} +#undef LO +#undef HI +#define HAVE_UMULH +#endif + +#ifndef HAVE_SMULH +int64_t smulh(int64_t a, int64_t b) { + int64_t hi = umulh(a, b); + if (a < 0LL) hi -= b; + if (b < 0LL) hi -= a; + return hi; +} +#define HAVE_SMULH +#endif + +static FORCE_INLINE uint64_t sign_extend_2s_compl(uint32_t x) { + return (-1 == ~0) ? + (int64_t)(int32_t)(x) : + (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); +} + +void hashx_program_execute(const hashx_program* program, uint64_t r[8]) { + int target = 0; + bool branch_enable = true; + uint32_t result = 0; + int branch_idx = 0; + for (int i = 0; i < program->code_size; ++i) { + const instruction* instr = &program->code[i]; + switch (instr->opcode) + { + case INSTR_UMULH_R: + result = r[instr->dst] = umulh(r[instr->dst], r[instr->src]); + break; + case INSTR_SMULH_R: + result = r[instr->dst] = smulh(r[instr->dst], r[instr->src]); + break; + case INSTR_MUL_R: + r[instr->dst] *= r[instr->src]; + break; + case INSTR_SUB_R: + r[instr->dst] -= r[instr->src]; + break; + case INSTR_XOR_R: + r[instr->dst] ^= r[instr->src]; + break; + case INSTR_ADD_RS: + r[instr->dst] += r[instr->src] << instr->imm32; + break; + case INSTR_ROR_C: + r[instr->dst] = rotr64(r[instr->dst], instr->imm32); + break; + case INSTR_ADD_C: + r[instr->dst] += sign_extend_2s_compl(instr->imm32); + break; + case INSTR_XOR_C: + r[instr->dst] ^= sign_extend_2s_compl(instr->imm32); + break; + case INSTR_TARGET: + target = i; + break; + case INSTR_BRANCH: + if (branch_enable && (result & instr->imm32) == 0) { + i = target; + branch_enable = false; +#ifdef HASHX_PROGRAM_STATS + ((hashx_program*)program)->branch_count++; + ((hashx_program*)program)->branches[branch_idx]++; +#endif + } + branch_idx++; + break; + default: + UNREACHABLE; + } + } +} diff --git a/src/ext/equix/hashx/src/siphash.c b/src/ext/equix/hashx/src/siphash.c new file mode 100644 index 0000000000..0acfca8814 --- /dev/null +++ b/src/ext/equix/hashx/src/siphash.c @@ -0,0 +1,66 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "siphash.h" +#include "hashx_endian.h" +#include "unreachable.h" + +uint64_t hashx_siphash13_ctr(uint64_t input, const siphash_state* keys) { + uint64_t v0 = keys->v0; + uint64_t v1 = keys->v1; + uint64_t v2 = keys->v2; + uint64_t v3 = keys->v3; + + v3 ^= input; + + SIPROUND(v0, v1, v2, v3); + + v0 ^= input; + v2 ^= 0xff; + + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + + return (v0 ^ v1) ^ (v2 ^ v3); +} + +void hashx_siphash24_ctr_state512(const siphash_state* keys, uint64_t input, + uint64_t state_out[8]) { + + uint64_t v0 = keys->v0; + uint64_t v1 = keys->v1; + uint64_t v2 = keys->v2; + uint64_t v3 = keys->v3; + + v1 ^= 0xee; + v3 ^= input; + + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + + v0 ^= input; + v2 ^= 0xee; + + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + + state_out[0] = v0; + state_out[1] = v1; + state_out[2] = v2; + state_out[3] = v3; + + v1 ^= 0xdd; + + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + SIPROUND(v0, v1, v2, v3); + + state_out[4] = v0; + state_out[5] = v1; + state_out[6] = v2; + state_out[7] = v3; +} diff --git a/src/ext/equix/hashx/src/siphash.h b/src/ext/equix/hashx/src/siphash.h new file mode 100644 index 0000000000..bb468402c3 --- /dev/null +++ b/src/ext/equix/hashx/src/siphash.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef SIPHASH_H +#define SIPHASH_H + +#include <stdint.h> +#include <hashx.h> + +#define ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b)))) +#define SIPROUND(v0, v1, v2, v3) \ + do { \ + v0 += v1; v2 += v3; v1 = ROTL(v1, 13); \ + v3 = ROTL(v3, 16); v1 ^= v0; v3 ^= v2; \ + v0 = ROTL(v0, 32); v2 += v1; v0 += v3; \ + v1 = ROTL(v1, 17); v3 = ROTL(v3, 21); \ + v1 ^= v2; v3 ^= v0; v2 = ROTL(v2, 32); \ + } while (0) + +typedef struct siphash_state { + uint64_t v0, v1, v2, v3; +} siphash_state; + +#ifdef __cplusplus +extern "C" { +#endif + +HASHX_PRIVATE uint64_t hashx_siphash13_ctr(uint64_t input, const siphash_state* keys); +HASHX_PRIVATE void hashx_siphash24_ctr_state512(const siphash_state* keys, uint64_t input, uint64_t state_out[8]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/ext/equix/hashx/src/siphash_rng.c b/src/ext/equix/hashx/src/siphash_rng.c new file mode 100644 index 0000000000..f1ec23bf47 --- /dev/null +++ b/src/ext/equix/hashx/src/siphash_rng.c @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "siphash_rng.h" + +void hashx_siphash_rng_init(siphash_rng* gen, const siphash_state* state) { + gen->keys = *state; + gen->counter = 0; + gen->count8 = 0; + gen->count32 = 0; +} + +uint8_t hashx_siphash_rng_u8(siphash_rng* gen) { + if (gen->count8 == 0) { + gen->buffer8 = hashx_siphash13_ctr(gen->counter, &gen->keys); + gen->counter++; + gen->count8 = sizeof(gen->buffer8); + } + gen->count8--; + return gen->buffer8 >> (gen->count8 * 8); +} + +uint32_t hashx_siphash_rng_u32(siphash_rng* gen) { + if (gen->count32 == 0) { + gen->buffer32 = hashx_siphash13_ctr(gen->counter, &gen->keys); + gen->counter++; + gen->count32 = sizeof(gen->buffer32) / sizeof(uint32_t); + } + gen->count32--; + return gen->buffer32 >> (gen->count32 * 32); +} diff --git a/src/ext/equix/hashx/src/siphash_rng.h b/src/ext/equix/hashx/src/siphash_rng.h new file mode 100644 index 0000000000..638b177e06 --- /dev/null +++ b/src/ext/equix/hashx/src/siphash_rng.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef SIPHASH_GENERATOR_H +#define SIPHASH_GENERATOR_H + +#include <stdint.h> +#include <hashx.h> +#include "siphash.h" + +typedef struct siphash_rng { + siphash_state keys; + uint64_t counter; + uint64_t buffer8, buffer32; + unsigned count8, count32; +} siphash_rng; + +#ifdef __cplusplus +extern "C" { +#endif + +HASHX_PRIVATE void hashx_siphash_rng_init(siphash_rng* gen, const siphash_state* state); +HASHX_PRIVATE uint32_t hashx_siphash_rng_u32(siphash_rng* gen); +HASHX_PRIVATE uint8_t hashx_siphash_rng_u8(siphash_rng* gen); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/ext/equix/hashx/src/test_utils.h b/src/ext/equix/hashx/src/test_utils.h new file mode 100644 index 0000000000..54c2f7ec80 --- /dev/null +++ b/src/ext/equix/hashx/src/test_utils.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <stdlib.h> +#include <hashx.h> + +static inline void read_option(const char* option, int argc, char** argv, bool* out) { + for (int i = 0; i < argc; ++i) { + if (strcmp(argv[i], option) == 0) { + *out = true; + return; + } + } + *out = false; +} + +static inline void read_int_option(const char* option, int argc, char** argv, int* out, int default_val) { + for (int i = 0; i < argc - 1; ++i) { + if (strcmp(argv[i], option) == 0 && (*out = atoi(argv[i + 1])) > 0) { + return; + } + } + *out = default_val; +} + +static inline char parse_nibble(char hex) { + hex &= ~0x20; + return (hex & 0x40) ? hex - ('A' - 10) : hex & 0xf; +} + +static inline void hex2bin(const char* in, int length, char* out) { + for (int i = 0; i < length; i += 2) { + char nibble1 = parse_nibble(*in++); + char nibble2 = parse_nibble(*in++); + *out++ = nibble1 << 4 | nibble2; + } +} + +static inline void output_hex(const char* data, int length) { + for (unsigned i = 0; i < length; ++i) + printf("%02x", data[i] & 0xff); +} + +static inline bool hashes_equal(char* a, char* b) { + return memcmp(a, b, HASHX_SIZE) == 0; +} + +static inline bool equals_hex(const void* hash, const char* hex) { + char reference[HASHX_SIZE]; + hex2bin(hex, 2 * HASHX_SIZE, reference); + return memcmp(hash, reference, sizeof(reference)) == 0; +} + +#endif diff --git a/src/ext/equix/hashx/src/tests.c b/src/ext/equix/hashx/src/tests.c new file mode 100644 index 0000000000..f04d8b9d8f --- /dev/null +++ b/src/ext/equix/hashx/src/tests.c @@ -0,0 +1,219 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include <assert.h> +#include "test_utils.h" + +typedef bool test_func(); + +static int test_no = 0; + +static hashx_ctx* ctx_int = NULL; +static hashx_ctx* ctx_cmp = NULL; + +static const char seed1[] = "This is a test"; +static const char seed2[] = "Lorem ipsum dolor sit amet"; + +static const uint64_t counter1 = 0; +static const uint64_t counter2 = 123456; +static const uint64_t counter3 = 987654321123456789; + +static const unsigned char long_input[] = { + 0x0b, 0x0b, 0x98, 0xbe, 0xa7, 0xe8, 0x05, 0xe0, 0x01, 0x0a, 0x21, 0x26, + 0xd2, 0x87, 0xa2, 0xa0, 0xcc, 0x83, 0x3d, 0x31, 0x2c, 0xb7, 0x86, 0x38, + 0x5a, 0x7c, 0x2f, 0x9d, 0xe6, 0x9d, 0x25, 0x53, 0x7f, 0x58, 0x4a, 0x9b, + 0xc9, 0x97, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x66, 0x6f, 0xd8, 0x75, 0x3b, + 0xf6, 0x1a, 0x86, 0x31, 0xf1, 0x29, 0x84, 0xe3, 0xfd, 0x44, 0xf4, 0x01, + 0x4e, 0xca, 0x62, 0x92, 0x76, 0x81, 0x7b, 0x56, 0xf3, 0x2e, 0x9b, 0x68, + 0xbd, 0x82, 0xf4, 0x16 +}; + +#define RUN_TEST(x) run_test(#x, &x) + +static void run_test(const char* name, test_func* func) { + printf("[%2i] %-40s ... ", ++test_no, name); + printf(func() ? "PASSED\n" : "SKIPPED\n"); +} + +static bool test_alloc() { + ctx_int = hashx_alloc(HASHX_INTERPRETED); + assert(ctx_int != NULL && ctx_int != HASHX_NOTSUPP); + return true; +} + +static bool test_free() { + hashx_free(ctx_int); + hashx_free(ctx_cmp); + return true; +} + +static bool test_make1() { + int result = hashx_make(ctx_int, seed1, sizeof(seed1)); + assert(result == 1); + return true; +} + +static bool test_hash_ctr1() { +#ifdef HASHX_SALT + return false; +#endif +#ifndef HASHX_BLOCK_MODE + char hash[HASHX_SIZE]; + hashx_exec(ctx_int, counter2, hash); + /* printf("\n"); + output_hex(hash, HASHX_SIZE); + printf("\n"); */ + assert(equals_hex(hash, "aebdd50aa67c93afb82a4c534603b65e46decd584c55161c526ebc099415ccf1")); + return true; +#else + return false; +#endif +} + +static bool test_hash_ctr2() { +#ifdef HASHX_SALT + return false; +#endif +#ifndef HASHX_BLOCK_MODE + char hash[HASHX_SIZE]; + hashx_exec(ctx_int, counter1, hash); + assert(equals_hex(hash, "2b2f54567dcbea98fdb5d5e5ce9a65983c4a4e35ab1464b1efb61e83b7074bb2")); + return true; +#else + return false; +#endif +} + +static bool test_make2() { + int result = hashx_make(ctx_int, seed2, sizeof(seed2)); + assert(result == 1); + return true; +} + +static bool test_hash_ctr3() { +#ifdef HASHX_SALT + return false; +#endif +#ifndef HASHX_BLOCK_MODE + char hash[HASHX_SIZE]; + hashx_exec(ctx_int, counter2, hash); + assert(equals_hex(hash, "ab3d155bf4bbb0aa3a71b7801089826186e44300e6932e6ffd287cf302bbb0ba")); + return true; +#else + return false; +#endif +} + +static bool test_hash_ctr4() { +#ifdef HASHX_SALT + return false; +#endif +#ifndef HASHX_BLOCK_MODE + char hash[HASHX_SIZE]; + hashx_exec(ctx_int, counter3, hash); + assert(equals_hex(hash, "8dfef0497c323274a60d1d93292b68d9a0496379ba407b4341cf868a14d30113")); + return true; +#else + return false; +#endif +} + +static bool test_hash_block1() { +#ifdef HASHX_SALT + return false; +#endif +#ifndef HASHX_BLOCK_MODE + return false; +#else + char hash[HASHX_SIZE]; + hashx_exec(ctx_int, long_input, sizeof(long_input), hash); + assert(equals_hex(hash, "d0b232b832459501ca1ac9dc0429fd931414ead7624a457e375a43ea3e5e737a")); + return true; +#endif +} + +static bool test_alloc_compiler() { + ctx_cmp = hashx_alloc(HASHX_COMPILED); + assert(ctx_cmp != NULL); + return ctx_cmp != HASHX_NOTSUPP; +} + +static bool test_make3() { + if (ctx_cmp == HASHX_NOTSUPP) + return false; + + int result = hashx_make(ctx_cmp, seed2, sizeof(seed2)); + assert(result == 1); + return true; +} + +static bool test_compiler_ctr1() { + if (ctx_cmp == HASHX_NOTSUPP) + return false; + +#ifndef HASHX_BLOCK_MODE + char hash1[HASHX_SIZE]; + char hash2[HASHX_SIZE]; + hashx_exec(ctx_int, counter2, hash1); + hashx_exec(ctx_cmp, counter2, hash2); + assert(hashes_equal(hash1, hash2)); + return true; +#else + return false; +#endif +} + +static bool test_compiler_ctr2() { + if (ctx_cmp == HASHX_NOTSUPP) + return false; + +#ifndef HASHX_BLOCK_MODE + char hash1[HASHX_SIZE]; + char hash2[HASHX_SIZE]; + hashx_exec(ctx_int, counter1, hash1); + hashx_exec(ctx_cmp, counter1, hash2); + assert(hashes_equal(hash1, hash2)); + return true; +#else + return false; +#endif +} + +static bool test_compiler_block1() { + if (ctx_cmp == HASHX_NOTSUPP) + return false; +#ifndef HASHX_BLOCK_MODE + return false; +#else + char hash1[HASHX_SIZE]; + char hash2[HASHX_SIZE]; + hashx_exec(ctx_int, long_input, sizeof(long_input), hash1); + hashx_exec(ctx_cmp, long_input, sizeof(long_input), hash2); + assert(hashes_equal(hash1, hash2)); + return true; +#endif +} + +int main() { + RUN_TEST(test_alloc); + RUN_TEST(test_make1); + RUN_TEST(test_hash_ctr1); + RUN_TEST(test_hash_ctr2); + RUN_TEST(test_make2); + RUN_TEST(test_hash_ctr3); + RUN_TEST(test_hash_ctr4); + RUN_TEST(test_alloc_compiler); + RUN_TEST(test_make3); + RUN_TEST(test_compiler_ctr1); + RUN_TEST(test_compiler_ctr2); + RUN_TEST(test_hash_block1); + RUN_TEST(test_compiler_block1); + RUN_TEST(test_free); + + printf("\nAll tests were successful\n"); + return 0; +} diff --git a/src/ext/equix/hashx/src/unreachable.h b/src/ext/equix/hashx/src/unreachable.h new file mode 100644 index 0000000000..69807fb395 --- /dev/null +++ b/src/ext/equix/hashx/src/unreachable.h @@ -0,0 +1,9 @@ +#ifndef UNREACHABLE +#ifdef __GNUC__ +#define UNREACHABLE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE __assume(0) +#else +#define UNREACHABLE +#endif +#endif diff --git a/src/ext/equix/hashx/src/virtual_memory.c b/src/ext/equix/hashx/src/virtual_memory.c new file mode 100644 index 0000000000..7dc63e1e68 --- /dev/null +++ b/src/ext/equix/hashx/src/virtual_memory.c @@ -0,0 +1,126 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "virtual_memory.h" + +#ifdef HASHX_WIN +#include <windows.h> +#else +#ifdef __APPLE__ +#include <mach/vm_statistics.h> +#endif +#include <sys/types.h> +#include <sys/mman.h> +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#define PAGE_READONLY PROT_READ +#define PAGE_READWRITE (PROT_READ | PROT_WRITE) +#define PAGE_EXECUTE_READ (PROT_READ | PROT_EXEC) +#define PAGE_EXECUTE_READWRITE (PROT_READ | PROT_WRITE | PROT_EXEC) +#endif + +#ifdef HASHX_WIN + +static int set_privilege(const char* pszPrivilege, BOOL bEnable) { + HANDLE hToken; + TOKEN_PRIVILEGES tp; + BOOL status; + DWORD error; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES + | TOKEN_QUERY, &hToken)) + return 0; + + if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + return 0; + + tp.PrivilegeCount = 1; + + if (bEnable) + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + else + tp.Privileges[0].Attributes = 0; + + status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, + (PTOKEN_PRIVILEGES)NULL, 0); + error = GetLastError(); + + CloseHandle(hToken); + + return status && (error == ERROR_SUCCESS); +} +#endif + +void* hashx_vm_alloc(size_t bytes) { + void* mem; +#ifdef HASHX_WIN + mem = VirtualAlloc(NULL, bytes, MEM_COMMIT, PAGE_READWRITE); +#else + mem = mmap(NULL, bytes, PAGE_READWRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + return NULL; +#endif + return mem; +} + +static inline int page_protect(void* ptr, size_t bytes, int rules) { +#ifdef HASHX_WIN + DWORD oldp; + if (!VirtualProtect(ptr, bytes, (DWORD)rules, &oldp)) { + return 0; + } +#else + if (-1 == mprotect(ptr, bytes, rules)) + return 0; +#endif + return 1; +} + +void hashx_vm_rw(void* ptr, size_t bytes) { + page_protect(ptr, bytes, PAGE_READWRITE); +} + +void hashx_vm_rx(void* ptr, size_t bytes) { + page_protect(ptr, bytes, PAGE_EXECUTE_READ); +} + +void* hashx_vm_alloc_huge(size_t bytes) { + void* mem; +#ifdef HASHX_WIN + set_privilege("SeLockMemoryPrivilege", 1); + SIZE_T page_min = GetLargePageMinimum(); + if (page_min > 0) { + mem = VirtualAlloc(NULL, ALIGN_SIZE(bytes, page_min), MEM_COMMIT + | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + } + else { + mem = NULL; + } +#else +#ifdef __APPLE__ + mem = mmap(NULL, bytes, PAGE_READWRITE, MAP_PRIVATE | MAP_ANONYMOUS, + VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); +#elif defined(__FreeBSD__) + mem = mmap(NULL, bytes, PAGE_READWRITE, MAP_PRIVATE | MAP_ANONYMOUS + | MAP_ALIGNED_SUPER, -1, 0); +#elif defined(__OpenBSD__) + mem = MAP_FAILED; // OpenBSD does not support huge pages +#else + mem = mmap(NULL, bytes, PAGE_READWRITE, MAP_PRIVATE | MAP_ANONYMOUS + | MAP_HUGETLB | MAP_POPULATE, -1, 0); +#endif + if (mem == MAP_FAILED) { + mem = NULL; + } +#endif + return mem; +} + +void hashx_vm_free(void* ptr, size_t bytes) { +#ifdef HASHX_WIN + VirtualFree(ptr, 0, MEM_RELEASE); +#else + munmap(ptr, bytes); +#endif +} diff --git a/src/ext/equix/hashx/src/virtual_memory.h b/src/ext/equix/hashx/src/virtual_memory.h new file mode 100644 index 0000000000..d08f74dcc6 --- /dev/null +++ b/src/ext/equix/hashx/src/virtual_memory.h @@ -0,0 +1,19 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef VIRTUAL_MEMORY_H +#define VIRTUAL_MEMORY_H + +#include <stdint.h> +#include <stddef.h> +#include <hashx.h> + +#define ALIGN_SIZE(pos, align) ((((pos) - 1) / (align) + 1) * (align)) + +HASHX_PRIVATE void* hashx_vm_alloc(size_t size); +HASHX_PRIVATE void hashx_vm_rw(void* ptr, size_t size); +HASHX_PRIVATE void hashx_vm_rx(void* ptr, size_t size); +HASHX_PRIVATE void* hashx_vm_alloc_huge(size_t size); +HASHX_PRIVATE void hashx_vm_free(void* ptr, size_t size); + +#endif diff --git a/src/ext/equix/include/equix.h b/src/ext/equix/include/equix.h new file mode 100644 index 0000000000..01ab249437 --- /dev/null +++ b/src/ext/equix/include/equix.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef EQUIX_H +#define EQUIX_H + +#include <stdint.h> +#include <stddef.h> + +/* + * The solver will return at most this many solutions. + */ +#define EQUIX_MAX_SOLS 8 + +/* + * The number of indices. + */ +#define EQUIX_NUM_IDX 8 + +/* + * 16-bit index. + */ +typedef uint16_t equix_idx; + +/* + * The solution. + */ +typedef struct equix_solution { + equix_idx idx[EQUIX_NUM_IDX]; +} equix_solution; + +/* + * Solution verification results + */ +typedef enum equix_result { + EQUIX_OK, /* Solution is valid */ + EQUIX_CHALLENGE, /* The challenge is invalid (the internal hash + function doesn't pass validation). */ + EQUIX_ORDER, /* Indices are not in the correct order. */ + EQUIX_PARTIAL_SUM, /* The partial sums of the hash values don't + have the required number of trailing zeroes. */ + EQUIX_FINAL_SUM /* The hash values don't sum to zero. */ +} equix_result; + +/* + * Opaque struct that holds the Equi-X context + */ +typedef struct equix_ctx equix_ctx; + +/* + * Flags for context creation +*/ +typedef enum equix_ctx_flags { + EQUIX_CTX_VERIFY = 0, /* Context for verification */ + EQUIX_CTX_SOLVE = 1, /* Context for solving */ + EQUIX_CTX_COMPILE = 2, /* Compile internal hash function */ + EQUIX_CTX_HUGEPAGES = 4, /* Allocate solver memory using HugePages */ +} equix_ctx_flags; + +/* Sentinel value used to indicate unsupported type */ +#define EQUIX_NOTSUPP ((equix_ctx*)-1) + +#if defined(_WIN32) || defined(__CYGWIN__) +#define EQUIX_WIN +#endif + +/* Shared/static library definitions */ +#ifdef EQUIX_WIN + #ifdef EQUIX_SHARED + #define EQUIX_API __declspec(dllexport) + #elif !defined(EQUIX_STATIC) + #define EQUIX_API __declspec(dllimport) + #else + #define EQUIX_API + #endif + #define EQUIX_PRIVATE +#else + #ifdef EQUIX_SHARED + #define EQUIX_API __attribute__ ((visibility ("default"))) + #else + #define EQUIX_API __attribute__ ((visibility ("hidden"))) + #endif + #define EQUIX_PRIVATE __attribute__ ((visibility ("hidden"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allocate an Equi-X context. + * + * @param flags is the type of context to be created + * + * @return pointer to a newly created context. Returns NULL on memory + * allocation failure and EQUIX_NOTSUPP if the requested type + * is not supported. + */ +EQUIX_API equix_ctx* equix_alloc(equix_ctx_flags flags); + +/* +* Free an Equi-X a context. +* +* @param ctx is a pointer to the context +*/ +EQUIX_API void equix_free(equix_ctx* ctx); + +/* + * Find Equi-X solutions for the given challenge. + * + * @param ctx pointer to an Equi-X context + * @param challenge pointer to the challenge data + * @param challenge_size size of the challenge + * @param output pointer to the output array where solutions will be + * stored + * + * @return the number of solutions found + */ +EQUIX_API int equix_solve( + equix_ctx* ctx, + const void* challenge, + size_t challenge_size, + equix_solution output[EQUIX_MAX_SOLS]); + +/* + * Verify an Equi-X solution. + * + * @param ctx pointer to an Equi-X context + * @param challenge pointer to the challenge data + * @param challenge_size size of the challenge + * @param solution pointer to the solution to be verified + * + * @return verification result +*/ +EQUIX_API equix_result equix_verify( + equix_ctx* ctx, + const void* challenge, + size_t challenge_size, + const equix_solution* solution); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/ext/equix/src/bench.c b/src/ext/equix/src/bench.c new file mode 100644 index 0000000000..e5b925c3d2 --- /dev/null +++ b/src/ext/equix/src/bench.c @@ -0,0 +1,175 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <stdlib.h> +#include <equix.h> +#include <test_utils.h> +#include <hashx_thread.h> +#include <hashx_time.h> + +typedef struct solver_output { + equix_solution sols[EQUIX_MAX_SOLS]; + int count; +} solver_output; + +typedef struct worker_job { + int id; + hashx_thread thread; + equix_ctx* ctx; + int64_t total_sols; + int start; + int step; + int end; + solver_output* output; +} worker_job; + +static hashx_thread_retval worker(void* args) { + worker_job* job = (worker_job*)args; + job->total_sols = 0; + solver_output* outptr = job->output; + for (int seed = job->start; seed < job->end; seed += job->step) { + int count = equix_solve(job->ctx, &seed, sizeof(seed), outptr->sols); + outptr->count = count; + job->total_sols += count; + outptr++; + } + return HASHX_THREAD_SUCCESS; +} + +static void print_solution(int nonce, const equix_solution* sol) { + output_hex((char*)&nonce, sizeof(nonce)); + printf(" : { "); + for (int idx = 0; idx < EQUIX_NUM_IDX; ++idx) { + printf("%#06x%s", sol->idx[idx], + idx != EQUIX_NUM_IDX - 1 ? ", " : ""); + } + printf(" }\n"); +} + +static const char* result_names[] = { + "OK", + "Invalid nonce", + "Indices out of order", + "Nonzero partial sum", + "Nonzero final sum" +}; + +static void print_help(char* executable) { + printf("Usage: %s [OPTIONS]\n", executable); + printf("Supported options:\n"); + printf(" --help show this message\n"); + printf(" --nonces N solve N nonces (default: N=500)\n"); + printf(" --start S start with nonce S (default: S=0)\n"); + printf(" --threads T use T threads (default: T=1)\n"); + printf(" --interpret use HashX interpreter\n"); + printf(" --hugepages use hugepages\n"); + printf(" --sols print all solutions\n"); +} + +int main(int argc, char** argv) { + int nonces, start, threads; + bool interpret, huge_pages, print_sols, help; + read_option("--help", argc, argv, &help); + if (help) { + print_help(argv[0]); + return 0; + } + read_int_option("--nonces", argc, argv, &nonces, 500); + read_int_option("--start", argc, argv, &start, 0); + read_option("--interpret", argc, argv, &interpret); + read_option("--hugepages", argc, argv, &huge_pages); + read_option("--sols", argc, argv, &print_sols); + read_int_option("--threads", argc, argv, &threads, 1); + equix_ctx_flags flags = EQUIX_CTX_SOLVE; + if (!interpret) { + flags |= EQUIX_CTX_COMPILE; + } + if (huge_pages) { + flags |= EQUIX_CTX_HUGEPAGES; + } + worker_job* jobs = malloc(sizeof(worker_job) * threads); + if (jobs == NULL) { + printf("Error: memory allocation failure\n"); + return 1; + } + int per_thread = (nonces + threads - 1) / threads; + for (int thd = 0; thd < threads; ++thd) { + jobs[thd].ctx = equix_alloc(flags); + if (jobs[thd].ctx == NULL) { + printf("Error: memory allocation failure\n"); + return 1; + } + if (jobs[thd].ctx == EQUIX_NOTSUPP) { + printf("Error: not supported. Try with --interpret\n"); + return 1; + } + jobs[thd].id = thd; + jobs[thd].start = start + thd; + jobs[thd].step = threads; + jobs[thd].end = start + nonces; + jobs[thd].output = malloc(sizeof(solver_output) * per_thread); + if (jobs[thd].output == NULL) { + printf("Error: memory allocation failure\n"); + return 1; + } + } + printf("Solving nonces %i-%i (interpret: %i, hugepages: %i, threads: %i) ...\n", start, start + nonces - 1, interpret, huge_pages, threads); + int total_sols = 0; + double time_start, time_end; + time_start = hashx_time(); + if (threads > 1) { + for (int thd = 0; thd < threads; ++thd) { + jobs[thd].thread = hashx_thread_create(&worker, &jobs[thd]); + } + for (int thd = 0; thd < threads; ++thd) { + hashx_thread_join(jobs[thd].thread); + } + } + else { + worker(jobs); + } + time_end = hashx_time(); + for (int thd = 0; thd < threads; ++thd) { + total_sols += jobs[thd].total_sols; + } + double elapsed = time_end - time_start; + printf("%f solutions/nonce\n", total_sols / (double)nonces); + printf("%f solutions/sec. (%i thread%s)\n", total_sols / elapsed, threads, threads > 1 ? "s" : ""); + if (print_sols) { + for (int thd = 0; thd < threads; ++thd) { + worker_job* job = &jobs[thd]; + solver_output* outptr = job->output; + for (int seed = job->start; seed < job->end; seed += job->step) { + for (int sol = 0; sol < outptr->count; ++sol) { + print_solution(seed, &outptr->sols[sol]); + } + outptr++; + } + } + } + time_start = hashx_time(); + for (int thd = 0; thd < threads; ++thd) { + worker_job* job = &jobs[thd]; + solver_output* outptr = job->output; + for (int seed = job->start; seed < job->end; seed += job->step) { + for (int sol = 0; sol < outptr->count; ++sol) { + equix_result result = equix_verify(job->ctx, &seed, sizeof(seed), &outptr->sols[sol]); + if (result != EQUIX_OK) { + printf("Invalid solution (%s):\n", result_names[result]); + print_solution(seed, &outptr->sols[sol]); + } + } + outptr++; + } + } + time_end = hashx_time(); + printf("%f verifications/sec. (1 thread)\n", total_sols / (time_end - time_start)); + for (int thd = 0; thd < threads; ++thd) { + free(jobs[thd].output); + } + free(jobs); + return 0; +} diff --git a/src/ext/equix/src/context.c b/src/ext/equix/src/context.c new file mode 100644 index 0000000000..b0aa2d40e5 --- /dev/null +++ b/src/ext/equix/src/context.c @@ -0,0 +1,57 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdlib.h> +#include <equix.h> +#include <virtual_memory.h> +#include "context.h" +#include "solver_heap.h" + +equix_ctx* equix_alloc(equix_ctx_flags flags) { + equix_ctx* ctx_failure = NULL; + equix_ctx* ctx = malloc(sizeof(equix_ctx)); + if (ctx == NULL) { + goto failure; + } + ctx->flags = flags & EQUIX_CTX_COMPILE; + ctx->hash_func = hashx_alloc(flags & EQUIX_CTX_COMPILE ? + HASHX_COMPILED : HASHX_INTERPRETED); + if (ctx->hash_func == NULL) { + goto failure; + } + if (ctx->hash_func == HASHX_NOTSUPP) { + ctx_failure = EQUIX_NOTSUPP; + goto failure; + } + if (flags & EQUIX_CTX_SOLVE) { + if (flags & EQUIX_CTX_HUGEPAGES) { + ctx->heap = hashx_vm_alloc_huge(sizeof(solver_heap)); + } + else { + ctx->heap = malloc(sizeof(solver_heap)); + } + if (ctx->heap == NULL) { + goto failure; + } + } + ctx->flags = flags; + return ctx; +failure: + equix_free(ctx); + return ctx_failure; +} + +void equix_free(equix_ctx* ctx) { + if (ctx != NULL && ctx != EQUIX_NOTSUPP) { + if (ctx->flags & EQUIX_CTX_SOLVE) { + if (ctx->flags & EQUIX_CTX_HUGEPAGES) { + hashx_vm_free(ctx->heap, sizeof(solver_heap)); + } + else { + free(ctx->heap); + } + } + hashx_free(ctx->hash_func); + free(ctx); + } +} diff --git a/src/ext/equix/src/context.h b/src/ext/equix/src/context.h new file mode 100644 index 0000000000..69dadb8069 --- /dev/null +++ b/src/ext/equix/src/context.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef CONTEXT_H +#define CONTEXT_H + +#include <equix.h> +#include <hashx.h> + +typedef struct solver_heap solver_heap; + +typedef struct equix_ctx { + hashx_ctx* hash_func; + solver_heap* heap; + equix_ctx_flags flags; +} equix_ctx; + +#endif diff --git a/src/ext/equix/src/equix.c b/src/ext/equix/src/equix.c new file mode 100644 index 0000000000..71ebd9ca38 --- /dev/null +++ b/src/ext/equix/src/equix.c @@ -0,0 +1,96 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> + +#include <equix.h> +#include <hashx.h> +#include "context.h" +#include "solver.h" +#include <hashx_endian.h> + +static bool verify_order(const equix_solution* solution) { + return + tree_cmp4(&solution->idx[0], &solution->idx[4]) & + tree_cmp2(&solution->idx[0], &solution->idx[2]) & + tree_cmp2(&solution->idx[4], &solution->idx[6]) & + tree_cmp1(&solution->idx[0], &solution->idx[1]) & + tree_cmp1(&solution->idx[2], &solution->idx[3]) & + tree_cmp1(&solution->idx[4], &solution->idx[5]) & + tree_cmp1(&solution->idx[6], &solution->idx[7]); +} + +static uint64_t sum_pair(hashx_ctx* hash_func, equix_idx left, equix_idx right) { + uint8_t hash_left[HASHX_SIZE]; + uint8_t hash_right[HASHX_SIZE]; + hashx_exec(hash_func, left, hash_left); + hashx_exec(hash_func, right, hash_right); + return load64(hash_left) + load64(hash_right); +} + +static equix_result verify_internal(hashx_ctx* hash_func, const equix_solution* solution) { + uint64_t pair0 = sum_pair(hash_func, solution->idx[0], solution->idx[1]); + if (pair0 & EQUIX_STAGE1_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair1 = sum_pair(hash_func, solution->idx[2], solution->idx[3]); + if (pair1 & EQUIX_STAGE1_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair4 = pair0 + pair1; + if (pair4 & EQUIX_STAGE2_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair2 = sum_pair(hash_func, solution->idx[4], solution->idx[5]); + if (pair2 & EQUIX_STAGE1_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair3 = sum_pair(hash_func, solution->idx[6], solution->idx[7]); + if (pair3 & EQUIX_STAGE1_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair5 = pair2 + pair3; + if (pair5 & EQUIX_STAGE2_MASK) { + return EQUIX_PARTIAL_SUM; + } + uint64_t pair6 = pair4 + pair5; + if (pair6 & EQUIX_FULL_MASK) { + return EQUIX_FINAL_SUM; + } + return EQUIX_OK; +} + +int equix_solve( + equix_ctx* ctx, + const void* challenge, + size_t challenge_size, + equix_solution output[EQUIX_MAX_SOLS]) +{ + if ((ctx->flags & EQUIX_CTX_SOLVE) == 0) { + return 0; + } + + if (!hashx_make(ctx->hash_func, challenge, challenge_size)) { + return 0; + } + + return equix_solver_solve(ctx->hash_func, ctx->heap, output); +} + + +equix_result equix_verify( + equix_ctx* ctx, + const void* challenge, + size_t challenge_size, + const equix_solution* solution) +{ + if (!verify_order(solution)) { + return EQUIX_ORDER; + } + if (!hashx_make(ctx->hash_func, challenge, challenge_size)) { + return EQUIX_CHALLENGE; + } + return verify_internal(ctx->hash_func, solution); +} diff --git a/src/ext/equix/src/solver.c b/src/ext/equix/src/solver.c new file mode 100644 index 0000000000..480c699c60 --- /dev/null +++ b/src/ext/equix/src/solver.c @@ -0,0 +1,270 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#include "solver.h" +#include "context.h" +#include "solver_heap.h" +#include <hashx_endian.h> +#include <string.h> +#include <stdbool.h> +#include <assert.h> +#include <stdio.h> + +#ifdef _MSC_VER +#pragma warning (disable : 4146) /* unary minus applied to unsigned type */ +#endif + +#define CLEAR(x) memset(&x, 0, sizeof(x)) +#define MAKE_ITEM(bucket, left, right) ((left) << 17 | (right) << 8 | (bucket)) +#define ITEM_BUCKET(item) (item) % NUM_COARSE_BUCKETS +#define ITEM_LEFT_IDX(item) (item) >> 17 +#define ITEM_RIGHT_IDX(item) ((item) >> 8) & 511 +#define INVERT_BUCKET(idx) -(idx) % NUM_COARSE_BUCKETS +#define INVERT_SCRATCH(idx) -(idx) % NUM_FINE_BUCKETS +#define STAGE1_IDX(buck, pos) heap->stage1_indices.buckets[buck].items[pos] +#define STAGE2_IDX(buck, pos) heap->stage2_indices.buckets[buck].items[pos] +#define STAGE3_IDX(buck, pos) heap->stage3_indices.buckets[buck].items[pos] +#define STAGE1_DATA(buck, pos) heap->stage1_data.buckets[buck].items[pos] +#define STAGE2_DATA(buck, pos) heap->stage2_data.buckets[buck].items[pos] +#define STAGE3_DATA(buck, pos) heap->stage3_data.buckets[buck].items[pos] +#define STAGE1_SIZE(buck) heap->stage1_indices.counts[buck] +#define STAGE2_SIZE(buck) heap->stage2_indices.counts[buck] +#define STAGE3_SIZE(buck) heap->stage3_indices.counts[buck] +#define SCRATCH(buck, pos) heap->scratch_ht.buckets[buck].items[pos] +#define SCRATCH_SIZE(buck) heap->scratch_ht.counts[buck] +#define SWAP_IDX(a, b) \ + do { \ + equix_idx temp = a; \ + a = b; \ + b = temp; \ + } while(0) +#define CARRY (bucket_idx != 0) +#define BUCK_START 0 +#define BUCK_END (NUM_COARSE_BUCKETS / 2 + 1) + +typedef uint32_t u32; +typedef stage1_idx_item s1_idx; +typedef stage2_idx_item s2_idx; +typedef stage3_idx_item s3_idx; + +static FORCE_INLINE uint64_t hash_value(hashx_ctx* hash_func, equix_idx index) { + char hash[HASHX_SIZE]; + hashx_exec(hash_func, index, hash); + return load64(hash); +} + +static void build_solution_stage1(equix_idx* output, solver_heap* heap, s2_idx root) { + u32 bucket = ITEM_BUCKET(root); + u32 bucket_inv = INVERT_BUCKET(bucket); + u32 left_parent_idx = ITEM_LEFT_IDX(root); + u32 right_parent_idx = ITEM_RIGHT_IDX(root); + s1_idx left_parent = STAGE1_IDX(bucket, left_parent_idx); + s1_idx right_parent = STAGE1_IDX(bucket_inv, right_parent_idx); + output[0] = left_parent; + output[1] = right_parent; + if (!tree_cmp1(&output[0], &output[1])) { + SWAP_IDX(output[0], output[1]); + } +} + +static void build_solution_stage2(equix_idx* output, solver_heap* heap, s3_idx root) { + u32 bucket = ITEM_BUCKET(root); + u32 bucket_inv = INVERT_BUCKET(bucket); + u32 left_parent_idx = ITEM_LEFT_IDX(root); + u32 right_parent_idx = ITEM_RIGHT_IDX(root); + s2_idx left_parent = STAGE2_IDX(bucket, left_parent_idx); + s2_idx right_parent = STAGE2_IDX(bucket_inv, right_parent_idx); + build_solution_stage1(&output[0], heap, left_parent); + build_solution_stage1(&output[2], heap, right_parent); + if (!tree_cmp2(&output[0], &output[2])) { + SWAP_IDX(output[0], output[2]); + SWAP_IDX(output[1], output[3]); + } +} + +static void build_solution(equix_solution* solution, solver_heap* heap, s3_idx left, s3_idx right) { + build_solution_stage2(&solution->idx[0], heap, left); + build_solution_stage2(&solution->idx[4], heap, right); + if (!tree_cmp4(&solution->idx[0], &solution->idx[4])) { + SWAP_IDX(solution->idx[0], solution->idx[4]); + SWAP_IDX(solution->idx[1], solution->idx[5]); + SWAP_IDX(solution->idx[2], solution->idx[6]); + SWAP_IDX(solution->idx[3], solution->idx[7]); + } +} + +static void solve_stage0(hashx_ctx* hash_func, solver_heap* heap) { + CLEAR(heap->stage1_indices.counts); + for (u32 i = 0; i < INDEX_SPACE; ++i) { + uint64_t value = hash_value(hash_func, i); + u32 bucket_idx = value % NUM_COARSE_BUCKETS; + u32 item_idx = STAGE1_SIZE(bucket_idx); + if (item_idx >= COARSE_BUCKET_ITEMS) + continue; + STAGE1_SIZE(bucket_idx) = item_idx + 1; + STAGE1_IDX(bucket_idx, item_idx) = i; + STAGE1_DATA(bucket_idx, item_idx) = value / NUM_COARSE_BUCKETS; /* 52 bits */ + } +} + +#define MAKE_PAIRS1 \ + stage1_data_item value = STAGE1_DATA(bucket_idx, item_idx) + CARRY; \ + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; \ + u32 fine_cpl_bucket = INVERT_SCRATCH(fine_buck_idx); \ + u32 fine_cpl_size = SCRATCH_SIZE(fine_cpl_bucket); \ + for (u32 fine_idx = 0; fine_idx < fine_cpl_size; ++fine_idx) { \ + u32 cpl_index = SCRATCH(fine_cpl_bucket, fine_idx); \ + stage1_data_item cpl_value = STAGE1_DATA(cpl_bucket, cpl_index); \ + stage1_data_item sum = value + cpl_value; \ + assert((sum % NUM_FINE_BUCKETS) == 0); \ + sum /= NUM_FINE_BUCKETS; /* 45 bits */ \ + u32 s2_buck_id = sum % NUM_COARSE_BUCKETS; \ + u32 s2_item_id = STAGE2_SIZE(s2_buck_id); \ + if (s2_item_id >= COARSE_BUCKET_ITEMS) \ + continue; \ + STAGE2_SIZE(s2_buck_id) = s2_item_id + 1; \ + STAGE2_IDX(s2_buck_id, s2_item_id) = \ + MAKE_ITEM(bucket_idx, item_idx, cpl_index); \ + STAGE2_DATA(s2_buck_id, s2_item_id) = \ + sum / NUM_COARSE_BUCKETS; /* 37 bits */ \ + } \ + +static void solve_stage1(solver_heap* heap) { + CLEAR(heap->stage2_indices.counts); + for (u32 bucket_idx = BUCK_START; bucket_idx < BUCK_END; ++bucket_idx) { + u32 cpl_bucket = INVERT_BUCKET(bucket_idx); + CLEAR(heap->scratch_ht.counts); + u32 cpl_buck_size = STAGE1_SIZE(cpl_bucket); + for (u32 item_idx = 0; item_idx < cpl_buck_size; ++item_idx) { + stage1_data_item value = STAGE1_DATA(cpl_bucket, item_idx); + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; + u32 fine_item_idx = SCRATCH_SIZE(fine_buck_idx); + if (fine_item_idx >= FINE_BUCKET_ITEMS) + continue; + SCRATCH_SIZE(fine_buck_idx) = fine_item_idx + 1; + SCRATCH(fine_buck_idx, fine_item_idx) = item_idx; + if (cpl_bucket == bucket_idx) { + MAKE_PAIRS1 + } + } + if (cpl_bucket != bucket_idx) { + u32 buck_size = STAGE1_SIZE(bucket_idx); + for (u32 item_idx = 0; item_idx < buck_size; ++item_idx) { + MAKE_PAIRS1 + } + } + } +} + +#define MAKE_PAIRS2 \ + stage2_data_item value = STAGE2_DATA(bucket_idx, item_idx) + CARRY; \ + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; \ + u32 fine_cpl_bucket = INVERT_SCRATCH(fine_buck_idx); \ + u32 fine_cpl_size = SCRATCH_SIZE(fine_cpl_bucket); \ + for (u32 fine_idx = 0; fine_idx < fine_cpl_size; ++fine_idx) { \ + u32 cpl_index = SCRATCH(fine_cpl_bucket, fine_idx); \ + stage2_data_item cpl_value = STAGE2_DATA(cpl_bucket, cpl_index); \ + stage2_data_item sum = value + cpl_value; \ + assert((sum % NUM_FINE_BUCKETS) == 0); \ + sum /= NUM_FINE_BUCKETS; /* 30 bits */ \ + u32 s3_buck_id = sum % NUM_COARSE_BUCKETS; \ + u32 s3_item_id = STAGE3_SIZE(s3_buck_id); \ + if (s3_item_id >= COARSE_BUCKET_ITEMS) \ + continue; \ + STAGE3_SIZE(s3_buck_id) = s3_item_id + 1; \ + STAGE3_IDX(s3_buck_id, s3_item_id) = \ + MAKE_ITEM(bucket_idx, item_idx, cpl_index); \ + STAGE3_DATA(s3_buck_id, s3_item_id) = \ + sum / NUM_COARSE_BUCKETS; /* 22 bits */ \ + } \ + +static void solve_stage2(solver_heap* heap) { + CLEAR(heap->stage3_indices.counts); + for (u32 bucket_idx = BUCK_START; bucket_idx < BUCK_END; ++bucket_idx) { + u32 cpl_bucket = INVERT_BUCKET(bucket_idx); + CLEAR(heap->scratch_ht.counts); + u32 cpl_buck_size = STAGE2_SIZE(cpl_bucket); + for (u32 item_idx = 0; item_idx < cpl_buck_size; ++item_idx) { + stage2_data_item value = STAGE2_DATA(cpl_bucket, item_idx); + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; + u32 fine_item_idx = SCRATCH_SIZE(fine_buck_idx); + if (fine_item_idx >= FINE_BUCKET_ITEMS) + continue; + SCRATCH_SIZE(fine_buck_idx) = fine_item_idx + 1; + SCRATCH(fine_buck_idx, fine_item_idx) = item_idx; + if (cpl_bucket == bucket_idx) { + MAKE_PAIRS2 + } + } + if (cpl_bucket != bucket_idx) { + u32 buck_size = STAGE2_SIZE(bucket_idx); + for (u32 item_idx = 0; item_idx < buck_size; ++item_idx) { + MAKE_PAIRS2 + } + } + } +} + +#define MAKE_PAIRS3 \ + stage3_data_item value = STAGE3_DATA(bucket_idx, item_idx) + CARRY; \ + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; \ + u32 fine_cpl_bucket = INVERT_SCRATCH(fine_buck_idx); \ + u32 fine_cpl_size = SCRATCH_SIZE(fine_cpl_bucket); \ + for (u32 fine_idx = 0; fine_idx < fine_cpl_size; ++fine_idx) { \ + u32 cpl_index = SCRATCH(fine_cpl_bucket, fine_idx); \ + stage3_data_item cpl_value = STAGE3_DATA(cpl_bucket, cpl_index); \ + stage3_data_item sum = value + cpl_value; \ + assert((sum % NUM_FINE_BUCKETS) == 0); \ + sum /= NUM_FINE_BUCKETS; /* 15 bits */ \ + if ((sum & EQUIX_STAGE1_MASK) == 0) { \ + /* we have a solution */ \ + s3_idx item_left = STAGE3_IDX(bucket_idx, item_idx); \ + s3_idx item_right = STAGE3_IDX(cpl_bucket, cpl_index); \ + build_solution(&output[sols_found], heap, item_left, item_right); \ + if (++(sols_found) >= EQUIX_MAX_SOLS) { \ + return sols_found; \ + } \ + } \ + } \ + +static int solve_stage3(solver_heap* heap, equix_solution output[EQUIX_MAX_SOLS]) { + int sols_found = 0; + + for (u32 bucket_idx = BUCK_START; bucket_idx < BUCK_END; ++bucket_idx) { + u32 cpl_bucket = -bucket_idx & (NUM_COARSE_BUCKETS - 1); + bool nodup = cpl_bucket == bucket_idx; + CLEAR(heap->scratch_ht.counts); + u32 cpl_buck_size = STAGE3_SIZE(cpl_bucket); + for (u32 item_idx = 0; item_idx < cpl_buck_size; ++item_idx) { + stage3_data_item value = STAGE3_DATA(cpl_bucket, item_idx); + u32 fine_buck_idx = value % NUM_FINE_BUCKETS; + u32 fine_item_idx = SCRATCH_SIZE(fine_buck_idx); + if (fine_item_idx >= FINE_BUCKET_ITEMS) + continue; + SCRATCH_SIZE(fine_buck_idx) = fine_item_idx + 1; + SCRATCH(fine_buck_idx, fine_item_idx) = item_idx; + if (cpl_bucket == bucket_idx) { + MAKE_PAIRS3 + } + } + if (cpl_bucket != bucket_idx) { + u32 buck_size = STAGE3_SIZE(bucket_idx); + for (u32 item_idx = 0; item_idx < buck_size; ++item_idx) { + MAKE_PAIRS3 + } + } + } + + return sols_found; +} + +int equix_solver_solve( + hashx_ctx* hash_func, + solver_heap* heap, + equix_solution output[EQUIX_MAX_SOLS]) +{ + solve_stage0(hash_func, heap); + solve_stage1(heap); + solve_stage2(heap); + return solve_stage3(heap, output); +} diff --git a/src/ext/equix/src/solver.h b/src/ext/equix/src/solver.h new file mode 100644 index 0000000000..ad69951929 --- /dev/null +++ b/src/ext/equix/src/solver.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef SOLVER_H +#define SOLVER_H + +#include <equix.h> +#include <hashx_endian.h> +#include <stdbool.h> +#include "context.h" + +#define EQUIX_STAGE1_MASK ((1ull << 15) - 1) +#define EQUIX_STAGE2_MASK ((1ull << 30) - 1) +#define EQUIX_FULL_MASK ((1ull << 60) - 1) + +static inline bool tree_cmp1(const equix_idx* left, const equix_idx* right) { + return *left <= *right; +} + +static inline bool tree_cmp2(const equix_idx* left, const equix_idx* right) { + return load32(left) <= load32(right); +} + +static inline bool tree_cmp4(const equix_idx* left, const equix_idx* right) { + return load64(left) <= load64(right); +} + +EQUIX_PRIVATE int equix_solver_solve(hashx_ctx* hash_func, solver_heap* heap, equix_solution output[EQUIX_MAX_SOLS]); + +#endif diff --git a/src/ext/equix/src/solver_heap.h b/src/ext/equix/src/solver_heap.h new file mode 100644 index 0000000000..71c5f0ca48 --- /dev/null +++ b/src/ext/equix/src/solver_heap.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifndef SOLVER_HEAP_H +#define SOLVER_HEAP_H + +#include <stdint.h> +#include <equix.h> + +#define INDEX_SPACE (UINT32_C(1) << 16) +#define NUM_COARSE_BUCKETS 256 +#define NUM_FINE_BUCKETS 128 +#define COARSE_BUCKET_ITEMS 336 +#define FINE_BUCKET_ITEMS 12 + +typedef uint16_t fine_item; + +typedef struct fine_bucket { + fine_item items[FINE_BUCKET_ITEMS]; +} fine_bucket; + +typedef struct fine_hashtab { + uint8_t counts[NUM_FINE_BUCKETS]; + fine_bucket buckets[NUM_FINE_BUCKETS]; +} fine_hashtab; + +typedef equix_idx stage1_idx_item; /* 16 bits */ + +typedef uint64_t stage1_data_item; /* 52 bits */ + +typedef struct stage1_idx_bucket { + stage1_idx_item items[COARSE_BUCKET_ITEMS]; +} stage1_idx_bucket; + +typedef struct stage1_data_bucket { + stage1_data_item items[COARSE_BUCKET_ITEMS]; +} stage1_data_bucket; + +typedef struct stage1_idx_hashtab { + uint16_t counts[NUM_COARSE_BUCKETS]; + stage1_idx_bucket buckets[NUM_COARSE_BUCKETS]; +} stage1_idx_hashtab; + +typedef struct stage1_data_hashtab { + stage1_data_bucket buckets[NUM_COARSE_BUCKETS]; +} stage1_data_hashtab; + +typedef uint32_t stage2_idx_item; /* 26 bits: 8 bits = left bucket index + 9 bits = left item index + 9 bits = right item index */ + +typedef struct stage2_idx_bucket { + stage2_idx_item items[COARSE_BUCKET_ITEMS]; +} stage2_idx_bucket; + +typedef struct stage2_idx_hashtab { + uint16_t counts[NUM_COARSE_BUCKETS]; + stage2_idx_bucket buckets[NUM_COARSE_BUCKETS]; +} stage2_idx_hashtab; + +#ifdef SOLVER_PACKED_STAGE2 +#pragma pack(push, 1) +typedef struct stage2_data_item { + uint32_t upper; /* 22 bits */ + uint8_t middle; /* 8 bits */ + uint8_t lower; /* 7 bits */ +} stage2_data_item; +#pragma pack(pop) +#else +typedef uint64_t stage2_data_item; /* 37 bits */ +#endif + +typedef struct stage2_data_bucket { + stage2_data_item items[COARSE_BUCKET_ITEMS]; +} stage2_data_bucket; + +typedef struct stage2_data_hashtab { + stage2_data_bucket buckets[NUM_COARSE_BUCKETS]; +} stage2_data_hashtab; + +typedef uint32_t stage3_data_item; /* 22 bits */ + +typedef struct stage3_data_bucket { + stage3_data_item items[COARSE_BUCKET_ITEMS]; +} stage3_data_bucket; + +typedef struct stage3_data_hashtab { + stage3_data_bucket buckets[NUM_COARSE_BUCKETS]; +} stage3_data_hashtab; + +typedef stage2_idx_hashtab stage3_idx_hashtab; +typedef stage2_idx_item stage3_idx_item; + +typedef struct solver_heap { + stage1_idx_hashtab stage1_indices; /* 172 544 bytes */ + stage2_idx_hashtab stage2_indices; /* 344 576 bytes */ + stage2_data_hashtab stage2_data; /* 688 128 bytes */ + union { + stage1_data_hashtab stage1_data; /* 688 128 bytes */ + struct { + stage3_idx_hashtab stage3_indices; /* 344 576 bytes */ + stage3_data_hashtab stage3_data; /* 344 064 bytes */ + }; + }; + fine_hashtab scratch_ht; /* 3 200 bytes */ +} solver_heap; /* TOTAL: 1 897 088 bytes */ + +#endif diff --git a/src/ext/equix/src/tests.c b/src/ext/equix/src/tests.c new file mode 100644 index 0000000000..63fb5bdb1e --- /dev/null +++ b/src/ext/equix/src/tests.c @@ -0,0 +1,124 @@ +/* Copyright (c) 2020 tevador tevador@gmail.com */ +/* See LICENSE for licensing information */ + +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include <assert.h> +#include <equix.h> +#include <stdbool.h> +#include <stdio.h> + +typedef bool test_func(); + +static equix_ctx* ctx = NULL; +static equix_solution solution[EQUIX_MAX_SOLS]; +static int nonce; +static int valid_count = 0; +static int test_no = 0; + +#define SWAP_IDX(a, b) \ + do { \ + equix_idx temp = a; \ + a = b; \ + b = temp; \ + } while(0) + +static bool test_alloc() { + ctx = equix_alloc(EQUIX_CTX_SOLVE); + assert(ctx != NULL && ctx != EQUIX_NOTSUPP); + return true; +} + +static bool test_free() { + equix_free(ctx); + return true; +} + +static bool test_solve() { + int num_solutions = 0; + for (nonce = 0; num_solutions == 0 && nonce < 20; ++nonce) { + num_solutions = equix_solve(ctx, &nonce, sizeof(nonce), solution); + } + --nonce; + assert(num_solutions > 0); + return true; +} + +static bool test_verify1() { + equix_result result = equix_verify(ctx, &nonce, sizeof(nonce), &solution[0]); + assert(result == EQUIX_OK); + return true; +} + +static bool test_verify2() { + SWAP_IDX(solution[0].idx[0], solution[0].idx[1]); + equix_result result = equix_verify(ctx, &nonce, sizeof(nonce), &solution[0]); + assert(result == EQUIX_ORDER); + return true; +} + +static bool test_verify3() { + SWAP_IDX(solution[0].idx[0], solution[0].idx[4]); + SWAP_IDX(solution[0].idx[1], solution[0].idx[5]); + SWAP_IDX(solution[0].idx[2], solution[0].idx[6]); + SWAP_IDX(solution[0].idx[3], solution[0].idx[7]); + equix_result result = equix_verify(ctx, &nonce, sizeof(nonce), &solution[0]); + assert(result == EQUIX_ORDER); + SWAP_IDX(solution[0].idx[0], solution[0].idx[4]); + SWAP_IDX(solution[0].idx[1], solution[0].idx[5]); + SWAP_IDX(solution[0].idx[2], solution[0].idx[6]); + SWAP_IDX(solution[0].idx[3], solution[0].idx[7]); + return true; +} + +static bool test_verify4() { + SWAP_IDX(solution[0].idx[1], solution[0].idx[2]); + equix_result result = equix_verify(ctx, &nonce, sizeof(nonce), &solution[0]); + assert(result == EQUIX_PARTIAL_SUM); + SWAP_IDX(solution[0].idx[1], solution[0].idx[2]); + return true; +} + +static void permute_idx(int start) { + if (start == EQUIX_NUM_IDX - 1) { + equix_result result = equix_verify(ctx, &nonce, sizeof(nonce), &solution[0]); + valid_count += result == EQUIX_OK; + } + else { + for (int i = start; i < EQUIX_NUM_IDX; ++i) { + SWAP_IDX(solution[0].idx[start], solution[0].idx[i]); + permute_idx(start + 1); + SWAP_IDX(solution[0].idx[start], solution[0].idx[i]); + } + } +} + +static bool test_permutations() { + permute_idx(0); + assert(valid_count == 1); /* check that only one of the 40320 possible + permutations of indices is a valid solution */ + return true; +} + +#define RUN_TEST(x) run_test(#x, &x) + +static void run_test(const char* name, test_func* func) { + printf("[%2i] %-40s ... ", ++test_no, name); + printf(func() ? "PASSED\n" : "SKIPPED\n"); +} + +int main() { + RUN_TEST(test_alloc); + RUN_TEST(test_solve); + RUN_TEST(test_verify1); + RUN_TEST(test_verify2); + RUN_TEST(test_verify3); + RUN_TEST(test_verify4); + RUN_TEST(test_permutations); + RUN_TEST(test_free); + + printf("\nAll tests were successful\n"); + return 0; +}