diff --git a/src/aes_hash.cpp b/src/aes_hash.cpp index 1aff37f..a3b7395 100644 --- a/src/aes_hash.cpp +++ b/src/aes_hash.cpp @@ -239,3 +239,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; + const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; + + // initial state + rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0); + rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1); + rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2); + rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3); + + const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0); + const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1); + const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2); + const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3); + + rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0); + rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1); + rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2); + rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3); + + constexpr int PREFETCH_DISTANCE = 4096; + const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE; + scratchpadEnd -= PREFETCH_DISTANCE; + + for (int i = 0; i < 2; ++i) { + //process 64 bytes at a time in 4 lanes + while (scratchpadPtr < scratchpadEnd) { + hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); + hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); + hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); + hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); + + fill_state0 = aesdec(fill_state0, key0); + fill_state1 = aesenc(fill_state1, key1); + fill_state2 = aesdec(fill_state2, key2); + fill_state3 = aesenc(fill_state3, key3); + + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); + + rx_prefetch_t0(prefetchPtr); + + scratchpadPtr += 64; + prefetchPtr += 64; + } + prefetchPtr = (const char*) scratchpad; + scratchpadEnd += PREFETCH_DISTANCE; + } + + rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3); + + //two extra rounds to achieve full diffusion + rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1); + + hash_state0 = aesenc(hash_state0, xkey0); + hash_state1 = aesdec(hash_state1, xkey0); + hash_state2 = aesenc(hash_state2, xkey0); + hash_state3 = aesdec(hash_state3, xkey0); + + hash_state0 = aesenc(hash_state0, xkey1); + hash_state1 = aesdec(hash_state1, xkey1); + hash_state2 = aesenc(hash_state2, xkey1); + hash_state3 = aesdec(hash_state3, xkey1); + + //output hash + rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); +} + +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/aes_hash.hpp b/src/aes_hash.hpp index b4d0e94..9f75f73 100644 --- a/src/aes_hash.hpp +++ b/src/aes_hash.hpp @@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/intrin_portable.h b/src/intrin_portable.h index b5ad91a..c9d4475 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128; #define rx_aligned_alloc(a, b) _mm_malloc(a,b) #define rx_aligned_free(a) _mm_free(a) #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) +#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0) #define rx_load_vec_f128 _mm_load_pd #define rx_store_vec_f128 _mm_store_pd @@ -201,6 +202,7 @@ typedef union{ #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) /* Splat 64-bit long long to 2 64-bit long longs */ FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) @@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) { asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); } +inline void rx_prefetch_t0(const void* ptr) { + asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); +} + FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { return vld1q_f64((const float64_t*)pd); } @@ -532,6 +538,7 @@ typedef union { #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { rx_vec_f128 x; diff --git a/src/randomx.cpp b/src/randomx.cpp index 90fc46a..c4281b3 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -363,4 +363,21 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize) { + blake2b(tempHash, sizeof(uint64_t) * 8, input, inputSize, nullptr, 0); + machine->initScratchpad(tempHash); + } + + void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output) { + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { + machine->run(tempHash); + blake2b(tempHash, sizeof(uint64_t) * 8, machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(tempHash); + + // Finish current hash and fill the scratchpad for the next hash at the same time + blake2b(tempHash, sizeof(uint64_t) * 8, nextInput, nextInputSize, nullptr, 0); + machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); + } } diff --git a/src/randomx.h b/src/randomx.h index c06002b..4a7fcbf 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_H #include +#include #define RANDOMX_HASH_SIZE 32 #define RANDOMX_DATASET_ITEM_SIZE 64 @@ -238,6 +239,21 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); */ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); +/** + * Paired functions used to calculate multiple RandomX hashes during mining for example. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param tempHash an array of 8 64-bit values used to store intermediate data between calls to randomx_calculate_hash_first and randomx_calculate_hash_next. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL. + * @param nextInputSize is the number of bytes to be hashed for the next hash. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize); +RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output); + #if defined(__cplusplus) } #endif diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 03023ab..868dd28 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -122,11 +122,16 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result void* noncePtr = blockTemplate + 39; auto nonce = atomicNonce.fetch_add(1); + uint64_t tempHash[8]; + + store32(noncePtr, nonce); + randomx_calculate_hash_first(vm, tempHash, blockTemplate, sizeof(blockTemplate)); + while (nonce < noncesCount) { + nonce = atomicNonce.fetch_add(1); store32(noncePtr, nonce); - randomx_calculate_hash(vm, blockTemplate, sizeof(blockTemplate), &hash); + randomx_calculate_hash_next(vm, tempHash, blockTemplate, sizeof(blockTemplate), &hash); result.xorWith(hash); - nonce = atomicNonce.fetch_add(1); } } diff --git a/src/virtual_machine.cpp b/src/virtual_machine.cpp index d73a024..2d5d2be 100644 --- a/src/virtual_machine.cpp +++ b/src/virtual_machine.cpp @@ -120,6 +120,12 @@ namespace randomx { blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); } + template + void VmBase::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) { + hashAndFillAes1Rx4((void*) getScratchpad(), ScratchpadSize, ®.a, fill_state); + blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + } + template void VmBase::initScratchpad(void* seed) { fillAes1Rx4(seed, ScratchpadSize, scratchpad); diff --git a/src/virtual_machine.hpp b/src/virtual_machine.hpp index d662c89..4e89366 100644 --- a/src/virtual_machine.hpp +++ b/src/virtual_machine.hpp @@ -38,6 +38,7 @@ public: virtual ~randomx_vm() = 0; virtual void allocate() = 0; virtual void getFinalResult(void* out, size_t outSize) = 0; + virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -78,6 +79,7 @@ namespace randomx { void allocate() override; void initScratchpad(void* seed) override; void getFinalResult(void* out, size_t outSize) override; + void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override; protected: void generateProgram(void* seed); };