diff --git a/CMakeLists.txt b/CMakeLists.txt index 0eeac94..1d4e827 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,8 @@ cmake_minimum_required(VERSION 2.8.7) set (randomx_sources src/aes_hash.cpp src/argon2_ref.c +src/argon2_sse3.c +src/argon2_avx2.c src/bytecode_machine.cpp src/dataset.cpp src/soft_aes.cpp @@ -103,6 +105,14 @@ if (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL " else() # default build has hardware AES enabled (software AES can be selected at runtime) add_flag("-maes") + check_c_compiler_flag(-mssse3 HAVE_SSSE3) + if(HAVE_SSSE3) + set_source_files_properties(src/argon2_sse3.c COMPILE_FLAGS -mssse3) + endif() + check_c_compiler_flag(-mavx2 HAVE_AVX2) + if(HAVE_AVX2) + set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2) + endif() endif() endif() diff --git a/src/argon2.h b/src/argon2.h index 9d42715..1734641 100644 --- a/src/argon2.h +++ b/src/argon2.h @@ -227,3 +227,35 @@ typedef enum Argon2_version { ARGON2_VERSION_13 = 0x13, ARGON2_VERSION_NUMBER = ARGON2_VERSION_13 } argon2_version; + +//Argon2 instance - forward declaration +typedef struct Argon2_instance_t argon2_instance_t; + +//Argon2 position = forward declaration +typedef struct Argon2_position_t argon2_position_t; + +//Argon2 implementation function +typedef void randomx_argon2_impl(const argon2_instance_t* instance, + argon2_position_t position); + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param context current context + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl *randomx_argon2_impl_sse3(); +randomx_argon2_impl *randomx_argon2_impl_avx2(); + +#if defined(__cplusplus) +} +#endif diff --git a/src/argon2_avx2.c b/src/argon2_avx2.c new file mode 100644 index 0000000..2135303 --- /dev/null +++ b/src/argon2_avx2.c @@ -0,0 +1,174 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +#include "argon2.h" + +void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_avx2() { +#if defined(__AVX2__) + return &randomx_argon2_fill_segment_avx2; +#endif + return NULL; +} + +#if defined(__AVX2__) + +#include "argon2_core.h" + +#include "blake2/blamka-round-avx2.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +static void fill_block(__m256i* state, const block* ref_block, + block* next_block, int with_xor) { + __m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + state[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); + block_XY[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i)); + } + } + else { + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm256_xor_si256( + state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); + } + } + + for (i = 0; i < 4; ++i) { + BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5], + state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]); + } + + for (i = 0; i < 4; ++i) { + BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i], + state[16 + i], state[20 + i], state[24 + i], state[28 + i]); + } + + for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { + state[i] = _mm256_xor_si256(state[i], block_XY[i]); + _mm256_storeu_si256((__m256i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, + argon2_position_t position) { + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m256i state[ARGON2_HWORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset %
instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif diff --git a/src/argon2_core.c b/src/argon2_core.c index 977705e..a70f9dc 100644 --- a/src/argon2_core.c +++ b/src/argon2_core.c @@ -70,18 +70,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /***************Instance and Position constructors**********/ -void rxa2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } - -void rxa2_copy_block(block *dst, const block *src) { - memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); -} - -void rxa2_xor_block(block *dst, const block *src) { - int i; - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { - dst->v[i] ^= src->v[i]; - } -} static void load_block(block *dst, const void *input) { unsigned i; @@ -97,69 +85,7 @@ static void store_block(void *output, const block *src) { } } -/***************Memory functions*****************/ - -int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, - size_t num, size_t size) { - size_t memory_size = num * size; - if (memory == NULL) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - /* 1. Check for multiplication overflow */ - if (size != 0 && memory_size / size != num) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - /* 2. Try to allocate with appropriate allocator */ - if (context->allocate_cbk) { - (context->allocate_cbk)(memory, memory_size); - } - else { - *memory = (uint8_t*)malloc(memory_size); - } - - if (*memory == NULL) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - return ARGON2_OK; -} - -void rxa2_free_memory(const argon2_context *context, uint8_t *memory, - size_t num, size_t size) { - size_t memory_size = num * size; - rxa2_clear_internal_memory(memory, memory_size); - if (context->free_cbk) { - (context->free_cbk)(memory, memory_size); - } - else { - free(memory); - } -} - -void NOT_OPTIMIZED rxa2_secure_wipe_memory(void *v, size_t n) { -#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER) - SecureZeroMemory(v, n); -#elif defined memset_s - memset_s(v, n, 0, n); -#elif defined(__OpenBSD__) - explicit_bzero(v, n); -#else - static void *(*const volatile memset_sec)(void *, int, size_t) = &memset; - memset_sec(v, 0, n); -#endif -} - -/* Memory clear flag defaults to true. 
*/ -#define FLAG_clear_internal_memory 0 -void rxa2_clear_internal_memory(void *v, size_t n) { - if (FLAG_clear_internal_memory && v) { - rxa2_secure_wipe_memory(v, n); - } -} - -uint32_t rxa2_index_alpha(const argon2_instance_t *instance, +uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance, const argon2_position_t *position, uint32_t pseudo_rand, int same_lane) { /* @@ -241,24 +167,22 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) { for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { for (l = 0; l < instance->lanes; ++l) { argon2_position_t position = { r, l, (uint8_t)s, 0 }; - rxa2_fill_segment(instance, position); + //fill the segment using the selected implementation + instance->impl(instance, position); } } -#ifdef GENKAT - internal_kat(instance, r); /* Print all memory blocks */ -#endif } return ARGON2_OK; } -int rxa2_fill_memory_blocks(argon2_instance_t *instance) { +int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) { if (instance == NULL || instance->lanes == 0) { return ARGON2_INCORRECT_PARAMETER; } return fill_memory_blocks_st(instance); } -int rxa2_validate_inputs(const argon2_context *context) { +int randomx_argon2_validate_inputs(const argon2_context *context) { if (NULL == context) { return ARGON2_INCORRECT_PARAMETER; } @@ -394,7 +318,6 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc load_block(&instance->memory[l * instance->lane_length + 1], blockhash_bytes); } - rxa2_clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); } void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) { @@ -431,11 +354,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type if (context->pwd != NULL) { blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, context->pwdlen); - - if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { - rxa2_secure_wipe_memory(context->pwd, context->pwdlen); - context->pwdlen = 0; - } } store32(&value, context->saltlen); @@ -451,11 +369,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type if (context->secret != NULL) { blake2b_update(&BlakeHash, (const uint8_t *)context->secret, context->secretlen); - - if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { - rxa2_secure_wipe_memory(context->secret, context->secretlen); - context->secretlen = 0; - } } store32(&value, context->adlen); @@ -469,7 +382,7 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); } -int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) { +int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) { uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; int result = ARGON2_OK; @@ -478,10 +391,7 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) instance->context_ptr = context; /* 1. Memory allocation */ - /*result = allocate_memory(context, (uint8_t **)&(instance->memory), instance->memory_blocks, sizeof(block)); - if (result != ARGON2_OK) { - return result; - }*/ + //RandomX takes care of memory allocation /* 2. 
Initial hashing */ /* H_0 + 8 extra bytes to produce the first blocks */ @@ -489,15 +399,13 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) /* Hashing all inputs */ rxa2_initial_hash(blockhash, context, instance->type); /* Zeroing 8 extra bytes */ - rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, + /*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, ARGON2_PREHASH_SEED_LENGTH - - ARGON2_PREHASH_DIGEST_LENGTH); + ARGON2_PREHASH_DIGEST_LENGTH);*/ /* 3. Creating first blocks, we always have at least two blocks in a slice */ rxa2_fill_first_blocks(blockhash, instance); - /* Clearing the hash */ - rxa2_clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); return ARGON2_OK; } diff --git a/src/argon2_core.h b/src/argon2_core.h index efd56d9..def27c6 100644 --- a/src/argon2_core.h +++ b/src/argon2_core.h @@ -73,17 +73,6 @@ enum argon2_core_constants { */ typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; -/*****************Functions that work with the block******************/ - -/* Initialize each byte of the block with @in */ -void rxa2_init_block_value(block *b, uint8_t in); - -/* Copy block @src to block @dst */ -void rxa2_copy_block(block *dst, const block *src); - -/* XOR @src onto @dst bytewise */ -void rxa2_xor_block(block *dst, const block *src); - /* * Argon2 instance: memory pointer, number of passes, amount of memory, type, * and derived values. @@ -102,6 +91,7 @@ typedef struct Argon2_instance_t { argon2_type type; int print_internals; /* whether to print the memory blocks */ argon2_context *context_ptr; /* points back to original context */ + randomx_argon2_impl *impl; } argon2_instance_t; /* @@ -123,42 +113,6 @@ typedef struct Argon2_thread_data { /*************************Argon2 core functions********************************/ -/* Allocates memory to the given pointer, uses the appropriate allocator as - * specified in the context. Total allocated memory is num*size. - * @param context argon2_context which specifies the allocator - * @param memory pointer to the pointer to the memory - * @param size the size in bytes for each element to be allocated - * @param num the number of elements to be allocated - * @return ARGON2_OK if @memory is a valid pointer and memory is allocated - */ -int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, - size_t num, size_t size); - -/* - * Frees memory at the given pointer, uses the appropriate deallocator as - * specified in the context. Also cleans the memory using clear_internal_memory. - * @param context argon2_context which specifies the deallocator - * @param memory pointer to buffer to be freed - * @param size the size in bytes for each element to be deallocated - * @param num the number of elements to be deallocated - */ -void rxa2_free_memory(const argon2_context *context, uint8_t *memory, - size_t num, size_t size); - -/* Function that securely cleans the memory. This ignores any flags set - * regarding clearing memory. Usually one just calls clear_internal_memory. - * @param mem Pointer to the memory - * @param s Memory size in bytes - */ -void rxa2_secure_wipe_memory(void *v, size_t n); - -/* Function that securely clears the memory if FLAG_clear_internal_memory is - * set. If the flag isn't set, this function does nothing. 
- * @param mem Pointer to the memory - * @param s Memory size in bytes - */ -void rxa2_clear_internal_memory(void *v, size_t n); - /* * Computes absolute position of reference block in the lane following a skewed * distribution and using a pseudo-random value as input @@ -169,7 +123,7 @@ void rxa2_clear_internal_memory(void *v, size_t n); * If so we can reference the current segment * @pre All pointers must be valid */ -uint32_t rxa2_index_alpha(const argon2_instance_t *instance, +uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance, const argon2_position_t *position, uint32_t pseudo_rand, int same_lane); @@ -180,28 +134,7 @@ uint32_t rxa2_index_alpha(const argon2_instance_t *instance, * @return ARGON2_OK if everything is all right, otherwise one of error codes * (all defined in */ -int rxa2_validate_inputs(const argon2_context *context); - -/* - * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears - * password and secret if needed - * @param context Pointer to the Argon2 internal structure containing memory - * pointer, and parameters for time and space requirements. - * @param blockhash Buffer for pre-hashing digest - * @param type Argon2 type - * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes - * allocated - */ -void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, - argon2_type type); - -/* - * Function creates first 2 blocks per lane - * @param instance Pointer to the current instance - * @param blockhash Pointer to the pre-hashing digest - * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values - */ -void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); +int randomx_argon2_validate_inputs(const argon2_context *context); /* * Function allocates memory, hashes the inputs with Blake, and creates first @@ -213,31 +146,7 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc * @return Zero if successful, -1 if memory failed to allocate. @context->state * will be modified if successful. */ -int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context); - -/* - * XORing the last block of each lane, hashing it, making the tag. Deallocates - * the memory. 
- * @param context Pointer to current Argon2 context (use only the out parameters - * from it) - * @param instance Pointer to current instance of Argon2 - * @pre instance->state must point to necessary amount of memory - * @pre context->out must point to outlen bytes of memory - * @pre if context->free_cbk is not NULL, it should point to a function that - * deallocates memory - */ -void rxa2_finalize(const argon2_context *context, argon2_instance_t *instance); - -/* - * Function that fills the segment using previous segments also from other - * threads - * @param context current context - * @param instance Pointer to the current instance - * @param position Current position - * @pre all block pointers must be valid - */ -void rxa2_fill_segment(const argon2_instance_t *instance, - argon2_position_t position); +int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context); /* * Function that fills the entire memory t_cost times based on the first two @@ -245,7 +154,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, * @param instance Pointer to the current instance * @return ARGON2_OK if successful, @context->state */ -int rxa2_fill_memory_blocks(argon2_instance_t *instance); +int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance); #if defined(__cplusplus) } diff --git a/src/argon2_ref.c b/src/argon2_ref.c index 018b985..dc4a804 100644 --- a/src/argon2_ref.c +++ b/src/argon2_ref.c @@ -43,6 +43,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "blake2/blake2-impl.h" #include "blake2/blake2.h" +static void copy_block(block* dst, const block* src) { + memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); +} + +static void xor_block(block* dst, const block* src) { + int i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] ^= src->v[i]; + } +} + /* * Function fills a new memory block and optionally XORs the old block over the new one. * @next_block must be initialized. 
@@ -57,13 +68,13 @@ static void fill_block(const block *prev_block, const block *ref_block, block blockR, block_tmp; unsigned i; - rxa2_copy_block(&blockR, ref_block); - rxa2_xor_block(&blockR, prev_block); - rxa2_copy_block(&block_tmp, &blockR); + copy_block(&blockR, ref_block); + xor_block(&blockR, prev_block); + copy_block(&block_tmp, &blockR); /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */ if (with_xor) { /* Saving the next block contents for XOR over: */ - rxa2_xor_block(&block_tmp, next_block); + xor_block(&block_tmp, next_block); /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block + next_block */ } @@ -92,18 +103,11 @@ static void fill_block(const block *prev_block, const block *ref_block, blockR.v[2 * i + 113]); } - rxa2_copy_block(next_block, &block_tmp); - rxa2_xor_block(next_block, &blockR); -} - -static void next_addresses(block *address_block, block *input_block, - const block *zero_block) { - input_block->v[6]++; - fill_block(zero_block, input_block, address_block, 0); - fill_block(zero_block, address_block, address_block, 0); + copy_block(next_block, &block_tmp); + xor_block(next_block, &blockR); } -void rxa2_fill_segment(const argon2_instance_t *instance, +void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance, argon2_position_t position) { block *ref_block = NULL, *curr_block = NULL; block address_block, input_block, zero_block; @@ -111,38 +115,15 @@ void rxa2_fill_segment(const argon2_instance_t *instance, uint32_t prev_offset, curr_offset; uint32_t starting_index; uint32_t i; - int data_independent_addressing; if (instance == NULL) { return; } - data_independent_addressing = - (instance->type == Argon2_i) || - (instance->type == Argon2_id && (position.pass == 0) && - (position.slice < ARGON2_SYNC_POINTS / 2)); - - if (data_independent_addressing) { - rxa2_init_block_value(&zero_block, 0); - rxa2_init_block_value(&input_block, 0); - - input_block.v[0] = position.pass; - input_block.v[1] = position.lane; - input_block.v[2] = position.slice; - input_block.v[3] = instance->memory_blocks; - input_block.v[4] = instance->passes; - input_block.v[5] = instance->type; - } - starting_index = 0; if ((0 == position.pass) && (0 == position.slice)) { starting_index = 2; /* we have already generated the first two blocks */ - - /* Don't forget to generate the first block of addresses: */ - if (data_independent_addressing) { - next_addresses(&address_block, &input_block, &zero_block); - } } /* Offset of the current block */ @@ -167,15 +148,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, /* 1.2 Computing the index of the reference block */ /* 1.2.1 Taking pseudo-random value from the previous block */ - if (data_independent_addressing) { - if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { - next_addresses(&address_block, &input_block, &zero_block); - } - pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; - } - else { - pseudo_rand = instance->memory[prev_offset].v[0]; - } + pseudo_rand = instance->memory[prev_offset].v[0]; /* 1.2.2 Computing the lane of the reference block */ ref_lane = ((pseudo_rand >> 32)) % instance->lanes; @@ -189,7 +162,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, * lane. 
*/ position.index = i; - ref_index = rxa2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, ref_lane == position.lane); /* 2 Creating a new block */ diff --git a/src/argon2_sse3.c b/src/argon2_sse3.c new file mode 100644 index 0000000..5a980ef --- /dev/null +++ b/src/argon2_sse3.c @@ -0,0 +1,182 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +#include "argon2.h" + +#if defined(_MSC_VER) //MSVC doesn't define SSSE3 +#define __SSSE3__ +#endif + +void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_sse3() { +#if defined(__SSSE3__) + return &randomx_argon2_fill_segment_sse3; +#endif + return NULL; +} + +#if defined(__SSSE3__) + +#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ + +#include "argon2_core.h" + +#include "blake2/blamka-round-sse3.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +static void fill_block(__m128i* state, const block* ref_block, + block* next_block, int with_xor) { + __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); + block_XY[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)next_block->v + i)); + } + } + else { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); + } + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], + state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], + state[8 * i + 6], state[8 * i + 7]); + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], + state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], + state[8 * 6 + i], state[8 * 7 + i]); + } + + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128(state[i], block_XY[i]); + _mm_storeu_si128((__m128i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance, + argon2_position_t position) { + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m128i state[ARGON2_OWORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) &&
(position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif diff --git a/src/blake2/blamka-round-avx2.h b/src/blake2/blamka-round-avx2.h new file mode 100644 index 0000000..4838261 --- /dev/null +++ b/src/blake2/blamka-round-avx2.h @@ -0,0 +1,189 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#ifdef __GNUC__ +#include +#else +#include +#endif + +#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) +#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) +#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) +#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) + +#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + __m256i ml = _mm256_mul_epu32(A0, B0); \ + ml = _mm256_add_epi64(ml, ml); \ + A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ + D0 = _mm256_xor_si256(D0, A0); \ + D0 = rotr32(D0); \ + \ + ml = _mm256_mul_epu32(C0, D0); \ + ml = _mm256_add_epi64(ml, ml); \ + C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ + \ + B0 = _mm256_xor_si256(B0, C0); \ + B0 = rotr24(B0); \ + \ + ml = _mm256_mul_epu32(A1, B1); \ + ml = _mm256_add_epi64(ml, ml); \ + A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ + D1 = _mm256_xor_si256(D1, A1); \ + D1 = rotr32(D1); \ + \ + ml = _mm256_mul_epu32(C1, D1); \ + ml = _mm256_add_epi64(ml, ml); \ + C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ + \ + B1 = _mm256_xor_si256(B1, C1); \ + B1 = rotr24(B1); \ + } while((void)0, 0); + +#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + __m256i ml = _mm256_mul_epu32(A0, B0); \ + ml = _mm256_add_epi64(ml, ml); \ + A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ + D0 = _mm256_xor_si256(D0, A0); \ + D0 = rotr16(D0); \ + \ + ml = _mm256_mul_epu32(C0, D0); \ + ml = _mm256_add_epi64(ml, ml); \ + C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ + B0 = _mm256_xor_si256(B0, C0); \ + B0 = rotr63(B0); \ + \ + ml = _mm256_mul_epu32(A1, B1); \ + ml = _mm256_add_epi64(ml, ml); \ + A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ + D1 = _mm256_xor_si256(D1, A1); \ + D1 = rotr16(D1); \ + \ + ml = _mm256_mul_epu32(C1, D1); \ + ml = _mm256_add_epi64(ml, ml); \ + C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ + B1 = _mm256_xor_si256(B1, C1); \ + B1 = rotr63(B1); \ + } while((void)0, 0); + +#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ + C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ + D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ + \ + B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ + C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ + D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ + } while((void)0, 0); + +#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ + __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ + B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + \ + tmp1 = C0; \ + C0 = C1; \ + C1 = tmp1; \ + \ + tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ + tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ + D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + D1 = 
_mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + } while(0); + +#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ + C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ + D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ + \ + B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ + C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ + D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ + __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ + B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + \ + tmp1 = C0; \ + C0 = C1; \ + C1 = tmp1; \ + \ + tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ + tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ + D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + } while((void)0, 0); + +#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ + do{ \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ + } while((void)0, 0); + +#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + do{ \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ + \ + UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ + } while((void)0, 0); + +#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/src/blake2/blamka-round-sse3.h b/src/blake2/blamka-round-sse3.h new file mode 100644 index 0000000..1fa5595 --- /dev/null +++ b/src/blake2/blamka-round-sse3.h @@ -0,0 +1,158 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#ifdef __GNUC__ +#include +#else +#include +#endif + +#define r16 \ + (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) +#define r24 \ + (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) \ + ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ + : (-(c) == 24) \ + ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) \ + ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) \ + ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_slli_epi64((x), 64 - (-(c)))) + +static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) { + const __m128i z = _mm_mul_epu32(x, y); + return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); +} + +#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -32); \ + D1 = _mm_roti_epi64(D1, -32); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -24); \ + B1 = _mm_roti_epi64(B1, -24); \ + } while ((void)0, 0) + +#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -16); \ + D1 = _mm_roti_epi64(D1, -16); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -63); \ + B1 = _mm_roti_epi64(B1, -63); \ + } while ((void)0, 0) + +#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ + __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D1, D0, 8); \ + t1 = _mm_alignr_epi8(D0, D1, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) + +#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ + __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D0, D1, 8); \ + t1 = _mm_alignr_epi8(D1, D0, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) + +#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, 
C1, D1); \ + \ + UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + } while ((void)0, 0) + + +#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/src/dataset.cpp b/src/dataset.cpp index 59aef62..675c5ab 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -92,7 +92,7 @@ namespace randomx { context.flags = ARGON2_DEFAULT_FLAGS; context.version = ARGON2_VERSION_NUMBER; - int inputsValid = rxa2_validate_inputs(&context); + int inputsValid = randomx_argon2_validate_inputs(&context); assert(inputsValid == ARGON2_OK); /* 2. Align memory size */ @@ -111,6 +111,7 @@ namespace randomx { instance.threads = context.threads; instance.type = Argon2_d; instance.memory = (block*)cache->memory; + instance.impl = cache->argonImpl; if (instance.threads > instance.lanes) { instance.threads = instance.lanes; @@ -119,9 +120,9 @@ namespace randomx { /* 3. Initialization: Hashing inputs, allocating memory, filling first * blocks */ - rxa2_argon_initialize(&instance, &context); + randomx_argon2_initialize(&instance, &context); - rxa2_fill_memory_blocks(&instance); + randomx_argon2_fill_memory_blocks(&instance); cache->reciprocalCache.clear(); randomx::Blake2Generator gen(key, keySize); diff --git a/src/dataset.hpp b/src/dataset.hpp index d80945c..5a0ea4d 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.hpp" #include "superscalar_program.hpp" #include "allocator.hpp" +#include "argon2.h" /* Global scope for C binding */ struct randomx_dataset { @@ -51,6 +52,7 @@ struct randomx_cache { randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; std::vector reciprocalCache; std::string cacheKey; + randomx_argon2_impl* argonImpl; bool isInitialized() { return programs[0].getSize() != 0; @@ -79,4 +81,21 @@ namespace randomx { void initCacheCompile(randomx_cache*, const void*, size_t); void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber); void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); + + inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) { + if ((flags & RANDOMX_FLAG_ARGON2) == 0) { + return &randomx_argon2_fill_segment_ref; + } + randomx_argon2_impl* impl = nullptr; + if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_SSE3) { + impl = randomx_argon2_impl_sse3(); + } + if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_AVX2) { + impl = randomx_argon2_impl_avx2(); + } + if (impl != nullptr) { + return impl; + } + throw std::runtime_error("Unsupported Argon2 implementation"); + } } diff --git a/src/randomx.cpp b/src/randomx.cpp index 9c743d7..c399786 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -39,10 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
extern "C" { randomx_cache *randomx_alloc_cache(randomx_flags flags) { - randomx_cache *cache; + randomx_cache *cache = nullptr; try { cache = new randomx_cache(); + cache->argonImpl = randomx::selectArgonImpl(flags); switch (flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)) { case RANDOMX_FLAG_DEFAULT: cache->dealloc = &randomx::deallocCache; @@ -103,7 +104,9 @@ extern "C" { void randomx_release_cache(randomx_cache* cache) { assert(cache != nullptr); - cache->dealloc(cache); + if (cache->memory != nullptr) { + cache->dealloc(cache); + } delete cache; } @@ -114,7 +117,7 @@ extern "C" { return nullptr; } - randomx_dataset *dataset; + randomx_dataset *dataset = nullptr; try { dataset = new randomx_dataset(); diff --git a/src/randomx.h b/src/randomx.h index 5054c23..1fe3266 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -44,7 +44,10 @@ typedef enum { RANDOMX_FLAG_HARD_AES = 2, RANDOMX_FLAG_FULL_MEM = 4, RANDOMX_FLAG_JIT = 8, - RANDOMX_FLAG_SECURE = 16 + RANDOMX_FLAG_SECURE = 16, + RANDOMX_FLAG_ARGON2_SSE3 = 32, + RANDOMX_FLAG_ARGON2_AVX2 = 64, + RANDOMX_FLAG_ARGON2 = 96 } randomx_flags; typedef struct randomx_dataset randomx_dataset; @@ -62,10 +65,17 @@ extern "C" { * RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages * RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes * subsequent Dataset initialization faster + * Optionally, one of these two flags may be selected: + * RANDOMX_FLAG_ARGON2_SSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set + * makes subsequent cache initialization faster + * RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set + * makes subsequent cache initialization faster * * @return Pointer to an allocated randomx_cache structure. - * NULL is returned if memory allocation fails or if the RANDOMX_FLAG_JIT - * is set and JIT compilation is not supported on the current platform. + * Returns NULL if: + * (1) memory allocation fails + * (2) the RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform + * (3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set */ RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags); diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index b4120ae..081f70b 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "stopwatch.hpp" #include "utility.hpp" #include "../randomx.h" +#include "../dataset.hpp" #include "../blake2/endian.h" #include "../common.hpp" #ifdef _WIN32 @@ -90,6 +91,8 @@ void printUsage(const char* executable) { std::cout << " --init Q initialize dataset with Q threads (default: 1)" << std::endl; std::cout << " --nonces N run N nonces (default: 1000)" << std::endl; std::cout << " --seed S seed for cache initialization (default: 0)" << std::endl; + std::cout << " --sse3 use optimized Argon2 for SSSE3 CPUs" << std::endl; + std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; } struct MemoryException : public std::exception { @@ -127,7 +130,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, sse3, avx2; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; int32_t seedValue; @@ -148,6 +151,8 @@ int main(int argc, char** argv) { readOption("--jit", argc, argv, jit); readOption("--help", argc, argv, help); readOption("--secure", argc, argv, secure); + readOption("--sse3", argc, argv, sse3); + readOption("--avx2", argc, argv, avx2); store32(&seed, seedValue); @@ -166,6 +171,16 @@ int main(int argc, char** argv) { randomx_cache* cache; randomx_flags flags = RANDOMX_FLAG_DEFAULT; + if (sse3) { + flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_SSE3); + std::cout << " - Argon2 implementation: SSE3" << std::endl; + } + + if (avx2) { + flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_AVX2); + std::cout << " - Argon2 implementation: AVX2" << std::endl; + } + if (miningMode) { flags = (randomx_flags)(flags | RANDOMX_FLAG_FULL_MEM); std::cout << " - full memory mode (2080 MiB)" << std::endl; @@ -213,6 +228,7 @@ int main(int argc, char** argv) { std::cout << " ..." << std::endl; try { + randomx::selectArgonImpl(flags); //just to check if flags are valid if (jit && !RANDOMX_HAVE_COMPILER) { throw std::runtime_error("JIT compilation is not supported on this platform. 
Try without --jit"); } diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 06bd56a..3b210a5 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -997,8 +997,9 @@ int main() { if (RANDOMX_HAVE_COMPILER) { randomx_release_cache(cache); - cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); randomx_destroy_vm(vm); + vm = nullptr; + cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); initCache("test key 000"); vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr); } @@ -1013,6 +1014,35 @@ int main() { runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e); + randomx_destroy_vm(vm); + vm = nullptr; + + randomx_release_cache(cache); + cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_SSE3); + + runTest("Cache initialization: SSSE3", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + initCache("test key 000"); + uint64_t* cacheMemory = (uint64_t*)cache->memory; + assert(cacheMemory[0] == 0x191e0e1d23c02186); + assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); + assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); + }); + + if (cache != nullptr) + randomx_release_cache(cache); + cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX2); + + runTest("Cache initialization: AVX2", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + initCache("test key 000"); + uint64_t* cacheMemory = (uint64_t*)cache->memory; + assert(cacheMemory[0] == 0x191e0e1d23c02186); + assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); + assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); + }); + + if (cache != nullptr) + randomx_release_cache(cache); + std::cout << std::endl << "All tests PASSED" << std::endl; if (skipped) { diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index 9d8e794..a0287b0 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -114,6 +114,7 @@ true AssemblyCode _MBCS;NDEBUG;%(PreprocessorDefinitions) + AdvancedVectorExtensions2 true @@ -131,8 +132,10 @@ SET ERRORLEVEL = 0 + + @@ -163,7 +166,9 @@ SET ERRORLEVEL = 0 + + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index 12f6187..56ea326 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -81,6 +81,12 @@ Source Files + + Source Files + + + Source Files + @@ -185,6 +191,12 @@ Header Files + + Header Files + + + Header Files +