Optimized Argon2 (SSSE3/AVX2)

pr-argon2
tevador 5 years ago
parent 298cc77095
commit 900a936816

@@ -31,6 +31,8 @@ cmake_minimum_required(VERSION 2.8.7)
set (randomx_sources
src/aes_hash.cpp
src/argon2_ref.c
src/argon2_sse3.c
src/argon2_avx2.c
src/bytecode_machine.cpp
src/dataset.cpp
src/soft_aes.cpp
@@ -103,6 +105,14 @@ if (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "
else()
# default build has hardware AES enabled (software AES can be selected at runtime)
add_flag("-maes")
check_c_compiler_flag(-mssse3 HAVE_SSSE3)
if(HAVE_SSSE3)
set_source_files_properties(src/argon2_sse3.c COMPILE_FLAGS -mssse3)
endif()
check_c_compiler_flag(-mavx2 HAVE_AVX2)
if(HAVE_AVX2)
set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2)
endif()
endif()
endif()

@@ -227,3 +227,35 @@ typedef enum Argon2_version {
ARGON2_VERSION_13 = 0x13,
ARGON2_VERSION_NUMBER = ARGON2_VERSION_13
} argon2_version;
//Argon2 instance - forward declaration
typedef struct Argon2_instance_t argon2_instance_t;
//Argon2 position - forward declaration
typedef struct Argon2_position_t argon2_position_t;
//Argon2 implementation function
typedef void randomx_argon2_impl(const argon2_instance_t* instance,
argon2_position_t position);
#if defined(__cplusplus)
extern "C" {
#endif
/*
* Function that fills the segment using previous segments also from other
* threads
* @param instance Pointer to the current instance
* @param position Current position
* @pre all block pointers must be valid
*/
void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl *randomx_argon2_impl_sse3();
randomx_argon2_impl *randomx_argon2_impl_avx2();
#if defined(__cplusplus)
}
#endif
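
The factory functions return a pointer to the optimized routine only when the translation unit was compiled with the matching instruction set, and NULL otherwise, so callers can probe for support at runtime. A minimal sketch of a dispatch chain built on top of this (illustrative only; the library itself selects the implementation from explicit flags, and instance/position are assumed to be set up elsewhere):

randomx_argon2_impl* impl = randomx_argon2_impl_avx2(); /* NULL unless built with AVX2 */
if (impl == NULL)
    impl = randomx_argon2_impl_sse3(); /* NULL unless built with SSSE3 */
if (impl == NULL)
    impl = &randomx_argon2_fill_segment_ref; /* portable fallback */
impl(instance, position); /* fill one segment */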

@@ -0,0 +1,174 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_avx2() {
#if defined(__AVX2__)
return &randomx_argon2_fill_segment_avx2;
#endif
return NULL;
}
#if defined(__AVX2__)
#include "argon2_core.h"
#include "blake2/blamka-round-avx2.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m256i* state, const block* ref_block,
block* next_block, int with_xor) {
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
block_XY[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
}
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_storeu_si256((__m256i*)next_block->v + i, state[i]);
}
}
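/* For reference, each BLAKE2_ROUND above is built from the BlaMka primitive,
 * which replaces BLAKE2b's modular addition with a + b + 2*aL*bL, where aL and
 * bL are the low 32 bits of the operands. This is what _mm256_mul_epu32 plus
 * the two additions compute per 64-bit lane; a scalar sketch (illustrative
 * helper, not used by this file): */
static uint64_t blamka_scalar(uint64_t x, uint64_t y) {
    uint64_t m = (uint64_t)(uint32_t)x * (uint32_t)y; /* low 32 x low 32 bits */
    return x + y + 2 * m;
}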
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m256i state[ARGON2_HWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
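/* Note: state holds the previous block and is updated in place by fill_block
 * on every iteration, so fill_block never has to reload the previous block
 * from memory (the reference implementation re-reads it via prev_offset
 * instead). */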
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference blocks within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

@@ -70,18 +70,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
/***************Instance and Position constructors**********/
void rxa2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
void rxa2_copy_block(block *dst, const block *src) {
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
void rxa2_xor_block(block *dst, const block *src) {
int i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] ^= src->v[i];
}
}
static void load_block(block *dst, const void *input) {
unsigned i;
@@ -97,69 +85,7 @@ static void store_block(void *output, const block *src) {
}
}
/***************Memory functions*****************/
int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory,
size_t num, size_t size) {
size_t memory_size = num * size;
if (memory == NULL) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
/* 1. Check for multiplication overflow */
if (size != 0 && memory_size / size != num) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
/* 2. Try to allocate with appropriate allocator */
if (context->allocate_cbk) {
(context->allocate_cbk)(memory, memory_size);
}
else {
*memory = (uint8_t*)malloc(memory_size);
}
if (*memory == NULL) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
return ARGON2_OK;
}
void rxa2_free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size) {
size_t memory_size = num * size;
rxa2_clear_internal_memory(memory, memory_size);
if (context->free_cbk) {
(context->free_cbk)(memory, memory_size);
}
else {
free(memory);
}
}
void NOT_OPTIMIZED rxa2_secure_wipe_memory(void *v, size_t n) {
#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER)
SecureZeroMemory(v, n);
#elif defined memset_s
memset_s(v, n, 0, n);
#elif defined(__OpenBSD__)
explicit_bzero(v, n);
#else
static void *(*const volatile memset_sec)(void *, int, size_t) = &memset;
memset_sec(v, 0, n);
#endif
}
/* Memory clear flag defaults to true. */
#define FLAG_clear_internal_memory 0
void rxa2_clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
rxa2_secure_wipe_memory(v, n);
}
}
uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane) {
/*
@@ -241,24 +167,22 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) {
for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
for (l = 0; l < instance->lanes; ++l) {
argon2_position_t position = { r, l, (uint8_t)s, 0 };
rxa2_fill_segment(instance, position);
//fill the segment using the selected implementation
instance->impl(instance, position);
}
}
#ifdef GENKAT
internal_kat(instance, r); /* Print all memory blocks */
#endif
}
return ARGON2_OK;
}
int rxa2_fill_memory_blocks(argon2_instance_t *instance) {
int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) {
if (instance == NULL || instance->lanes == 0) {
return ARGON2_INCORRECT_PARAMETER;
}
return fill_memory_blocks_st(instance);
}
int rxa2_validate_inputs(const argon2_context *context) {
int randomx_argon2_validate_inputs(const argon2_context *context) {
if (NULL == context) {
return ARGON2_INCORRECT_PARAMETER;
}
@@ -394,7 +318,6 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc
load_block(&instance->memory[l * instance->lane_length + 1],
blockhash_bytes);
}
rxa2_clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
}
void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) {
@@ -431,11 +354,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
if (context->pwd != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
context->pwdlen);
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
rxa2_secure_wipe_memory(context->pwd, context->pwdlen);
context->pwdlen = 0;
}
}
store32(&value, context->saltlen);
@@ -451,11 +369,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
if (context->secret != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->secret,
context->secretlen);
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
rxa2_secure_wipe_memory(context->secret, context->secretlen);
context->secretlen = 0;
}
}
store32(&value, context->adlen);
@@ -469,7 +382,7 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
}
int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) {
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) {
uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
int result = ARGON2_OK;
@@ -478,10 +391,7 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context)
instance->context_ptr = context;
/* 1. Memory allocation */
/*result = allocate_memory(context, (uint8_t **)&(instance->memory), instance->memory_blocks, sizeof(block));
if (result != ARGON2_OK) {
return result;
}*/
//RandomX takes care of memory allocation
/* 2. Initial hashing */
/* H_0 + 8 extra bytes to produce the first blocks */
@@ -489,15 +399,13 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context)
/* Hashing all inputs */
rxa2_initial_hash(blockhash, context, instance->type);
/* Zeroing 8 extra bytes */
rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
/*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
ARGON2_PREHASH_SEED_LENGTH -
ARGON2_PREHASH_DIGEST_LENGTH);
ARGON2_PREHASH_DIGEST_LENGTH);*/
/* 3. Creating first blocks, we always have at least two blocks in a slice
*/
rxa2_fill_first_blocks(blockhash, instance);
/* Clearing the hash */
rxa2_clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
return ARGON2_OK;
}

@@ -73,17 +73,6 @@ enum argon2_core_constants {
*/
typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block;
/*****************Functions that work with the block******************/
/* Initialize each byte of the block with @in */
void rxa2_init_block_value(block *b, uint8_t in);
/* Copy block @src to block @dst */
void rxa2_copy_block(block *dst, const block *src);
/* XOR @src onto @dst bytewise */
void rxa2_xor_block(block *dst, const block *src);
/*
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
* and derived values.
@@ -102,6 +91,7 @@ typedef struct Argon2_instance_t {
argon2_type type;
int print_internals; /* whether to print the memory blocks */
argon2_context *context_ptr; /* points back to original context */
randomx_argon2_impl *impl;
} argon2_instance_t;
/*
@@ -123,42 +113,6 @@ typedef struct Argon2_thread_data {
/*************************Argon2 core functions********************************/
/* Allocates memory to the given pointer, uses the appropriate allocator as
* specified in the context. Total allocated memory is num*size.
* @param context argon2_context which specifies the allocator
* @param memory pointer to the pointer to the memory
* @param size the size in bytes for each element to be allocated
* @param num the number of elements to be allocated
* @return ARGON2_OK if @memory is a valid pointer and memory is allocated
*/
int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory,
size_t num, size_t size);
/*
* Frees memory at the given pointer, uses the appropriate deallocator as
* specified in the context. Also cleans the memory using clear_internal_memory.
* @param context argon2_context which specifies the deallocator
* @param memory pointer to buffer to be freed
* @param size the size in bytes for each element to be deallocated
* @param num the number of elements to be deallocated
*/
void rxa2_free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size);
/* Function that securely cleans the memory. This ignores any flags set
* regarding clearing memory. Usually one just calls clear_internal_memory.
* @param mem Pointer to the memory
* @param s Memory size in bytes
*/
void rxa2_secure_wipe_memory(void *v, size_t n);
/* Function that securely clears the memory if FLAG_clear_internal_memory is
* set. If the flag isn't set, this function does nothing.
* @param mem Pointer to the memory
* @param s Memory size in bytes
*/
void rxa2_clear_internal_memory(void *v, size_t n);
/*
* Computes absolute position of reference block in the lane following a skewed
* distribution and using a pseudo-random value as input
@@ -169,7 +123,7 @@ void rxa2_clear_internal_memory(void *v, size_t n);
* If so we can reference the current segment
* @pre All pointers must be valid
*/
uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane);
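
The mapping follows the Argon2 specification: the 32-bit pseudo-random value is squared so that references are skewed toward more recent blocks. In outline (a sketch; reference_area_size and start_position are derived from position exactly as in the reference code):

uint64_t x = (uint64_t)pseudo_rand;
x = (x * x) >> 32; /* quadratic skew: small values of x are more likely */
uint64_t rel = reference_area_size - 1 - ((reference_area_size * x) >> 32);
uint32_t ref = (start_position + rel) % instance->lane_length; /* absolute index */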
@@ -180,28 +134,7 @@ uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
* @return ARGON2_OK if everything is all right, otherwise one of error codes
* (all defined in <argon2.h>)
*/
int rxa2_validate_inputs(const argon2_context *context);
/*
* Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
* password and secret if needed
* @param context Pointer to the Argon2 internal structure containing memory
* pointer, and parameters for time and space requirements.
* @param blockhash Buffer for pre-hashing digest
* @param type Argon2 type
* @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
* allocated
*/
void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context,
argon2_type type);
/*
* Function creates first 2 blocks per lane
* @param instance Pointer to the current instance
* @param blockhash Pointer to the pre-hashing digest
* @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
*/
void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
int randomx_argon2_validate_inputs(const argon2_context *context);
/*
* Function allocates memory, hashes the inputs with Blake, and creates first
@@ -213,31 +146,7 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc
* @return Zero if successful, -1 if memory failed to allocate. @context->state
* will be modified if successful.
*/
int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context);
/*
* XORing the last block of each lane, hashing it, making the tag. Deallocates
* the memory.
* @param context Pointer to current Argon2 context (use only the out parameters
* from it)
* @param instance Pointer to current instance of Argon2
* @pre instance->state must point to necessary amount of memory
* @pre context->out must point to outlen bytes of memory
* @pre if context->free_cbk is not NULL, it should point to a function that
* deallocates memory
*/
void rxa2_finalize(const argon2_context *context, argon2_instance_t *instance);
/*
* Function that fills the segment using previous segments also from other
* threads
* @param context current context
* @param instance Pointer to the current instance
* @param position Current position
* @pre all block pointers must be valid
*/
void rxa2_fill_segment(const argon2_instance_t *instance,
argon2_position_t position);
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context);
/*
* Function that fills the entire memory t_cost times based on the first two
@@ -245,7 +154,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
* @param instance Pointer to the current instance
* @return ARGON2_OK if successful, @context->state
*/
int rxa2_fill_memory_blocks(argon2_instance_t *instance);
int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance);
#if defined(__cplusplus)
}

@@ -43,6 +43,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void copy_block(block* dst, const block* src) {
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
static void xor_block(block* dst, const block* src) {
int i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] ^= src->v[i];
}
}
/*
* Function fills a new memory block and optionally XORs the old block over the new one.
* @next_block must be initialized.
@@ -57,13 +68,13 @@ static void fill_block(const block *prev_block, const block *ref_block,
block blockR, block_tmp;
unsigned i;
rxa2_copy_block(&blockR, ref_block);
rxa2_xor_block(&blockR, prev_block);
rxa2_copy_block(&block_tmp, &blockR);
copy_block(&blockR, ref_block);
xor_block(&blockR, prev_block);
copy_block(&block_tmp, &blockR);
/* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */
if (with_xor) {
/* Saving the next block contents for XOR over: */
rxa2_xor_block(&block_tmp, next_block);
xor_block(&block_tmp, next_block);
/* Now blockR = ref_block + prev_block and
block_tmp = ref_block + prev_block + next_block */
}
@@ -92,18 +103,11 @@ static void fill_block(const block *prev_block, const block *ref_block,
blockR.v[2 * i + 113]);
}
rxa2_copy_block(next_block, &block_tmp);
rxa2_xor_block(next_block, &blockR);
}
static void next_addresses(block *address_block, block *input_block,
const block *zero_block) {
input_block->v[6]++;
fill_block(zero_block, input_block, address_block, 0);
fill_block(zero_block, address_block, address_block, 0);
copy_block(next_block, &block_tmp);
xor_block(next_block, &blockR);
}
void rxa2_fill_segment(const argon2_instance_t *instance,
void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance,
argon2_position_t position) {
block *ref_block = NULL, *curr_block = NULL;
block address_block, input_block, zero_block;
@@ -111,38 +115,15 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
uint32_t prev_offset, curr_offset;
uint32_t starting_index;
uint32_t i;
int data_independent_addressing;
if (instance == NULL) {
return;
}
data_independent_addressing =
(instance->type == Argon2_i) ||
(instance->type == Argon2_id && (position.pass == 0) &&
(position.slice < ARGON2_SYNC_POINTS / 2));
if (data_independent_addressing) {
rxa2_init_block_value(&zero_block, 0);
rxa2_init_block_value(&input_block, 0);
input_block.v[0] = position.pass;
input_block.v[1] = position.lane;
input_block.v[2] = position.slice;
input_block.v[3] = instance->memory_blocks;
input_block.v[4] = instance->passes;
input_block.v[5] = instance->type;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
/* Don't forget to generate the first block of addresses: */
if (data_independent_addressing) {
next_addresses(&address_block, &input_block, &zero_block);
}
}
/* Offset of the current block */
@@ -167,15 +148,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
if (data_independent_addressing) {
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
next_addresses(&address_block, &input_block, &zero_block);
}
pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
}
else {
pseudo_rand = instance->memory[prev_offset].v[0];
}
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
@@ -189,7 +162,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
* lane.
*/
position.index = i;
ref_index = rxa2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */

@@ -0,0 +1,182 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
#if defined(_MSC_VER) //MSVC doesn't define __SSSE3__
#define __SSSE3__
#endif
void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_sse3() {
#if defined(__SSSE3__)
return &randomx_argon2_fill_segment_sse3;
#endif
return NULL;
}
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#include "argon2_core.h"
#include "blake2/blamka-round-sse3.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m128i* state, const block* ref_block,
block* next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
}
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i*)next_block->v + i, state[i]);
}
}
void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m128i state[ARGON2_OWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference blocks within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

@@ -0,0 +1,189 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr32(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
\
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr24(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr32(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
\
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr24(B1); \
} while((void)0, 0);
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr16(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr63(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr16(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr63(B1); \
} while((void)0, 0);
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while((void)0, 0);
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while((void)0, 0);
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
#endif /* BLAKE_ROUND_MKA_OPT_H */

@@ -0,0 +1,158 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#define r16 \
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
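/* Every use below passes a negative constant, i.e. _mm_roti_epi64(x, -c)
 * rotates each 64-bit lane right by c bits. The 24- and 16-bit cases are
 * byte-granular, so SSSE3's _mm_shuffle_epi8 handles them in one instruction;
 * the 32-bit case is a dword shuffle and the 63-bit case uses shift+add.
 * Scalar equivalent (illustrative helper, not used by this file): */
static FORCE_INLINE uint64_t rotr64_scalar(uint64_t w, unsigned c) {
    return (w >> c) | (w << (64 - c));
}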
static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
const __m128i z = _mm_mul_epu32(x, y);
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -32); \
D1 = _mm_roti_epi64(D1, -32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -24); \
B1 = _mm_roti_epi64(B1, -24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -16); \
D1 = _mm_roti_epi64(D1, -16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -63); \
B1 = _mm_roti_epi64(B1, -63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D1, D0, 8); \
t1 = _mm_alignr_epi8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D0, D1, 8); \
t1 = _mm_alignr_epi8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#endif /* BLAKE_ROUND_MKA_OPT_H */

@@ -92,7 +92,7 @@ namespace randomx {
context.flags = ARGON2_DEFAULT_FLAGS;
context.version = ARGON2_VERSION_NUMBER;
int inputsValid = rxa2_validate_inputs(&context);
int inputsValid = randomx_argon2_validate_inputs(&context);
assert(inputsValid == ARGON2_OK);
/* 2. Align memory size */
@@ -111,6 +111,7 @@ namespace randomx {
instance.threads = context.threads;
instance.type = Argon2_d;
instance.memory = (block*)cache->memory;
instance.impl = cache->argonImpl;
if (instance.threads > instance.lanes) {
instance.threads = instance.lanes;
@@ -119,9 +120,9 @@ namespace randomx {
/* 3. Initialization: Hashing inputs, allocating memory, filling first
* blocks
*/
rxa2_argon_initialize(&instance, &context);
randomx_argon2_initialize(&instance, &context);
rxa2_fill_memory_blocks(&instance);
randomx_argon2_fill_memory_blocks(&instance);
cache->reciprocalCache.clear();
randomx::Blake2Generator gen(key, keySize);

@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.hpp"
#include "superscalar_program.hpp"
#include "allocator.hpp"
#include "argon2.h"
/* Global scope for C binding */
struct randomx_dataset {
@@ -51,6 +52,7 @@ struct randomx_cache {
randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES];
std::vector<uint64_t> reciprocalCache;
std::string cacheKey;
randomx_argon2_impl* argonImpl;
bool isInitialized() {
return programs[0].getSize() != 0;
@@ -79,4 +81,21 @@ namespace randomx {
void initCacheCompile(randomx_cache*, const void*, size_t);
void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber);
void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) {
if ((flags & RANDOMX_FLAG_ARGON2) == 0) {
return &randomx_argon2_fill_segment_ref;
}
randomx_argon2_impl* impl = nullptr;
if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_SSE3) {
impl = randomx_argon2_impl_sse3();
}
if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_AVX2) {
impl = randomx_argon2_impl_avx2();
}
if (impl != nullptr) {
return impl;
}
throw std::runtime_error("Unsupported Argon2 implementation");
}
}

@@ -39,10 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
randomx_cache *randomx_alloc_cache(randomx_flags flags) {
randomx_cache *cache;
randomx_cache *cache = nullptr;
try {
cache = new randomx_cache();
cache->argonImpl = randomx::selectArgonImpl(flags);
switch (flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)) {
case RANDOMX_FLAG_DEFAULT:
cache->dealloc = &randomx::deallocCache<randomx::DefaultAllocator>;
@@ -103,7 +104,9 @@ extern "C" {
void randomx_release_cache(randomx_cache* cache) {
assert(cache != nullptr);
cache->dealloc(cache);
if (cache->memory != nullptr) {
cache->dealloc(cache);
}
delete cache;
}
@@ -114,7 +117,7 @@ extern "C" {
return nullptr;
}
randomx_dataset *dataset;
randomx_dataset *dataset = nullptr;
try {
dataset = new randomx_dataset();

@@ -44,7 +44,10 @@ typedef enum {
RANDOMX_FLAG_HARD_AES = 2,
RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_SECURE = 16
RANDOMX_FLAG_SECURE = 16,
RANDOMX_FLAG_ARGON2_SSE3 = 32,
RANDOMX_FLAG_ARGON2_AVX2 = 64,
RANDOMX_FLAG_ARGON2 = 96
} randomx_flags;
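
RANDOMX_FLAG_ARGON2 (96) is the bitwise OR of the two implementation flags (32 | 64), so it doubles as a mask for extracting the Argon2 selection from a combined flags value. A small illustration using the names declared above:

randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_JIT | RANDOMX_FLAG_ARGON2_AVX2);
randomx_flags argon2 = (randomx_flags)(flags & RANDOMX_FLAG_ARGON2);
/* argon2 == RANDOMX_FLAG_ARGON2_AVX2; setting both implementation bits
   at once matches neither flag and is rejected by selectArgonImpl */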
typedef struct randomx_dataset randomx_dataset;
@@ -62,10 +65,17 @@ extern "C" {
* RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
* RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes
* subsequent Dataset initialization faster
* Optionally, one of these two flags may be selected:
RANDOMX_FLAG_ARGON2_SSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set;
makes subsequent cache initialization faster
RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set;
makes subsequent cache initialization faster
*
* @return Pointer to an allocated randomx_cache structure.
* NULL is returned if memory allocation fails or if the RANDOMX_FLAG_JIT
* is set and JIT compilation is not supported on the current platform.
* Returns NULL if:
(1) memory allocation fails,
(2) RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform, or
(3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set
*/
RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags);
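
Because an invalid or unsupported Argon2 selection yields NULL rather than an abort, callers can probe for the optimized path and fall back. A minimal sketch (the fallback policy is the caller's choice, not part of the API):

randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_ARGON2_AVX2);
randomx_cache *cache = randomx_alloc_cache(flags);
if (cache == NULL) /* e.g. the library was built without AVX2 support */
    cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);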

@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "stopwatch.hpp"
#include "utility.hpp"
#include "../randomx.h"
#include "../dataset.hpp"
#include "../blake2/endian.h"
#include "../common.hpp"
#ifdef _WIN32
@@ -90,6 +91,8 @@ void printUsage(const char* executable) {
std::cout << " --init Q initialize dataset with Q threads (default: 1)" << std::endl;
std::cout << " --nonces N run N nonces (default: 1000)" << std::endl;
std::cout << " --seed S seed for cache initialization (default: 0)" << std::endl;
std::cout << " --sse3 use optimized Argon2 for SSSE3 CPUs" << std::endl;
std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl;
}
struct MemoryException : public std::exception {
@@ -127,7 +130,7 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
}
int main(int argc, char** argv) {
bool softAes, miningMode, verificationMode, help, largePages, jit, secure;
bool softAes, miningMode, verificationMode, help, largePages, jit, secure, sse3, avx2;
int noncesCount, threadCount, initThreadCount;
uint64_t threadAffinity;
int32_t seedValue;
@@ -148,6 +151,8 @@ int main(int argc, char** argv) {
readOption("--jit", argc, argv, jit);
readOption("--help", argc, argv, help);
readOption("--secure", argc, argv, secure);
readOption("--sse3", argc, argv, sse3);
readOption("--avx2", argc, argv, avx2);
store32(&seed, seedValue);
@@ -166,6 +171,16 @@ int main(int argc, char** argv) {
randomx_cache* cache;
randomx_flags flags = RANDOMX_FLAG_DEFAULT;
if (sse3) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_SSE3);
std::cout << " - Argon2 implementation: SSE3" << std::endl;
}
if (avx2) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_AVX2);
std::cout << " - Argon2 implementation: AVX2" << std::endl;
}
if (miningMode) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_FULL_MEM);
std::cout << " - full memory mode (2080 MiB)" << std::endl;
@@ -213,6 +228,7 @@ int main(int argc, char** argv) {
std::cout << " ..." << std::endl;
try {
randomx::selectArgonImpl(flags); //just to check if flags are valid
if (jit && !RANDOMX_HAVE_COMPILER) {
throw std::runtime_error("JIT compilation is not supported on this platform. Try without --jit");
}

@@ -997,8 +997,9 @@ int main() {
if (RANDOMX_HAVE_COMPILER) {
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
randomx_destroy_vm(vm);
vm = nullptr;
cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
initCache("test key 000");
vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr);
}
@@ -1013,6 +1014,35 @@ int main() {
runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e);
randomx_destroy_vm(vm);
vm = nullptr;
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_SSE3);
runTest("Cache initialization: SSSE3", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
initCache("test key 000");
uint64_t* cacheMemory = (uint64_t*)cache->memory;
assert(cacheMemory[0] == 0x191e0e1d23c02186);
assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1);
assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
});
if (cache != nullptr)
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX2);
runTest("Cache initialization: AVX2", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
initCache("test key 000");
uint64_t* cacheMemory = (uint64_t*)cache->memory;
assert(cacheMemory[0] == 0x191e0e1d23c02186);
assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1);
assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
});
if (cache != nullptr)
randomx_release_cache(cache);
std::cout << std::endl << "All tests PASSED" << std::endl;
if (skipped) {

@@ -114,6 +114,7 @@
<ConformanceMode>true</ConformanceMode>
<AssemblerOutput>AssemblyCode</AssemblerOutput>
<PreprocessorDefinitions>_MBCS;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@@ -131,8 +132,10 @@ SET ERRORLEVEL = 0</Command>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\src\allocator.cpp" />
<ClCompile Include="..\src\argon2_avx2.c" />
<ClCompile Include="..\src\argon2_core.c" />
<ClCompile Include="..\src\argon2_ref.c" />
<ClCompile Include="..\src\argon2_sse3.c" />
<ClCompile Include="..\src\assembly_generator_x86.cpp" />
<ClCompile Include="..\src\blake2_generator.cpp" />
<ClCompile Include="..\src\blake2\blake2b.c" />
@@ -163,7 +166,9 @@ SET ERRORLEVEL = 0</Command>
<ClInclude Include="..\src\assembly_generator_x86.hpp" />
<ClInclude Include="..\src\blake2\blake2-impl.h" />
<ClInclude Include="..\src\blake2\blake2.h" />
<ClInclude Include="..\src\blake2\blamka-round-avx2.h" />
<ClInclude Include="..\src\blake2\blamka-round-ref.h" />
<ClInclude Include="..\src\blake2\blamka-round-sse3.h" />
<ClInclude Include="..\src\blake2\endian.h" />
<ClInclude Include="..\src\blake2_generator.hpp" />
<ClInclude Include="..\src\bytecode_machine.hpp" />

@@ -81,6 +81,12 @@
<ClCompile Include="..\src\bytecode_machine.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\src\argon2_sse3.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\src\argon2_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\argon2.h">
@@ -185,6 +191,12 @@
<ClInclude Include="..\src\bytecode_machine.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\src\blake2\blamka-round-sse3.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\src\blake2\blamka-round-avx2.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<MASM Include="..\src\jit_compiler_x86_static.asm">
