Optimized Argon2 (SSSE3/AVX2)

pr-argon2
tevador 5 years ago
parent 298cc77095
commit 900a936816

@@ -31,6 +31,8 @@ cmake_minimum_required(VERSION 2.8.7)
set (randomx_sources
src/aes_hash.cpp
src/argon2_ref.c
src/argon2_sse3.c
src/argon2_avx2.c
src/bytecode_machine.cpp
src/dataset.cpp
src/soft_aes.cpp
@@ -103,6 +105,14 @@ if (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "
else()
# default build has hardware AES enabled (software AES can be selected at runtime)
add_flag("-maes")
check_c_compiler_flag(-mssse3 HAVE_SSSE3)
if(HAVE_SSSE3)
set_source_files_properties(src/argon2_sse3.c COMPILE_FLAGS -mssse3)
endif()
check_c_compiler_flag(-mavx2 HAVE_AVX2)
if(HAVE_AVX2)
set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2)
endif()
endif()
endif()

@@ -227,3 +227,35 @@ typedef enum Argon2_version {
ARGON2_VERSION_13 = 0x13,
ARGON2_VERSION_NUMBER = ARGON2_VERSION_13
} argon2_version;
//Argon2 instance - forward declaration
typedef struct Argon2_instance_t argon2_instance_t;
//Argon2 position - forward declaration
typedef struct Argon2_position_t argon2_position_t;
//Argon2 implementation function
typedef void randomx_argon2_impl(const argon2_instance_t* instance,
argon2_position_t position);
#if defined(__cplusplus)
extern "C" {
#endif
/*
* Function that fills the segment using previous segments also from other
* threads
* @param instance Pointer to the current instance
* @param position Current position
* @pre all block pointers must be valid
*/
void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl *randomx_argon2_impl_sse3();
randomx_argon2_impl *randomx_argon2_impl_avx2();
#if defined(__cplusplus)
}
#endif
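
The factory functions return a pointer to the optimized routine only when the translation unit was compiled with the matching instruction set, and NULL otherwise, so callers can probe for support at runtime. A minimal sketch of a dispatch chain built on top of this (illustrative only; the library itself selects the implementation from explicit flags, and instance/position are assumed to be set up elsewhere):

randomx_argon2_impl* impl = randomx_argon2_impl_avx2(); /* NULL unless built with AVX2 */
if (impl == NULL)
    impl = randomx_argon2_impl_sse3(); /* NULL unless built with SSSE3 */
if (impl == NULL)
    impl = &randomx_argon2_fill_segment_ref; /* portable fallback */
impl(instance, position); /* fill one segment */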

@@ -0,0 +1,174 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_avx2() {
#if defined(__AVX2__)
return &randomx_argon2_fill_segment_avx2;
#endif
return NULL;
}
#if defined(__AVX2__)
#include "argon2_core.h"
#include "blake2/blamka-round-avx2.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m256i* state, const block* ref_block,
block* next_block, int with_xor) {
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
block_XY[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i));
}
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_storeu_si256((__m256i*)next_block->v + i, state[i]);
}
}
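/* For reference, each BLAKE2_ROUND above is built from the BlaMka primitive,
 * which replaces BLAKE2b's modular addition with a + b + 2*aL*bL, where aL and
 * bL are the low 32 bits of the operands. This is what _mm256_mul_epu32 plus
 * the two additions compute per 64-bit lane; a scalar sketch (illustrative
 * helper, not used by this file): */
static uint64_t blamka_scalar(uint64_t x, uint64_t y) {
    uint64_t m = (uint64_t)(uint32_t)x * (uint32_t)y; /* low 32 x low 32 bits */
    return x + y + 2 * m;
}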
void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m256i state[ARGON2_HWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
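/* Note: state holds the previous block and is updated in place by fill_block
 * on every iteration, so fill_block never has to reload the previous block
 * from memory (the reference implementation re-reads it via prev_offset
 * instead). */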
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference blocks within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

@@ -70,18 +70,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
/***************Instance and Position constructors**********/
void rxa2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
void rxa2_copy_block(block *dst, const block *src) {
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
void rxa2_xor_block(block *dst, const block *src) {
int i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] ^= src->v[i];
}
}
static void load_block(block *dst, const void *input) {
unsigned i;
@@ -97,69 +85,7 @@ static void store_block(void *output, const block *src) {
}
}
/***************Memory functions*****************/
int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory,
size_t num, size_t size) {
size_t memory_size = num * size;
if (memory == NULL) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
/* 1. Check for multiplication overflow */
if (size != 0 && memory_size / size != num) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
/* 2. Try to allocate with appropriate allocator */
if (context->allocate_cbk) {
(context->allocate_cbk)(memory, memory_size);
}
else {
*memory = (uint8_t*)malloc(memory_size);
}
if (*memory == NULL) {
return ARGON2_MEMORY_ALLOCATION_ERROR;
}
return ARGON2_OK;
}
void rxa2_free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size) {
size_t memory_size = num * size;
rxa2_clear_internal_memory(memory, memory_size);
if (context->free_cbk) {
(context->free_cbk)(memory, memory_size);
}
else {
free(memory);
}
}
void NOT_OPTIMIZED rxa2_secure_wipe_memory(void *v, size_t n) {
#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER)
SecureZeroMemory(v, n);
#elif defined memset_s
memset_s(v, n, 0, n);
#elif defined(__OpenBSD__)
explicit_bzero(v, n);
#else
static void *(*const volatile memset_sec)(void *, int, size_t) = &memset;
memset_sec(v, 0, n);
#endif
}
/* Memory clear flag defaults to true. */
#define FLAG_clear_internal_memory 0
void rxa2_clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
rxa2_secure_wipe_memory(v, n);
}
}
uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane) {
/*
@@ -241,24 +167,22 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) {
for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
for (l = 0; l < instance->lanes; ++l) {
argon2_position_t position = { r, l, (uint8_t)s, 0 };
rxa2_fill_segment(instance, position);
//fill the segment using the selected implementation
instance->impl(instance, position);
}
}
#ifdef GENKAT
internal_kat(instance, r); /* Print all memory blocks */
#endif
}
return ARGON2_OK;
}
int rxa2_fill_memory_blocks(argon2_instance_t *instance) {
int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) {
if (instance == NULL || instance->lanes == 0) {
return ARGON2_INCORRECT_PARAMETER;
}
return fill_memory_blocks_st(instance);
}
int rxa2_validate_inputs(const argon2_context *context) {
int randomx_argon2_validate_inputs(const argon2_context *context) {
if (NULL == context) {
return ARGON2_INCORRECT_PARAMETER;
}
@@ -394,7 +318,6 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc
load_block(&instance->memory[l * instance->lane_length + 1],
blockhash_bytes);
}
rxa2_clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
}
void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) {
@@ -431,11 +354,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
if (context->pwd != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
context->pwdlen);
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
rxa2_secure_wipe_memory(context->pwd, context->pwdlen);
context->pwdlen = 0;
}
}
store32(&value, context->saltlen);
@@ -451,11 +369,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
if (context->secret != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->secret,
context->secretlen);
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
rxa2_secure_wipe_memory(context->secret, context->secretlen);
context->secretlen = 0;
}
}
store32(&value, context->adlen);
@@ -469,7 +382,7 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type
blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
}
int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) {
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) {
uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
int result = ARGON2_OK;
@@ -478,10 +391,7 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context)
instance->context_ptr = context;
/* 1. Memory allocation */
/*result = allocate_memory(context, (uint8_t **)&(instance->memory), instance->memory_blocks, sizeof(block));
if (result != ARGON2_OK) {
return result;
}*/
//RandomX takes care of memory allocation
/* 2. Initial hashing */
/* H_0 + 8 extra bytes to produce the first blocks */
@@ -489,15 +399,13 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context)
/* Hashing all inputs */
rxa2_initial_hash(blockhash, context, instance->type);
/* Zeroing 8 extra bytes */
rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
/*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
ARGON2_PREHASH_SEED_LENGTH -
ARGON2_PREHASH_DIGEST_LENGTH);
ARGON2_PREHASH_DIGEST_LENGTH);*/
/* 3. Creating first blocks, we always have at least two blocks in a slice
*/
rxa2_fill_first_blocks(blockhash, instance);
/* Clearing the hash */
rxa2_clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
return ARGON2_OK;
}

@@ -73,17 +73,6 @@ enum argon2_core_constants {
*/
typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block;
/*****************Functions that work with the block******************/
/* Initialize each byte of the block with @in */
void rxa2_init_block_value(block *b, uint8_t in);
/* Copy block @src to block @dst */
void rxa2_copy_block(block *dst, const block *src);
/* XOR @src onto @dst bytewise */
void rxa2_xor_block(block *dst, const block *src);
/*
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
* and derived values.
@@ -102,6 +91,7 @@ typedef struct Argon2_instance_t {
argon2_type type;
int print_internals; /* whether to print the memory blocks */
argon2_context *context_ptr; /* points back to original context */
randomx_argon2_impl *impl;
} argon2_instance_t;
/*
@@ -123,42 +113,6 @@ typedef struct Argon2_thread_data {
/*************************Argon2 core functions********************************/
/* Allocates memory to the given pointer, uses the appropriate allocator as
* specified in the context. Total allocated memory is num*size.
* @param context argon2_context which specifies the allocator
* @param memory pointer to the pointer to the memory
* @param size the size in bytes for each element to be allocated
* @param num the number of elements to be allocated
* @return ARGON2_OK if @memory is a valid pointer and memory is allocated
*/
int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory,
size_t num, size_t size);
/*
* Frees memory at the given pointer, uses the appropriate deallocator as
* specified in the context. Also cleans the memory using clear_internal_memory.
* @param context argon2_context which specifies the deallocator
* @param memory pointer to buffer to be freed
* @param size the size in bytes for each element to be deallocated
* @param num the number of elements to be deallocated
*/
void rxa2_free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size);
/* Function that securely cleans the memory. This ignores any flags set
* regarding clearing memory. Usually one just calls clear_internal_memory.
* @param mem Pointer to the memory
* @param s Memory size in bytes
*/
void rxa2_secure_wipe_memory(void *v, size_t n);
/* Function that securely clears the memory if FLAG_clear_internal_memory is
* set. If the flag isn't set, this function does nothing.
* @param mem Pointer to the memory
* @param s Memory size in bytes
*/
void rxa2_clear_internal_memory(void *v, size_t n);
/*
* Computes absolute position of reference block in the lane following a skewed
* distribution and using a pseudo-random value as input
@@ -169,7 +123,7 @@ void rxa2_clear_internal_memory(void *v, size_t n);
* If so we can reference the current segment
* @pre All pointers must be valid
*/
uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane);
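
The mapping follows the Argon2 specification: the 32-bit pseudo-random value is squared so that references are skewed toward more recent blocks. In outline (a sketch; reference_area_size and start_position are derived from position exactly as in the reference code):

uint64_t x = (uint64_t)pseudo_rand;
x = (x * x) >> 32; /* quadratic skew: small values of x are more likely */
uint64_t rel = reference_area_size - 1 - ((reference_area_size * x) >> 32);
uint32_t ref = (start_position + rel) % instance->lane_length; /* absolute index */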
@@ -180,28 +134,7 @@ uint32_t rxa2_index_alpha(const argon2_instance_t *instance,
* @return ARGON2_OK if everything is all right, otherwise one of error codes
* (all defined in <argon2.h>)
*/
int rxa2_validate_inputs(const argon2_context *context);
/*
* Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
* password and secret if needed
* @param context Pointer to the Argon2 internal structure containing memory
* pointer, and parameters for time and space requirements.
* @param blockhash Buffer for pre-hashing digest
* @param type Argon2 type
* @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
* allocated
*/
void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context,
argon2_type type);
/*
* Function creates first 2 blocks per lane
* @param instance Pointer to the current instance
* @param blockhash Pointer to the pre-hashing digest
* @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
*/
void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
int randomx_argon2_validate_inputs(const argon2_context *context);
/*
* Function allocates memory, hashes the inputs with Blake, and creates first
@@ -213,31 +146,7 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc
* @return Zero if successful, -1 if memory failed to allocate. @context->state
* will be modified if successful.
*/
int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context);
/*
* XORing the last block of each lane, hashing it, making the tag. Deallocates
* the memory.
* @param context Pointer to current Argon2 context (use only the out parameters
* from it)
* @param instance Pointer to current instance of Argon2
* @pre instance->state must point to necessary amount of memory
* @pre context->out must point to outlen bytes of memory
* @pre if context->free_cbk is not NULL, it should point to a function that
* deallocates memory
*/
void rxa2_finalize(const argon2_context *context, argon2_instance_t *instance);
/*
* Function that fills the segment using previous segments also from other
* threads
* @param context current context
* @param instance Pointer to the current instance
* @param position Current position
* @pre all block pointers must be valid
*/
void rxa2_fill_segment(const argon2_instance_t *instance,
argon2_position_t position);
int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context);
/*
* Function that fills the entire memory t_cost times based on the first two
@@ -245,7 +154,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
* @param instance Pointer to the current instance
* @return ARGON2_OK if successful, @context->state
*/
int rxa2_fill_memory_blocks(argon2_instance_t *instance);
int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance);
#if defined(__cplusplus)
}

@@ -43,6 +43,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void copy_block(block* dst, const block* src) {
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
static void xor_block(block* dst, const block* src) {
int i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] ^= src->v[i];
}
}
/*
* Function fills a new memory block and optionally XORs the old block over the new one.
* @next_block must be initialized.
@@ -57,13 +68,13 @@ static void fill_block(const block *prev_block, const block *ref_block,
block blockR, block_tmp;
unsigned i;
rxa2_copy_block(&blockR, ref_block);
rxa2_xor_block(&blockR, prev_block);
rxa2_copy_block(&block_tmp, &blockR);
copy_block(&blockR, ref_block);
xor_block(&blockR, prev_block);
copy_block(&block_tmp, &blockR);
/* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */
if (with_xor) {
/* Saving the next block contents for XOR over: */
rxa2_xor_block(&block_tmp, next_block);
xor_block(&block_tmp, next_block);
/* Now blockR = ref_block + prev_block and
block_tmp = ref_block + prev_block + next_block */
}
@@ -92,18 +103,11 @@ static void fill_block(const block *prev_block, const block *ref_block,
blockR.v[2 * i + 113]);
}
rxa2_copy_block(next_block, &block_tmp);
rxa2_xor_block(next_block, &blockR);
}
static void next_addresses(block *address_block, block *input_block,
const block *zero_block) {
input_block->v[6]++;
fill_block(zero_block, input_block, address_block, 0);
fill_block(zero_block, address_block, address_block, 0);
copy_block(next_block, &block_tmp);
xor_block(next_block, &blockR);
}
void rxa2_fill_segment(const argon2_instance_t *instance,
void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance,
argon2_position_t position) {
block *ref_block = NULL, *curr_block = NULL;
block address_block, input_block, zero_block;
@@ -111,38 +115,15 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
uint32_t prev_offset, curr_offset;
uint32_t starting_index;
uint32_t i;
int data_independent_addressing;
if (instance == NULL) {
return;
}
data_independent_addressing =
(instance->type == Argon2_i) ||
(instance->type == Argon2_id && (position.pass == 0) &&
(position.slice < ARGON2_SYNC_POINTS / 2));
if (data_independent_addressing) {
rxa2_init_block_value(&zero_block, 0);
rxa2_init_block_value(&input_block, 0);
input_block.v[0] = position.pass;
input_block.v[1] = position.lane;
input_block.v[2] = position.slice;
input_block.v[3] = instance->memory_blocks;
input_block.v[4] = instance->passes;
input_block.v[5] = instance->type;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
/* Don't forget to generate the first block of addresses: */
if (data_independent_addressing) {
next_addresses(&address_block, &input_block, &zero_block);
}
}
/* Offset of the current block */
@@ -167,15 +148,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
if (data_independent_addressing) {
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
next_addresses(&address_block, &input_block, &zero_block);
}
pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
}
else {
pseudo_rand = instance->memory[prev_offset].v[0];
}
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
@@ -189,7 +162,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance,
* lane.
*/
position.index = i;
ref_index = rxa2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */

@@ -0,0 +1,182 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "argon2.h"
#if defined(_MSC_VER) //MSVC doesn't define __SSSE3__
#define __SSSE3__
#endif
void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance,
argon2_position_t position);
randomx_argon2_impl* randomx_argon2_impl_sse3() {
#if defined(__SSSE3__)
return &randomx_argon2_fill_segment_sse3;
#endif
return NULL;
}
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#include "argon2_core.h"
#include "blake2/blamka-round-sse3.h"
#include "blake2/blake2-impl.h"
#include "blake2/blake2.h"
static void fill_block(__m128i* state, const block* ref_block,
block* next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)next_block->v + i));
}
}
else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i));
}
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i*)next_block->v + i, state[i]);
}
}
void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance,
argon2_position_t position) {
block* ref_block = NULL, * curr_block = NULL;
block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m128i state[ARGON2_OWORDS_IN_BLOCK];
if (instance == NULL) {
return;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
}
else {
/* Previous block */
prev_offset = curr_offset - 1;
}
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
pseudo_rand = instance->memory[prev_offset].v[0];
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference blocks within the
* lane.
*/
position.index = i;
ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
}
else {
if (0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
}
else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}
#endif

@@ -0,0 +1,189 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr32(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
\
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr24(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr32(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
\
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr24(B1); \
} while((void)0, 0);
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr16(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr63(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr16(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr63(B1); \
} while((void)0, 0);
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while((void)0, 0);
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while((void)0, 0);
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
#endif /* BLAKE_ROUND_MKA_OPT_H */

@@ -0,0 +1,158 @@
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
#define r16 \
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
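/* Every use below passes a negative constant, i.e. _mm_roti_epi64(x, -c)
 * rotates each 64-bit lane right by c bits. The 24- and 16-bit cases are
 * byte-granular, so SSSE3's _mm_shuffle_epi8 handles them in one instruction;
 * the 32-bit case is a dword shuffle and the 63-bit case uses shift+add.
 * Scalar equivalent (illustrative helper, not used by this file): */
static FORCE_INLINE uint64_t rotr64_scalar(uint64_t w, unsigned c) {
    return (w >> c) | (w << (64 - c));
}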
static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
const __m128i z = _mm_mul_epu32(x, y);
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -32); \
D1 = _mm_roti_epi64(D1, -32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -24); \
B1 = _mm_roti_epi64(B1, -24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -16); \
D1 = _mm_roti_epi64(D1, -16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -63); \
B1 = _mm_roti_epi64(B1, -63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D1, D0, 8); \
t1 = _mm_alignr_epi8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D0, D1, 8); \
t1 = _mm_alignr_epi8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#endif /* BLAKE_ROUND_MKA_OPT_H */

@@ -92,7 +92,7 @@ namespace randomx {
context.flags = ARGON2_DEFAULT_FLAGS;
context.version = ARGON2_VERSION_NUMBER;
int inputsValid = rxa2_validate_inputs(&context);
int inputsValid = randomx_argon2_validate_inputs(&context);
assert(inputsValid == ARGON2_OK);
/* 2. Align memory size */
@@ -111,6 +111,7 @@ namespace randomx {
instance.threads = context.threads;
instance.type = Argon2_d;
instance.memory = (block*)cache->memory;
instance.impl = cache->argonImpl;
if (instance.threads > instance.lanes) {
instance.threads = instance.lanes;
@@ -119,9 +120,9 @@ namespace randomx {
/* 3. Initialization: Hashing inputs, allocating memory, filling first
* blocks
*/
rxa2_argon_initialize(&instance, &context);
randomx_argon2_initialize(&instance, &context);
rxa2_fill_memory_blocks(&instance);
randomx_argon2_fill_memory_blocks(&instance);
cache->reciprocalCache.clear();
randomx::Blake2Generator gen(key, keySize);

@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.hpp"
#include "superscalar_program.hpp"
#include "allocator.hpp"
#include "argon2.h"
/* Global scope for C binding */
struct randomx_dataset {
@@ -51,6 +52,7 @@ struct randomx_cache {
randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES];
std::vector<uint64_t> reciprocalCache;
std::string cacheKey;
randomx_argon2_impl* argonImpl;
bool isInitialized() {
return programs[0].getSize() != 0;
@@ -79,4 +81,21 @@ namespace randomx {
void initCacheCompile(randomx_cache*, const void*, size_t);
void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber);
void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) {
if ((flags & RANDOMX_FLAG_ARGON2) == 0) {
return &randomx_argon2_fill_segment_ref;
}
randomx_argon2_impl* impl = nullptr;
if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_SSE3) {
impl = randomx_argon2_impl_sse3();
}
if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_AVX2) {
impl = randomx_argon2_impl_avx2();
}
if (impl != nullptr) {
return impl;
}
throw std::runtime_error("Unsupported Argon2 implementation");
}
}

@@ -39,10 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
randomx_cache *randomx_alloc_cache(randomx_flags flags) {
randomx_cache *cache;
randomx_cache *cache = nullptr;
try {
cache = new randomx_cache();
cache->argonImpl = randomx::selectArgonImpl(flags);
switch (flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)) {
case RANDOMX_FLAG_DEFAULT:
cache->dealloc = &randomx::deallocCache<randomx::DefaultAllocator>;
@@ -103,7 +104,9 @@ extern "C" {
void randomx_release_cache(randomx_cache* cache) {
assert(cache != nullptr);
cache->dealloc(cache);
if (cache->memory != nullptr) {
cache->dealloc(cache);
}
delete cache;
}
@@ -114,7 +117,7 @@ extern "C" {
return nullptr;
}
randomx_dataset *dataset;
randomx_dataset *dataset = nullptr;
try {
dataset = new randomx_dataset();

@@ -44,7 +44,10 @@ typedef enum {
RANDOMX_FLAG_HARD_AES = 2,
RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_SECURE = 16
RANDOMX_FLAG_SECURE = 16,
RANDOMX_FLAG_ARGON2_SSE3 = 32,
RANDOMX_FLAG_ARGON2_AVX2 = 64,
RANDOMX_FLAG_ARGON2 = 96
} randomx_flags;
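
RANDOMX_FLAG_ARGON2 (96) is the bitwise OR of the two implementation flags (32 | 64), so it doubles as a mask for extracting the Argon2 selection from a combined flags value. A small illustration using the names declared above:

randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_JIT | RANDOMX_FLAG_ARGON2_AVX2);
randomx_flags argon2 = (randomx_flags)(flags & RANDOMX_FLAG_ARGON2);
/* argon2 == RANDOMX_FLAG_ARGON2_AVX2; setting both implementation bits
   at once matches neither flag and is rejected by selectArgonImpl */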
typedef struct randomx_dataset randomx_dataset;
@@ -62,10 +65,17 @@ extern "C" {
* RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
* RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes
* subsequent Dataset initialization faster
* Optionally, one of these two flags may be selected:
RANDOMX_FLAG_ARGON2_SSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set;
makes subsequent cache initialization faster
RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set;
makes subsequent cache initialization faster
*
* @return Pointer to an allocated randomx_cache structure.
* NULL is returned if memory allocation fails or if the RANDOMX_FLAG_JIT
* is set and JIT compilation is not supported on the current platform.
* Returns NULL if:
(1) memory allocation fails,
(2) RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform, or
(3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set
*/
RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags);
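
Because an invalid or unsupported Argon2 selection yields NULL rather than an abort, callers can probe for the optimized path and fall back. A minimal sketch (the fallback policy is the caller's choice, not part of the API):

randomx_flags flags = (randomx_flags)(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_ARGON2_AVX2);
randomx_cache *cache = randomx_alloc_cache(flags);
if (cache == NULL) /* e.g. the library was built without AVX2 support */
    cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);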

@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "stopwatch.hpp"
#include "utility.hpp"
#include "../randomx.h"
#include "../dataset.hpp"
#include "../blake2/endian.h"
#include "../common.hpp"
#ifdef _WIN32
@@ -90,6 +91,8 @@ void printUsage(const char* executable) {
std::cout << " --init Q initialize dataset with Q threads (default: 1)" << std::endl;
std::cout << " --nonces N run N nonces (default: 1000)" << std::endl;
std::cout << " --seed S seed for cache initialization (default: 0)" << std::endl;
std::cout << " --sse3 use optimized Argon2 for SSSE3 CPUs" << std::endl;
std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl;
}
struct MemoryException : public std::exception {
@@ -127,7 +130,7 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
}
int main(int argc, char** argv) {
bool softAes, miningMode, verificationMode, help, largePages, jit, secure;
bool softAes, miningMode, verificationMode, help, largePages, jit, secure, sse3, avx2;
int noncesCount, threadCount, initThreadCount;
uint64_t threadAffinity;
int32_t seedValue;
@@ -148,6 +151,8 @@ int main(int argc, char** argv) {
readOption("--jit", argc, argv, jit);
readOption("--help", argc, argv, help);
readOption("--secure", argc, argv, secure);
readOption("--sse3", argc, argv, sse3);
readOption("--avx2", argc, argv, avx2);
store32(&seed, seedValue);
@@ -166,6 +171,16 @@ int main(int argc, char** argv) {
randomx_cache* cache;
randomx_flags flags = RANDOMX_FLAG_DEFAULT;
if (sse3) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_SSE3);
std::cout << " - Argon2 implementation: SSE3" << std::endl;
}
if (avx2) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_AVX2);
std::cout << " - Argon2 implementation: AVX2" << std::endl;
}
if (miningMode) {
flags = (randomx_flags)(flags | RANDOMX_FLAG_FULL_MEM);
std::cout << " - full memory mode (2080 MiB)" << std::endl;
@@ -213,6 +228,7 @@ int main(int argc, char** argv) {
std::cout << " ..." << std::endl;
try {
randomx::selectArgonImpl(flags); //just to check if flags are valid
if (jit && !RANDOMX_HAVE_COMPILER) {
throw std::runtime_error("JIT compilation is not supported on this platform. Try without --jit");
}

@@ -997,8 +997,9 @@ int main() {
if (RANDOMX_HAVE_COMPILER) {
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
randomx_destroy_vm(vm);
vm = nullptr;
cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
initCache("test key 000");
vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr);
}
@@ -1013,6 +1014,35 @@ int main() {
runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e);
randomx_destroy_vm(vm);
vm = nullptr;
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_SSE3);
runTest("Cache initialization: SSSE3", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
initCache("test key 000");
uint64_t* cacheMemory = (uint64_t*)cache->memory;
assert(cacheMemory[0] == 0x191e0e1d23c02186);
assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1);
assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
});
if (cache != nullptr)
randomx_release_cache(cache);
cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX2);
runTest("Cache initialization: AVX2", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
initCache("test key 000");
uint64_t* cacheMemory = (uint64_t*)cache->memory;
assert(cacheMemory[0] == 0x191e0e1d23c02186);
assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1);
assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
});
if (cache != nullptr)
randomx_release_cache(cache);
std::cout << std::endl << "All tests PASSED" << std::endl;
if (skipped) {

@@ -114,6 +114,7 @@
<ConformanceMode>true</ConformanceMode>
<AssemblerOutput>AssemblyCode</AssemblerOutput>
<PreprocessorDefinitions>_MBCS;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@@ -131,8 +132,10 @@ SET ERRORLEVEL = 0</Command>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\src\allocator.cpp" />
<ClCompile Include="..\src\argon2_avx2.c" />
<ClCompile Include="..\src\argon2_core.c" />
<ClCompile Include="..\src\argon2_ref.c" />
<ClCompile Include="..\src\argon2_sse3.c" />
<ClCompile Include="..\src\assembly_generator_x86.cpp" />
<ClCompile Include="..\src\blake2_generator.cpp" />
<ClCompile Include="..\src\blake2\blake2b.c" />
@@ -163,7 +166,9 @@ SET ERRORLEVEL = 0</Command>
<ClInclude Include="..\src\assembly_generator_x86.hpp" />
<ClInclude Include="..\src\blake2\blake2-impl.h" />
<ClInclude Include="..\src\blake2\blake2.h" />
<ClInclude Include="..\src\blake2\blamka-round-avx2.h" />
<ClInclude Include="..\src\blake2\blamka-round-ref.h" />
<ClInclude Include="..\src\blake2\blamka-round-sse3.h" />
<ClInclude Include="..\src\blake2\endian.h" />
<ClInclude Include="..\src\blake2_generator.hpp" />
<ClInclude Include="..\src\bytecode_machine.hpp" />

@@ -81,6 +81,12 @@
<ClCompile Include="..\src\bytecode_machine.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\src\argon2_sse3.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\src\argon2_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\argon2.h">
@@ -185,6 +191,12 @@
<ClInclude Include="..\src\bytecode_machine.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\src\blake2\blamka-round-sse3.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\src\blake2\blamka-round-avx2.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<MASM Include="..\src\jit_compiler_x86_static.asm">
