@@ -1,4 +1,4 @@
// Copyright (c) 2014-2020, The Monero Project
// Copyright (c) 2014-2022, The Monero Project
//
// All rights reserved.
//
@@ -40,6 +40,9 @@
#include "oaes_lib.h"
#include "variant2_int_sqrt.h"
#include "variant4_random_math.h"
#include "CryptonightR_JIT.h"
#include <errno.h>
#define MEMORY         (1 << 21) // 2MB scratchpad
#define ITER           (1 << 20)
@@ -48,9 +51,72 @@
#define INIT_SIZE_BLK   8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
#if defined(_MSC_VER)
#define THREADV __declspec(thread)
#else
#define THREADV __thread
#endif
extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
static void local_abort(const char *msg)
{
  fprintf(stderr, "%s\n", msg);
#ifdef NDEBUG
  _exit(1);
#else
  abort();
#endif
}
volatile int use_v4_jit_flag = -1;
static inline int use_v4_jit(void)
{
#if defined(__x86_64__)
  if (use_v4_jit_flag != -1)
    return use_v4_jit_flag;
  const char *env = getenv("MONERO_USE_CNV4_JIT");
  if (!env) {
    use_v4_jit_flag = 1;
  }
  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
    use_v4_jit_flag = 0;
  }
  else {
    use_v4_jit_flag = 1;
  }
  return use_v4_jit_flag;
#else
  return 0;
#endif
}
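/* The getenv() check above means the CryptonightR JIT is on by default for
 * x86_64 builds and is only disabled when MONERO_USE_CNV4_JIT is set to "0"
 * or "no" (any other value keeps it enabled); non-x86_64 builds always report
 * 0 here. For example, a JIT-less run could be started as
 *   MONERO_USE_CNV4_JIT=0 ./monerod
 * (illustrative invocation only). The result is cached in use_v4_jit_flag, so
 * the environment is consulted at most once per process. */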
#if defined(__x86_64__) || defined(__aarch64__)
static inline int force_software_aes(void)
{
  static int use = -1;
  if (use != -1)
    return use;
  const char *env = getenv("MONERO_USE_SOFTWARE_AES");
  if (!env) {
    use = 0;
  }
  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
    use = 0;
  }
  else {
    use = 1;
  }
  return use;
}
#endif
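/* MONERO_USE_SOFTWARE_AES works the other way around: unset, "0" or "no"
 * keeps the hardware AES path, anything else forces the software fallback.
 * The answer is cached in a function-local static, so this too is a
 * once-per-process lookup. */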
#define VARIANT1_1(p) \
  do if (variant == 1) \
  { \
@@ -117,48 +183,74 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
#define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \
  do if (variant >= 2) \
  { \
    const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
    __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
    const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
    const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
    if (variant >= 4) \
    { \
      chunk1 = _mm_xor_si128(chunk1, chunk2); \
      _c = _mm_xor_si128(_c, chunk3); \
      _c = _mm_xor_si128(_c, chunk1); \
    } \
  } while (0)
#define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \
  do if (variant >= 2) \
  { \
    const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
    uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
    const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \
    const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
    if (variant >= 4) \
    { \
      chunk1 = veorq_u64(chunk1, chunk2); \
      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \
      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \
    } \
  } while (0)
#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \
#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \
  do if (variant >= 2) \
  { \
    uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \
    uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \
    uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \
    \
    const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \
    uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \
    const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \
    const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \
    \
    uint64_t b1[2]; \
    memcpy_swap64le(b1, b + 16, 2); \
    chunk1[0] = SWAP64LE(SWAP64LE(chunk3[0]) + b1[0]); \
    chunk1[1] = SWAP64LE(SWAP64LE(chunk3[1]) + b1[1]); \
    chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \
    chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \
    \
    uint64_t a0[2]; \
    memcpy_swap64le(a0, a, 2); \
    chunk3[0] = SWAP64LE(SWAP64LE(chunk2[0]) + a0[0]); \
    chunk3[1] = SWAP64LE(SWAP64LE(chunk2[1]) + a0[1]); \
    memcpy_swap64le(a0, a_, 2); \
    chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \
    chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \
    \
    uint64_t b0[2]; \
    memcpy_swap64le(b0, b, 2); \
    chunk2[0] = SWAP64LE(SWAP64LE(chunk1_old[0]) + b0[0]); \
    chunk2[1] = SWAP64LE(SWAP64LE(chunk1_old[1]) + b0[1]); \
    chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \
    chunk2[1] = SWAP64LE(chunk1_old[1] + b0[1]); \
    if (variant >= 4) \
    { \
      uint64_t out_copy[2]; \
      memcpy_swap64le(out_copy, out, 2); \
      chunk1_old[0] ^= chunk2_old[0]; \
      chunk1_old[1] ^= chunk2_old[1]; \
      out_copy[0] ^= chunk3_old[0]; \
      out_copy[1] ^= chunk3_old[1]; \
      out_copy[0] ^= chunk1_old[0]; \
      out_copy[1] ^= chunk1_old[1]; \
      memcpy_swap64le(out, out_copy, 2); \
    } \
  } while (0)
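/* All three shuffle-add flavours above perform the same update on the three
 * other 16-byte words that share the 64-byte group with index j (per 64-bit
 * half, in little-endian order):
 *
 *   new[j^0x10] = old[j^0x30] + b1
 *   new[j^0x20] = old[j^0x10] + b
 *   new[j^0x30] = old[j^0x20] + a
 *
 * and, from variant 4 (CryptonightR) on, the AES output is additionally
 * whitened with the previous contents:
 *
 *   out ^= old[j^0x10] ^ old[j^0x20] ^ old[j^0x30]
 *
 * Only the portable version spells out the byte swaps; the SSE2/NEON versions
 * work directly on native little-endian lanes. */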
#define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \
@@ -201,18 +293,18 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
#endif
#define VARIANT2_2_PORTABLE() \
  if (variant >= 2) { \
  if (variant == 2 || variant == 3) { \
    xor_blocks(long_state + (j ^ 0x10), d); \
    xor_blocks(d, long_state + (j ^ 0x20)); \
  }
#define VARIANT2_2() \
  do if (variant >= 2) \
  do if (variant == 2 || variant == 3) \
  { \
    *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
    *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
    hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \
    lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \
    *U64(local_hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
    *(U64(local_hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
    hi ^= SWAP64LE(*U64(local_hp_state + (j ^ 0x20))); \
    lo ^= SWAP64LE(*(U64(local_hp_state + (j ^ 0x20)) + 1)); \
  } while (0)
#define V4_REG_LOAD(dst, src) \
@@ -225,34 +317,56 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
  } while (0)
#define VARIANT4_RANDOM_MATH_INIT() \
  v4_reg r[8]; \
  struct V4_Instruction code[TOTAL_LATENCY * ALU_COUNT + 1]; \
  v4_reg r[9]; \
  struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \
  int jit = use_v4_jit(); \
  do if (variant >= 4) \
  { \
    for (int i = 0; i < 4; ++i) \
      V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \
    v4_random_math_init(code, height); \
    if (jit) \
    { \
      int ret = v4_generate_JIT_code(code, hp_jitfunc, 4096); \
      if (ret < 0) \
        local_abort("Error generating CryptonightR code"); \
    } \
  } while (0)
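/* Registers r[0..3] are seeded from the Keccak state starting at 64-bit word
 * 12, and v4_random_math_init() derives the per-height random program; when
 * the JIT is in use the same program is also compiled into the 4096-byte
 * executable hp_jitfunc buffer, and a code-generation failure aborts instead
 * of silently hashing with the wrong program. */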
#define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \
  do if (variant >= 4) \
  { \
    uint64_t t; \
    memcpy(&t, b, sizeof(uint64_t)); \
    uint64_t t[2]; \
    memcpy(t, b, sizeof(uint64_t)); \
    \
    if (sizeof(v4_reg) == sizeof(uint32_t)) \
      t ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
      t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
    else \
      t ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
      t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
    \
    memcpy(b, &t, sizeof(uint64_t)); \
    memcpy(b, t, sizeof(uint64_t)); \
    \
    V4_REG_LOAD(r + 4, a); \
    V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \
    V4_REG_LOAD(r + 6, _b); \
    V4_REG_LOAD(r + 7, _b1); \
    V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \
    \
    if (jit) \
      (*hp_jitfunc)(r); \
    else \
      v4_random_math(code, r); \
    \
    memcpy(t, a, sizeof(uint64_t) * 2); \
    \
    if (sizeof(v4_reg) == sizeof(uint32_t)) { \
      t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \
      t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \
    } else { \
      t[0] ^= SWAP64LE(r[2] ^ r[3]); \
      t[1] ^= SWAP64LE(r[0] ^ r[1]); \
    } \
    memcpy(a, t, sizeof(uint64_t) * 2); \
  } while (0)
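/* In the revised macro the low half of b is first perturbed with r[0..3],
 * then r[4..5] are loaded from a, r[6] from _b and r[7..8] from _b1, the
 * random program is executed (JIT or interpreter), and finally r[0..3] are
 * combined and XORed into both 64-bit halves of a. The widening of t to two
 * words lets the result feed back into the full 128-bit a block, and the
 * extra r[8] slot gives the random program one more input derived from _b1. */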
@@ -318,7 +432,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
#define pre_aes() \
  j = state_index(a); \
  _c = _mm_load_si128(R128(&hp_state[j])); \
  _c = _mm_load_si128(R128(&local_hp_state[j])); \
  _a = _mm_load_si128(R128(a)); \
/*
@@ -331,32 +445,26 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
 * This code is based upon an optimized implementation by dga.
 */
#define post_aes() \
  VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
  VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
  _mm_store_si128(R128(c), _c); \
  _mm_store_si128(R128(&hp_state[j]), _mm_xor_si128(_b, _c)); \
  VARIANT1_1(&hp_state[j]); \
  _mm_store_si128(R128(&local_hp_state[j]), _mm_xor_si128(_b, _c)); \
  VARIANT1_1(&local_hp_state[j]); \
  j = state_index(c); \
  p = U64(&hp_state[j]); \
  p = U64(&local_hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  VARIANT2_INTEGER_MATH_SSE2(b, c); \
  VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
  __mul(); \
  VARIANT2_2(); \
  VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
  VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
  a[0] += hi; a[1] += lo; \
  p = U64(&hp_state[j]); \
  p = U64(&local_hp_state[j]); \
  p[0] = a[0]; p[1] = a[1]; \
  a[0] ^= b[0]; a[1] ^= b[1]; \
  VARIANT1_2(p + 1); \
  _b1 = _b; \
  _b = _c; \
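/* Taken together, pre_aes()/post_aes() form one iteration of the memory-hard
 * loop: load the scratchpad block addressed by a, run one AES round on it
 * keyed by a (done between the two macros), shuffle-add the neighbouring
 * words, write b ^ c back, then at the block addressed by c do the
 * variant-specific integer/random math, a 64x64 -> 128 multiply, fold hi/lo
 * into a, store it, and rotate the b registers for the next round. */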
#if defined(_MSC_VER)
#define THREADV __declspec(thread)
#else
#define THREADV __thread
#endif
#pragma pack(push, 1)
union cn_slow_hash_state
{
@@ -371,6 +479,9 @@ union cn_slow_hash_state
THREADV uint8_t *hp_state = NULL;
THREADV int hp_allocated = 0;
THREADV v4_random_math_JIT_func hp_jitfunc = NULL;
THREADV uint8_t *hp_jitfunc_memory = NULL;
THREADV int hp_jitfunc_allocated = 0;
#if defined(_MSC_VER)
#define cpuid(info,x) __cpuidex(info,x,0)
@@ -409,25 +520,6 @@ STATIC INLINE void xor64(uint64_t *a, const uint64_t b)
 * @return true if the CPU supports AES, false otherwise
 */
STATIC INLINE int force_software_aes(void)
{
  static int use = -1;
  if (use != -1)
    return use;
  const char *env = getenv("MONERO_USE_SOFTWARE_AES");
  if (!env) {
    use = 0;
  }
  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
    use = 0;
  }
  else {
    use = 1;
  }
  return use;
}
STATIC INLINE int check_aes_hw(void)
{
@@ -666,10 +758,10 @@ void cn_slow_hash_allocate_state(void)
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
    defined(__DragonFly__) || defined(__NetBSD__)
    hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANON, 0, 0);
                    MAP_PRIVATE | MAP_ANON, -1, 0);
#else
    hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0);
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
#endif
    if (hp_state == MAP_FAILED)
      hp_state = NULL;
@@ -680,6 +772,35 @@ void cn_slow_hash_allocate_state(void)
    hp_allocated = 0;
    hp_state = (uint8_t *) malloc(MEMORY);
  }
#if defined(_MSC_VER) || defined(__MINGW32__)
  hp_jitfunc_memory = (uint8_t *) VirtualAlloc(hp_jitfunc_memory, 4096 + 4095,
      MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
#else
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
    defined(__DragonFly__) || defined(__NetBSD__)
#ifdef __NetBSD__
#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC)
#else
#define RESERVED_FLAGS 0
#endif
  hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | RESERVED_FLAGS,
                    MAP_PRIVATE | MAP_ANON, -1, 0);
#else
  hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
#endif
  if (hp_jitfunc_memory == MAP_FAILED)
    hp_jitfunc_memory = NULL;
#endif
  hp_jitfunc_allocated = 1;
  if (hp_jitfunc_memory == NULL)
  {
    hp_jitfunc_allocated = 0;
    hp_jitfunc_memory = malloc(4096 + 4095);
  }
  hp_jitfunc = (v4_random_math_JIT_func)((size_t)(hp_jitfunc_memory + 4095) & ~4095);
}
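/* The JIT buffer bookkeeping mirrors hp_state: try an executable mapping
 * first (VirtualAlloc on Windows, mmap elsewhere; NetBSD only gets
 * PROT_MPROTECT(PROT_EXEC) so the region can be flipped to executable later),
 * fall back to plain malloc() if that fails, and always over-allocate by
 * roughly a page so that (hp_jitfunc_memory + 4095) & ~4095 yields a
 * page-aligned 4096-byte window for the generated code. */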
/**
@@ -702,8 +823,22 @@ void cn_slow_hash_free_state(void)
#endif
  }
  if (!hp_jitfunc_allocated)
    free(hp_jitfunc_memory);
  else
  {
#if defined(_MSC_VER) || defined(__MINGW32__)
    VirtualFree(hp_jitfunc_memory, 0, MEM_RELEASE);
#else
    munmap(hp_jitfunc_memory, 4096 + 4095);
#endif
  }
  hp_state = NULL;
  hp_allocated = 0;
  hp_jitfunc = NULL;
  hp_jitfunc_memory = NULL;
  hp_jitfunc_allocated = 0;
}
/**
@@ -787,7 +922,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }
  else
@@ -799,7 +934,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
      for (j = 0; j < INIT_SIZE_BLK; j++)
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }
@@ -847,7 +982,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      // add the xor to the pseudo round
      aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
      aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
    }
  }
  else
@@ -857,7 +992,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    {
      for (j = 0; j < INIT_SIZE_BLK; j++)
      {
        xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
        xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
      }
    }
@@ -877,6 +1012,44 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
}
#elif !defined NO_AES && (defined(__arm__) || defined(__aarch64__))
#ifdef __aarch64__
#include <sys/mman.h>
THREADV uint8_t *hp_state = NULL;
THREADV int hp_malloced = 0;
void cn_slow_hash_allocate_state(void)
{
  if (hp_state != NULL)
    return;
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0
#endif
  hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0);
  if (hp_state == MAP_FAILED)
    hp_state = NULL;
  if (hp_state == NULL)
  {
    hp_malloced = 1;
    hp_state = (uint8_t *) malloc(MEMORY);
  }
}
void cn_slow_hash_free_state(void)
{
  if (hp_state == NULL)
    return;
  if (hp_malloced)
    free(hp_state);
  else
    munmap(hp_state, MEMORY);
  hp_state = NULL;
  hp_malloced = 0;
}
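/* The aarch64 scratchpad allocator first asks for a MAP_HUGETLB anonymous
 * mapping (the flag is defined to 0 on platforms that lack it, so the call
 * degrades to a normal mapping) and only falls back to malloc() if mmap
 * fails; hp_malloced records which path was taken so the matching
 * munmap()/free() is used on teardown. */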
#else
void cn_slow_hash_allocate_state(void)
{
  // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
@@ -888,6 +1061,7 @@ void cn_slow_hash_free_state(void)
  // As above
  return;
}
#endif
#if defined(__GNUC__)
#define RDATA_ALIGN16 __attribute__ ((aligned(16)))
@@ -901,6 +1075,8 @@ void cn_slow_hash_free_state(void)
#define U64(x) ((uint64_t *) (x))
#define hp_jitfunc ((v4_random_math_JIT_func)NULL)
STATIC INLINE void xor64(uint64_t *a, const uint64_t b)
{
  *a ^= b;
@@ -926,6 +1102,23 @@ union cn_slow_hash_state
 * and moving between vector and regular registers stalls the pipeline.
 */
#include <arm_neon.h>
#ifndef __APPLE__
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
STATIC INLINE int check_aes_hw(void)
{
#ifdef __APPLE__
  return 1;
#else
  static int supported = -1;
  if (supported < 0)
    supported = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
  return supported;
#endif
}
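/* Runtime AES detection on this path: outside __APPLE__ the ARMv8 crypto
 * extension is probed once via getauxval(AT_HWCAP) & HWCAP_AES and the answer
 * is cached in a static; Apple arm64 CPUs are assumed to always carry the
 * extension, so that branch simply returns 1. */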
#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)
@@ -935,24 +1128,24 @@ union cn_slow_hash_state
#define pre_aes() \
  j = state_index(a); \
  _c = vld1q_u8(&hp_state[j]); \
  _c = vld1q_u8(&local_hp_state[j]); \
  _a = vld1q_u8((const uint8_t *)a); \
#define post_aes() \
  VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
  VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
  vst1q_u8((uint8_t *)c, _c); \
  vst1q_u8(&hp_state[j], veorq_u8(_b, _c)); \
  VARIANT1_1(&hp_state[j]); \
  vst1q_u8(&local_hp_state[j], veorq_u8(_b, _c)); \
  VARIANT1_1(&local_hp_state[j]); \
  j = state_index(c); \
  p = U64(&hp_state[j]); \
  p = U64(&local_hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
  VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
  __mul(); \
  VARIANT2_2(); \
  VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
  VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
  a[0] += hi; a[1] += lo; \
  p = U64(&hp_state[j]); \
  p = U64(&local_hp_state[j]); \
  p[0] = a[0]; p[1] = a[1]; \
  a[0] ^= b[0]; a[1] ^= b[1]; \
  VARIANT1_2(p + 1); \
@@ -1022,7 +1215,6 @@ __asm__(
STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks)
{
  const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0};
  uint8x16_t tmp;
  int i;
  for (i = 0; i < nblocks; i++)
@@ -1057,7 +1249,6 @@ STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const u
{
  const uint8x16_t *k = (const uint8x16_t *)expandedKey;
  const uint8x16_t *x = (const uint8x16_t *)xor;
  uint8x16_t tmp;
  int i;
  for (i = 0; i < nblocks; i++)
@@ -1110,16 +1301,17 @@ STATIC INLINE void aligned_free(void *ptr)
}
#endif /* FORCE_USE_HEAP */
STATIC INLINE void xor_blocks(uint8_t *a, const uint8_t *b)
{
  U64(a)[0] ^= U64(b)[0];
  U64(a)[1] ^= U64(b)[1];
}
void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
{
  RDATA_ALIGN16 uint8_t expandedKey[240];
#ifndef FORCE_USE_HEAP
  RDATA_ALIGN16 uint8_t hp_state[MEMORY];
#else
  uint8_t *hp_state = (uint8_t *)aligned_malloc(MEMORY, 16);
#endif
  uint8_t *local_hp_state;
  uint8_t text[INIT_SIZE_BYTE];
  RDATA_ALIGN16 uint64_t a[2];
  RDATA_ALIGN16 uint64_t b[4];
@@ -1130,12 +1322,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  size_t i, j;
  uint64_t *p = NULL;
  oaes_ctx *aes_ctx = NULL;
  int useAes = !force_software_aes() && check_aes_hw();
  static void (*const extra_hashes[4])(const void *, size_t, char *) =
  {
    hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
  };
  // this isn't supposed to happen, but guard against it for now.
  if (hp_state == NULL)
    cn_slow_hash_allocate_state();
  // locals to avoid constant TLS dereferencing
  local_hp_state = hp_state;
  /* CryptoNight Step 1:  Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
  if (prehashed) {
@@ -1153,11 +1355,26 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 * the 2MB large random access buffer.
 */
  if (useAes)
  {
    aes_expand_key(state.hs.b, expandedKey);
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }
  else
  {
    aes_ctx = (oaes_ctx *) oaes_alloc();
    oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      for (j = 0; j < INIT_SIZE_BLK; j++)
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }
  U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
@@ -1173,6 +1390,8 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  _b = vld1q_u8((const uint8_t *)b);
  _b1 = vld1q_u8(((const uint8_t *)b) + AES_BLOCK_SIZE);
  if (useAes)
  {
    for (i = 0; i < ITER / 2; i++)
    {
      pre_aes();
@@ -1181,6 +1400,17 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
      _c = veorq_u8(_c, _a);
      post_aes();
    }
  }
  else
  {
    for (i = 0; i < ITER / 2; i++)
    {
      pre_aes();
      aesb_single_round((uint8_t *) &_c, (uint8_t *) &_c, (uint8_t *) &_a);
      post_aes();
    }
  }
  /* CryptoNight Step 4:  Sequentially pass through the mixing buffer and use 10 rounds
   * of AES encryption to mix the random data back into the 'text' buffer.  'text'
@@ -1188,11 +1418,27 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  memcpy(text, state.init, INIT_SIZE_BYTE);
  if (useAes)
  {
    aes_expand_key(&state.hs.b[32], expandedKey);
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      // add the xor to the pseudo round
      aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
      aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
    }
  }
  else
  {
    oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
    for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      for (j = 0; j < INIT_SIZE_BLK; j++)
      {
        xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
      }
    }
    oaes_free((OAES_CTX **) &aes_ctx);
  }
  /* CryptoNight Step 5:  Apply Keccak to the state again, and then
@@ -1205,10 +1451,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  memcpy(state.init, text, INIT_SIZE_BYTE);
  hash_permutation(&state.hs);
  extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
#ifdef FORCE_USE_HEAP
  aligned_free(hp_state);
#endif
}
#else /* aarch64 && crypto */
@@ -1330,6 +1572,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
{
  uint8_t text[INIT_SIZE_BYTE];
  uint8_t a[AES_BLOCK_SIZE];
  uint8_t a1[AES_BLOCK_SIZE];
  uint8_t b[AES_BLOCK_SIZE * 2];
  uint8_t c[AES_BLOCK_SIZE];
  uint8_t c1[AES_BLOCK_SIZE];
@@ -1389,10 +1632,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    // Iteration 1
    j = state_index(a);
    p = &long_state[j];
    aesb_single_round(p, p, a);
    copy_block(c1, p);
    aesb_single_round(p, c1, a);
    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    copy_block(p, c1);
    xor_blocks(p, b);
    VARIANT1_1(p);
@@ -1401,14 +1644,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    p = &long_state[j];
    copy_block(c, p);
    copy_block(a1, a);
    VARIANT2_PORTABLE_INTEGER_MATH(c, c1);
    VARIANT4_RANDOM_MATH(a, c, r, b, b + AES_BLOCK_SIZE);
    VARIANT4_RANDOM_MATH(a1, c, r, b, b + AES_BLOCK_SIZE);
    mul(c1, c, d);
    VARIANT2_2_PORTABLE();
    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
    sum_half_blocks(a, d);
    swap_blocks(a, c);
    xor_blocks(a, c);
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    sum_half_blocks(a1, d);
    swap_blocks(a1, c);
    xor_blocks(a1, c);
    VARIANT1_2(U64(c) + 1);
    copy_block(p, c);
@@ -1416,6 +1660,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
      copy_block(b + AES_BLOCK_SIZE, b);
    }
    copy_block(b, c1);
    copy_block(a, a1);
  }
  memcpy(text, state.init, INIT_SIZE_BYTE);
@@ -1443,7 +1688,9 @@
#else
// Portable implementation as a fallback
void slow_hash_allocate_state(void)
#define hp_jitfunc ((v4_random_math_JIT_func)NULL)
void cn_slow_hash_allocate_state(void)
{
  // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
  return;
}
@@ -1536,6 +1783,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  union cn_slow_hash_state state;
  uint8_t text[INIT_SIZE_BYTE];
  uint8_t a[AES_BLOCK_SIZE];
  uint8_t a1[AES_BLOCK_SIZE];
  uint8_t b[AES_BLOCK_SIZE * 2];
  uint8_t c1[AES_BLOCK_SIZE];
  uint8_t c2[AES_BLOCK_SIZE];
@@ -1579,7 +1827,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c1, &long_state[j]);
    aesb_single_round(c1, c1, a);
    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    copy_block(&long_state[j], c1);
    xor_blocks(&long_state[j], b);
    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
    /* Iteration 2 */
    j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c2, &long_state[j]);
    copy_block(a1, a);
    VARIANT2_PORTABLE_INTEGER_MATH(c2, c1);
    VARIANT4_RANDOM_MATH(a, c2, r, b, b + AES_BLOCK_SIZE);
    VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE);
    mul(c1, c2, d);
    VARIANT2_2_PORTABLE();
    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
    swap_blocks(a, c1);
    sum_half_blocks(c1, d);
    swap_blocks(c1, c2);
    xor_blocks(c1, c2);
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    sum_half_blocks(a1, d);
    swap_blocks(a1, c2);
    xor_blocks(a1, c2);
    VARIANT1_2(c2 + 8);
    copy_block(&long_state[j], c2);
    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
    if (variant >= 2) {
      copy_block(b + AES_BLOCK_SIZE, b);
    }
    copy_block(b, a);
    copy_block(a, c1);
    copy_block(b, c1);
    copy_block(a, a1);
  }
  memcpy(text, state.init, INIT_SIZE_BYTE);