From 3c8c7ee097b4989407ea1acf2a458e8f1608ff07 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sat, 22 May 2021 14:54:50 +0300 Subject: [PATCH] Optimized dataset read (#211) * Optimized dataset read There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar, out-of-order CPUs can start executing the read earlier. Results: https://i.imgur.com/Bpeq9mx.png ~1% speedup on modern Intel/AMD CPUs. * ARMv8: optimized dataset read Break the dependency on readReg2 and readReg3. * Fixed light mode hashing --- src/asm/program_prologue_linux.inc | 1 + src/asm/program_prologue_win64.inc | 1 + src/asm/program_read_dataset.inc | 23 ++++++++++---------- src/asm/program_read_dataset_sshash_init.inc | 8 +++---- src/jit_compiler_a64_static.S | 5 +++-- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index ffde152..033584a 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -15,6 +15,7 @@ mov rsi, rdx ;# uint8_t* scratchpad mov rax, rbp + ror rbp, 32 ;# zero integer registers xor r8, r8 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 590a98d..10f21d3 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -28,6 +28,7 @@ mov rbx, r9 ;# loop counter mov rax, rbp + ror rbp, 32 ;# zero integer registers xor r8, r8 diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc index b81d0c3..9c61092 100644 --- a/src/asm/program_read_dataset.inc +++ b/src/asm/program_read_dataset.inc @@ -1,17 +1,16 @@ + mov ecx, ebp ;# ecx = ma + and ecx, RANDOMX_DATASET_BASE_MASK + xor r8, qword ptr [rdi+rcx] + ror rbp, 32 ;# swap "ma" and "mx" xor rbp, rax ;# modify
"mx" mov edx, ebp ;# edx = mx and edx, RANDOMX_DATASET_BASE_MASK prefetchnta byte ptr [rdi+rdx] - ror rbp, 32 ;# swap "ma" and "mx" - mov edx, ebp ;# edx = ma - and edx, RANDOMX_DATASET_BASE_MASK - lea rcx, [rdi+rdx] ;# dataset cache line - xor r8, qword ptr [rcx+0] - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] + xor r9, qword ptr [rdi+rcx+8] + xor r10, qword ptr [rdi+rcx+16] + xor r11, qword ptr [rdi+rcx+24] + xor r12, qword ptr [rdi+rcx+32] + xor r13, qword ptr [rdi+rcx+40] + xor r14, qword ptr [rdi+rcx+48] + xor r15, qword ptr [rdi+rcx+56] \ No newline at end of file diff --git a/src/asm/program_read_dataset_sshash_init.inc b/src/asm/program_read_dataset_sshash_init.inc index 6fe9525..9491f3d 100644 --- a/src/asm/program_read_dataset_sshash_init.inc +++ b/src/asm/program_read_dataset_sshash_init.inc @@ -8,10 +8,10 @@ mov qword ptr [rsp+16], r13 mov qword ptr [rsp+8], r14 mov qword ptr [rsp+0], r15 - xor rbp, rax ;# modify "mx" ror rbp, 32 ;# swap "ma" and "mx" - mov ebx, ebp ;# ecx = ma - and ebx, RANDOMX_DATASET_BASE_MASK - shr ebx, 6 ;# ebx = Dataset block number + xor rbp, rax ;# modify "mx" + mov rbx, rbp ;# ebx = ma + shr rbx, 38 + and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number ;# add ebx, datasetOffset / 64 ;# call 32768 \ No newline at end of file diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 598eca2..7fe6599 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0 literal_v15: .fill 2,8,0 DECL(randomx_program_aarch64_vm_instructions_end): + # Calculate dataset pointer for dataset read + # Do it here to break false dependency from readReg2 and readReg3 (see next line) + lsr x10, x9, 32 # mx ^= r[readReg2] ^ r[readReg3]; eor x9, x9, x18 @@ -324,8 +327,6 @@ 
DECL(randomx_program_aarch64_cacheline_align_mask1): # mx <-> ma ror x9, x9, 32 - # Calculate dataset pointer for dataset read - mov w10, w9 DECL(randomx_program_aarch64_cacheline_align_mask2): # Actual mask will be inserted by JIT compiler and x10, x10, 1