initBlock asm version (disabled)

feature/branches
tevador 5 years ago
parent 91063aac91
commit 6b344b81fd

@ -53,16 +53,16 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak
# Compile argon2_ref.c. Every listed header is a normal prerequisite, so
# touching one rebuilds the object; $(OBJDIR) is order-only (after the '|'),
# so it only has to exist and its timestamp never triggers a rebuild.
$(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR)
	$(CC) $(CCFLAGS) -c $< -o $@
$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR)
$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@
# Compile blake2/blake2b.c. The source file is the first prerequisite, so the
# recipe refers to it as $<. $(OBJDIR) is order-only: it must exist, but its
# timestamp is ignored.
$(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR)
	$(CC) $(CCFLAGS) -c $< -o $@
$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp) | $(OBJDIR)
$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@
$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR)
$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
$(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR)
@ -71,40 +71,40 @@ $(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OB
# Compile hashAes1Rx4.cpp. The source file is the first prerequisite ($<);
# the headers after it only drive rebuilds. $(OBJDIR) is order-only.
$(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR)
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
# Assemble JitCompilerX86-static.S through the C preprocessor
# (-x assembler-with-cpp). The asm/program_*.inc fragments are listed as
# prerequisites so that editing any of them rebuilds the object.
# $(OBJDIR) is order-only.
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)
	$(CXX) -x assembler-with-cpp -c $< -o $@
$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR)
$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc) configuration.h) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@
# Compile instructionsPortable.cpp. The source file is the first
# prerequisite ($<); the headers after it only drive rebuilds.
# $(OBJDIR) is order-only.
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp intrinPortable.h blake2/endian.h common.hpp) | $(OBJDIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@
$(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp blake2/endian.h common.hpp) | $(OBJDIR)
$(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp blake2/endian.h common.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Instruction.cpp -o $@
$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h dataset.hpp Cache.hpp virtualMemory.hpp LightClientAsyncWorker.hpp) | $(OBJDIR)
$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h dataset.hpp Cache.hpp virtualMemory.hpp LightClientAsyncWorker.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@
# Compile LightClientAsyncWorker.cpp. The source file is the first
# prerequisite ($<); the headers after it only drive rebuilds.
# $(OBJDIR) is order-only.
$(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@
$(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h) | $(OBJDIR)
$(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@
$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp) | $(OBJDIR)
$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@
$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h) | $(OBJDIR)
$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@
# Compile softAes.cpp. The source file is the first prerequisite ($<);
# softAes.h only drives rebuilds. $(OBJDIR) is order-only.
$(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@
$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp blake2/endian.h Program.hpp Instruction.hpp hashAes1Rx4.hpp softAes.h intrinPortable.h blake2/blake2.h) | $(OBJDIR)
$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp blake2/endian.h Program.hpp Instruction.hpp hashAes1Rx4.hpp softAes.h intrinPortable.h blake2/blake2.h configuration.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@
$(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR)

@ -0,0 +1,155 @@
;# Body of initBlock. This fragment is textually included by both the GNU
;# assembler wrapper and the MASM wrapper, so every comment uses the ;# form.
;#
;# Register roles as used below (set up by the including wrapper):
;#   rbp     - destination of the final 64-byte store
;#   rdi     - base address added to every 64-byte line offset
;#   rsi     - loop counter, decremented once per pass until it reaches zero
;#   r8      - state word c0, supplied by the includer
;#   r9-r15  - state words c1..c7, cleared to zero here
;#   rbx     - address of the 64-byte line selected in the current step
;#   rcx     - loaded right before each call to squareHash
;#   rax     - read right after each call to squareHash
;# NOTE(review): registers clobbered inside squareHash are not visible from
;# this file - confirm it preserves rbx, rbp, rsi, rdi and r8-r15.
;#
;# Hint the output line into cache before the loop starts.
prefetcht0 byte ptr [rbp]
;# Zero c1..c7. r8 (c0) is left as provided by the includer.
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
initBlock_loop:
;# c0: select a line. index = r8 AND (2^22 - 1), times 64 bytes, plus rdi.
mov rbx, r8
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
;# Call squareHash with rcx = r8 + r9. The value in rax replaces r9.
lea rcx, [r8+r9]
call squareHash
mov r9, rax
;# XOR the eight qwords of the selected line into the eight state words.
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c1: same step, line selected by r9, call input r9 + r10, result into r10.
mov rbx, r9
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r9+r10]
call squareHash
mov r10, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c2: line selected by r10, call input r10 + r11, result into r11.
mov rbx, r10
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r10+r11]
call squareHash
mov r11, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c3: line selected by r11, call input r11 + r12, result into r12.
mov rbx, r11
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r11+r12]
call squareHash
mov r12, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c4: line selected by r12, call input r12 + r13, result into r13.
mov rbx, r12
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r12+r13]
call squareHash
mov r13, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c5: line selected by r13, call input r13 + r14, result into r14.
mov rbx, r13
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r13+r14]
call squareHash
mov r14, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c6: line selected by r14, call input r14 + r15, result into r15.
mov rbx, r14
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r14+r15]
call squareHash
mov r15, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# c7: wraps around. Line selected by r15, call input r15 + r8, result into r8.
mov rbx, r15
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r15+r8]
call squareHash
mov r8, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
;# One pass of eight steps is done. Repeat until the counter in rsi hits zero.
;# The counter is tested only after the decrement, so an initial value of
;# zero does not skip the loop.
sub rsi, 1
jnz initBlock_loop
;# Write the eight state words to the 64 bytes at rbp.
mov qword ptr [rbp+0], r8
mov qword ptr [rbp+8], r9
mov qword ptr [rbp+16], r10
mov qword ptr [rbp+24], r11
mov qword ptr [rbp+32], r12
mov qword ptr [rbp+40], r13
mov qword ptr [rbp+48], r14
mov qword ptr [rbp+56], r15

@ -40,7 +40,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
#if !defined(_M_X64)
#if true //RANDOMX_ARGON_GROWTH != 0 || (!defined(_M_X64) && !defined(__x86_64__))
static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) {
uint8_t* mixBlock;
if (RANDOMX_ARGON_GROWTH == 0) {
@ -75,7 +75,7 @@ namespace RandomX {
uint8_t* mixBlock;
for (auto i = 0; i < RANDOMX_CACHE_ACCESSES / 8; ++i) {
for (auto i = 0; i < iterations; ++i) {
mixBlock = selectMixBlock(cache, c0, c1);
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);

@ -25,7 +25,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
#if defined(_M_X64)
#if false //RANDOMX_ARGON_GROWTH == 0 && (defined(_M_X64) || defined(__x86_64__))
extern "C"
#endif
void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations);

@ -10,8 +10,31 @@
#define DECL(x) x
#endif
#include "configuration.h"
.global DECL(squareHash)
.global DECL(initBlock)
/*
 * squareHash - exported entry point.
 *
 * The caller passes its argument in rdi. The shared body in
 * asm/squareHash.inc is entered with the value in rcx (initBlock.inc loads
 * rcx before every call and reads the result from rax), so the argument is
 * copied across first.
 *
 * squareHash_rcx is a file-local second entry that skips the copy. It
 * exists for initBlock below, which already has the input in rcx; entering
 * through the exported label would overwrite rcx with rdi.
 */
DECL(squareHash):
	mov rcx, rdi
squareHash_rcx:
#include "asm/squareHash.inc"

/*
 * void initBlock(const Cache& cache, uint8_t* out,
 *                uint64_t blockNumber, unsigned iterations)
 *
 * Assumes the System V AMD64 calling convention:
 *   rdi = &cache, rsi = out, rdx = blockNumber, ecx = iterations.
 *
 * Sets up the registers expected by asm/initBlock.inc:
 *   rdi = first 8 bytes of the Cache object
 *         (presumably the cache memory pointer - confirm against Cache layout)
 *   rbp = out
 *   r8  = blockNumber
 *   rsi = iterations, zero-extended to 64 bits
 *
 * rbx, rbp and r12-r15 are callee-saved and overwritten by the included
 * body, so they are pushed on entry and popped in reverse order on exit.
 */
DECL(initBlock):
	push rbx
	push rbp
	push r12
	push r13
	push r14
	push r15
	mov rdi, qword ptr [rdi]
	mov rbp, rsi
	mov r8, rdx
	/* iterations is a 32-bit unsigned: the upper half of rcx is not
	   guaranteed to be zero, and the loop counts down the full rsi.
	   Writing esi clears the upper 32 bits of rsi. */
	mov esi, ecx
/* Route the bare "call squareHash" inside the include to the rcx entry.
   This keeps the hash input intact and also resolves when DECL adds a
   symbol prefix. The mapping is removed again right after the include. */
#define squareHash squareHash_rcx
#include "asm/initBlock.inc"
#undef squareHash
	pop r15
	pop r14
	pop r13
	pop r12
	pop rbp
	pop rbx
	ret

@ -22,165 +22,11 @@ initBlock PROC
push r13
push r14
push r15
mov rsi, r9
mov rdi, qword ptr [rcx]
mov rbp, rdx
prefetcht0 byte ptr [rbp]
; r8 = blockNumber
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
initBlock_loop:
; c0
mov rbx, r8
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r8+r9]
call squareHash
mov r9, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c1
mov rbx, r9
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r9+r10]
call squareHash
mov r10, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c2
mov rbx, r10
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r10+r11]
call squareHash
mov r11, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c3
mov rbx, r11
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r11+r12]
call squareHash
mov r12, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c4
mov rbx, r12
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r12+r13]
call squareHash
mov r13, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c5
mov rbx, r13
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r13+r14]
call squareHash
mov r14, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c6
mov rbx, r14
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r14+r15]
call squareHash
mov r15, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
; c7
mov rbx, r15
and rbx, 4194303
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
lea rcx, [r15+r8]
call squareHash
mov r8, rax
xor r8, qword ptr [rbx+0]
xor r9, qword ptr [rbx+8]
xor r10, qword ptr [rbx+16]
xor r11, qword ptr [rbx+24]
xor r12, qword ptr [rbx+32]
xor r13, qword ptr [rbx+40]
xor r14, qword ptr [rbx+48]
xor r15, qword ptr [rbx+56]
sub rsi, 1
jnz initBlock_loop
mov qword ptr [rbp+0], r8
mov qword ptr [rbp+8], r9
mov qword ptr [rbp+16], r10
mov qword ptr [rbp+24], r11
mov qword ptr [rbp+32], r12
mov qword ptr [rbp+40], r13
mov qword ptr [rbp+48], r14
mov qword ptr [rbp+56], r15
mov rsi, r9
include asm/initBlock.inc
pop r15
pop r14
pop r13

Loading…
Cancel
Save