diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp
index dddcd07..5c89aed 100644
--- a/src/jit_compiler_a64.cpp
+++ b/src/jit_compiler_a64.cpp
@@ -68,7 +68,6 @@ static const size_t CodeSize = ((uint8_t*)randomx_init_dataset_aarch64_end) - ((
 static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
 static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
 static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64);
-static const size_t InstructionsEnd = ((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64);
 
 static const size_t CalcDatasetItemSize =
 	(((uint8_t*)randomx_calc_dataset_item_aarch64_end) - ((uint8_t*)randomx_calc_dataset_item_aarch64)) +
@@ -150,6 +149,74 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
 #endif
 }
 
+// Generates the JIT program for light mode. The emitted VM instructions end
+// with a branch to randomx_program_aarch64_vm_instructions_end_light, which
+// calls randomx_calc_dataset_item_aarch64 on the cache and then continues at
+// randomx_program_aarch64_xor_with_dataset_line.
+//
+// program       - VM program; each instruction's src/dst is reduced modulo
+//                 RegistersCount in place before it is compiled.
+// config        - supplies readReg0..readReg3 for the spMix1/spMix2 updates.
+// datasetOffset - divided by CacheLineSize, then patched into the two
+//                 "add x2, x2, 0" placeholders as the low 12 bits and the
+//                 remaining high bits (ADD_IMM_HI presumably shifts by 12 -- confirm).
+void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset)
+{
+	uint32_t codePos = MainLoopBegin + 4;
+
+	// and w16, w10, ScratchpadL3Mask64
+	emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+
+	// and w17, w18, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+
+	codePos = PrologueSize;
+	literalPos = ImulRcpLiteralsEnd;
+	num32bitLiterals = 0;
+
+	for (uint32_t i = 0; i < RegistersCount; ++i)
+		reg_changed_offset[i] = codePos;
+
+	for (uint32_t i = 0; i < program.getSize(); ++i)
+	{
+		Instruction& instr = program(i);
+		instr.src %= RegistersCount;
+		instr.dst %= RegistersCount;
+		(this->*engine[instr.opcode])(instr, codePos);
+	}
+
+	// Update spMix2
+	// eor w18, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+
+	// Jump back to the main loop
+	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
+	emit32(ARMV8A::B | (offset / 4), code, codePos);
+
+	// and w2, w9, CacheLineAlignMask
+	codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - ((uint8_t*)randomx_program_aarch64));
+	emit32(0x121A0000 | 2 | (9 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
+
+	// Update spMix1
+	// eor x10, config.readReg0, config.readReg1
+	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
+	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
+
+	// Apply dataset offset
+	codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - ((uint8_t*)randomx_program_aarch64);
+
+	datasetOffset /= CacheLineSize;
+	const uint32_t imm_lo = datasetOffset & ((1 << 12) - 1);
+	const uint32_t imm_hi = datasetOffset >> 12;
+
+	emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (imm_lo << 10), code, codePos);
+	emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (imm_hi << 10), code, codePos);
+
+#ifdef __GNUC__
+	__builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#endif
+}
+
 template<size_t N>
 void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache)
 {
diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp
index ba6e263..fa96756 100644
--- a/src/jit_compiler_a64.hpp
+++ b/src/jit_compiler_a64.hpp
@@ -50,8 +50,7 @@ namespace randomx {
 		~JitCompilerA64();
 
 		void generateProgram(Program&, ProgramConfiguration&);
-
-		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {}
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
 
 		template<size_t N>
 		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &);
diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S
index bf901a9..c4c433b 100644
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@@ -35,6 +35,9 @@
 	.global randomx_program_aarch64_cacheline_align_mask1
 	.global randomx_program_aarch64_cacheline_align_mask2
 	.global randomx_program_aarch64_update_spMix1
+	.global randomx_program_aarch64_vm_instructions_end_light
+	.global randomx_program_aarch64_light_cacheline_align_mask
+	.global randomx_program_aarch64_light_dataset_offset
 	.global randomx_init_dataset_aarch64
 	.global randomx_init_dataset_aarch64_end
 	.global randomx_calc_dataset_item_aarch64
@@ -318,6 +321,7 @@ randomx_program_aarch64_cacheline_align_mask2:
 	and	x10, x10, 1
 	add	x10, x10, x1
 
+randomx_program_aarch64_xor_with_dataset_line:
 	# xor integer registers with dataset data
 	ldp	x18, x19, [x10]
 	eor	x4, x4, x18
@@ -386,6 +390,46 @@ randomx_program_aarch64_update_spMix1:
 
 	ret
 
+randomx_program_aarch64_vm_instructions_end_light:
+	sub	sp, sp, 96
+	stp	x0, x1, [sp, 64]
+	stp	x2, x30, [sp, 80]
+
+	# mx ^= r[readReg2] ^ r[readReg3];
+	eor	x9, x9, x18
+
+	# mx <-> ma
+	ror	x9, x9, 32
+
+	# x0 -> pointer to cache memory
+	mov	x0, x1
+
+	# x1 -> pointer to output
+	mov	x1, sp
+
+randomx_program_aarch64_light_cacheline_align_mask:
+	# Actual mask will be inserted by JIT compiler
+	and	w2, w9, 1
+
+	# x2 -> item number
+	lsr	x2, x2, 6
+
+randomx_program_aarch64_light_dataset_offset:
+	# Apply dataset offset (filled in by JIT compiler)
+	add	x2, x2, 0
+	add	x2, x2, 0
+
+	bl	randomx_calc_dataset_item_aarch64
+
+	mov	x10, sp
+	ldp	x0, x1, [sp, 64]
+	ldp	x2, x30, [sp, 80]
+	add	sp, sp, 96
+
+	b	randomx_program_aarch64_xor_with_dataset_line
+
+
+
 # Input parameters
 #
 # x0 -> pointer to cache
diff --git a/src/jit_compiler_a64_static.hpp b/src/jit_compiler_a64_static.hpp
index 6a49480..a9b922e 100644
--- a/src/jit_compiler_a64_static.hpp
+++ b/src/jit_compiler_a64_static.hpp
@@ -38,6 +38,9 @@ extern "C" {
 	void randomx_program_aarch64_cacheline_align_mask1();
 	void randomx_program_aarch64_cacheline_align_mask2();
 	void randomx_program_aarch64_update_spMix1();
+	void randomx_program_aarch64_vm_instructions_end_light();
+	void randomx_program_aarch64_light_cacheline_align_mask();
+	void randomx_program_aarch64_light_dataset_offset();
 	void randomx_init_dataset_aarch64();
 	void randomx_init_dataset_aarch64_end();
 	void randomx_calc_dataset_item_aarch64();
diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp
index def3a64..ef4f4c7 100644
--- a/src/tests/benchmark.cpp
+++ b/src/tests/benchmark.cpp
@@ -254,6 +254,7 @@ int main(int argc, char** argv) {
 					p += nBytes;
 				}
 				fclose(fp);
+				std::cout << "Dataset loaded in " << sw.getElapsed() << " s" << std::endl;
 			}
 			else
 			{
@@ -261,6 +262,8 @@ int main(int argc, char** argv) {
 			}
 			if (!read_ok)
 			{
+				Stopwatch dataset_initialization(true);
+
 				uint32_t datasetItemCount = randomx_dataset_item_count();
 				if (initThreadCount > 1) {
 					auto perThread = datasetItemCount / initThreadCount;
@@ -278,6 +281,7 @@ int main(int argc, char** argv) {
 				else {
 					randomx_init_dataset(dataset, cache, 0, datasetItemCount);
 				}
+				std::cout << "Dataset initialized in " << dataset_initialization.getElapsed() << " s" << std::endl;
 
 				fp = fopen("dataset.bin", "wb");
 				if (fp)
@@ -290,7 +294,6 @@ int main(int argc, char** argv) {
 			cache = nullptr;
 			threads.clear();
 		}
-		std::cout << "Memory initialized in " << sw.getElapsed() << " s" << std::endl;
 		std::cout << "Initializing " << threadCount << " virtual machine(s) ..." << std::endl;
 		for (int i = 0; i < threadCount; ++i) {
 			randomx_vm *vm = randomx_create_vm(flags, cache, dataset);