diff --git a/src/blake2/endian.h b/src/blake2/endian.h index fab1eed..c7afed2 100644 --- a/src/blake2/endian.h +++ b/src/blake2/endian.h @@ -41,11 +41,15 @@ static FORCE_INLINE uint32_t load32(const void *src) { #endif } -static FORCE_INLINE uint64_t load64(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) +static FORCE_INLINE uint64_t load64_native(const void *src) { uint64_t w; memcpy(&w, src, sizeof w); return w; +} + +static FORCE_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + return load64_native(src); #else const uint8_t *p = (const uint8_t *)src; uint64_t w = *p++; @@ -75,9 +79,13 @@ static FORCE_INLINE void store32(void *dst, uint32_t w) { #endif } +static FORCE_INLINE void store64_native(void *dst, uint64_t w) { + memcpy(dst, &w, sizeof w); +} + static FORCE_INLINE void store64(void *dst, uint64_t w) { #if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); + store64_native(dst, w); #else uint8_t *p = (uint8_t *)dst; *p++ = (uint8_t)w; diff --git a/src/dataset.cpp b/src/dataset.cpp index 5858115..ee0958f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -192,7 +192,7 @@ namespace randomx { executeSuperscalar(rl, prog, &cache->reciprocalCache); for (unsigned q = 0; q < 8; ++q) - rl[q] ^= load64(mixBlock + 8 * q); + rl[q] ^= load64_native(mixBlock + 8 * q); registerValue = rl[prog.getAddressRegister()]; } diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 221a56b..32aba08 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -295,7 +295,8 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) { __m128i x; - x.u64[0] = load64(mem_addr); + x.u32[0] = load32((uint8_t*)mem_addr + 0); + x.u32[1] = load32((uint8_t*)mem_addr + 4); return x; } diff --git a/src/superscalar.cpp b/src/superscalar.cpp index 8bf757e..54e376f 100644 --- a/src/superscalar.cpp +++ b/src/superscalar.cpp @@ -573,14 +573,6 @@ namespace randomx { constexpr int LOOK_FORWARD_CYCLES = 4; constexpr int MAX_THROWAWAY_COUNT = 256; -#ifndef _DEBUG - constexpr bool TRACE = false; - constexpr bool INFO = false; -#else - constexpr bool TRACE = true; - constexpr bool INFO = true; -#endif - template static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload @@ -588,21 +580,21 @@ namespace randomx { for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { if (commit) { - if (TRACE) std::cout << "; P5 at cycle " << cycle << std::endl; + if (trace) std::cout << "; P5 at cycle " << cycle << std::endl; portBusy[cycle][2] = uop; } return cycle; } if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { if (commit) { - if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + if (trace) std::cout << "; P0 at cycle " << cycle << std::endl; portBusy[cycle][0] = uop; } return cycle; } if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { if (commit) { - if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; + if (trace) std::cout << "; P1 at cycle " << cycle << std::endl; portBusy[cycle][1] = uop; } return cycle; @@ -621,7 +613,7 @@ namespace randomx { //move instructions are eliminated and don't need an execution unit if (mop.isEliminated()) { if (commit) - if (TRACE) std::cout << "; (eliminated)" << std::endl; + if (trace) std::cout << "; (eliminated)" << std::endl; return cycle; } else if (mop.isSimple()) { @@ -677,7 +669,7 @@ namespace randomx { //select a decode configuration decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); - if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; + if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; int bufferIndex = 0; @@ -692,15 +684,15 @@ namespace randomx { //select an instruction so that the first macro-op fits into the current slot currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); macroOpIndex = 0; - if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; + if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex); - if (TRACE) std::cout << mop.getName() << " "; + if (trace) std::cout << mop.getName() << " "; //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); if (scheduleCycle < 0) { - if (TRACE) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; + if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; //__debugbreak(); portsSaturated = true; break; @@ -711,7 +703,7 @@ namespace randomx { int forward; //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { - if (TRACE) std::cout << "; src STALL at cycle " << cycle << std::endl; + if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } @@ -720,22 +712,22 @@ namespace randomx { if (throwAwayCount < MAX_THROWAWAY_COUNT) { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); - if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; //cycle = topCycle; continue; } //abort this decode buffer - if (TRACE) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; currentInstruction = SuperscalarInstruction::Null; break; } - if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + if (trace) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } //find a destination register that will be ready when this instruction executes if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { int forward; for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { - if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; + if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } @@ -743,16 +735,16 @@ namespace randomx { if (throwAwayCount < MAX_THROWAWAY_COUNT) { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); - if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; //cycle = topCycle; continue; } //abort this decode buffer - if (TRACE) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; currentInstruction = SuperscalarInstruction::Null; break; } - if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } throwAwayCount = 0; @@ -773,7 +765,7 @@ namespace randomx { ri.latency = retireCycle; ri.lastOpGroup = currentInstruction.getGroup(); ri.lastOpPar = currentInstruction.getGroupPar(); - if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; + if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; } codeSize += mop.getSize(); bufferIndex++; diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 34f5c47..c422702 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -37,14 +37,6 @@ const uint8_t blockTemplate_[] = { 0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09 }; -constexpr char hexmap[] = "0123456789abcdef"; -void outputHex(std::ostream& os, const char* data, int length) { - for (int i = 0; i < length; ++i) { - os << hexmap[(data[i] & 0xF0) >> 4]; - os << hexmap[data[i] & 0x0F]; - } -} - class AtomicHash { public: AtomicHash() { @@ -101,7 +93,8 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result int main(int argc, char** argv) { bool softAes, miningMode, verificationMode, help, largePages, jit; int noncesCount, threadCount, initThreadCount; - int32_t seed; + int32_t seedValue; + char seed[4]; readOption("--softAes", argc, argv, softAes); readOption("--mine", argc, argv, miningMode); @@ -109,11 +102,13 @@ int main(int argc, char** argv) { readIntOption("--threads", argc, argv, threadCount, 1); readIntOption("--nonces", argc, argv, noncesCount, 1000); readIntOption("--init", argc, argv, initThreadCount, 1); - readIntOption("--seed", argc, argv, seed, 0); + readIntOption("--seed", argc, argv, seedValue, 0); readOption("--largePages", argc, argv, largePages); readOption("--jit", argc, argv, jit); readOption("--help", argc, argv, help); + store32(&seed, seedValue); + std::cout << "RandomX benchmark" << std::endl; if (help || (!miningMode && !verificationMode)) { @@ -229,7 +224,7 @@ int main(int argc, char** argv) { double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seed == 0) + if (noncesCount == 1000 && seedValue == 0) std::cout << "Reference result: b69741719152625854031c2337ceae68c3030f2b9581a73acebaa69fc9b555fc" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/tests/utility.hpp b/src/tests/utility.hpp index 2dc6bb7..f3ed029 100644 --- a/src/tests/utility.hpp +++ b/src/tests/utility.hpp @@ -24,6 +24,14 @@ along with RandomX. If not, see. #include #include +constexpr char hexmap[] = "0123456789abcdef"; +inline void outputHex(std::ostream& os, const char* data, int length) { + for (int i = 0; i < length; ++i) { + os << hexmap[(data[i] & 0xF0) >> 4]; + os << hexmap[data[i] & 0x0F]; + } +} + inline void dump(const char* buffer, uint64_t count, const char* name) { std::ofstream fout(name, std::ios::out | std::ios::binary); fout.write(buffer, count); diff --git a/src/vm_interpreted.cpp b/src/vm_interpreted.cpp index 4508330..22109d3 100644 --- a/src/vm_interpreted.cpp +++ b/src/vm_interpreted.cpp @@ -114,7 +114,7 @@ namespace randomx { template void InterpretedVm::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[ic]; - if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); + if (trace && ibc.type != InstructionType::NOP) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); switch (ibc.type) { case InstructionType::IADD_RS: { @@ -270,7 +270,7 @@ namespace randomx { default: UNREACHABLE; } - if (trace) { + if (trace && ibc.type != InstructionType::NOP) { if(ibc.type < 20 || ibc.type == 31 || ibc.type == 32) print(*ibc.idst); else //if(ibc.type >= 20 && ibc.type <= 30)