diff --git a/src/main.cpp b/src/main.cpp index ac63dce..b6efceb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,7 @@ along with RandomX. If not, see. #include "AssemblyGeneratorX86.hpp" #include "Stopwatch.hpp" #include "blake2/blake2.h" +#include "blake2/endian.h" #include #include #include @@ -125,12 +126,11 @@ void printUsage(const char* executable) { } template -void generateAsm(int nonce) { +void generateAsm(uint32_t nonce) { alignas(16) uint64_t hash[8]; uint8_t blockTemplate[sizeof(blockTemplate__)]; memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate)); - int* noncePtr = (int*)(blockTemplate + 39); - *noncePtr = nonce; + store32(blockTemplate + 39, nonce); blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); uint8_t scratchpad[RandomX::ScratchpadSize]; fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); @@ -142,12 +142,11 @@ void generateAsm(int nonce) { } template -void generateNative(int nonce) { +void generateNative(uint32_t nonce) { alignas(16) uint64_t hash[8]; uint8_t blockTemplate[sizeof(blockTemplate__)]; memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate)); - int* noncePtr = (int*)(blockTemplate + 39); - *noncePtr = nonce; + store32(blockTemplate + 39, nonce); blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); uint8_t scratchpad[RandomX::ScratchpadSize]; fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); @@ -161,16 +160,16 @@ void generateNative(int nonce) { } template -void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { +void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, uint8_t* scratchpad) { alignas(16) uint64_t hash[8]; uint8_t blockTemplate[sizeof(blockTemplate__)]; memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate)); - int* noncePtr = (int*)(blockTemplate + 39); - int nonce = atomicNonce.fetch_add(1); + void* noncePtr = blockTemplate + 39; + auto nonce = atomicNonce.fetch_add(1); while (nonce < noncesCount) { //std::cout << "Thread " << thread << " nonce " << nonce << std::endl; - *noncePtr = nonce; + store32(noncePtr, nonce); blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); vm->resetRoundingMode(); @@ -242,7 +241,7 @@ int main(int argc, char** argv) { if (softAes) std::cout << "Using software AES." << std::endl; - std::atomic atomicNonce(0); + std::atomic atomicNonce(0); AtomicHash result; std::vector vms; std::vector threads; diff --git a/src/program.inc b/src/program.inc index 8a18fe4..4171a54 100644 --- a/src/program.inc +++ b/src/program.inc @@ -471,9 +471,9 @@ mov eax, r13d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 + andps xmm12, xmm13 + orps xmm12, xmm14 divpd xmm5, xmm12 - maxpd xmm5, xmm13 ; IXOR_M r2, L1[r5] mov eax, r13d and eax, 16376 @@ -685,9 +685,9 @@ mov eax, r10d and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 + andps xmm12, xmm13 + orps xmm12, xmm14 divpd xmm4, xmm12 - maxpd xmm4, xmm13 ; IMUL_9C r4, -1849458799 lea r12, [r12+r12*8-1849458799] ; IADD_RC r1, r4, -651820510 @@ -707,9 +707,9 @@ mov eax, r8d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 + andps xmm12, xmm13 + orps xmm12, xmm14 divpd xmm7, xmm12 - maxpd xmm7, xmm13 ; IADD_M r3, L1[r7] mov eax, r15d and eax, 16376