Vector FPU instructions

JitCompilerX86 - static code written in asm
Updated ALU/FPU tests
Updated instruction weights
random-access
tevador 5 years ago
parent a09bee8d60
commit 3caecc7646

@ -12,6 +12,9 @@ OBJDIR=obj
LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
ifeq ($(PLATFORM),x86_64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o
endif
all: release test
@ -57,6 +60,9 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

@ -54,7 +54,7 @@ namespace RandomX {
(this->*generator)(instr, i);
}
void AssemblyGeneratorX86::gena(Instruction& instr) {
void AssemblyGeneratorX86::genar(Instruction& instr) {
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
switch (instr.loca & 7)
{
@ -63,7 +63,7 @@ namespace RandomX {
case 2:
case 3:
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
asmCode << "\tcall rx_read_dataset" << std::endl;
asmCode << "\tcall rx_read_dataset_r" << std::endl;
return;
case 4:
@ -80,6 +80,33 @@ namespace RandomX {
}
}
// Emits the assembly text that fetches operand A for a floating-point instruction.
//
// The source integer register (rega) is first XOR-ed with the instruction's
// addra constant. Depending on the low three bits of loca, the value is then
// either passed in ecx to the rx_read_dataset_f routine (locations 0-3) or
// masked into a scratchpad index (ScratchpadL2 for location 4, ScratchpadL1
// for 5-7) whose 8-byte slot is loaded into xmm0 with cvtdq2pd.
void AssemblyGeneratorX86::genaf(Instruction& instr) {
	const auto srcReg = instr.rega % RegistersCount;
	asmCode << "\txor " << regR[srcReg] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;

	const auto location = instr.loca & 7;

	// Locations 0-3: delegate the read to the dataset routine.
	if (location <= 3) {
		asmCode << "\tmov ecx, " << regR32[srcReg] << std::endl;
		asmCode << "\tcall rx_read_dataset_f" << std::endl;
		return;
	}

	// Locations 4-7: scratchpad read; only the index mask differs.
	asmCode << "\tmov eax, " << regR32[srcReg] << std::endl;
	if (location == 4) {
		asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
	}
	else {
		asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
	}
	asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl;
}
void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) {
switch (instr.locb & 7)
{
@ -87,8 +114,6 @@ namespace RandomX {
case 1:
case 2:
case 3:
case 4:
case 5:
asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
return;
@ -133,26 +158,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) {
asmCode << "\tand rax, -2048" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
switch (instr.locb & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
return;
default:
convertible_t bimm;
bimm.f64 = (double)instr.imm32;
asmCode << "\tmov rax, " << bimm.i64 << std::endl;
asmCode << "\tmovd xmm1, rax" << std::endl;
asmCode << "\t" << instrx86 << " xmm0, xmm1" << std::endl;
return;
}
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
}
void AssemblyGeneratorX86::gencr(Instruction& instr) {
@ -165,7 +171,7 @@ namespace RandomX {
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
}
return;
@ -178,76 +184,75 @@ namespace RandomX {
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rcx" << std::endl;
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl;
}
return;
default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
}
}
}
void AssemblyGeneratorX86::gencf(Instruction& instr) {
void AssemblyGeneratorX86::gencf(Instruction& instr, bool alwaysLow = false) {
if(!alwaysLow)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (!alwaysLow && (instr.locc & 8)) ? "movhpd" : "movlpd";
switch (instr.locc & 7)
{
case 0:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
break;
case 1:
case 2:
case 3:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\tmovd qword ptr [rsi + rax * 8], xmm0" << std::endl;
break;
case 4:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
break;
default:
asmCode << "\tmovsd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
break;
case 5:
case 6:
case 7:
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl;
break;
}
if (trace) {
asmCode << "\tmovd qword ptr [rsi + rdi * 8 + 262144], xmm0" << std::endl;
asmCode << "\t" << store << " qword ptr [rsi + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl;
}
}
void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tadd rax, ";
genbr1(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tadd eax, ";
genbr132(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tsub rax, ";
genbr1(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tsub eax, ";
genbr132(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\timul rax, ";
if ((instr.locb & 7) >= 6) {
asmCode << "rax, ";
@ -257,7 +262,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmov rcx, ";
genbr1(instr);
asmCode << "\tmul rcx" << std::endl;
@ -266,7 +271,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmov ecx, eax" << std::endl;
asmCode << "\tmov eax, ";
genbr132(instr);
@ -275,7 +280,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmovsxd rcx, eax" << std::endl;
if ((instr.locb & 7) >= 6) {
asmCode << "\tmov rax, " << instr.imm32 << std::endl;
@ -288,7 +293,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmov rcx, ";
genbr1(instr);
asmCode << "\timul rcx" << std::endl;
@ -297,7 +302,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
if ((instr.locb & 7) >= 6) {
if (instr.imm32 == 0) {
asmCode << "\tmov ecx, 1" << std::endl;
@ -318,7 +323,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmov edx, ";
genbr132(instr);
asmCode << "\tcmp edx, -1" << std::endl;
@ -339,123 +344,125 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tand rax, ";
genbr1(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tand eax, ";
genbr132(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tor rax, ";
genbr1(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tor eax, ";
genbr132(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\txor rax, ";
genbr1(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\txor eax, ";
genbr132(instr);
gencr(instr);
}
void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
genbr0(instr, "shl");
gencr(instr);
}
void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
genbr0(instr, "shr");
gencr(instr);
}
void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
genbr0(instr, "sar");
gencr(instr);
}
void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
genbr0(instr, "rol");
gencr(instr);
}
void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
gena(instr);
genar(instr);
genbr0(instr, "ror");
gencr(instr);
}
void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) {
gena(instr);
genbf(instr, "addsd");
genaf(instr);
genbf(instr, "addpd");
gencf(instr);
}
void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) {
gena(instr);
genbf(instr, "subsd");
genaf(instr);
genbf(instr, "subpd");
gencf(instr);
}
void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) {
gena(instr);
asmCode << "\tor rax, 2048" << std::endl;
genbf(instr, "mulsd");
genaf(instr);
genbf(instr, "mulpd");
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr);
}
void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) {
gena(instr);
asmCode << "\tor rax, 2048" << std::endl;
genbf(instr, "divsd");
genaf(instr);
genbf(instr, "divpd");
asmCode << "\tmovaps xmm1, xmm0" << std::endl;
asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl;
asmCode << "\tandps xmm0, xmm1" << std::endl;
gencf(instr);
}
void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) {
gena(instr);
asmCode << "\tmov rcx, 9223372036854773760" << std::endl;
asmCode << "\tand rax, rcx" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rax" << std::endl;
asmCode << "\tsqrtsd xmm0, xmm0" << std::endl;
genaf(instr);
asmCode << "\tandps xmm0, xmm10" << std::endl;
asmCode << "\tsqrtpd xmm0, xmm0" << std::endl;
gencf(instr);
}
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tshl eax, 13" << std::endl;
asmCode << "\tand rcx, -2048" << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
asmCode << "\tcvtsi2sd xmm0, rcx" << std::endl;
asmCode << "\tcvtsi2sd " << regF[instr.regc % RegistersCount] << ", rcx" << std::endl;
asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencf(instr);
gencf(instr, true);
}
static inline const char* jumpCondition(Instruction& instr, bool invert = false) {
@ -481,7 +488,7 @@ namespace RandomX {
}
void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl;
asmCode << "\t" << jumpCondition(instr);
asmCode << " short taken_call_" << i << std::endl;
@ -489,14 +496,14 @@ namespace RandomX {
asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl;
asmCode << "taken_call_" << i << ":" << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262144], rax" << std::endl;
asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl;
}
asmCode << "\tpush rax" << std::endl;
asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl;
}
void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) {
gena(instr);
genar(instr);
asmCode << "\tcmp rsp, rbp" << std::endl;
asmCode << "\tje short not_taken_ret_" << i << std::endl;
asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl;

@ -38,13 +38,14 @@ namespace RandomX {
static InstructionGenerator engine[256];
std::stringstream asmCode;
void gena(Instruction&);
void genar(Instruction&);
void genaf(Instruction&);
void genbr0(Instruction&, const char*);
void genbr1(Instruction&);
void genbr132(Instruction&);
void genbf(Instruction&, const char*);
void gencr(Instruction&);
void gencf(Instruction&);
void gencf(Instruction&, bool);
void generateCode(Instruction&, int);

@ -26,9 +26,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) {
#if !defined(_M_X64) && !defined(__x86_64__)
throw std::runtime_error("Compiled VM only supports x86-64 CPUs");
#endif
}
void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
@ -51,7 +49,7 @@ namespace RandomX {
void CompiledVirtualMachine::execute() {
//executeProgram(reg, mem, scratchpad, readDataset);
compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACE
#ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl;
}

@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#pragma once
//#define TRACE
//#define TRACEVM
#include "VirtualMachine.hpp"
#include "JitCompilerX86.hpp"
@ -34,7 +34,7 @@ namespace RandomX {
return compiler.getCode();
}
private:
#ifdef TRACE
#ifdef TRACEVM
convertible_t tracepad[InstructionCount];
#endif
JitCompilerX86 compiler;

@ -44,9 +44,11 @@ namespace RandomX {
*(((uint32_t*)&reg) + i) = gen();
}
FPINIT();
for (int i = 0; i < 8; ++i) {
reg.f[i].f64 = (double)reg.f[i].i64;
for (int i = 0; i < RegistersCount; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
}
//std::cout << reg;
p.initialize(gen);
mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
mem.mx = *(((uint32_t*)seed) + 5);
@ -97,96 +99,98 @@ namespace RandomX {
convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) {
switch (inst.locb & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
return reg.r[inst.regb % RegistersCount];
case 6:
case 7:
convertible_t temp;
temp.i64 = inst.imm32; //sign-extend imm32
return temp;
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
return reg.r[inst.regb % RegistersCount];
case 6:
case 7:
convertible_t temp;
temp.i64 = inst.imm32; //sign-extend imm32
return temp;
}
}
convertible_t InterpretedVirtualMachine::loadbr0(Instruction& inst) {
switch (inst.locb & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
return reg.r[inst.regb % RegistersCount];
case 6:
case 7:
convertible_t temp;
temp.u64 = inst.imm8;
return temp;
case 0:
case 1:
case 2:
case 3:
return reg.r[inst.regb % RegistersCount];
case 4:
case 5:
case 6:
case 7:
convertible_t temp;
temp.u64 = inst.imm8;
return temp;
}
}
double InterpretedVirtualMachine::loadbf(Instruction& inst) {
switch (inst.locb & 7)
convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
addr_t addr;
switch (inst.locc & 7)
{
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
return reg.f[inst.regb % RegistersCount].f64;
case 6:
case 7:
return (double)inst.imm32;
case 0:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL2];
case 1:
case 2:
case 3:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL1];
case 4:
case 5:
case 6:
case 7:
return reg.r[inst.regc % RegistersCount];
}
}
convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) {
void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) {
addr_t addr;
switch (inst.locc & 7)
{
case 0:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL2];
case 1:
case 2:
case 3:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL1];
case 4:
case 5:
case 6:
case 7:
return reg.r[inst.regc % RegistersCount];
case 4:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo;
break;
case 5:
case 6:
case 7:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo;
default:
break;
}
}
convertible_t& InterpretedVirtualMachine::getcf(Instruction& inst) {
void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) {
addr_t addr;
switch (inst.locc & 7)
{
case 0:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL2];
case 1:
case 2:
case 3:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
return scratchpad[addr % ScratchpadL1];
case 4:
case 5:
case 6:
case 7:
return reg.f[inst.regc % RegistersCount];
case 4:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL2] = regc.lo;
break;
case 5:
case 6:
case 7:
addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc;
scratchpad[addr % ScratchpadL1] = regc.lo;
default:
break;
}
}
@ -194,22 +198,18 @@ namespace RandomX {
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
#define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \
if(trace) { \
convertible_t bc; \
bc.f64 = b; \
std::cout << std::hex << /*a.u64 << " " << bc.u64 << " " <<*/ c.u64 << std::endl; \
std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; \
} \
if(fpuCheck) { \
convertible_t bc; \
if(c.f64 != c.f64) { \
if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64) { \
std::stringstream ss; \
bc.f64 = b; \
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
throw std::runtime_error(ss.str()); \
} else if (std::fpclassify(c.f64) == FP_SUBNORMAL) {\
} else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\
std::stringstream ss; \
bc.f64 = b; \
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << bc.u64 << ") = " << c.u64; \
ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \
throw std::runtime_error(ss.str()); \
} \
}
@ -220,8 +220,13 @@ namespace RandomX {
#define INC_COUNT(x)
#endif
#define FPU_RETIRE_NB(x) x(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl;
// Retire step for FPSQRT, selected through FPU_RETIRE_##x in FPU_INST_NB.
// The macro argument is not used: the body always invokes FPSQRT(a, b, c),
// stores the result with writecf and, when tracing, prints the half of c
// chosen by bit 3 of inst.locc. Expects a, b, c, inst and trace to be in
// scope at the expansion site.
#define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \
writecf(inst, c); \
if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl;
// Retire step for FPROUND, selected through FPU_RETIRE_##x in FPU_INST_NB.
// The macro argument is not used: the body always invokes FPROUND(a, b, c),
// stores via writecflo (which only ever writes c.lo) and traces c.lo.
#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \
writecflo(inst, c); \
if(trace) std::cout << std::hex << c.lo.u64 << std::endl;
#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
@ -242,17 +247,17 @@ namespace RandomX {
#define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
double b = loadbf(inst); \
convertible_t& c = getcf(inst); \
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \
}
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \
INC_COUNT(x) \
convertible_t a = loada(inst); \
convertible_t b; \
convertible_t& c = getcf(inst); \
FPU_RETIRE_NB(x) \
fpu_reg_t b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE_##x(x) \
}
ALU_INST(ADD_64)

@ -18,7 +18,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#pragma once
#define STATS
//#define STATS
#include "VirtualMachine.hpp"
#include "Program.hpp"
#include <vector>
@ -88,9 +88,9 @@ namespace RandomX {
convertible_t loada(Instruction&);
convertible_t loadbr0(Instruction&);
convertible_t loadbr1(Instruction&);
double loadbf(Instruction&);
convertible_t& getcr(Instruction&);
convertible_t& getcf(Instruction&);
void writecf(Instruction&, fpu_reg_t&);
void writecflo(Instruction&, fpu_reg_t&);
void stackPush(convertible_t& c) {
stack.push_back(c);

@ -0,0 +1,58 @@
;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
;# Static code fragments used by the JIT compiler (GNU assembler, run through the C preprocessor).
;# The labels below delimit the fragments. The C++ side derives each fragment size from the
;# distance between consecutive labels, so the order of the labels must not change.
.intel_syntax noprefix
;# Section selection: plain .text on Apple targets, .section .text elsewhere.
#if defined(__APPLE__)
.text
#else
.section .text
#endif
;# DECL(x) produces the exported symbol name: x with a leading underscore when
;# __WIN32__ or __APPLE__ is defined, x unchanged otherwise.
;# NOTE(review): the includes below always pull in the *_linux.inc prologue and epilogue,
;# even when __WIN32__ is defined - confirm this file is only built for non-Windows targets.
#if defined(__WIN32__) || defined(__APPLE__)
#define DECL(x) _##x
#else
#define DECL(x) x
#endif
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_r)
.global DECL(randomx_program_read_f)
.global DECL(randomx_program_end)
;# Program prologue fragment.
.align 64
DECL(randomx_program_prologue):
#include "asm/program_prologue_linux.inc"
;# Marker label only: its address ends the prologue fragment. The nop gives it a body.
.align 64
DECL(randomx_program_begin):
nop
;# Program epilogue fragment.
.align 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
;# Read routine fragment included from program_read_r.inc.
.align 64
DECL(randomx_program_read_r):
#include "asm/program_read_r.inc"
;# Read routine fragment included from program_read_f.inc.
.align 64
DECL(randomx_program_read_f):
#include "asm/program_read_f.inc"
;# Marker label only: its address ends the last fragment.
.align 64
DECL(randomx_program_end):
nop

@ -0,0 +1,59 @@
;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
;# Static code fragments used by the JIT compiler (MASM version).
;# Each PROC below marks the start of one fragment. The C++ side derives fragment sizes
;# from the distance between consecutive symbols, so the order must not change.
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_r
PUBLIC randomx_program_read_f
PUBLIC randomx_program_end
;# Program prologue fragment (Win64 variant).
ALIGN 64
randomx_program_prologue PROC
include asm/program_prologue_win64.inc
randomx_program_prologue ENDP
;# Marker symbol only: its address ends the prologue fragment.
ALIGN 64
randomx_program_begin PROC
nop
randomx_program_begin ENDP
;# Program epilogue fragment (Win64 variant).
ALIGN 64
randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
;# Read routine fragment included from program_read_r.inc.
ALIGN 64
randomx_program_read_r PROC
include asm/program_read_r.inc
randomx_program_read_r ENDP
;# Read routine fragment included from program_read_f.inc.
ALIGN 64
randomx_program_read_f PROC
include asm/program_read_f.inc
randomx_program_read_f ENDP
;# Marker symbol only: its address ends the last fragment.
ALIGN 64
randomx_program_end PROC
nop
randomx_program_end ENDP
_RANDOMX_JITX86_STATIC ENDS
END

@ -0,0 +1,27 @@
/*
Copyright (c) 2018 tevador
This file is part of RandomX.
RandomX is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RandomX is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#pragma once

// Symbols exported by the hand-written assembly file (JitCompilerX86-static).
// They are declared as C functions only so that their addresses can be taken;
// the JIT compiler treats each one as the start of a block of machine code
// and measures the blocks by subtracting consecutive addresses.
extern "C" {
	void randomx_program_prologue();  // start of the program prologue fragment
	void randomx_program_begin();     // marker: end of the prologue fragment
	void randomx_program_epilogue();  // start of the program epilogue fragment
	void randomx_program_read_r();    // start of the read_r fragment
	void randomx_program_read_f();    // start of the read_f fragment
	void randomx_program_end();       // marker: end of the last fragment
}

@ -34,6 +34,16 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
#if !defined(_M_X64) && !defined(__x86_64__)
JitCompilerX86::JitCompilerX86() {
throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
}
#else
/*
REGISTER ALLOCATION:
@ -41,7 +51,7 @@ namespace RandomX {
rbx -> MemoryRegisters& memory
rcx -> temporary
rdx -> temporary
rsi -> convertible_t& scratchpad
rsi -> convertible_t* scratchpad
rdi -> "ic" (instruction counter)
rbp -> beginning of VM stack
rsp -> end of VM stack
@ -63,6 +73,7 @@ namespace RandomX {
xmm7 -> "f7"
xmm8 -> "f0"
xmm9 -> "f1"
xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
STACK STRUCTURE:
@ -81,127 +92,23 @@ namespace RandomX {
*/
constexpr uint8_t ic3 = ((InstructionCount + 1) >> 24);
constexpr uint8_t ic2 = ((InstructionCount + 1) >> 16);
constexpr uint8_t ic1 = ((InstructionCount + 1) >> 8);
constexpr uint8_t ic0 = ((InstructionCount + 1) >> 0);
const uint8_t prologue[] = {
0x53, //push rbx
0x55, //push rbp
#ifdef _WIN32
0x57, //push rdi
0x56, //push rsi
#endif
0x41, 0x54, //push r12
0x41, 0x55, //push r13
0x41, 0x56, //push r14
0x41, 0x57, //push r15
#ifdef _WIN32
0x48, 0x83, 0xec, 0x48, //sub rsp,0x48
0xf3, 0x0f, 0x7f, 0x74, 0x24, 0x30, //movdqu XMMWORD PTR[rsp + 0x30],xmm6
0xf3, 0x0f, 0x7f, 0x7c, 0x24, 0x20, //movdqu XMMWORD PTR[rsp + 0x20],xmm7
0xf3, 0x44, 0x0f, 0x7f, 0x44, 0x24, 0x10, //movdqu XMMWORD PTR[rsp + 0x10],xmm8
0xf3, 0x44, 0x0f, 0x7f, 0x0c, 0x24, //movdqu XMMWORD PTR[rsp],xmm9
0x51, //push rcx
0x48, 0x8b, 0xda, //mov rbx,rdx
0x49, 0x8b, 0xf0, //mov rsi,r8
#else
0x57, //push rdi
0x48, 0x8b, 0xde, //mov rbx, rsi
0x48, 0x8b, 0xf2, //mov rsi, rdx
0x48, 0x8b, 0xcf, //mov rcx, rdi
#endif
0x48, 0x8b, 0xec, //mov rbp,rsp
0x48, 0xc7, 0xc7, ic0, ic1, ic2, ic3, //mov rdi, "InstructionCount"
0x4c, 0x8b, 0x01, //mov r8,QWORD PTR[rcx]
0x4c, 0x8b, 0x49, 0x08, //mov r9,QWORD PTR[rcx+0x8]
0x4c, 0x8b, 0x51, 0x10, //mov r10,QWORD PTR[rcx+0x10]
0x4c, 0x8b, 0x59, 0x18, //mov r11,QWORD PTR[rcx+0x18]
0x4c, 0x8b, 0x61, 0x20, //mov r12,QWORD PTR[rcx+0x20]
0x4c, 0x8b, 0x69, 0x28, //mov r13,QWORD PTR[rcx+0x28]
0x4c, 0x8b, 0x71, 0x30, //mov r14,QWORD PTR[rcx+0x30]
0x4c, 0x8b, 0x79, 0x38, //mov r15,QWORD PTR[rcx+0x38]
0xc7, 0x44, 0x24, 0xf8, 0xc0, 0x9f, 0x00, //mov DWORD PTR[rsp-0x8],0x9fc0
0x00,
0x0f, 0xae, 0x54, 0x24, 0xf8, //ldmxcsr DWORD PTR[rsp-0x8]
0xf2, 0x4c, 0x0f, 0x2a, 0x41, 0x40, //cvtsi2sd xmm8,QWORD PTR[rcx+0x40]
0xf2, 0x4c, 0x0f, 0x2a, 0x49, 0x48, //cvtsi2sd xmm9,QWORD PTR[rcx+0x48]
0xf2, 0x48, 0x0f, 0x2a, 0x51, 0x50, //cvtsi2sd xmm2,QWORD PTR[rcx+0x50]
0xf2, 0x48, 0x0f, 0x2a, 0x59, 0x58, //cvtsi2sd xmm3,QWORD PTR[rcx+0x58]
0xf2, 0x48, 0x0f, 0x2a, 0x61, 0x60, //cvtsi2sd xmm4,QWORD PTR[rcx+0x60]
0xf2, 0x48, 0x0f, 0x2a, 0x69, 0x68, //cvtsi2sd xmm5,QWORD PTR[rcx+0x68]
0xf2, 0x48, 0x0f, 0x2a, 0x71, 0x70, //cvtsi2sd xmm6,QWORD PTR[rcx+0x70]
0xf2, 0x48, 0x0f, 0x2a, 0x79, 0x78, //cvtsi2sd xmm7,QWORD PTR[rcx+0x78]
};
const uint8_t epilogue[] = {
0x48, 0x8b, 0xe5, //mov rsp,rbp
0x59, //pop rcx
0x4c, 0x89, 0x01, //mov QWORD PTR [rcx],r8
0x4c, 0x89, 0x49, 0x08, //mov QWORD PTR [rcx+0x8],r9
0x4c, 0x89, 0x51, 0x10, //mov QWORD PTR [rcx+0x10],r10
0x4c, 0x89, 0x59, 0x18, //mov QWORD PTR [rcx+0x18],r11
0x4c, 0x89, 0x61, 0x20, //mov QWORD PTR [rcx+0x20],r12
0x4c, 0x89, 0x69, 0x28, //mov QWORD PTR [rcx+0x28],r13
0x4c, 0x89, 0x71, 0x30, //mov QWORD PTR [rcx+0x30],r14
0x4c, 0x89, 0x79, 0x38, //mov QWORD PTR [rcx+0x38],r15
0x66, 0x4c, 0x0f, 0x7e, 0x41, 0x40, //movq QWORD PTR [rcx+0x40],xmm8
0x66, 0x4c, 0x0f, 0x7e, 0x49, 0x48, //movq QWORD PTR [rcx+0x48],xmm9
0x66, 0x48, 0x0f, 0x7e, 0x51, 0x50, //movq QWORD PTR [rcx+0x50],xmm2
0x66, 0x48, 0x0f, 0x7e, 0x59, 0x58, //movq QWORD PTR [rcx+0x58],xmm3
0x66, 0x48, 0x0f, 0x7e, 0x61, 0x60, //movq QWORD PTR [rcx+0x60],xmm4
0x66, 0x48, 0x0f, 0x7e, 0x69, 0x68, //movq QWORD PTR [rcx+0x68],xmm5
0x66, 0x48, 0x0f, 0x7e, 0x71, 0x70, //movq QWORD PTR [rcx+0x70],xmm6
0x66, 0x48, 0x0f, 0x7e, 0x79, 0x78, //movq QWORD PTR [rcx+0x78],xmm7
#ifdef _WIN32
0xf3, 0x44, 0x0f, 0x6f, 0x0c, 0x24, //movdqu xmm9,XMMWORD PTR [rsp]
0xf3, 0x44, 0x0f, 0x6f, 0x44, 0x24, 0x10, //movdqu xmm8,XMMWORD PTR [rsp+0x10]
0xf3, 0x0f, 0x6f, 0x7c, 0x24, 0x20, //movdqu xmm7,XMMWORD PTR [rsp+0x20]
0xf3, 0x0f, 0x6f, 0x74, 0x24, 0x30, //movdqu xmm6,XMMWORD PTR [rsp+0x30]
0x48, 0x83, 0xc4, 0x48, //add rsp,0x48
#endif
0x41, 0x5f, //pop r15
0x41, 0x5e, //pop r14
0x41, 0x5d, //pop r13
0x41, 0x5c, //pop r12
#ifdef _WIN32
0x5e, //pop rsi
0x5f, //pop rdi
#endif
0x5d, //pop rbp
0x5b, //pop rbx
0xc3, //ret
};
//41 bytes -> 1 cache line
const uint8_t readDatasetSub[] = {
0x8b, 0x13, //mov edx,DWORD PTR [rbx]
0x48, 0x8b, 0x43, 0x08, //mov rax,QWORD PTR [rbx+0x8]
0x48, 0x8b, 0x04, 0x10, //mov rax,QWORD PTR [rax+rdx*1]
0x83, 0x03, 0x08, //add DWORD PTR [rbx],0x8
0x33, 0x4b, 0x04, //xor ecx,DWORD PTR [rbx+0x4]
0x89, 0x4b, 0x04, //mov DWORD PTR [rbx+0x4],ecx
0xf7, 0xc1, 0xf8, 0xff, 0x00, 0x00, //test ecx,0xfff8
0x75, 0x0d, //jne
0x83, 0xe1, 0xf8, //and ecx,0xfffffff8
0x89, 0x0b, //mov DWORD PTR [rbx],ecx
0x48, 0x8b, 0x53, 0x08, //mov rdx,QWORD PTR [rbx+0x8]
0x0f, 0x18, 0x0c, 0x0a, //prefetcht0 BYTE PTR [rdx+rcx*1]
0xc3, //ret
};
#include "JitCompilerX86-static.hpp"
// Returns how many CacheLineSize-sized lines are needed to hold `size` bytes,
// rounding any partial line up to a whole one.
constexpr int getNumCacheLines(size_t size) {
	return static_cast<int>((size + CacheLineSize - 1) / CacheLineSize);
}
// Start addresses of the statically assembled code fragments, viewed as raw
// bytes so that they can be measured (by pointer subtraction) and copied.
const uint8_t* codePrologue = reinterpret_cast<const uint8_t*>(&randomx_program_prologue);
const uint8_t* codeProgramBegin = reinterpret_cast<const uint8_t*>(&randomx_program_begin);
const uint8_t* codeEpilogue = reinterpret_cast<const uint8_t*>(&randomx_program_epilogue);
const uint8_t* codeReadDatasetR = reinterpret_cast<const uint8_t*>(&randomx_program_read_r);
const uint8_t* codeReadDatasetF = reinterpret_cast<const uint8_t*>(&randomx_program_read_f);
const uint8_t* codeProgramEnd = reinterpret_cast<const uint8_t*>(&randomx_program_end);
// Rounds `pos` up to the nearest multiple of `align`.
//
// pos:   offset to round; offsets are expected to be non-negative, and any
//        non-positive value yields 0.
// align: alignment step, must be positive.
//
// A position of 0 is already aligned and stays 0. The plain formula
// ((pos - 1) / align + 1) * align would return `align` for it, because
// (0 - 1) / align truncates toward zero.
constexpr int32_t align(int32_t pos, int32_t align) {
	return pos <= 0 ? 0 : ((pos - 1) / align + 1) * align;
}
// Byte sizes of the static code fragments. Each size is the distance from a
// fragment's start symbol to the next symbol, so these values rely on the
// symbols being laid out in the order prologue, begin, epilogue, read_r,
// read_f, end.
const int32_t prologueSize = static_cast<int32_t>(codeProgramBegin - codePrologue);
const int32_t epilogueSize = static_cast<int32_t>(codeReadDatasetR - codeEpilogue);
const int32_t readDatasetRSize = static_cast<int32_t>(codeReadDatasetF - codeReadDatasetR);
const int32_t readDatasetFSize = static_cast<int32_t>(codeProgramEnd - codeReadDatasetF);
constexpr int32_t readDatasetSubOffset = CodeSize - CacheLineSize * getNumCacheLines(sizeof(readDatasetSub));
constexpr int32_t epilogueOffset = readDatasetSubOffset - CacheLineSize * getNumCacheLines(sizeof(epilogue));
constexpr int32_t startOffsetAligned = align(sizeof(prologue), CacheLineSize);
const int32_t readDatasetFOffset = CodeSize - readDatasetFSize;
const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
const int32_t epilogueOffset = readDatasetROffset - epilogueSize;
JitCompilerX86::JitCompilerX86() {
#ifdef _WIN32
@ -213,24 +120,16 @@ namespace RandomX {
if (code == (uint8_t*)-1)
throw std::runtime_error("mmap failed");
#endif
memcpy(code, prologue, sizeof(prologue));
codePos = sizeof(prologue);
if (startOffsetAligned - codePos > 4) {
emitByte(0xeb);
emitByte(startOffsetAligned - (codePos + 1));
}
else {
while (codePos < startOffsetAligned)
emitByte(0x90); //nop
}
memcpy(code + readDatasetSubOffset, readDatasetSub, sizeof(readDatasetSub));
memcpy(code + epilogueOffset, epilogue, sizeof(epilogue));
memcpy(code, codePrologue, prologueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
instructionOffsets.clear();
callOffsets.clear();
codePos = startOffsetAligned;
codePos = prologueSize;
Instruction instr;
for (unsigned i = 0; i < ProgramLength; ++i) {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
@ -247,7 +146,6 @@ namespace RandomX {
instructionOffsets.push_back(codePos);
emit(0x840fcfff); //dec edx; jz <epilogue>
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
gena(instr);
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
@ -258,11 +156,10 @@ namespace RandomX {
}
}
void JitCompilerX86::gena(Instruction& instr) {
void JitCompilerX86::genar(Instruction& instr) {
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
int32_t pc;
switch (instr.loca & 7)
{
case 0:
@ -272,7 +169,7 @@ namespace RandomX {
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetSubOffset - (codePos + 4));
emit(readDatasetROffset - (codePos + 4));
return;
case 4:
@ -293,8 +190,44 @@ namespace RandomX {
}
}
void JitCompilerX86::genaf(Instruction& instr) {
	//Emits the floating point variant of the operand A read.
	//First the address register is mixed with addra, then - depending on the low
	//3 bits of loca - the value comes from the dataset (result of the read_f
	//subroutine) or is converted from the scratchpad into xmm0.
	const auto regA = instr.rega % RegistersCount;
	const auto source = instr.loca & 7;

	//xor rega, addra
	emit(uint16_t(0x8149));
	emitByte(0xf0 + regA);
	emit(instr.addra);

	//every variant continues with a 32-bit mov out of rega (REX.B + 8b)
	emit(uint16_t(0x8b41));

	if (source <= 3) {
		//dataset: mov ecx, rega; call <read_f subroutine>
		emitByte(0xc8 + regA);
		emitByte(0xe8);
		emit(readDatasetFOffset - (codePos + 4)); //call displacement, relative to the end of the call
		return;
	}

	//scratchpad: mov eax, rega; and eax, <mask>
	emitByte(0xc0 + regA);
	emitByte(0x25);
	if (source == 4) {
		emit(ScratchpadL2 - 1); //whole scratchpad
	}
	else {
		emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
	}
	emitByte(0xf3);
	emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
}
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
if ((instr.locb & 7) <= 5) {
if ((instr.locb & 7) <= 3) {
emit(uint16_t(0x8b49)); //mov
emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
emitByte(0x48); //REX.W
@ -330,126 +263,117 @@ namespace RandomX {
}
void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) {
emit(0x48f2fffff8002548); //and rax,0xfffffffffffff800; cvtsi2sd xmm0,rax
emit(uint16_t(0x2a0f));
emitByte(0xc0);
if ((instr.locb & 7) <= 5) {
int regb = (instr.regb % RegistersCount);
emitByte(0xf2); //xxxsd xmm0,regb
if (regb <= 1) {
emitByte(0x41); //REX
}
emitByte(0x0f);
emitByte(opcode);
emitByte(0xc0 + regb);
}
else {
convertible_t bimm;
bimm.f64 = (double)instr.imm32;
emit(uint16_t(0xb848)); //movabs rax,imm64
emit(bimm.i64);
emitByte(0x66); //movq xmm1,rax
emit(0xc86e0f48);
emit(uint16_t(0x0ff2)); //xxxsd xmm0,xmm1
emitByte(opcode);
emitByte(0xc1);
int regb = (instr.regb % RegistersCount);
emitByte(0x66); //xxxpd xmm0,regb
if (regb <= 1) {
emitByte(0x41); //REX
}
emitByte(0x0f);
emitByte(opcode);
emitByte(0xc0 + regb);
}
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) {
	//Emits code that writes the integer result (rax) into the scratchpad at
	//index ((regc ^ addrc) & (scratchpadSize - 1)), 8 bytes per slot.
	const auto regC = instr.regc % RegistersCount;
	const auto indexMask = scratchpadSize - 1;

	emit(0x41c88b48); //mov rcx, rax; followed by the REX prefix of the next mov
	emitByte(0x8b); //mov eax, regc
	emitByte(0xc0 + regC);
	emitByte(0x35); //xor eax, addrc
	emit(instr.addrc);
	emitByte(0x25); //and eax, scratchpadSize - 1
	emit(indexMask);
	emit(0xc60c8948); //mov QWORD PTR [rsi+rax*8],rcx
}
void JitCompilerX86::gencr(Instruction& instr) {
switch (instr.locc & 7)
{
case 0:
emit(0x41c88b48); //mov rcx, rax; REX
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
break;
case 0:
scratchpadStoreR(instr, ScratchpadL2);
break;
case 1:
case 2:
case 3:
emit(0x41c88b48); //mov rcx, rax; REX
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
break;
case 1:
case 2:
case 3:
scratchpadStoreR(instr, ScratchpadL1);
break;
default:
emit(uint16_t(0x8b4c)); //mov
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
break;
default:
emit(uint16_t(0x8b4c)); //mov
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
break;
}
}
void JitCompilerX86::gencf(Instruction& instr) {
int regc = (instr.regc % RegistersCount);
switch (instr.locc & 7)
{
case 0:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + regc); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(uint16_t(0x4866)); //prefix
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
break;
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + regc); //eax, regc
emitByte(0x35); // xor eax
emit(instr.addrc);
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(uint16_t(0x4866)); //prefix
emit(0xc6047e0f); // movq QWORD PTR [rsi+rax*8],xmm0
break;
void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) {
	//Emits code that writes one 64-bit half of floating point register regc into
	//the scratchpad at index ((regc ^ addrc) & (scratchpadSize - 1)).
	//storeHigh selects the upper half (movhpd), otherwise the lower half (movlpd).
	const auto indexMask = scratchpadSize - 1;

	//mov eax, regc (integer register with the same index)
	emit(uint16_t(0x8b41));
	emitByte(0xc0 + regc);
	//xor eax, addrc
	emitByte(0x35);
	emit(instr.addrc);
	//and eax, scratchpadSize - 1
	emitByte(0x25);
	emit(indexMask);

	//movhpd/movlpd QWORD PTR [rsi+rax*8], regc
	emitByte(0x66);
	const bool needsRex = regc <= 1;
	if (needsRex) {
		emitByte(0x44); //REX
	}
	emitByte(0x0f);
	if (storeHigh) {
		emitByte(0x17);
	}
	else {
		emitByte(0x13);
	}
	emitByte(4 + 8 * regc); //ModRM: register field = regc, memory operand through SIB
	emitByte(0xc6); //SIB: [rsi+rax*8]
}
default:
emitByte(0xf2);
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) {
int regc = (instr.regc % RegistersCount);
if (!alwaysLow) {
if (regc <= 1) {
emitByte(0x44); //REX
}
emit(uint16_t(0x100f)); //movsd
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
break;
}
switch (instr.locc & 7)
{
case 4:
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8));
break;
case 5:
case 6:
case 7:
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
break;
default:
break;
}
}
//ADD_64 handler: genar (integer operand A read), genbr1 (operation with operand B), gencr (store of the result).
//Opcode words as laid out in x86 memory: 0x0349 -> 49 03 (add r64, r/m64), 0x0548 -> 48 05 (add rax, imm32).
//NOTE(review): genbr1 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x0349, 0x0548);
gencr(instr);
}
//ADD_32 handler: genar (integer operand A read), genbr132 (operation with operand B), gencr (store of the result).
//Opcodes as laid out in x86 memory: 0x0341 -> 41 03 (add r32, r/m32), 0x05 (add eax, imm32).
//NOTE(review): genbr132 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x0341, 0x05);
gencr(instr);
}
//SUB_64 handler: genar (integer operand A read), genbr1 (operation with operand B), gencr (store of the result).
//Opcode words as laid out in x86 memory: 0x2b49 -> 49 2b (sub r64, r/m64), 0x2d48 -> 48 2d (sub rax, imm32).
//NOTE(review): genbr1 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x2b49, 0x2d48);
gencr(instr);
}
//SUB_32 handler: genar (integer operand A read), genbr132 (operation with operand B), gencr (store of the result).
//Opcodes as laid out in x86 memory: 0x2b41 -> 41 2b (sub r32, r/m32), 0x2d (sub eax, imm32).
//NOTE(review): genbr132 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x2b41, 0x2d);
gencr(instr);
}
void JitCompilerX86::h_MUL_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) {
emitByte(0x49); //REX
emit(uint16_t(0xaf0f)); // imul rax, r64
@ -464,6 +388,7 @@ namespace RandomX {
}
void JitCompilerX86::h_MULH_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b49)); //mov rcx, r64
emitByte(0xc8 + (instr.regb % RegistersCount));
@ -481,6 +406,7 @@ namespace RandomX {
}
void JitCompilerX86::h_MUL_32(Instruction& instr, int i) {
genar(instr);
emit(uint16_t(0xc88b)); //mov ecx, eax
if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b41)); // mov eax, r32
@ -495,6 +421,7 @@ namespace RandomX {
}
void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) {
genar(instr);
emitByte(0x48);
emit(uint16_t(0xc863)); //movsxd rcx,eax
if ((instr.locb & 7) <= 5) {
@ -511,6 +438,7 @@ namespace RandomX {
}
void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b49)); //mov rcx, r64
emitByte(0xc8 + (instr.regb % RegistersCount));
@ -528,6 +456,7 @@ namespace RandomX {
}
void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) {
emitByte(0xb9); //mov ecx, 1
emit(1);
@ -546,6 +475,7 @@ namespace RandomX {
}
void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
genar(instr);
if ((instr.locb & 7) <= 5) {
emit(uint16_t(0x8b41)); //mov edx, r32
emitByte(0xd0 + (instr.regb % RegistersCount));
@ -563,100 +493,127 @@ namespace RandomX {
}
//AND_64 handler: genar (integer operand A read), genbr1 (operation with operand B), gencr (store of the result).
//Opcode words as laid out in x86 memory: 0x2349 -> 49 23 (and r64, r/m64), 0x2548 -> 48 25 (and rax, imm32).
//NOTE(review): genbr1 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x2349, 0x2548);
gencr(instr);
}
//AND_32 handler: genar (integer operand A read), genbr132 (operation with operand B), gencr (store of the result).
//Opcodes as laid out in x86 memory: 0x2341 -> 41 23 (and r32, r/m32), 0x25 (and eax, imm32).
//NOTE(review): genbr132 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x2341, 0x25);
gencr(instr);
}
//OR_64 handler: genar (integer operand A read), genbr1 (operation with operand B), gencr (store of the result).
//Opcode words as laid out in x86 memory: 0x0b49 -> 49 0b (or r64, r/m64), 0x0d48 -> 48 0d (or rax, imm32).
//NOTE(review): genbr1 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x0b49, 0x0d48);
gencr(instr);
}
//OR_32 handler: genar (integer operand A read), genbr132 (operation with operand B), gencr (store of the result).
//Opcodes as laid out in x86 memory: 0x0b41 -> 41 0b (or r32, r/m32), 0x0d (or eax, imm32).
//NOTE(review): genbr132 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x0b41, 0x0d);
gencr(instr);
}
//XOR_64 handler: genar (integer operand A read), genbr1 (operation with operand B), gencr (store of the result).
//Opcode words as laid out in x86 memory: 0x3349 -> 49 33 (xor r64, r/m64), 0x3548 -> 48 35 (xor rax, imm32).
//NOTE(review): genbr1 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
genar(instr);
genbr1(instr, 0x3349, 0x3548);
gencr(instr);
}
//XOR_32 handler: genar (integer operand A read), genbr132 (operation with operand B), gencr (store of the result).
//Opcodes as laid out in x86 memory: 0x3341 -> 41 33 (xor r32, r/m32), 0x35 (xor eax, imm32).
//NOTE(review): genbr132 is not visible here; register-form/immediate-form roles assumed from genbr0 - confirm. `i` is unused.
void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
genar(instr);
genbr132(instr, 0x3341, 0x35);
gencr(instr);
}
//SHL_64 handler: genar (integer operand A read), genbr0 (shift by operand B), gencr (store of the result).
//genbr0 takes (opcodeReg, opcodeImm); as laid out in x86 memory: 0xe0d3 -> d3 e0 (shl, count in cl),
//0xe0c1 -> c1 e0 (shl, count in imm8). `i` is unused.
void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xe0d3, 0xe0c1);
gencr(instr);
}
//SHR_64 handler: genar (integer operand A read), genbr0 (shift by operand B), gencr (store of the result).
//genbr0 takes (opcodeReg, opcodeImm); as laid out in x86 memory: 0xe8d3 -> d3 e8 (shr, count in cl),
//0xe8c1 -> c1 e8 (shr, count in imm8). `i` is unused.
void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xe8d3, 0xe8c1);
gencr(instr);
}
//SAR_64 handler: genar (integer operand A read), genbr0 (shift by operand B), gencr (store of the result).
//genbr0 takes (opcodeReg, opcodeImm); as laid out in x86 memory: 0xf8d3 -> d3 f8 (sar, count in cl),
//0xf8c1 -> c1 f8 (sar, count in imm8). `i` is unused.
void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xf8d3, 0xf8c1);
gencr(instr);
}
//ROL_64 handler: genar (integer operand A read), genbr0 (rotate by operand B), gencr (store of the result).
//genbr0 takes (opcodeReg, opcodeImm); as laid out in x86 memory: 0xc0d3 -> d3 c0 (rol, count in cl),
//0xc0c1 -> c1 c0 (rol, count in imm8). `i` is unused.
void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xc0d3, 0xc0c1);
gencr(instr);
}
//ROR_64 handler: genar (integer operand A read), genbr0 (rotate by operand B), gencr (store of the result).
//genbr0 takes (opcodeReg, opcodeImm); as laid out in x86 memory: 0xc8d3 -> d3 c8 (ror, count in cl),
//0xc8c1 -> c1 c8 (ror, count in imm8). `i` is unused.
void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
genar(instr);
genbr0(instr, 0xc8d3, 0xc8c1);
gencr(instr);
}
//FPADD handler: genaf (floating point operand A read into xmm0), genbf (operation with register regb), gencf (store of the result).
//0x58 is the second opcode byte of the SSE2 add family (0f 58); the prefix that genbf emits selects the scalar or packed form.
//`i` is unused.
void JitCompilerX86::h_FPADD(Instruction& instr, int i) {
genaf(instr);
genbf(instr, 0x58);
gencf(instr);
}
//FPSUB handler: genaf (floating point operand A read into xmm0), genbf (operation with register regb), gencf (store of the result).
//0x5c is the second opcode byte of the SSE2 subtract family (0f 5c); the prefix that genbf emits selects the scalar or packed form.
//`i` is unused.
void JitCompilerX86::h_FPSUB(Instruction& instr, int i) {
genaf(instr);
genbf(instr, 0x5c);
gencf(instr);
}
void JitCompilerX86::h_FPMUL(Instruction& instr, int i) {
emit(uint16_t(0x0d48)); //or rax,0x800
emit(0x00000800);
genaf(instr);
genbf(instr, 0x59);
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
emit(uint16_t(0x540f)); //andps xmm0,xmm1
emitByte(0xc1);
gencf(instr);
}
void JitCompilerX86::h_FPDIV(Instruction& instr, int i) {
emit(uint16_t(0x0d48)); //or rax,0x800
emit(0x00000800);
genaf(instr);
genbf(instr, 0x5e);
emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
emit(uint16_t(0x540f)); //andps xmm0,xmm1
emitByte(0xc1);
gencf(instr);
}
void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) {
emit(uint16_t(0xb948)); //or movabs rcx, imm64
emit(0x7ffffffffffff800);
emit(0xc02a0f48f2c12348); //and rax,rcx; cvtsi2sd xmm0,rax
emit(0xc0510ff2); //sqrtsd xmm0,xmm0
genaf(instr);
emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0
gencf(instr);
}
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
genar(instr);
emit(0x81480de0c1c88b48);
emit(0x600025fffff800e1);
emit(0x0dc12a0f48f20000);
emit(uint16_t(0x0000));
emitByte(0xf2);
int regc = (instr.regc % RegistersCount);
if (regc <= 1) {
emitByte(0x4c); //REX
}
else {
emitByte(0x48); //REX
}
emit(uint16_t(0x2a0f));
emitByte(0xc1 + 8 * regc);
emitByte(0x0d);
emit(0xf824448900009fc0);
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf8);
gencf(instr);
gencf(instr, true);
}
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@ -682,6 +639,7 @@ namespace RandomX {
}
void JitCompilerX86::h_CALL(Instruction& instr, int i) {
genar(instr);
emit(uint16_t(0x8141)); //cmp regb, imm32
emitByte(0xf8 + (instr.regb % RegistersCount));
emit(instr.imm32);
@ -707,6 +665,7 @@ namespace RandomX {
}
void JitCompilerX86::h_RET(Instruction& instr, int i) {
genar(instr);
int crlen = 0;
if ((instr.locc & 7) <= 3) {
crlen = 17;
@ -756,4 +715,6 @@ namespace RandomX {
INST_HANDLE(CALL)
INST_HANDLE(RET)
};
#endif
}

@ -58,13 +58,16 @@ namespace RandomX {
std::vector<int32_t> instructionOffsets;
std::vector<CallOffset> callOffsets;
void gena(Instruction&);
void genar(Instruction&);
void genaf(Instruction&);
void genbr0(Instruction&, uint16_t, uint16_t);
void genbr1(Instruction&, uint16_t, uint16_t);
void genbr132(Instruction&, uint16_t, uint8_t);
void genbf(Instruction&, uint8_t);
void scratchpadStoreR(Instruction&, uint32_t);
void scratchpadStoreF(Instruction&, int, uint32_t, bool);
void gencr(Instruction&);
void gencf(Instruction&);
void gencf(Instruction&, bool);
void generateCode(Instruction&, int);
void fixCallOffsets();

@ -21,33 +21,36 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <iomanip>
#include <limits>
#include "instructions.hpp"
#include "Pcg32.hpp"
//#define DEBUG
using namespace RandomX;
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
typedef void(*FpuOperation)(convertible_t&, fpu_reg_t&, fpu_reg_t&);
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
convertible_t a, b, c;
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
uint64_t rxRound(uint32_t mode, int64_t x, int64_t y, FpuOperation op, bool hiEqualsLo = true) {
convertible_t a;
fpu_reg_t b, c;
a.u64 = mode;
FPROUND(a, b, c);
#ifdef DEBUG
a.f64 = convertToDouble(x);
b.f64 = convertToDouble(y);
std::cout << std::hex << (uint64_t)x << " -> " << a.u64 << std::endl;
std::cout << std::hex << (uint64_t)y << " -> " << b.u64 << std::endl;
std::cout << std::dec;
#endif
a.i64 = x;
b.i64 = y;
if (hiEqualsLo) {
a.i32lo = x;
a.i32hi = x;
}
else {
a.i64 = x;
}
b.lo.i64 = y;
b.hi.i64 = y;
op(a, b, c);
return c.u64;
if (hiEqualsLo) {
CHECK(c.lo.u64 == c.hi.u64);
}
return c.lo.u64;
}
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
#define RX_EXECUTE_U64(va, vb, INST) do { \
a.u64 = va; \
b.u64 = vb; \
@ -273,118 +276,126 @@ TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
TEST_CASE("Denormal results are not produced", "[FTZ]") {
FPINIT();
convertible_t a, b, c;
a.i64 = 2048;
FPDIV(a, DBL_MAX, c);
convertible_t a;
fpu_reg_t b;
a.i64 = 1;
b.lo.f64 = DBL_MAX;
FPDIV(a, b, b);
#ifdef DEBUG
std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << c.u64 << std::endl;
std::cout << a.i64 << " / " << DBL_MAX << " = " << std::hex << b.lo.u64 << std::endl;
#endif
REQUIRE(std::fpclassify(c.f64) != FP_SUBNORMAL);
b.f64 = c.f64;
CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
a.i64 = 0;
FPSUB_64(a, b, c);
FPSUB(a, b, b);
#ifdef DEBUG
std::cout << a.i64 << " - " << b.f64 << " = " << std::hex << c.u64 << std::endl;
std::cout << a.i64 << " - " << b.lo.f64 << " = " << std::hex << b.lo.u64 << std::endl;
#endif
CHECK(std::fpclassify(c.f64) != FP_SUBNORMAL);
CHECK(std::fpclassify(b.lo.f64) != FP_SUBNORMAL);
}
TEST_CASE("NaN results are not produced", "[NAN]") {
FPINIT();
convertible_t a, c;
convertible_t a;
fpu_reg_t b;
a.i64 = 0;
FPDIV(a, 0, c);
CHECK(std::fpclassify(c.f64) != FP_NAN);
FPMUL(a, std::numeric_limits<double>::infinity(), c);
CHECK(std::fpclassify(c.f64) != FP_NAN);
b.lo.f64 = 0;
FPDIV(a, b, b);
CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
b.lo.f64 = std::numeric_limits<double>::infinity();
FPMUL(a, b, b);
CHECK(std::fpclassify(b.lo.f64) != FP_NAN);
}
volatile int64_t fpAdda = 7379480244170225589;
volatile int64_t fpAddb = -438072579179686797;
volatile int64_t fpSuba = 2939258788088626026;
volatile int64_t fpSubb = 4786131045320678734;
volatile int64_t fpMula1 = 8399833736388895639;
volatile int64_t fpMulb1 = 5671608020317594922;
volatile int64_t fpMula2 = -7094299423744805450;
volatile int64_t fpMulb2 = 4982086006202596504;
volatile int64_t fpDiva1 = 8399833736388895639;
volatile int64_t fpDivb1 = 5671608020317594922;
volatile int64_t fpDiva2 = -7434878587645025912;
volatile int64_t fpDivb2 = 5266243837734830806;
volatile int64_t fpSqrta = -7594301562963134542;
volatile int64_t fpRounda = 7379480244170225589;
volatile int32_t fpAdda = -2110701072;
volatile int64_t fpAddb = 5822431907862180274;
volatile int32_t fpSuba = -1651770302;
volatile int64_t fpSubb = 4982086006202596504;
volatile int32_t fpMula1 = 122885310;
volatile int64_t fpMulb1 = 6036690890763685020;
volatile int32_t fpMula2 = -1952486466;
volatile int64_t fpMulb2 = 5693689137909219638;
volatile int32_t fpDiva1 = -1675630642;
volatile int64_t fpDivb1 = -3959960229647489051;
volatile int32_t fpDiva2 = -1651770302;
volatile int64_t fpDivb2 = 4982086006202596504;
volatile int32_t fpSqrta1 = 440505508;
volatile int32_t fpSqrta2 = -2147483648;
TEST_CASE("IEEE-754 compliance", "[FPU]") {
FPINIT();
convertible_t a, b, c;
convertible_t a;
fpu_reg_t b, c;
b.lo.f64 = 0.0;
a.i64 = 2048;
FPDIV(a, 0, c);
CHECK(c.f64 == std::numeric_limits<double>::infinity());
a.i64 = 1;
FPDIV(a, b, c);
CHECK(c.lo.f64 == std::numeric_limits<double>::infinity());
a.i64 = -2048;
FPDIV(a, 0, c);
CHECK(c.f64 == -std::numeric_limits<double>::infinity());
a.i64 = -1;
FPDIV(a, b, c);
CHECK(c.lo.f64 == -std::numeric_limits<double>::infinity());
#ifdef DEBUG
std::cout << "FPROUND" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundDown, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundUp, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundToZero, fpAdda, 0, &FPROUND) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundToNearest, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundDown, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundUp, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundToZero, fpSuba, 0, &FPROUND) == 0x43c4652c25bf7bdcU);
CHECK(rxRound(RoundToNearest, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundDown, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundUp, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
CHECK(rxRound(RoundToZero, fpRounda, 0, &FPROUND, false) == 0x43d99a4b8bc531dcU);
#ifdef DEBUG
std::cout << "FPADD" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d473U);
CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD_64) == 0xf9eba74f6c27d472U);
CHECK(rxRound(RoundToNearest, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
CHECK(rxRound(RoundDown, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
CHECK(rxRound(RoundUp, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b2U);
CHECK(rxRound(RoundToZero, fpAdda, fpAddb, &FPADD) == 0x50cd6ef8bd0671b1U);
#ifdef DEBUG
std::cout << "FPSUB" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c49U);
CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB_64) == 0x43c4652bb6bc2c48U);
CHECK(rxRound(RoundToNearest, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
CHECK(rxRound(RoundDown, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c99U);
CHECK(rxRound(RoundUp, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
CHECK(rxRound(RoundToZero, fpSuba, fpSubb, &FPSUB) == 0xc523ecd390267c98U);
#ifdef DEBUG
std::cout << "FPMUL" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e9U);
CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL_64) == 0x52a3abbb1677f3e8U);
CHECK(rxRound(RoundToNearest, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
CHECK(rxRound(RoundDown, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
CHECK(rxRound(RoundUp, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24542U);
CHECK(rxRound(RoundToZero, fpMula1, fpMulb1, &FPMUL) == 0x5574b924d2f24541U);
CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c583U);
CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL_64) == 0xc90ea6c25e29c582U);
CHECK(rxRound(RoundToNearest, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
CHECK(rxRound(RoundDown, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a7470U);
CHECK(rxRound(RoundUp, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
CHECK(rxRound(RoundToZero, fpMula2, fpMulb2, &FPMUL) == 0xd0f23a18891a746fU);
#ifdef DEBUG
std::cout << "FPDIV" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81cU);
CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV_64) == 0x3515967d3015e81bU);
CHECK(rxRound(RoundToNearest, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
CHECK(rxRound(RoundDown, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
CHECK(rxRound(RoundUp, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb0aU);
CHECK(rxRound(RoundToZero, fpDiva1, fpDivb1, &FPDIV) == 0x38bd2a7732b5eb09U);
CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fccU);
CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV_64) == 0xbab33c30b92b8fcbU);
CHECK(rxRound(RoundToNearest, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
CHECK(rxRound(RoundDown, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71cU);
CHECK(rxRound(RoundUp, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
CHECK(rxRound(RoundToZero, fpDiva2, fpDivb2, &FPDIV) == 0xbca3c3c039ccc71bU);
#ifdef DEBUG
std::cout << "FPSQRT" << std::endl;
#endif
CHECK(rxRound(RoundToNearest, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
CHECK(rxRound(RoundDown, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
CHECK(rxRound(RoundUp, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2dU);
CHECK(rxRound(RoundToZero, fpSqrta, 0, &FPSQRT) == 0x41d304e3fcc31a2cU);
CHECK(rxRound(RoundToNearest, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
CHECK(rxRound(RoundDown, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
CHECK(rxRound(RoundUp, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19dU);
CHECK(rxRound(RoundToZero, fpSqrta1, 0, &FPSQRT) == 0x40d47f0e46ebc19cU);
CHECK(rxRound(RoundToNearest, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
CHECK(rxRound(RoundDown, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
CHECK(rxRound(RoundUp, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bcdU);
CHECK(rxRound(RoundToZero, fpSqrta2, 0, &FPSQRT) == 0x40e6a09e667f3bccU);
}

@ -24,8 +24,19 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "t1ha/t1ha.h"
#include "blake2/blake2.h"
#include <cstring>
#include <iomanip>
//Dumps a register file to a stream, one line per integer register and two lines
//(high half, then low half) per floating point register. Integer bit patterns are
//written in hexadecimal; each half of an f register is followed by its value as a
//double in parentheses. The stream is switched back to decimal after every register.
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		os << std::hex << "r" << idx << " = " << rf.r[idx].u64 << std::endl;
		os << std::dec;
	}
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		const auto& upper = rf.f[idx].hi;
		const auto& lower = rf.f[idx].lo;
		os << std::hex;
		os << "f" << idx << " = " << upper.u64 << " (" << upper.f64 << ")" << std::endl;
		os << " = " << lower.u64 << " (" << lower.f64 << ")" << std::endl;
		os << std::dec;
	}
	return os;
}
namespace RandomX {
VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
mem.ds.dataset = nullptr;
}
@ -83,9 +94,10 @@ namespace RandomX {
}
void VirtualMachine::getResult(void* out) {
uint64_t smallState[sizeof(RegisterFile) / sizeof(uint64_t) + 2];
constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2;
uint64_t smallState[smallStateLength];
memcpy(smallState, &reg, sizeof(RegisterFile));
smallState[17] = t1ha2_atonce128(&smallState[16], scratchpad, ScratchpadSize, reg.r[0].u64);
smallState[smallStateLength - 1] = t1ha2_atonce128(&smallState[smallStateLength - 2], scratchpad, ScratchpadSize, reg.r[0].u64);
blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0);
}
}

@ -32,11 +32,14 @@ namespace RandomX {
virtual void initializeProgram(const void* seed) = 0;
virtual void execute() = 0;
void getResult(void*);
//Read-only access to the register file held by this VM (member `reg`).
const RegisterFile& getRegisterFile() {
return reg;
}
protected:
bool softAes, lightClient;
RegisterFile reg;
MemoryRegisters mem;
DatasetReadFunc readDataset;
alignas(16) RegisterFile reg;
MemoryRegisters mem;
alignas(16) convertible_t scratchpad[ScratchpadLength];
};
}

@ -0,0 +1,12 @@
;# Program epilogue, System V AMD64 variant: the shared register store code runs first,
;# then the registers pushed by the matching prologue are popped in reverse push order.
#include "program_epilogue_store.inc"
;# restore callee-saved registers - System V AMD64 ABI
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
;# program finished
ret 0

@ -0,0 +1,22 @@
;# Shared epilogue part: writes the VM registers back to the register file.
;# rbp was set to rsp by the prologue right after the RegisterFile pointer was pushed,
;# so after restoring rsp the pointer is on top of the stack.
;# unroll VM stack
mov rsp, rbp
;# save VM register values
pop rcx ;# rcx = RegisterFile pointer
;# integer registers r0-r7 occupy bytes 0-63
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
;# floating point registers f0-f3, 16 bytes each; movdqa faults unless the address is 16-byte aligned
movdqa xmmword ptr [rcx+64], xmm8
movdqa xmmword ptr [rcx+80], xmm9
movdqa xmmword ptr [rcx+96], xmm2
movdqa xmmword ptr [rcx+112], xmm3
;# advance the base by 64 so that f4-f7 reuse the same displacements
;# (presumably to keep them within the signed 8-bit range - confirm)
lea rcx, [rcx+64]
movdqa xmmword ptr [rcx+64], xmm4
movdqa xmmword ptr [rcx+80], xmm5
movdqa xmmword ptr [rcx+96], xmm6
movdqa xmmword ptr [rcx+112], xmm7

@ -0,0 +1,20 @@
;# Program epilogue, Microsoft x64 variant: the shared register store code runs first,
;# then xmm6-xmm10 are reloaded from the 80-byte save area and the general purpose
;# registers are popped in reverse push order of the matching prologue.
include program_epilogue_store.inc
;# restore callee-saved registers - Microsoft x64 calling convention
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80 ;# release the xmm save area
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
;# program finished
ret 0

@ -0,0 +1,17 @@
;# Program prologue, System V AMD64 variant: saves the callee-saved registers,
;# moves the three pointer arguments (rdi, rsi, rdx) to the registers used by the
;# generated code and jumps to the start of the program.
;# callee-saved registers - System V AMD64 ABI
push rbx
push rbp
push r12
push r13
push r14
push r15
;# function arguments
push rdi ;# RegisterFile& registerFile
mov rbx, rsi ;# MemoryRegisters& memory
mov rsi, rdx ;# convertible_t* scratchpad
mov rcx, rdi ;# the shared load code addresses the register file through rcx
#include "program_prologue_load.inc"
jmp randomx_program_begin

@ -0,0 +1,63 @@
;# Shared prologue part: sets up the VM stack base, the instruction counter, the
;# absolute value mask and MXCSR, then loads the VM registers from the register file
;# addressed by rcx.
mov rbp, rsp ;# beginning of VM stack
mov rdi, 1048577 ;# number of VM instructions to execute + 1
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10 ;# 0.0 == 0.0 in both lanes -> all bits set
psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
;# reset rounding mode
;# 40896 = 0x9FC0: all exception masks set (0x1F80), flush-to-zero (0x8000),
;# denormals-are-zero (0x0040), rounding control bits 00 = round to nearest
;# NOTE(review): the scratch dword lives below rsp; this is covered by the red zone of the
;# System V ABI, the Microsoft x64 convention defines no such area - confirm this is acceptable there
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
;# load integer registers
mov r8, qword ptr [rcx+0]
mov r9, qword ptr [rcx+8]
mov r10, qword ptr [rcx+16]
mov r11, qword ptr [rcx+24]
mov r12, qword ptr [rcx+32]
mov r13, qword ptr [rcx+40]
mov r14, qword ptr [rcx+48]
mov r15, qword ptr [rcx+56]
;# initialize floating point registers
;# pattern per register: convert the qword at the higher address into the low lane,
;# shift it into the high lane (pslldq by 8 bytes), then convert the qword at the
;# lower address into the low lane (cvtsi2sd leaves the high lane untouched)
xorps xmm8, xmm8
cvtsi2sd xmm8, qword ptr [rcx+72]
pslldq xmm8, 8
cvtsi2sd xmm8, qword ptr [rcx+64]
xorps xmm9, xmm9
cvtsi2sd xmm9, qword ptr [rcx+88]
pslldq xmm9, 8
cvtsi2sd xmm9, qword ptr [rcx+80]
xorps xmm2, xmm2
cvtsi2sd xmm2, qword ptr [rcx+104]
pslldq xmm2, 8
cvtsi2sd xmm2, qword ptr [rcx+96]
xorps xmm3, xmm3
cvtsi2sd xmm3, qword ptr [rcx+120]
pslldq xmm3, 8
cvtsi2sd xmm3, qword ptr [rcx+112]
;# advance the base by 64; the next four registers reuse the displacements above
lea rcx, [rcx+64]
xorps xmm4, xmm4
cvtsi2sd xmm4, qword ptr [rcx+72]
pslldq xmm4, 8
cvtsi2sd xmm4, qword ptr [rcx+64]
xorps xmm5, xmm5
cvtsi2sd xmm5, qword ptr [rcx+88]
pslldq xmm5, 8
cvtsi2sd xmm5, qword ptr [rcx+80]
xorps xmm6, xmm6
cvtsi2sd xmm6, qword ptr [rcx+104]
pslldq xmm6, 8
cvtsi2sd xmm6, qword ptr [rcx+96]
xorps xmm7, xmm7
cvtsi2sd xmm7, qword ptr [rcx+120]
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]

@ -0,0 +1,24 @@
;# Program prologue, Microsoft x64 variant: saves the callee-saved general purpose
;# registers and xmm6-xmm10, moves the pointer arguments (rdx, r8) to the registers
;# used by the generated code and jumps to the start of the program.
;# The register file pointer already arrives in rcx, which the shared load code uses.
;# callee-saved registers - Microsoft x64 calling convention
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
sub rsp, 80 ;# save area for five xmm registers, 16 bytes each
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
;# function arguments
push rcx ;# RegisterFile& registerFile
mov rbx, rdx ;# MemoryRegisters& memory
mov rsi, r8 ;# convertible_t* scratchpad
include program_prologue_load.inc
jmp randomx_program_begin

@ -0,0 +1,13 @@
;# Dataset read, floating point variant.
;# In:  rbx = pointer to the memory registers (+0 ma, +4 mx, +8 dataset pointer), ecx = value to fold into mx
;# Out: xmm0 = the two signed 32-bit integers at dataset+ma converted to two doubles
;# Uses rax, rdx, ecx as scratch.
mov edx, dword ptr [rbx] ;# ma
mov rax, qword ptr [rbx+8] ;# dataset
cvtdq2pd xmm0, qword ptr [rax+rdx]
add dword ptr [rbx], 8 ;# ma += 8
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx ;# mx ^= ecx
test ecx, 65528 ;# 65528 = 0xfff8
jne short rx_read_dataset_f_ret ;# any of those mx bits set: keep reading sequentially
and ecx, -8 ;# clear the low 3 bits
mov dword ptr [rbx], ecx ;# ma = mx with the low 3 bits cleared
prefetcht0 byte ptr [rax+rcx] ;# prefetch the new read location
rx_read_dataset_f_ret:
ret 0

@ -0,0 +1,13 @@
;# Dataset read, integer variant.
;# In:  rbx = pointer to the memory registers (+0 ma, +4 mx, +8 dataset pointer), ecx = value to fold into mx
;# Out: rax = qword at dataset+ma
;# Uses rdx, ecx as scratch.
mov eax, dword ptr [rbx] ;# ma
mov rdx, qword ptr [rbx+8] ;# dataset
mov rax, qword ptr [rdx+rax]
add dword ptr [rbx], 8 ;# ma += 8
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx ;# mx ^= ecx
test ecx, 65528 ;# 65528 = 0xfff8
jne short rx_read_dataset_r_ret ;# any of those mx bits set: keep reading sequentially
and ecx, -8 ;# clear the low 3 bits
mov dword ptr [rbx], ecx ;# ma = mx with the low 3 bits cleared
prefetcht0 byte ptr [rdx+rcx] ;# prefetch the new read location
rx_read_dataset_r_ret:
ret 0

@ -20,6 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
#include <cstdint>
#include <iostream>
namespace RandomX {
@ -59,6 +60,15 @@ namespace RandomX {
uint64_t u64;
int32_t i32;
uint32_t u32;
struct {
int32_t i32lo;
int32_t i32hi;
};
};
//Floating point register made of two convertible_t halves.
//`lo` is declared first and therefore sits at the lower address.
struct fpu_reg_t {
convertible_t lo;
convertible_t hi;
};
constexpr int ProgramLength = 512;
@ -96,10 +106,10 @@ namespace RandomX {
struct RegisterFile {
convertible_t r[RegistersCount];
convertible_t f[RegistersCount];
fpu_reg_t f[RegistersCount];
};
static_assert(sizeof(RegisterFile) == 2 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile");
typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&);
@ -108,4 +118,6 @@ namespace RandomX {
extern "C" {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc);
}
}
}
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);

@ -1,19 +1,19 @@
; Copyright (c) 2018 tevador
;
; This file is part of RandomX.
;
; RandomX is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; RandomX is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with RandomX. If not, see<http://www.gnu.org/licenses/>.
;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX. If not, see<http://www.gnu.org/licenses/>.
PUBLIC executeProgram
@ -47,6 +47,7 @@ executeProgram PROC
; xmm7 -> "f7"
; xmm8 -> "f0"
; xmm9 -> "f1"
; xmm10 -> absolute value mask
; STACK STRUCTURE:
; |
@ -71,11 +72,12 @@ executeProgram PROC
push r13
push r14
push r15
sub rsp, 64
movdqu xmmword ptr [rsp+48], xmm6
movdqu xmmword ptr [rsp+32], xmm7
movdqu xmmword ptr [rsp+16], xmm8
movdqu xmmword ptr [rsp+0], xmm9
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
; function arguments
push rcx ; RegisterFile& registerFile
@ -86,7 +88,15 @@ executeProgram PROC
mov rbp, rsp ; beginning of VM stack
mov rdi, 1048577 ; number of VM instructions to execute + 1
; load VM register values
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10
psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
; reset rounding mode
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
; load integer registers
mov r8, qword ptr [rcx+0]
mov r9, qword ptr [rcx+8]
mov r10, qword ptr [rcx+16]
@ -95,16 +105,56 @@ executeProgram PROC
mov r13, qword ptr [rcx+40]
mov r14, qword ptr [rcx+48]
mov r15, qword ptr [rcx+56]
mov dword ptr [rsp-8], 40896
ldmxcsr dword ptr [rsp-8]
; load register f0 hi, lo
xorps xmm8, xmm8
cvtsi2sd xmm8, qword ptr [rcx+72]
pslldq xmm8, 8
cvtsi2sd xmm8, qword ptr [rcx+64]
cvtsi2sd xmm9, qword ptr [rcx+72]
cvtsi2sd xmm2, qword ptr [rcx+80]
cvtsi2sd xmm3, qword ptr [rcx+88]
cvtsi2sd xmm4, qword ptr [rcx+96]
cvtsi2sd xmm5, qword ptr [rcx+104]
cvtsi2sd xmm6, qword ptr [rcx+112]
; load register f1 hi, lo
xorps xmm9, xmm9
cvtsi2sd xmm9, qword ptr [rcx+88]
pslldq xmm9, 8
cvtsi2sd xmm9, qword ptr [rcx+80]
; load register f2 hi, lo
xorps xmm2, xmm2
cvtsi2sd xmm2, qword ptr [rcx+104]
pslldq xmm2, 8
cvtsi2sd xmm2, qword ptr [rcx+96]
; load register f3 hi, lo
xorps xmm3, xmm3
cvtsi2sd xmm3, qword ptr [rcx+120]
pslldq xmm3, 8
cvtsi2sd xmm3, qword ptr [rcx+112]
lea rcx, [rcx+64]
; load register f4 hi, lo
xorps xmm4, xmm4
cvtsi2sd xmm4, qword ptr [rcx+72]
pslldq xmm4, 8
cvtsi2sd xmm4, qword ptr [rcx+64]
; load register f5 hi, lo
xorps xmm5, xmm5
cvtsi2sd xmm5, qword ptr [rcx+88]
pslldq xmm5, 8
cvtsi2sd xmm5, qword ptr [rcx+80]
; load register f6 hi, lo
xorps xmm6, xmm6
cvtsi2sd xmm6, qword ptr [rcx+104]
pslldq xmm6, 8
cvtsi2sd xmm6, qword ptr [rcx+96]
; load register f7 hi, lo
xorps xmm7, xmm7
cvtsi2sd xmm7, qword ptr [rcx+120]
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]
; program body
@ -125,21 +175,23 @@ rx_finish:
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
movd qword ptr [rcx+64], xmm8
movd qword ptr [rcx+72], xmm9
movd qword ptr [rcx+80], xmm2
movd qword ptr [rcx+88], xmm3
movd qword ptr [rcx+96], xmm4
movd qword ptr [rcx+104], xmm5
movd qword ptr [rcx+112], xmm6
movd qword ptr [rcx+120], xmm7
movdqa xmmword ptr [rcx+64], xmm8
movdqa xmmword ptr [rcx+80], xmm9
movdqa xmmword ptr [rcx+96], xmm2
movdqa xmmword ptr [rcx+112], xmm3
lea rcx, [rcx+64]
movdqa xmmword ptr [rcx+64], xmm4
movdqa xmmword ptr [rcx+80], xmm5
movdqa xmmword ptr [rcx+96], xmm6
movdqa xmmword ptr [rcx+112], xmm7
; load callee-saved registers
movdqu xmm9, xmmword ptr [rsp]
movdqu xmm8, xmmword ptr [rsp+16]
movdqu xmm7, xmmword ptr [rsp+32]
movdqu xmm6, xmmword ptr [rsp+48]
add rsp, 64
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
@ -171,7 +223,7 @@ rx_read_dataset:
pop r8
ret 0
rx_read_dataset_full:
rx_read_dataset_r:
mov edx, dword ptr [rbx] ; ma
mov rax, qword ptr [rbx+8] ; dataset
mov rax, qword ptr [rax+rdx]
@ -179,12 +231,27 @@ rx_read_dataset_full:
xor ecx, dword ptr [rbx+4] ; mx
mov dword ptr [rbx+4], ecx
test ecx, 0FFF8h
jne short rx_read_dataset_full_ret
jne short rx_read_dataset_r_ret
and ecx, -8
mov dword ptr [rbx], ecx
mov rdx, qword ptr [rbx+8]
prefetcht0 byte ptr [rdx+rcx]
rx_read_dataset_full_ret:
rx_read_dataset_r_ret:
ret 0
rx_read_dataset_f:
; Floating-point dataset read.
; In:  rbx -> structure holding ma (dword at +0), mx (dword at +4) and the dataset pointer (qword at +8)
;      ecx = value that is XORed into mx
; Out: xmm0 = two doubles converted from the two packed 32-bit integers at dataset+ma
; Modifies rax, rdx, ecx and the flags.
mov edx, dword ptr [rbx] ; ma
mov rax, qword ptr [rbx+8] ; dataset
cvtdq2pd xmm0, qword ptr [rax+rdx] ; convert 2 x int32 -> 2 x double
add dword ptr [rbx], 8 ; advance ma by the 8 bytes just read
xor ecx, dword ptr [rbx+4] ; mx
mov dword ptr [rbx+4], ecx ; store the updated mx
test ecx, 0FFF8h
jne short rx_read_dataset_f_ret ; done unless bits 3-15 of the new mx are all zero
and ecx, -8 ; clear the low 3 bits (8-byte alignment)
mov dword ptr [rbx], ecx ; ma = aligned mx
prefetcht0 byte ptr [rax+rcx] ; rax still holds the dataset pointer; prefetch the new location
rx_read_dataset_f_ret:
ret 0
executeProgram ENDP

@ -19,15 +19,15 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
#define WT_ADD_64 10
#define WT_ADD_64 11
#define WT_ADD_32 2
#define WT_SUB_64 10
#define WT_SUB_64 11
#define WT_SUB_32 2
#define WT_MUL_64 21
#define WT_MUL_64 23
#define WT_MULH_64 10
#define WT_MUL_32 15
#define WT_IMUL_32 15
#define WT_IMULH_64 10
#define WT_IMULH_64 6
#define WT_DIV_64 1
#define WT_IDIV_64 1
#define WT_AND_64 4
@ -47,8 +47,9 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#define WT_FPDIV 8
#define WT_FPSQRT 6
#define WT_FPROUND 2
#define WT_CALL 24
#define WT_RET 18
#define WT_CALL 20
#define WT_RET 22
constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \
WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \
@ -60,6 +61,7 @@ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \
static_assert(wtSum == 256,
"Sum of instruction weights must be 256");
#define REP0(x)
#define REP1(x) x,
#define REP2(x) REP1(x) x,
#define REP3(x) REP2(x) x,
@ -86,6 +88,16 @@ static_assert(wtSum == 256,
#define REP24(x) REP23(x) x,
#define REP25(x) REP24(x) x,
#define REP26(x) REP25(x) x,
#define REP27(x) REP26(x) x,
#define REP28(x) REP27(x) x,
#define REP29(x) REP28(x) x,
#define REP30(x) REP29(x) x,
#define REP31(x) REP30(x) x,
#define REP32(x) REP31(x) x,
#define REP33(x) REP32(x) x,
#define REP40(x) REP32(x) REP8(x)
#define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x)
#define REP256(x) REP128(x) REP128(x)
#define REPNX(x,N) REP##N(x)
#define REPN(x,N) REPNX(x,N)
#define NUM(x) x

@ -22,16 +22,10 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
inline double convertToDouble(int64_t x) {
return (double)(x &-2048L);
}
inline double convertToDoubleNonZero(int64_t x) {
return (double)((x & -2048L) | 2048);
}
inline double convertToDoubleNonNegative(int64_t x) {
return (double)(x & 9223372036854773760L);
//Masks off the 11 least-significant bits of x and converts the result to double.
//What remains has at most 52 significant magnitude bits, so it is exactly
//representable as a double and the conversion never has to round.
inline double convertSigned52(int64_t x) {
	const int64_t truncated = x & ~static_cast<int64_t>(2047);
	return static_cast<double>(truncated);
}
extern "C" {
@ -59,27 +53,11 @@ namespace RandomX {
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
bool JMP_COND(uint8_t, convertible_t&, int32_t);
void FPINIT();
void FPADD(convertible_t& a, double b, convertible_t& c);
void FPSUB(convertible_t& a, double b, convertible_t& c);
void FPMUL(convertible_t& a, double b, convertible_t& c);
void FPDIV(convertible_t& a, double b, convertible_t& c);
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c);
inline void FPADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPADD(a, b.f64, c);
}
inline void FPSUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPSUB(a, b.f64, c);
}
inline void FPMUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPMUL(a, b.f64, c);
}
inline void FPDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
FPDIV(a, b.f64, c);
}
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c);
}
}

@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
//#define DEBUG
//#define FTZ
#include "instructions.hpp"
#include "intrinPortable.h"
#pragma STDC FENV_ACCESS on
@ -154,19 +153,17 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
#define subOverflow __subOverflow
#endif
static double FlushDenormal(double x) {
if (std::fpclassify(x) == FP_SUBNORMAL) {
return 0;
//Returns 0.0 when x is subnormal or NaN; every other value is returned unchanged.
static inline double FlushDenormalNaN(double x) {
	switch (std::fpclassify(x)) {
	case FP_SUBNORMAL:
	case FP_NAN:
		return 0.0;
	default:
		return x;
	}
}
#ifdef FTZ
#undef FTZ
#define FTZ(x) FlushDenormal(x)
#else
#define FTZ(x) x
#endif
//Replaces NaN with 0.0; any other value (infinities included) passes through unchanged.
static inline double FlushNaN(double x) {
	if (x == x) {
		return x;	//only NaN compares unequal to itself
	}
	return 0.0;
}
namespace RandomX {
@ -286,37 +283,95 @@ namespace RandomX {
}
//Resets the floating-point control state.
//With SSE2 the whole MXCSR register is overwritten; without it only the
//rounding mode is selected through setRoundMode.
void FPINIT() {
#ifdef __SSE2__
	//0x9FC0 = flush to zero, denormals are zero, default rounding mode, all exceptions disabled
	const unsigned int mxcsrValue = 0x9FC0;
	_mm_setcsr(mxcsrValue);
#else
	setRoundMode(FE_TONEAREST);
#endif
}
void FPADD(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDouble(a.i64) + b);
//Two-lane addition: c.lo = (double)a.i32lo + b.lo and c.hi = (double)a.i32hi + b.hi.
//Both halves of a are read before anything is written to c.
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	//the two 32-bit integers in the first 8 bytes of a become a pair of doubles
	//NOTE(review): assumes b and c are 16-byte aligned with lo directly followed by hi - confirm
	const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&a)));
	const __m128d rhs = _mm_load_pd(&b.lo.f64);
	_mm_store_pd(&c.lo.f64, _mm_add_pd(lhs, rhs));
#else
	const double lowLane = static_cast<double>(a.i32lo);
	const double highLane = static_cast<double>(a.i32hi);
	c.lo.f64 = lowLane + b.lo.f64;
	c.hi.f64 = highLane + b.hi.f64;
#endif
}
void FPSUB(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDouble(a.i64) - b);
//Two-lane subtraction: c.lo = (double)a.i32lo - b.lo and c.hi = (double)a.i32hi - b.hi.
//Both halves of a are read before anything is written to c.
void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	//the two 32-bit integers in the first 8 bytes of a become a pair of doubles
	//NOTE(review): assumes b and c are 16-byte aligned with lo directly followed by hi - confirm
	const __m128d minuend = _mm_cvtepi32_pd(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&a)));
	const __m128d subtrahend = _mm_load_pd(&b.lo.f64);
	_mm_store_pd(&c.lo.f64, _mm_sub_pd(minuend, subtrahend));
#else
	const double lowLane = static_cast<double>(a.i32lo);
	const double highLane = static_cast<double>(a.i32hi);
	c.lo.f64 = lowLane - b.lo.f64;
	c.hi.f64 = highLane - b.hi.f64;
#endif
}
void FPMUL(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDoubleNonZero(a.i64) * b);
//Two-lane multiplication: c.lo = (double)a.i32lo * b.lo and c.hi = (double)a.i32hi * b.hi.
//A lane whose product is NaN is stored as 0.0 instead.
void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	//the two 32-bit integers in the first 8 bytes of a become a pair of doubles
	//NOTE(review): assumes b and c are 16-byte aligned with lo directly followed by hi - confirm
	const __m128d lhs = _mm_cvtepi32_pd(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&a)));
	const __m128d rhs = _mm_load_pd(&b.lo.f64);
	const __m128d product = _mm_mul_pd(lhs, rhs);
	//a lane compares equal to itself unless it is NaN, so ANDing with the mask zeroes NaN lanes
	const __m128d notNaN = _mm_cmpeq_pd(product, product);
	_mm_store_pd(&c.lo.f64, _mm_and_pd(product, notNaN));
#else
	const double lowLane = static_cast<double>(a.i32lo);
	const double highLane = static_cast<double>(a.i32hi);
	c.lo.f64 = FlushNaN(lowLane * b.lo.f64);
	c.hi.f64 = FlushNaN(highLane * b.hi.f64);
#endif
}
void FPDIV(convertible_t& a, double b, convertible_t& c) {
c.f64 = FTZ(convertToDoubleNonZero(a.i64) / b);
//Two-lane division: c.lo = (double)a.i32lo / b.lo and c.hi = (double)a.i32hi / b.hi.
//A lane whose quotient is NaN is stored as 0.0 instead.
void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
	//the two 32-bit integers in the first 8 bytes of a become a pair of doubles
	//NOTE(review): assumes b and c are 16-byte aligned with lo directly followed by hi - confirm
	const __m128d dividend = _mm_cvtepi32_pd(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&a)));
	const __m128d divisor = _mm_load_pd(&b.lo.f64);
	const __m128d quotient = _mm_div_pd(dividend, divisor);
	//a lane compares equal to itself unless it is NaN, so ANDing with the mask zeroes NaN lanes
	//NOTE(review): subnormal quotients are not handled here; presumably the flush-to-zero
	//mode set through MXCSR covers them - confirm
	const __m128d notNaN = _mm_cmpeq_pd(quotient, quotient);
	_mm_store_pd(&c.lo.f64, _mm_and_pd(quotient, notNaN));
#else
	const double lowLane = static_cast<double>(a.i32lo);
	const double highLane = static_cast<double>(a.i32hi);
	//the portable path flushes both subnormal and NaN quotients to zero explicitly
	c.lo.f64 = FlushDenormalNaN(lowLane / b.lo.f64);
	c.hi.f64 = FlushDenormalNaN(highLane / b.hi.f64);
#endif
}
void FPSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
double d = convertToDoubleNonNegative(a.i64);
c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&d)));
__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
__m128d ad = _mm_cvtepi32_pd(ai);
const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63)));
ad = _mm_and_pd(ad, absmask);
__m128d cd = _mm_sqrt_pd(ad);
_mm_store_pd(&c.lo.f64, cd);
#else
c.f64 = FTZ(sqrt(convertToDoubleNonNegative(a.i64)));
double alo = (double)a.i32lo;
double ahi = (double)a.i32hi;
c.lo.f64 = sqrt(std::abs(alo));
c.hi.f64 = sqrt(std::abs(ahi));
#endif
}
void FPROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
c.f64 = convertToDouble(a.i64);
void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
c.lo.f64 = convertSigned52(a.i64);
switch (a.u64 & 3) {
case RoundDown:
#ifdef DEBUG

@ -79,14 +79,6 @@ void readInt(int argc, char** argv, int& out, int defaultValue) {
out = defaultValue;
}
//Dumps a register file to a stream, one register per line: first all r registers
//in hexadecimal, then all f registers as hexadecimal bits followed by the double value.
//The stream is switched back to decimal after every line.
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		os << std::hex << "r" << idx << " = " << rf.r[idx].u64 << std::endl;
		os << std::dec;
	}
	for (int idx = 0; idx < RandomX::RegistersCount; ++idx) {
		os << std::hex << "f" << idx << " = " << rf.f[idx].u64;
		os << " (" << rf.f[idx].f64 << ")" << std::endl;
		os << std::dec;
	}
	return os;
}
class AtomicHash {
public:
AtomicHash() {
@ -282,7 +274,7 @@ int main(int argc, char** argv) {
std::cout << "Calculated result: ";
result.print(std::cout);
if(programCount == 1000)
std::cout << "Reference result: f6bf06465d5fa1b1dc919140b9e9f9e210b07ae6d662988458a172e9a267eb3f" << std::endl;
std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
/*if (threadCount == 1 && !compiled) {
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save