From 790b382edab419c7d22200ea411a0b3dd041d63f Mon Sep 17 00:00:00 2001
From: tevador <tevador@gmail.com>
Date: Sun, 24 Feb 2019 14:48:07 +0100
Subject: [PATCH] Reworked conversion int -> float for register group E

---
 src/InterpretedVirtualMachine.cpp | 52 ++++++++++++++++++++++++-------
 src/intrinPortable.h              | 31 +++++++++++++++---
 2 files changed, 66 insertions(+), 17 deletions(-)
diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index 6a97d7d..f44a391 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -29,6 +29,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <cmath>
 #include <cfloat>
 #include <thread>
+#include <climits>
 #include "intrinPortable.h"
 #include "reciprocal.h"
 #ifdef STATS
@@ -108,9 +109,13 @@ namespace RandomX {
 		}
 	}
 
+	static bool isDenormal(double x) {
+		return std::fpclassify(x) == FP_SUBNORMAL;
+	}
+
 	FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
 		auto& ibc = byteCode[i];
-		if(trace) printState(r, f, e, a);
+		//if(trace) printState(r, f, e, a);
 		switch (ibc.type)
 		{
 			case InstructionType::IADD_R: {
@@ -219,9 +224,8 @@ namespace RandomX {
 			} break;
 
 			case InstructionType::FDIV_M: {
-				__m128d fsrc = _mm_abs(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)));
-				__m128d fdst = _mm_div_pd(*ibc.fdst, fsrc);
-				*ibc.fdst = _mm_max_pd(fdst, _mm_set_pd(DBL_MIN, DBL_MIN));
+				__m128d fsrc = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)));
+				*ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc);
 			} break;
 
 			case InstructionType::FSQRT_R: {
@@ -258,6 +262,18 @@ namespace RandomX {
 			else //if(ibc.type >= 20 && ibc.type <= 30)
 				print(0);
 		}
+#ifdef FPUCHECK
+		if (ibc.type >= 26 && ibc.type <= 30) {
+			double lo = *(((double*)ibc.fdst) + 0);
+			double hi = *(((double*)ibc.fdst) + 1);
+			if (lo <= 0 || hi <= 0) {
+				std::stringstream ss;
+				ss << "Underflow in operation " << ibc.type;
+				printState(r, f, e, a);
+				throw std::runtime_error(ss.str());
+			}
+		}
+#endif
 	}
 
 	void InterpretedVirtualMachine::execute() {
@@ -304,10 +320,10 @@ namespace RandomX {
 			f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8);
 			f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16);
 			f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24);
-			e[0] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 32));
-			e[1] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 40));
-			e[2] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 48));
-			e[3] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 56));
+			e[0] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 32));
+			e[1] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 40));
+			e[2] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 48));
+			e[3] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 56));
 
 			if (trace) {
 				std::cout << "iteration " << std::dec << iter << std::endl;
@@ -357,10 +373,22 @@ namespace RandomX {
 			store64(scratchpad + spAddr1 + 48, r[6]);
 			store64(scratchpad + spAddr1 + 56, r[7]);
 
-			f[0] = _mm_mul_pd(f[0], e[0]);
-			f[1] = _mm_mul_pd(f[1], e[1]);
-			f[2] = _mm_mul_pd(f[2], e[2]);
-			f[3] = _mm_mul_pd(f[3], e[3]);
+			f[0] = _mm_xor_pd(f[0], e[0]);
+			f[1] = _mm_xor_pd(f[1], e[1]);
+			f[2] = _mm_xor_pd(f[2], e[2]);
+			f[3] = _mm_xor_pd(f[3], e[3]);
+
+#ifdef FPUCHECK
+			for(int i = 0; i < 4; ++i) {
+				double lo = *(((double*)&f[i]) + 0);
+				double hi = *(((double*)&f[i]) + 1);
+				if (isDenormal(lo) || isDenormal(hi)) {
+					std::stringstream ss;
+					ss << "Denormal f" << i;
+					throw std::runtime_error(ss.str());
+				}
+			}
+#endif
 
 			_mm_store_pd((double*)(scratchpad + spAddr0 + 0), f[0]);
 			_mm_store_pd((double*)(scratchpad + spAddr0 + 16), f[1]);
diff --git a/src/intrinPortable.h b/src/intrinPortable.h
index aea0335..3bee98f 100644
--- a/src/intrinPortable.h
+++ b/src/intrinPortable.h
@@ -46,11 +46,6 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 #include <intrin.h>
 #endif
 
-inline __m128d _mm_abs(__m128d xd) {
-	const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63)));
-	return _mm_and_pd(xd, absmask);
-}
-
 #define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
 
 #else
@@ -159,6 +154,20 @@ inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
 	return x;
 }
 
+inline __m128d _mm_and_pd(__m128d a, __m128d b) {
+	__m128d x;
+	x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
+	x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
+	return x;
+}
+
+inline __m128d _mm_or_pd(__m128d a, __m128d b) {
+	__m128d x;
+	x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
+	x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
+	return x;
+}
+
 inline __m128d _mm_set_pd(double e1, double e0) {
 	__m128d x;
 	x.lo = e0;
@@ -302,6 +311,18 @@ inline __m128d load_cvt_i32x2(const void* addr) {
 	return _mm_cvtepi32_pd(ix);
 }
 
+template<int E>
+__m128d ieee_set_exponent(__m128d x) {
+	static_assert(E > -1023, "Invalid exponent value");
+	constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1;
+	const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64));
+	constexpr uint64_t exponent64 = (uint64_t)(E + 1023U) << 52;
+	const __m128d exponentMask = _mm_castsi128_pd(_mm_set_epi64x(exponent64, exponent64));
+	x = _mm_and_pd(x, mantissaMask);
+	x = _mm_or_pd(x, exponentMask);
+	return x;
+}
+
 double loadDoublePortable(const void* addr);
 
 uint64_t mulh(uint64_t, uint64_t);