From 2d8b094e09a98a420f6af625fd74724e04e1ed2a Mon Sep 17 00:00:00 2001 From: wowario Date: Tue, 10 Jan 2023 21:19:35 +0300 Subject: [PATCH 1/3] update README.md --- README.md | 56 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index b13d6480d..2de4d2d10 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,21 @@ Portions Copyright (c) 2012-2013 The Cryptonote developers. - Discord: [discord.gg/ykZyAzJhDK](https://discord.com/invite/ykZyAzJhDK) - Telegram: [t.me/wownero](https://t.me/wownero) - Wowlet Desktop Wallet: [git.wownero.com/wowlet/wowlet](https://git.wownero.com/wowlet/wowlet/releases) -- WOW Stash Web Wallet: [wowstash.app](https://wowstash.app) +- Stack Wallet iOS & Android Mobile Wallet: [stackwallet.com](https://stackwallet.com) +- Wonerujo Android Mobile Wallet: [google store](https://play.google.com/store/apps/details?id=com.m2049r.wowwallet) - Public Node Status: [monero.fail](https://monero.fail/?crypto=wownero) -- Map of Nodes: [wownero.fyi](https://wownero.fyi) - Wownero Memes: [suchwow.xyz](https://suchwow.xyz/posts/top) - Market Info: [coinmarketcap.com](https://coinmarketcap.com/currencies/wownero), [coingecko.com](https://www.coingecko.com/en/coins/wownero/usd) +## Exchanges + +- [AltQuick](https://altquick.com/market/Wownero) +- [Majestic Bank](https://majesticbank.sc) +- [TradeOgre](https://tradeogre.com/exchange/BTC-WOW) + ### Blockchain Explorers + - https://explore.wownero.com -- http://gffjxd5nn2heslj6jv5ts2ok5j6xi6m3pwlpz7le4i5bu56sirbxfiqd.onion:8081 -- https://wownero.club ## Introduction @@ -36,11 +41,11 @@ Wownero is a privacy-centric memecoin that was fairly launched on April 1, 2018 Wownero is a 100% community-sponsored endeavor. Supporting services are also graciously provided by sponsors: [MacStadium](https://www.macstadium.com) +alt="MacStadium" +height="100">](https://www.macstadium.com) [JetBrains](https://www.jetbrains.com) +alt="JetBrains" +height="100">](https://www.jetbrains.com) Developers are volunteers doing this mostly for shits and giggles. If you would like to support our shenanigans and stimulant addictions, please consider donating to [WFS proposals](https://funding.wownero.com/proposals) or the dev slush fund. @@ -64,31 +69,32 @@ Wownero is hosted by Open Collective Europe, a Brussels-based non-profit that ho ## Release staging and Contributing -**Anyone is welcome to contribute to Wownero's codebase!** +**Anyone is welcome to contribute to Wownero's codebase!** If you have a fix or code change, feel free to submit it as a pull request. Ahead of a scheduled software upgrade, a development branch will be created with the new release version tag. Pull requests that address bugs should be made to Master. Pull requests that require review and testing (generally, optimizations and new features) should be made to the development branch. All pull requests will be considered safe until the US dollar valuation of 1 Wownero equals $1000. After this valuation has been reached, more research will be needed to introduce experimental cryptography and/or code into the codebase. -Things to Do, Work in Progress, and Help Wanted tasks are tracked in the [Meta](https://git.wownero.com/wownero/meta/issues) repo. +Things to Do, Work in Progress, and Help Wanted tasks are tracked in the [Meta](https://git.wownero.com/wownero/meta/issues) repo. Join `#wownero` on IRC OFTC to participate in development conversation. 
-## Scheduled software upgrades +## Scheduled software/network upgrades Wownero uses a fixed-schedule software upgrade (hard fork) mechanism to implement new features. This means that users of Wownero (end users and service providers) should run current versions and upgrade their software on a regular schedule. The required software for these upgrades will be available prior to the scheduled date. Please check the repository prior to this date for the proper Wownero software version. Below is the historical schedule and the projected schedule for the next upgrade. -Dates are provided in the format YYYY-MM-DD. +Dates are provided in the format YYYY-MM-DD. | Software upgrade block height | Date | Release Name | Minimum Wownero version | Recommended Wownero version | Details | -| ------------------------------ | -----------| ----------------- | ---------------------- | -------------------------- | ---------------------------------------------------------------------------------- | -| 1 | 2018-04-01 | Awesome Akita | v0.1.0.0 | v0.1.0.0 | Cryptonight variant 1, ringsize >= 8, sorted inputs -| 69,69 | 2018-04-24 | Busty Brazzers | v0.2.0.0 | v0.2.0.0 | Bulletproofs, LWMA difficulty algorithm, ringsize >= 10, reduce unlock to 4 -| 53,666 | 2018-10-06 | Cool Cage | v0.3.0.0 | v0.3.1.3 | Cryptonight variant 2, LWMA v2, ringsize = 22, MMS -| 63,469 | 2018-11-11 | Dank Doge | v0.4.0.0 | v0.4.0.0 | LWMA v4 -| 81,769 | 2019-02-19 | Erotic EggplantEmoji | v0.5.0.0 | v0.5.0.2 | Cryptonight/wow, LWMA v1 with N=144, Updated Bulletproofs, Fee Per Byte, Auto-churn -| 114,969 | 2019-06-14 | F For Fappening | v0.6.1.0 | v0.6.1.2 | RandomWOW, new block weight algorithm, slightly more efficient RingCT format -| 160,777 | 2019-11-20 | Gaping Goatse | v0.7.0.0 | v0.7.1.0 | Only allow >= 2 outputs, change to the block median used to calculate penalty, rct sigs in coinbase forbidden, 4 unlock time as protocol rule -| - | 2020-06-28 | Hallucinogenic Hypnotoad | v0.8.0.0 | v0.8.0.2 | Dandelion++ support -| 253,999 | 2020-10-09 | Illiterate Illuminati | v0.9.0.0 | v0.9.3.3 | Dynamic coinbase unlock (up to 1 mo.), Deterministic unlock times, Enforce maximum coinbase amount, show_qr_code wallet command, CLSAG -| 331,170 | 2021-07-04 | Junkie Jeff | v0.10.0.0 | v0.10.1.0 | Bulletproofs+, Miner Block Header Signing, Vote by Block, Change coinbase unlock time to 1 day, Reset difficulty and switch back to Monero's difficulty algorithm +|-------------------------------|------------| ----------------- |-------------------------|-----------------------------| ---------------------------------------------------------------------------------- | +| 1 | 2018-04-01 | Awesome Akita | v0.1.0.0 | v0.1.0.0 | Cryptonight variant 1, ringsize >= 8, sorted inputs +| 69,69 | 2018-04-24 | Busty Brazzers | v0.2.0.0 | v0.2.0.0 | Bulletproofs, LWMA difficulty algorithm, ringsize >= 10, reduce unlock to 4 +| 53,666 | 2018-10-06 | Cool Cage | v0.3.0.0 | v0.3.1.3 | Cryptonight variant 2, LWMA v2, ringsize = 22, MMS +| 63,469 | 2018-11-11 | Dank Doge | v0.4.0.0 | v0.4.0.0 | LWMA v4 +| 81,769 | 2019-02-19 | Erotic EggplantEmoji | v0.5.0.0 | v0.5.0.2 | Cryptonight/wow, LWMA v1 with N=144, Updated Bulletproofs, Fee Per Byte, Auto-churn +| 114,969 | 2019-06-14 | F For Fappening | v0.6.1.0 | v0.6.1.2 | RandomWOW, new block weight algorithm, slightly more efficient RingCT format +| 160,777 | 2019-11-20 | Gaping Goatse | v0.7.0.0 | v0.7.1.0 | Only allow >= 2 outputs, change to the block median used to calculate penalty, rct sigs in coinbase forbidden, 4 
unlock time as protocol rule +| - | 2020-06-28 | Hallucinogenic Hypnotoad | v0.8.0.0 | v0.8.0.2 | Dandelion++ support +| 253,999 | 2020-10-09 | Illiterate Illuminati | v0.9.0.0 | v0.9.3.3 | Dynamic coinbase unlock (up to 1 mo.), Deterministic unlock times, Enforce maximum coinbase amount, show_qr_code wallet command, CLSAG +| 331,170 | 2021-07-04 | Junkie Jeff | v0.10.0.0 | v0.10.2.0 | Bulletproofs+, Miner Block Header Signing, Vote by Block, Change coinbase unlock time to 1 day, Reset difficulty and switch back to Monero's difficulty algorithm +| XXX,XXX | 2023-XX-XX | Kunty Karen | v0.11.0.0 | v0.11.0.0 | View tags, fee changes, adjusted dynamic block weight algorithm, multisig security fixes, RPC broadcast node donation sub-address X's indicate that these details have not been determined as of commit date. @@ -196,7 +202,7 @@ save and close nano ``` * `sudo cat /var/lib/tor/wownero/hostname` -copy your onion address and share node with others [here](https://monero.fail/?crypto=wownero) and [here](https://forum.wownero.com/t/wownero-tor-onion-sites/623) + copy your onion address and share node with others [here](https://monero.fail/?crypto=wownero) and [here](https://forum.wownero.com/t/wownero-tor-onion-sites/623) To share your node over p2p, uncomment first line of wownerod.conf and add your onion address. @@ -206,4 +212,4 @@ More information on running Tor and i2p nodes is available [here](https://forum. ``` ./wownero-wallet-cli --proxy 127.0.0.1:9050 --daemon-address iy6ry6uudpzvbd72zsipepukp6nsazjdu72n52vg3isfnxqn342flzad.onion:34568 -``` +``` \ No newline at end of file -- 2.25.1 From 07864cc53f5ba267d6d6b40453acde6d32f59b5b Mon Sep 17 00:00:00 2001 From: wowario Date: Tue, 10 Jan 2023 21:46:43 +0300 Subject: [PATCH 2/3] add seed nodes --- src/p2p/net_node.inl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/p2p/net_node.inl b/src/p2p/net_node.inl index e457b6b3c..f2e3f5775 100644 --- a/src/p2p/net_node.inl +++ b/src/p2p/net_node.inl @@ -691,8 +691,6 @@ namespace nodetool std::set full_addrs; if (m_nettype == cryptonote::TESTNET) { - full_addrs.insert("207.254.29.107:11180"); - full_addrs.insert("135.148.138.255:11180"); } else if (m_nettype == cryptonote::STAGENET) { @@ -702,8 +700,8 @@ namespace nodetool } else { - full_addrs.insert("158.69.60.225:34567"); // OVH France - full_addrs.insert("159.65.91.59:34567"); // DigiO london + full_addrs.insert("158.69.60.225:34567"); // explore.wownero.com + full_addrs.insert("159.65.91.59:34567"); // jw full_addrs.insert("164.90.230.176:34567"); // de1.wownodes.com full_addrs.insert("64.227.81.144:34567"); // us1.wownodes.com full_addrs.insert("188.166.237.187:34567"); // sg1.wownodes.com @@ -711,6 +709,11 @@ namespace nodetool full_addrs.insert("167.114.196.241:34567"); // wowbux.org full_addrs.insert("135.148.138.255:34567"); full_addrs.insert("207.254.29.107:34567"); + full_addrs.insert("142.93.144.79:34567"); // idontwanttogototoronto.wow.fail + full_addrs.insert("51.75.76.161:34567"); // eu-west-1.wow.xmr.pm + full_addrs.insert("145.239.93.75:34567"); // eu-west-2.wow.xmr.pm + full_addrs.insert("88.198.199.23:34567"); + full_addrs.insert("167.114.119.46:34567"); // wownero.stackwallet.com } return full_addrs; } @@ -843,6 +846,7 @@ namespace nodetool "nepc4lxndsooj2akn7ofrj3ooqc25242obchcag6tw3f2mxrms2uuvyd.onion:34566", "666l2ajxqjgj5lskvbokvworjysgvqag4oitokjuy7wz6juisul4jqad.onion:34566", "ty7ppqozzodz75audgvkprekiiqsovbyrkfdjwadrkbe3etyzloatxad.onion:34566", + 
"77uase4p6y6jsjdf6z2kdgpxgh7nkvywagvhurzphbm7vrkyj2d2gdid.onion:34566", }; } return {}; -- 2.25.1 From 44c482913f322e25d4925afb23abd293715a1b3a Mon Sep 17 00:00:00 2001 From: dsc Date: Mon, 23 May 2022 00:52:49 +0200 Subject: [PATCH 3/3] Wownero fails to compile on armv7a. To fix we can: - update src/crypto/slow-hash.c to the latest version that Monero currently has - modify variant4_random_math.h to facilitate the changes in slow-hash.c In short; src/crypto/slow-hash.c is now up to date with upstream Monero. The next Wownero version will have these changes automatically as the codebase follows Monero, rendering this commit obsolete in the process. --- src/crypto/slow-hash.c | 669 ++++++++++++++++++++---------- src/crypto/variant4_random_math.h | 8 +- 2 files changed, 465 insertions(+), 212 deletions(-) diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index 1ee9789e3..0de7db505 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -1,4 +1,4 @@ -// Copyright (c) 2014-2020, The Monero Project +// Copyright (c) 2014-2022, The Monero Project // // All rights reserved. // @@ -40,6 +40,9 @@ #include "oaes_lib.h" #include "variant2_int_sqrt.h" #include "variant4_random_math.h" +#include "CryptonightR_JIT.h" + +#include #define MEMORY (1 << 21) // 2MB scratchpad #define ITER (1 << 20) @@ -48,9 +51,72 @@ #define INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) +#if defined(_MSC_VER) +#define THREADV __declspec(thread) +#else +#define THREADV __thread +#endif + extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); +static void local_abort(const char *msg) +{ + fprintf(stderr, "%s\n", msg); +#ifdef NDEBUG + _exit(1); +#else + abort(); +#endif +} + +volatile int use_v4_jit_flag = -1; + +static inline int use_v4_jit(void) +{ +#if defined(__x86_64__) + + if (use_v4_jit_flag != -1) + return use_v4_jit_flag; + + const char *env = getenv("MONERO_USE_CNV4_JIT"); + if (!env) { + use_v4_jit_flag = 1; + } + else if (!strcmp(env, "0") || !strcmp(env, "no")) { + use_v4_jit_flag = 0; + } + else { + use_v4_jit_flag = 1; + } + return use_v4_jit_flag; +#else + return 0; +#endif +} + +#if defined(__x86_64__) || defined(__aarch64__) +static inline int force_software_aes(void) +{ + static int use = -1; + + if (use != -1) + return use; + + const char *env = getenv("MONERO_USE_SOFTWARE_AES"); + if (!env) { + use = 0; + } + else if (!strcmp(env, "0") || !strcmp(env, "no")) { + use = 0; + } + else { + use = 1; + } + return use; +} +#endif + #define VARIANT1_1(p) \ do if (variant == 1) \ { \ @@ -117,48 +183,74 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex #define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \ do if (variant >= 2) \ { \ - const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + if (variant >= 4) \ + { \ + chunk1 = _mm_xor_si128(chunk1, chunk2); \ + 
_c = _mm_xor_si128(_c, chunk3); \ + _c = _mm_xor_si128(_c, chunk1); \ + } \ } while (0) #define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \ do if (variant >= 2) \ { \ - const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \ + uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \ const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \ const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ + if (variant >= 4) \ + { \ + chunk1 = veorq_u64(chunk1, chunk2); \ + _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \ + _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \ + } \ } while (0) -#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \ +#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \ do if (variant >= 2) \ { \ uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \ uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \ uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \ \ - const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \ + uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \ + const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \ + const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \ \ uint64_t b1[2]; \ memcpy_swap64le(b1, b + 16, 2); \ - chunk1[0] = SWAP64LE(SWAP64LE(chunk3[0]) + b1[0]); \ - chunk1[1] = SWAP64LE(SWAP64LE(chunk3[1]) + b1[1]); \ + chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \ + chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \ \ uint64_t a0[2]; \ - memcpy_swap64le(a0, a, 2); \ - chunk3[0] = SWAP64LE(SWAP64LE(chunk2[0]) + a0[0]); \ - chunk3[1] = SWAP64LE(SWAP64LE(chunk2[1]) + a0[1]); \ + memcpy_swap64le(a0, a_, 2); \ + chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \ + chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \ \ uint64_t b0[2]; \ memcpy_swap64le(b0, b, 2); \ - chunk2[0] = SWAP64LE(SWAP64LE(chunk1_old[0]) + b0[0]); \ - chunk2[1] = SWAP64LE(SWAP64LE(chunk1_old[1]) + b0[1]); \ + chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \ + chunk2[1] = SWAP64LE(chunk1_old[1] + b0[1]); \ + if (variant >= 4) \ + { \ + uint64_t out_copy[2]; \ + memcpy_swap64le(out_copy, out, 2); \ + chunk1_old[0] ^= chunk2_old[0]; \ + chunk1_old[1] ^= chunk2_old[1]; \ + out_copy[0] ^= chunk3_old[0]; \ + out_copy[1] ^= chunk3_old[1]; \ + out_copy[0] ^= chunk1_old[0]; \ + out_copy[1] ^= chunk1_old[1]; \ + memcpy_swap64le(out, out_copy, 2); \ + } \ } while (0) #define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \ @@ -201,18 +293,18 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex #endif #define VARIANT2_2_PORTABLE() \ - if (variant >= 2) { \ + if (variant == 2 || variant == 3) { \ xor_blocks(long_state + (j ^ 0x10), d); \ xor_blocks(d, long_state + (j ^ 0x20)); \ } #define VARIANT2_2() \ - do if (variant >= 2) \ + do if (variant == 2 || variant == 3) \ { \ - *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \ - *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \ - hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \ - lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \ + *U64(local_hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \ + *(U64(local_hp_state + (j ^ 0x10)) + 1) ^= 
SWAP64LE(lo); \ + hi ^= SWAP64LE(*U64(local_hp_state + (j ^ 0x20))); \ + lo ^= SWAP64LE(*(U64(local_hp_state + (j ^ 0x20)) + 1)); \ } while (0) #define V4_REG_LOAD(dst, src) \ @@ -225,34 +317,56 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex } while (0) #define VARIANT4_RANDOM_MATH_INIT() \ - v4_reg r[8]; \ - struct V4_Instruction code[TOTAL_LATENCY * ALU_COUNT + 1]; \ + v4_reg r[9]; \ + struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \ + int jit = use_v4_jit(); \ do if (variant >= 4) \ { \ for (int i = 0; i < 4; ++i) \ V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \ v4_random_math_init(code, height); \ + if (jit) \ + { \ + int ret = v4_generate_JIT_code(code, hp_jitfunc, 4096); \ + if (ret < 0) \ + local_abort("Error generating CryptonightR code"); \ + } \ } while (0) #define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \ do if (variant >= 4) \ { \ - uint64_t t; \ - memcpy(&t, b, sizeof(uint64_t)); \ + uint64_t t[2]; \ + memcpy(t, b, sizeof(uint64_t)); \ \ if (sizeof(v4_reg) == sizeof(uint32_t)) \ - t ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \ + t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \ else \ - t ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \ + t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \ \ - memcpy(b, &t, sizeof(uint64_t)); \ + memcpy(b, t, sizeof(uint64_t)); \ \ V4_REG_LOAD(r + 4, a); \ V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \ V4_REG_LOAD(r + 6, _b); \ V4_REG_LOAD(r + 7, _b1); \ + V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \ + \ + if (jit) \ + (*hp_jitfunc)(r); \ + else \ + v4_random_math(code, r); \ \ - v4_random_math(code, r); \ + memcpy(t, a, sizeof(uint64_t) * 2); \ + \ + if (sizeof(v4_reg) == sizeof(uint32_t)) { \ + t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \ + t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \ + } else { \ + t[0] ^= SWAP64LE(r[2] ^ r[3]); \ + t[1] ^= SWAP64LE(r[0] ^ r[1]); \ + } \ + memcpy(a, t, sizeof(uint64_t) * 2); \ } while (0) @@ -318,7 +432,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex #define pre_aes() \ j = state_index(a); \ - _c = _mm_load_si128(R128(&hp_state[j])); \ + _c = _mm_load_si128(R128(&local_hp_state[j])); \ _a = _mm_load_si128(R128(a)); \ /* @@ -331,32 +445,26 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex * This code is based upon an optimized implementation by dga. 
*/ #define post_aes() \ - VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ + VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \ _mm_store_si128(R128(c), _c); \ - _mm_store_si128(R128(&hp_state[j]), _mm_xor_si128(_b, _c)); \ - VARIANT1_1(&hp_state[j]); \ + _mm_store_si128(R128(&local_hp_state[j]), _mm_xor_si128(_b, _c)); \ + VARIANT1_1(&local_hp_state[j]); \ j = state_index(c); \ - p = U64(&hp_state[j]); \ + p = U64(&local_hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_INTEGER_MATH_SSE2(b, c); \ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ - VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ + VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \ a[0] += hi; a[1] += lo; \ - p = U64(&hp_state[j]); \ + p = U64(&local_hp_state[j]); \ p[0] = a[0]; p[1] = a[1]; \ a[0] ^= b[0]; a[1] ^= b[1]; \ VARIANT1_2(p + 1); \ _b1 = _b; \ _b = _c; \ -#if defined(_MSC_VER) -#define THREADV __declspec(thread) -#else -#define THREADV __thread -#endif - #pragma pack(push, 1) union cn_slow_hash_state { @@ -371,6 +479,9 @@ union cn_slow_hash_state THREADV uint8_t *hp_state = NULL; THREADV int hp_allocated = 0; +THREADV v4_random_math_JIT_func hp_jitfunc = NULL; +THREADV uint8_t *hp_jitfunc_memory = NULL; +THREADV int hp_jitfunc_allocated = 0; #if defined(_MSC_VER) #define cpuid(info,x) __cpuidex(info,x,0) @@ -409,25 +520,6 @@ STATIC INLINE void xor64(uint64_t *a, const uint64_t b) * @return true if the CPU supports AES, false otherwise */ -STATIC INLINE int force_software_aes(void) -{ - static int use = -1; - - if (use != -1) - return use; - - const char *env = getenv("MONERO_USE_SOFTWARE_AES"); - if (!env) { - use = 0; - } - else if (!strcmp(env, "0") || !strcmp(env, "no")) { - use = 0; - } - else { - use = 1; - } - return use; -} STATIC INLINE int check_aes_hw(void) { @@ -666,10 +758,10 @@ void cn_slow_hash_allocate_state(void) #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ defined(__DragonFly__) || defined(__NetBSD__) hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, 0, 0); + MAP_PRIVATE | MAP_ANON, -1, 0); #else hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0); + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); #endif if(hp_state == MAP_FAILED) hp_state = NULL; @@ -680,6 +772,35 @@ void cn_slow_hash_allocate_state(void) hp_allocated = 0; hp_state = (uint8_t *) malloc(MEMORY); } + + +#if defined(_MSC_VER) || defined(__MINGW32__) + hp_jitfunc_memory = (uint8_t *) VirtualAlloc(hp_jitfunc_memory, 4096 + 4095, + MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +#else +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__DragonFly__) || defined(__NetBSD__) +#ifdef __NetBSD__ +#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC) +#else +#define RESERVED_FLAGS 0 +#endif + hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | RESERVED_FLAGS, + MAP_PRIVATE | MAP_ANON, -1, 0); +#else + hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#endif + if(hp_jitfunc_memory == MAP_FAILED) + hp_jitfunc_memory = NULL; +#endif + hp_jitfunc_allocated = 1; + if (hp_jitfunc_memory == NULL) + { + hp_jitfunc_allocated = 0; + hp_jitfunc_memory = malloc(4096 + 4095); + } + hp_jitfunc = (v4_random_math_JIT_func)((size_t)(hp_jitfunc_memory + 4095) & ~4095); } /** @@ -702,8 +823,22 @@ void cn_slow_hash_free_state(void) #endif } + if(!hp_jitfunc_allocated) + free(hp_jitfunc_memory); + else + { +#if defined(_MSC_VER) || 
defined(__MINGW32__) + VirtualFree(hp_jitfunc_memory, 0, MEM_RELEASE); +#else + munmap(hp_jitfunc_memory, 4096 + 4095); +#endif + } + hp_state = NULL; hp_allocated = 0; + hp_jitfunc = NULL; + hp_jitfunc_memory = NULL; + hp_jitfunc_allocated = 0; } /** @@ -787,7 +922,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK); - memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); } } else @@ -799,7 +934,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(j = 0; j < INIT_SIZE_BLK; j++) aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); - memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); } } @@ -847,7 +982,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { // add the xor to the pseudo round - aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); } } else @@ -857,7 +992,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int { for(j = 0; j < INIT_SIZE_BLK; j++) { - xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); + xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); } } @@ -877,6 +1012,44 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int } #elif !defined NO_AES && (defined(__arm__) || defined(__aarch64__)) +#ifdef __aarch64__ +#include +THREADV uint8_t *hp_state = NULL; +THREADV int hp_malloced = 0; + +void cn_slow_hash_allocate_state(void) +{ + if(hp_state != NULL) + return; + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0 +#endif + hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + + if(hp_state == MAP_FAILED) + hp_state = NULL; + if(hp_state == NULL) + { + hp_malloced = 1; + hp_state = (uint8_t *) malloc(MEMORY); + } +} + +void cn_slow_hash_free_state(void) +{ + if(hp_state == NULL) + return; + + if (hp_malloced) + free(hp_state); + else + munmap(hp_state, MEMORY); + hp_state = NULL; + hp_malloced = 0; +} +#else void cn_slow_hash_allocate_state(void) { // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c @@ -888,6 +1061,7 @@ void cn_slow_hash_free_state(void) // As above return; } +#endif #if defined(__GNUC__) #define RDATA_ALIGN16 __attribute__ ((aligned(16))) @@ -901,6 +1075,8 @@ void cn_slow_hash_free_state(void) #define U64(x) ((uint64_t *) (x)) +#define hp_jitfunc ((v4_random_math_JIT_func)NULL) + STATIC INLINE void xor64(uint64_t *a, const uint64_t b) { *a ^= b; @@ -926,6 +1102,23 @@ union cn_slow_hash_state * and moving between vector and regular registers stalls the pipeline. 
*/ #include +#ifndef __APPLE__ +#include +#include +#endif + +STATIC INLINE int check_aes_hw(void) +{ +#ifdef __APPLE__ + return 1; +#else + static int supported = -1; + + if(supported < 0) + supported = (getauxval(AT_HWCAP) & HWCAP_AES) != 0; + return supported; +#endif +} #define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) @@ -935,24 +1128,24 @@ union cn_slow_hash_state #define pre_aes() \ j = state_index(a); \ - _c = vld1q_u8(&hp_state[j]); \ + _c = vld1q_u8(&local_hp_state[j]); \ _a = vld1q_u8((const uint8_t *)a); \ #define post_aes() \ - VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ + VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \ vst1q_u8((uint8_t *)c, _c); \ - vst1q_u8(&hp_state[j], veorq_u8(_b, _c)); \ - VARIANT1_1(&hp_state[j]); \ + vst1q_u8(&local_hp_state[j], veorq_u8(_b, _c)); \ + VARIANT1_1(&local_hp_state[j]); \ j = state_index(c); \ - p = U64(&hp_state[j]); \ + p = U64(&local_hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_PORTABLE_INTEGER_MATH(b, c); \ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ - VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ + VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \ a[0] += hi; a[1] += lo; \ - p = U64(&hp_state[j]); \ + p = U64(&local_hp_state[j]); \ p[0] = a[0]; p[1] = a[1]; \ a[0] ^= b[0]; a[1] ^= b[1]; \ VARIANT1_2(p + 1); \ @@ -966,47 +1159,47 @@ union cn_slow_hash_state */ static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) { static const int rcon[] = { - 0x01,0x01,0x01,0x01, - 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d, // rotate-n-splat - 0x1b,0x1b,0x1b,0x1b }; + 0x01,0x01,0x01,0x01, + 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d, // rotate-n-splat + 0x1b,0x1b,0x1b,0x1b }; __asm__( -" eor v0.16b,v0.16b,v0.16b\n" -" ld1 {v3.16b},[%0],#16\n" -" ld1 {v1.4s,v2.4s},[%2],#32\n" -" ld1 {v4.16b},[%0]\n" -" mov w2,#5\n" -" st1 {v3.4s},[%1],#16\n" +" eor v0.16b,v0.16b,v0.16b\n" +" ld1 {v3.16b},[%0],#16\n" +" ld1 {v1.4s,v2.4s},[%2],#32\n" +" ld1 {v4.16b},[%0]\n" +" mov w2,#5\n" +" st1 {v3.4s},[%1],#16\n" "\n" "1:\n" -" tbl v6.16b,{v4.16b},v2.16b\n" -" ext v5.16b,v0.16b,v3.16b,#12\n" -" st1 {v4.4s},[%1],#16\n" -" aese v6.16b,v0.16b\n" -" subs w2,w2,#1\n" +" tbl v6.16b,{v4.16b},v2.16b\n" +" ext v5.16b,v0.16b,v3.16b,#12\n" +" st1 {v4.4s},[%1],#16\n" +" aese v6.16b,v0.16b\n" +" subs w2,w2,#1\n" "\n" -" eor v3.16b,v3.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v3.16b,v3.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v6.16b,v6.16b,v1.16b\n" -" eor v3.16b,v3.16b,v5.16b\n" -" shl v1.16b,v1.16b,#1\n" -" eor v3.16b,v3.16b,v6.16b\n" -" st1 {v3.4s},[%1],#16\n" -" b.eq 2f\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v6.16b,v6.16b,v1.16b\n" +" eor v3.16b,v3.16b,v5.16b\n" +" shl v1.16b,v1.16b,#1\n" +" eor v3.16b,v3.16b,v6.16b\n" +" st1 {v3.4s},[%1],#16\n" +" b.eq 2f\n" "\n" -" dup v6.4s,v3.s[3] // just splat\n" -" ext v5.16b,v0.16b,v4.16b,#12\n" -" aese v6.16b,v0.16b\n" +" dup v6.4s,v3.s[3] // just splat\n" +" ext v5.16b,v0.16b,v4.16b,#12\n" +" aese v6.16b,v0.16b\n" "\n" -" eor v4.16b,v4.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v4.16b,v4.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v4.16b,v4.16b,v5.16b\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" "\n" -" eor v4.16b,v4.16b,v6.16b\n" -" b 1b\n" +" eor v4.16b,v4.16b,v6.16b\n" +" b 1b\n" "\n" "2:\n" : : "r"(key), 
"r"(expandedKey), "r"(rcon)); } @@ -1021,71 +1214,69 @@ __asm__( */ STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks) { - const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; - uint8x16_t tmp; - int i; - - for (i=0; ikey->exp_data); + + memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + } } U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0]; @@ -1173,13 +1390,26 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int _b = vld1q_u8((const uint8_t *)b); _b1 = vld1q_u8(((const uint8_t *)b) + AES_BLOCK_SIZE); - for(i = 0; i < ITER / 2; i++) + if(useAes) + { + for(i = 0; i < ITER / 2; i++) + { + pre_aes(); + _c = vaeseq_u8(_c, zero); + _c = vaesmcq_u8(_c); + _c = veorq_u8(_c, _a); + post_aes(); + } + } + else { - pre_aes(); - _c = vaeseq_u8(_c, zero); - _c = vaesmcq_u8(_c); - _c = veorq_u8(_c, _a); - post_aes(); + for(i = 0; i < ITER / 2; i++) + { + pre_aes(); + aesb_single_round((uint8_t *) &_c, (uint8_t *) &_c, (uint8_t *) &_a); + post_aes(); + } + } /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds @@ -1188,11 +1418,27 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int memcpy(text, state.init, INIT_SIZE_BYTE); - aes_expand_key(&state.hs.b[32], expandedKey); - for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + if(useAes) + { + aes_expand_key(&state.hs.b[32], expandedKey); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + // add the xor to the pseudo round + aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + } + } + else { - // add the xor to the pseudo round - aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + for(j = 0; j < INIT_SIZE_BLK; j++) + { + xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); + aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); + } + } + oaes_free((OAES_CTX **) &aes_ctx); } /* CryptoNight Step 5: Apply Keccak to the state again, and then @@ -1205,10 +1451,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int memcpy(state.init, text, INIT_SIZE_BYTE); hash_permutation(&state.hs); extra_hashes[state.hs.b[0] & 3](&state, 200, hash); - -#ifdef FORCE_USE_HEAP - aligned_free(hp_state); -#endif } #else /* aarch64 && crypto */ @@ -1251,7 +1493,7 @@ void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cres) { #else // !NO_OPTIMIZED_MULTIPLY_ON_ARM #ifdef __aarch64__ /* ARM64, no crypto */ -#define mul(a, b, c) cn_mul128((const uint64_t *)a, (const uint64_t *)b, (uint64_t *)c) +#define mul(a, b, c) cn_mul128((const uint64_t *)a, (const uint64_t *)b, (uint64_t *)c) STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r) { uint64_t lo, hi; @@ -1261,7 +1503,7 @@ STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r) r[1] = lo; } #else /* ARM32 */ -#define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c) +#define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c) STATIC void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r) { uint32_t t0, t1, t2=0, t3=0; @@ -1330,6 +1572,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int { 
uint8_t text[INIT_SIZE_BYTE]; uint8_t a[AES_BLOCK_SIZE]; + uint8_t a1[AES_BLOCK_SIZE]; uint8_t b[AES_BLOCK_SIZE * 2]; uint8_t c[AES_BLOCK_SIZE]; uint8_t c1[AES_BLOCK_SIZE]; @@ -1389,10 +1632,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int // Iteration 1 j = state_index(a); p = &long_state[j]; - aesb_single_round(p, p, a); - copy_block(c1, p); + aesb_single_round(p, c1, a); - VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); + VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j); + copy_block(p, c1); xor_blocks(p, b); VARIANT1_1(p); @@ -1401,14 +1644,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int p = &long_state[j]; copy_block(c, p); + copy_block(a1, a); VARIANT2_PORTABLE_INTEGER_MATH(c, c1); - VARIANT4_RANDOM_MATH(a, c, r, b, b + AES_BLOCK_SIZE); + VARIANT4_RANDOM_MATH(a1, c, r, b, b + AES_BLOCK_SIZE); mul(c1, c, d); VARIANT2_2_PORTABLE(); - VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); - sum_half_blocks(a, d); - swap_blocks(a, c); - xor_blocks(a, c); + VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j); + sum_half_blocks(a1, d); + swap_blocks(a1, c); + xor_blocks(a1, c); VARIANT1_2(U64(c) + 1); copy_block(p, c); @@ -1416,6 +1660,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int copy_block(b + AES_BLOCK_SIZE, b); } copy_block(b, c1); + copy_block(a, a1); } memcpy(text, state.init, INIT_SIZE_BYTE); @@ -1443,7 +1688,9 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int #else // Portable implementation as a fallback -void slow_hash_allocate_state(void) +#define hp_jitfunc ((v4_random_math_JIT_func)NULL) + +void cn_slow_hash_allocate_state(void) { // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c return; @@ -1536,6 +1783,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int union cn_slow_hash_state state; uint8_t text[INIT_SIZE_BYTE]; uint8_t a[AES_BLOCK_SIZE]; + uint8_t a1[AES_BLOCK_SIZE]; uint8_t b[AES_BLOCK_SIZE * 2]; uint8_t c1[AES_BLOCK_SIZE]; uint8_t c2[AES_BLOCK_SIZE]; @@ -1579,7 +1827,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; copy_block(c1, &long_state[j]); aesb_single_round(c1, c1, a); - VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); + VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j); copy_block(&long_state[j], c1); xor_blocks(&long_state[j], b); assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE); @@ -1587,23 +1835,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int /* Iteration 2 */ j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; copy_block(c2, &long_state[j]); + copy_block(a1, a); VARIANT2_PORTABLE_INTEGER_MATH(c2, c1); - VARIANT4_RANDOM_MATH(a, c2, r, b, b + AES_BLOCK_SIZE); + VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE); mul(c1, c2, d); VARIANT2_2_PORTABLE(); - VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); - swap_blocks(a, c1); - sum_half_blocks(c1, d); - swap_blocks(c1, c2); - xor_blocks(c1, c2); + VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j); + sum_half_blocks(a1, d); + swap_blocks(a1, c2); + xor_blocks(a1, c2); VARIANT1_2(c2 + 8); copy_block(&long_state[j], c2); - assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE); if (variant >= 2) { copy_block(b + AES_BLOCK_SIZE, b); } - copy_block(b, a); - copy_block(a, c1); + copy_block(b, c1); + copy_block(a, a1); } memcpy(text, state.init, INIT_SIZE_BYTE); diff 
--git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h index 2c190287b..78adf902e 100644 --- a/src/crypto/variant4_random_math.h +++ b/src/crypto/variant4_random_math.h @@ -11,7 +11,13 @@ enum V4_Settings // Always generate at least 60 instructions NUM_INSTRUCTIONS = 60, - + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + // Available ALUs for MUL // Modern CPUs typically have only 1 ALU which can do multiplications ALU_COUNT_MUL = 1, -- 2.25.1
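A quick way to sanity-check the clearnet seed nodes added in the second patch (seed nodes) is a plain TCP connect against the P2P port 34567 used throughout that list. The probe below is a minimal standalone sketch, not part of the patch: the addresses are copied from the newly added lines, while the file name and the `probe_seed` helper are made up for illustration.

```
/* seed_probe.c -- standalone sketch (not from the patch): TCP-connect test
 * against the clearnet seed nodes added in PATCH 2/3, all on P2P port 34567.
 * Build: cc -o seed_probe seed_probe.c */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

/* Illustrative helper name; returns 0 if the TCP handshake completes. */
static int probe_seed(const char *ip, uint16_t port)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
        return -1;

    /* On Linux, SO_SNDTIMEO also bounds connect(), so a dead host fails fast. */
    struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
    setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));

    struct sockaddr_in sa;
    memset(&sa, 0, sizeof(sa));
    sa.sin_family = AF_INET;
    sa.sin_port = htons(port);
    if (inet_pton(AF_INET, ip, &sa.sin_addr) != 1) {
        close(fd);
        return -1;
    }

    int rc = connect(fd, (struct sockaddr *)&sa, sizeof(sa));
    close(fd);
    return rc;
}

int main(void)
{
    /* The five addresses newly added by the patch. */
    const char *seeds[] = {
        "142.93.144.79", "51.75.76.161", "145.239.93.75",
        "88.198.199.23", "167.114.119.46",
    };
    for (size_t i = 0; i < sizeof(seeds) / sizeof(seeds[0]); i++)
        printf("%-16s %s\n", seeds[i],
               probe_seed(seeds[i], 34567) == 0 ? "reachable" : "unreachable");
    return 0;
}
```

The onion seeds added in the same patch cannot be checked this way; they have to be reached through a Tor SOCKS proxy, as the wallet example in the README section does.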
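The third patch (slow-hash.c update) gates the rewritten aarch64 path behind a runtime capability check (`getauxval(AT_HWCAP) & HWCAP_AES` in `check_aes_hw()`), falling back to the software `oaes` rounds when the AES extension is absent. The probe below exercises the same capability bit on its own, to show which path a given device would take; it is a sketch assuming a Linux/glibc target and is not code taken from the patch.

```
/* aes_probe.c -- standalone sketch: report whether the AArch64 AES extension
 * is present, i.e. whether the patched slow-hash.c would use its hardware
 * AES path. Build on an aarch64 Linux host: cc -o aes_probe aes_probe.c */
#include <stdio.h>
#if defined(__aarch64__) && defined(__linux__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif

int main(void)
{
#if defined(__aarch64__) && defined(__linux__)
    /* Same capability bit the patch tests in check_aes_hw(). */
    unsigned long hwcap = getauxval(AT_HWCAP);
    printf("HWCAP_AES: %s\n", (hwcap & HWCAP_AES) ? "yes (hardware AES path)"
                                                  : "no (software AES fallback)");
#else
    printf("Not an aarch64 Linux build; the runtime check does not apply here.\n");
#endif
    return 0;
}
```

On armv7a (the target the commit message is about) there is no such hardware path; that build always takes the portable fallback, which is why bringing slow-hash.c up to date with upstream Monero is enough to restore the compile.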