From 126196b017cd93ff399212b7315f9053511afb07 Mon Sep 17 00:00:00 2001 From: moneromooo-monero Date: Sun, 25 Mar 2018 12:17:37 +0100 Subject: [PATCH] multiexp: some speedups - use a raw memory block to store cache - use aligned memory - use doubling API where appropriate - calculate straus in bands --- src/ringct/bulletproofs.cc | 3 +- src/ringct/multiexp.cc | 144 ++++++++++++++++++++++++++++++------- src/ringct/multiexp.h | 2 +- 3 files changed, 121 insertions(+), 28 deletions(-) diff --git a/src/ringct/bulletproofs.cc b/src/ringct/bulletproofs.cc index e2540fb22..94be0e545 100644 --- a/src/ringct/bulletproofs.cc +++ b/src/ringct/bulletproofs.cc @@ -70,8 +70,9 @@ static boost::mutex init_mutex; static inline rct::key multiexp(const std::vector &data, bool HiGi) { + static const size_t STEP = getenv("STRAUS_STEP") ? atoi(getenv("STRAUS_STEP")) : 0; if (HiGi || data.size() < 1000) - return straus(data, HiGi ? HiGi_cache: NULL); + return straus(data, HiGi ? HiGi_cache: NULL, STEP); else return bos_coster_heap_conv_robust(data); } diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc index 4f16bd588..99bef25f3 100644 --- a/src/ringct/multiexp.cc +++ b/src/ringct/multiexp.cc @@ -34,6 +34,7 @@ extern "C" { #include "crypto/crypto-ops.h" } +#include "common/aligned.h" #include "rctOps.h" #include "multiexp.h" @@ -43,6 +44,17 @@ extern "C" //#define MULTIEXP_PERF(x) x #define MULTIEXP_PERF(x) +#define RAW_MEMORY_BLOCK +//#define ALTERNATE_LAYOUT +//#define TRACK_STRAUS_ZERO_IDENTITY + +// per points us for N/B points (B point bands) +// raw alt 128/192 4096/192 4096/4096 +// 0 0 52.6 71 71.2 +// 0 1 53.2 72.2 72.4 +// 1 0 52.7 67 67.1 +// 1 1 52.8 70.4 70.2 + namespace rct { @@ -198,6 +210,7 @@ rct::key bos_coster_heap_conv_robust(std::vector data) ge_cached cached; ge_p1p1 p1; + ge_p2 p2; MULTIEXP_PERF(PERF_TIMER_RESUME(div)); while (1) @@ -214,8 +227,8 @@ rct::key bos_coster_heap_conv_robust(std::vector data) std::push_heap(heap.begin(), heap.end(), Comp); } data[index1].scalar = div2(data[index1].scalar); - ge_p3_to_cached(&cached, &data[index1].point); - ge_add(&p1, &data[index1].point, &cached); + ge_p3_to_p2(&p2, &data[index1].point); + ge_p2_dbl(&p1, &p2); ge_p1p1_to_p3(&data[index1].point, &p1); } MULTIEXP_PERF(PERF_TIMER_PAUSE(div)); @@ -259,12 +272,32 @@ rct::key bos_coster_heap_conv_robust(std::vector data) return res; } +static constexpr unsigned int STRAUS_C = 4; + struct straus_cached_data { +#ifdef RAW_MEMORY_BLOCK + size_t size; + ge_cached *multiples; + straus_cached_data(): size(0), multiples(NULL) {} + ~straus_cached_data() { aligned_free(multiples); } +#else std::vector> multiples; +#endif }; - -static constexpr unsigned int STRAUS_C = 4; +#ifdef RAW_MEMORY_BLOCK +#ifdef ALTERNATE_LAYOUT +#define CACHE_OFFSET(cache,point,digit) cache->multiples[(point)*((1<multiples[(point)+cache->size*((digit)-1)] +#endif +#else +#ifdef ALTERNATE_LAYOUT +#define CACHE_OFFSET(cache,point,digit) local_cache->multiples[j][digit-1] +#else +#define CACHE_OFFSET(cache,point,digit) local_cache->multiples[digit][j] +#endif +#endif std::shared_ptr straus_init_cache(const std::vector &data) { @@ -274,6 +307,36 @@ std::shared_ptr straus_init_cache(const std::vector cache(new straus_cached_data()); +#ifdef RAW_MEMORY_BLOCK + const size_t offset = cache->size; + cache->multiples = (ge_cached*)aligned_realloc(cache->multiples, sizeof(ge_cached) * ((1<size = data.size(); + for (size_t j=offset;jmultiples.size(); + cache->multiples.resize(std::max(offset, data.size())); + for (size_t i = offset; i < data.size(); ++i) + { + cache->multiples[i].resize((1<multiples[i][0], &data[i].point); + for (size_t j=2;j<1<multiples[i][j-2]); + ge_p1p1_to_p3(&p3, &p1); + ge_p3_to_cached(&cache->multiples[i][j-1], &p3); + } + } +#else cache->multiples.resize(1<multiples[1].size(); cache->multiples[1].resize(std::max(offset, data.size())); @@ -290,6 +353,8 @@ std::shared_ptr straus_init_cache(const std::vectormultiples[i][j], &p3); } } +#endif +#endif MULTIEXP_PERF(PERF_TIMER_STOP(multiples)); return cache; @@ -298,15 +363,20 @@ std::shared_ptr straus_init_cache(const std::vector &cache) { size_t sz = 0; +#ifdef RAW_MEMORY_BLOCK + sz += cache->size * sizeof(ge_cached) * ((1<multiples) - sz += e0.size() * sizeof(ge_p3); + sz += e0.size() * sizeof(ge_cached); +#endif return sz; } -rct::key straus(const std::vector &data, const std::shared_ptr &cache) +rct::key straus(const std::vector &data, const std::shared_ptr &cache, size_t STEP) { MULTIEXP_PERF(PERF_TIMER_UNIT(straus, 1000000)); bool HiGi = cache != NULL; + STEP = STEP ? STEP : 192; MULTIEXP_PERF(PERF_TIMER_START_UNIT(setup, 1000000)); static constexpr unsigned int mask = (1< &data, const std::shared_ptr skip(data.size()); for (size_t i = 0; i < data.size(); ++i) skip[i] = data[i].scalar == rct::zero() || !memcmp(&data[i].point, &ge_p3_identity, sizeof(ge_p3)); + MULTIEXP_PERF(PERF_TIMER_STOP(skip)); +#endif MULTIEXP_PERF(PERF_TIMER_START_UNIT(digits, 1000000)); std::vector> digits; @@ -361,35 +435,53 @@ rct::key straus(const std::vector &data, const std::shared_ptrmultiples[digit][j]); - ge_p1p1_to_p3(&res_p3, &p1); +#ifdef TRACK_STRAUS_ZERO_IDENTITY + if (skip[j]) + continue; +#endif + const uint8_t digit = digits[j][i]; + if (digit) + { + ge_add(&p1, &band_p3, &CACHE_OFFSET(local_cache, j, digit)); + ge_p1p1_to_p3(&band_p3, &p1); + } } } + + ge_p3_to_cached(&cached, &band_p3); + ge_add(&p1, &res_p3, &cached); + ge_p1p1_to_p3(&res_p3, &p1); } rct::key res; diff --git a/src/ringct/multiexp.h b/src/ringct/multiexp.h index 44998e2e0..c08c70858 100644 --- a/src/ringct/multiexp.h +++ b/src/ringct/multiexp.h @@ -59,7 +59,7 @@ rct::key bos_coster_heap_conv(std::vector data); rct::key bos_coster_heap_conv_robust(std::vector data); std::shared_ptr straus_init_cache(const std::vector &data); size_t straus_get_cache_size(const std::shared_ptr &cache); -rct::key straus(const std::vector &data, const std::shared_ptr &cache = NULL); +rct::key straus(const std::vector &data, const std::shared_ptr &cache = NULL, size_t STEP = 0); }