From 76a61ab006414a39536bb81fa86db36aaaaeef0a Mon Sep 17 00:00:00 2001 From: moneromooo-monero Date: Wed, 26 Dec 2018 09:46:41 +0000 Subject: [PATCH] epee: speedup word/number matching Number matching semantics are slightly changed: since this is used as a filter to check whether a number is signed and/or floating point, we can speed this up further. strto* functions are called afterwards and will error out where necessary. We now also accept numbers like .4 which were not accepted before. The strto* calls on a boost::string_ref will not access unallocated memory since the parsers always stop at the first bad character, and the original string is zero terminated. in arbitrary time measurement units for some arbitrary test case: match_number2: 235 -> 70 match_word2: 330 -> 108 --- .../include/storages/parserse_base_utils.h | 72 ++++++++++++---- .../storages/portable_storage_from_json.h | 62 +++++++------- tests/unit_tests/epee_utils.cpp | 84 +++++++++++++++++++ 3 files changed, 173 insertions(+), 45 deletions(-) diff --git a/contrib/epee/include/storages/parserse_base_utils.h b/contrib/epee/include/storages/parserse_base_utils.h index d73fbde3a..69b650cd4 100644 --- a/contrib/epee/include/storages/parserse_base_utils.h +++ b/contrib/epee/include/storages/parserse_base_utils.h @@ -29,6 +29,7 @@ #pragma once #include +#include namespace epee { @@ -36,6 +37,40 @@ namespace misc_utils { namespace parse { + // 1: digit + // 2: .eE (floating point) + // 4: alpha + // 8: whitespace + // 16: allowed in float but doesn't necessarily mean it's a float + static const constexpr uint8_t lut[256]={ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 0, 0, // 16 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 32 + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 16, 18, 0, // 48 + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 0, 0, 0, 0, 0, 0, // 64 + 0, 4, 4, 4, 4, 22, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 80 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 96 + 0, 4, 4, 4, 4, 22, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 128 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + inline bool isspace(char c) + { + return lut[(uint8_t)c] & 8; + } + + inline bool isdigit(char c) + { + return lut[(uint8_t)c] & 1; + } + inline std::string transform_to_escape_sequence(const std::string& src) { static const char escaped[] = "\b\f\n\r\t\v\"\\/"; @@ -159,25 +194,34 @@ namespace misc_utils return false; } } - inline void match_number2(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, std::string& val, bool& is_float_val, bool& is_signed_val) + inline void match_number2(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, boost::string_ref& val, bool& is_float_val, bool& is_signed_val) { val.clear(); - is_float_val = false; - for(std::string::const_iterator it = star_end_string;it != buf_end;it++) + uint8_t float_flag = 0; + is_signed_val = false; + size_t chars = 0; + std::string::const_iterator it = star_end_string; + if (it != buf_end && *it == '-') + { + is_signed_val = true; + ++chars; + ++it; + } + for(;it != buf_end;it++) { - if(isdigit(*it) || (it == star_end_string && *it == '-') || (val.size() && *it == '.' ) || (is_float_val && (*it == 'e' || *it == 'E' || *it == '-' || *it == '+' )) ) + const uint8_t flags = lut[(uint8_t)*it]; + if (flags & 16) { - if(!val.size() && *it == '-') - is_signed_val = true; - if(*it == '.' ) - is_float_val = true; - val.push_back(*it); + float_flag |= flags; + ++chars; } else { + val = boost::string_ref(&*star_end_string, chars); if(val.size()) { star_end_string = --it; + is_float_val = !!(float_flag & 2); return; } else @@ -186,7 +230,7 @@ namespace misc_utils } ASSERT_MES_AND_THROW("wrong number in json entry: " << std::string(star_end_string, buf_end)); } - inline bool match_number(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, std::string& val) + inline bool match_number(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, boost::string_ref& val) { try { @@ -199,15 +243,15 @@ namespace misc_utils return false; } } - inline void match_word2(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, std::string& val) + inline void match_word2(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, boost::string_ref& val) { val.clear(); for(std::string::const_iterator it = star_end_string;it != buf_end;it++) { - if(!isalpha(*it)) + if (!(lut[(uint8_t)*it] & 4)) { - val.assign(star_end_string, it); + val = boost::string_ref(&*star_end_string, std::distance(star_end_string, it)); if(val.size()) { star_end_string = --it; @@ -218,7 +262,7 @@ namespace misc_utils } ASSERT_MES_AND_THROW("failed to match word number in json entry: " << std::string(star_end_string, buf_end)); } - inline bool match_word(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, std::string& val) + inline bool match_word(std::string::const_iterator& star_end_string, std::string::const_iterator buf_end, boost::string_ref& val) { try { diff --git a/contrib/epee/include/storages/portable_storage_from_json.h b/contrib/epee/include/storages/portable_storage_from_json.h index 0307b732c..3e3052541 100644 --- a/contrib/epee/include/storages/portable_storage_from_json.h +++ b/contrib/epee/include/storages/portable_storage_from_json.h @@ -39,7 +39,7 @@ namespace epee { namespace json { -#define CHECK_ISSPACE() if(!isspace(*it)){ ASSERT_MES_AND_THROW("Wrong JSON character at: " << std::string(it, buf_end));} +#define CHECK_ISSPACE() if(!epee::misc_utils::parse::isspace(*it)){ ASSERT_MES_AND_THROW("Wrong JSON character at: " << std::string(it, buf_end));} /*inline void parse_error() { @@ -114,11 +114,11 @@ namespace epee std::string val; match_string2(it, buf_end, val); //insert text value - stg.set_value(name, val, current_section); + stg.set_value(name, std::move(val), current_section); state = match_state_wonder_after_value; - }else if (isdigit(*it) || *it == '-') + }else if (epee::misc_utils::parse::isdigit(*it) || *it == '-') {//just a named number value started - std::string val; + boost::string_ref val; bool is_v_float = false;bool is_signed = false; match_number2(it, buf_end, val, is_v_float, is_signed); if(!is_v_float) @@ -126,27 +126,27 @@ namespace epee if(is_signed) { errno = 0; - int64_t nval = strtoll(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + int64_t nval = strtoll(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); stg.set_value(name, nval, current_section); }else { errno = 0; - uint64_t nval = strtoull(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + uint64_t nval = strtoull(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); stg.set_value(name, nval, current_section); } }else { errno = 0; - double nval = strtod(val.c_str(), NULL); - if (errno) throw std::runtime_error("Invalid number: " + val); + double nval = strtod(val.data(), NULL); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); stg.set_value(name, nval, current_section); } state = match_state_wonder_after_value; }else if(isalpha(*it) ) {// could be null, true or false - std::string word; + boost::string_ref word; match_word2(it, buf_end, word); if(boost::iequals(word, "null")) { @@ -203,13 +203,13 @@ namespace epee //mean array of strings std::string val; match_string2(it, buf_end, val); - h_array = stg.insert_first_value(name, val, current_section); + h_array = stg.insert_first_value(name, std::move(val), current_section); CHECK_AND_ASSERT_THROW_MES(h_array, " failed to insert values entry"); state = match_state_array_after_value; array_md = array_mode_string; - }else if (isdigit(*it) || *it == '-') + }else if (epee::misc_utils::parse::isdigit(*it) || *it == '-') {//array of numbers value started - std::string val; + boost::string_ref val; bool is_v_float = false;bool is_signed_val = false; match_number2(it, buf_end, val, is_v_float, is_signed_val); if(!is_v_float) @@ -217,22 +217,22 @@ namespace epee if (is_signed_val) { errno = 0; - int64_t nval = strtoll(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + int64_t nval = strtoll(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); h_array = stg.insert_first_value(name, nval, current_section); }else { errno = 0; - uint64_t nval = strtoull(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + uint64_t nval = strtoull(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); h_array = stg.insert_first_value(name, nval, current_section); } CHECK_AND_ASSERT_THROW_MES(h_array, " failed to insert values section entry"); }else { errno = 0; - double nval = strtod(val.c_str(), NULL); - if (errno) throw std::runtime_error("Invalid number: " + val); + double nval = strtod(val.data(), NULL); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); h_array = stg.insert_first_value(name, nval, current_section); CHECK_AND_ASSERT_THROW_MES(h_array, " failed to insert values section entry"); } @@ -245,7 +245,7 @@ namespace epee state = match_state_wonder_after_value; }else if(isalpha(*it) ) {// array of booleans - std::string word; + boost::string_ref word; match_word2(it, buf_end, word); if(boost::iequals(word, "true")) { @@ -291,15 +291,15 @@ namespace epee { std::string val; match_string2(it, buf_end, val); - bool res = stg.insert_next_value(h_array, val); + bool res = stg.insert_next_value(h_array, std::move(val)); CHECK_AND_ASSERT_THROW_MES(res, "failed to insert values"); state = match_state_array_after_value; }else CHECK_ISSPACE(); break; case array_mode_numbers: - if (isdigit(*it) || *it == '-') + if (epee::misc_utils::parse::isdigit(*it) || *it == '-') {//array of numbers value started - std::string val; + boost::string_ref val; bool is_v_float = false;bool is_signed_val = false; match_number2(it, buf_end, val, is_v_float, is_signed_val); bool insert_res = false; @@ -308,21 +308,21 @@ namespace epee if (is_signed_val) { errno = 0; - int64_t nval = strtoll(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + int64_t nval = strtoll(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); insert_res = stg.insert_next_value(h_array, nval); }else { errno = 0; - uint64_t nval = strtoull(val.c_str(), NULL, 10); - if (errno) throw std::runtime_error("Invalid number: " + val); + uint64_t nval = strtoull(val.data(), NULL, 10); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); insert_res = stg.insert_next_value(h_array, nval); } }else { errno = 0; - double nval = strtod(val.c_str(), NULL); - if (errno) throw std::runtime_error("Invalid number: " + val); + double nval = strtod(val.data(), NULL); + if (errno) throw std::runtime_error("Invalid number: " + std::string(val)); insert_res = stg.insert_next_value(h_array, nval); } CHECK_AND_ASSERT_THROW_MES(insert_res, "Failed to insert next value"); @@ -333,7 +333,7 @@ namespace epee case array_mode_booleans: if(isalpha(*it) ) {// array of booleans - std::string word; + boost::string_ref word; match_word2(it, buf_end, word); if(boost::iequals(word, "true")) { diff --git a/tests/unit_tests/epee_utils.cpp b/tests/unit_tests/epee_utils.cpp index 3915c8903..d18fc26bb 100644 --- a/tests/unit_tests/epee_utils.cpp +++ b/tests/unit_tests/epee_utils.cpp @@ -50,6 +50,7 @@ #include "p2p/net_peerlist_boost_serialization.h" #include "span.h" #include "string_tools.h" +#include "storages/parserse_base_utils.h" namespace { @@ -833,3 +834,86 @@ TEST(net_buffer, move) ASSERT_TRUE(!memcmp(span.data() + 1, std::string(4000, '0').c_str(), 4000)); } +TEST(parsing, isspace) +{ + ASSERT_FALSE(epee::misc_utils::parse::isspace(0)); + for (int c = 1; c < 256; ++c) + { + ASSERT_EQ(epee::misc_utils::parse::isspace(c), strchr("\r\n\t\f\v ", c) != NULL); + } +} + +TEST(parsing, isdigit) +{ + ASSERT_FALSE(epee::misc_utils::parse::isdigit(0)); + for (int c = 1; c < 256; ++c) + { + ASSERT_EQ(epee::misc_utils::parse::isdigit(c), strchr("0123456789", c) != NULL); + } +} + +TEST(parsing, number) +{ + boost::string_ref val; + std::string s; + std::string::const_iterator i; + + // the parser expects another character to end the number, and accepts things + // that aren't numbers, as it's meant as a pre-filter for strto* functions, + // so we just check that numbers get accepted, but don't test non numbers + + s = "0 "; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "0"); + + s = "000 "; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "000"); + + s = "10x"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "10"); + + s = "10.09/"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "10.09"); + + s = "-1.r"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "-1."); + + s = "-49.;"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "-49."); + + s = "0.78/"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "0.78"); + + s = "33E9$"; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "33E9"); + + s = ".34e2="; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, ".34e2"); + + s = "-9.34e-2="; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "-9.34e-2"); + + s = "+9.34e+03="; + i = s.begin(); + epee::misc_utils::parse::match_number(i, s.end(), val); + ASSERT_EQ(val, "+9.34e+03"); +}