From 11227e0ba24c9c2c38ab5691e3b5950358f84175 Mon Sep 17 00:00:00 2001
From: moneromooo-monero <moneromooo-monero@users.noreply.github.com>
Date: Fri, 11 Jan 2019 01:36:59 +0000
Subject: [PATCH] mnemonics: compare canonical words (lowercase)

---
 src/mnemonics/electrum-words.cpp | 43 +++++++-------
 src/mnemonics/language_base.h    | 96 ++++++++++++++++++++++++++++++--
 2 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/src/mnemonics/electrum-words.cpp b/src/mnemonics/electrum-words.cpp
index b1e3bdcd5..68d5b84d3 100644
--- a/src/mnemonics/electrum-words.cpp
+++ b/src/mnemonics/electrum-words.cpp
@@ -76,8 +76,8 @@ namespace crypto
 namespace
 {
   uint32_t create_checksum_index(const std::vector<epee::wipeable_string> &word_list,
-    uint32_t unique_prefix_length);
-  bool checksum_test(std::vector<epee::wipeable_string> seed, uint32_t unique_prefix_length);
+    const Language::Base *language);
+  bool checksum_test(std::vector<epee::wipeable_string> seed, const Language::Base *language);
 
   /*!
    * \brief Finds the word list that contains the seed words and puts the indices
@@ -116,8 +116,8 @@ namespace
     for (std::vector<Language::Base*>::iterator it1 = language_instances.begin();
       it1 != language_instances.end(); it1++)
     {
-      const std::unordered_map<epee::wipeable_string, uint32_t> &word_map = (*it1)->get_word_map();
-      const std::unordered_map<epee::wipeable_string, uint32_t> &trimmed_word_map = (*it1)->get_trimmed_word_map();
+      const std::unordered_map<epee::wipeable_string, uint32_t, Language::WordHash, Language::WordEqual> &word_map = (*it1)->get_word_map();
+      const std::unordered_map<epee::wipeable_string, uint32_t, Language::WordHash, Language::WordEqual> &trimmed_word_map = (*it1)->get_trimmed_word_map();
       // To iterate through seed words
       bool full_match = true;
 
@@ -151,7 +151,7 @@ namespace
         // if we were using prefix only, and we have a checksum, check it now
         // to avoid false positives due to prefix set being too common
         if (has_checksum)
-          if (!checksum_test(seed, (*it1)->get_unique_prefix_length()))
+          if (!checksum_test(seed, *it1))
           {
             fallback = *it1;
             full_match = false;
@@ -190,20 +190,20 @@ namespace
    * \return                      Checksum index
    */
   uint32_t create_checksum_index(const std::vector<epee::wipeable_string> &word_list,
-    uint32_t unique_prefix_length)
+    const Language::Base *language)
   {
-    epee::wipeable_string trimmed_words = "";
+    epee::wipeable_string trimmed_words = "", word;
 
+    const auto &word_map = language->get_word_map();
+    const auto &trimmed_word_map = language->get_trimmed_word_map();
+    const uint32_t unique_prefix_length = language->get_unique_prefix_length();
     for (std::vector<epee::wipeable_string>::const_iterator it = word_list.begin(); it != word_list.end(); it++)
     {
-      if (it->length() > unique_prefix_length)
-      {
-        trimmed_words += Language::utf8prefix(*it, unique_prefix_length);
-      }
-      else
-      {
-        trimmed_words += *it;
-      }
+      word = Language::utf8prefix(*it, unique_prefix_length);
+      auto it2 = trimmed_word_map.find(word);
+      if (it2 == trimmed_word_map.end())
+        throw std::runtime_error("Word \"" + std::string(word.data(), word.size()) + "\" not found in trimmed word map in " + language->get_english_language_name());
+      trimmed_words += it2->first;
     }
     boost::crc_32_type result;
     result.process_bytes(trimmed_words.data(), trimmed_words.length());
@@ -216,7 +216,7 @@ namespace
    * \param unique_prefix_length  the prefix length of each word to use for checksum
    * \return                      True if the test passed false if not.
    */
-  bool checksum_test(std::vector<epee::wipeable_string> seed, uint32_t unique_prefix_length)
+  bool checksum_test(std::vector<epee::wipeable_string> seed, const Language::Base *language)
   {
     if (seed.empty())
       return false;
@@ -224,13 +224,16 @@ namespace
     epee::wipeable_string last_word = seed.back();
     seed.pop_back();
 
-    epee::wipeable_string checksum = seed[create_checksum_index(seed, unique_prefix_length)];
+    const uint32_t unique_prefix_length = language->get_unique_prefix_length();
+
+    auto idx = create_checksum_index(seed, language);
+    epee::wipeable_string checksum = seed[idx];
 
     epee::wipeable_string trimmed_checksum = checksum.length() > unique_prefix_length ? Language::utf8prefix(checksum, unique_prefix_length) :
       checksum;
     epee::wipeable_string trimmed_last_word = last_word.length() > unique_prefix_length ? Language::utf8prefix(last_word, unique_prefix_length) :
       last_word;
-    bool ret = trimmed_checksum == trimmed_last_word;
+    bool ret = Language::WordEqual()(trimmed_checksum, trimmed_last_word);
     MINFO("Checksum is " << (ret ? "valid" : "invalid"));
     return ret;
   }
@@ -301,7 +304,7 @@ namespace crypto
 
       if (has_checksum)
       {
-        if (!checksum_test(seed, language->get_unique_prefix_length()))
+        if (!checksum_test(seed, language))
         {
           // Checksum fail
           MERROR("Invalid seed: invalid checksum");
@@ -424,7 +427,7 @@ namespace crypto
         memwipe(w, sizeof(w));
       }
 
-      words += words_store[create_checksum_index(words_store, language->get_unique_prefix_length())];
+      words += words_store[create_checksum_index(words_store, language)];
       return true;
     }
 
diff --git a/src/mnemonics/language_base.h b/src/mnemonics/language_base.h
index 52e784cef..a6f969604 100644
--- a/src/mnemonics/language_base.h
+++ b/src/mnemonics/language_base.h
@@ -38,7 +38,9 @@
 #include <vector>
 #include <unordered_map>
 #include <string>
+#include <boost/algorithm/string.hpp>
 #include "misc_log_ex.h"
+#include "fnv1.h"
 
 /*!
  * \namespace Language
@@ -71,6 +73,92 @@ namespace Language
     return prefix;
   }
 
+  /*! \brief Lowercases a UTF-8 string one code point at a time with std::towlower, re-encoding
+   *  each code point in as many bytes as it was read from. \throw std::runtime_error on an invalid
+   *  lead byte or a truncated sequence; continuation bytes are not otherwise validated. */
+  template<typename T>
+  inline T utf8canonical(const T &s)
+  {
+    T sc = "";
+    size_t avail = s.size();
+    const char *ptr = s.data();
+    wint_t cp = 0;
+    int bytes = 1;
+    char wbuf[8], *wptr;
+    while (avail--) // accounts for the lead byte; each branch accounts for its continuation bytes
+    {
+      if ((*ptr & 0x80) == 0)
+      {
+        cp = *ptr++;
+        bytes = 1;
+      }
+      else if ((*ptr & 0xe0) == 0xc0)
+      {
+        if (avail < 1)
+          throw std::runtime_error("Invalid UTF-8");
+        cp = (*ptr++ & 0x1f) << 6;
+        cp |= *ptr++ & 0x3f;
+        --avail;
+        bytes = 2;
+      }
+      else if ((*ptr & 0xf0) == 0xe0)
+      {
+        if (avail < 2)
+          throw std::runtime_error("Invalid UTF-8");
+        cp = (*ptr++ & 0xf) << 12;
+        cp |= (*ptr++ & 0x3f) << 6;
+        cp |= *ptr++ & 0x3f;
+        avail -= 2;
+        bytes = 3;
+      }
+      else if ((*ptr & 0xf8) == 0xf0)
+      {
+        if (avail < 3)
+          throw std::runtime_error("Invalid UTF-8");
+        cp = (*ptr++ & 0x7) << 18;
+        cp |= (*ptr++ & 0x3f) << 12;
+        cp |= (*ptr++ & 0x3f) << 6;
+        cp |= *ptr++ & 0x3f;
+        avail -= 3;
+        bytes = 4;
+      }
+      else
+        throw std::runtime_error("Invalid UTF-8");
+
+      cp = std::towlower(cp);
+      wptr = wbuf;
+      switch (bytes) // every output byte is assigned through *wptr++ so all `bytes` bytes of wbuf are set
+      {
+        case 1: *wptr++ = cp; break;
+        case 2: *wptr++ = 0xc0 | (cp >> 6); *wptr++ = 0x80 | (cp & 0x3f); break;
+        case 3: *wptr++ = 0xe0 | (cp >> 12); *wptr++ = 0x80 | ((cp >> 6) & 0x3f); *wptr++ = 0x80 | (cp & 0x3f); break;
+        case 4: *wptr++ = 0xf0 | (cp >> 18); *wptr++ = 0x80 | ((cp >> 12) & 0x3f); *wptr++ = 0x80 | ((cp >> 6) & 0x3f); *wptr++ = 0x80 | (cp & 0x3f); break;
+        default: throw std::runtime_error("Invalid UTF-8");
+      }
+      sc += T(wbuf, bytes);
+    }
+    return sc;
+  }
+
+  struct WordHash // hash functor for the word maps: FNV1a over the utf8canonical() form of the key
+  {
+    std::size_t operator()(const epee::wipeable_string &s) const
+    {
+      const auto folded = utf8canonical(s); // the same canonical form that WordEqual compares
+      return epee::fnv::FNV1a(folded.data(), folded.size());
+    }
+  };
+
+  /*! \brief Equality predicate for the word maps: two words are equal when their
+   *  utf8canonical() forms are equal. */
+  struct WordEqual
+  {
+    bool operator()(const epee::wipeable_string &s0, const epee::wipeable_string &s1) const
+    {
+      return utf8canonical(s0) == utf8canonical(s1);
+    }
+  };
+
   /*!
    * \class Base
    * \brief A base language class which all languages have to inherit from for
@@ -87,8 +175,8 @@ namespace Language
       NWORDS = 1626
     };
     std::vector<std::string> word_list; /*!< A pointer to the array of words */
-    std::unordered_map<epee::wipeable_string, uint32_t> word_map; /*!< hash table to find word's index */
-    std::unordered_map<epee::wipeable_string, uint32_t> trimmed_word_map; /*!< hash table to find word's trimmed index */
+    std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> word_map; /*!< hash table to find word's index */
+    std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> trimmed_word_map; /*!< hash table to find word's trimmed index */
     std::string language_name; /*!< Name of language */
     std::string english_language_name; /*!< Name of language */
     uint32_t unique_prefix_length; /*!< Number of unique starting characters to trim the wordlist to when matching */
@@ -159,7 +247,7 @@ namespace Language
      * \brief Returns a pointer to the word map.
      * \return A pointer to the word map.
      */
-    const std::unordered_map<epee::wipeable_string, uint32_t>& get_word_map() const
+    const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_word_map() const // const reference to word_map; keys hash/compare via WordHash/WordEqual
     {
       return word_map;
     }
@@ -167,7 +255,7 @@ namespace Language
      * \brief Returns a pointer to the trimmed word map.
      * \return A pointer to the trimmed word map.
      */
-    const std::unordered_map<epee::wipeable_string, uint32_t>& get_trimmed_word_map() const
+    const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_trimmed_word_map() const // const reference to trimmed_word_map; keys hash/compare via WordHash/WordEqual
     {
       return trimmed_word_map;
     }