From 9b3085a429660dd442176ba63de196a476edc4e0 Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 27 Jan 2020 22:38:26 +0100 Subject: [PATCH 1/8] Add test cases to check for exact token matching --- libgnucash/engine/test/gtest-import-map.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libgnucash/engine/test/gtest-import-map.cpp b/libgnucash/engine/test/gtest-import-map.cpp index f2e4509e61..33d64ab18e 100644 --- a/libgnucash/engine/test/gtest-import-map.cpp +++ b/libgnucash/engine/test/gtest-import-map.cpp @@ -271,6 +271,14 @@ TEST_F(ImapBayesTest, FindAccountBayes) EXPECT_EQ(t_expense_account2, account); account = gnc_account_imap_find_account_bayes(t_imap, t_list5); EXPECT_EQ(nullptr, account); + + // only imap entries with exact token matching should be considered + root->set_path({std::string{IMAP_FRAME_BAYES} + "/" + pepper + waldo + "/" + acct2_guid}, new KvpValue{*value}); + account = gnc_account_imap_find_account_bayes(t_imap, t_list3); + EXPECT_EQ(t_expense_account1, account); + root->set_path({std::string{IMAP_FRAME_BAYES} + "/" + pepper + "/" + waldo + "/" + acct2_guid}, new KvpValue{*value}); + account = gnc_account_imap_find_account_bayes(t_imap, t_list3); + EXPECT_EQ(t_expense_account1, account); } TEST_F(ImapBayesTest, AddAccountBayes) From da60560ac4ad1994a79185eaaafaaa0363a21076 Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 27 Jan 2020 22:43:09 +0100 Subject: [PATCH 2/8] Change behaviour of KvpFrame::for_each_slot_prefix() Provided function func is now called with key suffix only instead of full key (prefix is omitted). This is neccessary for fixing function build_token_info() in the next commit. --- libgnucash/engine/Account.cpp | 24 +++++++++++------------- libgnucash/engine/kvp-frame.hpp | 4 ++-- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/libgnucash/engine/Account.cpp b/libgnucash/engine/Account.cpp index 194976dbd6..fee1187bc8 100644 --- a/libgnucash/engine/Account.cpp +++ b/libgnucash/engine/Account.cpp @@ -5279,11 +5279,11 @@ struct AccountInfo }; static void -build_token_info(char const * key, KvpValue * value, TokenAccountsInfo & tokenInfo) +build_token_info(char const * suffix, KvpValue * value, TokenAccountsInfo & tokenInfo) { tokenInfo.total_count += value->get(); AccountTokenCount this_account; - std::string account_guid {key}; + std::string account_guid {suffix}; /*By convention, the key ends with the account GUID.*/ this_account.account_guid = account_guid.substr(account_guid.size() - GUID_ENCODING_LENGTH); this_account.token_count = value->get(); @@ -5665,38 +5665,36 @@ build_non_bayes (const char *key, const GValue *value, gpointer user_data) g_free (guid_string); } -static std::tuple +static std::tuple parse_bayes_imap_info (std::string const & imap_bayes_entry) { - auto header_length = strlen (IMAP_FRAME_BAYES); - std::string header {imap_bayes_entry.substr (0, header_length)}; auto guid_start = imap_bayes_entry.size() - GUID_ENCODING_LENGTH; - std::string keyword {imap_bayes_entry.substr (header_length + 1, guid_start - header_length - 2)}; + std::string keyword {imap_bayes_entry.substr (1, guid_start - 2)}; std::string account_guid {imap_bayes_entry.substr (guid_start)}; - return std::tuple {header, keyword, account_guid}; + return std::tuple {keyword, account_guid}; } static void -build_bayes (const char *key, KvpValue * value, GncImapInfo & imapInfo) +build_bayes (const char *suffix, KvpValue * value, GncImapInfo & imapInfo) { - auto parsed_key = parse_bayes_imap_info (key); + auto parsed_key = parse_bayes_imap_info (suffix); GncGUID guid; try { - auto temp_guid = gnc::GUID::from_string (std::get <2> (parsed_key)); + auto temp_guid = gnc::GUID::from_string (std::get <1> (parsed_key)); guid = temp_guid; } catch (const gnc::guid_syntax_exception& err) { - PWARN("Invalid GUID string from %s", key); + PWARN("Invalid GUID string from %s%s", IMAP_FRAME_BAYES, suffix); } auto map_account = xaccAccountLookup (&guid, gnc_account_get_book (imapInfo.source_account)); auto imap_node = static_cast (g_malloc (sizeof (GncImapInfo))); auto count = value->get (); imap_node->source_account = imapInfo.source_account; imap_node->map_account = map_account; - imap_node->head = g_strdup (key); - imap_node->match_string = g_strdup (std::get <1> (parsed_key).c_str ()); + imap_node->head = g_strdup_printf ("%s%s", IMAP_FRAME_BAYES, suffix); + imap_node->match_string = g_strdup (std::get <0> (parsed_key).c_str ()); imap_node->category = g_strdup(" "); imap_node->count = g_strdup_printf ("%" G_GINT64_FORMAT, count); imapInfo.list = g_list_prepend (imapInfo.list, imap_node); diff --git a/libgnucash/engine/kvp-frame.hpp b/libgnucash/engine/kvp-frame.hpp index 23eb0c6153..1419e626e5 100644 --- a/libgnucash/engine/kvp-frame.hpp +++ b/libgnucash/engine/kvp-frame.hpp @@ -247,7 +247,7 @@ void KvpFrame::for_each_slot_prefix(std::string const & prefix, return; /* Testing for prefix matching */ if (std::mismatch(prefix.begin(), prefix.end(), temp_key.begin()).first == prefix.end()) - func (a.first, a.second); + func (&a.first[prefix.size()], a.second); } ); } @@ -264,7 +264,7 @@ void KvpFrame::for_each_slot_prefix(std::string const & prefix, return; /* Testing for prefix matching */ if (std::mismatch(prefix.begin(), prefix.end(), temp_key.begin()).first == prefix.end()) - func (a.first, a.second, data); + func (&a.first[prefix.size()], a.second, data); } ); } From a13184978a07e4c6735e5d5893076ffd8117c6a1 Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 27 Jan 2020 23:01:25 +0100 Subject: [PATCH 3/8] Fix calculation of token info to find exactly matching tokens only In get_first_pass_probabilities() function qof_instance_foreach_slot_prefix() is called with a prefix path including closing slash after token now. This avoids, that also entries with token as a substring are included in token info, where key only starts with token. Finally function build_token_info() checks, if the key suffix after the token consists only of the GUID. This avoids, that also entries with the same prefix and slashes are included in token info. --- libgnucash/engine/Account.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/libgnucash/engine/Account.cpp b/libgnucash/engine/Account.cpp index fee1187bc8..805daeea4e 100644 --- a/libgnucash/engine/Account.cpp +++ b/libgnucash/engine/Account.cpp @@ -5281,13 +5281,15 @@ struct AccountInfo static void build_token_info(char const * suffix, KvpValue * value, TokenAccountsInfo & tokenInfo) { - tokenInfo.total_count += value->get(); - AccountTokenCount this_account; - std::string account_guid {suffix}; - /*By convention, the key ends with the account GUID.*/ - this_account.account_guid = account_guid.substr(account_guid.size() - GUID_ENCODING_LENGTH); - this_account.token_count = value->get(); - tokenInfo.accounts.push_back(this_account); + if (strlen(suffix) == GUID_ENCODING_LENGTH) + { + tokenInfo.total_count += value->get(); + AccountTokenCount this_account; + /*By convention, the key ends with the account GUID.*/ + this_account.account_guid = std::string{suffix}; + this_account.token_count = value->get(); + tokenInfo.accounts.push_back(this_account); + } } /** We scale the probability values by probability_factor. @@ -5332,7 +5334,7 @@ get_first_pass_probabilities(GncImportMatchMap * imap, GList * tokens) for (auto current_token = tokens; current_token; current_token = current_token->next) { TokenAccountsInfo tokenInfo{}; - auto path = std::string{IMAP_FRAME_BAYES "/"} + static_cast (current_token->data); + auto path = std::string{IMAP_FRAME_BAYES "/"} + static_cast (current_token->data) + "/"; qof_instance_foreach_slot_prefix (QOF_INSTANCE (imap->acc), path, &build_token_info, tokenInfo); for (auto const & current_account_token : tokenInfo.accounts) { From 322f2d99de89849b4d193afcb779520aabcafa4f Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Sun, 2 Feb 2020 17:55:30 +0100 Subject: [PATCH 4/8] Rework key prefix matching Use C string comparison instead of C++ function std::mismatch to increase performance. --- libgnucash/engine/kvp-frame.hpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/libgnucash/engine/kvp-frame.hpp b/libgnucash/engine/kvp-frame.hpp index 1419e626e5..c84cf18a4d 100644 --- a/libgnucash/engine/kvp-frame.hpp +++ b/libgnucash/engine/kvp-frame.hpp @@ -242,11 +242,8 @@ void KvpFrame::for_each_slot_prefix(std::string const & prefix, std::for_each (m_valuemap.begin(), m_valuemap.end(), [&prefix,&func](const KvpFrameImpl::map_type::value_type & a) { - std::string temp_key {a.first}; - if (temp_key.size() < prefix.size()) - return; /* Testing for prefix matching */ - if (std::mismatch(prefix.begin(), prefix.end(), temp_key.begin()).first == prefix.end()) + if (strncmp(a.first, prefix.c_str(), prefix.size()) == 0) func (&a.first[prefix.size()], a.second); } ); @@ -259,11 +256,8 @@ void KvpFrame::for_each_slot_prefix(std::string const & prefix, std::for_each (m_valuemap.begin(), m_valuemap.end(), [&prefix,&func,&data](const KvpFrameImpl::map_type::value_type & a) { - std::string temp_key {a.first}; - if (temp_key.size() < prefix.size()) - return; /* Testing for prefix matching */ - if (std::mismatch(prefix.begin(), prefix.end(), temp_key.begin()).first == prefix.end()) + if (strncmp(a.first, prefix.c_str(), prefix.size()) == 0) func (&a.first[prefix.size()], a.second, data); } ); From d07d4b962fc89ef332626b984b8fb3149033113d Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Sun, 2 Feb 2020 22:38:44 +0100 Subject: [PATCH 5/8] Fix tokenize_string() This fix prevents empty strings as tokens and removes duplicated tokens. Function tokenize_string() is used for bayesian import matching, where empty token strings or duplicated tokens lead to wrong results within probability calculation for matching of a transaction to an account. Empty token strings can occur if (see function g_strsplit()) * two or more spaces occur directly after another * the string begins or ends with spaces --- gnucash/import-export/import-backend.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/gnucash/import-export/import-backend.c b/gnucash/import-export/import-backend.c index ebc37c3a7b..888b913b2f 100644 --- a/gnucash/import-export/import-backend.c +++ b/gnucash/import-export/import-backend.c @@ -387,8 +387,24 @@ tokenize_string(GList* existing_tokens, const char *string) /* add each token to the token GList */ while (stringpos && *stringpos) { - /* prepend the char* to the token GList */ - existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos)); + if (strlen(*stringpos) > 0) + { + /* check for duplicated tokens */ + gboolean duplicated = FALSE; + for (GList* token = existing_tokens; token != NULL; token = token->next) + { + if (g_strcmp0(token->data, *stringpos) == 0) + { + duplicated = TRUE; + break; + } + } + if (duplicated == FALSE) + { + /* prepend the char* to the token GList */ + existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos)); + } + } /* then move to the next string */ stringpos++; From 41863be9c7251098764c5a36e25f401fe536162d Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 17 Feb 2020 22:56:15 +0100 Subject: [PATCH 6/8] Avoid copying local instance of AccountTokenCount --- libgnucash/engine/Account.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libgnucash/engine/Account.cpp b/libgnucash/engine/Account.cpp index 805daeea4e..83d49a7480 100644 --- a/libgnucash/engine/Account.cpp +++ b/libgnucash/engine/Account.cpp @@ -5284,11 +5284,8 @@ build_token_info(char const * suffix, KvpValue * value, TokenAccountsInfo & toke if (strlen(suffix) == GUID_ENCODING_LENGTH) { tokenInfo.total_count += value->get(); - AccountTokenCount this_account; /*By convention, the key ends with the account GUID.*/ - this_account.account_guid = std::string{suffix}; - this_account.token_count = value->get(); - tokenInfo.accounts.push_back(this_account); + tokenInfo.accounts.emplace_back(AccountTokenCount{std::string{suffix}, value->get()}); } } From 01c76e23913ea870fb331fc6c747d67f9bdd7f85 Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 17 Feb 2020 23:29:28 +0100 Subject: [PATCH 7/8] Remove unused template of function for_each_slot_prefix() for_each_slot_prefix() is not used anywhere with two arguments --- libgnucash/engine/kvp-frame.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/libgnucash/engine/kvp-frame.hpp b/libgnucash/engine/kvp-frame.hpp index c84cf18a4d..253eec3187 100644 --- a/libgnucash/engine/kvp-frame.hpp +++ b/libgnucash/engine/kvp-frame.hpp @@ -235,20 +235,6 @@ struct KvpFrameImpl KvpValue * set_impl (std::string const &, KvpValue *) noexcept; }; -template -void KvpFrame::for_each_slot_prefix(std::string const & prefix, - func_type const & func) const noexcept -{ - std::for_each (m_valuemap.begin(), m_valuemap.end(), - [&prefix,&func](const KvpFrameImpl::map_type::value_type & a) - { - /* Testing for prefix matching */ - if (strncmp(a.first, prefix.c_str(), prefix.size()) == 0) - func (&a.first[prefix.size()], a.second); - } - ); -} - template void KvpFrame::for_each_slot_prefix(std::string const & prefix, func_type const & func, data_type & data) const noexcept From 7509b542da67aeb42bff0b98a20ffc2989f6e146 Mon Sep 17 00:00:00 2001 From: Christian Gruber Date: Mon, 17 Feb 2020 23:31:12 +0100 Subject: [PATCH 8/8] Simplify function build_bayes() Inline function parse_bayes_imap_info() into build_bayes() and remove temp_guid. --- libgnucash/engine/Account.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/libgnucash/engine/Account.cpp b/libgnucash/engine/Account.cpp index 83d49a7480..ea976be491 100644 --- a/libgnucash/engine/Account.cpp +++ b/libgnucash/engine/Account.cpp @@ -5664,24 +5664,15 @@ build_non_bayes (const char *key, const GValue *value, gpointer user_data) g_free (guid_string); } -static std::tuple -parse_bayes_imap_info (std::string const & imap_bayes_entry) -{ - auto guid_start = imap_bayes_entry.size() - GUID_ENCODING_LENGTH; - std::string keyword {imap_bayes_entry.substr (1, guid_start - 2)}; - std::string account_guid {imap_bayes_entry.substr (guid_start)}; - return std::tuple {keyword, account_guid}; -} - static void build_bayes (const char *suffix, KvpValue * value, GncImapInfo & imapInfo) { - auto parsed_key = parse_bayes_imap_info (suffix); + size_t guid_start = strlen(suffix) - GUID_ENCODING_LENGTH; + std::string account_guid {&suffix[guid_start]}; GncGUID guid; try { - auto temp_guid = gnc::GUID::from_string (std::get <1> (parsed_key)); - guid = temp_guid; + guid = gnc::GUID::from_string (account_guid); } catch (const gnc::guid_syntax_exception& err) { @@ -5693,7 +5684,7 @@ build_bayes (const char *suffix, KvpValue * value, GncImapInfo & imapInfo) imap_node->source_account = imapInfo.source_account; imap_node->map_account = map_account; imap_node->head = g_strdup_printf ("%s%s", IMAP_FRAME_BAYES, suffix); - imap_node->match_string = g_strdup (std::get <0> (parsed_key).c_str ()); + imap_node->match_string = g_strndup (&suffix[1], guid_start - 2); imap_node->category = g_strdup(" "); imap_node->count = g_strdup_printf ("%" G_GINT64_FORMAT, count); imapInfo.list = g_list_prepend (imapInfo.list, imap_node);