Fix tokenize_string()

This fix prevents empty strings from being added as tokens and drops
duplicated tokens. Function tokenize_string() is used for Bayesian import
matching, where empty or duplicated tokens lead to wrong results in the
probability calculation that matches a transaction to an account (a token
counted twice, for instance, contributes its probability weight twice).

Empty token strings can occur if (see function g_strsplit() and the sketch
after this list)
* two or more spaces occur directly after one another
* the string begins or ends with a space
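A minimal sketch of this g_strsplit() behaviour; the input string here is
hypothetical:

    #include <glib.h>
    #include <stdio.h>

    int main(void)
    {
        /* leading space, double space and trailing space */
        gchar **tokens = g_strsplit(" cash  rent ", " ", -1);

        /* prints: ''  'cash'  ''  'rent'  ''  -- three empty tokens */
        for (gchar **p = tokens; *p != NULL; p++)
            printf("'%s'  ", *p);
        printf("\n");

        g_strfreev(tokens);
        return 0;
    }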
Christian Gruber 2020-02-02 22:38:44 +01:00
parent 322f2d99de
commit d07d4b962f


@@ -387,8 +387,24 @@ tokenize_string(GList* existing_tokens, const char *string)
     /* add each token to the token GList */
     while (stringpos && *stringpos)
     {
-        /* prepend the char* to the token GList */
-        existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
+        if (strlen(*stringpos) > 0)
+        {
+            /* check for duplicated tokens */
+            gboolean duplicated = FALSE;
+            for (GList* token = existing_tokens; token != NULL; token = token->next)
+            {
+                if (g_strcmp0(token->data, *stringpos) == 0)
+                {
+                    duplicated = TRUE;
+                    break;
+                }
+            }
+            if (duplicated == FALSE)
+            {
+                /* prepend the char* to the token GList */
+                existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
+            }
+        }
         /* then move to the next string */
         stringpos++;
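For reference, a self-contained sketch of the patched loop: the split on a
single space and the function signature follow the hunk above, but the setup
and teardown around the loop are assumptions, and the manual duplicate scan
is replaced here by the equivalent g_list_find_custom() linear search:

    #include <glib.h>
    #include <string.h>

    static GList*
    tokenize_string(GList* existing_tokens, const char *string)
    {
        /* assumed setup: split the description string on spaces */
        char **tokenized = g_strsplit(string, " ", -1);
        char **stringpos = tokenized;

        /* add each token to the token GList */
        while (stringpos && *stringpos)
        {
            /* skip empty tokens and tokens already in the list */
            if (strlen(*stringpos) > 0 &&
                !g_list_find_custom(existing_tokens, *stringpos,
                                    (GCompareFunc)g_strcmp0))
            {
                /* prepend the char* to the token GList */
                existing_tokens = g_list_prepend(existing_tokens,
                                                 g_strdup(*stringpos));
            }
            /* then move to the next string */
            stringpos++;
        }

        g_strfreev(tokenized);
        return existing_tokens;
    }

The duplicate check scans the whole list for every token, i.e. O(n^2) in the
number of tokens; for the short transaction descriptions this function
tokenizes, that is a reasonable trade-off against the bookkeeping of a hash
table.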