Fix tokenize_string()

This fix prevents empty strings from being added as tokens and drops
duplicated tokens. Function tokenize_string() is used for Bayesian import
matching, where empty or duplicated tokens lead to wrong results in the
probability calculation that matches a transaction to an account (a token
counted twice, for instance, contributes its probability weight twice).

Empty token strings can occur if (see function g_strsplit() and the sketch
after this list)
* two or more spaces occur directly after one another
* the string begins or ends with a space
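A minimal sketch of this g_strsplit() behaviour; the input string here is
hypothetical:

    #include <glib.h>
    #include <stdio.h>

    int main(void)
    {
        /* leading space, double space and trailing space */
        gchar **tokens = g_strsplit(" cash  rent ", " ", -1);

        /* prints: ''  'cash'  ''  'rent'  ''  -- three empty tokens */
        for (gchar **p = tokens; *p != NULL; p++)
            printf("'%s'  ", *p);
        printf("\n");

        g_strfreev(tokens);
        return 0;
    }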
Christian Gruber 2020-02-02 22:38:44 +01:00
parent 322f2d99de
commit d07d4b962f


@@ -387,8 +387,24 @@ tokenize_string(GList* existing_tokens, const char *string)
     /* add each token to the token GList */
     while (stringpos && *stringpos)
     {
-        /* prepend the char* to the token GList */
-        existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
+        if (strlen(*stringpos) > 0)
+        {
+            /* check for duplicated tokens */
+            gboolean duplicated = FALSE;
+            for (GList* token = existing_tokens; token != NULL; token = token->next)
+            {
+                if (g_strcmp0(token->data, *stringpos) == 0)
+                {
+                    duplicated = TRUE;
+                    break;
+                }
+            }
+            if (duplicated == FALSE)
+            {
+                /* prepend the char* to the token GList */
+                existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
+            }
+        }
         /* then move to the next string */
         stringpos++;
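For reference, a self-contained sketch of the patched loop: the split on a
single space and the function signature follow the hunk above, but the setup
and teardown around the loop are assumptions, and the manual duplicate scan
is replaced here by the equivalent g_list_find_custom() linear search:

    #include <glib.h>
    #include <string.h>

    static GList*
    tokenize_string(GList* existing_tokens, const char *string)
    {
        /* assumed setup: split the description string on spaces */
        char **tokenized = g_strsplit(string, " ", -1);
        char **stringpos = tokenized;

        /* add each token to the token GList */
        while (stringpos && *stringpos)
        {
            /* skip empty tokens and tokens already in the list */
            if (strlen(*stringpos) > 0 &&
                !g_list_find_custom(existing_tokens, *stringpos,
                                    (GCompareFunc)g_strcmp0))
            {
                /* prepend the char* to the token GList */
                existing_tokens = g_list_prepend(existing_tokens,
                                                 g_strdup(*stringpos));
            }
            /* then move to the next string */
            stringpos++;
        }

        g_strfreev(tokenized);
        return existing_tokens;
    }

The duplicate check scans the whole list for every token, i.e. O(n^2) in the
number of tokens; for the short transaction descriptions this function
tokenizes, that is a reasonable trade-off against the bookkeeping of a hash
table.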