* src/import-export/import-backend.c:

* src/import-export/import-match-map.c:
	* src/import-export/import-match-map.h:
	  Chris Morgan's Bayesian Matching code, to match transactions
	  based on Bayesian filtering of previously matched transactions.


git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@8044 57a11ea4-9604-0410-9ed3-97b8803252fd
This commit is contained in:
Derek Atkins 2003-03-08 19:48:50 +00:00
parent 6e143835cb
commit b2ccbf62cf
4 changed files with 552 additions and 24 deletions

View File

@ -1,3 +1,11 @@
2003-03-08 Derek Atkins <derek@ihtfp.com>
* src/import-export/import-backend.c:
* src/import-export/import-match-map.c:
* src/import-export/import-match-map.h:
Chris Morgan's Bayesian Matching code, to match transactions
based on Bayesian filtering of previously matched transactions.
2003-03-06 Christian Stimming <stimming@tuhh.de>
* src/import-export/hbci/dialog-hbcitrans.c: Include a latest

View File

@ -44,6 +44,9 @@
#include "gnc-ui-util.h"
#define IMPORT_PAGE "Online Banking & Importing" /* from app-utils/prefs.scm */
#define BAYES_OPTION "Use Bayesian Matching?"
/********************************************************************\
* Constants *
\********************************************************************/
@ -90,6 +93,9 @@ struct _transactioninfo
GNCImportAction action;
GNCImportAction previous_action;
/* A list of tokenized strings to use for bayesian matching purposes */
GList * match_tokens;
/* In case of a single destination account it is stored here. */
Account *dest_acc;
gboolean dest_acc_selected_manually;
@ -241,6 +247,15 @@ void gnc_import_TransInfo_delete (GNCImportTransInfo *info)
xaccTransDestroy(info->trans);
xaccTransCommitEdit(info->trans);
}
if (info->match_tokens)
{
GList *node;
for (node = info->match_tokens; node; node = node->next)
g_free (node->data);
g_list_free (info->match_tokens);
}
g_free(info);
}
}
@ -343,28 +358,128 @@ GdkPixmap* gen_probability_pixmap(gint score_original, GNCImportSettings *settin
* MatchMap- related functions (storing and retrieving)
*/
/* searches using the GNCImportTransInfo through all existing transactions */
/* if there is an exact match of the description and memo */
/* Split a string on single spaces and add every non-empty piece to a
 * token list.
 *
 * existing_tokens: the list to extend; may be NULL (an empty GList).
 * string:          the text to tokenize; NULL contributes no tokens.
 *
 * Returns the new head of the list.  Each token added is a g_strdup()ed
 * copy, so whoever ends up owning the list must g_free() every element.
 * Tokens are prepended, so they appear in reverse order of occurrence.
 */
static GList*
tokenize_string(GList* existing_tokens, const char *string)
{
  char **tokenized_strings; /* array of strings returned by g_strsplit() */
  char **stringpos;

  /* g_strsplit() must not be handed a NULL string; nothing to add */
  if (!string)
    return existing_tokens;

  tokenized_strings = g_strsplit(string, " ", 0);

  /* add each token to the token GList */
  for (stringpos = tokenized_strings; stringpos && *stringpos; stringpos++)
  {
    /* leading, trailing or repeated spaces make g_strsplit() produce
     * empty strings; an empty token says nothing about the transaction,
     * so do not record it
     */
    if (**stringpos == '\0')
      continue;

    /* prepend a private copy of the token to the token GList */
    existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
  }

  /* free up the strings that g_strsplit() created */
  g_strfreev(tokenized_strings);

  return existing_tokens;
}
/* Create and return the list of tokens for a given transaction info.
 *
 * The tokens are: the space-separated words of the transaction
 * description, the name of the day of the week of the transaction date,
 * and the space-separated words of the memo of every split.
 *
 * The list is cached in info->match_tokens and the cached list is
 * returned on later calls; the caller must not free it.
 *
 * Returns NULL if info is NULL.
 */
static GList*
TransactionGetTokens(GNCImportTransInfo *info)
{
  Transaction* transaction;
  GList* tokens = NULL; /* start off with an empty list */
  const char* text;
  time_t transtime;
  struct tm *tm_struct;
  char local_day_of_week[16];
  Split* split;
  int split_index;

  g_return_val_if_fail (info, NULL);
  if (info->match_tokens) return info->match_tokens;

  transaction = gnc_import_TransInfo_get_trans(info);
  g_assert(transaction);

  /* make tokens from the transaction description */
  text = xaccTransGetDescription(transaction);
  if (text)
    tokens = tokenize_string(tokens, text);

  /* the day of week the transaction occurred is a good indicator of
   * what account this transaction belongs in; get the date and convert
   * it to day of week as a token
   */
  transtime = xaccTransGetDate(transaction);
  tm_struct = gmtime(&transtime);
  if (tm_struct &&
      strftime(local_day_of_week, sizeof(local_day_of_week), "%A", tm_struct))
  {
    /* we cannot add a locally allocated string to this list, dup it so
     * it frees the same way the rest do
     */
    tokens = g_list_prepend(tokens, g_strdup(local_day_of_week));
  }
  else
  {
    /* on failure the buffer contents are unspecified, so no day-of-week
     * token is added rather than duplicating garbage
     */
    PERR("TransactionGetTokens: error, strftime failed\n");
  }

  /* make tokens from the memo of each split of this transaction */
  split_index = 0;
  while ((split = xaccTransGetSplit(transaction, split_index)))
  {
    text = xaccSplitGetMemo(split);
    if (text)
      tokens = tokenize_string(tokens, text);
    split_index++; /* next split */
  }

  /* remember the list of tokens for later.. */
  info->match_tokens = tokens;

  /* return the pointer to the GList */
  return tokens;
}
/* searches using the GNCImportTransInfo through all existing transactions
* if there is an exact match of the description and memo
*/
static Account *
matchmap_find_destination (GncImportMatchMap *matchmap,
GNCImportTransInfo *info)
matchmap_find_destination (GncImportMatchMap *matchmap, GNCImportTransInfo *info)
{
GncImportMatchMap *tmp_map;
Account *result;
g_assert (info);
GList* tokens;
gboolean useBayes;
g_assert (info);
tmp_map = ((matchmap != NULL) ? matchmap :
gnc_imap_create_from_account
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (info))));
result = gnc_imap_find_account
(tmp_map, GNCIMPORT_DESC,
xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE);
if(useBayes)
{
/* get the tokens for this transaction* */
tokens = TransactionGetTokens(info);
/* try to find the destination account for this transaction from its tokens */
result = gnc_imap_find_account_bayes(tmp_map, tokens);
} else {
/* old system of transaction to account matching */
result = gnc_imap_find_account
(tmp_map, GNCIMPORT_DESC,
xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
}
/* Disable matching by memo, until bayesian filtering is implemented.
It's currently unlikely to help, and has adverse effects, causing false positives,
since very often the type of the transaction is stored there.
* It's currently unlikely to help, and has adverse effects,
* causing false positives, since very often the type of the
* transaction is stored there.
if (result == NULL)
result = gnc_imap_find_account
@ -390,6 +505,9 @@ matchmap_store_destination (GncImportMatchMap *matchmap,
GncImportMatchMap *tmp_matchmap = NULL;
Account *dest;
const char *descr, *memo;
GList *tokens;
gboolean useBayes;
g_assert (trans_info);
/* This will store the destination account of the selected match if
@ -410,20 +528,33 @@ matchmap_store_destination (GncImportMatchMap *matchmap,
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (trans_info))));
descr = xaccTransGetDescription
(gnc_import_TransInfo_get_trans (trans_info));
if (descr && (strlen (descr) > 0))
gnc_imap_add_account (tmp_matchmap,
/* see what matching system we are currently using */
useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE);
if(useBayes)
{
/* tokenize this transaction */
tokens = TransactionGetTokens(trans_info);
/* add the tokens to the imap with the given destination account */
gnc_imap_add_account_bayes(tmp_matchmap, tokens, dest);
} else {
/* old matching system */
descr = xaccTransGetDescription
(gnc_import_TransInfo_get_trans (trans_info));
if (descr && (strlen (descr) > 0))
gnc_imap_add_account (tmp_matchmap,
GNCIMPORT_DESC,
descr,
dest);
memo = xaccSplitGetMemo
(gnc_import_TransInfo_get_fsplit (trans_info));
if (memo && (strlen (memo) > 0))
gnc_imap_add_account (tmp_matchmap,
memo = xaccSplitGetMemo
(gnc_import_TransInfo_get_fsplit (trans_info));
if (memo && (strlen (memo) > 0))
gnc_imap_add_account (tmp_matchmap,
GNCIMPORT_MEMO,
memo,
dest);
} /* if(useBayes) */
if (matchmap == NULL)
gnc_imap_destroy (tmp_matchmap);
@ -935,7 +1066,7 @@ gnc_import_TransInfo_refresh_destacc (GNCImportTransInfo *transaction_info,
/* if we haven't manually selected a destination account for this transaction */
if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE)
{
/* Try to find a previous selected destination account string match for the ADD action */
/* Try to find the destination account for this transaction based on prior ones */
new_destacc = matchmap_find_destination(matchmap, transaction_info);
gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE);
} else

View File

@ -25,11 +25,22 @@
An import mapper service that stores Account Maps for the
generic importer. This allows importers to map various
"strings" to Gnucash accounts in a generic manner.
@author Copyright (C) 2002 Derek Atkins <derek@ihtfp.com>
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
*/
#include <string.h>
#include <glib.h>
#include "import-match-map.h"
#include "kvp_frame.h"
#include "Group.h"
#include "gnc-ui-util.h"
#include "gnc-engine-util.h"
/********************************************************************\
* Constants *
\********************************************************************/
static short module = MOD_IMPORT;
struct _GncImportMatchMap {
kvp_frame * frame;
@ -37,7 +48,8 @@ struct _GncImportMatchMap {
GNCBook * book;
};
#define IMAP_FRAME "import-map"
#define IMAP_FRAME "import-map"
#define IMAP_FRAME_BAYES "import-map-bayes"
static GncImportMatchMap *
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book)
@ -99,6 +111,9 @@ void gnc_imap_clear (GncImportMatchMap *imap)
/* Clear the IMAP_FRAME kvp */
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
/* Clear the bayes kvp, IMAP_FRAME_BAYES */
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES);
/* XXX: mark the account (or book) as dirty! */
}
@ -143,4 +158,368 @@ void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
/* XXX Mark the account (or book) as dirty! */
}
/* Below here is the bayes transaction to account matching system */
/* One (account, count) pair read from the kvp frame of a single token. */
struct account_token_count
{
char* account_name; /* kvp slot key; set by buildTokenInfo(), not copied */
gint64 token_count; /* occurrences of a given token for this account_name */
};
/* Per-token summary filled in by buildTokenInfo().
 * total_count and the token_count for a given account let us calculate the
 * probability of a given account with any single token
 * (token_count / total_count).
 */
struct token_accounts_info
{
GList *accounts; /* GList of pointers to struct account_token_count */
gint64 total_count; /* sum of token_count over every entry in accounts */
};
/* kvp_frame_for_each_slot() callback that records one account slot of a
 * token frame.
 *
 * key:   the slot name, used as the account name (the pointer is stored,
 *        the string is not copied)
 * value: the slot value, read as a gint64 occurrence count
 * data:  the struct token_accounts_info being filled in
 *
 * NOTE: keys are assumed to be unique, so no search for an existing
 * entry is done; every call adds a fresh entry.
 */
static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
{
  struct token_accounts_info *summary = data;
  gint64 occurrences = kvp_value_get_gint64(value);
  struct account_token_count *entry = g_new0(struct account_token_count, 1);

  /* remember which account this count belongs to */
  entry->account_name = (char*)key;
  entry->token_count = occurrences;

  /* keep the running total across all accounts for this token */
  summary->total_count += occurrences;

  /* the entry is released by whoever walks summary->accounts afterwards */
  summary->accounts = g_list_prepend(summary->accounts, entry);
}
/* Intermediate values used to calculate the bayes probability of a given
 * account, where p(AB) = (a*b)/[a*b + (1-a)(1-b)]:
 * product is the running (a*b*...), and product_difference is the running
 * (1-a)*(1-b)*..., over every token seen for the account so far.
 */
struct account_probability
{
double product; /* product of the per-token probabilities */
double product_difference; /* product of (1 - per-token probability) */
};
/* convert a hash table of account names and (struct account_probability*)
* into a hash table of 100000x the percentage match value, ie. 10% would be
* 0.10 * 100000 = 10000
*/
#define PROBABILITY_FACTOR 100000
/* g_hash_table_foreach() callback that turns one running probability
 * into a final, scaled integer probability.
 *
 * key:   the account name
 * value: the struct account_probability accumulated for that account
 * data:  the GHashTable that receives (account name -> scaled probability)
 *
 * The integer is stored directly in the pointer slot of the hash table
 * with GINT_TO_POINTER(); read it back with GPOINTER_TO_INT().
 */
static void buildProbabilities(gpointer key, gpointer value, gpointer data)
{
  GHashTable *final_probabilities = (GHashTable*)data;
  struct account_probability *account_p = (struct account_probability*)value;

  /* P(AB) = A*B / [A*B + (1-A)*(1-B)]
   * NOTE: so we only keep track of a running product(A*B*C...)
   * and product difference ((1-A)(1-B)...)
   */
  double denominator = account_p->product + account_p->product_difference;
  gint32 probability = 0;

  /* a zero (or NaN) denominator would make the division meaningless and
   * the conversion to an integer undefined; report "no match" instead
   */
  if (denominator > 0.0)
  {
    probability = (gint32)((account_p->product / denominator)
                           * PROBABILITY_FACTOR);
  }

  PINFO("P('%s') = '%d'\n", (char*)key, probability);

  g_hash_table_insert(final_probabilities, key, GINT_TO_POINTER(probability));
}
/* g_hash_table_foreach() callback that releases one value of the same
 * type that gnc_imap_find_account_bayes() stores in its running
 * probabilities table (a struct account_probability).
 *
 * Only the value is freed; key and data are not used.
 */
static void freeProbabilities(gpointer key, gpointer value, gpointer data)
{
  (void)key;
  (void)data;

  g_free(value);
}
/* Holds an account name and its corresponding integer probability.
 * The integer probability is the fractional probability multiplied by
 * PROBABILITY_FACTOR.
 */
struct account_info
{
char* account_name; /* set by highestProbability(); not a copy */
gint32 probability; /* PROBABILITY_FACTOR times the probability */
};
/* Find the highest probability and the corresponding account name and
 * store them in data, a (struct account_info*).
 *
 * NOTE: this is a g_hash_table_foreach() function for a hash table of
 * entries where key is a pointer to the account name and value is a
 * gint32 packed into the pointer, 100000x the probability for this
 * account.
 *
 * The comparison is strict, so an entry only replaces the stored one if
 * its probability is larger; the caller must initialise *data before the
 * traversal.
 */
static void highestProbability(gpointer key, gpointer value, gpointer data)
{
  struct account_info *account_i = (struct account_info*)data;

  /* unpack the integer with the GLib macro rather than a raw pointer
   * cast, which is not portable where pointers are wider than gint32
   */
  gint32 probability = GPOINTER_TO_INT(value);

  /* if the current probability is greater than the stored, store the current */
  if(probability > account_i->probability)
  {
    /* Save the new highest probability and the associated account name */
    account_i->probability = probability;
    account_i->account_name = key;
  }
}
#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
/* Look up an Account in the map using the bayes token counts.
 *
 * imap:   the match map to search; NULL yields NULL
 * tokens: GList of char* tokens describing the transaction
 *
 * For every token that has a slot under IMAP_FRAME_BAYES, each account
 * listed for that token contributes (token_count / total_count) to that
 * account's running probability.  The account with the highest combined
 * probability is returned if it reaches the threshold, otherwise NULL.
 */
Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
{
  struct token_accounts_info tokenInfo; /* holds the accounts and total
                                         * token count for a single token */
  GList *current_token;                 /* pointer to the current token from
                                         * the input GList *tokens */
  GList *current_account_token;         /* pointer to the struct
                                         * account_token_count */
  struct account_token_count *account_c; /* an account name and the number
                                          * of times a token has appeared
                                          * for the account */
  struct account_probability *account_p; /* intermediate storage of values
                                          * to compute the bayes probability
                                          * of an account */
  GHashTable *running_probabilities;
  GHashTable *final_probabilities;
  struct account_info account_i;
  kvp_value* value;
  kvp_frame* token_frame;
  double token_share; /* token_count / total_count for one account */

  ENTER(" ");

  /* check to see if the imap is NULL */
  if(!imap)
  {
    PINFO("imap is null, returning null");
    LEAVE(" ");
    return NULL;
  }

  /* create the tables only once we know we will use them, so the early
   * return above cannot leak them
   */
  running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
  final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);

  /* find the probability for each account that contains any of the tokens
   * in the input tokens list
   */
  for(current_token = tokens; current_token; current_token = current_token->next)
  {
    /* zero out the token_accounts_info structure */
    memset(&tokenInfo, 0, sizeof(struct token_accounts_info));

    PINFO("token: '%s'", (char*)current_token->data);

    /* find the slot for the given token off of the source account
     * for these tokens, search off of the IMAP_FRAME_BAYES path so
     * we aren't looking from the parent of the entire kvp tree
     */
    value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                    (char*)current_token->data, NULL);

    /* if value is null we should skip over this token */
    if(!value)
      continue;

    /* convert the slot(value) into the frame that contains the
     * list of accounts
     */
    token_frame = kvp_value_get_frame(value);

    /* token_frame should NEVER be null */
    if(!token_frame)
    {
      PERR("token '%s' has no accounts", (char*)current_token->data);
      continue; /* skip over this token */
    }

    /* collect every account recorded for this token, together with the
     * total count over all of them
     */
    kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);

    /* for each account we have just found, see if the account already exists
     * in the list of account probabilities, if not add it
     */
    for(current_account_token = tokenInfo.accounts; current_account_token;
        current_account_token = current_account_token->next)
    {
      /* get the account name and corresponding token count */
      account_c = (struct account_token_count*)current_account_token->data;

      PINFO("account_c->account_name('%s'), "
            "account_c->token_count('%ld')/total_count('%ld')",
            account_c->account_name, (long)account_c->token_count,
            (long)tokenInfo.total_count);

      /* probability of this account given only the current token */
      token_share = (double)account_c->token_count /
                    (double)tokenInfo.total_count;

      account_p = g_hash_table_lookup(running_probabilities,
                                      account_c->account_name);

      /* if the account exists in the list then continue
       * the running probabilities
       */
      if(account_p)
      {
        account_p->product = token_share * account_p->product;
        account_p->product_difference =
          ((double)1 - token_share) * account_p->product_difference;
        PINFO("product == %f, product_difference == %f",
              account_p->product, account_p->product_difference);
      }
      else
      {
        /* add a new entry */
        PINFO("adding a new entry for this account");
        account_p = g_new0(struct account_probability, 1);

        /* set the product and product difference values */
        account_p->product = token_share;
        account_p->product_difference = (double)1 - token_share;

        PINFO("product == %f, product_difference == %f",
              account_p->product, account_p->product_difference);

        /* add the account name and (struct account_probability*)
         * to the hash table; the key is the name pointer handed to
         * buildTokenInfo(), it is not copied
         */
        g_hash_table_insert(running_probabilities,
                            account_c->account_name, account_p);
      }
    } /* for all accounts in tokenInfo */

    /* free the data in tokenInfo */
    for(current_account_token = tokenInfo.accounts; current_account_token;
        current_account_token = current_account_token->next)
    {
      /* free up each struct account_token_count we allocated */
      g_free((struct account_token_count*)current_account_token->data);
    }

    g_list_free(tokenInfo.accounts); /* free the accounts GList */
  }

  /* build a hash table of account names and their final probabilities
   * from each entry in the running_probabilities hash table
   */
  g_hash_table_foreach(running_probabilities, buildProbabilities,
                       final_probabilities);

  /* find the highest probability and the corresponding account */
  memset(&account_i, 0, sizeof(struct account_info));
  g_hash_table_foreach(final_probabilities, highestProbability, &account_i);

  /* free each element of the running_probabilities hash */
  g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);

  /* free the hash tables */
  g_hash_table_destroy(running_probabilities);
  g_hash_table_destroy(final_probabilities);

  /* account_name stays NULL when no account was found; never hand a
   * NULL pointer to a %s conversion
   */
  PINFO("highest P('%s') = '%d'",
        account_i.account_name ? account_i.account_name : "(none)",
        account_i.probability);

  /* has this probability met our threshold? */
  if(account_i.probability >= threshold)
  {
    PINFO("found match");
    LEAVE(" ");
    return xaccGetAccountFromFullName(gnc_book_get_group(imap->book),
                                      account_i.account_name,
                                      gnc_get_account_separator());
  }

  PINFO("no match");
  LEAVE(" ");

  return NULL; /* we didn't meet our threshold, return NULL for an account */
}
/* Updates the imap for a given account using a list of tokens.
 *
 * imap:   the match map to update; NULL makes this a no-op
 * tokens: GList of char* tokens describing the transaction
 * acc:    the account the transaction was assigned to
 *
 * For every token the counter stored at
 * IMAP_FRAME_BAYES/<token>/<account full name> is incremented by one,
 * starting from zero if the slot does not exist yet.
 */
void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
{
  GList *current_token;
  kvp_value *value;
  gint64 token_count;
  char* account_fullname;
  kvp_value *new_value; /* the value that will be added back into the kvp tree */

  ENTER(" ");

  /* if imap is null return */
  if(!imap)
  {
    LEAVE(" ");
    return;
  }

  account_fullname = xaccAccountGetFullName(acc, gnc_get_account_separator());

  /* the kvp path below is NULL-terminated, so a NULL account name would
   * end the path early and write the count at the token level instead
   */
  if(!account_fullname)
  {
    PERR("account has no full name, nothing stored");
    LEAVE(" ");
    return;
  }

  PINFO("account name: '%s'\n", account_fullname);

  /* process each token in the list */
  for(current_token = g_list_first(tokens); current_token;
      current_token = current_token->next)
  {
    /* a NULL token would end the kvp path early for the same reason as
     * above, so skip it
     */
    if(!current_token->data)
      continue;

    /* start off with no tokens for this account */
    token_count = 0;

    PINFO("adding token '%s'\n", (char*)current_token->data);

    /* is this token/account_name already in the kvp tree? */
    value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                    (char*)current_token->data, account_fullname,
                                    NULL);

    /* if the token/account is already in the tree, read the current
     * value from the tree and use this for the basis of the value we
     * are putting back
     */
    if(value)
    {
      PINFO("found existing value of '%ld'\n",
            (long)kvp_value_get_gint64(value));

      /* convert this value back into an integer */
      token_count += kvp_value_get_gint64(value);
    }

    /* increment the token count */
    token_count++;

    /* create a new value */
    new_value = kvp_value_new_gint64(token_count);

    /* insert the value into the kvp tree at
     * /imap->frame/IMAP_FRAME_BAYES/token_string/account_name_string
     */
    kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES,
                            (char*)current_token->data, account_fullname, NULL);

    /* kvp_frame_set_slot_path() copied the value so we
     * need to delete this one ;-) */
    kvp_value_delete(new_value);
  }

  /* free up the account fullname string */
  g_free(account_fullname);

  LEAVE(" ");
}
/** @} */

View File

@ -24,7 +24,7 @@
An import mapper service that stores Account Maps for the
generic importer. This allows importers to map various
"strings" to Gnucash accounts in a generic manner.
@author Copyright (C) 2002 Derek Atkins <derek@ihtfp.com>
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
*/
#ifndef GNC_IMPORT_MATCH_MAP_H
#define GNC_IMPORT_MATCH_MAP_H
@ -48,8 +48,8 @@ void gnc_imap_destroy (GncImportMatchMap *imap);
void gnc_imap_clear (GncImportMatchMap *imap);
/** Look up an Account in the map */
Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
const char *key);
Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category,
const char *key);
/** Store an Account in the map. This mapping is immediatly stored in
the underlying kvp frame, regardless of whether the MatchMap is
@ -57,6 +57,16 @@ Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
const char *key, Account *acc);
/** Look up an Account in the map from a GList* of pointers to strings(tokens)
from the current transaction */
Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList* tokens);
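/** Record in the map that a transaction with the given tokens belongs to
an Account, by incrementing the per-token count kept for that Account.
The counts are immediately stored in the underlying kvp frame, regardless
of whether the MatchMap is destroyed later or not. */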
void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
Account *acc);
/** @name Some well-known categories