mirror of
https://github.com/Gnucash/gnucash.git
synced 2025-02-25 18:55:30 -06:00
* src/import-export/import-backend.c:
* src/import-export/import-match-map.c: * src/import-export/import-match-map.h: Chris Morgan's Bayesian Matching code, to match transactions based on Bayesian filtering of previously matched transactions. git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@8044 57a11ea4-9604-0410-9ed3-97b8803252fd
This commit is contained in:
parent
6e143835cb
commit
b2ccbf62cf
@ -1,3 +1,11 @@
|
|||||||
|
2003-03-08 Derek Atkins <derek@ihtfp.com>
|
||||||
|
|
||||||
|
* src/import-export/import-backend.c:
|
||||||
|
* src/import-export/import-match-map.c:
|
||||||
|
* src/import-export/import-match-map.h:
|
||||||
|
Chris Morgan's Bayesian Matching code, to match transactions
|
||||||
|
based on Bayesian filtering of previously matched transactions.
|
||||||
|
|
||||||
2003-03-06 Christian Stimming <stimming@tuhh.de>
|
2003-03-06 Christian Stimming <stimming@tuhh.de>
|
||||||
|
|
||||||
* src/import-export/hbci/dialog-hbcitrans.c: Include a latest
|
* src/import-export/hbci/dialog-hbcitrans.c: Include a latest
|
||||||
|
@ -44,6 +44,9 @@
|
|||||||
|
|
||||||
#include "gnc-ui-util.h"
|
#include "gnc-ui-util.h"
|
||||||
|
|
||||||
|
#define IMPORT_PAGE "Online Banking & Importing" /* from app-utils/prefs.scm */
|
||||||
|
#define BAYES_OPTION "Use Bayesian Matching?"
|
||||||
|
|
||||||
/********************************************************************\
|
/********************************************************************\
|
||||||
* Constants *
|
* Constants *
|
||||||
\********************************************************************/
|
\********************************************************************/
|
||||||
@ -90,6 +93,9 @@ struct _transactioninfo
|
|||||||
GNCImportAction action;
|
GNCImportAction action;
|
||||||
GNCImportAction previous_action;
|
GNCImportAction previous_action;
|
||||||
|
|
||||||
|
/* A list of tokenized strings to use for bayesian matching purposes */
|
||||||
|
GList * match_tokens;
|
||||||
|
|
||||||
/* In case of a single destination account it is stored here. */
|
/* In case of a single destination account it is stored here. */
|
||||||
Account *dest_acc;
|
Account *dest_acc;
|
||||||
gboolean dest_acc_selected_manually;
|
gboolean dest_acc_selected_manually;
|
||||||
@ -241,6 +247,15 @@ void gnc_import_TransInfo_delete (GNCImportTransInfo *info)
|
|||||||
xaccTransDestroy(info->trans);
|
xaccTransDestroy(info->trans);
|
||||||
xaccTransCommitEdit(info->trans);
|
xaccTransCommitEdit(info->trans);
|
||||||
}
|
}
|
||||||
|
if (info->match_tokens)
|
||||||
|
{
|
||||||
|
GList *node;
|
||||||
|
|
||||||
|
for (node = info->match_tokens; node; node = node->next)
|
||||||
|
g_free (node->data);
|
||||||
|
|
||||||
|
g_list_free (info->match_tokens);
|
||||||
|
}
|
||||||
g_free(info);
|
g_free(info);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -343,28 +358,128 @@ GdkPixmap* gen_probability_pixmap(gint score_original, GNCImportSettings *settin
|
|||||||
* MatchMap- related functions (storing and retrieving)
|
* MatchMap- related functions (storing and retrieving)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* searches using the GNCImportTransInfo through all existing transactions */
|
/* Tokenize a string and append to an existing GList(or an empty GList)
|
||||||
/* if there is an exact match of the description and memo */
|
* the tokens
|
||||||
|
*/
|
||||||
|
/* Split 'string' on single spaces and prepend a newly allocated copy of
 * every non-empty piece to 'existing_tokens'.
 *
 * existing_tokens: list to extend; may be NULL (an empty list).
 * string:          text to tokenize; NULL or "" adds nothing.
 *
 * Returns the new head of the list.  Every string added here is owned
 * by the list and must be released with g_free().
 */
static GList*
tokenize_string(GList* existing_tokens, const char *string)
{
  char **tokenized_strings; /* array of strings returned by g_strsplit() */
  char **stringpos;

  /* nothing to tokenize; also avoids handing NULL to g_strsplit() */
  if (string == NULL || *string == '\0')
    return existing_tokens;

  tokenized_strings = g_strsplit(string, " ", 0);

  for (stringpos = tokenized_strings; stringpos && *stringpos; stringpos++)
  {
    /* consecutive, leading or trailing spaces produce empty pieces;
     * they carry no information and must not become tokens
     */
    if (**stringpos == '\0')
      continue;

    /* the list gets its own copy, the split array is freed below */
    existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos));
  }

  /* free up the strings that g_strsplit() created */
  g_strfreev(tokenized_strings);

  return existing_tokens;
}
|
||||||
|
|
||||||
|
/* create and return a list of tokens for a given transaction info. */
|
||||||
|
static GList*
|
||||||
|
TransactionGetTokens(GNCImportTransInfo *info)
|
||||||
|
{
|
||||||
|
Transaction* transaction;
|
||||||
|
GList* tokens;
|
||||||
|
const char* text;
|
||||||
|
time_t transtime;
|
||||||
|
struct tm *tm_struct;
|
||||||
|
char local_day_of_week[16];
|
||||||
|
Split* split;
|
||||||
|
int split_index;
|
||||||
|
|
||||||
|
g_return_val_if_fail (info, NULL);
|
||||||
|
if (info->match_tokens) return info->match_tokens;
|
||||||
|
|
||||||
|
transaction = gnc_import_TransInfo_get_trans(info);
|
||||||
|
g_assert(transaction);
|
||||||
|
|
||||||
|
tokens = 0; /* start off with an empty list */
|
||||||
|
|
||||||
|
/* make tokens from the transaction description */
|
||||||
|
text = xaccTransGetDescription(transaction);
|
||||||
|
tokens = tokenize_string(tokens, text);
|
||||||
|
|
||||||
|
/* the day of week the transaction occured is a good indicator of
|
||||||
|
* what account this transaction belongs in get the date and covert
|
||||||
|
* it to day of week as a token
|
||||||
|
*/
|
||||||
|
transtime = xaccTransGetDate(transaction);
|
||||||
|
tm_struct = gmtime(&transtime);
|
||||||
|
if(!strftime(local_day_of_week, sizeof(local_day_of_week), "%A", tm_struct))
|
||||||
|
{
|
||||||
|
PERR("TransactionGetTokens: error, strftime failed\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* we cannot add a locally allocated string to this array, dup it so
|
||||||
|
* it frees the same way the rest do
|
||||||
|
*/
|
||||||
|
tokens = g_list_prepend(tokens, g_strdup(local_day_of_week));
|
||||||
|
|
||||||
|
/* make tokens from the memo of each split of this transaction */
|
||||||
|
split_index = 0;
|
||||||
|
while((split = xaccTransGetSplit(transaction, split_index)))
|
||||||
|
{
|
||||||
|
text = xaccSplitGetMemo(split);
|
||||||
|
tokens = tokenize_string(tokens, text);
|
||||||
|
split_index++; /* next split */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* remember the list of tokens for later.. */
|
||||||
|
info->match_tokens = tokens;
|
||||||
|
|
||||||
|
/* return the pointer to the GList */
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* searches using the GNCImportTransInfo through all existing transactions
|
||||||
|
* if there is an exact match of the description and memo
|
||||||
|
*/
|
||||||
static Account *
|
static Account *
|
||||||
matchmap_find_destination (GncImportMatchMap *matchmap,
|
matchmap_find_destination (GncImportMatchMap *matchmap, GNCImportTransInfo *info)
|
||||||
GNCImportTransInfo *info)
|
|
||||||
{
|
{
|
||||||
GncImportMatchMap *tmp_map;
|
GncImportMatchMap *tmp_map;
|
||||||
Account *result;
|
Account *result;
|
||||||
g_assert (info);
|
GList* tokens;
|
||||||
|
gboolean useBayes;
|
||||||
|
|
||||||
|
g_assert (info);
|
||||||
tmp_map = ((matchmap != NULL) ? matchmap :
|
tmp_map = ((matchmap != NULL) ? matchmap :
|
||||||
gnc_imap_create_from_account
|
gnc_imap_create_from_account
|
||||||
(xaccSplitGetAccount
|
(xaccSplitGetAccount
|
||||||
(gnc_import_TransInfo_get_fsplit (info))));
|
(gnc_import_TransInfo_get_fsplit (info))));
|
||||||
|
|
||||||
result = gnc_imap_find_account
|
useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE);
|
||||||
(tmp_map, GNCIMPORT_DESC,
|
if(useBayes)
|
||||||
xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
|
{
|
||||||
|
/* get the tokens for this transaction* */
|
||||||
|
tokens = TransactionGetTokens(info);
|
||||||
|
|
||||||
|
/* try to find the destination account for this transaction from its tokens */
|
||||||
|
result = gnc_imap_find_account_bayes(tmp_map, tokens);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* old system of transaction to account matching */
|
||||||
|
result = gnc_imap_find_account
|
||||||
|
(tmp_map, GNCIMPORT_DESC,
|
||||||
|
xaccTransGetDescription (gnc_import_TransInfo_get_trans (info)));
|
||||||
|
}
|
||||||
|
|
||||||
/* Disable matching by memo, until bayesian filtering is implemented.
|
/* Disable matching by memo, until bayesian filtering is implemented.
|
||||||
It's currently unlikely to help, and has adverse effects, causing false positives,
|
* It's currently unlikely to help, and has adverse effects,
|
||||||
since very often the type of the transaction is stored there.
|
* causing false positives, since very often the type of the
|
||||||
|
* transaction is stored there.
|
||||||
|
|
||||||
if (result == NULL)
|
if (result == NULL)
|
||||||
result = gnc_imap_find_account
|
result = gnc_imap_find_account
|
||||||
@ -390,6 +505,9 @@ matchmap_store_destination (GncImportMatchMap *matchmap,
|
|||||||
GncImportMatchMap *tmp_matchmap = NULL;
|
GncImportMatchMap *tmp_matchmap = NULL;
|
||||||
Account *dest;
|
Account *dest;
|
||||||
const char *descr, *memo;
|
const char *descr, *memo;
|
||||||
|
GList *tokens;
|
||||||
|
gboolean useBayes;
|
||||||
|
|
||||||
g_assert (trans_info);
|
g_assert (trans_info);
|
||||||
|
|
||||||
/* This will store the destination account of the selected match if
|
/* This will store the destination account of the selected match if
|
||||||
@ -410,20 +528,33 @@ matchmap_store_destination (GncImportMatchMap *matchmap,
|
|||||||
(xaccSplitGetAccount
|
(xaccSplitGetAccount
|
||||||
(gnc_import_TransInfo_get_fsplit (trans_info))));
|
(gnc_import_TransInfo_get_fsplit (trans_info))));
|
||||||
|
|
||||||
descr = xaccTransGetDescription
|
/* see what matching system we are currently using */
|
||||||
(gnc_import_TransInfo_get_trans (trans_info));
|
useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE);
|
||||||
if (descr && (strlen (descr) > 0))
|
if(useBayes)
|
||||||
gnc_imap_add_account (tmp_matchmap,
|
{
|
||||||
|
/* tokenize this transaction */
|
||||||
|
tokens = TransactionGetTokens(trans_info);
|
||||||
|
|
||||||
|
/* add the tokens to the imap with the given destination account */
|
||||||
|
gnc_imap_add_account_bayes(tmp_matchmap, tokens, dest);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* old matching system */
|
||||||
|
descr = xaccTransGetDescription
|
||||||
|
(gnc_import_TransInfo_get_trans (trans_info));
|
||||||
|
if (descr && (strlen (descr) > 0))
|
||||||
|
gnc_imap_add_account (tmp_matchmap,
|
||||||
GNCIMPORT_DESC,
|
GNCIMPORT_DESC,
|
||||||
descr,
|
descr,
|
||||||
dest);
|
dest);
|
||||||
memo = xaccSplitGetMemo
|
memo = xaccSplitGetMemo
|
||||||
(gnc_import_TransInfo_get_fsplit (trans_info));
|
(gnc_import_TransInfo_get_fsplit (trans_info));
|
||||||
if (memo && (strlen (memo) > 0))
|
if (memo && (strlen (memo) > 0))
|
||||||
gnc_imap_add_account (tmp_matchmap,
|
gnc_imap_add_account (tmp_matchmap,
|
||||||
GNCIMPORT_MEMO,
|
GNCIMPORT_MEMO,
|
||||||
memo,
|
memo,
|
||||||
dest);
|
dest);
|
||||||
|
} /* if(useBayes) */
|
||||||
|
|
||||||
if (matchmap == NULL)
|
if (matchmap == NULL)
|
||||||
gnc_imap_destroy (tmp_matchmap);
|
gnc_imap_destroy (tmp_matchmap);
|
||||||
@ -935,7 +1066,7 @@ gnc_import_TransInfo_refresh_destacc (GNCImportTransInfo *transaction_info,
|
|||||||
/* if we haven't manually selected a destination account for this transaction */
|
/* if we haven't manually selected a destination account for this transaction */
|
||||||
if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE)
|
if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE)
|
||||||
{
|
{
|
||||||
/* Try to find a previous selected destination account string match for the ADD action */
|
/* Try to find the destination account for this transaction based on prior ones */
|
||||||
new_destacc = matchmap_find_destination(matchmap, transaction_info);
|
new_destacc = matchmap_find_destination(matchmap, transaction_info);
|
||||||
gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE);
|
gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE);
|
||||||
} else
|
} else
|
||||||
|
@ -25,11 +25,22 @@
|
|||||||
An import mapper service that stores Account Maps for the
|
An import mapper service that stores Account Maps for the
|
||||||
generic importer. This allows importers to map various
|
generic importer. This allows importers to map various
|
||||||
"strings" to Gnucash accounts in a generic manner.
|
"strings" to Gnucash accounts in a generic manner.
|
||||||
@author Copyright (C) 2002 Derek Atkins <derek@ihtfp.com>
|
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
|
||||||
*/
|
*/
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <glib.h>
|
||||||
#include "import-match-map.h"
|
#include "import-match-map.h"
|
||||||
#include "kvp_frame.h"
|
#include "kvp_frame.h"
|
||||||
|
#include "Group.h"
|
||||||
|
#include "gnc-ui-util.h"
|
||||||
|
#include "gnc-engine-util.h"
|
||||||
|
|
||||||
|
/********************************************************************\
|
||||||
|
* Constants *
|
||||||
|
\********************************************************************/
|
||||||
|
|
||||||
|
static short module = MOD_IMPORT;
|
||||||
|
|
||||||
|
|
||||||
struct _GncImportMatchMap {
|
struct _GncImportMatchMap {
|
||||||
kvp_frame * frame;
|
kvp_frame * frame;
|
||||||
@ -37,7 +48,8 @@ struct _GncImportMatchMap {
|
|||||||
GNCBook * book;
|
GNCBook * book;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define IMAP_FRAME "import-map"
|
#define IMAP_FRAME "import-map"
|
||||||
|
#define IMAP_FRAME_BAYES "import-map-bayes"
|
||||||
|
|
||||||
static GncImportMatchMap *
|
static GncImportMatchMap *
|
||||||
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book)
|
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book)
|
||||||
@ -99,6 +111,9 @@ void gnc_imap_clear (GncImportMatchMap *imap)
|
|||||||
/* Clear the IMAP_FRAME kvp */
|
/* Clear the IMAP_FRAME kvp */
|
||||||
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
|
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME);
|
||||||
|
|
||||||
|
/* Clear the bayes kvp, IMAP_FRAME_BAYES */
|
||||||
|
kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES);
|
||||||
|
|
||||||
/* XXX: mark the account (or book) as dirty! */
|
/* XXX: mark the account (or book) as dirty! */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,4 +158,368 @@ void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
|
|||||||
/* XXX Mark the account (or book) as dirty! */
|
/* XXX Mark the account (or book) as dirty! */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Below here is the bayes transaction to account matching system */
|
||||||
|
struct account_token_count
|
||||||
|
{
|
||||||
|
char* account_name;
|
||||||
|
gint64 token_count; /* occurances of a given token for this account_name */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* total_count and the token_count for a given account let us calculate the
|
||||||
|
* probability of a given account with any single token
|
||||||
|
*/
|
||||||
|
struct token_accounts_info
|
||||||
|
{
|
||||||
|
GList *accounts; /* array of struct account_token_count */
|
||||||
|
gint64 total_count;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* gpointer is a pointer to a struct token_accounts_info
|
||||||
|
* NOTE: can always assume that keys are unique, reduces code in this function
|
||||||
|
*/
|
||||||
|
static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
|
||||||
|
{
|
||||||
|
struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data;
|
||||||
|
struct account_token_count* this_account;
|
||||||
|
|
||||||
|
// PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key,
|
||||||
|
// (long)kvp_value_get_gint64(value));
|
||||||
|
|
||||||
|
/* add the count to the total_count */
|
||||||
|
tokenInfo->total_count += kvp_value_get_gint64(value);
|
||||||
|
|
||||||
|
/* allocate a new structure for this account and it's token count */
|
||||||
|
this_account = (struct account_token_count*)
|
||||||
|
g_new0(struct account_token_count, 1);
|
||||||
|
|
||||||
|
/* fill in the account name and number of tokens found for this account name */
|
||||||
|
this_account->account_name = (char*)key;
|
||||||
|
this_account->token_count = kvp_value_get_gint64(value);
|
||||||
|
|
||||||
|
/* append onto the glist a pointer to the new account_token_count structure */
|
||||||
|
tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intermediate values used to calculate the bayes probability of a
 * given account, where
 *
 *   p(AB) = (a*b) / [a*b + (1-a)*(1-b)]
 *
 * 'product' is the running (a*b*...) and 'product_difference' is the
 * running (1-a)*(1-b)*...
 */
struct account_probability
{
  /* product of probabilities */
  double product;

  /* product of (1-probabilities) */
  double product_difference;
};
|
||||||
|
|
||||||
|
/* convert a hash table of account names and (struct account_probability*)
|
||||||
|
* into a hash table of 100000x the percentage match value, ie. 10% would be
|
||||||
|
* 0.10 * 100000 = 10000
|
||||||
|
*/
|
||||||
|
#define PROBABILITY_FACTOR 100000
|
||||||
|
static void buildProbabilities(gpointer key, gpointer value, gpointer data)
|
||||||
|
{
|
||||||
|
GHashTable *final_probabilities = (GHashTable*)data;
|
||||||
|
struct account_probability *account_p = (struct account_probability*)value;
|
||||||
|
|
||||||
|
/* P(AB) = A*B / [A*B + (1-A)*(1-B)]
|
||||||
|
* NOTE: so we only keep track of a running product(A*B*C...)
|
||||||
|
* and product difference ((1-A)(1-B)...)
|
||||||
|
*/
|
||||||
|
gint32 probability =
|
||||||
|
(account_p->product /
|
||||||
|
(account_p->product + account_p->product_difference))
|
||||||
|
* PROBABILITY_FACTOR;
|
||||||
|
|
||||||
|
PINFO("P('%s') = '%d'\n", (char*)key, probability);
|
||||||
|
|
||||||
|
g_hash_table_insert(final_probabilities, key, (gpointer)probability);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Frees an array of the same time that buildProperties built */
|
||||||
|
static void freeProbabilities(gpointer key, gpointer value, gpointer data)
|
||||||
|
{
|
||||||
|
/* free up the struct account_probability that was allocated
|
||||||
|
* in gnc_imap_find_account_bayes()
|
||||||
|
*/
|
||||||
|
g_free(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* holds an account name and its corresponding integer probability
|
||||||
|
* the integer probability is some factor of 10
|
||||||
|
*/
|
||||||
|
struct account_info
|
||||||
|
{
|
||||||
|
char* account_name;
|
||||||
|
gint32 probability;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Find the highest probability and the corresponding account name
|
||||||
|
* store in data, a (struct account_info*)
|
||||||
|
* NOTE: this is a g_hash_table_foreach() function for a hash table of entries
|
||||||
|
* key is a pointer to the account name, value is a gint32, 100000x
|
||||||
|
* the probability for this account
|
||||||
|
*/
|
||||||
|
static void highestProbability(gpointer key, gpointer value, gpointer data)
|
||||||
|
{
|
||||||
|
struct account_info *account_i = (struct account_info*)data;
|
||||||
|
|
||||||
|
/* if the current probability is greater than the stored, store the current */
|
||||||
|
if((gint32)value > account_i->probability)
|
||||||
|
{
|
||||||
|
/* Save the new highest probability and the assoaciated account name */
|
||||||
|
account_i->probability = (gint32)value;
|
||||||
|
account_i->account_name = key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
|
||||||
|
|
||||||
|
/* Look up an Account in the map */
|
||||||
|
Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
|
||||||
|
{
|
||||||
|
struct token_accounts_info tokenInfo; /* holds the accounts and total
|
||||||
|
* token count for a single token */
|
||||||
|
GList *current_token; /* pointer to the current token from the
|
||||||
|
* input GList *tokens */
|
||||||
|
GList *current_account_token; /* pointer to the struct
|
||||||
|
* account_token_count */
|
||||||
|
struct account_token_count *account_c; /* an account name and the number
|
||||||
|
* of times a token has appeared
|
||||||
|
* for the account */
|
||||||
|
struct account_probability *account_p; /* intermediate storage of values
|
||||||
|
* to compute the bayes probability
|
||||||
|
* of an account */
|
||||||
|
GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
|
||||||
|
GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
|
||||||
|
struct account_info account_i;
|
||||||
|
kvp_value* value;
|
||||||
|
kvp_frame* token_frame;
|
||||||
|
|
||||||
|
ENTER(" ");
|
||||||
|
|
||||||
|
/* check to see if the imap is NULL */
|
||||||
|
if(!imap)
|
||||||
|
{
|
||||||
|
PINFO("imap is null, returning null");
|
||||||
|
LEAVE(" ");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* find the probability for each account that contains any of the tokens
|
||||||
|
* in the input tokens list
|
||||||
|
*/
|
||||||
|
for(current_token = tokens; current_token; current_token = current_token->next)
|
||||||
|
{
|
||||||
|
/* zero out the token_accounts_info structure */
|
||||||
|
memset(&tokenInfo, 0, sizeof(struct token_accounts_info));
|
||||||
|
|
||||||
|
PINFO("token: '%s'", (char*)current_token->data);
|
||||||
|
|
||||||
|
/* find the slot for the given token off of the source account
|
||||||
|
* for these tokens, search off of the IMAP_FRAME_BAYES path so
|
||||||
|
* we aren't looking from the parent of the entire kvp tree
|
||||||
|
*/
|
||||||
|
value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
|
||||||
|
(char*)current_token->data, NULL);
|
||||||
|
|
||||||
|
/* if value is null we should skip over this token */
|
||||||
|
if(!value)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* convert the slot(value) into a the frame that contains the
|
||||||
|
* list of accounts
|
||||||
|
*/
|
||||||
|
token_frame = kvp_value_get_frame(value);
|
||||||
|
|
||||||
|
/* token_frame should NEVER be null */
|
||||||
|
if(!token_frame)
|
||||||
|
{
|
||||||
|
PERR("token '%s' has no accounts", (char*)current_token->data);
|
||||||
|
continue; /* skip over this token */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* process the accounts for this token, adding the account if it
|
||||||
|
* doesn't already exist or adding to the existing accounts token
|
||||||
|
* count if it does
|
||||||
|
*/
|
||||||
|
kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);
|
||||||
|
|
||||||
|
/* for each account we have just found, see if the account already exists
|
||||||
|
* in the list of account probabilities, if not add it
|
||||||
|
*/
|
||||||
|
for(current_account_token = tokenInfo.accounts; current_account_token;
|
||||||
|
current_account_token = current_account_token->next)
|
||||||
|
{
|
||||||
|
/* get the account name and corresponding token count */
|
||||||
|
account_c = (struct account_token_count*)current_account_token->data;
|
||||||
|
|
||||||
|
PINFO("account_c->account_name('%s'), "
|
||||||
|
"account_c->token_count('%ld')/total_count('%ld')",
|
||||||
|
account_c->account_name, (long)account_c->token_count,
|
||||||
|
(long)tokenInfo.total_count);
|
||||||
|
|
||||||
|
account_p = g_hash_table_lookup(running_probabilities,
|
||||||
|
account_c->account_name);
|
||||||
|
|
||||||
|
/* if the account exists in the list then continue
|
||||||
|
* the running probablities
|
||||||
|
*/
|
||||||
|
if(account_p)
|
||||||
|
{
|
||||||
|
account_p->product =
|
||||||
|
((double)account_c->token_count / (double)tokenInfo.total_count)
|
||||||
|
* account_p->product;
|
||||||
|
account_p->product_difference =
|
||||||
|
((double)1 - ((double)account_c->token_count /
|
||||||
|
(double)tokenInfo.total_count))
|
||||||
|
* account_p->product_difference;
|
||||||
|
PINFO("product == %f, product_difference == %f",
|
||||||
|
account_p->product, account_p->product_difference);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* add a new entry */
|
||||||
|
PINFO("adding a new entry for this account");
|
||||||
|
account_p = (struct account_probability*)
|
||||||
|
g_new0(struct account_probability, 1);
|
||||||
|
|
||||||
|
/* set the product and product difference values */
|
||||||
|
account_p->product = ((double)account_c->token_count /
|
||||||
|
(double)tokenInfo.total_count);
|
||||||
|
account_p->product_difference =
|
||||||
|
(double)1 - ((double)account_c->token_count /
|
||||||
|
(double)tokenInfo.total_count);
|
||||||
|
|
||||||
|
PINFO("product == %f, product_difference == %f",
|
||||||
|
account_p->product, account_p->product_difference);
|
||||||
|
|
||||||
|
/* add the account name and (struct account_probability*)
|
||||||
|
* to the hash table */
|
||||||
|
g_hash_table_insert(running_probabilities,
|
||||||
|
account_c->account_name, account_p);
|
||||||
|
}
|
||||||
|
} /* for all accounts in tokenInfo */
|
||||||
|
|
||||||
|
/* free the data in tokenInfo */
|
||||||
|
for(current_account_token = tokenInfo.accounts; current_account_token;
|
||||||
|
current_account_token = current_account_token->next)
|
||||||
|
{
|
||||||
|
/* free up each struct account_token_count we allocated */
|
||||||
|
g_free((struct account_token_count*)current_account_token->data);
|
||||||
|
}
|
||||||
|
|
||||||
|
g_list_free(tokenInfo.accounts); /* free the accounts GList */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* build a hash table of account names and their final probabilities
|
||||||
|
* from each entry in the running_probabilties hash table
|
||||||
|
*/
|
||||||
|
g_hash_table_foreach(running_probabilities, buildProbabilities,
|
||||||
|
final_probabilities);
|
||||||
|
|
||||||
|
/* find the highest probabilty and the corresponding account */
|
||||||
|
memset(&account_i, 0, sizeof(struct account_info));
|
||||||
|
g_hash_table_foreach(final_probabilities, highestProbability, &account_i);
|
||||||
|
|
||||||
|
/* free each element of the running_probabilities hash */
|
||||||
|
g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);
|
||||||
|
|
||||||
|
/* free the hash tables */
|
||||||
|
g_hash_table_destroy(running_probabilities);
|
||||||
|
g_hash_table_destroy(final_probabilities);
|
||||||
|
|
||||||
|
PINFO("highest P('%s') = '%d'", account_i.account_name, account_i.probability);
|
||||||
|
|
||||||
|
/* has this probability met our threshold? */
|
||||||
|
if(account_i.probability >= threshold)
|
||||||
|
{
|
||||||
|
PINFO("found match");
|
||||||
|
LEAVE(" ");
|
||||||
|
return xaccGetAccountFromFullName(gnc_book_get_group(imap->book),
|
||||||
|
account_i.account_name,
|
||||||
|
gnc_get_account_separator());
|
||||||
|
}
|
||||||
|
|
||||||
|
PINFO("no match");
|
||||||
|
LEAVE(" ");
|
||||||
|
|
||||||
|
return NULL; /* we didn't meet our threshold, return NULL for an account */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Updates the imap for a given account using a list of tokens */
|
||||||
|
void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
|
||||||
|
{
|
||||||
|
GList *current_token;
|
||||||
|
kvp_value *value;
|
||||||
|
gint64 token_count;
|
||||||
|
char* account_fullname;
|
||||||
|
kvp_value *new_value; /* the value that will be added back into the kvp tree */
|
||||||
|
|
||||||
|
ENTER(" ");
|
||||||
|
|
||||||
|
/* if imap is null return */
|
||||||
|
if(!imap)
|
||||||
|
{
|
||||||
|
LEAVE(" ");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
account_fullname = xaccAccountGetFullName(acc, gnc_get_account_separator());
|
||||||
|
|
||||||
|
PINFO("account name: '%s'\n", account_fullname);
|
||||||
|
|
||||||
|
/* process each token in the list */
|
||||||
|
for(current_token = g_list_first(tokens); current_token;
|
||||||
|
current_token = current_token->next)
|
||||||
|
{
|
||||||
|
/* start off with no tokens for this account */
|
||||||
|
token_count = 0;
|
||||||
|
|
||||||
|
PINFO("adding token '%s'\n", (char*)current_token->data);
|
||||||
|
|
||||||
|
/* is this token/account_name already in the kvp tree? */
|
||||||
|
value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
|
||||||
|
(char*)current_token->data, account_fullname,
|
||||||
|
NULL);
|
||||||
|
|
||||||
|
/* if the token/account is already in the tree, read the current
|
||||||
|
* value from the tree and use this for the basis of the value we
|
||||||
|
* are putting back
|
||||||
|
*/
|
||||||
|
if(value)
|
||||||
|
{
|
||||||
|
PINFO("found existing value of '%ld'\n",
|
||||||
|
(long)kvp_value_get_gint64(value));
|
||||||
|
|
||||||
|
/* convert this value back into an integer */
|
||||||
|
token_count+=kvp_value_get_gint64(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* increment the token count */
|
||||||
|
token_count++;
|
||||||
|
|
||||||
|
/* create a new value */
|
||||||
|
new_value = kvp_value_new_gint64(token_count);
|
||||||
|
|
||||||
|
/* insert the value into the kvp tree at
|
||||||
|
* /imap->frame/IMAP_FRAME/token_string/account_name_string
|
||||||
|
*/
|
||||||
|
kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES,
|
||||||
|
(char*)current_token->data, account_fullname, NULL);
|
||||||
|
|
||||||
|
/* kvp_frame_set_slot_path() copied the value so we
|
||||||
|
* need to delete this one ;-) */
|
||||||
|
kvp_value_delete(new_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* free up the account fullname string */
|
||||||
|
g_free(account_fullname);
|
||||||
|
|
||||||
|
LEAVE(" ");
|
||||||
|
}
|
||||||
|
|
||||||
/** @} */
|
/** @} */
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
An import mapper service that stores Account Maps for the
|
An import mapper service that stores Account Maps for the
|
||||||
generic importer. This allows importers to map various
|
generic importer. This allows importers to map various
|
||||||
"strings" to Gnucash accounts in a generic manner.
|
"strings" to Gnucash accounts in a generic manner.
|
||||||
@author Copyright (C) 2002 Derek Atkins <derek@ihtfp.com>
|
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
|
||||||
*/
|
*/
|
||||||
#ifndef GNC_IMPORT_MATCH_MAP_H
|
#ifndef GNC_IMPORT_MATCH_MAP_H
|
||||||
#define GNC_IMPORT_MATCH_MAP_H
|
#define GNC_IMPORT_MATCH_MAP_H
|
||||||
@ -48,8 +48,8 @@ void gnc_imap_destroy (GncImportMatchMap *imap);
|
|||||||
void gnc_imap_clear (GncImportMatchMap *imap);
|
void gnc_imap_clear (GncImportMatchMap *imap);
|
||||||
|
|
||||||
/** Look up an Account in the map */
|
/** Look up an Account in the map */
|
||||||
Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
|
Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category,
|
||||||
const char *key);
|
const char *key);
|
||||||
|
|
||||||
/** Store an Account in the map. This mapping is immediatly stored in
|
/** Store an Account in the map. This mapping is immediatly stored in
|
||||||
the underlying kvp frame, regardless of whether the MatchMap is
|
the underlying kvp frame, regardless of whether the MatchMap is
|
||||||
@ -57,6 +57,16 @@ Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
|
|||||||
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
|
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
|
||||||
const char *key, Account *acc);
|
const char *key, Account *acc);
|
||||||
|
|
||||||
|
/** Look up an Account in the map from a GList* of pointers to strings(tokens)
|
||||||
|
from the current transaction */
|
||||||
|
Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList* tokens);
|
||||||
|
|
||||||
|
/** Store an Account in the map. This mapping is immediatly stored in
|
||||||
|
the underlying kvp frame, regardless of whether the MatchMap is
|
||||||
|
destroyed later or not. */
|
||||||
|
void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
|
||||||
|
Account *acc);
|
||||||
|
|
||||||
|
|
||||||
/** @name Some well-known categories
|
/** @name Some well-known categories
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user