Transfer the account-matching functions from import-export to Account

I would have preferred to separate the data retrieval from the actual
Bayesian routines, but I didn't think that I could do so without losing
backwards data compatibility.
This commit is contained in:
John Ralls 2013-10-27 15:37:42 -07:00
parent fadc3d7082
commit c9493cfcf5
10 changed files with 575 additions and 664 deletions

View File

@ -386,7 +386,6 @@ src/import-export/import-backend.c
src/import-export/import-commodity-matcher.c
src/import-export/import-format-dialog.c
src/import-export/import-main-matcher.c
src/import-export/import-match-map.c
src/import-export/import-match-picker.c
src/import-export/import-parse.c
src/import-export/import-settings.c

View File

@ -4819,6 +4819,508 @@ xaccAccountForEachTransaction(const Account *acc, TransactionCallback proc,
return xaccAccountStagedTransactionTraversal(acc, 42, proc, data);
}
/* ================================================================ */
/* The following functions are used by
* src/import-export/import-backend.c to manipulate the contra-account
* matching data. See src/import-export/import-backend.c for explanations.
*/
/* FIXME: These data are stored per-account in KVP and the functions
* work directly on KVP data structures. This prevents moving KVP to a
* backend-only abstraction.
*/
/* Handle used to read and write the account-matching data that is
 * kept in an account's KVP frame.
 */
typedef struct _GncImportMatchMap
{
kvp_frame * frame; /* slots frame of acc, obtained via qof_instance_get_slots() */
Account * acc; /* account owning the frame; edited and marked dirty on writes */
QofBook * book; /* book of acc, cached for account lookups */
} GncImportMatchMap;
/* Top-level KVP keys: IMAP_FRAME is the root of the category/key ->
 * account GUID entries, IMAP_FRAME_BAYES is the root of the
 * token/account-full-name -> count entries.
 */
#define IMAP_FRAME "import-map"
#define IMAP_FRAME_BAYES "import-map-bayes"
/* Forward declarations of the map functions defined below. */
GncImportMatchMap * gnc_account_create_imap (Account *acc);
Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category,
const char *key);
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
const char *key, Account *acc);
Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList* tokens);
void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
Account *acc);
/** Obtain an ImportMatchMap object from an Account.
 *
 * @param acc The account whose KVP slots hold the matching data.
 *
 * @return A newly allocated map (allocated with g_new0(), so the caller
 * releases it with g_free()), or NULL if @a acc is NULL or has no slots
 * frame.  The map only borrows the account, its frame and its book.
 */
GncImportMatchMap *
gnc_account_create_imap (Account *acc)
{
    GncImportMatchMap *imap;
    kvp_frame *frame;

    if (!acc) return NULL;

    frame = qof_instance_get_slots (QOF_INSTANCE (acc));
    g_return_val_if_fail (frame != NULL, NULL);

    imap = g_new0 (GncImportMatchMap, 1);
    imap->frame = frame;

    /* Cache the book for easy lookups; store the account for
     * marking dirtiness
     */
    imap->acc = acc;
    imap->book = gnc_account_get_book (acc);

    return imap;
}
/** Look up the Account stored in the map under category/key.
 *
 * @param imap     The map to search; NULL yields NULL.
 * @param category First path element below IMAP_FRAME.  When NULL,
 *                 @a key alone is used as the path.
 * @param key      Second path element; NULL yields NULL.
 *
 * @return The account whose GUID is stored at that path, looked up in
 * the map's book, or NULL when nothing is stored there.
 */
Account*
gnc_imap_find_account (GncImportMatchMap *imap,
                       const char *category,
                       const char *key)
{
    const char *first_key = category;
    const char *second_key = key;
    kvp_value *slot;

    if (imap == NULL || key == NULL)
        return NULL;

    /* Without a category the key becomes the only path element. */
    if (first_key == NULL)
    {
        first_key = key;
        second_key = NULL;
    }

    slot = kvp_frame_get_slot_path (imap->frame, IMAP_FRAME,
                                    first_key, second_key, NULL);
    if (slot == NULL)
        return NULL;

    return xaccAccountLookup (kvp_value_get_guid (slot), imap->book);
}
/** Store an Account in the map under category/key.
 *
 * The GUID of @a acc is written to IMAP_FRAME/category/key in the KVP
 * frame of the map's own account, inside a begin/commit edit pair, and
 * that account is marked dirty.
 *
 * @param imap     The map to update; nothing happens when NULL.
 * @param category First path element.  When NULL, @a key alone is used
 *                 as the path.
 * @param key      Second path element; NULL or empty keys are ignored.
 * @param acc      The account to remember; nothing happens when NULL.
 */
void
gnc_imap_add_account (GncImportMatchMap *imap,
                      const char *category,
                      const char *key,
                      Account *acc)
{
    kvp_value *value;

    if (!imap || !key || !acc || (*key == '\0')) return;

    if (!category)
    {
        category = key;
        key = NULL;
    }

    value = kvp_value_new_guid (xaccAccountGetGUID (acc));
    g_return_if_fail (value != NULL);

    xaccAccountBeginEdit (imap->acc);
    kvp_frame_set_slot_path (imap->frame, value, IMAP_FRAME, category, key, NULL);
    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* The local value is no longer needed once it has been stored. */
    kvp_value_delete (value);
}
/*--------------------------------------------------------------------------
Below here is the bayes transaction to account matching system
--------------------------------------------------------------------------*/
/* One account's share of the counts stored for a single token. */
struct account_token_count
{
char* account_name; /**< points at the KVP slot key naming the account (not a copy) */
gint64 token_count; /**< occurrences of a given token for this account_name */
};
/** total_count and the token_count for a given account let us calculate the
* probability of a given account with any single token
*/
struct token_accounts_info
{
GList *accounts; /**< list of pointers to struct account_token_count */
gint64 total_count; /**< sum of token_count over every entry in accounts */
};
/** kvp_frame_for_each_slot() callback that records one account's count
 * for the current token.
 *
 * @param key   Slot key, i.e. the account name; stored by pointer.
 * @param value Slot value holding the gint64 token count.
 * @param data  Pointer to the struct token_accounts_info being built.
 *
 * \note Keys are assumed to be unique, so no existing entry is searched.
 */
static void
buildTokenInfo(const char *key, kvp_value *value, gpointer data)
{
    struct token_accounts_info *info = (struct token_accounts_info*)data;
    gint64 count = kvp_value_get_gint64(value);
    struct account_token_count *entry = g_new0(struct account_token_count, 1);

    /* remember which account this count belongs to */
    entry->account_name = (char*)key;
    entry->token_count = count;

    /* fold the count into the per-token total and push the entry onto
     * the front of the list */
    info->total_count += count;
    info->accounts = g_list_prepend(info->accounts, entry);
}
/** Intermediate values used to calculate the bayes probability of a given
account, where p(AB) = (a*b)/[a*b + (1-a)(1-b)]: product is the running
(a*b*...), product_difference is the running (1-a)*(1-b)*...
*/
struct account_probability
{
double product; /* product of probabilities */
double product_difference; /* product of (1-probabilities) */
};
/** g_hash_table_foreach() callback that converts one entry of the running
 * probabilities table (account name -> struct account_probability*) into
 * an entry of the final table (account name -> integer probability).
 *
 * The integer is PROBABILITY_FACTOR times the match probability, i.e.
 * 10% is stored as 0.10 * 100000 = 10000.
 *
 * @param key   Account name; reused as the key of the final table.
 * @param value The struct account_probability for that account.
 * @param data  The GHashTable receiving the final probabilities.
 */
#define PROBABILITY_FACTOR 100000
static void
buildProbabilities(gpointer key, gpointer value, gpointer data)
{
    GHashTable *final_probabilities = (GHashTable*)data;
    struct account_probability *account_p = (struct account_probability*)value;

    /* P(AB) = A*B / [A*B + (1-A)*(1-B)]
     * NOTE: so we only keep track of a running product(A*B*C...)
     * and product difference ((1-A)(1-B)...)
     */
    double denominator = account_p->product + account_p->product_difference;
    gint32 probability = 0;

    /* A zero or NaN denominator would yield a NaN quotient, and converting
     * NaN to an integer is undefined; treat that case as "no match".
     */
    if (denominator > 0.0)
        probability = (gint32)((account_p->product / denominator)
                               * PROBABILITY_FACTOR);

    PINFO("P('%s') = '%d'\n", (char*)key, probability);

    g_hash_table_insert(final_probabilities, key, GINT_TO_POINTER(probability));
}
/** g_hash_table_foreach() callback that releases one value of the
 * running probabilities table, i.e. a struct account_probability that
 * was allocated in gnc_imap_find_account_bayes().  The key and the user
 * data are not used.
 */
static void
freeProbabilities(gpointer key, gpointer value, gpointer data)
{
    g_free(value);
}
/** Holds an account name and its corresponding integer probability.
The integer probability is the match probability scaled by
PROBABILITY_FACTOR.
*/
struct account_info
{
char* account_name; /* key taken from the final probabilities table */
gint32 probability; /* highest scaled probability seen so far */
};
/** g_hash_table_foreach() callback that tracks the best entry of the
 * final probabilities table.
 *
 * @param key   Account name.
 * @param value Integer probability (PROBABILITY_FACTOR times the real
 *              probability) packed into the pointer.
 * @param data  The struct account_info holding the best match so far;
 *              it is only replaced by a strictly greater probability.
 */
static void
highestProbability(gpointer key, gpointer value, gpointer data)
{
    struct account_info *best = (struct account_info*)data;
    gint candidate = GPOINTER_TO_INT(value);

    /* nothing to do unless this entry beats the stored one */
    if (!(candidate > best->probability))
        return;

    best->probability = candidate;
    best->account_name = key;
}
#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
/** Look up an Account in the map using bayesian token matching.
 *
 * For every token the per-account counts stored below IMAP_FRAME_BAYES
 * are read and combined into a running probability for each account
 * name.  The account with the highest combined probability is returned
 * if that probability reaches the 90% threshold.
 *
 * @param imap   The map to search; NULL yields NULL.
 * @param tokens GList of C strings taken from the transaction.  NULL or
 *               empty tokens are skipped, matching what
 *               gnc_imap_add_account_bayes() stores.
 *
 * @return The account found by full name below the root account of the
 * map's book, or NULL when no account reaches the threshold.
 */
Account*
gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList *tokens)
{
    struct token_accounts_info tokenInfo; /**< holds the accounts and total
                                           * token count for a single token */
    GList *current_token;                 /**< pointer to the current
                                           * token from the input GList
                                           * tokens */
    GList *current_account_token;         /**< pointer to the struct
                                           * account_token_count */
    struct account_token_count *account_c; /**< an account name and the number
                                            * of times a token has appeared
                                            * for the account */
    struct account_probability *account_p; /**< intermediate storage of values
                                            * to compute the bayes probability
                                            * of an account */
    GHashTable *running_probabilities;
    GHashTable *final_probabilities;
    struct account_info account_i;
    kvp_value* value;
    kvp_frame* token_frame;

    ENTER(" ");

    /* check to see if the imap is NULL */
    if (!imap)
    {
        PINFO("imap is null, returning null");
        LEAVE(" ");
        return NULL;
    }

    /* Create the tables only after the NULL check so that the early
     * return above cannot leak them.  Neither table owns its keys; the
     * values of running_probabilities are freed explicitly below.
     */
    running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
    final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);

    /* find the probability for each account that contains any of the tokens
     * in the input tokens list
     */
    for (current_token = tokens; current_token;
            current_token = current_token->next)
    {
        const char *token = (const char*)current_token->data;

        /* A NULL token would terminate the key path early and an empty
         * one is never stored, so neither can contribute a match.
         */
        if (!token || *token == '\0')
            continue;

        /* zero out the token_accounts_info structure */
        memset(&tokenInfo, 0, sizeof(struct token_accounts_info));

        PINFO("token: '%s'", token);

        /* find the slot for the given token off of the source account
         * for these tokens, search off of the IMAP_FRAME_BAYES path so
         * we aren't looking from the parent of the entire kvp tree
         */
        value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                        token, NULL);

        /* if value is null we should skip over this token */
        if (!value)
            continue;

        /* convert the slot(value) into the frame that contains the
         * list of accounts
         */
        token_frame = kvp_value_get_frame(value);

        /* token_frame should NEVER be null */
        if (!token_frame)
        {
            PERR("token '%s' has no accounts", token);
            continue; /* skip over this token */
        }

        /* collect every account recorded for this token together with
         * its count, and the total count over all of them
         */
        kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);

        /* A total of zero would turn every ratio below into 0/0, so
         * such a token contributes nothing; its entries are still freed.
         */
        current_account_token =
            (tokenInfo.total_count > 0) ? tokenInfo.accounts : NULL;

        /* for each account we have just found, see if the account
         * already exists in the list of account probabilities, if not
         * add it
         */
        for (; current_account_token;
                current_account_token = current_account_token->next)
        {
            double token_probability;

            /* get the account name and corresponding token count */
            account_c = (struct account_token_count*)current_account_token->data;

            PINFO("account_c->account_name('%s'), "
                  "account_c->token_count('%ld')/total_count('%ld')",
                  account_c->account_name, (long)account_c->token_count,
                  (long)tokenInfo.total_count);

            /* share of this token's occurrences that belong to the account */
            token_probability = (double)account_c->token_count /
                                (double)tokenInfo.total_count;

            account_p = g_hash_table_lookup(running_probabilities,
                                            account_c->account_name);

            if (account_p)
            {
                /* the account is already known: continue the running
                 * products
                 */
                account_p->product =
                    token_probability * account_p->product;
                account_p->product_difference =
                    ((double)1 - token_probability)
                    * account_p->product_difference;
            }
            else
            {
                /* add a new entry */
                PINFO("adding a new entry for this account");
                account_p = (struct account_probability*)
                            g_new0(struct account_probability, 1);

                /* set the product and product difference values */
                account_p->product = token_probability;
                account_p->product_difference =
                    (double)1 - token_probability;

                /* add the account name and (struct account_probability*)
                 * to the hash table */
                g_hash_table_insert(running_probabilities,
                                    account_c->account_name, account_p);
            }

            PINFO("product == %f, product_difference == %f",
                  account_p->product, account_p->product_difference);
        } /* for all accounts in tokenInfo */

        /* free the data in tokenInfo */
        for (current_account_token = tokenInfo.accounts; current_account_token;
                current_account_token = current_account_token->next)
        {
            /* free up each struct account_token_count we allocated */
            g_free((struct account_token_count*)current_account_token->data);
        }

        g_list_free(tokenInfo.accounts); /* free the accounts GList */
    }

    /* build a hash table of account names and their final probabilities
     * from each entry in the running_probabilities hash table
     */
    g_hash_table_foreach(running_probabilities, buildProbabilities,
                         final_probabilities);

    /* find the highest probability and the corresponding account */
    memset(&account_i, 0, sizeof(struct account_info));
    g_hash_table_foreach(final_probabilities, highestProbability, &account_i);

    /* free each element of the running_probabilities hash */
    g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);

    /* free the hash tables */
    g_hash_table_destroy(running_probabilities);
    g_hash_table_destroy(final_probabilities);

    PINFO("highest P('%s') = '%d'",
          account_i.account_name ? account_i.account_name : "(null)",
          account_i.probability);

    /* has this probability met our threshold? */
    if (account_i.probability >= threshold)
    {
        PINFO("found match");
        LEAVE(" ");
        return gnc_account_lookup_by_full_name(gnc_book_get_root_account(imap->book),
                                               account_i.account_name);
    }

    PINFO("no match");
    LEAVE(" ");

    return NULL; /* we didn't meet our threshold, return NULL for an account */
}
/** Updates the imap for a given account using a list of tokens.
 *
 * For every non-empty token the counter stored at
 * IMAP_FRAME_BAYES/token/account-full-name is incremented by one (it is
 * created with the value 1 when missing).  All updates happen inside a
 * single begin/commit edit of the map's own account, which is marked
 * dirty.
 *
 * @param imap   The map to update; nothing happens when NULL.
 * @param tokens GList of C strings taken from the transaction.
 * @param acc    The account the tokens should be associated with; must
 *               not be NULL.
 */
void
gnc_imap_add_account_bayes(GncImportMatchMap *imap,
                           GList *tokens,
                           Account *acc)
{
    GList *current_token;
    kvp_value *value;
    gint64 token_count;
    char* account_fullname;
    kvp_value *new_value; /* the value that will be added back into
                           * the kvp tree */

    ENTER(" ");

    /* if imap is null return */
    if (!imap)
    {
        LEAVE(" ");
        return;
    }

    /* Balance the ENTER above before reporting the failed precondition;
     * the explicit return covers builds where the check is compiled out.
     */
    if (!acc)
    {
        LEAVE(" ");
        g_return_if_fail (acc != NULL);
        return;
    }

    account_fullname = gnc_account_get_full_name(acc);
    xaccAccountBeginEdit (imap->acc);

    PINFO("account name: '%s'\n", account_fullname);

    /* process each token in the list */
    for (current_token = g_list_first(tokens); current_token;
            current_token = current_token->next)
    {
        const char *token = (const char*)current_token->data;

        /* Jump to next iteration if the pointer is not valid or if the
           string is empty. In HBCI import we almost always get an empty
           string, which doesn't work in the kvp lookup later. So we
           skip this case here. */
        if (!token || (*token == '\0'))
            continue;

        /* start off with no tokens for this account */
        token_count = 0;

        PINFO("adding token '%s'\n", token);

        /* is this token/account_name already in the kvp tree? */
        value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                        token,
                                        account_fullname,
                                        NULL);

        /* if the token/account is already in the tree, read the current
         * value from the tree and use this for the basis of the value we
         * are putting back
         */
        if (value)
        {
            PINFO("found existing value of '%ld'\n",
                  (long)kvp_value_get_gint64(value));

            /* convert this value back into an integer */
            token_count += kvp_value_get_gint64(value);
        }

        /* increment the token count */
        token_count++;

        /* create a new value */
        new_value = kvp_value_new_gint64(token_count);

        /* insert the value into the kvp tree at
         * /imap->frame/IMAP_FRAME_BAYES/token_string/account_name_string
         */
        kvp_frame_set_slot_path(imap->frame, new_value,
                                IMAP_FRAME_BAYES,
                                token,
                                account_fullname,
                                NULL);

        /* kvp_frame_set_slot_path() copied the value so we
         * need to delete this one ;-) */
        kvp_value_delete(new_value);
    }

    /* flag the change and close the edit opened above */
    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* free up the account fullname string */
    g_free(account_fullname);

    LEAVE(" ");
}
/* ================================================================ */
/* QofObject function implementation and registration */

View File

@ -1721,43 +1721,44 @@ gtva_currency_changed_cb (void)
gtva_update_column_names (ptr->data);
}
}
/* This function implements a custom mapping between an account's KVP
* and the cell renderer's 'text' property. */
/* Retrieve a specified account string property and put the result
* into the tree column's text property.
*/
static void
account_cell_kvp_data_func (GtkTreeViewColumn *tree_column,
GtkCellRenderer *cell,
GtkTreeModel *s_model,
GtkTreeIter *s_iter,
gpointer key)
account_cell_property_data_func (GtkTreeViewColumn *tree_column,
GtkCellRenderer *cell,
GtkTreeModel *s_model,
GtkTreeIter *s_iter,
gpointer key)
{
Account *account;
kvp_frame * frame;
gchar *string;
g_return_if_fail (GTK_IS_TREE_MODEL_SORT (s_model));
account = gnc_tree_view_account_get_account_from_iter(s_model, s_iter);
frame = xaccAccountGetSlots(account);
g_object_set (G_OBJECT (cell),
"text", kvp_frame_get_string(frame, (gchar *)key),
"xalign", 0.0,
NULL);
qof_instance_get (QOF_INSTANCE (account),
key, &string,
NULL);
if (string == NULL)
string = "";
g_object_set (G_OBJECT (cell), "text", string, "xalign", 0.0, NULL);
}
GtkTreeViewColumn *
gnc_tree_view_account_add_kvp_column (GncTreeViewAccount *view,
gnc_tree_view_account_add_property_column (GncTreeViewAccount *view,
const gchar *column_title,
const gchar *kvp_key)
const gchar *propname)
{
GtkCellRenderer *renderer;
GtkTreeViewColumn *column;
g_return_val_if_fail (GNC_IS_TREE_VIEW_ACCOUNT (view), NULL);
g_return_val_if_fail (kvp_key != NULL, NULL);
g_return_val_if_fail (propname != NULL, NULL);
column = gnc_tree_view_add_text_column(GNC_TREE_VIEW(view), column_title,
kvp_key, NULL, "Sample text",
propname, NULL, "Sample text",
-1, -1, NULL);
/* This new kvp column has only had one renderer added to it so
@ -1766,8 +1767,8 @@ gnc_tree_view_account_add_kvp_column (GncTreeViewAccount *view,
g_object_set (G_OBJECT (renderer), "xalign", 1.0, NULL);
gtk_tree_view_column_set_cell_data_func (column, renderer,
account_cell_kvp_data_func,
g_strdup(kvp_key), g_free);
account_cell_property_data_func,
g_strdup(propname), g_free);
return column;
}

View File

@ -203,20 +203,19 @@ void gnc_tree_view_account_notes_edited_cb(Account *account, GtkTreeViewColumn *
/** Add a new column to the set of columns in an account tree view.
* This column will be visible as soon as it is added and will
* display the contents of the specified KVP slot.
* display the contents of the specified account property
*
* @param view A pointer to an account tree view.
*
* @param column_title The title for this new column.
*
* @param kvp_key The lookup key to use for looking up data in the
* account KVP structures. The value associated with this key is what
* will be displayed in the column.
* @param propname The g_object_property name of the desired
* value. This must be a string property.
*/
GtkTreeViewColumn *
gnc_tree_view_account_add_kvp_column (GncTreeViewAccount *view,
const gchar *column_title,
const gchar *kvp_key);
gnc_tree_view_account_add_property_column (GncTreeViewAccount *view,
const gchar *column_title,
const gchar *propname);
/** @} */

View File

@ -18,13 +18,11 @@ libgncmod_generic_import_la_SOURCES = \
import-parse.c \
import-utilities.c \
import-settings.c \
import-match-map.c \
import-main-matcher.c \
gncmod-generic-import.c
gncincludedir = ${GNC_INCLUDE_DIR}
gncinclude_HEADERS = \
import-match-map.h \
import-parse.h
noinst_HEADERS = \
@ -32,7 +30,6 @@ noinst_HEADERS = \
import-backend.h \
import-commodity-matcher.h \
import-main-matcher.h \
import-match-map.h \
import-match-picker.h \
import-settings.h \
import-utilities.h

View File

@ -44,6 +44,40 @@
#include "gnc-prefs.h"
#include "gnc-ui-util.h"
/* Private interface to Account GncImportMatchMap functions */
/* Obtain an ImportMatchMap object from an Account */
extern GncImportMatchMap * gnc_account_create_imap (Account *acc);
/* Look up an Account in the map by category and key */
extern Account* gnc_imap_find_account(GncImportMatchMap *imap,
const char* category,
const char *key);
/* Store an Account in the map. This mapping is immediately stored in
the underlying kvp frame, regardless of whether the MatchMap is
destroyed later or not. */
extern void gnc_imap_add_account (GncImportMatchMap *imap,
const char *category,
const char *key, Account *acc);
/* Look up an Account in the map from a GList* of pointers to strings (tokens)
from the current transaction */
extern Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap,
GList* tokens);
/* Store an Account in the map for a GList* of tokens. This mapping is
immediately stored in the underlying kvp frame, regardless of whether
the MatchMap is destroyed later or not. */
extern void gnc_imap_add_account_bayes (GncImportMatchMap *imap,
GList* tokens,
Account *acc);
#define GNCIMPORT_DESC "desc"
#define GNCIMPORT_MEMO "memo"
#define GNCIMPORT_PAYEE "payee"
/********************************************************************\
* Constants *
@ -457,6 +491,15 @@ TransactionGetTokens(GNCImportTransInfo *info)
/* return the pointer to the GList */
return tokens;
}
/* Release an import map handle.  Only the handle itself is freed; every
 * stored entry continues to exist in the underlying kvp frame of the
 * account.  Passing NULL is allowed.
 */
static void
gnc_imap_destroy (GncImportMatchMap *imap)
{
    /* g_free() ignores NULL, so no separate guard is required */
    g_free (imap);
}
/* searches using the GNCImportTransInfo through all existing transactions
* if there is an exact match of the description and memo
@ -471,7 +514,7 @@ matchmap_find_destination (GncImportMatchMap *matchmap, GNCImportTransInfo *info
g_assert (info);
tmp_map = ((matchmap != NULL) ? matchmap :
gnc_imap_create_from_account
gnc_account_create_imap
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (info))));
@ -541,7 +584,7 @@ matchmap_store_destination (GncImportMatchMap *matchmap,
tmp_matchmap = ((matchmap != NULL) ?
matchmap :
gnc_imap_create_from_account
gnc_account_create_imap
(xaccSplitGetAccount
(gnc_import_TransInfo_get_fsplit (trans_info))));

View File

@ -29,11 +29,11 @@
#define TRANSACTION_MATCHER_H
#include "Transaction.h"
#include "import-match-map.h"
#include "import-settings.h"
typedef struct _transactioninfo GNCImportTransInfo;
typedef struct _matchinfo GNCImportMatchInfo;
typedef struct _GncImportMatchMap GncImportMatchMap;
typedef enum _action
{

View File

@ -44,7 +44,6 @@
#include "gnc-ui-util.h"
#include "gnc-engine.h"
#include "import-settings.h"
#include "import-match-map.h"
#include "import-match-picker.h"
#include "import-backend.h"
#include "import-account-matcher.h"

View File

@ -1,545 +0,0 @@
/********************************************************************\
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License as *
* published by the Free Software Foundation; either version 2 of *
* the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License*
* along with this program; if not, contact: *
* *
* Free Software Foundation Voice: +1-617-542-5942 *
* 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
* Boston, MA 02110-1301, USA gnu@gnu.org *
\********************************************************************/
/** @addtogroup Import_Export
@{ */
/** @internal
@file import-match-map.c
@brief Generic import mapper service, maps strings->accounts
*
An import mapper service that stores Account Maps for the
generic importer. This allows importers to map various
"strings" to Gnucash accounts in a generic manner.
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
*/
#include "config.h"
#include <string.h>
#include <glib.h>
#include "import-match-map.h"
#include "gnc-ui-util.h"
#include "gnc-engine.h"
/********************************************************************\
* Constants *
\********************************************************************/
static QofLogModule log_module = GNC_MOD_IMPORT;
/* Handle for the matching data kept in the KVP slots of an account or
 * of a book.
 */
struct _GncImportMatchMap
{
kvp_frame * frame; /* slots frame of the account or book */
Account * acc; /* owning account; NULL when the map was created from a book */
QofBook * book; /* book used for account lookups */
};
/* Top-level KVP keys of the plain and of the bayes matching data. */
#define IMAP_FRAME "import-map"
#define IMAP_FRAME_BAYES "import-map-bayes"
/** Allocate a map for @a frame.  Exactly one of @a acc and @a book must
 * be given; when an account is given the map's book is taken from it.
 * Returns NULL if either precondition fails.
 */
static GncImportMatchMap *
gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, QofBook *book)
{
    GncImportMatchMap *map;

    g_return_val_if_fail (frame != NULL, NULL);
    g_return_val_if_fail ((acc && !book) || (!acc && book), NULL);

    map = g_new0(GncImportMatchMap, 1);
    map->frame = frame;

    /* Keep the account (possibly NULL) for marking dirtiness and cache
     * the book for easy lookups.
     */
    map->acc = acc;
    map->book = acc ? gnc_account_get_book (acc) : book;

    return map;
}
/** Obtain an ImportMatchMap object backed by the KVP slots of an Account.
 * Returns NULL when @a acc is NULL or has no slots frame.
 */
GncImportMatchMap *
gnc_imap_create_from_account (Account *acc)
{
    kvp_frame *frame;

    if (acc == NULL)
        return NULL;

    frame = xaccAccountGetSlots (acc);
    g_return_val_if_fail (frame != NULL, NULL);

    return gnc_imap_create_from_frame (frame, acc, NULL);
}
/** Obtain an ImportMatchMap object backed by the KVP slots of a Book.
 * Returns NULL when @a book is NULL or has no slots frame.
 */
GncImportMatchMap *
gnc_imap_create_from_book (QofBook *book)
{
    kvp_frame *frame;

    if (book == NULL)
        return NULL;

    frame = qof_book_get_slots (book);
    g_return_val_if_fail (frame != NULL, NULL);

    return gnc_imap_create_from_frame (frame, NULL, book);
}
/** Destroy an import map handle.  Only the handle is freed; passing NULL
 * is allowed.
 */
void
gnc_imap_destroy (GncImportMatchMap *imap)
{
    /* g_free() ignores NULL, so no separate guard is required */
    g_free (imap);
}
/** Clear an import map -- this removes ALL entries in the map, both the
 * plain entries below IMAP_FRAME and the bayes entries below
 * IMAP_FRAME_BAYES.  Nothing happens when @a imap is NULL.
 */
void gnc_imap_clear (GncImportMatchMap *imap)
{
    if (!imap) return;

    /* The key path is variadic and must be terminated with NULL. */

    /* Clear the IMAP_FRAME kvp */
    kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME, NULL);

    /* Clear the bayes kvp, IMAP_FRAME_BAYES */
    kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES, NULL);

    /* XXX: mark the account (or book) as dirty! */
}
/** Look up the Account stored in the map under category/key.
 *
 * A NULL @a category makes @a key the only path element below
 * IMAP_FRAME.  Returns NULL when @a imap or @a key is NULL or when
 * nothing is stored at that path.
 */
Account *
gnc_imap_find_account (GncImportMatchMap *imap, const char *category,
                       const char *key)
{
    const char *first_key = category;
    const char *second_key = key;
    kvp_value *slot;

    if (imap == NULL || key == NULL)
        return NULL;

    /* Without a category the key becomes the only path element. */
    if (first_key == NULL)
    {
        first_key = key;
        second_key = NULL;
    }

    slot = kvp_frame_get_slot_path (imap->frame, IMAP_FRAME,
                                    first_key, second_key, NULL);
    if (slot == NULL)
        return NULL;

    return xaccAccountLookup (kvp_value_get_guid (slot), imap->book);
}
/** Store an Account in the map under category/key.
 *
 * The GUID of @a acc is written to IMAP_FRAME/category/key inside a
 * begin/commit edit of the map's account, which is marked dirty.  A
 * NULL @a category makes @a key the only path element.  Nothing happens
 * when @a imap, @a key or @a acc is NULL or when @a key is empty.
 */
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
                           const char *key, Account *acc)
{
    kvp_value *value;

    if (!imap || !key || !acc || (*key == '\0')) return;

    if (!category)
    {
        category = key;
        key = NULL;
    }

    value = kvp_value_new_guid (xaccAccountGetGUID (acc));
    g_return_if_fail (value != NULL);

    xaccAccountBeginEdit (imap->acc);
    kvp_frame_set_slot_path (imap->frame, value, IMAP_FRAME, category, key, NULL);
    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* The local value is no longer needed once it has been stored. */
    kvp_value_delete (value);
}
/*--------------------------------------------------------------------------
Below here is the bayes transaction to account matching system
--------------------------------------------------------------------------*/
/* One account's share of the counts stored for a single token. */
struct account_token_count
{
char* account_name; /**< points at the KVP slot key naming the account (not a copy) */
gint64 token_count; /**< occurrences of a given token for this account_name */
};
/** total_count and the token_count for a given account let us calculate the
* probability of a given account with any single token
*/
struct token_accounts_info
{
GList *accounts; /**< list of pointers to struct account_token_count */
gint64 total_count; /**< sum of token_count over every entry in accounts */
};
/** kvp_frame_for_each_slot() callback that records one account's count
 * for the current token.  @a key is the account name (stored by
 * pointer), @a value holds the gint64 count and @a data points to the
 * struct token_accounts_info being built.
 * \note Keys are assumed to be unique, so no existing entry is searched.
 */
static void buildTokenInfo(const char *key, kvp_value *value, gpointer data)
{
    struct token_accounts_info *info = (struct token_accounts_info*)data;
    gint64 count = kvp_value_get_gint64(value);
    struct account_token_count *entry = g_new0(struct account_token_count, 1);

    /* remember which account this count belongs to */
    entry->account_name = (char*)key;
    entry->token_count = count;

    /* fold the count into the per-token total and push the entry onto
     * the front of the list */
    info->total_count += count;
    info->accounts = g_list_prepend(info->accounts, entry);
}
/** Intermediate values used to calculate the bayes probability of a given
account, where p(AB) = (a*b)/[a*b + (1-a)(1-b)]: product is the running
(a*b*...), product_difference is the running (1-a)*(1-b)*...
*/
struct account_probability
{
double product; /* product of probabilities */
double product_difference; /* product of (1-probabilities) */
};
/** g_hash_table_foreach() callback that converts one entry of the running
 * probabilities table (account name -> struct account_probability*) into
 * an entry of the final table (account name -> integer probability).
 *
 * The integer is PROBABILITY_FACTOR times the match probability, i.e.
 * 10% is stored as 0.10 * 100000 = 10000.  @a data is the GHashTable
 * receiving the final probabilities.
 */
#define PROBABILITY_FACTOR 100000
static void buildProbabilities(gpointer key, gpointer value, gpointer data)
{
    GHashTable *final_probabilities = (GHashTable*)data;
    struct account_probability *account_p = (struct account_probability*)value;

    /* P(AB) = A*B / [A*B + (1-A)*(1-B)]
     * NOTE: so we only keep track of a running product(A*B*C...)
     * and product difference ((1-A)(1-B)...)
     */
    double denominator = account_p->product + account_p->product_difference;
    gint32 probability = 0;

    /* A zero or NaN denominator would yield a NaN quotient, and converting
     * NaN to an integer is undefined; treat that case as "no match".
     */
    if (denominator > 0.0)
        probability = (gint32)((account_p->product / denominator)
                               * PROBABILITY_FACTOR);

    PINFO("P('%s') = '%d'\n", (char*)key, probability);

    g_hash_table_insert(final_probabilities, key, GINT_TO_POINTER(probability));
}
/** g_hash_table_foreach() callback that releases one value of the
 * running probabilities table, i.e. a struct account_probability that
 * was allocated in gnc_imap_find_account_bayes().  The key and the user
 * data are not used.
 */
static void freeProbabilities(gpointer key, gpointer value, gpointer data)
{
    g_free(value);
}
/** Holds an account name and its corresponding integer probability.
The integer probability is the match probability scaled by
PROBABILITY_FACTOR.
*/
struct account_info
{
char* account_name; /* key taken from the final probabilities table */
gint32 probability; /* highest scaled probability seen so far */
};
/** g_hash_table_foreach() callback that tracks the best entry of the
 * final probabilities table.  @a key is the account name, @a value the
 * integer probability (PROBABILITY_FACTOR times the real probability)
 * packed into the pointer, and @a data the struct account_info holding
 * the best match so far; it is only replaced by a strictly greater
 * probability.
 */
static void highestProbability(gpointer key, gpointer value, gpointer data)
{
    struct account_info *best = (struct account_info*)data;
    gint candidate = GPOINTER_TO_INT(value);

    /* nothing to do unless this entry beats the stored one */
    if (!(candidate > best->probability))
        return;

    best->probability = candidate;
    best->account_name = key;
}
#define threshold (.90 * PROBABILITY_FACTOR) /* 90% */
/** Look up an Account in the map using the Bayesian token data.
 *
 *  For every token the per-account counts stored under
 *  IMAP_FRAME_BAYES/<token> are folded into a running product (and a
 *  running product of the complements) per account.  buildProbabilities()
 *  turns those into final integer-scaled probabilities, and the highest
 *  one wins if it reaches `threshold`.
 *
 *  @param imap   The match map to search; NULL yields NULL.
 *  @param tokens GList whose data pointers are C strings (tokens) taken
 *                from the transaction being matched.  NULL or empty
 *                tokens are ignored, mirroring
 *                gnc_imap_add_account_bayes(), which never stores them.
 *  @return The Account found by full name under the root account of
 *          imap->book, or NULL when imap is NULL or no account's
 *          probability reaches `threshold`.
 */
Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens)
{
    struct token_accounts_info tokenInfo; /**< holds the accounts and total
                                           * token count for a single token */
    GList *current_token;                 /**< pointer to the current token from the
                                           * input GList *tokens */
    GList *current_account_token;         /**< pointer to the struct
                                           * account_token_count */
    struct account_token_count *account_c; /**< an account name and the number
                                            * of times a token has appeared
                                            * for the account */
    struct account_probability *account_p; /**< intermediate storage of values
                                            * to compute the bayes probability
                                            * of an account */
    GHashTable *running_probabilities;
    GHashTable *final_probabilities;
    struct account_info account_i;
    kvp_value* value;
    kvp_frame* token_frame;
    double token_ratio;                   /**< share of a token's total count
                                           * that belongs to one account */

    ENTER(" ");

    /* check to see if the imap is NULL */
    if (!imap)
    {
        PINFO("imap is null, returning null");
        LEAVE(" ");
        return NULL;
    }

    /* Allocate the tables only after the NULL check so that the early
     * return above cannot leak them.
     */
    running_probabilities = g_hash_table_new(g_str_hash, g_str_equal);
    final_probabilities = g_hash_table_new(g_str_hash, g_str_equal);

    /* find the probability for each account that contains any of the tokens
     * in the input tokens list
     */
    for (current_token = tokens; current_token; current_token = current_token->next)
    {
        /* A NULL token would act as the terminator of the variadic key
         * path below and an empty one is never stored by
         * gnc_imap_add_account_bayes(), so neither can name a token slot.
         */
        if (!current_token->data || (*((char*)current_token->data) == '\0'))
            continue;

        /* zero out the token_accounts_info structure */
        memset(&tokenInfo, 0, sizeof(struct token_accounts_info));

        PINFO("token: '%s'", (char*)current_token->data);

        /* find the slot for the given token off of the source account
         * for these tokens, search off of the IMAP_FRAME_BAYES path so
         * we aren't looking from the parent of the entire kvp tree
         */
        value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                        (char*)current_token->data, NULL);

        /* if value is null we should skip over this token */
        if (!value)
            continue;

        /* convert the slot(value) into the frame that contains the
         * list of accounts
         */
        token_frame = kvp_value_get_frame(value);

        /* token_frame should NEVER be null */
        if (!token_frame)
        {
            PERR("token '%s' has no accounts", (char*)current_token->data);
            continue; /* skip over this token */
        }

        /* process the accounts for this token, adding the account if it
         * doesn't already exist or adding to the existing accounts token
         * count if it does
         */
        kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo);

        /* for each account we have just found, see if the account already exists
         * in the list of account probabilities, if not add it
         */
        for (current_account_token = tokenInfo.accounts; current_account_token;
             current_account_token = current_account_token->next)
        {
            /* get the account name and corresponding token count */
            account_c = (struct account_token_count*)current_account_token->data;

            PINFO("account_c->account_name('%s'), "
                  "account_c->token_count('%ld')/total_count('%ld')",
                  account_c->account_name, (long)account_c->token_count,
                  (long)tokenInfo.total_count);

            token_ratio = (double)account_c->token_count /
                          (double)tokenInfo.total_count;

            account_p = g_hash_table_lookup(running_probabilities,
                                            account_c->account_name);

            /* if the account exists in the list then continue
             * the running probabilities
             */
            if (account_p)
            {
                account_p->product = token_ratio * account_p->product;
                account_p->product_difference =
                    ((double)1 - token_ratio) * account_p->product_difference;
                PINFO("product == %f, product_difference == %f",
                      account_p->product, account_p->product_difference);
            }
            else
            {
                /* add a new entry */
                PINFO("adding a new entry for this account");
                account_p = (struct account_probability*)
                            g_new0(struct account_probability, 1);

                /* set the product and product difference values */
                account_p->product = token_ratio;
                account_p->product_difference = (double)1 - token_ratio;
                PINFO("product == %f, product_difference == %f",
                      account_p->product, account_p->product_difference);

                /* add the account name and (struct account_probability*)
                 * to the hash table */
                g_hash_table_insert(running_probabilities,
                                    account_c->account_name, account_p);
            }
        } /* for all accounts in tokenInfo */

        /* free the data in tokenInfo */
        for (current_account_token = tokenInfo.accounts; current_account_token;
             current_account_token = current_account_token->next)
        {
            /* free up each struct account_token_count we allocated */
            g_free((struct account_token_count*)current_account_token->data);
        }

        g_list_free(tokenInfo.accounts); /* free the accounts GList */
    }

    /* build a hash table of account names and their final probabilities
     * from each entry in the running_probabilities hash table
     */
    g_hash_table_foreach(running_probabilities, buildProbabilities,
                         final_probabilities);

    /* find the highest probability and the corresponding account */
    memset(&account_i, 0, sizeof(struct account_info));
    g_hash_table_foreach(final_probabilities, highestProbability, &account_i);

    /* free each element of the running_probabilities hash */
    g_hash_table_foreach(running_probabilities, freeProbabilities, NULL);

    /* free the hash tables; they were created without key destructors, so
     * the key string that account_i.account_name points at is not released
     * here (presumably it is owned by the KVP frame -- confirm against
     * buildTokenInfo)
     */
    g_hash_table_destroy(running_probabilities);
    g_hash_table_destroy(final_probabilities);

    PINFO("highest P('%s') = '%d'",
          account_i.account_name ? account_i.account_name : "(null)",
          account_i.probability);

    /* has this probability met our threshold? */
    if (account_i.probability >= threshold)
    {
        PINFO("found match");
        LEAVE(" ");
        return gnc_account_lookup_by_full_name(gnc_book_get_root_account(imap->book),
                                               account_i.account_name);
    }

    PINFO("no match");
    LEAVE(" ");

    return NULL; /* we didn't meet our threshold, return NULL for an account */
}
/** Updates the imap for a given account using a list of tokens.
 *
 *  For every usable token the counter stored at
 *  imap->frame/IMAP_FRAME_BAYES/<token>/<full name of acc> is incremented
 *  by one (starting from zero when the slot does not exist yet).  The
 *  whole update is wrapped in a Begin/CommitEdit pair on imap->acc, which
 *  is also marked dirty.
 *
 *  @param imap   The match map to update; NULL makes this a no-op.
 *  @param tokens GList whose data pointers are C strings.  The list is
 *                walked from its first element; NULL or empty tokens are
 *                skipped.
 *  @param acc    The account the tokens are associated with.  Must not be
 *                NULL; a NULL value is reported via g_return_if_fail()
 *                and nothing is stored.
 */
void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc)
{
    GList *current_token;
    kvp_value *value;
    gint64 token_count;
    char* account_fullname;
    kvp_value *new_value; /* the value that will be added back into the kvp tree */

    ENTER(" ");

    /* if imap is null return */
    if (!imap)
    {
        LEAVE(" ");
        return;
    }

    if (acc == NULL)
    {
        /* Balance the ENTER above before the check logs its warning and
         * returns.
         */
        LEAVE(" ");
        g_return_if_fail (acc != NULL);
        /* Only reached when the check is compiled out (G_DISABLE_CHECKS):
         * never fall through to the KVP writes with a NULL account.
         */
        return;
    }

    account_fullname = gnc_account_get_full_name(acc);
    xaccAccountBeginEdit (imap->acc);

    PINFO("account name: '%s'\n", account_fullname);

    /* process each token in the list */
    for (current_token = g_list_first(tokens); current_token;
         current_token = current_token->next)
    {
        /* Jump to next iteration if the pointer is not valid or if the
           string is empty. In HBCI import we almost always get an empty
           string, which doesn't work in the kvp lookup later. So we
           skip this case here. */
        if (!current_token->data || (*((char*)current_token->data) == '\0'))
            continue;

        /* start off with no tokens for this account */
        token_count = 0;

        PINFO("adding token '%s'\n", (char*)current_token->data);

        /* is this token/account_name already in the kvp tree? */
        value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES,
                                        (char*)current_token->data, account_fullname,
                                        NULL);

        /* if the token/account is already in the tree, read the current
         * value from the tree and use this for the basis of the value we
         * are putting back
         */
        if (value)
        {
            token_count = kvp_value_get_gint64(value);
            PINFO("found existing value of '%ld'\n", (long)token_count);
        }

        /* increment the token count */
        token_count++;

        /* create a new value */
        new_value = kvp_value_new_gint64(token_count);

        /* insert the value into the kvp tree at
         * /imap->frame/IMAP_FRAME_BAYES/token_string/account_name_string
         */
        kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES,
                                (char*)current_token->data, account_fullname, NULL);

        /* kvp_frame_set_slot_path() copied the value so we
         * need to delete this one ;-) */
        kvp_value_delete(new_value);
    }

    /* flag the owning account as changed and close the edit */
    qof_instance_set_dirty (QOF_INSTANCE (imap->acc));
    xaccAccountCommitEdit (imap->acc);

    /* free up the account fullname string */
    g_free(account_fullname);

    LEAVE(" ");
}
/** @} */

View File

@ -1,84 +0,0 @@
/********************************************************************\
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License as *
* published by the Free Software Foundation; either version 2 of *
* the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License*
* along with this program; if not, contact: *
* *
* Free Software Foundation Voice: +1-617-542-5942 *
* 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
* Boston, MA 02110-1301, USA gnu@gnu.org *
\********************************************************************/
/** @addtogroup Import_Export
@{ */
/** @file import-match-map.h
@brief Generic import mapper service, maps strings->accounts
*
An import mapper service that stores Account Maps for the
generic importer. This allows importers to map various
"strings" to Gnucash accounts in a generic manner.
@author Copyright (C) 2002,2003 Derek Atkins <derek@ihtfp.com>
*/
#ifndef GNC_IMPORT_MATCH_MAP_H
#define GNC_IMPORT_MATCH_MAP_H
/** Import match map handle; the struct is only forward-declared in this
    header. */
typedef struct _GncImportMatchMap GncImportMatchMap;
#include "Account.h"
/** @{
Obtain an ImportMatchMap object from an Account or a Book */
GncImportMatchMap * gnc_imap_create_from_account (Account *acc);
GncImportMatchMap * gnc_imap_create_from_book (QofBook *book);
/*@}*/
/** Destroy an import map. All stored entries will still continue
to exist in the underlying kvp frame of the account or book. */
void gnc_imap_destroy (GncImportMatchMap *imap);
/** Clear an import map -- this removes ALL entries in the map */
void gnc_imap_clear (GncImportMatchMap *imap);
/** Look up an Account in the map, using a category string and a key
string within that category. */
Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category,
const char *key);
/** Store an Account in the map. This mapping is immediately stored in
the underlying kvp frame, regardless of whether the MatchMap is
destroyed later or not. */
void gnc_imap_add_account (GncImportMatchMap *imap, const char *category,
const char *key, Account *acc);
/** Look up an Account in the map from a GList* of pointers to strings(tokens)
from the current transaction. Returns NULL when imap is NULL or when no
account scores high enough to count as a match. */
Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList* tokens);
/** Record in the map that the strings(tokens) in the GList* belong to the
given Account; NULL or empty tokens are skipped. This mapping is
immediately stored in the underlying kvp frame, regardless of whether
the MatchMap is destroyed later or not. */
void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens,
Account *acc);
/** @name Some well-known categories
NOTE: You DO NOT have to use these values in your importer -- these
are just "well known" values, not "mandatory" values. You are free
to use these if they apply, map your own fields to these labels, or
create your own category strings.
*/
/** @{*/
#define GNCIMPORT_DESC "desc"
#define GNCIMPORT_MEMO "memo"
#define GNCIMPORT_PAYEE "payee"
/**@}*/
#endif /* GNC_IMPORT_MATCH_MAP_H */
/**@}*/