diff --git a/ChangeLog b/ChangeLog index f40341df05..6ae8ef8556 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2006-07-04 Derek Atkins + + * src/core-utils/gnc-glib-utils.h + * src/core-utils/gw-core-utils-spec.scm: + Remove the gnc_utf8_validate() API. It's not used anywhere. + * src/core-utils/gnc-glib-utils.c: + Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate + but ignore certain control characters between 0x00 and 0x20 + that are not valid XML characters. Fixes #346535. + 2006-07-04 Christian Stimming * po/de.po: Proof-read and improved German translation by feedback diff --git a/src/core-utils/gnc-glib-utils.c b/src/core-utils/gnc-glib-utils.c index 3acb2fcaa0..9ebe09cb75 100644 --- a/src/core-utils/gnc-glib-utils.c +++ b/src/core-utils/gnc-glib-utils.c @@ -44,10 +44,159 @@ safe_utf8_collate (const char * da, const char * db) return 0; } -gboolean -gnc_utf8_validate (const gchar *str) +/******************************************************************** + * The following definitions are from gutf8.c, for use by + * gnc_utf8_validate(). These are all verbatim copies, except for + * UNICODE_VALID() which has been modified to look for the strict + * subset of UTF-8 that is valid XML text. + */ + +#define UTF8_COMPUTE(Char, Mask, Len) \ + if (Char < 128) \ + { \ + Len = 1; \ + Mask = 0x7f; \ + } \ + else if ((Char & 0xe0) == 0xc0) \ + { \ + Len = 2; \ + Mask = 0x1f; \ + } \ + else if ((Char & 0xf0) == 0xe0) \ + { \ + Len = 3; \ + Mask = 0x0f; \ + } \ + else if ((Char & 0xf8) == 0xf0) \ + { \ + Len = 4; \ + Mask = 0x07; \ + } \ + else if ((Char & 0xfc) == 0xf8) \ + { \ + Len = 5; \ + Mask = 0x03; \ + } \ + else if ((Char & 0xfe) == 0xfc) \ + { \ + Len = 6; \ + Mask = 0x01; \ + } \ + else \ + Len = -1; + +#define UTF8_LENGTH(Char) \ + ((Char) < 0x80 ? 1 : \ + ((Char) < 0x800 ? 2 : \ + ((Char) < 0x10000 ? 3 : \ + ((Char) < 0x200000 ? 4 : \ + ((Char) < 0x4000000 ? 5 : 6))))) + + +#define UTF8_GET(Result, Chars, Count, Mask, Len) \ + (Result) = (Chars)[0] & (Mask); \ + for ((Count) = 1; (Count) < (Len); ++(Count)) \ + { \ + if (((Chars)[(Count)] & 0xc0) != 0x80) \ + { \ + (Result) = -1; \ + break; \ + } \ + (Result) <<= 6; \ + (Result) |= ((Chars)[(Count)] & 0x3f); \ + } + +#define UNICODE_VALID(Char) \ + ((Char) < 0x110000 && \ + (((Char) & 0xFFFFF800) != 0xD800) && \ + ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ + ((Char) > 0x20 || (Char) == 0x09 || (Char) == 0x0A || (Char) == 0x0D) && \ + ((Char) & 0xFFFE) != 0xFFFE) + +/** + * gnc_utf8_validate (copied from g_utf8_validate): + * @str: a pointer to character data + * @max_len: max bytes to validate, or -1 to go until nul + * @end: return location for end of valid data + * + * Validates UTF-8 encoded text. @str is the text to validate; + * if @str is nul-terminated, then @max_len can be -1, otherwise + * @max_len should be the number of bytes to validate. + * If @end is non-%NULL, then the end of the valid range + * will be stored there (i.e. the address of the first invalid byte + * if some bytes were invalid, or the end of the text being validated + * otherwise). + * + * This function looks validates the strict subset of UTF-8 that is + * valid XML text, as detailed in + * http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535 + * + * Returns %TRUE if all of @str was valid. Many GLib and GTK+ + * routines require valid UTF-8 as input; + * so data read from a file or the network should be checked + * with g_utf8_validate() before doing anything else with it. + * + * Return value: %TRUE if the text was valid UTF-8 + **/ +static gboolean +gnc_utf8_validate (const gchar *str, + gssize max_len, + const gchar **end) { - return g_utf8_validate(str, -1, NULL); + + const gchar *p; + + g_return_val_if_fail (str != NULL, FALSE); + + if (end) + *end = str; + + p = str; + + while ((max_len < 0 || (p - str) < max_len) && *p) + { + int i, mask = 0, len; + gunichar result; + unsigned char c = (unsigned char) *p; + + UTF8_COMPUTE (c, mask, len); + + if (len == -1) + break; + + /* check that the expected number of bytes exists in str */ + if (max_len >= 0 && + ((max_len - (p - str)) < len)) + break; + + UTF8_GET (result, p, i, mask, len); + + if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */ + break; + + if (result == (gunichar)-1) + break; + + if (!UNICODE_VALID (result)) + break; + + p += len; + } + + if (end) + *end = p; + + /* See that we covered the entire length if a length was + * passed in, or that we ended on a nul if not + */ + if (max_len >= 0 && + p != (str + max_len)) + return FALSE; + else if (max_len < 0 && + *p != '\0') + return FALSE; + else + return TRUE; } void @@ -56,12 +205,12 @@ gnc_utf8_strip_invalid (gchar *str) gchar *end; gint len; - if (g_utf8_validate(str, -1, (const gchar **)&end)) + if (gnc_utf8_validate(str, -1, (const gchar **)&end)) return; g_warning("Invalid utf8 string: %s", str); do { len = strlen(end); memmove(end, end+1, len); /* shuffle the remainder one byte */ - } while (!g_utf8_validate(str, -1, (const gchar **)&end)); + } while (!gnc_utf8_validate(str, -1, (const gchar **)&end)); } diff --git a/src/core-utils/gnc-glib-utils.h b/src/core-utils/gnc-glib-utils.h index f52d25bc48..e1f03b0bbf 100644 --- a/src/core-utils/gnc-glib-utils.h +++ b/src/core-utils/gnc-glib-utils.h @@ -56,19 +56,12 @@ int safe_utf8_collate (const char *str1, const char *str2); -/** This is a helper function for guile. C code should call - * g_utf8_validate directly. - * - * @param str The string to be validated. - * - * @return TRUE if this string is valid utf8. */ -gboolean gnc_utf8_validate (const gchar *str); - - /** Strip any non-utf8 characters from a string. This function * rewrites the string "in place" instead of allocating and returning - * a new string. This allows it to operat on strings that are - * defined as character arrays in a larger data structure. + * a new string. This allows it to operate on strings that are + * defined as character arrays in a larger data structure. Note that + * it also removes some subset of invalid XML characters, too. + * See http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535 * * @param str A pointer to the string to strip of invalid * characters. */ diff --git a/src/core-utils/gw-core-utils-spec.scm b/src/core-utils/gw-core-utils-spec.scm index 221f4182d0..bc9b0e4875 100644 --- a/src/core-utils/gw-core-utils-spec.scm +++ b/src/core-utils/gw-core-utils-spec.scm @@ -53,14 +53,6 @@ '((( caller-owned) program)) "Get a boolean value from gconf.") - (gw:wrap-function - ws - 'gnc:utf8-validate - ' - "gnc_utf8_validate" - '((( caller-owned) program)) - "Validate UTF8 encoded text.") - (gw:wrap-function ws 'gnc:utf8-strip-invalid