mirror of
https://github.com/Gnucash/gnucash.git
synced 2025-02-25 18:55:30 -06:00
Consider certain control character as invalid UTF-8. Fixes #346535.
* src/core-utils/gnc-glib-utils.h * src/core-utils/gw-core-utils-spec.scm: Remove the gnc_utf8_validate() API. It's not used anywhere. * src/core-utils/gnc-glib-utils.c: Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate but ignore certain control characters between 0x00 and 0x20 that are not valid XML characters. Fixes #346535. git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@14466 57a11ea4-9604-0410-9ed3-97b8803252fd
This commit is contained in:
parent
c8c5db450f
commit
a50040835c
10
ChangeLog
10
ChangeLog
@ -1,3 +1,13 @@
|
|||||||
|
2006-07-04 Derek Atkins <derek@ihtfp.com>
|
||||||
|
|
||||||
|
* src/core-utils/gnc-glib-utils.h
|
||||||
|
* src/core-utils/gw-core-utils-spec.scm:
|
||||||
|
Remove the gnc_utf8_validate() API. It's not used anywhere.
|
||||||
|
* src/core-utils/gnc-glib-utils.c:
|
||||||
|
Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate
|
||||||
|
but ignore certain control characters between 0x00 and 0x20
|
||||||
|
that are not valid XML characters. Fixes #346535.
|
||||||
|
|
||||||
2006-07-04 Christian Stimming <stimming@tuhh.de>
|
2006-07-04 Christian Stimming <stimming@tuhh.de>
|
||||||
|
|
||||||
* po/de.po: Proof-read and improved German translation by feedback
|
* po/de.po: Proof-read and improved German translation by feedback
|
||||||
|
@ -44,10 +44,159 @@ safe_utf8_collate (const char * da, const char * db)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
gboolean
|
/********************************************************************
|
||||||
gnc_utf8_validate (const gchar *str)
|
* The following definitions are from gutf8.c, for use by
|
||||||
|
* gnc_utf8_validate(). These are all verbatim copies, except for
|
||||||
|
* UNICODE_VALID() which has been modified to look for the strict
|
||||||
|
* subset of UTF-8 that is valid XML text.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define UTF8_COMPUTE(Char, Mask, Len) \
|
||||||
|
if (Char < 128) \
|
||||||
|
{ \
|
||||||
|
Len = 1; \
|
||||||
|
Mask = 0x7f; \
|
||||||
|
} \
|
||||||
|
else if ((Char & 0xe0) == 0xc0) \
|
||||||
|
{ \
|
||||||
|
Len = 2; \
|
||||||
|
Mask = 0x1f; \
|
||||||
|
} \
|
||||||
|
else if ((Char & 0xf0) == 0xe0) \
|
||||||
|
{ \
|
||||||
|
Len = 3; \
|
||||||
|
Mask = 0x0f; \
|
||||||
|
} \
|
||||||
|
else if ((Char & 0xf8) == 0xf0) \
|
||||||
|
{ \
|
||||||
|
Len = 4; \
|
||||||
|
Mask = 0x07; \
|
||||||
|
} \
|
||||||
|
else if ((Char & 0xfc) == 0xf8) \
|
||||||
|
{ \
|
||||||
|
Len = 5; \
|
||||||
|
Mask = 0x03; \
|
||||||
|
} \
|
||||||
|
else if ((Char & 0xfe) == 0xfc) \
|
||||||
|
{ \
|
||||||
|
Len = 6; \
|
||||||
|
Mask = 0x01; \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
Len = -1;
|
||||||
|
|
||||||
|
#define UTF8_LENGTH(Char) \
|
||||||
|
((Char) < 0x80 ? 1 : \
|
||||||
|
((Char) < 0x800 ? 2 : \
|
||||||
|
((Char) < 0x10000 ? 3 : \
|
||||||
|
((Char) < 0x200000 ? 4 : \
|
||||||
|
((Char) < 0x4000000 ? 5 : 6)))))
|
||||||
|
|
||||||
|
|
||||||
|
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
|
||||||
|
(Result) = (Chars)[0] & (Mask); \
|
||||||
|
for ((Count) = 1; (Count) < (Len); ++(Count)) \
|
||||||
|
{ \
|
||||||
|
if (((Chars)[(Count)] & 0xc0) != 0x80) \
|
||||||
|
{ \
|
||||||
|
(Result) = -1; \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
(Result) <<= 6; \
|
||||||
|
(Result) |= ((Chars)[(Count)] & 0x3f); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define UNICODE_VALID(Char) \
|
||||||
|
((Char) < 0x110000 && \
|
||||||
|
(((Char) & 0xFFFFF800) != 0xD800) && \
|
||||||
|
((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
|
||||||
|
((Char) > 0x20 || (Char) == 0x09 || (Char) == 0x0A || (Char) == 0x0D) && \
|
||||||
|
((Char) & 0xFFFE) != 0xFFFE)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* gnc_utf8_validate (copied from g_utf8_validate):
|
||||||
|
* @str: a pointer to character data
|
||||||
|
* @max_len: max bytes to validate, or -1 to go until nul
|
||||||
|
* @end: return location for end of valid data
|
||||||
|
*
|
||||||
|
* Validates UTF-8 encoded text. @str is the text to validate;
|
||||||
|
* if @str is nul-terminated, then @max_len can be -1, otherwise
|
||||||
|
* @max_len should be the number of bytes to validate.
|
||||||
|
* If @end is non-%NULL, then the end of the valid range
|
||||||
|
* will be stored there (i.e. the address of the first invalid byte
|
||||||
|
* if some bytes were invalid, or the end of the text being validated
|
||||||
|
* otherwise).
|
||||||
|
*
|
||||||
|
* This function looks validates the strict subset of UTF-8 that is
|
||||||
|
* valid XML text, as detailed in
|
||||||
|
* http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
|
||||||
|
*
|
||||||
|
* Returns %TRUE if all of @str was valid. Many GLib and GTK+
|
||||||
|
* routines <emphasis>require</emphasis> valid UTF-8 as input;
|
||||||
|
* so data read from a file or the network should be checked
|
||||||
|
* with g_utf8_validate() before doing anything else with it.
|
||||||
|
*
|
||||||
|
* Return value: %TRUE if the text was valid UTF-8
|
||||||
|
**/
|
||||||
|
static gboolean
|
||||||
|
gnc_utf8_validate (const gchar *str,
|
||||||
|
gssize max_len,
|
||||||
|
const gchar **end)
|
||||||
{
|
{
|
||||||
return g_utf8_validate(str, -1, NULL);
|
|
||||||
|
const gchar *p;
|
||||||
|
|
||||||
|
g_return_val_if_fail (str != NULL, FALSE);
|
||||||
|
|
||||||
|
if (end)
|
||||||
|
*end = str;
|
||||||
|
|
||||||
|
p = str;
|
||||||
|
|
||||||
|
while ((max_len < 0 || (p - str) < max_len) && *p)
|
||||||
|
{
|
||||||
|
int i, mask = 0, len;
|
||||||
|
gunichar result;
|
||||||
|
unsigned char c = (unsigned char) *p;
|
||||||
|
|
||||||
|
UTF8_COMPUTE (c, mask, len);
|
||||||
|
|
||||||
|
if (len == -1)
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* check that the expected number of bytes exists in str */
|
||||||
|
if (max_len >= 0 &&
|
||||||
|
((max_len - (p - str)) < len))
|
||||||
|
break;
|
||||||
|
|
||||||
|
UTF8_GET (result, p, i, mask, len);
|
||||||
|
|
||||||
|
if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (result == (gunichar)-1)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (!UNICODE_VALID (result))
|
||||||
|
break;
|
||||||
|
|
||||||
|
p += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (end)
|
||||||
|
*end = p;
|
||||||
|
|
||||||
|
/* See that we covered the entire length if a length was
|
||||||
|
* passed in, or that we ended on a nul if not
|
||||||
|
*/
|
||||||
|
if (max_len >= 0 &&
|
||||||
|
p != (str + max_len))
|
||||||
|
return FALSE;
|
||||||
|
else if (max_len < 0 &&
|
||||||
|
*p != '\0')
|
||||||
|
return FALSE;
|
||||||
|
else
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -56,12 +205,12 @@ gnc_utf8_strip_invalid (gchar *str)
|
|||||||
gchar *end;
|
gchar *end;
|
||||||
gint len;
|
gint len;
|
||||||
|
|
||||||
if (g_utf8_validate(str, -1, (const gchar **)&end))
|
if (gnc_utf8_validate(str, -1, (const gchar **)&end))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
g_warning("Invalid utf8 string: %s", str);
|
g_warning("Invalid utf8 string: %s", str);
|
||||||
do {
|
do {
|
||||||
len = strlen(end);
|
len = strlen(end);
|
||||||
memmove(end, end+1, len); /* shuffle the remainder one byte */
|
memmove(end, end+1, len); /* shuffle the remainder one byte */
|
||||||
} while (!g_utf8_validate(str, -1, (const gchar **)&end));
|
} while (!gnc_utf8_validate(str, -1, (const gchar **)&end));
|
||||||
}
|
}
|
||||||
|
@ -56,19 +56,12 @@
|
|||||||
int safe_utf8_collate (const char *str1, const char *str2);
|
int safe_utf8_collate (const char *str1, const char *str2);
|
||||||
|
|
||||||
|
|
||||||
/** This is a helper function for guile. C code should call
|
|
||||||
* g_utf8_validate directly.
|
|
||||||
*
|
|
||||||
* @param str The string to be validated.
|
|
||||||
*
|
|
||||||
* @return TRUE if this string is valid utf8. */
|
|
||||||
gboolean gnc_utf8_validate (const gchar *str);
|
|
||||||
|
|
||||||
|
|
||||||
/** Strip any non-utf8 characters from a string. This function
|
/** Strip any non-utf8 characters from a string. This function
|
||||||
* rewrites the string "in place" instead of allocating and returning
|
* rewrites the string "in place" instead of allocating and returning
|
||||||
* a new string. This allows it to operat on strings that are
|
* a new string. This allows it to operate on strings that are
|
||||||
* defined as character arrays in a larger data structure.
|
* defined as character arrays in a larger data structure. Note that
|
||||||
|
* it also removes some subset of invalid XML characters, too.
|
||||||
|
* See http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
|
||||||
*
|
*
|
||||||
* @param str A pointer to the string to strip of invalid
|
* @param str A pointer to the string to strip of invalid
|
||||||
* characters. */
|
* characters. */
|
||||||
|
@ -53,14 +53,6 @@
|
|||||||
'(((<gw:mchars> caller-owned) program))
|
'(((<gw:mchars> caller-owned) program))
|
||||||
"Get a boolean value from gconf.")
|
"Get a boolean value from gconf.")
|
||||||
|
|
||||||
(gw:wrap-function
|
|
||||||
ws
|
|
||||||
'gnc:utf8-validate
|
|
||||||
'<gw:bool>
|
|
||||||
"gnc_utf8_validate"
|
|
||||||
'(((<gw:mchars> caller-owned) program))
|
|
||||||
"Validate UTF8 encoded text.")
|
|
||||||
|
|
||||||
(gw:wrap-function
|
(gw:wrap-function
|
||||||
ws
|
ws
|
||||||
'gnc:utf8-strip-invalid
|
'gnc:utf8-strip-invalid
|
||||||
|
Loading…
Reference in New Issue
Block a user