Consider certain control character as invalid UTF-8. Fixes #346535.

* src/core-utils/gnc-glib-utils.h * src/core-utils/gw-core-utils-spec.scm: Remove the gnc_utf8_validate() API. It's not used anywhere. * src/core-utils/gnc-glib-utils.c: Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate but ignore certain control characters between 0x00 and 0x20 that are not valid XML characters. Fixes #346535. git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@14466 57a11ea4-9604-0410-9ed3-97b8803252fd
2025-02-25 18:55:30 -06:00 · 2006-07-05 16:04:36 +00:00 · 2006-07-05 16:04:36 +00:00 · a50040835c
commit a50040835c
parent c8c5db450f
4 changed files with 168 additions and 24 deletions
--- a/10
+++ b/10
@ -1,3 +1,13 @@
 2006-07-04  Derek Atkins  <derek@ihtfp.com>
 	* src/core-utils/gnc-glib-utils.h
 	* src/core-utils/gw-core-utils-spec.scm:
 	  Remove the gnc_utf8_validate() API. It's not used anywhere.
 	* src/core-utils/gnc-glib-utils.c:
 	  Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate
 	  but ignore certain control characters between 0x00 and 0x20
 	  that are not valid XML characters.  Fixes #346535.
 2006-07-04  Christian Stimming  <stimming@tuhh.de>
 	* po/de.po: Proof-read and improved German translation by feedback
--- a/src/core-utils/gnc-glib-utils.c
+++ b/src/core-utils/gnc-glib-utils.c
@ -44,10 +44,159 @@ safe_utf8_collate (const char * da, const char * db)
  return 0;
 }
-gboolean
+/********************************************************************
-gnc_utf8_validate (const gchar *str)
+ * The following definitions are from gutf8.c, for use by
 * gnc_utf8_validate().  These are all verbatim copies, except for
 * UNICODE_VALID() which has been modified to look for the strict
 * subset of UTF-8 that is valid XML text.
 */
 #define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
    {									      \
      Len = 1;								      \
      Mask = 0x7f;							      \
    }									      \
  else if ((Char & 0xe0) == 0xc0)					      \
    {									      \
      Len = 2;								      \
      Mask = 0x1f;							      \
    }									      \
  else if ((Char & 0xf0) == 0xe0)					      \
    {									      \
      Len = 3;								      \
      Mask = 0x0f;							      \
    }									      \
  else if ((Char & 0xf8) == 0xf0)					      \
    {									      \
      Len = 4;								      \
      Mask = 0x07;							      \
    }									      \
  else if ((Char & 0xfc) == 0xf8)					      \
    {									      \
      Len = 5;								      \
      Mask = 0x03;							      \
    }									      \
  else if ((Char & 0xfe) == 0xfc)					      \
    {									      \
      Len = 6;								      \
      Mask = 0x01;							      \
    }									      \
  else									      \
    Len = -1;
 #define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))
 #define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  (Result) = (Chars)[0] & (Mask);					      \
  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
    {									      \
      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
 	{								      \
 	  (Result) = -1;						      \
 	  break;							      \
 	}								      \
      (Result) <<= 6;							      \
      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
    }
 #define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&			      \
     (((Char) & 0xFFFFF800) != 0xD800) &&	      \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&	      \
     ((Char) > 0x20 || (Char) == 0x09 || (Char) == 0x0A || (Char) == 0x0D) && \
     ((Char) & 0xFFFE) != 0xFFFE)
 /**
 * gnc_utf8_validate (copied from g_utf8_validate):
 * @str: a pointer to character data
 * @max_len: max bytes to validate, or -1 to go until nul
 * @end: return location for end of valid data
 * 
 * Validates UTF-8 encoded text. @str is the text to validate;
 * if @str is nul-terminated, then @max_len can be -1, otherwise
 * @max_len should be the number of bytes to validate.
 * If @end is non-%NULL, then the end of the valid range
 * will be stored there (i.e. the address of the first invalid byte
 * if some bytes were invalid, or the end of the text being validated
 * otherwise).
 *
 * This function looks validates the strict subset of UTF-8 that is
 * valid XML text, as detailed in
 * http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
 *
 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
 * routines <emphasis>require</emphasis> valid UTF-8 as input;
 * so data read from a file or the network should be checked
 * with g_utf8_validate() before doing anything else with it.
 * 
 * Return value: %TRUE if the text was valid UTF-8
 **/
 static gboolean
 gnc_utf8_validate (const gchar  *str,
                 gssize        max_len,    
                 const gchar **end)
 {
-  return g_utf8_validate(str, -1, NULL);
+
  const gchar *p;
  g_return_val_if_fail (str != NULL, FALSE);
  if (end)
    *end = str;
  p = str;
  while ((max_len < 0 || (p - str) < max_len) && *p)
    {
      int i, mask = 0, len;
      gunichar result;
      unsigned char c = (unsigned char) *p;
      UTF8_COMPUTE (c, mask, len);
      if (len == -1)
        break;
      /* check that the expected number of bytes exists in str */
      if (max_len >= 0 &&
          ((max_len - (p - str)) < len))
        break;
      UTF8_GET (result, p, i, mask, len);
      if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
 	break;
      if (result == (gunichar)-1)
        break;
      if (!UNICODE_VALID (result))
 	break;
      p += len;
    }
  if (end)
    *end = p;
  /* See that we covered the entire length if a length was
   * passed in, or that we ended on a nul if not
   */
  if (max_len >= 0 &&
      p != (str + max_len))
    return FALSE;
  else if (max_len < 0 &&
           *p != '\0')
    return FALSE;
  else
    return TRUE;
 }
 void
@ -56,12 +205,12 @@ gnc_utf8_strip_invalid (gchar *str)
  gchar *end;
  gint len;
-  if (g_utf8_validate(str, -1, (const gchar **)&end))
+  if (gnc_utf8_validate(str, -1, (const gchar **)&end))
    return;
  g_warning("Invalid utf8 string: %s", str);
  do {
    len = strlen(end);
    memmove(end, end+1, len);	/* shuffle the remainder one byte */
-  } while (!g_utf8_validate(str, -1, (const gchar **)&end));
+  } while (!gnc_utf8_validate(str, -1, (const gchar **)&end));
 }
--- a/src/core-utils/gnc-glib-utils.h
+++ b/src/core-utils/gnc-glib-utils.h
@ -56,19 +56,12 @@
 int safe_utf8_collate (const char *str1, const char *str2);
 /** This is a helper function for guile. C code should call
 *  g_utf8_validate directly.
 *
 *  @param str The string to be validated.
 *
 *  @return TRUE if this string is valid utf8. */
 gboolean gnc_utf8_validate (const gchar *str);
 /** Strip any non-utf8 characters from a string.  This function
 *  rewrites the string "in place" instead of allocating and returning
- *  a new string.  This allows it to operat on strings that are
+ *  a new string.  This allows it to operate on strings that are
- *  defined as character arrays in a larger data structure.
+ *  defined as character arrays in a larger data structure.  Note that
 *  it also removes some subset of invalid XML characters, too.
 *  See http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
 *
 *  @param str A pointer to the string to strip of invalid
 *  characters. */
--- a/src/core-utils/gw-core-utils-spec.scm
+++ b/src/core-utils/gw-core-utils-spec.scm
@ -53,14 +53,6 @@
   '(((<gw:mchars> caller-owned) program))
   "Get a boolean value from gconf.")
  (gw:wrap-function
   ws
   'gnc:utf8-validate
   '<gw:bool>
   "gnc_utf8_validate"
   '(((<gw:mchars> caller-owned) program))
   "Validate UTF8 encoded text.")
  (gw:wrap-function
   ws
   'gnc:utf8-strip-invalid