mbyte: replace vim_tolower with mb_tolower handling locale correctly

2025-02-25 18:55:25 -06:00 · 2017-04-08 16:45:38 +02:00
parent 3b88e37b83
commit db9ef6263e
16 changed files with 79 additions and 135 deletions
--- a/src/nvim/charset.c
+++ b/src/nvim/charset.c
@@ -212,8 +212,8 @@ int buf_init_chartab(buf_T *buf, int global)
        // work properly when 'encoding' is "latin1" and the locale is
        // "C".
        if (!do_isalpha
-            || vim_islower(c)
-            || vim_isupper(c)
+            || mb_islower(c)
+            || mb_isupper(c)
            || (p_altkeymap && (F_isalpha(c) || F_isdigit(c)))) {
          if (i == 0) {
            // (re)set ID flag
@@ -417,11 +417,11 @@ char_u* str_foldcase(char_u *str, int orglen, char_u *buf, int buflen)
  while (STR_CHAR(i) != NUL) {
    int c = utf_ptr2char(STR_PTR(i));
    int olen = utf_ptr2len(STR_PTR(i));
-    int lc = utf_tolower(c);
+    int lc = mb_tolower(c);

    // Only replace the character when it is not an invalid
    // sequence (ASCII character or more than one byte) and
-    // utf_tolower() doesn't return the original character.
+    // mb_tolower() doesn't return the original character.
    if (((c < 0x80) || (olen > 1)) && (c != lc)) {
      int nlen = utf_char2len(lc);

@@ -1506,67 +1506,6 @@ char_u* skiptohex(char_u *q)
  return p;
 }

-// Vim's own character class functions.  These exist because many library
-// islower()/toupper() etc. do not work properly: they crash when used with
-// invalid values or can't handle latin1 when the locale is C.
-// Speed is most important here.
-
-/// Check that the character is lower-case
-///
-/// @param  c  character to check
-bool vim_islower(int c)
-  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
-{
-  if (c <= '@') {
-    return false;
-  }
-
-  if (c >= 0x80) {
-    return utf_islower(c);
-  }
-  return islower(c);
-}
-
-/// Check that the character is upper-case
-///
-/// @param  c  character to check
-bool vim_isupper(int c)
-  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
-{
-  if (c <= '@') {
-    return false;
-  }
-
-  if (c >= 0x80) {
-      return utf_isupper(c);
-  }
-  return isupper(c);
-}
-
-int vim_toupper(int c)
-{
-  if (c <= '@') {
-    return c;
-  }
-
-  if (c >= 0x80) {
-    return utf_toupper(c);
-  }
-  return TOUPPER_LOC(c);
-}
-
-int vim_tolower(int c)
-{
-  if (c <= '@') {
-    return c;
-  }
-
-  if (c >= 0x80) {
-    return utf_tolower(c);
-  }
-  return TOLOWER_LOC(c);
-}
-
 /// Skip over text until ' ' or '\t' or NUL
 ///
 /// @param[in]  p  Text to skip over.
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@@ -2037,12 +2037,12 @@ int ins_compl_add_infercase(char_u *str, int len, int icase, char_u *fname, int
        } else {
          c = *(p++);
        }
-        if (vim_islower(c)) {
+        if (mb_islower(c)) {
          has_lower = true;
-          if (vim_isupper(wca[i])) {
+          if (mb_isupper(wca[i])) {
            // Rule 1 is satisfied.
            for (i = actual_compl_length; i < actual_len; i++) {
-              wca[i] = vim_tolower(wca[i]);
+              wca[i] = mb_tolower(wca[i]);
            }
            break;
          }
@@ -2062,14 +2062,14 @@ int ins_compl_add_infercase(char_u *str, int len, int icase, char_u *fname, int
        } else {
          c = *(p++);
        }
-        if (was_letter && vim_isupper(c) && vim_islower(wca[i])) {
+        if (was_letter && mb_isupper(c) && mb_islower(wca[i])) {
          // Rule 2 is satisfied.
          for (i = actual_compl_length; i < actual_len; i++) {
-            wca[i] = vim_toupper(wca[i]);
+            wca[i] = mb_toupper(wca[i]);
          }
          break;
        }
-        was_letter = vim_islower(c) || vim_isupper(c);
+        was_letter = mb_islower(c) || mb_isupper(c);
      }
    }

@@ -2082,10 +2082,10 @@ int ins_compl_add_infercase(char_u *str, int len, int icase, char_u *fname, int
        } else {
          c = *(p++);
        }
-        if (vim_islower(c)) {
-          wca[i] = vim_tolower(wca[i]);
-        } else if (vim_isupper(c)) {
-          wca[i] = vim_toupper(wca[i]);
+        if (mb_islower(c)) {
+          wca[i] = mb_tolower(wca[i]);
+        } else if (mb_isupper(c)) {
+          wca[i] = mb_toupper(wca[i]);
        }
      }
    }
@@ -2302,7 +2302,7 @@ static void ins_compl_longest_match(compl_T *match)
        c1 = *p;
        c2 = *s;
      }
-      if (match->cp_icase ? (vim_tolower(c1) != vim_tolower(c2))
+      if (match->cp_icase ? (mb_tolower(c1) != mb_tolower(c2))
          : (c1 != c2))
        break;
      if (has_mbyte) {
--- a/src/nvim/eval.c
+++ b/src/nvim/eval.c
@@ -16802,7 +16802,7 @@ static void f_tolower(typval_T *argvars, typval_T *rettv, FunPtr fptr)
      int c, lc;

      c = utf_ptr2char(p);
-      lc = utf_tolower(c);
+      lc = mb_tolower(c);
      l = utf_ptr2len(p);
      /* TODO: reallocate string when byte count changes. */
      if (utf_char2len(lc) == l)
--- a/src/nvim/ex_getln.c
+++ b/src/nvim/ex_getln.c
@@ -1231,7 +1231,7 @@ static int command_line_handle_key(CommandLineState *s)
        // command line has no uppercase characters, convert
        // the character to lowercase
        if (p_ic && p_scs && !pat_has_uppercase(ccline.cmdbuff)) {
-          s->c = vim_tolower(s->c);
+          s->c = mb_tolower(s->c);
        }

        if (s->c != NUL) {
@@ -3018,7 +3018,7 @@ ExpandOne (
                      || xp->xp_context == EXPAND_FILES
                      || xp->xp_context == EXPAND_SHELLCMD
                      || xp->xp_context == EXPAND_BUFFERS)) {
-          if (vim_tolower(c0) != vim_tolower(ci)) {
+          if (mb_tolower(c0) != mb_tolower(ci)) {
            break;
          }
        } else if (c0 != ci) {
--- a/src/nvim/file_search.c
+++ b/src/nvim/file_search.c
@@ -1057,7 +1057,7 @@ static bool ff_wc_equal(char_u *s1, char_u *s2)
    c1 = PTR2CHAR(s1 + i);
    c2 = PTR2CHAR(s2 + j);

-    if ((p_fic ? vim_tolower(c1) != vim_tolower(c2) : c1 != c2)
+    if ((p_fic ? mb_tolower(c1) != mb_tolower(c2) : c1 != c2)
        && (prev1 != '*' || prev2 != '*')) {
      return false;
    }
--- a/src/nvim/macros.h
+++ b/src/nvim/macros.h
@@ -62,7 +62,7 @@
 * toupper() and tolower() that use the current locale.
 * Careful: Only call TOUPPER_LOC() and TOLOWER_LOC() with a character in the
 * range 0 - 255.  toupper()/tolower() on some systems can't handle others.
- * Note: It is often better to use vim_tolower() and vim_toupper(), because many
+ * Note: It is often better to use mb_tolower() and mb_toupper(), because many
 * toupper() and tolower() implementations only work for ASCII.
 */
 #define TOUPPER_LOC toupper
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1174,11 +1174,16 @@ int utf_fold(int a)
  return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
 }

+// Vim's own character class functions.  These exist because many library
+// islower()/toupper() etc. do not work properly: they crash when used with
+// invalid values or can't handle latin1 when the locale is C.
+// Speed is most important here.
+
 /*
 * Return the upper-case equivalent of "a", which is a UCS-4 character.  Use
 * simple case folding.
 */
-int utf_toupper(int a)
+int mb_toupper(int a)
 {
  /* If 'casemap' contains "keepascii" use ASCII style toupper(). */
  if (a < 128 && (cmp_flags & CMP_KEEPASCII))
@@ -1198,17 +1203,17 @@ int utf_toupper(int a)
  return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
 }

-bool utf_islower(int a)
+bool mb_islower(int a)
 {
  /* German sharp s is lower case but has no upper case equivalent. */
-  return (utf_toupper(a) != a) || a == 0xdf;
+  return (mb_toupper(a) != a) || a == 0xdf;
 }

 /*
 * Return the lower-case equivalent of "a", which is a UCS-4 character.  Use
 * simple case folding.
 */
-int utf_tolower(int a)
+int mb_tolower(int a)
 {
  /* If 'casemap' contains "keepascii" use ASCII style tolower(). */
  if (a < 128 && (cmp_flags & CMP_KEEPASCII))
@@ -1228,9 +1233,9 @@ int utf_tolower(int a)
  return utf_convert(a, toLower, ARRAY_SIZE(toLower));
 }

-bool utf_isupper(int a)
+bool mb_isupper(int a)
 {
-  return utf_tolower(a) != a;
+  return mb_tolower(a) != a;
 }

 static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@@ -2731,7 +2731,7 @@ do_dialog (
      }

      /* Make the character lowercase, as chars in "hotkeys" are. */
-      c = vim_tolower(c);
+      c = mb_tolower(c);
      retval = 1;
      for (i = 0; hotkeys[i]; ++i) {
        if (has_mbyte) {
@@ -2777,7 +2777,7 @@ copy_char (

  if (has_mbyte) {
    if (lowercase) {
-      c = vim_tolower((*mb_ptr2char)(from));
+      c = mb_tolower((*mb_ptr2char)(from));
      return (*mb_char2bytes)(c, to);
    } else {
      len = (*mb_ptr2len)(from);
--- a/src/nvim/ops.c
+++ b/src/nvim/ops.c
@@ -1956,16 +1956,16 @@ int swapchar(int op_type, pos_T *pos)
  if (enc_dbcs != 0 && c >= 0x100)      /* No lower/uppercase letter */
    return FALSE;
  nc = c;
-  if (vim_islower(c)) {
+  if (mb_islower(c)) {
    if (op_type == OP_ROT13)
      nc = ROT13(c, 'a');
    else if (op_type != OP_LOWER)
-      nc = vim_toupper(c);
-  } else if (vim_isupper(c)) {
+      nc = mb_toupper(c);
+  } else if (mb_isupper(c)) {
    if (op_type == OP_ROT13)
      nc = ROT13(c, 'A');
    else if (op_type != OP_UPPER)
-      nc = vim_tolower(c);
+      nc = mb_tolower(c);
  }
  if (nc != c) {
    if (enc_utf8 && (c >= 0x80 || nc >= 0x80)) {
@@ -3327,7 +3327,7 @@ void ex_display(exarg_T *eap)

    get_clipboard(name, &yb, true);

-    if (name == vim_tolower(redir_reg)
+    if (name == mb_tolower(redir_reg)
        || (redir_reg == '"' && yb == y_previous))
      continue;             /* do not list register being written to, the
                             * pointer can be freed */
--- a/src/nvim/path.c
+++ b/src/nvim/path.c
@@ -1853,7 +1853,7 @@ int pathcmp(const char *p, const char *q, int maxlen)
      break;
    }

-    if ((p_fic ? vim_toupper(c1) != vim_toupper(c2) : c1 != c2)
+    if ((p_fic ? mb_toupper(c1) != mb_toupper(c2) : c1 != c2)
 #ifdef BACKSLASH_IN_FILENAME
        /* consider '/' and '\\' to be equal */
        && !((c1 == '/' && c2 == '\\')
@@ -1864,7 +1864,7 @@ int pathcmp(const char *p, const char *q, int maxlen)
        return -1;
      if (vim_ispathsep(c2))
        return 1;
-      return p_fic ? vim_toupper(c1) - vim_toupper(c2)
+      return p_fic ? mb_toupper(c1) - mb_toupper(c2)
             : c1 - c2;         /* no match */
    }

--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -2350,7 +2350,7 @@ collection:
              break;
            case CLASS_LOWER:
              for (cu = 1; cu <= 255; cu++) {
-                if (vim_islower(cu) && cu != 170 && cu != 186) {
+                if (mb_islower(cu) && cu != 170 && cu != 186) {
                  regmbc(cu);
                }
              }
@@ -2376,7 +2376,7 @@ collection:
              break;
            case CLASS_UPPER:
              for (cu = 1; cu <= 255; cu++) {
-                if (vim_isupper(cu)) {
+                if (mb_isupper(cu)) {
                  regmbc(cu);
                }
              }
@@ -3474,7 +3474,7 @@ static long bt_regexec_both(char_u *line,
        || (ireg_ic
            && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
                || (c < 255 && prog->regstart < 255
-                    && vim_tolower(prog->regstart) == vim_tolower(c))))) {
+                    && mb_tolower(prog->regstart) == mb_tolower(c))))) {
      retval = regtry(prog, col);
    } else {
      retval = 0;
@@ -4155,7 +4155,7 @@ regmatch (
          if (*opnd != *reginput
              && (!ireg_ic
                  || (!enc_utf8
-                      && vim_tolower(*opnd) != vim_tolower(*reginput)))) {
+                      && mb_tolower(*opnd) != mb_tolower(*reginput)))) {
            status = RA_NOMATCH;
          } else if (*opnd == NUL) {
            // match empty string always works; happens when "~" is
@@ -4573,10 +4573,10 @@ regmatch (
          if (OP(next) == EXACTLY) {
            rst.nextb = *OPERAND(next);
            if (ireg_ic) {
-              if (vim_isupper(rst.nextb))
-                rst.nextb_ic = vim_tolower(rst.nextb);
+              if (mb_isupper(rst.nextb))
+                rst.nextb_ic = mb_tolower(rst.nextb);
              else
-                rst.nextb_ic = vim_toupper(rst.nextb);
+                rst.nextb_ic = mb_toupper(rst.nextb);
            } else
              rst.nextb_ic = rst.nextb;
          } else {
@@ -5339,8 +5339,8 @@ do_class:
     * would have been used for it.  It does handle single-byte
     * characters, such as latin1. */
    if (ireg_ic) {
-      cu = vim_toupper(*opnd);
-      cl = vim_tolower(*opnd);
+      cu = mb_toupper(*opnd);
+      cl = mb_tolower(*opnd);
      while (count < maxcount && (*scan == cu || *scan == cl)) {
        count++;
        scan++;
@@ -6314,10 +6314,10 @@ static char_u *cstrchr(char_u *s, int c)
   * For UTF-8 need to use folded case. */
  if (enc_utf8 && c > 0x80)
    cc = utf_fold(c);
-  else if (vim_isupper(c))
-    cc = vim_tolower(c);
-  else if (vim_islower(c))
-    cc = vim_toupper(c);
+  else if (mb_isupper(c))
+    cc = mb_tolower(c);
+  else if (mb_islower(c))
+    cc = mb_toupper(c);
  else
    return vim_strchr(s, c);

@@ -6348,28 +6348,28 @@ static char_u *cstrchr(char_u *s, int c)

 static fptr_T do_upper(int *d, int c)
 {
-  *d = vim_toupper(c);
+  *d = mb_toupper(c);

  return (fptr_T)NULL;
 }

 static fptr_T do_Upper(int *d, int c)
 {
-  *d = vim_toupper(c);
+  *d = mb_toupper(c);

  return (fptr_T)do_Upper;
 }

 static fptr_T do_lower(int *d, int c)
 {
-  *d = vim_tolower(c);
+  *d = mb_tolower(c);

  return (fptr_T)NULL;
 }

 static fptr_T do_Lower(int *d, int c)
 {
-  *d = vim_tolower(c);
+  *d = mb_tolower(c);

  return (fptr_T)do_Lower;
 }
--- a/src/nvim/regexp_nfa.c
+++ b/src/nvim/regexp_nfa.c
@@ -4373,7 +4373,7 @@ static int check_char_class(int class, int c)
      return OK;
    break;
  case NFA_CLASS_LOWER:
-    if (vim_islower(c) && c != 170 && c != 186) {
+    if (mb_islower(c) && c != 170 && c != 186) {
      return OK;
    }
    break;
@@ -4391,7 +4391,7 @@ static int check_char_class(int class, int c)
      return OK;
    break;
  case NFA_CLASS_UPPER:
-    if (vim_isupper(c))
+    if (mb_isupper(c))
      return OK;
    break;
  case NFA_CLASS_XDIGIT:
@@ -4892,7 +4892,7 @@ static long find_match_text(colnr_T startcol, int regstart, char_u *match_text)
      int c2_len = PTR2LEN(s2);
      int c2 = PTR2CHAR(s2);

-      if ((c1 != c2 && (!ireg_ic || vim_tolower(c1) != vim_tolower(c2)))
+      if ((c1 != c2 && (!ireg_ic || mb_tolower(c1) != mb_tolower(c2)))
          || c1_len != c2_len) {
        match = false;
        break;
@@ -5585,11 +5585,11 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start,
              break;
            }
            if (ireg_ic) {
-              int curc_low = vim_tolower(curc);
+              int curc_low = mb_tolower(curc);
              int done = FALSE;

              for (; c1 <= c2; ++c1)
-                if (vim_tolower(c1) == curc_low) {
+                if (mb_tolower(c1) == curc_low) {
                  result = result_if_matched;
                  done = TRUE;
                  break;
@@ -5599,8 +5599,8 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start,
            }
          } else if (state->c < 0 ? check_char_class(state->c, curc)
                     : (curc == state->c
-                        || (ireg_ic && vim_tolower(curc)
-                            == vim_tolower(state->c)))) {
+                        || (ireg_ic && mb_tolower(curc)
+                            == mb_tolower(state->c)))) {
            result = result_if_matched;
            break;
          }
@@ -6004,7 +6004,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start,
        result = (c == curc);

        if (!result && ireg_ic)
-          result = vim_tolower(c) == vim_tolower(curc);
+          result = mb_tolower(c) == mb_tolower(curc);

        // If ireg_icombine is not set only skip over the character
        // itself.  When it is set skip over composing characters.
@@ -6152,8 +6152,8 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start,
            // Checking if the required start character matches is
            // cheaper than adding a state that won't match.
            c = PTR2CHAR(reginput + clen);
-            if (c != prog->regstart && (!ireg_ic || vim_tolower(c)
-                                        != vim_tolower(prog->regstart))) {
+            if (c != prog->regstart && (!ireg_ic || mb_tolower(c)
+                                        != mb_tolower(prog->regstart))) {
 #ifdef REGEXP_DEBUG
              fprintf(log_fd,
                  "  Skipping start state, regstart does not match\n");
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@@ -336,7 +336,7 @@ int pat_has_uppercase(char_u *pat)
    int l;

    if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1) {
-      if (enc_utf8 && utf_isupper(utf_ptr2char(p)))
+      if (enc_utf8 && mb_isupper(utf_ptr2char(p)))
        return TRUE;
      p += l;
    } else if (*p == '\\') {
@@ -348,7 +348,7 @@ int pat_has_uppercase(char_u *pat)
        p += 2;
      else
        p += 1;
-    } else if (vim_isupper(*p))
+    } else if (mb_isupper(*p))
      return TRUE;
    else
      ++p;
--- a/src/nvim/spell.c
+++ b/src/nvim/spell.c
@@ -2545,10 +2545,10 @@ void init_spell_chartab(void)
  } else if (enc_utf8)   {
    for (i = 128; i < 256; ++i) {
      int f = utf_fold(i);
-      int u = utf_toupper(i);
+      int u = mb_toupper(i);

-      spelltab.st_isu[i] = utf_isupper(i);
-      spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i);
+      spelltab.st_isu[i] = mb_isupper(i);
+      spelltab.st_isw[i] = spelltab.st_isu[i] || mb_islower(i);
      // The folded/upper-cased value is different between latin1 and
      // utf8 for 0xb5, causing E763 for no good reason.  Use the latin1
      // value for utf-8 to avoid this.
@@ -2558,13 +2558,13 @@ void init_spell_chartab(void)
  } else {
    // Rough guess: use locale-dependent library functions.
    for (i = 128; i < 256; ++i) {
-      if (vim_isupper(i)) {
+      if (mb_isupper(i)) {
        spelltab.st_isw[i] = true;
        spelltab.st_isu[i] = true;
-        spelltab.st_fold[i] = vim_tolower(i);
-      } else if (vim_islower(i))   {
+        spelltab.st_fold[i] = mb_tolower(i);
+      } else if (mb_islower(i))   {
        spelltab.st_isw[i] = true;
-        spelltab.st_upper[i] = vim_toupper(i);
+        spelltab.st_upper[i] = mb_toupper(i);
      }
    }
  }
--- a/src/nvim/spell_defs.h
+++ b/src/nvim/spell_defs.h
@@ -265,11 +265,11 @@ typedef struct trystate_S {
                         : (c) < \
                         256 ? (int)spelltab.st_fold[c] : (int)towlower(c))

-#define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
+#define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? mb_toupper(c) \
                          : (c) < \
                          256 ? (int)spelltab.st_upper[c] : (int)towupper(c))

-#define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
+#define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? mb_isupper(c) \
                          : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))

 // First language that is loaded, start of the linked list of loaded
--- a/src/nvim/strings.c
+++ b/src/nvim/strings.c
@@ -309,7 +309,7 @@ char *strup_save(const char *const orig)

    if (enc_utf8) {
      int c = utf_ptr2char((const char_u *)p);
-      int uc = utf_toupper(c);
+      int uc = mb_toupper(c);

      // Reallocate string when byte count changes.  This is rare,
      // thus it's OK to do another malloc()/free().