vim-patch:8.2.1933: cannot sort using locale ordering

Problem: Cannot sort using locale ordering. Solution: Add a flag for :sort and sort() to use the locale. (Dominique Pellé, closes vim/vim#7237) 55e29611d2
2025-02-25 18:55:25 -06:00 · 2021-04-19 19:57:19 +01:00
parent 1d72b6e4cd
commit 6a0b8cbd81
5 changed files with 109 additions and 11 deletions
--- a/runtime/doc/change.txt
+++ b/runtime/doc/change.txt
@@ -1742,7 +1742,7 @@ Vim has a sorting function and a sorting command.  The sorting function can be
 found here: |sort()|, |uniq()|.

 							*:sor* *:sort*
-:[range]sor[t][!] [b][f][i][n][o][r][u][x] [/{pattern}/]
+:[range]sor[t][!] [b][f][i][l][n][o][r][u][x] [/{pattern}/]
 			Sort lines in [range].  When no range is given all
 			lines are sorted.

@@ -1750,6 +1750,14 @@ found here: |sort()|, |uniq()|.

 			With [i] case is ignored.

+			With [l] sort uses the current collation locale.  See
+			`:language collate` to check or set the locale used
+			for ordering.  For example, with "en_US.UTF8",
+			Ö will be ordered after O and before P,
+			whereas with the Swedish locale "sv_SE.UTF8",
+			it will be after Z.
+			Case is typically ignored by the locale.
+
 			Options [n][f][x][o][b] are mutually exclusive.

 			With [n] sorting is done on the first decimal number
@@ -1816,8 +1824,7 @@ found here: |sort()|, |uniq()|.
 Note that using `:sort` with `:global` doesn't sort the matching lines, it's
 quite useless.

-The details about sorting depend on the library function used.  There is no
-guarantee that sorting obeys the current locale.  You will have to try it out.
+`:sort` does not use the current locale unless the l flag is used.
 Vim does do a "stable" sort.

 The sorting can be interrupted, but if you interrupt it too late in the
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -8359,6 +8359,13 @@ sort({list} [, {func} [, {dict}]])			*sort()* *E702*
 		When {func} is given and it is '1' or 'i' then case is
 		ignored.

+		When {func} is given and it is 'l' then the current collation
+		locale is used for ordering.  See `:language collate` to check
+		or set the locale used for ordering.  For example, with
+		"en_US.UTF8", Ö will be ordered after O and before P, whereas
+		with the Swedish locale "sv_SE.UTF8", it will be after Z.
+		Case is typically ignored by the locale.
+
 		When {func} is given and it is 'n' then all items will be
 		sorted numerical (Implementation detail: This uses the
 		strtod() function to parse numbers, Strings, Lists, Dicts and
--- a/src/nvim/eval/funcs.c
+++ b/src/nvim/eval/funcs.c
@@ -9166,6 +9166,7 @@ static void f_sockconnect(typval_T *argvars, typval_T *rettv, FunPtr fptr)
 /// struct storing information about current sort
 typedef struct {
  int item_compare_ic;
+  bool item_compare_lc;
  bool item_compare_numeric;
  bool item_compare_numbers;
  bool item_compare_float;
@@ -9240,10 +9241,10 @@ static int item_compare(const void *s1, const void *s2, bool keep_zero)
    p2 = "";
  }
  if (!sortinfo->item_compare_numeric) {
-    if (sortinfo->item_compare_ic) {
-      res = STRICMP(p1, p2);
+    if (sortinfo->item_compare_lc) {
+      res = strcoll(p1, p2);
    } else {
-      res = STRCMP(p1, p2);
+      res = sortinfo->item_compare_ic ? STRICMP(p1, p2): STRCMP(p1, p2);
    }
  } else {
    double n1, n2;
@@ -9378,6 +9379,7 @@ static void do_sort_uniq(typval_T *argvars, typval_T *rettv, bool sort)
    }

    info.item_compare_ic = false;
+    info.item_compare_lc = false;
    info.item_compare_numeric = false;
    info.item_compare_numbers = false;
    info.item_compare_float = false;
@@ -9422,6 +9424,9 @@ static void do_sort_uniq(typval_T *argvars, typval_T *rettv, bool sort)
          } else if (strcmp(info.item_compare_func, "i") == 0) {
            info.item_compare_func = NULL;
            info.item_compare_ic = true;
+          } else if (strcmp(info.item_compare_func, "l") == 0) {
+            info.item_compare_func = NULL;
+            info.item_compare_lc = true;
          }
        }
      }
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@@ -358,6 +358,7 @@ static int linelen(int *has_tab)
 static char_u   *sortbuf1;
 static char_u   *sortbuf2;

+static int sort_lc;       ///< sort using locale
 static int sort_ic;       ///< ignore case
 static int sort_nr;       ///< sort on number
 static int sort_rx;       ///< sort on regex instead of skipping it
@@ -381,6 +382,13 @@ typedef struct {
  } st_u;
 } sorti_T;

+static int string_compare(const void *s1, const void *s2) FUNC_ATTR_NONNULL_ALL
+{  // Compare two strings according to the file-level sort_lc / sort_ic flags.
+  if (sort_lc) {  // the locale flag takes precedence over sort_ic
+    return strcoll((char *)s1, (char *)s2);  // ordering follows the LC_COLLATE locale
+  }
+  return sort_ic ? STRICMP(s1, s2) : STRCMP(s1, s2);  // STRICMP only when sort_ic is set
+}

 static int sort_compare(const void *s1, const void *s2)
 {
@@ -424,8 +432,7 @@ static int sort_compare(const void *s1, const void *s2)
           l2.st_u.line.end_col_nr - l2.st_u.line.start_col_nr + 1);
    sortbuf2[l2.st_u.line.end_col_nr - l2.st_u.line.start_col_nr] = NUL;

-    result = sort_ic ? STRICMP(sortbuf1, sortbuf2)
-             : STRCMP(sortbuf1, sortbuf2);
+    result = string_compare(sortbuf1, sortbuf2);
  }

  /* If two lines have the same value, preserve the original line order. */
@@ -466,7 +473,7 @@ void ex_sort(exarg_T *eap)
  regmatch.regprog = NULL;
  sorti_T *nrs = xmalloc(count * sizeof(sorti_T));

-  sort_abort = sort_ic = sort_rx = sort_nr = sort_flt = 0;
+  sort_abort = sort_ic = sort_lc = sort_rx = sort_nr = sort_flt = 0;
  size_t format_found = 0;
  bool change_occurred = false;   // Buffer contents changed.

@@ -474,6 +481,8 @@ void ex_sort(exarg_T *eap)
    if (ascii_iswhite(*p)) {
    } else if (*p == 'i') {
      sort_ic = true;
+    } else if (*p == 'l') {
+      sort_lc = true;
    } else if (*p == 'r') {
      sort_rx = true;
    } else if (*p == 'n') {
@@ -645,8 +654,7 @@ void ex_sort(exarg_T *eap)
    s = ml_get(get_lnum);
    size_t bytelen = STRLEN(s) + 1;  // include EOL in bytelen
    old_count += bytelen;
-    if (!unique || i == 0
-        || (sort_ic ? STRICMP(s, sortbuf1) : STRCMP(s, sortbuf1)) != 0) {
+    if (!unique || i == 0 || string_compare(s, sortbuf1) != 0) {
      // Copy the line into a buffer, it may become invalid in
      // ml_append(). And it's needed for "unique".
      STRCPY(sortbuf1, s);
--- a/src/nvim/testdir/test_sort.vim
+++ b/src/nvim/testdir/test_sort.vim
@@ -13,6 +13,25 @@ func Test_sort_strings()
  " numbers compared as strings
  call assert_equal([1, 2, 3], sort([3, 2, 1]))
  call assert_equal([13, 28, 3], sort([3, 28, 13]))
+
+  call assert_equal(['A', 'O', 'P', 'a', 'o', 'p', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ'],
+  \            sort(['A', 'O', 'P', 'a', 'o', 'p', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ']))
+
+  call assert_equal(['A', 'a', 'o', 'O', 'p', 'P', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ'],
+  \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'i'))
+
+  let lc = execute('language collate')
+  " With the following locales, the accented letters are ordered
+  " similarly to the non-accented letters...
+  if lc =~? '"\(en\|es\|de\|fr\|it\|nl\).*\.utf-\?8"'
+    call assert_equal(['a', 'A', 'ä', 'Ä', 'o', 'O', 'ô', 'Ô', 'œ', 'œ', 'p', 'P'],
+    \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'l'))
+  " ... whereas with a Swedish locale, the accented letters are ordered
+  " after Z.
+  elseif lc =~? '"sv.*utf-\?8"'
+    call assert_equal(['a', 'A', 'o', 'O', 'p', 'P', 'ä', 'Ä', 'œ', 'œ', 'ô', 'Ô'],
+    \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'l'))
+  endif
 endfunc

 func Test_sort_numeric()
@@ -1223,6 +1242,58 @@ func Test_sort_cmd()
 	\ },
 	\ ]

+    " With the following locales, the accented letters are ordered
+    " similarly to the non-accented letters...
+    let lc = execute('language collate')
+    if lc =~? '"\(en\|es\|de\|fr\|it\|nl\).*\.utf-\?8"'
+      let tests += [
+	\ {
+	\    'name' : 'sort with locale',
+	\    'cmd' : '%sort l',
+	\    'input' : [
+	\	'A',
+	\	'E',
+	\	'O',
+	\	'À',
+	\	'È',
+	\	'É',
+	\	'Ô',
+	\	'Œ',
+	\	'Z',
+	\	'a',
+	\	'e',
+	\	'o',
+	\	'à',
+	\	'è',
+	\	'é',
+	\	'ô',
+	\	'œ',
+	\	'z'
+	\    ],
+	\    'expected' : [
+	\	'a',
+	\	'A',
+	\	'à',
+	\	'À',
+	\	'e',
+	\	'E',
+	\	'é',
+	\	'É',
+	\	'è',
+	\	'È',
+	\	'o',
+	\	'O',
+	\	'ô',
+	\	'Ô',
+	\	'œ',
+	\	'Œ',
+	\	'z',
+	\	'Z'
+	\    ]
+	\ },
+	\ ]
+  endif
+
  for t in tests
    enew!
    call append(0, t.input)