Merge pull request #25934 from bfredl/screenlinechar

refactor(grid): make screen rendering more multibyte than ever before
2025-02-25 18:55:25 -06:00 · 2023-11-17 13:38:10 +01:00 · 2023-11-17 13:38:10 +01:00 · 7af89ef464
commit 7af89ef464
parent 20ec4c776a b522cb1ac3
26 changed files with 399 additions and 602 deletions
--- a/runtime/doc/mbyte.txt
+++ b/runtime/doc/mbyte.txt
@ -646,7 +646,8 @@ widespread as file format.
 A composing or combining character is used to change the meaning of the
 character before it.  The combining characters are drawn on top of the
 preceding character.
-Up to six combining characters can be displayed.
+Overly large combined characters cannot be displayed, but they can still be
 inspected using the |g8| and |ga| commands described below.
 When editing text a composing character is mostly considered part of the
 preceding character.  For example "x" will delete a character and its
 following composing characters by default.
--- a/runtime/doc/news.txt
+++ b/runtime/doc/news.txt
@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior.
  Note that syntax highlighting of code examples requires a matching parser
  and may be affected by custom queries.
 • Support for rendering multibyte characters using composing characters has been
  enhanced. The maximum limit has been increased from 1+6 codepoints to
  31 bytes, which is guaranteed to fit all chars from before but often more.
  NOTE: the regexp engine still has a hard-coded limit of considering
  6 composing chars only.
 ==============================================================================
 REMOVED FEATURES                                                 *news-removed*
--- a/runtime/doc/vim_diff.txt
+++ b/runtime/doc/vim_diff.txt
@ -722,9 +722,16 @@ Options:
 <
  *'macatsui'*
  *'maxcombine'* *'mco'*
-    Nvim always displays up to 6 combining characters.  You can still edit
+    Nvim counts maximum character sizes in bytes, not codepoints. This is
-    text with more than 6 combining characters, you just can't see them.
+    guaranteed to be big enough to always fit all chars properly displayed
-    Use |g8| or |ga|.  See |mbyte-combining|.
+    in vim with 'maxcombine' set to 6.
    You can still edit text with characters larger than fit in the screen buffer,
    you just can't see them. Use |g8| or |ga|. See |mbyte-combining|.
    NOTE: the regexp engine still has a hard-coded limit of considering
    6 composing chars only.
  *'maxmem'* Nvim delegates memory-management to the OS.
  *'maxmemtot'* Nvim delegates memory-management to the OS.
  printoptions
--- a/runtime/lua/vim/_meta/options.lua
+++ b/runtime/lua/vim/_meta/options.lua
@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg
 --- security reasons.
 ---
 --- @type boolean
-vim.o.fsync = false
+vim.o.fsync = true
 vim.o.fs = vim.o.fsync
 vim.go.fsync = vim.o.fsync
 vim.go.fs = vim.go.fsync
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len)
 /// convert bytes to a character.
 void ins_char(int c)
 {
-  char buf[MB_MAXBYTES + 1];
+  char buf[MB_MAXCHAR + 1];
  size_t n = (size_t)utf_char2bytes(c, buf);
  // When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte.
@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
  // If 'delcombine' is set and deleting (less than) one character, only
  // delete the last combining character.
-  if (p_deco && use_delcombine
+  if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
-      && utfc_ptr2len(oldp + col) >= count) {
+    char *p0 = oldp + col;
-    int cc[MAX_MCO];
+    if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
    (void)utfc_ptr2char(oldp + col, cc);
    if (cc[0] != NUL) {
      // Find the last composing char, there can be several.
      int n = col;
      do {
--- a/src/nvim/charset.c
+++ b/src/nvim/charset.c
@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab)
  while (*p) {
    const size_t l = (size_t)utfc_ptr2len(p);
    if (l > 1) {
-      int pcc[MAX_MCO + 1];
+      if (vim_isprintc(utf_ptr2char(p))) {
      pcc[0] = utfc_ptr2char(p, &pcc[1]);
      if (vim_isprintc(pcc[0])) {
        len += l;
      } else {
-        for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
+        for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
          int c = utf_ptr2char(p + off);
          char hexbuf[9];
-          len += transchar_hex(hexbuf, pcc[i]);
+          len += transchar_hex(hexbuf, c);
        }
      }
      p += l;
@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co
      if (buf_p + l > buf_e) {
        break;  // Exceeded `buf` size.
      }
      int pcc[MAX_MCO + 1];
      pcc[0] = utfc_ptr2char(p, &pcc[1]);
-      if (vim_isprintc(pcc[0])) {
+      if (vim_isprintc(utf_ptr2char(p))) {
        memmove(buf_p, p, l);
        buf_p += l;
      } else {
-        for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
+        for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
          int c = utf_ptr2char(p + off);
          char hexbuf[9];  // <up to 6 bytes>NUL
-          const size_t hexlen = transchar_hex(hexbuf, pcc[i]);
+          const size_t hexlen = transchar_hex(hexbuf, c);
          if (buf_p + hexlen > buf_e) {
            break;
          }
--- a/src/nvim/digraph.c
+++ b/src/nvim/digraph.c
@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n)
 bool check_digraph_chars_valid(int char1, int char2)
 {
  if (char2 == 0) {
-    char msg[MB_MAXBYTES + 1];
+    char msg[MB_MAXCHAR + 1];
    msg[utf_char2bytes(char1, msg)] = NUL;
    semsg(_(e_digraph_must_be_just_two_characters_str), msg);
    return false;
--- a/src/nvim/drawline.c
+++ b/src/nvim/drawline.c
@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
  const char *p = *pp;
  int cells = utf_ptr2cells(p);
  int c_len = utfc_ptr2len(p);
  int u8c, u8cc[MAX_MCO];
  assert(maxcells > 0);
  if (cells > maxcells) {
    dest[0] = schar_from_ascii(' ');
    return 1;
  }
  u8c = utfc_ptr2char(p, u8cc);
  if (*p == TAB) {
    cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells);
  }
@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
    for (int c = 0; c < cells; c++) {
      dest[c] = schar_from_ascii(' ');
    }
    goto done;
  } else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) {
    dest[0] = schar_from_ascii(*p);
  } else {
-    dest[0] = schar_from_cc(u8c, u8cc);
+    int u8c;
    dest[0] = utfc_ptr2schar(p, &u8c);
    if (cells > 1) {
      dest[1] = 0;
    }
  }
-  if (cells > 1) {
+
    dest[1] = 0;
  }
 done:
  *pp += c_len;
  return cells;
 }
@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t
  }
 }
 static bool check_mb_utf8(int *c, int *u8cc)
 {
  if (utf_char2len(*c) > 1) {
    *u8cc = 0;
    *c = 0xc0;
    return true;
  }
  return false;
 }
 static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line)
 {
  colnr_T trailcol = MAXCOL;
@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
 {
  winlinevars_T wlv;                  // variables passed between functions
  int c = 0;                          // init for GCC
  colnr_T vcol_prev = -1;             // "wlv.vcol" of previous character
  char *line;                         // current line
  char *ptr;                          // current position in "line"
@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
  int multi_attr = 0;                   // attributes desired by multibyte
  int mb_l = 1;                         // multi-byte byte length
  int mb_c = 0;                         // decoded multi-byte character
-  bool mb_utf8 = false;                 // screen char is UTF-8 char
+  schar_T mb_schar;                     // complete screen char
  int u8cc[MAX_MCO];                    // composing UTF-8 chars
  int change_start = MAXCOL;            // first col of changed area
  int change_end = -1;                  // last col of changed area
  bool in_multispace = false;           // in multiple consecutive spaces
@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
    // For the '$' of the 'list' option, n_extra == 1, p_extra == "".
    if (wlv.n_extra > 0) {
      if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) {
-        c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
+        mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
-        mb_c = c;               // doesn't handle non-utf-8 multi-byte!
+        mb_schar = schar_from_char(mb_c);
-        mb_utf8 = check_mb_utf8(&c, u8cc);
+        wlv.n_extra--;
      } else {
        assert(wlv.p_extra != NULL);
        c = (uint8_t)(*wlv.p_extra);
        mb_c = c;
        // If the UTF-8 character is more than one byte:
        // Decode it into "mb_c".
        mb_l = utfc_ptr2len(wlv.p_extra);
-        mb_utf8 = false;
+        mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c);
-        if (mb_l > wlv.n_extra) {
+        // mb_l=0 at the end-of-line NUL
-          mb_l = 1;
+        if (mb_l > wlv.n_extra || mb_l == 0) {
        } else if (mb_l > 1) {
          mb_c = utfc_ptr2char(wlv.p_extra, u8cc);
          mb_utf8 = true;
          c = 0xc0;
        }
        if (mb_l == 0) {          // at the NUL at end-of-line
          mb_l = 1;
        }
        // If a double-width char doesn't fit display a '>' in the last column.
        // Don't advance the pointer but put the character at the start of the next line.
        if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
-          c = '>';
+          mb_c = '>';
          mb_c = c;
          mb_l = 1;
          (void)mb_l;
          mb_schar = schar_from_ascii(mb_c);
          multi_attr = win_hl_attr(wp, HLF_AT);
          if (wlv.cul_attr) {
@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
              ? hl_combine_attr(wlv.cul_attr, multi_attr)
              : hl_combine_attr(multi_attr, wlv.cul_attr);
          }
          // put the pointer back to output the double-width
          // character at the start of the next line.
          wlv.n_extra++;
          wlv.p_extra--;
        } else {
-          wlv.n_extra -= mb_l - 1;
+          wlv.n_extra -= mb_l;
-          wlv.p_extra += mb_l - 1;
+          wlv.p_extra += mb_l;
        }
        wlv.p_extra++;
      }
      wlv.n_extra--;
      // Only restore search_attr and area_attr after "n_extra" in
      // the next screen line is also done.
@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
      }
    } else if (has_fold) {
      // skip writing the buffer line itself
-      c = NUL;
+      mb_c = NUL;
    } else {
      int c0;
      char *prev_ptr = ptr;
-      // Get a character from the line itself.
+      // first byte of next char
-      c0 = c = (uint8_t)(*ptr);
+      int c0 = (uint8_t)(*ptr);
-      mb_c = c;
+      if (c0 == NUL) {
      if (c == NUL) {
        // no more cells to skip
        wlv.skip_cells = 0;
      }
-      // If the UTF-8 character is more than one byte: Decode it
+      // Get a character from the line itself.
      // into "mb_c".
      mb_l = utfc_ptr2len(ptr);
-      mb_utf8 = false;
+      mb_schar = utfc_ptr2schar(ptr, &mb_c);
      if (mb_l > 1) {
        mb_c = utfc_ptr2char(ptr, u8cc);
        // Overlong encoded ASCII or ASCII with composing char
        // is displayed normally, except a NUL.
        if (mb_c < 0x80) {
          c0 = c = mb_c;
        }
        mb_utf8 = true;
-        // At start of the line we can have a composing char.
+      // Overlong encoded ASCII or ASCII with composing char
-        // Draw it as a space with a composing char.
+      // is displayed normally, except a NUL.
-        if (utf_iscomposing(mb_c)) {
+      if (mb_l > 1 && mb_c < 0x80) {
-          for (int i = MAX_MCO - 1; i > 0; i--) {
+        c0 = mb_c;
            u8cc[i] = u8cc[i - 1];
          }
          u8cc[0] = mb_c;
          mb_c = ' ';
        }
      }
-      if ((mb_l == 1 && c >= 0x80)
+      if ((mb_l == 1 && c0 >= 0x80)
          || (mb_l >= 1 && mb_c == 0)
          || (mb_l > 1 && (!vim_isprintc(mb_c)))) {
        // Illegal UTF-8 byte: display as <xx>.
-        // Non-BMP character : display as ? or fullwidth ?.
+        // Non-printable character : display as ? or fullwidth ?.
        transchar_hex(wlv.extra, mb_c);
        if (wp->w_p_rl) {  // reverse
          rl_mirror_ascii(wlv.extra, NULL);
        }
        wlv.p_extra = wlv.extra;
        c = (uint8_t)(*wlv.p_extra);
        mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra);
-        mb_utf8 = (c >= 0x80);
+        mb_schar = schar_from_char(mb_c);
        wlv.n_extra = (int)strlen(wlv.p_extra);
        wlv.c_extra = NUL;
        wlv.c_final = NUL;
@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
      // last column; the character is displayed at the start of the
      // next line.
      if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
-        c = '>';
+        mb_c = '>';
        mb_c = c;
        mb_utf8 = false;
        mb_l = 1;
        mb_schar = schar_from_ascii(mb_c);
        multi_attr = win_hl_attr(wp, HLF_AT);
        // Put pointer back so that the character will be
        // displayed at the start of the next line.
@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
        wlv.n_extra = 1;
        wlv.c_extra = MB_FILLER_CHAR;
        wlv.c_final = NUL;
-        c = ' ';
+        mb_c = ' ';
        mb_l = 1;
        mb_schar = schar_from_ascii(mb_c);
        if (area_attr == 0 && search_attr == 0) {
          wlv.n_attr = wlv.n_extra + 1;
          wlv.extra_attr = win_hl_attr(wp, HLF_AT);
          saved_attr2 = wlv.char_attr;             // save current attr
        }
        mb_c = c;
        mb_utf8 = false;
        mb_l = 1;
      }
      ptr++;
@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          // no concealing past the end of the line, it interferes
          // with line highlighting.
-          if (c == NUL) {
+          syntax_flags = (mb_c == 0) ? 0 : get_syntax_info(&syntax_seqnr);
            syntax_flags = 0;
          } else {
            syntax_flags = get_syntax_info(&syntax_seqnr);
          }
        }
        if (has_decor && v > 0) {
@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          spell_attr = 0;
          // do not calculate cap_col at the end of the line or when
          // only white space is following
-          if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
+          if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
            char *p;
            hlf_T spell_hlf = HLF_COUNT;
            v -= mb_l - 1;
@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
        //
        // So only allow to linebreak, once we have found chars not in
        // 'breakat' in the line.
-        if (wp->w_p_lbr && !wlv.need_lbr && c != NUL
+        if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL
            && !vim_isbreak((uint8_t)(*ptr))) {
          wlv.need_lbr = true;
        }
        // Found last space before word: check for line break.
-        if (wp->w_p_lbr && c0 == c && wlv.need_lbr
+        if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr
-            && vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) {
+            && vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) {
          int mb_off = utf_head_off(line, ptr - 1);
          char *p = ptr - (mb_off + 1);
          chartabsize_T cts;
@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1;
          clear_chartabsize_arg(&cts);
-          if (on_last_col && c != TAB) {
+          if (on_last_col && mb_c != TAB) {
            // Do not continue search/match highlighting over the
            // line break, but for TABs the highlighting should
            // include the complete width of the character
            search_attr = 0;
          }
-          if (c == TAB && wlv.n_extra + wlv.col > grid->cols) {
+          if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) {
            wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts,
                                          wp->w_buffer->b_p_vts_array) - 1;
          }
          wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' ';
          wlv.c_final = NUL;
-          if (ascii_iswhite(c)) {
+          if (mb_c < 128 && ascii_iswhite(mb_c)) {
-            if (c == TAB) {
+            if (mb_c == TAB) {
              // See "Tab alignment" below.
              FIX_FOR_BOGUSCOLS;
            }
            if (!wp->w_p_list) {
-              c = ' ';
+              mb_c = ' ';
              mb_schar = schar_from_ascii(mb_c);
            }
          }
        }
        if (wp->w_p_list) {
-          in_multispace = c == ' ' && (*ptr == ' '
+          in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' '));
                                       || (prev_ptr > line && prev_ptr[-1] == ' '));
          if (!in_multispace) {
            multispace_pos = 0;
          }
@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
        // But not when the character is followed by a composing
        // character (use mb_l to check that).
        if (wp->w_p_list
-            && ((((c == 160 && mb_l == 1)
+            && ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3))
                  || (mb_utf8
                      && ((mb_c == 160 && mb_l == 2)
                          || (mb_c == 0x202f && mb_l == 3))))
                 && wp->w_p_lcs_chars.nbsp)
-                || (c == ' '
+                || (mb_c == ' '
                    && mb_l == 1
                    && (wp->w_p_lcs_chars.space
                        || (in_multispace && wp->w_p_lcs_chars.multispace != NULL))
                    && ptr - line >= leadcol
                    && ptr - line <= trailcol))) {
          if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) {
-            c = wp->w_p_lcs_chars.multispace[multispace_pos++];
+            mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++];
            if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) {
              multispace_pos = 0;
            }
          } else {
-            c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
+            mb_c = (mb_c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
          }
          wlv.n_attr = 1;
          wlv.extra_attr = win_hl_attr(wp, HLF_0);
          saved_attr2 = wlv.char_attr;  // save current attr
-          mb_c = c;
+          mb_schar = schar_from_char(mb_c);
          mb_utf8 = check_mb_utf8(&c, u8cc);
        }
-        if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol)
+        if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol)
-                         || (leadcol != 0 && ptr < line + leadcol))) {
+                                         || (leadcol != 0 && ptr < line + leadcol))) {
          if (leadcol != 0 && in_multispace && ptr < line + leadcol
              && wp->w_p_lcs_chars.leadmultispace != NULL) {
-            c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
+            mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
            if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) {
              multispace_pos = 0;
            }
          } else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) {
-            c = wp->w_p_lcs_chars.trail;
+            mb_c = wp->w_p_lcs_chars.trail;
          } else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) {
-            c = wp->w_p_lcs_chars.lead;
+            mb_c = wp->w_p_lcs_chars.lead;
          } else if (leadcol != 0 && wp->w_p_lcs_chars.space) {
-            c = wp->w_p_lcs_chars.space;
+            mb_c = wp->w_p_lcs_chars.space;
          }
          wlv.n_attr = 1;
          wlv.extra_attr = win_hl_attr(wp, HLF_0);
          saved_attr2 = wlv.char_attr;  // save current attr
-          mb_c = c;
+          mb_schar = schar_from_char(mb_c);
          mb_utf8 = check_mb_utf8(&c, u8cc);
        }
      }
      // Handling of non-printable characters.
-      if (!vim_isprintc(c)) {
+      if (!vim_isprintc(mb_c)) {
        // when getting a character from the file, we may have to
        // turn it into something else on the way to putting it on the screen.
-        if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
+        if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
          int tab_len = 0;
          colnr_T vcol_adjusted = wlv.vcol;  // removed showbreak length
          char *const sbr = get_showbreak_value(wp);
@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
              if (wlv.n_extra > 0) {
                len += wlv.n_extra - tab_len;
              }
-              c = wp->w_p_lcs_chars.tab1;
+              mb_c = wp->w_p_lcs_chars.tab1;
              p = get_extra_buf((size_t)len + 1);
              memset(p, ' ', (size_t)len);
              p[len] = NUL;
@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
            }
          }
          mb_utf8 = false;  // don't draw as UTF-8
          if (wp->w_p_list) {
-            c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
+            mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
-                 ? wp->w_p_lcs_chars.tab3
+                    ? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1;
                 : wp->w_p_lcs_chars.tab1;
            if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) {
              wlv.c_extra = NUL;  // using p_extra from above
            } else {
@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
            wlv.n_attr = tab_len + 1;
            wlv.extra_attr = win_hl_attr(wp, HLF_0);
            saved_attr2 = wlv.char_attr;  // save current attr
            mb_c = c;
            mb_utf8 = check_mb_utf8(&c, u8cc);
          } else {
            wlv.c_final = NUL;
            wlv.c_extra = ' ';
-            c = ' ';
+            mb_c = ' ';
          }
-        } else if (c == NUL
+          mb_schar = schar_from_char(mb_c);
        } else if (mb_c == NUL
                   && (wp->w_p_list
                       || ((wlv.fromcol >= 0 || fromcol_prev >= 0)
                           && wlv.tocol > wlv.vcol
@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
            wlv.n_extra = 0;
          }
          if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) {
-            c = wp->w_p_lcs_chars.eol;
+            mb_c = wp->w_p_lcs_chars.eol;
          } else {
-            c = ' ';
+            mb_c = ' ';
          }
          lcs_eol_one = -1;
          ptr--;  // put it back at the NUL
          wlv.extra_attr = win_hl_attr(wp, HLF_AT);
          wlv.n_attr = 1;
-          mb_c = c;
+          mb_schar = schar_from_char(mb_c);
-          mb_utf8 = check_mb_utf8(&c, u8cc);
+        } else if (mb_c != NUL) {
-        } else if (c != NUL) {
+          wlv.p_extra = transchar_buf(wp->w_buffer, mb_c);
          wlv.p_extra = transchar_buf(wp->w_buffer, c);
          if (wlv.n_extra == 0) {
-            wlv.n_extra = byte2cells(c) - 1;
+            wlv.n_extra = byte2cells(mb_c) - 1;
          }
          if ((dy_flags & DY_UHEX) && wp->w_p_rl) {
            rl_mirror_ascii(wlv.p_extra, NULL);   // reverse "<12>"
@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          if (wp->w_p_lbr) {
            char *p;
-            c = (uint8_t)(*wlv.p_extra);
+            mb_c = (uint8_t)(*wlv.p_extra);
            p = get_extra_buf((size_t)wlv.n_extra + 1);
            memset(p, ' ', (size_t)wlv.n_extra);
            strncpy(p,  // NOLINT(runtime/printf)
@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
            p[wlv.n_extra] = NUL;
            wlv.p_extra = p;
          } else {
-            wlv.n_extra = byte2cells(c) - 1;
+            wlv.n_extra = byte2cells(mb_c) - 1;
-            c = (uint8_t)(*wlv.p_extra++);
+            mb_c = (uint8_t)(*wlv.p_extra++);
          }
          wlv.n_attr = wlv.n_extra + 1;
          wlv.extra_attr = win_hl_attr(wp, HLF_8);
          saved_attr2 = wlv.char_attr;  // save current attr
-          mb_utf8 = false;   // don't draw as UTF-8
+          mb_schar = schar_from_ascii(mb_c);
        } else if (VIsual_active
                   && (VIsual_mode == Ctrl_V || VIsual_mode == 'v')
                   && virtual_active()
                   && wlv.tocol != MAXCOL
                   && wlv.vcol < wlv.tocol
                   && wlv.col < grid->cols) {
-          c = ' ';
+          mb_c = ' ';
          mb_schar = schar_from_char(mb_c);
          ptr--;  // put it back at the NUL
        }
      }
@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          // First time at this concealed item: display one
          // character.
          if (has_match_conc && match_conc) {
-            c = match_conc;
+            mb_c = match_conc;
          } else if (decor_conceal && decor_state.conceal_char) {
-            c = decor_state.conceal_char;
+            mb_c = decor_state.conceal_char;
            if (decor_state.conceal_attr) {
              wlv.char_attr = decor_state.conceal_attr;
            }
          } else if (syn_get_sub_char() != NUL) {
-            c = syn_get_sub_char();
+            mb_c = syn_get_sub_char();
          } else if (wp->w_p_lcs_chars.conceal != NUL) {
-            c = wp->w_p_lcs_chars.conceal;
+            mb_c = wp->w_p_lcs_chars.conceal;
          } else {
-            c = ' ';
+            mb_c = ' ';
          }
          prev_syntax_id = syntax_seqnr;
@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          is_concealing = true;
          wlv.skip_cells = 1;
        }
-        mb_c = c;
+        mb_schar = schar_from_char(mb_c);
        mb_utf8 = check_mb_utf8(&c, u8cc);
      } else {
        prev_syntax_id = 0;
        is_concealing = false;
@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
        && (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0)
        && wlv.filler_todo <= 0
        && wlv.draw_state > WL_STC
-        && c != NUL) {
+        && mb_c != NUL) {
-      c = wp->w_p_lcs_chars.prec;
+      mb_c = wp->w_p_lcs_chars.prec;
      lcs_prec_todo = NUL;
      if (utf_char2cells(mb_c) > 1) {
        // Double-width character being overwritten by the "precedes"
@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
        wlv.n_attr = 2;
        wlv.extra_attr = win_hl_attr(wp, HLF_AT);
      }
-      mb_c = c;
+      mb_schar = schar_from_char(mb_c);
      mb_utf8 = check_mb_utf8(&c, u8cc);
      saved_attr3 = wlv.char_attr;  // save current attr
      wlv.char_attr = win_hl_attr(wp, HLF_AT);  // overwriting char_attr
      n_attr3 = 1;
    }
    // At end of the text line or just after the last character.
-    if (c == NUL && eol_hl_off == 0) {
+    if (mb_c == NUL && eol_hl_off == 0) {
      // flag to indicate whether prevcol equals startcol of search_hl or
      // one of the matches
      bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl,
@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
    }
    // At end of the text line.
-    if (c == NUL) {
+    if (mb_c == NUL) {
      // Highlight 'cursorcolumn' & 'colorcolumn' past end of the line.
      if (wp->w_p_wrap) {
        v = wlv.startrow == 0 ? wp->w_skipcol : 0;
@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
          || lcs_eol_one > 0
          || (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL))
          || has_more_inline_virt(&wlv, v)) {
-        c = wp->w_p_lcs_chars.ext;
+        mb_c = wp->w_p_lcs_chars.ext;
        wlv.char_attr = win_hl_attr(wp, HLF_AT);
-        mb_c = c;
+        mb_schar = schar_from_char(mb_c);
        mb_utf8 = check_mb_utf8(&c, u8cc);
      }
    }
@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
    // Skip characters that are left of the screen for 'nowrap'.
    if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) {
      // Store the character.
-      if (mb_utf8) {
+      linebuf_char[wlv.off] = mb_schar;
        linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc);
      } else {
        linebuf_char[wlv.off] = schar_from_ascii((char)c);
      }
      if (multi_attr) {
        linebuf_attr[wlv.off] = multi_attr;
        multi_attr = 0;
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight)
      pc_status = PC_STATUS_SET;
    }
-    char buf[MB_MAXBYTES + 1];
+    char buf[MB_MAXCHAR + 1];
    grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr);
    grid_line_flush();
  }
@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent)
    int cc;
    if ((cc = utf_char2len(c)) > 1) {
-      char buf[MB_MAXBYTES + 1];
+      char buf[MB_MAXCHAR + 1];
      utf_char2bytes(c, buf);
      buf[cc] = NUL;
@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
  int cc;
  int temp = 0;                     // init for GCC
  bool did_backspace = false;
  int cpc[MAX_MCO];                 // composing characters
  bool call_fix_indent = false;
  // can't delete anything in an empty file
@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
        if (State & REPLACE_FLAG) {
          replace_do_bs(-1);
        } else {
-          const int l_p_deco = p_deco;
+          bool has_composing = false;
-          if (l_p_deco) {
+          if (p_deco) {
-            (void)utfc_ptr2char(get_cursor_pos_ptr(), cpc);
+            char *p0 = get_cursor_pos_ptr();
            has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
          }
          (void)del_char(false);
          // If there are combining characters and 'delcombine' is set
-          // move the cursor back.  Don't back up before the base
+          // move the cursor back.  Don't back up before the base character.
-          // character.
+          if (has_composing) {
          if (l_p_deco && cpc[0] != NUL) {
            inc_cursor();
          }
          if (revins_chars) {
--- a/src/nvim/eval.c
+++ b/src/nvim/eval.c
@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE
 /// Set v:char to character "c".
 void set_vim_var_char(int c)
 {
-  char buf[MB_MAXBYTES + 1];
+  char buf[MB_MAXCHAR + 1];
  buf[utf_char2bytes(c, buf)] = NUL;
  set_vim_var_string(VV_CHAR, buf, -1);
--- a/src/nvim/eval/funcs.c
+++ b/src/nvim/eval/funcs.c
@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
    return;
  }
-  char buf[MB_MAXBYTES];
+  char buf[MB_MAXCHAR];
  const int len = utf_char2bytes((int)num, buf);
  rettv->v_type = VAR_STRING;
@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
  if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
    c = -1;
  } else {
-    char buf[MB_MAXBYTES + 1];
+    char buf[MAX_SCHAR_SIZE + 1];
    schar_get(buf, grid_getchar(grid, row, col, NULL));
    c = utf_ptr2char(buf);
  }
@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
  ScreenGrid *grid;
  screenchar_adjust(&grid, &row, &col);
  tv_list_alloc_ret(rettv, kListLenMayKnow);
  if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
    tv_list_alloc_ret(rettv, 0);
    return;
  }
-  char buf[MB_MAXBYTES + 1];
+  char buf[MAX_SCHAR_SIZE + 1];
  schar_get(buf, grid_getchar(grid, row, col, NULL));
-  int pcc[MAX_MCO];
+
-  int c = utfc_ptr2char(buf, pcc);
+  // schar values are already processed chars which are always NUL-terminated.
-  int composing_len = 0;
+  // A single [0] is expected when char is NUL.
-  while (composing_len < MAX_MCO && pcc[composing_len] != 0) {
+  size_t i = 0;
-    composing_len++;
+  do {
-  }
+    int c = utf_ptr2char(buf + i);
-  tv_list_alloc_ret(rettv, composing_len + 1);
+    tv_list_append_number(rettv->vval.v_list, c);
-  tv_list_append_number(rettv->vval.v_list, c);
+    i += (size_t)utf_ptr2len(buf + i);
-  for (int i = 0; i < composing_len; i++) {
+  } while (buf[i] != NUL);
    tv_list_append_number(rettv->vval.v_list, pcc[i]);
  }
 }
 /// "screencol()" function
@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr
    return;
  }
-  char buf[MB_MAXBYTES + 1];
+  char buf[MAX_SCHAR_SIZE + 1];
  schar_get(buf, grid_getchar(grid, row, col, NULL));
  rettv->vval.v_string = xstrdup(buf);
 }
@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt
  char *const csearch = tv_dict_get_string(d, "char", false);
  if (csearch != NULL) {
-    int pcc[MAX_MCO];
+    int c = utf_ptr2char(csearch);
    const int c = utfc_ptr2char(csearch, pcc);
    set_last_csearch(c, csearch, utfc_ptr2len(csearch));
  }
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[]
 /// ":ascii" and "ga" implementation
 void do_ascii(exarg_T *eap)
 {
-  char *dig;
+  char *data = get_cursor_pos_ptr();
-  int cc[MAX_MCO];
+  size_t len = (size_t)utfc_ptr2len(data);
-  int c = utfc_ptr2char(get_cursor_pos_ptr(), cc);
+
-  if (c == NUL) {
+  if (len == 0) {
    msg("NUL", 0);
    return;
  }
-  size_t iobuff_len = 0;
+  bool need_clear = true;
  msg_sb_eol();
  msg_start();
-  int ci = 0;
+  int c = utf_ptr2char(data);
  size_t off = 0;
  // TODO(bfredl): merge this with the main loop
  if (c < 0x80) {
    if (c == NL) {  // NUL is stored as NL.
      c = NUL;
@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap)
    char buf2[20];
    buf2[0] = NUL;
-    dig = get_digraph_for_char(cval);
+    char *dig = get_digraph_for_char(cval);
    if (dig != NULL) {
-      iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
+      vim_snprintf(IObuff, sizeof(IObuff),
-                                         sizeof(IObuff) - iobuff_len,
+                   _("<%s>%s%s  %d,  Hex %02x,  Oct %03o, Digr %s"),
-                                         _("<%s>%s%s  %d,  Hex %02x,  Oct %03o, Digr %s"),
+                   transchar(c), buf1, buf2, cval, cval, cval, dig);
                                         transchar(c), buf1, buf2, cval, cval, cval, dig);
    } else {
-      iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
+      vim_snprintf(IObuff, sizeof(IObuff),
-                                         sizeof(IObuff) - iobuff_len,
+                   _("<%s>%s%s  %d,  Hex %02x,  Octal %03o"),
-                                         _("<%s>%s%s  %d,  Hex %02x,  Octal %03o"),
+                   transchar(c), buf1, buf2, cval, cval, cval);
                                         transchar(c), buf1, buf2, cval, cval, cval);
    }
-    c = cc[ci++];
+    msg_multiline(IObuff, 0, true, &need_clear);
    off += (size_t)utf_ptr2len(data);  // needed for overlong ascii?
  }
 #define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1)
  // Space for description:
  // - 1 byte for separator (starting from second entry)
  // - 1 byte for "<"
  // - 1 byte for space to draw composing character on (optional, but really
  //   mostly required)
  // - up to MB_MAXBYTES bytes for character itself
  // - 16 bytes for raw text ("> , Hex , Octal ").
  // - at least 4 bytes for hexadecimal representation
  // - at least 3 bytes for decimal representation
  // - at least 3 bytes for octal representation
  // - 1 byte for NUL
  //
  // Taking into account MAX_MCO and characters which need 8 bytes for
  // hexadecimal representation, but not taking translation into account:
  // resulting string will occupy less then 400 bytes (conservative estimate).
  //
  // Less then 1000 bytes if translation multiplies number of bytes needed for
  // raw text by 6, so it should always fit into 1025 bytes reserved for IObuff.
  // Repeat for combining characters, also handle multiby here.
-  while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) {
+  while (off < len) {
    c = utf_ptr2char(data + off);
    size_t iobuff_len = 0;
    // This assumes every multi-byte char is printable...
-    if (iobuff_len > 0) {
+    if (off > 0) {
      IObuff[iobuff_len++] = ' ';
    }
    IObuff[iobuff_len++] = '<';
@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap)
    }
    iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
-    dig = get_digraph_for_char(c);
+    char *dig = get_digraph_for_char(c);
    if (dig != NULL) {
-      iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
+      vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
-                                         sizeof(IObuff) - iobuff_len,
+                   (c < 0x10000
-                                         (c < 0x10000
+                    ? _("> %d, Hex %04x, Oct %o, Digr %s")
-                                          ? _("> %d, Hex %04x, Oct %o, Digr %s")
+                    : _("> %d, Hex %08x, Oct %o, Digr %s")),
-                                          : _("> %d, Hex %08x, Oct %o, Digr %s")),
+                   c, c, c, dig);
                                         c, c, c, dig);
    } else {
-      iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
+      vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
-                                         sizeof(IObuff) - iobuff_len,
+                   (c < 0x10000
-                                         (c < 0x10000
+                    ? _("> %d, Hex %04x, Octal %o")
-                                          ? _("> %d, Hex %04x, Octal %o")
+                    : _("> %d, Hex %08x, Octal %o")),
-                                          : _("> %d, Hex %08x, Octal %o")),
+                   c, c, c);
                                         c, c, c);
    }
-    if (ci == MAX_MCO) {
+
-      break;
+    msg_multiline(IObuff, 0, true, &need_clear);
-    }
+
-    c = cc[ci++];
+    off += (size_t)utf_ptr2len(data + off);  // needed for overlong ascii?
  }
  if (ci != MAX_MCO && c != 0) {
    xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len);
  }
-  msg(IObuff, 0);
+  if (need_clear) {
    msg_clr_eos();
  }
  msg_end();
 }
 /// ":left", ":center" and ":right": align text.
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off)
  }
 }
 /// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell.
 schar_T schar_from_cc(int c, int u8cc[MAX_MCO])
 {
  char buf[MAX_SCHAR_SIZE];
  int len = utf_char2bytes(c, buf);
  for (int i = 0; i < MAX_MCO; i++) {
    if (u8cc[i] == 0) {
      break;
    }
    len += utf_char2bytes(u8cc[i], buf + len);
  }
  buf[len] = 0;
  return schar_from_buf(buf, (size_t)len);
 }
 schar_T schar_from_str(char *str)
 {
  if (str == NULL) {
@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols)
    schar_get(scbuf, buf[i]);
    char scbuf_new[MAX_SCHAR_SIZE];
-    int len = utf_char2bytes(c0new, scbuf_new);
+    size_t len = (size_t)utf_char2bytes(c0new, scbuf_new);
    if (c1new) {
-      len += utf_char2bytes(c1new, scbuf_new + len);
+      len += (size_t)utf_char2bytes(c1new, scbuf_new + len);
    }
    int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0);
    size_t rest = strlen(scbuf + off);
-    if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) {
+    if (rest + len + 1 > MAX_SCHAR_SIZE) {
-      // TODO(bfredl): this cannot happen just yet, as we only construct
+      // Too bigly, discard one code-point.
-      // schar_T values with up to MAX_MCO+1 composing codepoints. When code
+      // This should be enough as c0 cannot grow more than from 2 to 4 bytes
-      // is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit,
+      // (base arabic to extended arabic)
-      // we need be able to peel off a composing char which doesn't fit anymore.
+      rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
      abort();
    }
    memcpy(scbuf_new + len, scbuf + off, rest);
-    buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest);
+    buf[i] = schar_from_buf(scbuf_new, len + rest);
 next:
    c0prev = c0;
@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row)
  return grid->attrs[grid->line_offset[row]] < 0;
 }
-/// Get a single character directly from grid.chars into "bytes", which must
+/// Get a single character directly from grid.chars
-/// have a size of "MB_MAXBYTES + 1".
+///
-/// If "attrp" is not NULL, return the character's attribute in "*attrp".
+/// @param[out] attrp  set to the character's attribute (optional)
 schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp)
 {
  grid_adjust(&grid, &row, &col);
@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
 {
  const char *ptr = text;
  int len = textlen;
  int u8cc[MAX_MCO];
  assert(grid_line_grid);
  int start_col = col;
  int max_col = grid_line_maxcol;
-  while (col < max_col
+  while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
         && (len < 0 || (int)(ptr - text) < len)
         && *ptr != NUL) {
    // check if this is the first byte of a multibyte
    int mbyte_blen = len > 0
      ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
      : utfc_ptr2len(ptr);
-    int u8c = len >= 0
+    int firstc;
-      ? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr))
+    schar_T schar = len >= 0
-      : utfc_ptr2char(ptr, u8cc);
+      ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
-    int mbyte_cells = utf_char2cells(u8c);
+      : utfc_ptr2schar(ptr, &firstc);
    int mbyte_cells = utf_char2cells(firstc);
    if (mbyte_cells > 2) {
      mbyte_cells = 1;
-      u8c = 0xFFFD;
+
-      u8cc[0] = 0;
+      schar = schar_from_char(0xFFFD);
    }
    if (col + mbyte_cells > max_col) {
      // Only 1 cell left, but character requires 2 cells:
      // display a '>' in the last column to avoid wrapping. */
-      u8c = '>';
+      schar = schar_from_ascii('>');
      u8cc[0] = 0;
      mbyte_cells = 1;
    }
    schar_T buf;
    // TODO(bfredl): why not just keep the original byte sequence.
    buf = schar_from_cc(u8c, u8cc);
    // When at the start of the text and overwriting the right half of a
    // two-cell character in the same grid, truncate that into a '>'.
    if (ptr == text && col > grid_line_first && col < grid_line_last
@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
      linebuf_char[col - 1] = schar_from_ascii('>');
    }
-    linebuf_char[col] = buf;
+    linebuf_char[col] = schar;
    linebuf_attr[col] = attr;
    linebuf_vcol[col] = -1;
    if (mbyte_cells == 2) {
--- a/src/nvim/grid_defs.h
+++ b/src/nvim/grid_defs.h
@ -7,8 +7,8 @@
 #include "nvim/pos.h"
 #include "nvim/types.h"
-#define MAX_MCO  6  // fixed value for 'maxcombine'
+// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
-// Includes final NUL. at least 4*(MAX_MCO+1)+1
+// ensures we can fit all composed chars which did fit before.
 #define MAX_SCHAR_SIZE 32
 // if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness)
@ -35,7 +35,7 @@ enum {
 /// we can avoid sending bigger updates than necessary to the Ul layer.
 ///
 /// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can
-/// contain up to MAX_MCO composing characters after the base character.
+/// contain as many composing characters as fit in MAX_SCHAR_SIZE-1 bytes.
 /// The composing characters are to be drawn on top of the original character.
 /// The content after the NUL is not defined (so comparison must be done a
 /// single cell at a time). Double-width characters are stored in the left cell,
--- a/src/nvim/insexpand.c
+++ b/src/nvim/insexpand.c
@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c)
    return;
  }
  if ((cc = utf_char2len(c)) > 1) {
-    char buf[MB_MAXBYTES + 1];
+    char buf[MB_MAXCHAR + 1];
    utf_char2bytes(c, buf);
    buf[cc] = NUL;
--- a/src/nvim/lua/stdlib.c
+++ b/src/nvim/lua/stdlib.c
@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
  if (offset < 0 || offset > (intptr_t)s1_len) {
    return luaL_error(lstate, "index out of range");
  }
-  int head_offset = utf_cp_head_off(s1, s1 + offset - 1);
+  int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
  lua_pushinteger(lstate, head_offset);
  return 1;
 }
--- a/src/nvim/match.c
+++ b/src/nvim/match.c
@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
    tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id);
    if (cur->mit_conceal_char) {
-      char buf[MB_MAXBYTES + 1];
+      char buf[MB_MAXCHAR + 1];
      buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL;
      tv_dict_add_str(dict, S_LEN("conceal"), buf);
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@ -48,6 +48,7 @@
 #include "nvim/getchar.h"
 #include "nvim/gettext.h"
 #include "nvim/globals.h"
 #include "nvim/grid.h"
 #include "nvim/grid_defs.h"
 #include "nvim/iconv.h"
 #include "nvim/keycodes.h"
@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
  return arabic_combine(utf_ptr2char(p1), c2);
 }
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
 ///
-/// Also gets up to #MAX_MCO composing characters.
+/// Caller is expected to check for things like unprintable chars etc
 /// If first char in string is a composing char, prepend a space to display it correctly.
 ///
-/// @param[out]  pcc  Location where to store composing characters. Must have
+/// If "p" starts with an invalid sequence, zero is returned.
 ///                   space at least for #MAX_MCO + 1 elements.
 ///
-/// @return leading character.
+/// @param[out] firstc (required) The first codepoint of the screen char,
-int utfc_ptr2char(const char *p, int *pcc)
+///                    or the first byte of an invalid sequence
 ///
 /// @return the char
 schar_T utfc_ptr2schar(const char *p, int *firstc)
  FUNC_ATTR_NONNULL_ALL
 {
  int i = 0;
  int c = utf_ptr2char(p);
-  int len = utf_ptr2len(p);
+  *firstc = c;  // NOT optional, you are gonna need it
  bool first_compose = utf_iscomposing(c);
  size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
  size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
-  // Only accept a composing char when the first char isn't illegal.
+  if (len == 1 && (uint8_t)(*p) >= 0x80) {
-  if ((len > 1 || (uint8_t)(*p) < 0x80)
+    return 0;  // invalid sequence
      && (uint8_t)p[len] >= 0x80
      && utf_composinglike(p, p + len)) {
    int cc = utf_ptr2char(p + len);
    while (true) {
      pcc[i++] = cc;
      if (i == MAX_MCO) {
        break;
      }
      len += utf_ptr2len(p + len);
      if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
        break;
      }
    }
  }
-  if (i < MAX_MCO) {    // last composing char must be 0
+  return schar_from_buf_first(p, len, first_compose);
    pcc[i] = 0;
  }
  return c;
 }
-// Convert a UTF-8 byte string to a wide character.  Also get up to MAX_MCO
+/// Get the screen char at the beginning of a string with length
-// composing characters.  Use no more than p[maxlen].
+///
-//
+/// Like utfc_ptr2schar but use no more than p[maxlen].
-// @param [out] pcc: composing chars, last one is 0
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+  FUNC_ATTR_NONNULL_ALL
 {
  assert(maxlen > 0);
-  int i = 0;
+  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
-
+  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
-  int len = utf_ptr2len_len(p, maxlen);
+    // invalid or truncated sequence
-  // Is it safe to use utf_ptr2char()?
+    *firstc = (uint8_t)(*p);
-  bool safe = len > 1 && len <= maxlen;
+    return 0;
  int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
  // Only accept a composing char when the first char isn't illegal.
  if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
    for (; i < MAX_MCO; i++) {
      int len_cc = utf_ptr2len_len(p + len, maxlen - len);
      safe = len_cc > 1 && len_cc <= maxlen - len;
      if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
          || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
        break;
      }
      len += len_cc;
    }
  }
-  if (i < MAX_MCO) {
+  int c = utf_ptr2char(p);
-    // last composing char must be 0
+  *firstc = c;
-    pcc[i] = 0;
+  bool first_compose = utf_iscomposing(c);
-  }
+  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
  len = (size_t)utfc_ptr2len_len(p, maxlen);
-  return c;
+  return schar_from_buf_first(p, len, first_compose);
-#undef ISCOMPOSING
+}
 /// Build a screen char from the first `len` bytes of `buf` via schar_from_buf(),
 /// placing a single space in front of those bytes when `first_compose` is set.
 ///
 /// Caller must ensure there is space for `first_compose`
 static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
 {
   if (!first_compose) {
     return schar_from_buf(buf, len);
   }

   // Leading space first, then the original bytes right behind it.
   char padded[MAX_SCHAR_SIZE];
   padded[0] = ' ';
   memcpy(&padded[1], buf, len);
   return schar_from_buf(padded, len + 1);
 }
 /// Get the length of a UTF-8 byte sequence representing a single codepoint
@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
    return 1;
  }
-  // Check for composing characters.  We can handle only the first six, but
+  // Check for composing characters.
  // skip all of them (otherwise the cursor would get stuck).
  int prevlen = 0;
  while (true) {
    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
 /// Return the offset from "p" to the first byte of the codepoint it points
 /// to. Can start anywhere in a stream of bytes.
 /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
 ///
 /// @param[in] base  Pointer to start of string
 /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
 //
-/// @return 0 if invalid sequence, else offset to previous codepoint
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
 int utf_cp_head_off(const char *base, const char *p)
 {
  int i;
@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
  }
  // Find the first character that is not 10xx.xxxx
-  for (i = 0; p - i > base; i--) {
+  for (i = 0; p - i >= base; i++) {
-    if (((uint8_t)p[i] & 0xc0) != 0x80) {
+    if (((uint8_t)p[-i] & 0xc0) != 0x80) {
      break;
    }
  }
-  // Find the last character that is 10xx.xxxx
+  // Find the last character that is 10xx.xxxx (condition terminates on NUL)
-  for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {}
+  int j = 1;
  while (((uint8_t)p[j] & 0xc0) == 0x80) {
    j++;
  }
  // Check for illegal sequence.
-  if (utf8len_tab[(uint8_t)p[i]] == 1) {
+  if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
    return 0;
  }
  return i;
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@ -7,6 +7,7 @@
 #include "nvim/cmdexpand_defs.h"
 #include "nvim/eval/typval_defs.h"
 #include "nvim/func_attr.h"
 #include "nvim/grid_defs.h"
 #include "nvim/mbyte_defs.h"
 #include "nvim/os/os_defs.h"
 #include "nvim/types.h"
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0;
 static void ui_ext_msg_set_pos(int row, bool scrolled)
 {
-  char buf[MAX_MCO + 1];
+  char buf[MB_MAXCHAR + 1];
  size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf);
  buf[size] = '\0';
  ui_call_msg_set_pos(msg_grid.handle, row, scrolled,
@ -1471,7 +1471,7 @@ void msg_putchar(int c)
 void msg_putchar_attr(int c, int attr)
 {
-  char buf[MB_MAXBYTES + 1];
+  char buf[MB_MAXCHAR + 1];
  if (IS_SPECIAL(c)) {
    buf[0] = (char)K_SPECIAL;
@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr)
    mode_displayed = false;
  }
  // If the string starts with a composing character first draw a space on
  // which the composing char can be drawn.
  if (utf_iscomposing(utf_ptr2char(msgstr))) {
    msg_puts_attr(" ", attr);
  }
  // Go over the string.  Special characters are translated and printed.
  // Normal characters are printed several at a time.
  while (--len >= 0 && !got_int) {
--- a/src/nvim/option_vars.h
+++ b/src/nvim/option_vars.h
@ -556,6 +556,7 @@ EXTERN char *p_mp;              ///< 'makeprg'
 EXTERN char *p_mps;             ///< 'matchpairs'
 EXTERN OptInt p_mat;            ///< 'matchtime'
 EXTERN OptInt p_mco;            ///< 'maxcombine'
 #define MAX_MCO  6  // fixed value for 'maxcombine'
 EXTERN OptInt p_mfd;            ///< 'maxfuncdepth'
 EXTERN OptInt p_mmd;            ///< 'maxmapdepth'
 EXTERN OptInt p_mmp;            ///< 'maxmempattern'
--- a/src/nvim/spellsuggest.c
+++ b/src/nvim/spellsuggest.c
@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word)
 static bool similar_chars(slang_T *slang, int c1, int c2)
 {
  int m1, m2;
-  char buf[MB_MAXBYTES + 1];
+  char buf[MB_MAXCHAR + 1];
  hashitem_T *hi;
  if (c1 >= 256) {
--- a/test/functional/ui/fold_spec.lua
+++ b/test/functional/ui/fold_spec.lua
@ -1102,8 +1102,6 @@ describe("folded lines", function()
    end)
    it("works with multibyte text", function()
      -- Currently the only allowed value of 'maxcombine'
      eq(6, meths.get_option_value('maxcombine', {}))
      eq(true, meths.get_option_value('arabicshape', {}))
      insert([[
        å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة
@ -1120,7 +1118,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ                               |
+          å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ                               |
          möre tex^t                                    |
          {1:~                                            }|
          {1:~                                            }|
@ -1132,7 +1130,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ                               |
+          å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ                               |
          möre tex^t                                    |
          {1:~                                            }|
          {1:~                                            }|
@ -1156,7 +1154,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {5:^+--  2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
+          {5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1168,7 +1166,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {5:^+--  2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
+          {5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1192,7 +1190,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {5:^+--  2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}|
+          {5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1204,7 +1202,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {5:^+--  2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}|
+          {5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1228,7 +1226,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {7:+ }{8:  1 }{5:^+--  2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}|
+          {7:+ }{8:  1 }{5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1240,7 +1238,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {7:+ }{8:  1 }{5:^+--  2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}|
+          {7:+ }{8:  1 }{5:^+--  2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
          {1:~                                            }|
          {1:~                                            }|
          {1:~                                            }|
@ -1265,7 +1263,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2  --^+}{8: 1  }{7: +}|
+          {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}{8: 1  }{7: +}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1277,7 +1275,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2  --^+}{8: 1  }{7: +}|
+          {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}{8: 1  }{7: +}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1301,7 +1299,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2  --^+}|
+          {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1313,7 +1311,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2  --^+}|
+          {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1337,7 +1335,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-          {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2  --^+}|
+          {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1349,7 +1347,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-          {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2  --^+}|
+          {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2  --^+}|
          {1:                                            ~}|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1373,7 +1371,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-                                         ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å|
+                                         ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
                                              txet eröm|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1385,7 +1383,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-                                         ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å|
+                                         ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
                                              txet eröm|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1409,7 +1407,7 @@ describe("folded lines", function()
          [2:---------------------------------------------]|
          [3:---------------------------------------------]|
        ## grid 2
-                                         ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å|
+                                         ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
                                              txet eröm|
          {1:                                            ~}|
          {1:                                            ~}|
@ -1421,7 +1419,7 @@ describe("folded lines", function()
        ]])
      else
        screen:expect([[
-                                         ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å|
+                                         ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
                                              txet eröm|
          {1:                                            ~}|
          {1:                                            ~}|
--- a/test/functional/ui/multibyte_spec.lua
+++ b/test/functional/ui/multibyte_spec.lua
@ -228,6 +228,36 @@ describe("multibyte rendering", function()
    ]]}
  end)
  it('works with arabicshape and multiple composing chars', function()
    -- This tests an important edge case: 'arabicshape' might increase the byte size of the base
    -- character so that the last composing char no longer fits. Use "g8" on the text to
    -- observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape').
    -- If the schar_t size were increased, say from 32 to 64 bytes, the test text would need
    -- to be extended with even more zalgo energy to still touch this edge case.
    meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"})
    command('set noarabicshape')
    screen:expect{grid=[[
      ^سلام့̀́̂̃̄̅̆̇̈̉̊̋̌                                                        |
      {1:~                                                           }|
      {1:~                                                           }|
      {1:~                                                           }|
      {1:~                                                           }|
                                                                  |
    ]]}
    command('set arabicshape')
    screen:expect{grid=[[
      ^ﺱﻼﻣ̀́̂̃̄̅̆̇̈̉̊̋̌                                                         |
      {1:~                                                           }|
      {1:~                                                           }|
      {1:~                                                           }|
      {1:~                                                           }|
                                                                  |
    ]]}
  end)
 end)
 describe('multibyte rendering: statusline', function()
--- a/test/functional/ui/output_spec.lua
+++ b/test/functional/ui/output_spec.lua
@ -225,8 +225,8 @@ describe("shell command :!", function()
        å                                                    |
        ref: å̲                                               |
        1: å̲                                                 |
-        2: å ̲                                               |
+        2: å ̲                                                |
-        3: å ̲                                               |
+        3: å ̲                                                |
                                                             |
        {3:Press ENTER or type command to continue}^              |
      ]])
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it)
 local ffi     = helpers.ffi
 local eq      = helpers.eq
-local mbyte = helpers.cimport("./src/nvim/mbyte.h")
+local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
 local charset = helpers.cimport('./src/nvim/charset.h')
 describe('mbyte', function()
  -- Array for composing characters
  local intp = ffi.typeof('int[?]')
  -- Returns a new 7-element `int[?]` FFI array created with the initializer 1.
  -- The tests pass it as the `pcc` argument and later check entries such as
  -- `pcc[6]` against 1, i.e. slots that were never written keep the initial value.
  local function to_intp()
    -- how to get MAX_MCO from globals.h?
    -- NOTE(review): 7 is presumably MAX_MCO + 1 (six composing chars plus one spare slot) -- confirm.
    return intp(7, 1)
  end
  -- Convert from bytes to string
  local function to_string(bytes)
    local s = {}
@ -30,14 +22,14 @@ describe('mbyte', function()
  itp('utf_ptr2char', function()
    -- For strings with length 1 the first byte is returned.
    for c = 0, 255 do
-      eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
+      eq(c, lib.utf_ptr2char(to_string({c, 0})))
    end
    -- Some ill formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
-    --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
+    --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
-    --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
+    --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)
@ -47,240 +39,133 @@ describe('mbyte', function()
      local char_p = ffi.typeof('char[?]')
      for c = n * 0x1000, n * 0x1000 + 0xFFF do
        local p = char_p(4, 0)
-        mbyte.utf_char2bytes(c, p)
+        lib.utf_char2bytes(c, p)
-        eq(c, mbyte.utf_ptr2char(p))
+        eq(c, lib.utf_ptr2char(p))
-        eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
+        eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
      end
    end)
  end
-  describe('utfc_ptr2char_len', function()
+  describe('utfc_ptr2schar_len', function()
    -- Test helper: runs lib.utfc_ptr2schar_len on the byte list `seq` and returns
    -- a two-element table {text, firstc}:
    --   text   - what lib.schar_get wrote into a 32-byte buffer, read as a Lua string
    --   firstc - the int stored in the `firstc` slot passed to utfc_ptr2schar_len
    -- `seq` is converted with to_string() and its element count (#seq) is passed as the length.
    local function test_seq(seq)
      -- One-element int array handed to utfc_ptr2schar_len; read back in the return below.
      local firstc = ffi.new("int[1]")
      -- Destination buffer for schar_get.
      -- NOTE(review): 32 presumably matches the maximum schar_t size -- confirm.
      local buf = ffi.new("char[32]")
      lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
      return {ffi.string(buf), firstc[0]}
    end
    -- Builds the expected test_seq result for a character that occupies a single
    -- byte: {one-character string for `val`, `val` itself}.
    local function byte(val)
      return {string.char(val), val}
    end
    itp('1-byte sequences', function()
-      local pcc = to_intp()
+      eq({'', 0}, test_seq{0})
-      for c = 0, 255 do
+      for c = 1, 127 do
-        eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
+        eq(byte(c), test_seq{c})
-        eq(0, pcc[0])
+      end
      for c = 128, 255 do
        eq({'', c}, test_seq{c})
      end
    end)
    itp('2-byte sequences', function()
      local pcc = to_intp()
      -- No combining characters
-      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
+      eq(byte(0x7f), test_seq{0x7f, 0x7f})
      eq(0, pcc[0])
      -- No combining characters
-      pcc = to_intp()
+      eq(byte(0x7f), test_seq{0x7f, 0x80})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
      eq(0, pcc[0])
      -- No UTF-8 sequence
-      pcc = to_intp()
+      eq({'', 0xc2}, test_seq{0xc2, 0x7f})
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
      eq(0, pcc[0])
      -- One UTF-8 character
-      pcc = to_intp()
+      eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
      eq(0, pcc[0])
      -- No UTF-8 sequence
-      pcc = to_intp()
+      eq({'', 0xc2}, test_seq{0xc2, 0xc0})
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
      eq(0, pcc[0])
    end)
    itp('3-byte sequences', function()
      local pcc = to_intp()
      -- No second UTF-8 character
-      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
+      eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
      eq(0, pcc[0])
      -- No combining character
-      pcc = to_intp()
+      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
      eq(0, pcc[0])
      -- Combining character is U+0300
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])
      -- No UTF-8 sequence
-      pcc = to_intp()
+      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
      eq(0, pcc[0])
      -- Incomplete combining character
-      pcc = to_intp()
+      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
      eq(0, pcc[0])
-      -- One UTF-8 character
+      -- One UTF-8 character (composing only)
-      pcc = to_intp()
+      eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
      eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
      eq(0, pcc[0])
    end)
    itp('4-byte sequences', function()
      local pcc = to_intp()
      -- No following combining character
-      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
+      eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
      eq(0, pcc[0])
      -- No second UTF-8 character
-      pcc = to_intp()
+      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- Combining character U+0300
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])
      -- No UTF-8 sequence
-      pcc = to_intp()
+      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- No following UTF-8 character
-      pcc = to_intp()
+      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
      eq(0, pcc[0])
      -- Combining character U+0301
-      pcc = to_intp()
+      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])
      -- One UTF-8 character
-      pcc = to_intp()
+      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
      eq(0, pcc[0])
    end)
    itp('5+-byte sequences', function()
      local pcc = to_intp()
      -- No following combining character
-      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
+      eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
      eq(0, pcc[0])
      -- No second UTF-8 character
-      pcc = to_intp()
+      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- Combining character U+0300
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])
      -- Combining characters U+0300 and U+0301
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
      -- Combining characters U+0300, U+0301, U+0302
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])
      -- Combining characters U+0300, U+0301, U+0302, U+0303
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0000, pcc[4])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
-      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
+      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
-        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0000, pcc[5])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
      -- U+0305
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      eq(1, pcc[6])
-      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
+      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
-      -- U+0305, U+0306, but only save six (= MAX_MCO).
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      eq(0x0001, pcc[6])
      -- Only three following combining characters U+0300, U+0301, U+0302
-      pcc = to_intp()
+      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])
      -- No UTF-8 sequence
-      pcc = to_intp()
+      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- No following UTF-8 character
-      pcc = to_intp()
+      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- Combining character U+0301
-      pcc = to_intp()
+      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])
      -- Combining character U+0301
-      pcc = to_intp()
+      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])
      -- One UTF-8 character
-      pcc = to_intp()
+      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
      eq(0, pcc[0])
      -- One UTF-8 character
-      pcc = to_intp()
+      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- One UTF-8 character
-      pcc = to_intp()
+      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
      eq(0, pcc[0])
      -- Combining characters U+1AB0 and U+0301
-      pcc = to_intp()
+      eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
        {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
      eq(0x1ab0, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
    end)
  end)