Merge pull request #25934 from bfredl/screenlinechar

refactor(grid): make screen rendering more multibyte than ever before
This commit is contained in:
bfredl 2023-11-17 13:38:10 +01:00 committed by GitHub
commit 7af89ef464
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 399 additions and 602 deletions

View File

@ -646,7 +646,8 @@ widespread as file format.
A composing or combining character is used to change the meaning of the A composing or combining character is used to change the meaning of the
character before it. The combining characters are drawn on top of the character before it. The combining characters are drawn on top of the
preceding character. preceding character.
Up to six combining characters can be displayed. Too big combined characters cannot be displayed, but they can still be
inspected using the |g8| and |ga| commands described below.
When editing text a composing character is mostly considered part of the When editing text a composing character is mostly considered part of the
preceding character. For example "x" will delete a character and its preceding character. For example "x" will delete a character and its
following composing characters by default. following composing characters by default.

View File

@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior.
Note that syntax highlighting of code examples requires a matching parser Note that syntax highlighting of code examples requires a matching parser
and may be affected by custom queries. and may be affected by custom queries.
• Support for rendering multibyte characters using composing characters has been
enhanced. The maximum limit has been increased from 1+6 codepoints to
31 bytes, which is guaranteed to fit all chars from before but often more.
NOTE: the regexp engine still has a hard-coded limit of considering
6 composing chars only.
============================================================================== ==============================================================================
REMOVED FEATURES *news-removed* REMOVED FEATURES *news-removed*

View File

@ -722,9 +722,16 @@ Options:
< <
*'macatsui'* *'macatsui'*
*'maxcombine'* *'mco'* *'maxcombine'* *'mco'*
Nvim always displays up to 6 combining characters. You can still edit Nvim counts maximum character sizes in bytes, not codepoints. This is
text with more than 6 combining characters, you just can't see them. guaranteed to be big enough to always fit all chars properly displayed
Use |g8| or |ga|. See |mbyte-combining|. in vim with 'maxcombine' set to 6.
You can still edit text with larger characters than fits in the screen buffer,
you just can't see them. Use |g8| or |ga|. See |mbyte-combining|.
NOTE: the regexp engine still has a hard-coded limit of considering
6 composing chars only.
*'maxmem'* Nvim delegates memory-management to the OS. *'maxmem'* Nvim delegates memory-management to the OS.
*'maxmemtot'* Nvim delegates memory-management to the OS. *'maxmemtot'* Nvim delegates memory-management to the OS.
printoptions printoptions

View File

@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg
--- security reasons. --- security reasons.
--- ---
--- @type boolean --- @type boolean
vim.o.fsync = false vim.o.fsync = true
vim.o.fs = vim.o.fsync vim.o.fs = vim.o.fsync
vim.go.fsync = vim.o.fsync vim.go.fsync = vim.o.fsync
vim.go.fs = vim.go.fsync vim.go.fs = vim.go.fsync

View File

@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len)
/// convert bytes to a character. /// convert bytes to a character.
void ins_char(int c) void ins_char(int c)
{ {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
size_t n = (size_t)utf_char2bytes(c, buf); size_t n = (size_t)utf_char2bytes(c, buf);
// When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte. // When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte.
@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
// If 'delcombine' is set and deleting (less than) one character, only // If 'delcombine' is set and deleting (less than) one character, only
// delete the last combining character. // delete the last combining character.
if (p_deco && use_delcombine if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
&& utfc_ptr2len(oldp + col) >= count) { char *p0 = oldp + col;
int cc[MAX_MCO]; if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
(void)utfc_ptr2char(oldp + col, cc);
if (cc[0] != NUL) {
// Find the last composing char, there can be several. // Find the last composing char, there can be several.
int n = col; int n = col;
do { do {

View File

@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab)
while (*p) { while (*p) {
const size_t l = (size_t)utfc_ptr2len(p); const size_t l = (size_t)utfc_ptr2len(p);
if (l > 1) { if (l > 1) {
int pcc[MAX_MCO + 1]; if (vim_isprintc(utf_ptr2char(p))) {
pcc[0] = utfc_ptr2char(p, &pcc[1]);
if (vim_isprintc(pcc[0])) {
len += l; len += l;
} else { } else {
for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
int c = utf_ptr2char(p + off);
char hexbuf[9]; char hexbuf[9];
len += transchar_hex(hexbuf, pcc[i]); len += transchar_hex(hexbuf, c);
} }
} }
p += l; p += l;
@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co
if (buf_p + l > buf_e) { if (buf_p + l > buf_e) {
break; // Exceeded `buf` size. break; // Exceeded `buf` size.
} }
int pcc[MAX_MCO + 1];
pcc[0] = utfc_ptr2char(p, &pcc[1]);
if (vim_isprintc(pcc[0])) { if (vim_isprintc(utf_ptr2char(p))) {
memmove(buf_p, p, l); memmove(buf_p, p, l);
buf_p += l; buf_p += l;
} else { } else {
for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
int c = utf_ptr2char(p + off);
char hexbuf[9]; // <up to 6 bytes>NUL char hexbuf[9]; // <up to 6 bytes>NUL
const size_t hexlen = transchar_hex(hexbuf, pcc[i]); const size_t hexlen = transchar_hex(hexbuf, c);
if (buf_p + hexlen > buf_e) { if (buf_p + hexlen > buf_e) {
break; break;
} }

View File

@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n)
bool check_digraph_chars_valid(int char1, int char2) bool check_digraph_chars_valid(int char1, int char2)
{ {
if (char2 == 0) { if (char2 == 0) {
char msg[MB_MAXBYTES + 1]; char msg[MB_MAXCHAR + 1];
msg[utf_char2bytes(char1, msg)] = NUL; msg[utf_char2bytes(char1, msg)] = NUL;
semsg(_(e_digraph_must_be_just_two_characters_str), msg); semsg(_(e_digraph_must_be_just_two_characters_str), msg);
return false; return false;

View File

@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
const char *p = *pp; const char *p = *pp;
int cells = utf_ptr2cells(p); int cells = utf_ptr2cells(p);
int c_len = utfc_ptr2len(p); int c_len = utfc_ptr2len(p);
int u8c, u8cc[MAX_MCO];
assert(maxcells > 0); assert(maxcells > 0);
if (cells > maxcells) { if (cells > maxcells) {
dest[0] = schar_from_ascii(' '); dest[0] = schar_from_ascii(' ');
return 1; return 1;
} }
u8c = utfc_ptr2char(p, u8cc);
if (*p == TAB) { if (*p == TAB) {
cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells); cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells);
} }
@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
for (int c = 0; c < cells; c++) { for (int c = 0; c < cells; c++) {
dest[c] = schar_from_ascii(' '); dest[c] = schar_from_ascii(' ');
} }
goto done;
} else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) {
dest[0] = schar_from_ascii(*p);
} else { } else {
dest[0] = schar_from_cc(u8c, u8cc); int u8c;
dest[0] = utfc_ptr2schar(p, &u8c);
if (cells > 1) {
dest[1] = 0;
}
} }
if (cells > 1) {
dest[1] = 0;
}
done:
*pp += c_len; *pp += c_len;
return cells; return cells;
} }
@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t
} }
} }
/// Check whether the character in "*c" needs more than one byte in UTF-8.
///
/// When utf_char2len(*c) is greater than 1, "*u8cc" is set to 0, "*c" is
/// overwritten with 0xc0 and true is returned. Otherwise both arguments are
/// left untouched and false is returned.
///
/// NOTE(review): 0xc0 looks like a placeholder marking "*c" as a non-ASCII
/// lead byte, and "*u8cc" looks like the first composing-char slot; confirm
/// against the callers.
///
/// @param[in,out] c     character to test; replaced by 0xc0 when multibyte
/// @param[out]    u8cc  set to 0 when "*c" is multibyte
///
/// @return true if "*c" takes more than one byte, false otherwise
static bool check_mb_utf8(int *c, int *u8cc)
{
if (utf_char2len(*c) > 1) {
*u8cc = 0;
*c = 0xc0;
return true;
}
return false;
}
static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line) static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line)
{ {
colnr_T trailcol = MAXCOL; colnr_T trailcol = MAXCOL;
@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
{ {
winlinevars_T wlv; // variables passed between functions winlinevars_T wlv; // variables passed between functions
int c = 0; // init for GCC
colnr_T vcol_prev = -1; // "wlv.vcol" of previous character colnr_T vcol_prev = -1; // "wlv.vcol" of previous character
char *line; // current line char *line; // current line
char *ptr; // current position in "line" char *ptr; // current position in "line"
@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
int multi_attr = 0; // attributes desired by multibyte int multi_attr = 0; // attributes desired by multibyte
int mb_l = 1; // multi-byte byte length int mb_l = 1; // multi-byte byte length
int mb_c = 0; // decoded multi-byte character int mb_c = 0; // decoded multi-byte character
bool mb_utf8 = false; // screen char is UTF-8 char schar_T mb_schar; // complete screen char
int u8cc[MAX_MCO]; // composing UTF-8 chars
int change_start = MAXCOL; // first col of changed area int change_start = MAXCOL; // first col of changed area
int change_end = -1; // last col of changed area int change_end = -1; // last col of changed area
bool in_multispace = false; // in multiple consecutive spaces bool in_multispace = false; // in multiple consecutive spaces
@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// For the '$' of the 'list' option, n_extra == 1, p_extra == "". // For the '$' of the 'list' option, n_extra == 1, p_extra == "".
if (wlv.n_extra > 0) { if (wlv.n_extra > 0) {
if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) { if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) {
c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra; mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
mb_c = c; // doesn't handle non-utf-8 multi-byte! mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc); wlv.n_extra--;
} else { } else {
assert(wlv.p_extra != NULL); assert(wlv.p_extra != NULL);
c = (uint8_t)(*wlv.p_extra);
mb_c = c;
// If the UTF-8 character is more than one byte:
// Decode it into "mb_c".
mb_l = utfc_ptr2len(wlv.p_extra); mb_l = utfc_ptr2len(wlv.p_extra);
mb_utf8 = false; mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c);
if (mb_l > wlv.n_extra) { // mb_l=0 at the end-of-line NUL
mb_l = 1; if (mb_l > wlv.n_extra || mb_l == 0) {
} else if (mb_l > 1) {
mb_c = utfc_ptr2char(wlv.p_extra, u8cc);
mb_utf8 = true;
c = 0xc0;
}
if (mb_l == 0) { // at the NUL at end-of-line
mb_l = 1; mb_l = 1;
} }
// If a double-width char doesn't fit display a '>' in the last column. // If a double-width char doesn't fit display a '>' in the last column.
// Don't advance the pointer but put the character at the start of the next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
c = '>'; mb_c = '>';
mb_c = c;
mb_l = 1; mb_l = 1;
(void)mb_l; (void)mb_l;
mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT); multi_attr = win_hl_attr(wp, HLF_AT);
if (wlv.cul_attr) { if (wlv.cul_attr) {
@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
? hl_combine_attr(wlv.cul_attr, multi_attr) ? hl_combine_attr(wlv.cul_attr, multi_attr)
: hl_combine_attr(multi_attr, wlv.cul_attr); : hl_combine_attr(multi_attr, wlv.cul_attr);
} }
// put the pointer back to output the double-width
// character at the start of the next line.
wlv.n_extra++;
wlv.p_extra--;
} else { } else {
wlv.n_extra -= mb_l - 1; wlv.n_extra -= mb_l;
wlv.p_extra += mb_l - 1; wlv.p_extra += mb_l;
} }
wlv.p_extra++;
} }
wlv.n_extra--;
// Only restore search_attr and area_attr after "n_extra" in // Only restore search_attr and area_attr after "n_extra" in
// the next screen line is also done. // the next screen line is also done.
@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
} }
} else if (has_fold) { } else if (has_fold) {
// skip writing the buffer line itself // skip writing the buffer line itself
c = NUL; mb_c = NUL;
} else { } else {
int c0;
char *prev_ptr = ptr; char *prev_ptr = ptr;
// Get a character from the line itself. // first byte of next char
c0 = c = (uint8_t)(*ptr); int c0 = (uint8_t)(*ptr);
mb_c = c; if (c0 == NUL) {
if (c == NUL) {
// no more cells to skip // no more cells to skip
wlv.skip_cells = 0; wlv.skip_cells = 0;
} }
// If the UTF-8 character is more than one byte: Decode it // Get a character from the line itself.
// into "mb_c".
mb_l = utfc_ptr2len(ptr); mb_l = utfc_ptr2len(ptr);
mb_utf8 = false; mb_schar = utfc_ptr2schar(ptr, &mb_c);
if (mb_l > 1) {
mb_c = utfc_ptr2char(ptr, u8cc);
// Overlong encoded ASCII or ASCII with composing char
// is displayed normally, except a NUL.
if (mb_c < 0x80) {
c0 = c = mb_c;
}
mb_utf8 = true;
// At start of the line we can have a composing char. // Overlong encoded ASCII or ASCII with composing char
// Draw it as a space with a composing char. // is displayed normally, except a NUL.
if (utf_iscomposing(mb_c)) { if (mb_l > 1 && mb_c < 0x80) {
for (int i = MAX_MCO - 1; i > 0; i--) { c0 = mb_c;
u8cc[i] = u8cc[i - 1];
}
u8cc[0] = mb_c;
mb_c = ' ';
}
} }
if ((mb_l == 1 && c >= 0x80) if ((mb_l == 1 && c0 >= 0x80)
|| (mb_l >= 1 && mb_c == 0) || (mb_l >= 1 && mb_c == 0)
|| (mb_l > 1 && (!vim_isprintc(mb_c)))) { || (mb_l > 1 && (!vim_isprintc(mb_c)))) {
// Illegal UTF-8 byte: display as <xx>. // Illegal UTF-8 byte: display as <xx>.
// Non-BMP character : display as ? or fullwidth ?. // Non-printable character : display as ? or fullwidth ?.
transchar_hex(wlv.extra, mb_c); transchar_hex(wlv.extra, mb_c);
if (wp->w_p_rl) { // reverse if (wp->w_p_rl) { // reverse
rl_mirror_ascii(wlv.extra, NULL); rl_mirror_ascii(wlv.extra, NULL);
} }
wlv.p_extra = wlv.extra; wlv.p_extra = wlv.extra;
c = (uint8_t)(*wlv.p_extra);
mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra); mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra);
mb_utf8 = (c >= 0x80); mb_schar = schar_from_char(mb_c);
wlv.n_extra = (int)strlen(wlv.p_extra); wlv.n_extra = (int)strlen(wlv.p_extra);
wlv.c_extra = NUL; wlv.c_extra = NUL;
wlv.c_final = NUL; wlv.c_final = NUL;
@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// last column; the character is displayed at the start of the // last column; the character is displayed at the start of the
// next line. // next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
c = '>'; mb_c = '>';
mb_c = c;
mb_utf8 = false;
mb_l = 1; mb_l = 1;
mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT); multi_attr = win_hl_attr(wp, HLF_AT);
// Put pointer back so that the character will be // Put pointer back so that the character will be
// displayed at the start of the next line. // displayed at the start of the next line.
@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 1; wlv.n_extra = 1;
wlv.c_extra = MB_FILLER_CHAR; wlv.c_extra = MB_FILLER_CHAR;
wlv.c_final = NUL; wlv.c_final = NUL;
c = ' '; mb_c = ' ';
mb_l = 1;
mb_schar = schar_from_ascii(mb_c);
if (area_attr == 0 && search_attr == 0) { if (area_attr == 0 && search_attr == 0) {
wlv.n_attr = wlv.n_extra + 1; wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_AT); wlv.extra_attr = win_hl_attr(wp, HLF_AT);
saved_attr2 = wlv.char_attr; // save current attr saved_attr2 = wlv.char_attr; // save current attr
} }
mb_c = c;
mb_utf8 = false;
mb_l = 1;
} }
ptr++; ptr++;
@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// no concealing past the end of the line, it interferes // no concealing past the end of the line, it interferes
// with line highlighting. // with line highlighting.
if (c == NUL) { syntax_flags = (mb_c == 0) ? 0 : get_syntax_info(&syntax_seqnr);
syntax_flags = 0;
} else {
syntax_flags = get_syntax_info(&syntax_seqnr);
}
} }
if (has_decor && v > 0) { if (has_decor && v > 0) {
@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
spell_attr = 0; spell_attr = 0;
// do not calculate cap_col at the end of the line or when // do not calculate cap_col at the end of the line or when
// only white space is following // only white space is following
if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) { if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
char *p; char *p;
hlf_T spell_hlf = HLF_COUNT; hlf_T spell_hlf = HLF_COUNT;
v -= mb_l - 1; v -= mb_l - 1;
@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// //
// So only allow to linebreak, once we have found chars not in // So only allow to linebreak, once we have found chars not in
// 'breakat' in the line. // 'breakat' in the line.
if (wp->w_p_lbr && !wlv.need_lbr && c != NUL if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL
&& !vim_isbreak((uint8_t)(*ptr))) { && !vim_isbreak((uint8_t)(*ptr))) {
wlv.need_lbr = true; wlv.need_lbr = true;
} }
// Found last space before word: check for line break. // Found last space before word: check for line break.
if (wp->w_p_lbr && c0 == c && wlv.need_lbr if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr
&& vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) { && vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) {
int mb_off = utf_head_off(line, ptr - 1); int mb_off = utf_head_off(line, ptr - 1);
char *p = ptr - (mb_off + 1); char *p = ptr - (mb_off + 1);
chartabsize_T cts; chartabsize_T cts;
@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1; wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1;
clear_chartabsize_arg(&cts); clear_chartabsize_arg(&cts);
if (on_last_col && c != TAB) { if (on_last_col && mb_c != TAB) {
// Do not continue search/match highlighting over the // Do not continue search/match highlighting over the
// line break, but for TABs the highlighting should // line break, but for TABs the highlighting should
// include the complete width of the character // include the complete width of the character
search_attr = 0; search_attr = 0;
} }
if (c == TAB && wlv.n_extra + wlv.col > grid->cols) { if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) {
wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts, wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts,
wp->w_buffer->b_p_vts_array) - 1; wp->w_buffer->b_p_vts_array) - 1;
} }
wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' '; wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' ';
wlv.c_final = NUL; wlv.c_final = NUL;
if (ascii_iswhite(c)) { if (mb_c < 128 && ascii_iswhite(mb_c)) {
if (c == TAB) { if (mb_c == TAB) {
// See "Tab alignment" below. // See "Tab alignment" below.
FIX_FOR_BOGUSCOLS; FIX_FOR_BOGUSCOLS;
} }
if (!wp->w_p_list) { if (!wp->w_p_list) {
c = ' '; mb_c = ' ';
mb_schar = schar_from_ascii(mb_c);
} }
} }
} }
if (wp->w_p_list) { if (wp->w_p_list) {
in_multispace = c == ' ' && (*ptr == ' ' in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' '));
|| (prev_ptr > line && prev_ptr[-1] == ' '));
if (!in_multispace) { if (!in_multispace) {
multispace_pos = 0; multispace_pos = 0;
} }
@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// But not when the character is followed by a composing // But not when the character is followed by a composing
// character (use mb_l to check that). // character (use mb_l to check that).
if (wp->w_p_list if (wp->w_p_list
&& ((((c == 160 && mb_l == 1) && ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3))
|| (mb_utf8
&& ((mb_c == 160 && mb_l == 2)
|| (mb_c == 0x202f && mb_l == 3))))
&& wp->w_p_lcs_chars.nbsp) && wp->w_p_lcs_chars.nbsp)
|| (c == ' ' || (mb_c == ' '
&& mb_l == 1 && mb_l == 1
&& (wp->w_p_lcs_chars.space && (wp->w_p_lcs_chars.space
|| (in_multispace && wp->w_p_lcs_chars.multispace != NULL)) || (in_multispace && wp->w_p_lcs_chars.multispace != NULL))
&& ptr - line >= leadcol && ptr - line >= leadcol
&& ptr - line <= trailcol))) { && ptr - line <= trailcol))) {
if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) { if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) {
c = wp->w_p_lcs_chars.multispace[multispace_pos++]; mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++];
if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) { if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) {
multispace_pos = 0; multispace_pos = 0;
} }
} else { } else {
c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp; mb_c = (mb_c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
} }
wlv.n_attr = 1; wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0); wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr saved_attr2 = wlv.char_attr; // save current attr
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc);
} }
if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol) if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol)
|| (leadcol != 0 && ptr < line + leadcol))) { || (leadcol != 0 && ptr < line + leadcol))) {
if (leadcol != 0 && in_multispace && ptr < line + leadcol if (leadcol != 0 && in_multispace && ptr < line + leadcol
&& wp->w_p_lcs_chars.leadmultispace != NULL) { && wp->w_p_lcs_chars.leadmultispace != NULL) {
c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++]; mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) { if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) {
multispace_pos = 0; multispace_pos = 0;
} }
} else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) { } else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) {
c = wp->w_p_lcs_chars.trail; mb_c = wp->w_p_lcs_chars.trail;
} else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) { } else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) {
c = wp->w_p_lcs_chars.lead; mb_c = wp->w_p_lcs_chars.lead;
} else if (leadcol != 0 && wp->w_p_lcs_chars.space) { } else if (leadcol != 0 && wp->w_p_lcs_chars.space) {
c = wp->w_p_lcs_chars.space; mb_c = wp->w_p_lcs_chars.space;
} }
wlv.n_attr = 1; wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0); wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr saved_attr2 = wlv.char_attr; // save current attr
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc);
} }
} }
// Handling of non-printable characters. // Handling of non-printable characters.
if (!vim_isprintc(c)) { if (!vim_isprintc(mb_c)) {
// when getting a character from the file, we may have to // when getting a character from the file, we may have to
// turn it into something else on the way to putting it on the screen. // turn it into something else on the way to putting it on the screen.
if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) { if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
int tab_len = 0; int tab_len = 0;
colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length
char *const sbr = get_showbreak_value(wp); char *const sbr = get_showbreak_value(wp);
@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wlv.n_extra > 0) { if (wlv.n_extra > 0) {
len += wlv.n_extra - tab_len; len += wlv.n_extra - tab_len;
} }
c = wp->w_p_lcs_chars.tab1; mb_c = wp->w_p_lcs_chars.tab1;
p = get_extra_buf((size_t)len + 1); p = get_extra_buf((size_t)len + 1);
memset(p, ' ', (size_t)len); memset(p, ' ', (size_t)len);
p[len] = NUL; p[len] = NUL;
@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
} }
} }
mb_utf8 = false; // don't draw as UTF-8
if (wp->w_p_list) { if (wp->w_p_list) {
c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3) mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
? wp->w_p_lcs_chars.tab3 ? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1;
: wp->w_p_lcs_chars.tab1;
if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) { if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) {
wlv.c_extra = NUL; // using p_extra from above wlv.c_extra = NUL; // using p_extra from above
} else { } else {
@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = tab_len + 1; wlv.n_attr = tab_len + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0); wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr saved_attr2 = wlv.char_attr; // save current attr
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
} else { } else {
wlv.c_final = NUL; wlv.c_final = NUL;
wlv.c_extra = ' '; wlv.c_extra = ' ';
c = ' '; mb_c = ' ';
} }
} else if (c == NUL mb_schar = schar_from_char(mb_c);
} else if (mb_c == NUL
&& (wp->w_p_list && (wp->w_p_list
|| ((wlv.fromcol >= 0 || fromcol_prev >= 0) || ((wlv.fromcol >= 0 || fromcol_prev >= 0)
&& wlv.tocol > wlv.vcol && wlv.tocol > wlv.vcol
@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 0; wlv.n_extra = 0;
} }
if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) { if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) {
c = wp->w_p_lcs_chars.eol; mb_c = wp->w_p_lcs_chars.eol;
} else { } else {
c = ' '; mb_c = ' ';
} }
lcs_eol_one = -1; lcs_eol_one = -1;
ptr--; // put it back at the NUL ptr--; // put it back at the NUL
wlv.extra_attr = win_hl_attr(wp, HLF_AT); wlv.extra_attr = win_hl_attr(wp, HLF_AT);
wlv.n_attr = 1; wlv.n_attr = 1;
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc); } else if (mb_c != NUL) {
} else if (c != NUL) { wlv.p_extra = transchar_buf(wp->w_buffer, mb_c);
wlv.p_extra = transchar_buf(wp->w_buffer, c);
if (wlv.n_extra == 0) { if (wlv.n_extra == 0) {
wlv.n_extra = byte2cells(c) - 1; wlv.n_extra = byte2cells(mb_c) - 1;
} }
if ((dy_flags & DY_UHEX) && wp->w_p_rl) { if ((dy_flags & DY_UHEX) && wp->w_p_rl) {
rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>" rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>"
@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wp->w_p_lbr) { if (wp->w_p_lbr) {
char *p; char *p;
c = (uint8_t)(*wlv.p_extra); mb_c = (uint8_t)(*wlv.p_extra);
p = get_extra_buf((size_t)wlv.n_extra + 1); p = get_extra_buf((size_t)wlv.n_extra + 1);
memset(p, ' ', (size_t)wlv.n_extra); memset(p, ' ', (size_t)wlv.n_extra);
strncpy(p, // NOLINT(runtime/printf) strncpy(p, // NOLINT(runtime/printf)
@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
p[wlv.n_extra] = NUL; p[wlv.n_extra] = NUL;
wlv.p_extra = p; wlv.p_extra = p;
} else { } else {
wlv.n_extra = byte2cells(c) - 1; wlv.n_extra = byte2cells(mb_c) - 1;
c = (uint8_t)(*wlv.p_extra++); mb_c = (uint8_t)(*wlv.p_extra++);
} }
wlv.n_attr = wlv.n_extra + 1; wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_8); wlv.extra_attr = win_hl_attr(wp, HLF_8);
saved_attr2 = wlv.char_attr; // save current attr saved_attr2 = wlv.char_attr; // save current attr
mb_utf8 = false; // don't draw as UTF-8 mb_schar = schar_from_ascii(mb_c);
} else if (VIsual_active } else if (VIsual_active
&& (VIsual_mode == Ctrl_V || VIsual_mode == 'v') && (VIsual_mode == Ctrl_V || VIsual_mode == 'v')
&& virtual_active() && virtual_active()
&& wlv.tocol != MAXCOL && wlv.tocol != MAXCOL
&& wlv.vcol < wlv.tocol && wlv.vcol < wlv.tocol
&& wlv.col < grid->cols) { && wlv.col < grid->cols) {
c = ' '; mb_c = ' ';
mb_schar = schar_from_char(mb_c);
ptr--; // put it back at the NUL ptr--; // put it back at the NUL
} }
} }
@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// First time at this concealed item: display one // First time at this concealed item: display one
// character. // character.
if (has_match_conc && match_conc) { if (has_match_conc && match_conc) {
c = match_conc; mb_c = match_conc;
} else if (decor_conceal && decor_state.conceal_char) { } else if (decor_conceal && decor_state.conceal_char) {
c = decor_state.conceal_char; mb_c = decor_state.conceal_char;
if (decor_state.conceal_attr) { if (decor_state.conceal_attr) {
wlv.char_attr = decor_state.conceal_attr; wlv.char_attr = decor_state.conceal_attr;
} }
} else if (syn_get_sub_char() != NUL) { } else if (syn_get_sub_char() != NUL) {
c = syn_get_sub_char(); mb_c = syn_get_sub_char();
} else if (wp->w_p_lcs_chars.conceal != NUL) { } else if (wp->w_p_lcs_chars.conceal != NUL) {
c = wp->w_p_lcs_chars.conceal; mb_c = wp->w_p_lcs_chars.conceal;
} else { } else {
c = ' '; mb_c = ' ';
} }
prev_syntax_id = syntax_seqnr; prev_syntax_id = syntax_seqnr;
@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
is_concealing = true; is_concealing = true;
wlv.skip_cells = 1; wlv.skip_cells = 1;
} }
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc);
} else { } else {
prev_syntax_id = 0; prev_syntax_id = 0;
is_concealing = false; is_concealing = false;
@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
&& (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0) && (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0)
&& wlv.filler_todo <= 0 && wlv.filler_todo <= 0
&& wlv.draw_state > WL_STC && wlv.draw_state > WL_STC
&& c != NUL) { && mb_c != NUL) {
c = wp->w_p_lcs_chars.prec; mb_c = wp->w_p_lcs_chars.prec;
lcs_prec_todo = NUL; lcs_prec_todo = NUL;
if (utf_char2cells(mb_c) > 1) { if (utf_char2cells(mb_c) > 1) {
// Double-width character being overwritten by the "precedes" // Double-width character being overwritten by the "precedes"
@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = 2; wlv.n_attr = 2;
wlv.extra_attr = win_hl_attr(wp, HLF_AT); wlv.extra_attr = win_hl_attr(wp, HLF_AT);
} }
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc);
saved_attr3 = wlv.char_attr; // save current attr saved_attr3 = wlv.char_attr; // save current attr
wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr
n_attr3 = 1; n_attr3 = 1;
} }
// At end of the text line or just after the last character. // At end of the text line or just after the last character.
if (c == NUL && eol_hl_off == 0) { if (mb_c == NUL && eol_hl_off == 0) {
// flag to indicate whether prevcol equals startcol of search_hl or // flag to indicate whether prevcol equals startcol of search_hl or
// one of the matches // one of the matches
bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl, bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl,
@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
} }
// At end of the text line. // At end of the text line.
if (c == NUL) { if (mb_c == NUL) {
// Highlight 'cursorcolumn' & 'colorcolumn' past end of the line. // Highlight 'cursorcolumn' & 'colorcolumn' past end of the line.
if (wp->w_p_wrap) { if (wp->w_p_wrap) {
v = wlv.startrow == 0 ? wp->w_skipcol : 0; v = wlv.startrow == 0 ? wp->w_skipcol : 0;
@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
|| lcs_eol_one > 0 || lcs_eol_one > 0
|| (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL)) || (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL))
|| has_more_inline_virt(&wlv, v)) { || has_more_inline_virt(&wlv, v)) {
c = wp->w_p_lcs_chars.ext; mb_c = wp->w_p_lcs_chars.ext;
wlv.char_attr = win_hl_attr(wp, HLF_AT); wlv.char_attr = win_hl_attr(wp, HLF_AT);
mb_c = c; mb_schar = schar_from_char(mb_c);
mb_utf8 = check_mb_utf8(&c, u8cc);
} }
} }
@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// Skip characters that are left of the screen for 'nowrap'. // Skip characters that are left of the screen for 'nowrap'.
if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) { if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) {
// Store the character. // Store the character.
if (mb_utf8) { linebuf_char[wlv.off] = mb_schar;
linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc);
} else {
linebuf_char[wlv.off] = schar_from_ascii((char)c);
}
if (multi_attr) { if (multi_attr) {
linebuf_attr[wlv.off] = multi_attr; linebuf_attr[wlv.off] = multi_attr;
multi_attr = 0; multi_attr = 0;

View File

@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight)
pc_status = PC_STATUS_SET; pc_status = PC_STATUS_SET;
} }
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr); grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr);
grid_line_flush(); grid_line_flush();
} }
@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent)
int cc; int cc;
if ((cc = utf_char2len(c)) > 1) { if ((cc = utf_char2len(c)) > 1) {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf); utf_char2bytes(c, buf);
buf[cc] = NUL; buf[cc] = NUL;
@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
int cc; int cc;
int temp = 0; // init for GCC int temp = 0; // init for GCC
bool did_backspace = false; bool did_backspace = false;
int cpc[MAX_MCO]; // composing characters
bool call_fix_indent = false; bool call_fix_indent = false;
// can't delete anything in an empty file // can't delete anything in an empty file
@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
if (State & REPLACE_FLAG) { if (State & REPLACE_FLAG) {
replace_do_bs(-1); replace_do_bs(-1);
} else { } else {
const int l_p_deco = p_deco; bool has_composing = false;
if (l_p_deco) { if (p_deco) {
(void)utfc_ptr2char(get_cursor_pos_ptr(), cpc); char *p0 = get_cursor_pos_ptr();
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
} }
(void)del_char(false); (void)del_char(false);
// If there are combining characters and 'delcombine' is set // If there are combining characters and 'delcombine' is set
// move the cursor back. Don't back up before the base // move the cursor back. Don't back up before the base character.
// character. if (has_composing) {
if (l_p_deco && cpc[0] != NUL) {
inc_cursor(); inc_cursor();
} }
if (revins_chars) { if (revins_chars) {

View File

@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE
/// Set v:char to character "c". /// Set v:char to character "c".
void set_vim_var_char(int c) void set_vim_var_char(int c)
{ {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(c, buf)] = NUL; buf[utf_char2bytes(c, buf)] = NUL;
set_vim_var_string(VV_CHAR, buf, -1); set_vim_var_string(VV_CHAR, buf, -1);

View File

@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
return; return;
} }
char buf[MB_MAXBYTES]; char buf[MB_MAXCHAR];
const int len = utf_char2bytes((int)num, buf); const int len = utf_char2bytes((int)num, buf);
rettv->v_type = VAR_STRING; rettv->v_type = VAR_STRING;
@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
c = -1; c = -1;
} else { } else {
char buf[MB_MAXBYTES + 1]; char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL)); schar_get(buf, grid_getchar(grid, row, col, NULL));
c = utf_ptr2char(buf); c = utf_ptr2char(buf);
} }
@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
ScreenGrid *grid; ScreenGrid *grid;
screenchar_adjust(&grid, &row, &col); screenchar_adjust(&grid, &row, &col);
tv_list_alloc_ret(rettv, kListLenMayKnow);
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
tv_list_alloc_ret(rettv, 0);
return; return;
} }
char buf[MB_MAXBYTES + 1]; char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL)); schar_get(buf, grid_getchar(grid, row, col, NULL));
int pcc[MAX_MCO];
int c = utfc_ptr2char(buf, pcc); // schar values are already processed chars which are always NUL-terminated.
int composing_len = 0; // A single [0] is expected when char is NUL.
while (composing_len < MAX_MCO && pcc[composing_len] != 0) { size_t i = 0;
composing_len++; do {
} int c = utf_ptr2char(buf + i);
tv_list_alloc_ret(rettv, composing_len + 1); tv_list_append_number(rettv->vval.v_list, c);
tv_list_append_number(rettv->vval.v_list, c); i += (size_t)utf_ptr2len(buf + i);
for (int i = 0; i < composing_len; i++) { } while (buf[i] != NUL);
tv_list_append_number(rettv->vval.v_list, pcc[i]);
}
} }
/// "screencol()" function /// "screencol()" function
@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr
return; return;
} }
char buf[MB_MAXBYTES + 1]; char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL)); schar_get(buf, grid_getchar(grid, row, col, NULL));
rettv->vval.v_string = xstrdup(buf); rettv->vval.v_string = xstrdup(buf);
} }
@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt
char *const csearch = tv_dict_get_string(d, "char", false); char *const csearch = tv_dict_get_string(d, "char", false);
if (csearch != NULL) { if (csearch != NULL) {
int pcc[MAX_MCO]; int c = utf_ptr2char(csearch);
const int c = utfc_ptr2char(csearch, pcc);
set_last_csearch(c, csearch, utfc_ptr2len(csearch)); set_last_csearch(c, csearch, utfc_ptr2len(csearch));
} }

View File

@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[]
/// ":ascii" and "ga" implementation /// ":ascii" and "ga" implementation
void do_ascii(exarg_T *eap) void do_ascii(exarg_T *eap)
{ {
char *dig; char *data = get_cursor_pos_ptr();
int cc[MAX_MCO]; size_t len = (size_t)utfc_ptr2len(data);
int c = utfc_ptr2char(get_cursor_pos_ptr(), cc);
if (c == NUL) { if (len == 0) {
msg("NUL", 0); msg("NUL", 0);
return; return;
} }
size_t iobuff_len = 0; bool need_clear = true;
msg_sb_eol();
msg_start();
int ci = 0; int c = utf_ptr2char(data);
size_t off = 0;
// TODO(bfredl): merge this with the main loop
if (c < 0x80) { if (c < 0x80) {
if (c == NL) { // NUL is stored as NL. if (c == NL) { // NUL is stored as NL.
c = NUL; c = NUL;
@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap)
char buf2[20]; char buf2[20];
buf2[0] = NUL; buf2[0] = NUL;
dig = get_digraph_for_char(cval); char *dig = get_digraph_for_char(cval);
if (dig != NULL) { if (dig != NULL) {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, vim_snprintf(IObuff, sizeof(IObuff),
sizeof(IObuff) - iobuff_len, _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"),
_("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"), transchar(c), buf1, buf2, cval, cval, cval, dig);
transchar(c), buf1, buf2, cval, cval, cval, dig);
} else { } else {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, vim_snprintf(IObuff, sizeof(IObuff),
sizeof(IObuff) - iobuff_len, _("<%s>%s%s %d, Hex %02x, Octal %03o"),
_("<%s>%s%s %d, Hex %02x, Octal %03o"), transchar(c), buf1, buf2, cval, cval, cval);
transchar(c), buf1, buf2, cval, cval, cval);
} }
c = cc[ci++]; msg_multiline(IObuff, 0, true, &need_clear);
off += (size_t)utf_ptr2len(data); // needed for overlong ascii?
} }
#define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1)
// Space for description:
// - 1 byte for separator (starting from second entry)
// - 1 byte for "<"
// - 1 byte for space to draw composing character on (optional, but really
// mostly required)
// - up to MB_MAXBYTES bytes for character itself
// - 16 bytes for raw text ("> , Hex , Octal ").
// - at least 4 bytes for hexadecimal representation
// - at least 3 bytes for decimal representation
// - at least 3 bytes for octal representation
// - 1 byte for NUL
//
// Taking into account MAX_MCO and characters which need 8 bytes for
// hexadecimal representation, but not taking translation into account:
// resulting string will occupy less then 400 bytes (conservative estimate).
//
// Less then 1000 bytes if translation multiplies number of bytes needed for
// raw text by 6, so it should always fit into 1025 bytes reserved for IObuff.
// Repeat for combining characters, also handle multiby here. // Repeat for combining characters, also handle multiby here.
while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) { while (off < len) {
c = utf_ptr2char(data + off);
size_t iobuff_len = 0;
// This assumes every multi-byte char is printable... // This assumes every multi-byte char is printable...
if (iobuff_len > 0) { if (off > 0) {
IObuff[iobuff_len++] = ' '; IObuff[iobuff_len++] = ' ';
} }
IObuff[iobuff_len++] = '<'; IObuff[iobuff_len++] = '<';
@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap)
} }
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
dig = get_digraph_for_char(c); char *dig = get_digraph_for_char(c);
if (dig != NULL) { if (dig != NULL) {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
sizeof(IObuff) - iobuff_len, (c < 0x10000
(c < 0x10000 ? _("> %d, Hex %04x, Oct %o, Digr %s")
? _("> %d, Hex %04x, Oct %o, Digr %s") : _("> %d, Hex %08x, Oct %o, Digr %s")),
: _("> %d, Hex %08x, Oct %o, Digr %s")), c, c, c, dig);
c, c, c, dig);
} else { } else {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
sizeof(IObuff) - iobuff_len, (c < 0x10000
(c < 0x10000 ? _("> %d, Hex %04x, Octal %o")
? _("> %d, Hex %04x, Octal %o") : _("> %d, Hex %08x, Octal %o")),
: _("> %d, Hex %08x, Octal %o")), c, c, c);
c, c, c);
} }
if (ci == MAX_MCO) {
break; msg_multiline(IObuff, 0, true, &need_clear);
}
c = cc[ci++]; off += (size_t)utf_ptr2len(data + off); // needed for overlong ascii?
}
if (ci != MAX_MCO && c != 0) {
xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len);
} }
msg(IObuff, 0); if (need_clear) {
msg_clr_eos();
}
msg_end();
} }
/// ":left", ":center" and ":right": align text. /// ":left", ":center" and ":right": align text.

View File

@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off)
} }
} }
/// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell.
schar_T schar_from_cc(int c, int u8cc[MAX_MCO])
{
char buf[MAX_SCHAR_SIZE];
int len = utf_char2bytes(c, buf);
for (int i = 0; i < MAX_MCO; i++) {
if (u8cc[i] == 0) {
break;
}
len += utf_char2bytes(u8cc[i], buf + len);
}
buf[len] = 0;
return schar_from_buf(buf, (size_t)len);
}
schar_T schar_from_str(char *str) schar_T schar_from_str(char *str)
{ {
if (str == NULL) { if (str == NULL) {
@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols)
schar_get(scbuf, buf[i]); schar_get(scbuf, buf[i]);
char scbuf_new[MAX_SCHAR_SIZE]; char scbuf_new[MAX_SCHAR_SIZE];
int len = utf_char2bytes(c0new, scbuf_new); size_t len = (size_t)utf_char2bytes(c0new, scbuf_new);
if (c1new) { if (c1new) {
len += utf_char2bytes(c1new, scbuf_new + len); len += (size_t)utf_char2bytes(c1new, scbuf_new + len);
} }
int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0); int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0);
size_t rest = strlen(scbuf + off); size_t rest = strlen(scbuf + off);
if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) { if (rest + len + 1 > MAX_SCHAR_SIZE) {
// TODO(bfredl): this cannot happen just yet, as we only construct // Too bigly, discard one code-point.
// schar_T values with up to MAX_MCO+1 composing codepoints. When code // This should be enough as c0 cannot grow more than from 2 to 4 bytes
// is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit, // (base arabic to extended arabic)
// we need be able to peel off a composing char which doesn't fit anymore. rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
abort();
} }
memcpy(scbuf_new + len, scbuf + off, rest); memcpy(scbuf_new + len, scbuf + off, rest);
buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest); buf[i] = schar_from_buf(scbuf_new, len + rest);
next: next:
c0prev = c0; c0prev = c0;
@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row)
return grid->attrs[grid->line_offset[row]] < 0; return grid->attrs[grid->line_offset[row]] < 0;
} }
/// Get a single character directly from grid.chars into "bytes", which must /// Get a single character directly from grid.chars
/// have a size of "MB_MAXBYTES + 1". ///
/// If "attrp" is not NULL, return the character's attribute in "*attrp". /// @param[out] attrp set to the character's attribute (optional)
schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp) schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp)
{ {
grid_adjust(&grid, &row, &col); grid_adjust(&grid, &row, &col);
@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
{ {
const char *ptr = text; const char *ptr = text;
int len = textlen; int len = textlen;
int u8cc[MAX_MCO];
assert(grid_line_grid); assert(grid_line_grid);
int start_col = col; int start_col = col;
int max_col = grid_line_maxcol; int max_col = grid_line_maxcol;
while (col < max_col while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
&& (len < 0 || (int)(ptr - text) < len)
&& *ptr != NUL) {
// check if this is the first byte of a multibyte // check if this is the first byte of a multibyte
int mbyte_blen = len > 0 int mbyte_blen = len > 0
? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
: utfc_ptr2len(ptr); : utfc_ptr2len(ptr);
int u8c = len >= 0 int firstc;
? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr)) schar_T schar = len >= 0
: utfc_ptr2char(ptr, u8cc); ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
int mbyte_cells = utf_char2cells(u8c); : utfc_ptr2schar(ptr, &firstc);
int mbyte_cells = utf_char2cells(firstc);
if (mbyte_cells > 2) { if (mbyte_cells > 2) {
mbyte_cells = 1; mbyte_cells = 1;
u8c = 0xFFFD;
u8cc[0] = 0; schar = schar_from_char(0xFFFD);
} }
if (col + mbyte_cells > max_col) { if (col + mbyte_cells > max_col) {
// Only 1 cell left, but character requires 2 cells: // Only 1 cell left, but character requires 2 cells:
// display a '>' in the last column to avoid wrapping. */ // display a '>' in the last column to avoid wrapping. */
u8c = '>'; schar = schar_from_ascii('>');
u8cc[0] = 0;
mbyte_cells = 1; mbyte_cells = 1;
} }
schar_T buf;
// TODO(bfredl): why not just keep the original byte sequence.
buf = schar_from_cc(u8c, u8cc);
// When at the start of the text and overwriting the right half of a // When at the start of the text and overwriting the right half of a
// two-cell character in the same grid, truncate that into a '>'. // two-cell character in the same grid, truncate that into a '>'.
if (ptr == text && col > grid_line_first && col < grid_line_last if (ptr == text && col > grid_line_first && col < grid_line_last
@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
linebuf_char[col - 1] = schar_from_ascii('>'); linebuf_char[col - 1] = schar_from_ascii('>');
} }
linebuf_char[col] = buf; linebuf_char[col] = schar;
linebuf_attr[col] = attr; linebuf_attr[col] = attr;
linebuf_vcol[col] = -1; linebuf_vcol[col] = -1;
if (mbyte_cells == 2) { if (mbyte_cells == 2) {

View File

@ -7,8 +7,8 @@
#include "nvim/pos.h" #include "nvim/pos.h"
#include "nvim/types.h" #include "nvim/types.h"
#define MAX_MCO 6 // fixed value for 'maxcombine' // Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
// Includes final NUL. at least 4*(MAX_MCO+1)+1 // ensures we can fit all composed chars which did fit before.
#define MAX_SCHAR_SIZE 32 #define MAX_SCHAR_SIZE 32
// if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness) // if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness)
@ -35,7 +35,7 @@ enum {
/// we can avoid sending bigger updates than necessary to the Ul layer. /// we can avoid sending bigger updates than necessary to the Ul layer.
/// ///
/// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can /// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can
/// contain up to MAX_MCO composing characters after the base character. /// contain composing characters as many as fits in MAX_SCHAR_SIZE-1 bytes
/// The composing characters are to be drawn on top of the original character. /// The composing characters are to be drawn on top of the original character.
/// The content after the NUL is not defined (so comparison must be done a /// The content after the NUL is not defined (so comparison must be done a
/// single cell at a time). Double-width characters are stored in the left cell, /// single cell at a time). Double-width characters are stored in the left cell,

View File

@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c)
return; return;
} }
if ((cc = utf_char2len(c)) > 1) { if ((cc = utf_char2len(c)) > 1) {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf); utf_char2bytes(c, buf);
buf[cc] = NUL; buf[cc] = NUL;

View File

@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
if (offset < 0 || offset > (intptr_t)s1_len) { if (offset < 0 || offset > (intptr_t)s1_len) {
return luaL_error(lstate, "index out of range"); return luaL_error(lstate, "index out of range");
} }
int head_offset = utf_cp_head_off(s1, s1 + offset - 1); int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
lua_pushinteger(lstate, head_offset); lua_pushinteger(lstate, head_offset);
return 1; return 1;
} }

View File

@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id); tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id);
if (cur->mit_conceal_char) { if (cur->mit_conceal_char) {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL; buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL;
tv_dict_add_str(dict, S_LEN("conceal"), buf); tv_dict_add_str(dict, S_LEN("conceal"), buf);

View File

@ -48,6 +48,7 @@
#include "nvim/getchar.h" #include "nvim/getchar.h"
#include "nvim/gettext.h" #include "nvim/gettext.h"
#include "nvim/globals.h" #include "nvim/globals.h"
#include "nvim/grid.h"
#include "nvim/grid_defs.h" #include "nvim/grid_defs.h"
#include "nvim/iconv.h" #include "nvim/iconv.h"
#include "nvim/keycodes.h" #include "nvim/keycodes.h"
@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
return arabic_combine(utf_ptr2char(p1), c2); return arabic_combine(utf_ptr2char(p1), c2);
} }
/// Convert a UTF-8 string to a wide character /// Get the screen char at the beginning of a string
/// ///
/// Also gets up to #MAX_MCO composing characters. /// Caller is expected to check for things like unprintable chars etc
/// If first char in string is a composing char, prepend a space to display it correctly.
/// ///
/// @param[out] pcc Location where to store composing characters. Must have /// If "p" starts with an invalid sequence, zero is returned.
/// space at least for #MAX_MCO + 1 elements.
/// ///
/// @return leading character. /// @param[out] firstc (required) The first codepoint of the screen char,
int utfc_ptr2char(const char *p, int *pcc) /// or the first byte of an invalid sequence
///
/// @return the char
schar_T utfc_ptr2schar(const char *p, int *firstc)
FUNC_ATTR_NONNULL_ALL
{ {
int i = 0;
int c = utf_ptr2char(p); int c = utf_ptr2char(p);
int len = utf_ptr2len(p); *firstc = c; // NOT optional, you are gonna need it
bool first_compose = utf_iscomposing(c);
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
// Only accept a composing char when the first char isn't illegal. if (len == 1 && (uint8_t)(*p) >= 0x80) {
if ((len > 1 || (uint8_t)(*p) < 0x80) return 0; // invalid sequence
&& (uint8_t)p[len] >= 0x80
&& utf_composinglike(p, p + len)) {
int cc = utf_ptr2char(p + len);
while (true) {
pcc[i++] = cc;
if (i == MAX_MCO) {
break;
}
len += utf_ptr2len(p + len);
if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
break;
}
}
} }
if (i < MAX_MCO) { // last composing char must be 0 return schar_from_buf_first(p, len, first_compose);
pcc[i] = 0;
}
return c;
} }
// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO /// Get the screen char at the beginning of a string with length
// composing characters. Use no more than p[maxlen]. ///
// /// Like utfc_ptr2schar but use no more than p[maxlen].
// @param [out] pcc: composing chars, last one is 0 schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
int utfc_ptr2char_len(const char *p, int *pcc, int maxlen) FUNC_ATTR_NONNULL_ALL
{ {
assert(maxlen > 0); assert(maxlen > 0);
int i = 0; size_t len = (size_t)utf_ptr2len_len(p, maxlen);
if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
int len = utf_ptr2len_len(p, maxlen); // invalid or truncated sequence
// Is it safe to use utf_ptr2char()? *firstc = (uint8_t)(*p);
bool safe = len > 1 && len <= maxlen; return 0;
int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
// Only accept a composing char when the first char isn't illegal.
if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
for (; i < MAX_MCO; i++) {
int len_cc = utf_ptr2len_len(p + len, maxlen - len);
safe = len_cc > 1 && len_cc <= maxlen - len;
if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
|| !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
break;
}
len += len_cc;
}
} }
if (i < MAX_MCO) { int c = utf_ptr2char(p);
// last composing char must be 0 *firstc = c;
pcc[i] = 0; bool first_compose = utf_iscomposing(c);
} maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
len = (size_t)utfc_ptr2len_len(p, maxlen);
return c; return schar_from_buf_first(p, len, first_compose);
#undef ISCOMPOSING }
/// Caller must ensure there is space for `first_compose`
static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
{
if (first_compose) {
char cbuf[MAX_SCHAR_SIZE];
cbuf[0] = ' ';
memcpy(cbuf + 1, buf, len);
return schar_from_buf(cbuf, len + 1);
} else {
return schar_from_buf(buf, len);
}
} }
/// Get the length of a UTF-8 byte sequence representing a single codepoint /// Get the length of a UTF-8 byte sequence representing a single codepoint
@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
return 1; return 1;
} }
// Check for composing characters. We can handle only the first six, but // Check for composing characters.
// skip all of them (otherwise the cursor would get stuck).
int prevlen = 0; int prevlen = 0;
while (true) { while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
/// Return the offset from "p" to the first byte of the codepoint it points /// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes. /// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
/// separately and returns a negative offset. /// separately.
/// ///
/// @param[in] base Pointer to start of string /// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
// //
/// @return 0 if invalid sequence, else offset to previous codepoint /// @return 0 if invalid sequence, else number of bytes to previous codepoint
int utf_cp_head_off(const char *base, const char *p) int utf_cp_head_off(const char *base, const char *p)
{ {
int i; int i;
@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
} }
// Find the first character that is not 10xx.xxxx // Find the first character that is not 10xx.xxxx
for (i = 0; p - i > base; i--) { for (i = 0; p - i >= base; i++) {
if (((uint8_t)p[i] & 0xc0) != 0x80) { if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break; break;
} }
} }
// Find the last character that is 10xx.xxxx // Find the last character that is 10xx.xxxx (condition terminates on NUL)
for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {} int j = 1;
while (((uint8_t)p[j] & 0xc0) == 0x80) {
j++;
}
// Check for illegal sequence. // Check for illegal sequence.
if (utf8len_tab[(uint8_t)p[i]] == 1) { if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0; return 0;
} }
return i; return i;

View File

@ -7,6 +7,7 @@
#include "nvim/cmdexpand_defs.h" #include "nvim/cmdexpand_defs.h"
#include "nvim/eval/typval_defs.h" #include "nvim/eval/typval_defs.h"
#include "nvim/func_attr.h" #include "nvim/func_attr.h"
#include "nvim/grid_defs.h"
#include "nvim/mbyte_defs.h" #include "nvim/mbyte_defs.h"
#include "nvim/os/os_defs.h" #include "nvim/os/os_defs.h"
#include "nvim/types.h" #include "nvim/types.h"

View File

@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0;
static void ui_ext_msg_set_pos(int row, bool scrolled) static void ui_ext_msg_set_pos(int row, bool scrolled)
{ {
char buf[MAX_MCO + 1]; char buf[MB_MAXCHAR + 1];
size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf); size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf);
buf[size] = '\0'; buf[size] = '\0';
ui_call_msg_set_pos(msg_grid.handle, row, scrolled, ui_call_msg_set_pos(msg_grid.handle, row, scrolled,
@ -1471,7 +1471,7 @@ void msg_putchar(int c)
void msg_putchar_attr(int c, int attr) void msg_putchar_attr(int c, int attr)
{ {
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
if (IS_SPECIAL(c)) { if (IS_SPECIAL(c)) {
buf[0] = (char)K_SPECIAL; buf[0] = (char)K_SPECIAL;
@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr)
mode_displayed = false; mode_displayed = false;
} }
// If the string starts with a composing character first draw a space on
// which the composing char can be drawn.
if (utf_iscomposing(utf_ptr2char(msgstr))) {
msg_puts_attr(" ", attr);
}
// Go over the string. Special characters are translated and printed. // Go over the string. Special characters are translated and printed.
// Normal characters are printed several at a time. // Normal characters are printed several at a time.
while (--len >= 0 && !got_int) { while (--len >= 0 && !got_int) {

View File

@ -556,6 +556,7 @@ EXTERN char *p_mp; ///< 'makeprg'
EXTERN char *p_mps; ///< 'matchpairs' EXTERN char *p_mps; ///< 'matchpairs'
EXTERN OptInt p_mat; ///< 'matchtime' EXTERN OptInt p_mat; ///< 'matchtime'
EXTERN OptInt p_mco; ///< 'maxcombine' EXTERN OptInt p_mco; ///< 'maxcombine'
#define MAX_MCO 6 // fixed value for 'maxcombine'
EXTERN OptInt p_mfd; ///< 'maxfuncdepth' EXTERN OptInt p_mfd; ///< 'maxfuncdepth'
EXTERN OptInt p_mmd; ///< 'maxmapdepth' EXTERN OptInt p_mmd; ///< 'maxmapdepth'
EXTERN OptInt p_mmp; ///< 'maxmempattern' EXTERN OptInt p_mmp; ///< 'maxmempattern'

View File

@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word)
static bool similar_chars(slang_T *slang, int c1, int c2) static bool similar_chars(slang_T *slang, int c1, int c2)
{ {
int m1, m2; int m1, m2;
char buf[MB_MAXBYTES + 1]; char buf[MB_MAXCHAR + 1];
hashitem_T *hi; hashitem_T *hi;
if (c1 >= 256) { if (c1 >= 256) {

View File

@ -1102,8 +1102,6 @@ describe("folded lines", function()
end) end)
it("works with multibyte text", function() it("works with multibyte text", function()
-- Currently the only allowed value of 'maxcombine'
eq(6, meths.get_option_value('maxcombine', {}))
eq(true, meths.get_option_value('arabicshape', {})) eq(true, meths.get_option_value('arabicshape', {}))
insert([[ insert([[
å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة
@ -1120,7 +1118,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
å x̎͂̀̂͛͛ َََِّ | å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ |
möre tex^t | möre tex^t |
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1132,7 +1130,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
å x̎͂̀̂͛͛ َََِّ | å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ |
möre tex^t | möre tex^t |
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1156,7 +1154,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{5:^+-- 2 lines: å x̎͂̀̂͛͛ َََِّ·················}| {5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ·················}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1168,7 +1166,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{5:^+-- 2 lines: å x̎͂̀̂͛͛ َََِّ·················}| {5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ·················}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1192,7 +1190,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة·················}| {5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1204,7 +1202,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة·················}| {5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1228,7 +1226,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة···········}| {7:+ }{8: 1 }{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1240,7 +1238,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة···········}| {7:+ }{8: 1 }{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
{1:~ }| {1:~ }|
@ -1265,7 +1263,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}{8: 1 }{7: +}| {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1277,7 +1275,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}{8: 1 }{7: +}| {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1301,7 +1299,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}| {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1313,7 +1311,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}| {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1337,7 +1335,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
{5:·················َََِّ x̎͂̀̂͛͛ å :senil 2 --^+}| {5:·················َََِّ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1349,7 +1347,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
{5:·················َََِّ x̎͂̀̂͛͛ å :senil 2 --^+}| {5:·················َََِّ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1373,7 +1371,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
َََِّ^ x̎͂̀̂͛͛ å| َََِّ^ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm| txet eröm|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1385,7 +1383,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
َََِّ^ x̎͂̀̂͛͛ å| َََِّ^ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm| txet eröm|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1409,7 +1407,7 @@ describe("folded lines", function()
[2:---------------------------------------------]| [2:---------------------------------------------]|
[3:---------------------------------------------]| [3:---------------------------------------------]|
## grid 2 ## grid 2
ةيَّبِرَعَ^لا x̎͂̀̂͛͛ å| ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm| txet eröm|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|
@ -1421,7 +1419,7 @@ describe("folded lines", function()
]]) ]])
else else
screen:expect([[ screen:expect([[
ةيَّبِرَعَ^لا x̎͂̀̂͛͛ å| ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm| txet eröm|
{1: ~}| {1: ~}|
{1: ~}| {1: ~}|

View File

@ -228,6 +228,36 @@ describe("multibyte rendering", function()
]]} ]]}
end) end)
it('works with arabicshape and multiple composing chars', function()
-- This tests an important edge case: 'arabicshape' might increase the byte size of the base
-- character so that the last composing char no longer fits. Use "g8" on the text
-- to observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape').
-- If we were to increase the schar_t size, say from 32 to 64 bytes, we would need to extend
-- the test text with even more zalgo energy to still touch this edge case.
meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"})
command('set noarabicshape')
screen:expect{grid=[[
^سلام̀́̂̃̄̅̆̇̈̉̊̋̌ |
{1:~ }|
{1:~ }|
{1:~ }|
{1:~ }|
|
]]}
command('set arabicshape')
screen:expect{grid=[[
^̀́̂̃̄̅̆̇̈̉̊̋̌ |
{1:~ }|
{1:~ }|
{1:~ }|
{1:~ }|
|
]]}
end)
end) end)
describe('multibyte rendering: statusline', function() describe('multibyte rendering: statusline', function()

View File

@ -225,8 +225,8 @@ describe("shell command :!", function()
å | å |
ref: å̲ | ref: å̲ |
1: å̲ | 1: å̲ |
2: å ̲ | 2: å ̲ |
3: å ̲ | 3: å ̲ |
| |
{3:Press ENTER or type command to continue}^ | {3:Press ENTER or type command to continue}^ |
]]) ]])

View File

@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it)
local ffi = helpers.ffi local ffi = helpers.ffi
local eq = helpers.eq local eq = helpers.eq
local mbyte = helpers.cimport("./src/nvim/mbyte.h") local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
local charset = helpers.cimport('./src/nvim/charset.h')
describe('mbyte', function() describe('mbyte', function()
-- Array for composing characters
local intp = ffi.typeof('int[?]')
local function to_intp()
-- how to get MAX_MCO from globals.h?
return intp(7, 1)
end
-- Convert from bytes to string -- Convert from bytes to string
local function to_string(bytes) local function to_string(bytes)
local s = {} local s = {}
@ -30,14 +22,14 @@ describe('mbyte', function()
itp('utf_ptr2char', function() itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned. -- For strings with length 1 the first byte is returned.
for c = 0, 255 do for c = 0, 255 do
eq(c, mbyte.utf_ptr2char(to_string({c, 0}))) eq(c, lib.utf_ptr2char(to_string({c, 0})))
end end
-- Some ill formed byte sequences that should not be recognized as UTF-8 -- Some ill formed byte sequences that should not be recognized as UTF-8
-- First byte: 0xc0 or 0xc1 -- First byte: 0xc0 or 0xc1
-- Second byte: 0x80 .. 0xbf -- Second byte: 0x80 .. 0xbf
--eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80}))) --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf}))) --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
-- --
-- Sequences with more than four bytes -- Sequences with more than four bytes
end) end)
@ -47,240 +39,133 @@ describe('mbyte', function()
local char_p = ffi.typeof('char[?]') local char_p = ffi.typeof('char[?]')
for c = n * 0x1000, n * 0x1000 + 0xFFF do for c = n * 0x1000, n * 0x1000 + 0xFFF do
local p = char_p(4, 0) local p = char_p(4, 0)
mbyte.utf_char2bytes(c, p) lib.utf_char2bytes(c, p)
eq(c, mbyte.utf_ptr2char(p)) eq(c, lib.utf_ptr2char(p))
eq(charset.vim_iswordc(c), charset.vim_iswordp(p)) eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
end end
end) end)
end end
describe('utfc_ptr2char_len', function() describe('utfc_ptr2schar_len', function()
-- Feed the byte list `seq` through lib.utfc_ptr2schar_len and unpack the result
-- into plain Lua values so it can be compared with `eq`.
--
-- Returns a two-element table:
--   [1] the bytes lib.schar_get wrote into a 32-byte buffer, read back with
--       ffi.string (i.e. up to the first NUL byte)
--   [2] the integer the call stored in its `int[1]` out-parameter
local function test_seq(seq)
  local first_out = ffi.new("int[1]")
  local text_out = ffi.new("char[32]")
  -- Pass the sequence length explicitly (#seq) alongside the converted string.
  local packed = lib.utfc_ptr2schar_len(to_string(seq), #seq, first_out)
  lib.schar_get(text_out, packed)
  return {ffi.string(text_out), first_out[0]}
end
-- Build the expected `test_seq` result for a character that is a single byte:
-- a table holding the one-byte string for `val` followed by `val` itself.
local function byte(val)
  local as_text = string.char(val)
  return {as_text, val}
end
itp('1-byte sequences', function() itp('1-byte sequences', function()
local pcc = to_intp() eq({'', 0}, test_seq{0})
for c = 0, 255 do for c = 1, 127 do
eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1)) eq(byte(c), test_seq{c})
eq(0, pcc[0]) end
for c = 128, 255 do
eq({'', c}, test_seq{c})
end end
end) end)
itp('2-byte sequences', function() itp('2-byte sequences', function()
local pcc = to_intp()
-- No combining characters -- No combining characters
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2)) eq(byte(0x7f), test_seq{0x7f, 0x7f})
eq(0, pcc[0])
-- No combining characters -- No combining characters
pcc = to_intp() eq(byte(0x7f), test_seq{0x7f, 0x80})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
eq(0, pcc[0])
-- No UTF-8 sequence -- No UTF-8 sequence
pcc = to_intp() eq({'', 0xc2}, test_seq{0xc2, 0x7f})
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
eq(0, pcc[0])
-- One UTF-8 character -- One UTF-8 character
pcc = to_intp() eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
eq(0, pcc[0])
-- No UTF-8 sequence -- No UTF-8 sequence
pcc = to_intp() eq({'', 0xc2}, test_seq{0xc2, 0xc0})
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
eq(0, pcc[0])
end) end)
itp('3-byte sequences', function() itp('3-byte sequences', function()
local pcc = to_intp()
-- No second UTF-8 character -- No second UTF-8 character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3)) eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
eq(0, pcc[0])
-- No combining character -- No combining character
pcc = to_intp() eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
eq(0, pcc[0])
-- Combining character is U+0300 -- Combining character is U+0300
pcc = to_intp() eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
-- No UTF-8 sequence -- No UTF-8 sequence
pcc = to_intp() eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
eq(0, pcc[0])
-- Incomplete combining character -- Incomplete combining character
pcc = to_intp() eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
eq(0, pcc[0])
-- One UTF-8 character -- One UTF-8 character (composing only)
pcc = to_intp() eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
eq(0, pcc[0])
end) end)
itp('4-byte sequences', function() itp('4-byte sequences', function()
local pcc = to_intp()
-- No following combining character -- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4)) eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
eq(0, pcc[0])
-- No second UTF-8 character -- No second UTF-8 character
pcc = to_intp() eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
-- Combining character U+0300 -- Combining character U+0300
pcc = to_intp() eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
-- No UTF-8 sequence -- No UTF-8 sequence
pcc = to_intp() eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
-- No following UTF-8 character -- No following UTF-8 character
pcc = to_intp() eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
eq(0, pcc[0])
-- Combining character U+0301 -- Combining character U+0301
pcc = to_intp() eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
-- One UTF-8 character -- One UTF-8 character
pcc = to_intp() eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
eq(0, pcc[0])
end) end)
itp('5+-byte sequences', function() itp('5+-byte sequences', function()
local pcc = to_intp()
-- No following combining character -- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
eq(0, pcc[0])
-- No second UTF-8 character -- No second UTF-8 character
pcc = to_intp() eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
-- Combining character U+0300 -- Combining character U+0300
pcc = to_intp() eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
-- Combining characters U+0300 and U+0301 -- Combining characters U+0300 and U+0301
pcc = to_intp() eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
-- Combining characters U+0300, U+0301, U+0302 -- Combining characters U+0300, U+0301, U+0302
pcc = to_intp() eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
-- Combining characters U+0300, U+0301, U+0302, U+0303 -- Combining characters U+0300, U+0301, U+0302, U+0303
pcc = to_intp() eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0000, pcc[4])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
pcc = to_intp() eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string( -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11)) eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0000, pcc[5])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
-- U+0305
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(1, pcc[6])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
-- U+0305, U+0306, but only save six (= MAX_MCO). eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(0x0001, pcc[6])
-- Only three following combining characters U+0300, U+0301, U+0302 -- Only three following combining characters U+0300, U+0301, U+0302
pcc = to_intp() eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
-- No UTF-8 sequence -- No UTF-8 sequence
pcc = to_intp() eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
-- No following UTF-8 character -- No following UTF-8 character
pcc = to_intp() eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
eq(0, pcc[0])
-- Combining character U+0301 -- Combining character U+0301
pcc = to_intp() eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
-- Combining character U+0301 -- Combining character U+0301
pcc = to_intp() eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
-- One UTF-8 character -- One UTF-8 character
pcc = to_intp() eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
eq(0, pcc[0])
-- One UTF-8 character -- One UTF-8 character
pcc = to_intp() eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
-- One UTF-8 character -- One UTF-8 character
pcc = to_intp() eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
eq(0, pcc[0])
-- Combining characters U+1AB0 and U+0301 -- Combining characters U+1AB0 and U+0301
pcc = to_intp() eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
eq(0x1ab0, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
end) end)
end) end)