Merge pull request #25934 from bfredl/screenlinechar

refactor(grid): make screen rendering more multibyte than ever before
This commit is contained in:
bfredl 2023-11-17 13:38:10 +01:00 committed by GitHub
commit 7af89ef464
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 399 additions and 602 deletions

View File

@ -646,7 +646,8 @@ widespread as file format.
A composing or combining character is used to change the meaning of the
character before it. The combining characters are drawn on top of the
preceding character.
Up to six combining characters can be displayed.
Combined characters that are too big cannot be displayed, but they can still be
inspected using the |g8| and |ga| commands described below.
When editing text a composing character is mostly considered part of the
preceding character. For example "x" will delete a character and its
following composing characters by default.

View File

@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior.
Note that syntax highlighting of code examples requires a matching parser
and may be affected by custom queries.
• Support for rendering multibyte characters using composing characters has been
enhanced. The maximum limit has been increased from 1+6 codepoints to
31 bytes, which is guaranteed to fit all chars from before but often more.
NOTE: the regexp engine still has a hard-coded limit of considering
6 composing chars only.
==============================================================================
REMOVED FEATURES *news-removed*

View File

@ -722,9 +722,16 @@ Options:
<
*'macatsui'*
*'maxcombine'* *'mco'*
Nvim always displays up to 6 combining characters. You can still edit
text with more than 6 combining characters, you just can't see them.
Use |g8| or |ga|. See |mbyte-combining|.
Nvim counts maximum character sizes in bytes, not codepoints. This is
guaranteed to be big enough to always fit all chars properly displayed
in vim with 'maxcombine' set to 6.
You can still edit text with larger characters than fit in the screen buffer,
you just can't see them. Use |g8| or |ga|. See |mbyte-combining|.
NOTE: the regexp engine still has a hard-coded limit of considering
6 composing chars only.
*'maxmem'* Nvim delegates memory-management to the OS.
*'maxmemtot'* Nvim delegates memory-management to the OS.
printoptions

View File

@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg
--- security reasons.
---
--- @type boolean
vim.o.fsync = false
vim.o.fsync = true
vim.o.fs = vim.o.fsync
vim.go.fsync = vim.o.fsync
vim.go.fs = vim.go.fsync

View File

@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len)
/// convert bytes to a character.
void ins_char(int c)
{
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
size_t n = (size_t)utf_char2bytes(c, buf);
// When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte.
@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
// If 'delcombine' is set and deleting (less than) one character, only
// delete the last combining character.
if (p_deco && use_delcombine
&& utfc_ptr2len(oldp + col) >= count) {
int cc[MAX_MCO];
(void)utfc_ptr2char(oldp + col, cc);
if (cc[0] != NUL) {
if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
char *p0 = oldp + col;
if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
// Find the last composing char, there can be several.
int n = col;
do {

View File

@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab)
while (*p) {
const size_t l = (size_t)utfc_ptr2len(p);
if (l > 1) {
int pcc[MAX_MCO + 1];
pcc[0] = utfc_ptr2char(p, &pcc[1]);
if (vim_isprintc(pcc[0])) {
if (vim_isprintc(utf_ptr2char(p))) {
len += l;
} else {
for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
int c = utf_ptr2char(p + off);
char hexbuf[9];
len += transchar_hex(hexbuf, pcc[i]);
len += transchar_hex(hexbuf, c);
}
}
p += l;
@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co
if (buf_p + l > buf_e) {
break; // Exceeded `buf` size.
}
int pcc[MAX_MCO + 1];
pcc[0] = utfc_ptr2char(p, &pcc[1]);
if (vim_isprintc(pcc[0])) {
if (vim_isprintc(utf_ptr2char(p))) {
memmove(buf_p, p, l);
buf_p += l;
} else {
for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
int c = utf_ptr2char(p + off);
char hexbuf[9]; // <up to 6 bytes>NUL
const size_t hexlen = transchar_hex(hexbuf, pcc[i]);
const size_t hexlen = transchar_hex(hexbuf, c);
if (buf_p + hexlen > buf_e) {
break;
}

View File

@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n)
bool check_digraph_chars_valid(int char1, int char2)
{
if (char2 == 0) {
char msg[MB_MAXBYTES + 1];
char msg[MB_MAXCHAR + 1];
msg[utf_char2bytes(char1, msg)] = NUL;
semsg(_(e_digraph_must_be_just_two_characters_str), msg);
return false;

View File

@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
const char *p = *pp;
int cells = utf_ptr2cells(p);
int c_len = utfc_ptr2len(p);
int u8c, u8cc[MAX_MCO];
assert(maxcells > 0);
if (cells > maxcells) {
dest[0] = schar_from_ascii(' ');
return 1;
}
u8c = utfc_ptr2char(p, u8cc);
if (*p == TAB) {
cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells);
}
@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
for (int c = 0; c < cells; c++) {
dest[c] = schar_from_ascii(' ');
}
goto done;
} else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) {
dest[0] = schar_from_ascii(*p);
} else {
dest[0] = schar_from_cc(u8c, u8cc);
}
int u8c;
dest[0] = utfc_ptr2schar(p, &u8c);
if (cells > 1) {
dest[1] = 0;
}
done:
}
*pp += c_len;
return cells;
}
@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t
}
}
static bool check_mb_utf8(int *c, int *u8cc)
{
if (utf_char2len(*c) > 1) {
*u8cc = 0;
*c = 0xc0;
return true;
}
return false;
}
static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line)
{
colnr_T trailcol = MAXCOL;
@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
{
winlinevars_T wlv; // variables passed between functions
int c = 0; // init for GCC
colnr_T vcol_prev = -1; // "wlv.vcol" of previous character
char *line; // current line
char *ptr; // current position in "line"
@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
int multi_attr = 0; // attributes desired by multibyte
int mb_l = 1; // multi-byte byte length
int mb_c = 0; // decoded multi-byte character
bool mb_utf8 = false; // screen char is UTF-8 char
int u8cc[MAX_MCO]; // composing UTF-8 chars
schar_T mb_schar; // complete screen char
int change_start = MAXCOL; // first col of changed area
int change_end = -1; // last col of changed area
bool in_multispace = false; // in multiple consecutive spaces
@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// For the '$' of the 'list' option, n_extra == 1, p_extra == "".
if (wlv.n_extra > 0) {
if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) {
c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
mb_c = c; // doesn't handle non-utf-8 multi-byte!
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
mb_schar = schar_from_char(mb_c);
wlv.n_extra--;
} else {
assert(wlv.p_extra != NULL);
c = (uint8_t)(*wlv.p_extra);
mb_c = c;
// If the UTF-8 character is more than one byte:
// Decode it into "mb_c".
mb_l = utfc_ptr2len(wlv.p_extra);
mb_utf8 = false;
if (mb_l > wlv.n_extra) {
mb_l = 1;
} else if (mb_l > 1) {
mb_c = utfc_ptr2char(wlv.p_extra, u8cc);
mb_utf8 = true;
c = 0xc0;
}
if (mb_l == 0) { // at the NUL at end-of-line
mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c);
// mb_l=0 at the end-of-line NUL
if (mb_l > wlv.n_extra || mb_l == 0) {
mb_l = 1;
}
// If a double-width char doesn't fit display a '>' in the last column.
// Don't advance the pointer but put the character at the start of the next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
c = '>';
mb_c = c;
mb_c = '>';
mb_l = 1;
(void)mb_l;
mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT);
if (wlv.cul_attr) {
@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
? hl_combine_attr(wlv.cul_attr, multi_attr)
: hl_combine_attr(multi_attr, wlv.cul_attr);
}
// put the pointer back to output the double-width
// character at the start of the next line.
wlv.n_extra++;
wlv.p_extra--;
} else {
wlv.n_extra -= mb_l - 1;
wlv.p_extra += mb_l - 1;
wlv.n_extra -= mb_l;
wlv.p_extra += mb_l;
}
wlv.p_extra++;
}
wlv.n_extra--;
// Only restore search_attr and area_attr after "n_extra" in
// the next screen line is also done.
@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
} else if (has_fold) {
// skip writing the buffer line itself
c = NUL;
mb_c = NUL;
} else {
int c0;
char *prev_ptr = ptr;
// Get a character from the line itself.
c0 = c = (uint8_t)(*ptr);
mb_c = c;
if (c == NUL) {
// first byte of next char
int c0 = (uint8_t)(*ptr);
if (c0 == NUL) {
// no more cells to skip
wlv.skip_cells = 0;
}
// If the UTF-8 character is more than one byte: Decode it
// into "mb_c".
// Get a character from the line itself.
mb_l = utfc_ptr2len(ptr);
mb_utf8 = false;
if (mb_l > 1) {
mb_c = utfc_ptr2char(ptr, u8cc);
mb_schar = utfc_ptr2schar(ptr, &mb_c);
// Overlong encoded ASCII or ASCII with composing char
// is displayed normally, except a NUL.
if (mb_c < 0x80) {
c0 = c = mb_c;
}
mb_utf8 = true;
// At start of the line we can have a composing char.
// Draw it as a space with a composing char.
if (utf_iscomposing(mb_c)) {
for (int i = MAX_MCO - 1; i > 0; i--) {
u8cc[i] = u8cc[i - 1];
}
u8cc[0] = mb_c;
mb_c = ' ';
}
if (mb_l > 1 && mb_c < 0x80) {
c0 = mb_c;
}
if ((mb_l == 1 && c >= 0x80)
if ((mb_l == 1 && c0 >= 0x80)
|| (mb_l >= 1 && mb_c == 0)
|| (mb_l > 1 && (!vim_isprintc(mb_c)))) {
// Illegal UTF-8 byte: display as <xx>.
// Non-BMP character : display as ? or fullwidth ?.
// Non-printable character : display as ? or fullwidth ?.
transchar_hex(wlv.extra, mb_c);
if (wp->w_p_rl) { // reverse
rl_mirror_ascii(wlv.extra, NULL);
}
wlv.p_extra = wlv.extra;
c = (uint8_t)(*wlv.p_extra);
mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra);
mb_utf8 = (c >= 0x80);
mb_schar = schar_from_char(mb_c);
wlv.n_extra = (int)strlen(wlv.p_extra);
wlv.c_extra = NUL;
wlv.c_final = NUL;
@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// last column; the character is displayed at the start of the
// next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
c = '>';
mb_c = c;
mb_utf8 = false;
mb_c = '>';
mb_l = 1;
mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT);
// Put pointer back so that the character will be
// displayed at the start of the next line.
@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 1;
wlv.c_extra = MB_FILLER_CHAR;
wlv.c_final = NUL;
c = ' ';
mb_c = ' ';
mb_l = 1;
mb_schar = schar_from_ascii(mb_c);
if (area_attr == 0 && search_attr == 0) {
wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
saved_attr2 = wlv.char_attr; // save current attr
}
mb_c = c;
mb_utf8 = false;
mb_l = 1;
}
ptr++;
@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// no concealing past the end of the line, it interferes
// with line highlighting.
if (c == NUL) {
syntax_flags = 0;
} else {
syntax_flags = get_syntax_info(&syntax_seqnr);
}
syntax_flags = (mb_c == 0) ? 0 : get_syntax_info(&syntax_seqnr);
}
if (has_decor && v > 0) {
@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
spell_attr = 0;
// do not calculate cap_col at the end of the line or when
// only white space is following
if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
char *p;
hlf_T spell_hlf = HLF_COUNT;
v -= mb_l - 1;
@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
//
// So only allow to linebreak, once we have found chars not in
// 'breakat' in the line.
if (wp->w_p_lbr && !wlv.need_lbr && c != NUL
if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL
&& !vim_isbreak((uint8_t)(*ptr))) {
wlv.need_lbr = true;
}
// Found last space before word: check for line break.
if (wp->w_p_lbr && c0 == c && wlv.need_lbr
&& vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) {
if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr
&& vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) {
int mb_off = utf_head_off(line, ptr - 1);
char *p = ptr - (mb_off + 1);
chartabsize_T cts;
@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1;
clear_chartabsize_arg(&cts);
if (on_last_col && c != TAB) {
if (on_last_col && mb_c != TAB) {
// Do not continue search/match highlighting over the
// line break, but for TABs the highlighting should
// include the complete width of the character
search_attr = 0;
}
if (c == TAB && wlv.n_extra + wlv.col > grid->cols) {
if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) {
wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts,
wp->w_buffer->b_p_vts_array) - 1;
}
wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' ';
wlv.c_final = NUL;
if (ascii_iswhite(c)) {
if (c == TAB) {
if (mb_c < 128 && ascii_iswhite(mb_c)) {
if (mb_c == TAB) {
// See "Tab alignment" below.
FIX_FOR_BOGUSCOLS;
}
if (!wp->w_p_list) {
c = ' ';
mb_c = ' ';
mb_schar = schar_from_ascii(mb_c);
}
}
}
if (wp->w_p_list) {
in_multispace = c == ' ' && (*ptr == ' '
|| (prev_ptr > line && prev_ptr[-1] == ' '));
in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' '));
if (!in_multispace) {
multispace_pos = 0;
}
@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// But not when the character is followed by a composing
// character (use mb_l to check that).
if (wp->w_p_list
&& ((((c == 160 && mb_l == 1)
|| (mb_utf8
&& ((mb_c == 160 && mb_l == 2)
|| (mb_c == 0x202f && mb_l == 3))))
&& ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3))
&& wp->w_p_lcs_chars.nbsp)
|| (c == ' '
|| (mb_c == ' '
&& mb_l == 1
&& (wp->w_p_lcs_chars.space
|| (in_multispace && wp->w_p_lcs_chars.multispace != NULL))
&& ptr - line >= leadcol
&& ptr - line <= trailcol))) {
if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) {
c = wp->w_p_lcs_chars.multispace[multispace_pos++];
mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++];
if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) {
multispace_pos = 0;
}
} else {
c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
mb_c = (mb_c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
}
wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_schar = schar_from_char(mb_c);
}
if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol)
if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol)
|| (leadcol != 0 && ptr < line + leadcol))) {
if (leadcol != 0 && in_multispace && ptr < line + leadcol
&& wp->w_p_lcs_chars.leadmultispace != NULL) {
c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) {
multispace_pos = 0;
}
} else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) {
c = wp->w_p_lcs_chars.trail;
mb_c = wp->w_p_lcs_chars.trail;
} else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) {
c = wp->w_p_lcs_chars.lead;
mb_c = wp->w_p_lcs_chars.lead;
} else if (leadcol != 0 && wp->w_p_lcs_chars.space) {
c = wp->w_p_lcs_chars.space;
mb_c = wp->w_p_lcs_chars.space;
}
wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_schar = schar_from_char(mb_c);
}
}
// Handling of non-printable characters.
if (!vim_isprintc(c)) {
if (!vim_isprintc(mb_c)) {
// when getting a character from the file, we may have to
// turn it into something else on the way to putting it on the screen.
if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
int tab_len = 0;
colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length
char *const sbr = get_showbreak_value(wp);
@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wlv.n_extra > 0) {
len += wlv.n_extra - tab_len;
}
c = wp->w_p_lcs_chars.tab1;
mb_c = wp->w_p_lcs_chars.tab1;
p = get_extra_buf((size_t)len + 1);
memset(p, ' ', (size_t)len);
p[len] = NUL;
@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
}
mb_utf8 = false; // don't draw as UTF-8
if (wp->w_p_list) {
c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
? wp->w_p_lcs_chars.tab3
: wp->w_p_lcs_chars.tab1;
mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1;
if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) {
wlv.c_extra = NUL; // using p_extra from above
} else {
@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = tab_len + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
} else {
wlv.c_final = NUL;
wlv.c_extra = ' ';
c = ' ';
mb_c = ' ';
}
} else if (c == NUL
mb_schar = schar_from_char(mb_c);
} else if (mb_c == NUL
&& (wp->w_p_list
|| ((wlv.fromcol >= 0 || fromcol_prev >= 0)
&& wlv.tocol > wlv.vcol
@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 0;
}
if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) {
c = wp->w_p_lcs_chars.eol;
mb_c = wp->w_p_lcs_chars.eol;
} else {
c = ' ';
mb_c = ' ';
}
lcs_eol_one = -1;
ptr--; // put it back at the NUL
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
wlv.n_attr = 1;
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
} else if (c != NUL) {
wlv.p_extra = transchar_buf(wp->w_buffer, c);
mb_schar = schar_from_char(mb_c);
} else if (mb_c != NUL) {
wlv.p_extra = transchar_buf(wp->w_buffer, mb_c);
if (wlv.n_extra == 0) {
wlv.n_extra = byte2cells(c) - 1;
wlv.n_extra = byte2cells(mb_c) - 1;
}
if ((dy_flags & DY_UHEX) && wp->w_p_rl) {
rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>"
@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wp->w_p_lbr) {
char *p;
c = (uint8_t)(*wlv.p_extra);
mb_c = (uint8_t)(*wlv.p_extra);
p = get_extra_buf((size_t)wlv.n_extra + 1);
memset(p, ' ', (size_t)wlv.n_extra);
strncpy(p, // NOLINT(runtime/printf)
@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
p[wlv.n_extra] = NUL;
wlv.p_extra = p;
} else {
wlv.n_extra = byte2cells(c) - 1;
c = (uint8_t)(*wlv.p_extra++);
wlv.n_extra = byte2cells(mb_c) - 1;
mb_c = (uint8_t)(*wlv.p_extra++);
}
wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_8);
saved_attr2 = wlv.char_attr; // save current attr
mb_utf8 = false; // don't draw as UTF-8
mb_schar = schar_from_ascii(mb_c);
} else if (VIsual_active
&& (VIsual_mode == Ctrl_V || VIsual_mode == 'v')
&& virtual_active()
&& wlv.tocol != MAXCOL
&& wlv.vcol < wlv.tocol
&& wlv.col < grid->cols) {
c = ' ';
mb_c = ' ';
mb_schar = schar_from_char(mb_c);
ptr--; // put it back at the NUL
}
}
@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// First time at this concealed item: display one
// character.
if (has_match_conc && match_conc) {
c = match_conc;
mb_c = match_conc;
} else if (decor_conceal && decor_state.conceal_char) {
c = decor_state.conceal_char;
mb_c = decor_state.conceal_char;
if (decor_state.conceal_attr) {
wlv.char_attr = decor_state.conceal_attr;
}
} else if (syn_get_sub_char() != NUL) {
c = syn_get_sub_char();
mb_c = syn_get_sub_char();
} else if (wp->w_p_lcs_chars.conceal != NUL) {
c = wp->w_p_lcs_chars.conceal;
mb_c = wp->w_p_lcs_chars.conceal;
} else {
c = ' ';
mb_c = ' ';
}
prev_syntax_id = syntax_seqnr;
@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
is_concealing = true;
wlv.skip_cells = 1;
}
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_schar = schar_from_char(mb_c);
} else {
prev_syntax_id = 0;
is_concealing = false;
@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
&& (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0)
&& wlv.filler_todo <= 0
&& wlv.draw_state > WL_STC
&& c != NUL) {
c = wp->w_p_lcs_chars.prec;
&& mb_c != NUL) {
mb_c = wp->w_p_lcs_chars.prec;
lcs_prec_todo = NUL;
if (utf_char2cells(mb_c) > 1) {
// Double-width character being overwritten by the "precedes"
@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = 2;
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
}
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_schar = schar_from_char(mb_c);
saved_attr3 = wlv.char_attr; // save current attr
wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr
n_attr3 = 1;
}
// At end of the text line or just after the last character.
if (c == NUL && eol_hl_off == 0) {
if (mb_c == NUL && eol_hl_off == 0) {
// flag to indicate whether prevcol equals startcol of search_hl or
// one of the matches
bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl,
@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
// At end of the text line.
if (c == NUL) {
if (mb_c == NUL) {
// Highlight 'cursorcolumn' & 'colorcolumn' past end of the line.
if (wp->w_p_wrap) {
v = wlv.startrow == 0 ? wp->w_skipcol : 0;
@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
|| lcs_eol_one > 0
|| (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL))
|| has_more_inline_virt(&wlv, v)) {
c = wp->w_p_lcs_chars.ext;
mb_c = wp->w_p_lcs_chars.ext;
wlv.char_attr = win_hl_attr(wp, HLF_AT);
mb_c = c;
mb_utf8 = check_mb_utf8(&c, u8cc);
mb_schar = schar_from_char(mb_c);
}
}
@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// Skip characters that are left of the screen for 'nowrap'.
if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) {
// Store the character.
if (mb_utf8) {
linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc);
} else {
linebuf_char[wlv.off] = schar_from_ascii((char)c);
}
linebuf_char[wlv.off] = mb_schar;
if (multi_attr) {
linebuf_attr[wlv.off] = multi_attr;
multi_attr = 0;

View File

@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight)
pc_status = PC_STATUS_SET;
}
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr);
grid_line_flush();
}
@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent)
int cc;
if ((cc = utf_char2len(c)) > 1) {
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf);
buf[cc] = NUL;
@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
int cc;
int temp = 0; // init for GCC
bool did_backspace = false;
int cpc[MAX_MCO]; // composing characters
bool call_fix_indent = false;
// can't delete anything in an empty file
@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
if (State & REPLACE_FLAG) {
replace_do_bs(-1);
} else {
const int l_p_deco = p_deco;
if (l_p_deco) {
(void)utfc_ptr2char(get_cursor_pos_ptr(), cpc);
bool has_composing = false;
if (p_deco) {
char *p0 = get_cursor_pos_ptr();
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
}
(void)del_char(false);
// If there are combining characters and 'delcombine' is set
// move the cursor back. Don't back up before the base
// character.
if (l_p_deco && cpc[0] != NUL) {
// move the cursor back. Don't back up before the base character.
if (has_composing) {
inc_cursor();
}
if (revins_chars) {

View File

@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE
/// Set v:char to character "c".
void set_vim_var_char(int c)
{
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(c, buf)] = NUL;
set_vim_var_string(VV_CHAR, buf, -1);

View File

@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
return;
}
char buf[MB_MAXBYTES];
char buf[MB_MAXCHAR];
const int len = utf_char2bytes((int)num, buf);
rettv->v_type = VAR_STRING;
@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
c = -1;
} else {
char buf[MB_MAXBYTES + 1];
char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
c = utf_ptr2char(buf);
}
@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
ScreenGrid *grid;
screenchar_adjust(&grid, &row, &col);
tv_list_alloc_ret(rettv, kListLenMayKnow);
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
tv_list_alloc_ret(rettv, 0);
return;
}
char buf[MB_MAXBYTES + 1];
char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
int pcc[MAX_MCO];
int c = utfc_ptr2char(buf, pcc);
int composing_len = 0;
while (composing_len < MAX_MCO && pcc[composing_len] != 0) {
composing_len++;
}
tv_list_alloc_ret(rettv, composing_len + 1);
// schar values are already processed chars which are always NUL-terminated.
// A single [0] is expected when char is NUL.
size_t i = 0;
do {
int c = utf_ptr2char(buf + i);
tv_list_append_number(rettv->vval.v_list, c);
for (int i = 0; i < composing_len; i++) {
tv_list_append_number(rettv->vval.v_list, pcc[i]);
}
i += (size_t)utf_ptr2len(buf + i);
} while (buf[i] != NUL);
}
/// "screencol()" function
@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr
return;
}
char buf[MB_MAXBYTES + 1];
char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
rettv->vval.v_string = xstrdup(buf);
}
@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt
char *const csearch = tv_dict_get_string(d, "char", false);
if (csearch != NULL) {
int pcc[MAX_MCO];
const int c = utfc_ptr2char(csearch, pcc);
int c = utf_ptr2char(csearch);
set_last_csearch(c, csearch, utfc_ptr2len(csearch));
}

View File

@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[]
/// ":ascii" and "ga" implementation
void do_ascii(exarg_T *eap)
{
char *dig;
int cc[MAX_MCO];
int c = utfc_ptr2char(get_cursor_pos_ptr(), cc);
if (c == NUL) {
char *data = get_cursor_pos_ptr();
size_t len = (size_t)utfc_ptr2len(data);
if (len == 0) {
msg("NUL", 0);
return;
}
size_t iobuff_len = 0;
bool need_clear = true;
msg_sb_eol();
msg_start();
int ci = 0;
int c = utf_ptr2char(data);
size_t off = 0;
// TODO(bfredl): merge this with the main loop
if (c < 0x80) {
if (c == NL) { // NUL is stored as NL.
c = NUL;
@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap)
char buf2[20];
buf2[0] = NUL;
dig = get_digraph_for_char(cval);
char *dig = get_digraph_for_char(cval);
if (dig != NULL) {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
sizeof(IObuff) - iobuff_len,
vim_snprintf(IObuff, sizeof(IObuff),
_("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"),
transchar(c), buf1, buf2, cval, cval, cval, dig);
} else {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
sizeof(IObuff) - iobuff_len,
vim_snprintf(IObuff, sizeof(IObuff),
_("<%s>%s%s %d, Hex %02x, Octal %03o"),
transchar(c), buf1, buf2, cval, cval, cval);
}
c = cc[ci++];
msg_multiline(IObuff, 0, true, &need_clear);
off += (size_t)utf_ptr2len(data); // needed for overlong ascii?
}
#define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1)
// Space for description:
// - 1 byte for separator (starting from second entry)
// - 1 byte for "<"
// - 1 byte for space to draw composing character on (optional, but really
// mostly required)
// - up to MB_MAXBYTES bytes for character itself
// - 16 bytes for raw text ("> , Hex , Octal ").
// - at least 4 bytes for hexadecimal representation
// - at least 3 bytes for decimal representation
// - at least 3 bytes for octal representation
// - 1 byte for NUL
//
// Taking into account MAX_MCO and characters which need 8 bytes for
// hexadecimal representation, but not taking translation into account:
// resulting string will occupy less then 400 bytes (conservative estimate).
//
// Less then 1000 bytes if translation multiplies number of bytes needed for
// raw text by 6, so it should always fit into 1025 bytes reserved for IObuff.
// Repeat for combining characters, also handle multibyte here.
while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) {
while (off < len) {
c = utf_ptr2char(data + off);
size_t iobuff_len = 0;
// This assumes every multi-byte char is printable...
if (iobuff_len > 0) {
if (off > 0) {
IObuff[iobuff_len++] = ' ';
}
IObuff[iobuff_len++] = '<';
@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap)
}
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
dig = get_digraph_for_char(c);
char *dig = get_digraph_for_char(c);
if (dig != NULL) {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
sizeof(IObuff) - iobuff_len,
vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
(c < 0x10000
? _("> %d, Hex %04x, Oct %o, Digr %s")
: _("> %d, Hex %08x, Oct %o, Digr %s")),
c, c, c, dig);
} else {
iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
sizeof(IObuff) - iobuff_len,
vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
(c < 0x10000
? _("> %d, Hex %04x, Octal %o")
: _("> %d, Hex %08x, Octal %o")),
c, c, c);
}
if (ci == MAX_MCO) {
break;
}
c = cc[ci++];
}
if (ci != MAX_MCO && c != 0) {
xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len);
msg_multiline(IObuff, 0, true, &need_clear);
off += (size_t)utf_ptr2len(data + off); // needed for overlong ascii?
}
msg(IObuff, 0);
if (need_clear) {
msg_clr_eos();
}
msg_end();
}
/// ":left", ":center" and ":right": align text.

View File

@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off)
}
}
/// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell.
schar_T schar_from_cc(int c, int u8cc[MAX_MCO])
{
char buf[MAX_SCHAR_SIZE];
int len = utf_char2bytes(c, buf);
for (int i = 0; i < MAX_MCO; i++) {
if (u8cc[i] == 0) {
break;
}
len += utf_char2bytes(u8cc[i], buf + len);
}
buf[len] = 0;
return schar_from_buf(buf, (size_t)len);
}
schar_T schar_from_str(char *str)
{
if (str == NULL) {
@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols)
schar_get(scbuf, buf[i]);
char scbuf_new[MAX_SCHAR_SIZE];
int len = utf_char2bytes(c0new, scbuf_new);
size_t len = (size_t)utf_char2bytes(c0new, scbuf_new);
if (c1new) {
len += utf_char2bytes(c1new, scbuf_new + len);
len += (size_t)utf_char2bytes(c1new, scbuf_new + len);
}
int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0);
size_t rest = strlen(scbuf + off);
if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) {
// TODO(bfredl): this cannot happen just yet, as we only construct
// schar_T values with up to MAX_MCO+1 composing codepoints. When code
// is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit,
// we need be able to peel off a composing char which doesn't fit anymore.
abort();
if (rest + len + 1 > MAX_SCHAR_SIZE) {
// Too big, discard one code-point.
// This should be enough as c0 cannot grow more than from 2 to 4 bytes
// (base arabic to extended arabic)
rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
}
memcpy(scbuf_new + len, scbuf + off, rest);
buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest);
buf[i] = schar_from_buf(scbuf_new, len + rest);
next:
c0prev = c0;
@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row)
return grid->attrs[grid->line_offset[row]] < 0;
}
/// Get a single character directly from grid.chars into "bytes", which must
/// have a size of "MB_MAXBYTES + 1".
/// If "attrp" is not NULL, return the character's attribute in "*attrp".
/// Get a single character directly from grid.chars
///
/// @param[out] attrp set to the character's attribute (optional)
schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp)
{
grid_adjust(&grid, &row, &col);
@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
{
const char *ptr = text;
int len = textlen;
int u8cc[MAX_MCO];
assert(grid_line_grid);
int start_col = col;
int max_col = grid_line_maxcol;
while (col < max_col
&& (len < 0 || (int)(ptr - text) < len)
&& *ptr != NUL) {
while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
// check if this is the first byte of a multibyte
int mbyte_blen = len > 0
? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
: utfc_ptr2len(ptr);
int u8c = len >= 0
? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr))
: utfc_ptr2char(ptr, u8cc);
int mbyte_cells = utf_char2cells(u8c);
int firstc;
schar_T schar = len >= 0
? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
: utfc_ptr2schar(ptr, &firstc);
int mbyte_cells = utf_char2cells(firstc);
if (mbyte_cells > 2) {
mbyte_cells = 1;
u8c = 0xFFFD;
u8cc[0] = 0;
schar = schar_from_char(0xFFFD);
}
if (col + mbyte_cells > max_col) {
// Only 1 cell left, but character requires 2 cells:
// display a '>' in the last column to avoid wrapping.
u8c = '>';
u8cc[0] = 0;
schar = schar_from_ascii('>');
mbyte_cells = 1;
}
schar_T buf;
// TODO(bfredl): why not just keep the original byte sequence.
buf = schar_from_cc(u8c, u8cc);
// When at the start of the text and overwriting the right half of a
// two-cell character in the same grid, truncate that into a '>'.
if (ptr == text && col > grid_line_first && col < grid_line_last
@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
linebuf_char[col - 1] = schar_from_ascii('>');
}
linebuf_char[col] = buf;
linebuf_char[col] = schar;
linebuf_attr[col] = attr;
linebuf_vcol[col] = -1;
if (mbyte_cells == 2) {

View File

@ -7,8 +7,8 @@
#include "nvim/pos.h"
#include "nvim/types.h"
#define MAX_MCO 6 // fixed value for 'maxcombine'
// Includes final NUL. at least 4*(MAX_MCO+1)+1
// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
// ensures we can fit all composed chars which did fit before.
#define MAX_SCHAR_SIZE 32
// if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness)
@ -35,7 +35,7 @@ enum {
/// we can avoid sending bigger updates than necessary to the UI layer.
///
/// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can
/// contain up to MAX_MCO composing characters after the base character.
/// contain as many composing characters as fit in MAX_SCHAR_SIZE-1 bytes.
/// The composing characters are to be drawn on top of the original character.
/// The content after the NUL is not defined (so comparison must be done a
/// single cell at a time). Double-width characters are stored in the left cell,

View File

@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c)
return;
}
if ((cc = utf_char2len(c)) > 1) {
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf);
buf[cc] = NUL;

View File

@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
if (offset < 0 || offset > (intptr_t)s1_len) {
return luaL_error(lstate, "index out of range");
}
int head_offset = utf_cp_head_off(s1, s1 + offset - 1);
int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
lua_pushinteger(lstate, head_offset);
return 1;
}

View File

@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id);
if (cur->mit_conceal_char) {
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL;
tv_dict_add_str(dict, S_LEN("conceal"), buf);

View File

@ -48,6 +48,7 @@
#include "nvim/getchar.h"
#include "nvim/gettext.h"
#include "nvim/globals.h"
#include "nvim/grid.h"
#include "nvim/grid_defs.h"
#include "nvim/iconv.h"
#include "nvim/keycodes.h"
@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
return arabic_combine(utf_ptr2char(p1), c2);
}
/// Convert a UTF-8 string to a wide character
/// Get the screen char at the beginning of a string
///
/// Also gets up to #MAX_MCO composing characters.
/// Caller is expected to check for things like unprintable chars etc
/// If first char in string is a composing char, prepend a space to display it correctly.
///
/// @param[out] pcc Location where to store composing characters. Must have
/// space at least for #MAX_MCO + 1 elements.
/// If "p" starts with an invalid sequence, zero is returned.
///
/// @return leading character.
int utfc_ptr2char(const char *p, int *pcc)
/// @param[out] firstc (required) The first codepoint of the screen char,
/// or the first byte of an invalid sequence
///
/// @return the char
schar_T utfc_ptr2schar(const char *p, int *firstc)
FUNC_ATTR_NONNULL_ALL
{
int i = 0;
int c = utf_ptr2char(p);
int len = utf_ptr2len(p);
*firstc = c; // NOT optional, you are gonna need it
bool first_compose = utf_iscomposing(c);
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
// Only accept a composing char when the first char isn't illegal.
if ((len > 1 || (uint8_t)(*p) < 0x80)
&& (uint8_t)p[len] >= 0x80
&& utf_composinglike(p, p + len)) {
int cc = utf_ptr2char(p + len);
while (true) {
pcc[i++] = cc;
if (i == MAX_MCO) {
break;
}
len += utf_ptr2len(p + len);
if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
break;
}
}
if (len == 1 && (uint8_t)(*p) >= 0x80) {
return 0; // invalid sequence
}
if (i < MAX_MCO) { // last composing char must be 0
pcc[i] = 0;
}
return c;
return schar_from_buf_first(p, len, first_compose);
}
// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
// composing characters. Use no more than p[maxlen].
//
// @param [out] pcc: composing chars, last one is 0
int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
/// Get the screen char at the beginning of a string with length
///
/// Like utfc_ptr2schar but use no more than p[maxlen].
schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
FUNC_ATTR_NONNULL_ALL
{
assert(maxlen > 0);
int i = 0;
int len = utf_ptr2len_len(p, maxlen);
// Is it safe to use utf_ptr2char()?
bool safe = len > 1 && len <= maxlen;
int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
// Only accept a composing char when the first char isn't illegal.
if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
for (; i < MAX_MCO; i++) {
int len_cc = utf_ptr2len_len(p + len, maxlen - len);
safe = len_cc > 1 && len_cc <= maxlen - len;
if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
|| !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
break;
}
len += len_cc;
}
size_t len = (size_t)utf_ptr2len_len(p, maxlen);
if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
// invalid or truncated sequence
*firstc = (uint8_t)(*p);
return 0;
}
if (i < MAX_MCO) {
// last composing char must be 0
pcc[i] = 0;
}
int c = utf_ptr2char(p);
*firstc = c;
bool first_compose = utf_iscomposing(c);
maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
len = (size_t)utfc_ptr2len_len(p, maxlen);
return c;
#undef ISCOMPOSING
return schar_from_buf_first(p, len, first_compose);
}
/// Build a screen char from the first `len` bytes of `buf`.
///
/// When `first_compose` is true, a single space is placed in front of the
/// copied bytes before they are handed to schar_from_buf().
///
/// Caller must ensure there is space for `first_compose`, i.e. that
/// `len + 1` bytes fit in a buffer of MAX_SCHAR_SIZE bytes.
static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
{
  if (!first_compose) {
    return schar_from_buf(buf, len);
  }

  char padded[MAX_SCHAR_SIZE];
  padded[0] = ' ';
  memcpy(&padded[1], buf, len);
  return schar_from_buf(padded, len + 1);
}
/// Get the length of a UTF-8 byte sequence representing a single codepoint
@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
return 1;
}
// Check for composing characters. We can handle only the first six, but
// skip all of them (otherwise the cursor would get stuck).
// Check for composing characters.
int prevlen = 0;
while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
/// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
/// separately and returns a negative offset.
/// separately.
///
/// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
//
/// @return 0 if invalid sequence, else offset to previous codepoint
/// @return 0 if invalid sequence, else number of bytes to previous codepoint
int utf_cp_head_off(const char *base, const char *p)
{
int i;
@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
}
// Find the first character that is not 10xx.xxxx
for (i = 0; p - i > base; i--) {
if (((uint8_t)p[i] & 0xc0) != 0x80) {
for (i = 0; p - i >= base; i++) {
if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break;
}
}
// Find the last character that is 10xx.xxxx
for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {}
// Find the last character that is 10xx.xxxx (condition terminates on NUL)
int j = 1;
while (((uint8_t)p[j] & 0xc0) == 0x80) {
j++;
}
// Check for illegal sequence.
if (utf8len_tab[(uint8_t)p[i]] == 1) {
if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0;
}
return i;

View File

@ -7,6 +7,7 @@
#include "nvim/cmdexpand_defs.h"
#include "nvim/eval/typval_defs.h"
#include "nvim/func_attr.h"
#include "nvim/grid_defs.h"
#include "nvim/mbyte_defs.h"
#include "nvim/os/os_defs.h"
#include "nvim/types.h"

View File

@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0;
static void ui_ext_msg_set_pos(int row, bool scrolled)
{
char buf[MAX_MCO + 1];
char buf[MB_MAXCHAR + 1];
size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf);
buf[size] = '\0';
ui_call_msg_set_pos(msg_grid.handle, row, scrolled,
@ -1471,7 +1471,7 @@ void msg_putchar(int c)
void msg_putchar_attr(int c, int attr)
{
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
if (IS_SPECIAL(c)) {
buf[0] = (char)K_SPECIAL;
@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr)
mode_displayed = false;
}
// If the string starts with a composing character first draw a space on
// which the composing char can be drawn.
if (utf_iscomposing(utf_ptr2char(msgstr))) {
msg_puts_attr(" ", attr);
}
// Go over the string. Special characters are translated and printed.
// Normal characters are printed several at a time.
while (--len >= 0 && !got_int) {

View File

@ -556,6 +556,7 @@ EXTERN char *p_mp; ///< 'makeprg'
EXTERN char *p_mps; ///< 'matchpairs'
EXTERN OptInt p_mat; ///< 'matchtime'
EXTERN OptInt p_mco; ///< 'maxcombine'
#define MAX_MCO 6 // fixed value for 'maxcombine'
EXTERN OptInt p_mfd; ///< 'maxfuncdepth'
EXTERN OptInt p_mmd; ///< 'maxmapdepth'
EXTERN OptInt p_mmp; ///< 'maxmempattern'

View File

@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word)
static bool similar_chars(slang_T *slang, int c1, int c2)
{
int m1, m2;
char buf[MB_MAXBYTES + 1];
char buf[MB_MAXCHAR + 1];
hashitem_T *hi;
if (c1 >= 256) {

View File

@ -1102,8 +1102,6 @@ describe("folded lines", function()
end)
it("works with multibyte text", function()
-- Currently the only allowed value of 'maxcombine'
eq(6, meths.get_option_value('maxcombine', {}))
eq(true, meths.get_option_value('arabicshape', {}))
insert([[
å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة
@ -1120,7 +1118,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
å x̎͂̀̂͛͛ َََِّ |
å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ |
möre tex^t |
{1:~ }|
{1:~ }|
@ -1132,7 +1130,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
å x̎͂̀̂͛͛ َََِّ |
å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ |
möre tex^t |
{1:~ }|
{1:~ }|
@ -1156,7 +1154,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{5:^+-- 2 lines: å x̎͂̀̂͛͛ َََِّ·················}|
{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1168,7 +1166,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{5:^+-- 2 lines: å x̎͂̀̂͛͛ َََِّ·················}|
{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ َََِّ·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1192,7 +1190,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة·················}|
{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1204,7 +1202,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة·················}|
{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1228,7 +1226,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة···········}|
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1240,7 +1238,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̎͂̀̂͛͛ العَرَبِيَّة···········}|
{7:+ }{8: 1 }{5:^+-- 2 lines: å x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }|
{1:~ }|
{1:~ }|
@ -1265,7 +1263,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}{8: 1 }{7: +}|
{5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1277,7 +1275,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}{8: 1 }{7: +}|
{5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1301,7 +1299,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}|
{5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1313,7 +1311,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ å :senil 2 --^+}|
{5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1337,7 +1335,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
{5:·················َََِّ x̎͂̀̂͛͛ å :senil 2 --^+}|
{5:·················َََِّ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1349,7 +1347,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
{5:·················َََِّ x̎͂̀̂͛͛ å :senil 2 --^+}|
{5:·················َََِّ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@ -1373,7 +1371,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
َََِّ^ x̎͂̀̂͛͛ å|
َََِّ^ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm|
{1: ~}|
{1: ~}|
@ -1385,7 +1383,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
َََِّ^ x̎͂̀̂͛͛ å|
َََِّ^ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm|
{1: ~}|
{1: ~}|
@ -1409,7 +1407,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
ةيَّبِرَعَ^لا x̎͂̀̂͛͛ å|
ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm|
{1: ~}|
{1: ~}|
@ -1421,7 +1419,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
ةيَّبِرَعَ^لا x̎͂̀̂͛͛ å|
ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ å|
txet eröm|
{1: ~}|
{1: ~}|

View File

@ -228,6 +228,36 @@ describe("multibyte rendering", function()
]]}
end)
it('works with arabicshape and multiple composing chars', function()
-- this tests an important edge case: arabicshape might increase the byte size of the base
-- character in a way so that the last composing char no longer fits. use "g8" on the text
-- to observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape')
-- If we would increase the schar_t size, say from 32 to 64 bytes, we need to extend the
-- test text with even more zalgo energy to still touch this edge case.
meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"})
command('set noarabicshape')
screen:expect{grid=[[
^سلام̀́̂̃̄̅̆̇̈̉̊̋̌ |
{1:~ }|
{1:~ }|
{1:~ }|
{1:~ }|
|
]]}
command('set arabicshape')
screen:expect{grid=[[
^̀́̂̃̄̅̆̇̈̉̊̋̌ |
{1:~ }|
{1:~ }|
{1:~ }|
{1:~ }|
|
]]}
end)
end)
describe('multibyte rendering: statusline', function()

View File

@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it)
local ffi = helpers.ffi
local eq = helpers.eq
local mbyte = helpers.cimport("./src/nvim/mbyte.h")
local charset = helpers.cimport('./src/nvim/charset.h')
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
describe('mbyte', function()
-- Array for composing characters
local intp = ffi.typeof('int[?]')
local function to_intp()
-- how to get MAX_MCO from globals.h?
return intp(7, 1)
end
-- Convert from bytes to string
local function to_string(bytes)
local s = {}
@ -30,14 +22,14 @@ describe('mbyte', function()
itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned.
for c = 0, 255 do
eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
eq(c, lib.utf_ptr2char(to_string({c, 0})))
end
-- Some ill formed byte sequences that should not be recognized as UTF-8
-- First byte: 0xc0 or 0xc1
-- Second byte: 0x80 .. 0xbf
--eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
--eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
--
-- Sequences with more than four bytes
end)
@ -47,240 +39,133 @@ describe('mbyte', function()
local char_p = ffi.typeof('char[?]')
for c = n * 0x1000, n * 0x1000 + 0xFFF do
local p = char_p(4, 0)
mbyte.utf_char2bytes(c, p)
eq(c, mbyte.utf_ptr2char(p))
eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
lib.utf_char2bytes(c, p)
eq(c, lib.utf_ptr2char(p))
eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
end
end)
end
describe('utfc_ptr2char_len', function()
describe('utfc_ptr2schar_len', function()
local function test_seq(seq)
local firstc = ffi.new("int[1]")
local buf = ffi.new("char[32]")
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
return {ffi.string(buf), firstc[0]}
end
local function byte(val)
return {string.char(val), val}
end
itp('1-byte sequences', function()
local pcc = to_intp()
for c = 0, 255 do
eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
eq(0, pcc[0])
eq({'', 0}, test_seq{0})
for c = 1, 127 do
eq(byte(c), test_seq{c})
end
for c = 128, 255 do
eq({'', c}, test_seq{c})
end
end)
itp('2-byte sequences', function()
local pcc = to_intp()
-- No combining characters
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f})
-- No combining characters
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f})
-- One UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
eq(0, pcc[0])
eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0xc0})
end)
itp('3-byte sequences', function()
local pcc = to_intp()
-- No second UTF-8 character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
-- No combining character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
-- Combining character is U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
-- Incomplete combining character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
-- One UTF-8 character
pcc = to_intp()
eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
eq(0, pcc[0])
-- One UTF-8 character (composing only)
eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
end)
itp('4-byte sequences', function()
local pcc = to_intp()
-- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
-- No second UTF-8 character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
-- Combining character U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
-- No following UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
end)
itp('5+-byte sequences', function()
local pcc = to_intp()
-- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
-- No second UTF-8 character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
-- Combining character U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
-- Combining characters U+0300 and U+0301
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
-- Combining characters U+0300, U+0301, U+0302
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
-- Combining characters U+0300, U+0301, U+0302, U+0303
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0000, pcc[4])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0000, pcc[5])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
-- U+0305
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(1, pcc[6])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
-- U+0305, U+0306, but only save six (= MAX_MCO).
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(0x0001, pcc[6])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
-- Only three following combining characters U+0300, U+0301, U+0302
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
-- No following UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
-- Combining characters U+1AB0 and U+0301
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
eq(0x1ab0, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
end)
end)