vim-patch:9.0.1617: charidx() result is not consistent with byteidx() (#23963)

Problem:    The results of charidx() and utf16idx() are not consistent with byteidx().
Solution:   When the index is equal to the length of the text return the
            length of the text instead of -1. (Yegappan Lakshmanan,
            closes vim/vim#12503)

577922b917

Co-authored-by: Yegappan Lakshmanan <yegappan@yahoo.com>
This commit is contained in:
zeertzjq 2023-06-09 17:43:46 +08:00 committed by GitHub
parent e5e0bda41b
commit 106922898a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 99 additions and 47 deletions

View File

@ -1167,11 +1167,13 @@ charidx({string}, {idx} [, {countcc} [, {utf16}]])
When {utf16} is present and TRUE, {idx} is used as the UTF-16 When {utf16} is present and TRUE, {idx} is used as the UTF-16
index in the String {expr} instead of as the byte index. index in the String {expr} instead of as the byte index.
Returns -1 if the arguments are invalid or if {idx} is greater Returns -1 if the arguments are invalid or if there are less
than the index of the last byte in {string}. An error is than {idx} bytes. If there are exactly {idx} bytes the length
given if the first argument is not a string, the second of the string in characters is returned.
argument is not a number or when the third argument is present
and is not zero or one. An error is given and -1 is returned if the first argument is
not a string, the second argument is not a number or when the
third argument is present and is not zero or one.
See |byteidx()| and |byteidxcomp()| for getting the byte index See |byteidx()| and |byteidxcomp()| for getting the byte index
from the character index and |utf16idx()| for getting the from the character index and |utf16idx()| for getting the
@ -9138,8 +9140,8 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
< <
*utf16idx()* *utf16idx()*
utf16idx({string}, {idx} [, {countcc} [, {charidx}]]) utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Same as |charidx()| but returns the UTF-16 index of the byte Same as |charidx()| but returns the UTF-16 code unit index of
at {idx} in {string} (after converting it to UTF-16). the byte at {idx} in {string} (after converting it to UTF-16).
When {charidx} is present and TRUE, {idx} is used as the When {charidx} is present and TRUE, {idx} is used as the
character index in the String {string} instead of as the byte character index in the String {string} instead of as the byte
@ -9147,6 +9149,10 @@ utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
An {idx} in the middle of a UTF-8 sequence is rounded upwards An {idx} in the middle of a UTF-8 sequence is rounded upwards
to the end of that sequence. to the end of that sequence.
Returns -1 if the arguments are invalid or if there are less
than {idx} bytes in {string}. If there are exactly {idx} bytes
the length of the string in UTF-16 code units is returned.
See |byteidx()| and |byteidxcomp()| for getting the byte index See |byteidx()| and |byteidxcomp()| for getting the byte index
from the UTF-16 index and |charidx()| for getting the from the UTF-16 index and |charidx()| for getting the
character index from the UTF-16 index. character index from the UTF-16 index.

View File

@ -1603,6 +1603,11 @@ void f_charidx(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
int len; int len;
for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++) { for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++) {
if (*p == NUL) { if (*p == NUL) {
// If the index is exactly the number of bytes or utf-16 code units
// in the string then return the length of the string in characters.
if (utf16idx ? (idx == 0) : (p == (str + idx))) {
rettv->vval.v_number = len;
}
return; return;
} }
if (utf16idx) { if (utf16idx) {
@ -2047,6 +2052,11 @@ void f_utf16idx(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
int len; int len;
for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++) { for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++) {
if (*p == NUL) { if (*p == NUL) {
// If the index is exactly the number of bytes or characters in the
// string then return the length of the string in utf-16 code units.
if (charidx ? (idx == 0) : (p == (str + idx))) {
rettv->vval.v_number = len;
}
return; return;
} }
const int clen = ptr2len(p); const int clen = ptr2len(p);

View File

@ -1267,7 +1267,8 @@ func Test_charidx()
call assert_equal(1, charidx(a, 3)) call assert_equal(1, charidx(a, 3))
call assert_equal(2, charidx(a, 4)) call assert_equal(2, charidx(a, 4))
call assert_equal(3, charidx(a, 7)) call assert_equal(3, charidx(a, 7))
call assert_equal(-1, charidx(a, 8)) call assert_equal(4, charidx(a, 8))
call assert_equal(-1, charidx(a, 9))
call assert_equal(-1, charidx(a, -1)) call assert_equal(-1, charidx(a, -1))
" count composing characters " count composing characters
@ -1275,14 +1276,18 @@ func Test_charidx()
call assert_equal(2, a->charidx(2, 1)) call assert_equal(2, a->charidx(2, 1))
call assert_equal(3, a->charidx(4, 1)) call assert_equal(3, a->charidx(4, 1))
call assert_equal(5, a->charidx(7, 1)) call assert_equal(5, a->charidx(7, 1))
call assert_equal(-1, a->charidx(8, 1)) call assert_equal(6, a->charidx(8, 1))
call assert_equal(-1, a->charidx(9, 1))
" empty string " empty string
call assert_equal(-1, charidx('', 0)) call assert_equal(0, charidx('', 0))
call assert_equal(-1, charidx('', 0, 1)) call assert_equal(-1, charidx('', 1))
call assert_equal(0, charidx('', 0, 1))
call assert_equal(-1, charidx('', 1, 1))
" error cases " error cases
call assert_equal(-1, charidx(v:_null_string, 0)) call assert_equal(0, charidx(v:_null_string, 0))
call assert_equal(-1, charidx(v:_null_string, 1))
call assert_fails('let x = charidx([], 1)', 'E1174:') call assert_fails('let x = charidx([], 1)', 'E1174:')
call assert_fails('let x = charidx("abc", [])', 'E1210:') call assert_fails('let x = charidx("abc", [])', 'E1210:')
call assert_fails('let x = charidx("abc", 1, [])', 'E1212:') call assert_fails('let x = charidx("abc", 1, [])', 'E1212:')
@ -1294,10 +1299,10 @@ endfunc
func Test_charidx_from_utf16_index() func Test_charidx_from_utf16_index()
" string with single byte characters " string with single byte characters
let str = "abc" let str = "abc"
for i in range(3) for i in range(4)
call assert_equal(i, charidx(str, i, v:false, v:true)) call assert_equal(i, charidx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, charidx(str, 3, v:false, v:true)) call assert_equal(-1, charidx(str, 4, v:false, v:true))
" string with two byte characters " string with two byte characters
let str = "a©©b" let str = "a©©b"
@ -1305,7 +1310,8 @@ func Test_charidx_from_utf16_index()
call assert_equal(1, charidx(str, 1, v:false, v:true)) call assert_equal(1, charidx(str, 1, v:false, v:true))
call assert_equal(2, charidx(str, 2, v:false, v:true)) call assert_equal(2, charidx(str, 2, v:false, v:true))
call assert_equal(3, charidx(str, 3, v:false, v:true)) call assert_equal(3, charidx(str, 3, v:false, v:true))
call assert_equal(-1, charidx(str, 4, v:false, v:true)) call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
" string with four byte characters " string with four byte characters
let str = "a😊😊b" let str = "a😊😊b"
@ -1315,38 +1321,48 @@ func Test_charidx_from_utf16_index()
call assert_equal(2, charidx(str, 3, v:false, v:true)) call assert_equal(2, charidx(str, 3, v:false, v:true))
call assert_equal(2, charidx(str, 4, v:false, v:true)) call assert_equal(2, charidx(str, 4, v:false, v:true))
call assert_equal(3, charidx(str, 5, v:false, v:true)) call assert_equal(3, charidx(str, 5, v:false, v:true))
call assert_equal(-1, charidx(str, 6, v:false, v:true)) call assert_equal(4, charidx(str, 6, v:false, v:true))
call assert_equal(-1, charidx(str, 7, v:false, v:true))
" string with composing characters " string with composing characters
let str = '-á-b́' let str = '-á-b́'
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, charidx(str, i, v:false, v:true)) call assert_equal(i, charidx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, charidx(str, 4, v:false, v:true)) call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
for i in str->strchars()->range() for i in str->strchars()->range()
call assert_equal(i, charidx(str, i, v:true, v:true)) call assert_equal(i, charidx(str, i, v:true, v:true))
endfor endfor
call assert_equal(-1, charidx(str, 6, v:true, v:true)) call assert_equal(6, charidx(str, 6, v:true, v:true))
call assert_equal(-1, charidx(str, 7, v:true, v:true))
" string with multiple composing characters " string with multiple composing characters
let str = '-ą́-ą́' let str = '-ą́-ą́'
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, charidx(str, i, v:false, v:true)) call assert_equal(i, charidx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, charidx(str, 4, v:false, v:true)) call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
for i in str->strchars()->range() for i in str->strchars()->range()
call assert_equal(i, charidx(str, i, v:true, v:true)) call assert_equal(i, charidx(str, i, v:true, v:true))
endfor endfor
call assert_equal(-1, charidx(str, 8, v:true, v:true)) call assert_equal(8, charidx(str, 8, v:true, v:true))
call assert_equal(-1, charidx(str, 9, v:true, v:true))
" empty string " empty string
call assert_equal(-1, charidx('', 0, v:false, v:true)) call assert_equal(0, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 0, v:true, v:true)) call assert_equal(-1, charidx('', 1, v:false, v:true))
call assert_equal(0, charidx('', 0, v:true, v:true))
call assert_equal(-1, charidx('', 1, v:true, v:true))
" error cases " error cases
call assert_equal(-1, charidx('', 0, v:false, v:true)) call assert_equal(0, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 0, v:true, v:true)) call assert_equal(-1, charidx('', 1, v:false, v:true))
call assert_equal(-1, charidx(v:_null_string, 0, v:false, v:true)) call assert_equal(0, charidx('', 0, v:true, v:true))
call assert_equal(-1, charidx('', 1, v:true, v:true))
call assert_equal(0, charidx(v:_null_string, 0, v:false, v:true))
call assert_equal(-1, charidx(v:_null_string, 1, v:false, v:true))
call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:') call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:')
call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:') call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:')
endfunc endfunc
@ -1355,10 +1371,10 @@ endfunc
func Test_utf16idx_from_byteidx() func Test_utf16idx_from_byteidx()
" UTF-16 index of a string with single byte characters " UTF-16 index of a string with single byte characters
let str = "abc" let str = "abc"
for i in range(3) for i in range(4)
call assert_equal(i, utf16idx(str, i)) call assert_equal(i, utf16idx(str, i))
endfor endfor
call assert_equal(-1, utf16idx(str, 3)) call assert_equal(-1, utf16idx(str, 4))
" UTF-16 index of a string with two byte characters " UTF-16 index of a string with two byte characters
let str = 'a©©b' let str = 'a©©b'
@ -1368,7 +1384,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(2, str->utf16idx(3)) call assert_equal(2, str->utf16idx(3))
call assert_equal(2, str->utf16idx(4)) call assert_equal(2, str->utf16idx(4))
call assert_equal(3, str->utf16idx(5)) call assert_equal(3, str->utf16idx(5))
call assert_equal(-1, str->utf16idx(6)) call assert_equal(4, str->utf16idx(6))
call assert_equal(-1, str->utf16idx(7))
" UTF-16 index of a string with four byte characters " UTF-16 index of a string with four byte characters
let str = 'a😊😊b' let str = 'a😊😊b'
@ -1382,7 +1399,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(4, utf16idx(str, 7)) call assert_equal(4, utf16idx(str, 7))
call assert_equal(4, utf16idx(str, 8)) call assert_equal(4, utf16idx(str, 8))
call assert_equal(5, utf16idx(str, 9)) call assert_equal(5, utf16idx(str, 9))
call assert_equal(-1, utf16idx(str, 10)) call assert_equal(6, utf16idx(str, 10))
call assert_equal(-1, utf16idx(str, 11))
" UTF-16 index of a string with composing characters " UTF-16 index of a string with composing characters
let str = '-á-b́' let str = '-á-b́'
@ -1394,7 +1412,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(3, utf16idx(str, 5)) call assert_equal(3, utf16idx(str, 5))
call assert_equal(3, utf16idx(str, 6)) call assert_equal(3, utf16idx(str, 6))
call assert_equal(3, utf16idx(str, 7)) call assert_equal(3, utf16idx(str, 7))
call assert_equal(-1, utf16idx(str, 8)) call assert_equal(4, utf16idx(str, 8))
call assert_equal(-1, utf16idx(str, 9))
call assert_equal(0, utf16idx(str, 0, v:true)) call assert_equal(0, utf16idx(str, 0, v:true))
call assert_equal(1, utf16idx(str, 1, v:true)) call assert_equal(1, utf16idx(str, 1, v:true))
call assert_equal(2, utf16idx(str, 2, v:true)) call assert_equal(2, utf16idx(str, 2, v:true))
@ -1403,7 +1422,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(4, utf16idx(str, 5, v:true)) call assert_equal(4, utf16idx(str, 5, v:true))
call assert_equal(5, utf16idx(str, 6, v:true)) call assert_equal(5, utf16idx(str, 6, v:true))
call assert_equal(5, utf16idx(str, 7, v:true)) call assert_equal(5, utf16idx(str, 7, v:true))
call assert_equal(-1, utf16idx(str, 8, v:true)) call assert_equal(6, utf16idx(str, 8, v:true))
call assert_equal(-1, utf16idx(str, 9, v:true))
" string with multiple composing characters " string with multiple composing characters
let str = '-ą́-ą́' let str = '-ą́-ą́'
@ -1419,7 +1439,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(3, utf16idx(str, 9)) call assert_equal(3, utf16idx(str, 9))
call assert_equal(3, utf16idx(str, 10)) call assert_equal(3, utf16idx(str, 10))
call assert_equal(3, utf16idx(str, 11)) call assert_equal(3, utf16idx(str, 11))
call assert_equal(-1, utf16idx(str, 12)) call assert_equal(4, utf16idx(str, 12))
call assert_equal(-1, utf16idx(str, 13))
call assert_equal(0, utf16idx(str, 0, v:true)) call assert_equal(0, utf16idx(str, 0, v:true))
call assert_equal(1, utf16idx(str, 1, v:true)) call assert_equal(1, utf16idx(str, 1, v:true))
call assert_equal(2, utf16idx(str, 2, v:true)) call assert_equal(2, utf16idx(str, 2, v:true))
@ -1432,16 +1453,21 @@ func Test_utf16idx_from_byteidx()
call assert_equal(6, utf16idx(str, 9, v:true)) call assert_equal(6, utf16idx(str, 9, v:true))
call assert_equal(7, utf16idx(str, 10, v:true)) call assert_equal(7, utf16idx(str, 10, v:true))
call assert_equal(7, utf16idx(str, 11, v:true)) call assert_equal(7, utf16idx(str, 11, v:true))
call assert_equal(-1, utf16idx(str, 12, v:true)) call assert_equal(8, utf16idx(str, 12, v:true))
call assert_equal(-1, utf16idx(str, 13, v:true))
" empty string " empty string
call assert_equal(-1, utf16idx('', 0)) call assert_equal(0, utf16idx('', 0))
call assert_equal(-1, utf16idx('', 0, v:true)) call assert_equal(-1, utf16idx('', 1))
call assert_equal(0, utf16idx('', 0, v:true))
call assert_equal(-1, utf16idx('', 1, v:true))
" error cases " error cases
call assert_equal(-1, utf16idx("", 0)) call assert_equal(0, utf16idx("", 0))
call assert_equal(-1, utf16idx("", 1))
call assert_equal(-1, utf16idx("abc", -1)) call assert_equal(-1, utf16idx("abc", -1))
call assert_equal(-1, utf16idx(v:_null_string, 0)) call assert_equal(0, utf16idx(v:_null_string, 0))
call assert_equal(-1, utf16idx(v:_null_string, 1))
call assert_fails('let l = utf16idx([], 0)', 'E1174:') call assert_fails('let l = utf16idx([], 0)', 'E1174:')
call assert_fails('let l = utf16idx("ab", [])', 'E1210:') call assert_fails('let l = utf16idx("ab", [])', 'E1210:')
call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:') call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:')
@ -1453,14 +1479,16 @@ func Test_utf16idx_from_charidx()
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true)) call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 3, v:false, v:true)) call assert_equal(3, utf16idx(str, 3, v:false, v:true))
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
" UTF-16 index of a string with two byte characters " UTF-16 index of a string with two byte characters
let str = "a©©b" let str = "a©©b"
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true)) call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
" UTF-16 index of a string with four byte characters " UTF-16 index of a string with four byte characters
let str = "a😊😊b" let str = "a😊😊b"
@ -1468,36 +1496,44 @@ func Test_utf16idx_from_charidx()
call assert_equal(2, utf16idx(str, 1, v:false, v:true)) call assert_equal(2, utf16idx(str, 1, v:false, v:true))
call assert_equal(4, utf16idx(str, 2, v:false, v:true)) call assert_equal(4, utf16idx(str, 2, v:false, v:true))
call assert_equal(5, utf16idx(str, 3, v:false, v:true)) call assert_equal(5, utf16idx(str, 3, v:false, v:true))
call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) call assert_equal(6, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
" UTF-16 index of a string with composing characters " UTF-16 index of a string with composing characters
let str = '-á-b́' let str = '-á-b́'
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true)) call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
for i in str->strchars()->range() for i in str->strchars()->range()
call assert_equal(i, utf16idx(str, i, v:true, v:true)) call assert_equal(i, utf16idx(str, i, v:true, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 6, v:true, v:true)) call assert_equal(6, utf16idx(str, 6, v:true, v:true))
call assert_equal(-1, utf16idx(str, 7, v:true, v:true))
" string with multiple composing characters " string with multiple composing characters
let str = '-ą́-ą́' let str = '-ą́-ą́'
for i in str->strcharlen()->range() for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true)) call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
for i in str->strchars()->range() for i in str->strchars()->range()
call assert_equal(i, utf16idx(str, i, v:true, v:true)) call assert_equal(i, utf16idx(str, i, v:true, v:true))
endfor endfor
call assert_equal(-1, utf16idx(str, 8, v:true, v:true)) call assert_equal(8, utf16idx(str, 8, v:true, v:true))
call assert_equal(-1, utf16idx(str, 9, v:true, v:true))
" empty string " empty string
call assert_equal(-1, utf16idx('', 0, v:false, v:true)) call assert_equal(0, utf16idx('', 0, v:false, v:true))
call assert_equal(-1, utf16idx('', 0, v:true, v:true)) call assert_equal(-1, utf16idx('', 1, v:false, v:true))
call assert_equal(0, utf16idx('', 0, v:true, v:true))
call assert_equal(-1, utf16idx('', 1, v:true, v:true))
" error cases " error cases
call assert_equal(-1, utf16idx(v:_null_string, 0, v:true, v:true)) call assert_equal(0, utf16idx(v:_null_string, 0, v:true, v:true))
call assert_equal(-1, utf16idx(v:_null_string, 1, v:true, v:true))
call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:') call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:')
endfunc endfunc