eval/decode: Fix surrogate pairs processing

This commit is contained in:
ZyX 2016-02-03 21:46:01 +03:00
parent ea82270d30
commit 5814e29cdb
4 changed files with 51 additions and 17 deletions

View File

@ -340,12 +340,12 @@ int json_decode_string(const char *const buf, const size_t len,
goto json_decode_string_fail;
}
char *str = xmalloc(len + 1);
uint16_t fst_in_pair = 0;
int fst_in_pair = 0;
char *str_end = str;
for (const char *t = s; t < p; t++) {
if (t[0] != '\\' || t[1] != 'u') {
if (fst_in_pair != 0) {
str_end += utf_char2bytes((int) fst_in_pair, (char_u *) str_end);
str_end += utf_char2bytes(fst_in_pair, (char_u *) str_end);
fst_in_pair = 0;
}
}
@ -353,20 +353,21 @@ int json_decode_string(const char *const buf, const size_t len,
t++;
switch (*t) {
case 'u': {
char ubuf[] = { t[1], t[2], t[3], t[4], 0 };
const char ubuf[] = { t[1], t[2], t[3], t[4], 0 };
t += 4;
unsigned long ch;
vim_str2nr((char_u *) ubuf, NULL, NULL, 0, 0, 2, NULL, &ch);
if (0xD800UL <= ch && ch <= 0xDB7FUL) {
fst_in_pair = (uint16_t) ch;
} else if (0xDC00ULL <= ch && ch <= 0xDB7FUL) {
if (fst_in_pair != 0) {
int full_char = (
(int) (ch - 0xDC00UL)
+ (((int) (fst_in_pair - 0xD800)) << 10)
);
str_end += utf_char2bytes(full_char, (char_u *) str_end);
}
if (SURROGATE_HI_START <= ch && ch <= SURROGATE_HI_END) {
fst_in_pair = (int) ch;
} else if (SURROGATE_LO_START <= ch && ch <= SURROGATE_LO_END
&& fst_in_pair != 0) {
const int full_char = (
(int) (ch - SURROGATE_LO_START)
+ ((fst_in_pair - SURROGATE_HI_START) << 10)
+ SURROGATE_FIRST_CHAR
);
str_end += utf_char2bytes(full_char, (char_u *) str_end);
fst_in_pair = 0;
} else {
str_end += utf_char2bytes((int) ch, (char_u *) str_end);
}

View File

@ -970,7 +970,7 @@ static inline int convert_to_json_string(garray_T *const gap,
default: {
if (vim_isprintc(ch)) {
ga_concat_len(gap, buf + i, shift);
} else if (ch <= 0xFFFF) {
} else if (ch < SURROGATE_FIRST_CHAR) {
ga_concat_len(gap, ((const char []) {
'\\', 'u',
xdigits[(ch >> (4 * 3)) & 0xF],
@ -979,9 +979,9 @@ static inline int convert_to_json_string(garray_T *const gap,
xdigits[(ch >> (4 * 0)) & 0xF],
}), sizeof("\\u1234") - 1);
} else {
uint32_t tmp = (uint32_t) ch - 0x010000;
uint16_t hi = 0xD800 + ((tmp >> 10) & 0x03FF);
uint16_t lo = 0xDC00 + ((tmp >> 0) & 0x03FF);
uint32_t tmp = (uint32_t) ch - SURROGATE_FIRST_CHAR;
uint16_t hi = SURROGATE_HI_START + ((tmp >> 10) & ((1 << 10) - 1));
uint16_t lo = SURROGATE_LO_END + ((tmp >> 0) & ((1 << 10) - 1));
ga_concat_len(gap, ((const char []) {
'\\', 'u',
xdigits[(hi >> (4 * 3)) & 0xF],

View File

@ -54,6 +54,21 @@ static inline ListReaderState encode_init_lrstate(const list_T *const list)
/// Array mapping values from SpecialVarValue enum to names
extern const char *const encode_special_var_names[];
/// Code point boundaries used when converting between surrogate pairs
/// (written as two \uXXXX escapes in JSON) and single characters
enum {
  /// Lowest code point of the high surrogates block (first half of a pair)
  SURROGATE_HI_START = 0xD800,
  /// Highest code point of the high surrogates block
  SURROGATE_HI_END = 0xDBFF,
  /// Lowest code point of the low surrogates block (second half of a pair)
  SURROGATE_LO_START = 0xDC00,
  /// Highest code point of the low surrogates block
  SURROGATE_LO_END = 0xDFFF,
  /// Smallest character value that has to be written as a surrogate pair
  SURROGATE_FIRST_CHAR = 0x10000,
};
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "eval/encode.h.generated.h"
#endif

View File

@ -297,6 +297,24 @@ describe('jsondecode() function', function()
-- '"\xF9\x80\x80\x80\x80"',
-- '"\xFC\x90\x80\x80\x80\x80"',
end)
it('parses surrogate pairs properly', function()
  -- A high surrogate directly followed by a low surrogate is combined into
  -- a single character: lowest pair (U+10000) and highest pair (U+10FFFF).
  eq('\xF0\x90\x80\x80', funcs.jsondecode('"\\uD800\\uDC00"'))
  eq('\xF4\x8F\xBF\xBF', funcs.jsondecode('"\\uDBFF\\uDFFF"'))

  -- Anything between the two halves prevents combining: each surrogate is
  -- then emitted on its own.
  eq('\xED\xA0\x80a\xED\xB0\x80', funcs.jsondecode('"\\uD800a\\uDC00"'))
  eq('\xED\xA0\x80\t\xED\xB0\x80', funcs.jsondecode('"\\uD800\\t\\uDC00"'))

  -- Lone high surrogate: at end of string, before a plain character and
  -- before another escape.
  eq('\xED\xA0\x80', funcs.jsondecode('"\\uD800"'))
  eq('\xED\xA0\x80a', funcs.jsondecode('"\\uD800a"'))
  eq('\xED\xA0\x80\t', funcs.jsondecode('"\\uD800\\t"'))

  -- Lone low surrogate in the same three positions.
  eq('\xED\xB0\x80', funcs.jsondecode('"\\uDC00"'))
  eq('\xED\xB0\x80a', funcs.jsondecode('"\\uDC00a"'))
  eq('\xED\xB0\x80\t', funcs.jsondecode('"\\uDC00\\t"'))

  -- Lone low surrogate preceded by a plain character or another escape.
  eq('a\xED\xB0\x80', funcs.jsondecode('"a\\uDC00"'))
  eq('\t\xED\xB0\x80', funcs.jsondecode('"\\t\\uDC00"'))
end)
end)
describe('jsonencode() function', function()