mirror of
https://github.com/neovim/neovim.git
synced 2025-02-25 18:55:25 -06:00
Merge pull request #30042 from bfredl/bobbytables
refactor(multibyte): replace generated unicode tables with utf8proc
This commit is contained in:
commit
e1937286f0
@ -35,8 +35,8 @@ GETTEXT_SHA256 66415634c6e8c3fa8b71362879ec7575e27da43da562c798a8a2f223e6e47f5c
|
|||||||
LIBICONV_URL https://github.com/neovim/deps/raw/b9bf36eb31f27e8136d907da38fa23518927737e/opt/libiconv-1.17.tar.gz
|
LIBICONV_URL https://github.com/neovim/deps/raw/b9bf36eb31f27e8136d907da38fa23518927737e/opt/libiconv-1.17.tar.gz
|
||||||
LIBICONV_SHA256 8f74213b56238c85a50a5329f77e06198771e70dd9a739779f4c02f65d971313
|
LIBICONV_SHA256 8f74213b56238c85a50a5329f77e06198771e70dd9a739779f4c02f65d971313
|
||||||
|
|
||||||
UTF8PROC_URL https://github.com/JuliaStrings/utf8proc/archive/v2.9.0.tar.gz
|
UTF8PROC_URL https://github.com/JuliaStrings/utf8proc/archive/3de4596fbe28956855df2ecb3c11c0bbc3535838.tar.gz
|
||||||
UTF8PROC_SHA256 18c1626e9fc5a2e192311e36b3010bfc698078f692888940f1fa150547abb0c1
|
UTF8PROC_SHA256 fb4a16bb659b58afb7f921fcc8928d0b3c1fcab135366c8a4f9ca7de1b1cfada
|
||||||
|
|
||||||
TREESITTER_C_URL https://github.com/tree-sitter/tree-sitter-c/archive/v0.21.3.tar.gz
|
TREESITTER_C_URL https://github.com/tree-sitter/tree-sitter-c/archive/v0.21.3.tar.gz
|
||||||
TREESITTER_C_SHA256 75a3780df6114cd37496761c4a7c9fd900c78bee3a2707f590d78c0ca3a24368
|
TREESITTER_C_SHA256 75a3780df6114cd37496761c4a7c9fd900c78bee3a2707f590d78c0ca3a24368
|
||||||
|
@ -301,7 +301,6 @@ set(GENERATOR_DIR ${CMAKE_CURRENT_LIST_DIR}/generators)
|
|||||||
set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval)
|
set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval)
|
||||||
set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit)
|
set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit)
|
||||||
set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime)
|
set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime)
|
||||||
set(UNICODE_DIR ${PROJECT_SOURCE_DIR}/src/unicode)
|
|
||||||
|
|
||||||
# GENERATOR_DIR
|
# GENERATOR_DIR
|
||||||
set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua)
|
set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua)
|
||||||
@ -316,7 +315,6 @@ set(GENERATOR_PRELOAD ${GENERATOR_DIR}/preload.lua)
|
|||||||
set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua)
|
set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua)
|
||||||
set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua)
|
set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua)
|
||||||
set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua)
|
set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua)
|
||||||
set(UNICODE_TABLES_GENERATOR ${GENERATOR_DIR}/gen_unicode_tables.lua)
|
|
||||||
|
|
||||||
# GENERATED_DIR and GENERATED_INCLUDES_DIR
|
# GENERATED_DIR and GENERATED_INCLUDES_DIR
|
||||||
set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h)
|
set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h)
|
||||||
@ -333,7 +331,6 @@ set(GENERATED_OPTIONS_MAP ${GENERATED_DIR}/options_map.generated.h)
|
|||||||
set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h)
|
set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h)
|
||||||
set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h)
|
set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h)
|
||||||
set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h)
|
set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h)
|
||||||
set(GENERATED_UNICODE_TABLES ${GENERATED_DIR}/unicode_tables.generated.h)
|
|
||||||
set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h)
|
set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h)
|
||||||
set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h)
|
set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h)
|
||||||
|
|
||||||
@ -350,7 +347,6 @@ set(LUA_LOADER_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/loader.lua)
|
|||||||
set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua)
|
set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua)
|
||||||
set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua)
|
set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua)
|
||||||
|
|
||||||
file(GLOB UNICODE_FILES CONFIGURE_DEPENDS ${UNICODE_DIR}/*.txt)
|
|
||||||
file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h)
|
file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h)
|
||||||
list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h)
|
list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h)
|
||||||
file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h)
|
file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h)
|
||||||
@ -587,15 +583,6 @@ foreach(sfile ${NVIM_SOURCES}
|
|||||||
endif()
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_custom_command(OUTPUT ${GENERATED_UNICODE_TABLES}
|
|
||||||
COMMAND ${LUA_PRG} ${UNICODE_TABLES_GENERATOR}
|
|
||||||
${UNICODE_DIR}
|
|
||||||
${GENERATED_UNICODE_TABLES}
|
|
||||||
DEPENDS
|
|
||||||
${UNICODE_TABLES_GENERATOR}
|
|
||||||
${UNICODE_FILES}
|
|
||||||
)
|
|
||||||
|
|
||||||
set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua)
|
set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua)
|
||||||
configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA})
|
configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA})
|
||||||
|
|
||||||
@ -687,7 +674,6 @@ list(APPEND NVIM_GENERATED_FOR_SOURCES
|
|||||||
"${GENERATED_EVENTS_NAMES_MAP}"
|
"${GENERATED_EVENTS_NAMES_MAP}"
|
||||||
"${GENERATED_OPTIONS}"
|
"${GENERATED_OPTIONS}"
|
||||||
"${GENERATED_OPTIONS_MAP}"
|
"${GENERATED_OPTIONS_MAP}"
|
||||||
"${GENERATED_UNICODE_TABLES}"
|
|
||||||
"${VIM_MODULE_FILE}"
|
"${VIM_MODULE_FILE}"
|
||||||
"${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h"
|
"${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h"
|
||||||
)
|
)
|
||||||
|
@ -1,264 +0,0 @@
|
|||||||
-- Script creates the following tables in unicode_tables.generated.h:
|
|
||||||
--
|
|
||||||
-- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed
|
|
||||||
-- intervals. Codepoints in these intervals have double (W or F) or ambiguous
|
|
||||||
-- (A) east asian width respectively.
|
|
||||||
-- 2. combining table: same as the above, but characters inside are combining
|
|
||||||
-- characters (i.e. have general categories equal to Mn, Mc or Me).
|
|
||||||
-- 3. foldCase table used to convert characters to
|
|
||||||
-- folded variants. In this table first two values are
|
|
||||||
-- character ranges: like in previous tables they are sorted and must be
|
|
||||||
-- non-overlapping. Third value means step inside the range: e.g. if it is
|
|
||||||
-- 2 then interval applies only to first, third, fifth, … character in range.
|
|
||||||
-- Fourth value is number that should be added to the codepoint to yield
|
|
||||||
-- folded codepoint.
|
|
||||||
-- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed
|
|
||||||
-- intervals of Emoji characters. emoji_wide contains all the characters
|
|
||||||
-- which don't have ambiguous or double width, and emoji_all has all Emojis.
|
|
||||||
if arg[1] == '--help' then
|
|
||||||
print('Usage:')
|
|
||||||
print(' gen_unicode_tables.lua unicode/ unicode_tables.generated.h')
|
|
||||||
os.exit(0)
|
|
||||||
end
|
|
||||||
|
|
||||||
local basedir = arg[1]
|
|
||||||
local pathsep = package.config:sub(1, 1)
|
|
||||||
local get_path = function(fname)
|
|
||||||
return basedir .. pathsep .. fname
|
|
||||||
end
|
|
||||||
|
|
||||||
local unicodedata_fname = get_path('UnicodeData.txt')
|
|
||||||
local eastasianwidth_fname = get_path('EastAsianWidth.txt')
|
|
||||||
local emoji_fname = get_path('emoji-data.txt')
|
|
||||||
|
|
||||||
local utf_tables_fname = arg[2]
|
|
||||||
|
|
||||||
local split_on_semicolons = function(s)
|
|
||||||
local ret = {}
|
|
||||||
local idx = 1
|
|
||||||
while idx <= #s + 1 do
|
|
||||||
local item = s:match('^[^;]*', idx)
|
|
||||||
idx = idx + #item + 1
|
|
||||||
if idx <= #s + 1 then
|
|
||||||
assert(s:sub(idx - 1, idx - 1) == ';')
|
|
||||||
end
|
|
||||||
item = item:gsub('^%s*', '')
|
|
||||||
item = item:gsub('%s*$', '')
|
|
||||||
table.insert(ret, item)
|
|
||||||
end
|
|
||||||
return ret
|
|
||||||
end
|
|
||||||
|
|
||||||
local fp_lines_to_lists = function(fp, n, has_comments)
|
|
||||||
local ret = {}
|
|
||||||
local line
|
|
||||||
local i = 0
|
|
||||||
while true do
|
|
||||||
i = i + 1
|
|
||||||
line = fp:read('*l')
|
|
||||||
if not line then
|
|
||||||
break
|
|
||||||
end
|
|
||||||
if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then
|
|
||||||
local l = split_on_semicolons(line)
|
|
||||||
if #l ~= n then
|
|
||||||
io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n))
|
|
||||||
io.stderr:write('Line: ' .. line .. '\n')
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
table.insert(ret, l)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return ret
|
|
||||||
end
|
|
||||||
|
|
||||||
local parse_data_to_props = function(ud_fp)
|
|
||||||
return fp_lines_to_lists(ud_fp, 15, false)
|
|
||||||
end
|
|
||||||
|
|
||||||
local parse_width_props = function(eaw_fp)
|
|
||||||
return fp_lines_to_lists(eaw_fp, 2, true)
|
|
||||||
end
|
|
||||||
|
|
||||||
local parse_emoji_props = function(emoji_fp)
|
|
||||||
return fp_lines_to_lists(emoji_fp, 2, true)
|
|
||||||
end
|
|
||||||
|
|
||||||
local make_range = function(start, end_, step, add)
|
|
||||||
if step and add then
|
|
||||||
return (' {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add)
|
|
||||||
else
|
|
||||||
return (' {0x%04x, 0x%04x},\n'):format(start, end_)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
local build_combining_table = function(ut_fp, dataprops)
|
|
||||||
ut_fp:write('static const struct interval combining[] = {\n')
|
|
||||||
local start = -1
|
|
||||||
local end_ = -1
|
|
||||||
for _, p in ipairs(dataprops) do
|
|
||||||
-- The 'Mc' property was removed, it does take up space.
|
|
||||||
if ({ Mn = true, Me = true })[p[3]] then
|
|
||||||
local n = tonumber(p[1], 16)
|
|
||||||
if start >= 0 and end_ + 1 == n then
|
|
||||||
-- Continue with the same range.
|
|
||||||
end_ = n
|
|
||||||
else
|
|
||||||
if start >= 0 then
|
|
||||||
-- Produce previous range.
|
|
||||||
ut_fp:write(make_range(start, end_))
|
|
||||||
end
|
|
||||||
start = n
|
|
||||||
end_ = n
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
if start >= 0 then
|
|
||||||
ut_fp:write(make_range(start, end_))
|
|
||||||
end
|
|
||||||
ut_fp:write('};\n')
|
|
||||||
end
|
|
||||||
|
|
||||||
local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name)
|
|
||||||
ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
|
|
||||||
local start = -1
|
|
||||||
local end_ = -1
|
|
||||||
local dataidx = 1
|
|
||||||
local ret = {}
|
|
||||||
for _, p in ipairs(widthprops) do
|
|
||||||
if widths[p[2]:sub(1, 1)] then
|
|
||||||
local rng_start, rng_end = p[1]:find('%.%.')
|
|
||||||
local n, n_last
|
|
||||||
if rng_start then
|
|
||||||
-- It is a range. We don’t check for composing char then.
|
|
||||||
n = tonumber(p[1]:sub(1, rng_start - 1), 16)
|
|
||||||
n_last = tonumber(p[1]:sub(rng_end + 1), 16)
|
|
||||||
else
|
|
||||||
n = tonumber(p[1], 16)
|
|
||||||
n_last = n
|
|
||||||
end
|
|
||||||
local dn
|
|
||||||
while true do
|
|
||||||
dn = tonumber(dataprops[dataidx][1], 16)
|
|
||||||
if dn >= n then
|
|
||||||
break
|
|
||||||
end
|
|
||||||
dataidx = dataidx + 1
|
|
||||||
end
|
|
||||||
if dn ~= n and n_last == n then
|
|
||||||
io.stderr:write('Cannot find character ' .. n .. ' in data table.\n')
|
|
||||||
end
|
|
||||||
-- Only use the char when it’s not a composing char.
|
|
||||||
-- But use all chars from a range.
|
|
||||||
local dp = dataprops[dataidx]
|
|
||||||
if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then
|
|
||||||
if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542
|
|
||||||
-- Continue with the same range.
|
|
||||||
else
|
|
||||||
if start >= 0 then
|
|
||||||
ut_fp:write(make_range(start, end_))
|
|
||||||
table.insert(ret, { start, end_ })
|
|
||||||
end
|
|
||||||
start = n
|
|
||||||
end
|
|
||||||
end_ = n_last
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
if start >= 0 then
|
|
||||||
ut_fp:write(make_range(start, end_))
|
|
||||||
table.insert(ret, { start, end_ })
|
|
||||||
end
|
|
||||||
ut_fp:write('};\n')
|
|
||||||
return ret
|
|
||||||
end
|
|
||||||
|
|
||||||
local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth)
|
|
||||||
local emojiwidth = {}
|
|
||||||
local emoji = {}
|
|
||||||
for _, p in ipairs(emojiprops) do
|
|
||||||
if p[2]:match('Emoji%s+#') then
|
|
||||||
local rng_start, rng_end = p[1]:find('%.%.')
|
|
||||||
local n
|
|
||||||
local n_last
|
|
||||||
if rng_start then
|
|
||||||
n = tonumber(p[1]:sub(1, rng_start - 1), 16)
|
|
||||||
n_last = tonumber(p[1]:sub(rng_end + 1), 16)
|
|
||||||
else
|
|
||||||
n = tonumber(p[1], 16)
|
|
||||||
n_last = n
|
|
||||||
end
|
|
||||||
if #emoji > 0 and n - 1 == emoji[#emoji][2] then
|
|
||||||
emoji[#emoji][2] = n_last
|
|
||||||
else
|
|
||||||
table.insert(emoji, { n, n_last })
|
|
||||||
end
|
|
||||||
|
|
||||||
-- Characters below 1F000 may be considered single width traditionally,
|
|
||||||
-- making them double width causes problems.
|
|
||||||
if n >= 0x1f000 then
|
|
||||||
-- exclude characters that are in the ambiguous/doublewidth table
|
|
||||||
for _, ambi in ipairs(ambiwidth) do
|
|
||||||
if n >= ambi[1] and n <= ambi[2] then
|
|
||||||
n = ambi[2] + 1
|
|
||||||
end
|
|
||||||
if n_last >= ambi[1] and n_last <= ambi[2] then
|
|
||||||
n_last = ambi[1] - 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
for _, double in ipairs(doublewidth) do
|
|
||||||
if n >= double[1] and n <= double[2] then
|
|
||||||
n = double[2] + 1
|
|
||||||
end
|
|
||||||
if n_last >= double[1] and n_last <= double[2] then
|
|
||||||
n_last = double[1] - 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
if n <= n_last then
|
|
||||||
if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then
|
|
||||||
emojiwidth[#emojiwidth][2] = n_last
|
|
||||||
else
|
|
||||||
table.insert(emojiwidth, { n, n_last })
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
ut_fp:write('static const struct interval emoji_all[] = {\n')
|
|
||||||
for _, p in ipairs(emoji) do
|
|
||||||
ut_fp:write(make_range(p[1], p[2]))
|
|
||||||
end
|
|
||||||
ut_fp:write('};\n')
|
|
||||||
|
|
||||||
ut_fp:write('static const struct interval emoji_wide[] = {\n')
|
|
||||||
for _, p in ipairs(emojiwidth) do
|
|
||||||
ut_fp:write(make_range(p[1], p[2]))
|
|
||||||
end
|
|
||||||
ut_fp:write('};\n')
|
|
||||||
end
|
|
||||||
|
|
||||||
local ud_fp = io.open(unicodedata_fname, 'r')
|
|
||||||
local dataprops = parse_data_to_props(ud_fp)
|
|
||||||
ud_fp:close()
|
|
||||||
|
|
||||||
local ut_fp = io.open(utf_tables_fname, 'w')
|
|
||||||
|
|
||||||
build_combining_table(ut_fp, dataprops)
|
|
||||||
|
|
||||||
local eaw_fp = io.open(eastasianwidth_fname, 'r')
|
|
||||||
local widthprops = parse_width_props(eaw_fp)
|
|
||||||
eaw_fp:close()
|
|
||||||
|
|
||||||
local doublewidth =
|
|
||||||
build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth')
|
|
||||||
local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous')
|
|
||||||
|
|
||||||
local emoji_fp = io.open(emoji_fname, 'r')
|
|
||||||
local emojiprops = parse_emoji_props(emoji_fp)
|
|
||||||
emoji_fp:close()
|
|
||||||
|
|
||||||
build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth)
|
|
||||||
|
|
||||||
ut_fp:close()
|
|
@ -85,7 +85,6 @@ struct interval {
|
|||||||
// uncrustify:off
|
// uncrustify:off
|
||||||
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
||||||
# include "mbyte.c.generated.h"
|
# include "mbyte.c.generated.h"
|
||||||
# include "unicode_tables.generated.h"
|
|
||||||
#endif
|
#endif
|
||||||
// uncrustify:on
|
// uncrustify:on
|
||||||
|
|
||||||
@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
|
|||||||
return utf_class_tab(utf_ptr2char(p), chartab);
|
return utf_class_tab(utf_ptr2char(p), chartab);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return true if "c" is in "table".
|
static bool prop_is_emojilike(const utf8proc_property_t *prop)
|
||||||
static bool intable(const struct interval *table, size_t n_items, int c)
|
|
||||||
FUNC_ATTR_PURE
|
|
||||||
{
|
{
|
||||||
assert(n_items > 0);
|
return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
|
||||||
// first quick check for Latin1 etc. characters
|
|| prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
|
||||||
if (c < table[0].first) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(n_items <= SIZE_MAX / 2);
|
|
||||||
// binary search in table
|
|
||||||
size_t bot = 0;
|
|
||||||
size_t top = n_items;
|
|
||||||
do {
|
|
||||||
size_t mid = (bot + top) >> 1;
|
|
||||||
if (table[mid].last < c) {
|
|
||||||
bot = mid + 1;
|
|
||||||
} else if (table[mid].first > c) {
|
|
||||||
top = mid;
|
|
||||||
} else {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} while (top > bot);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
|
/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
|
||||||
@ -496,13 +474,18 @@ int utf_char2cells(int c)
|
|||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
|
const utf8proc_property_t *prop = utf8proc_get_property(c);
|
||||||
|
|
||||||
|
if (prop->charwidth == 2) {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
|
if (*p_ambw == 'd' && prop->ambiguous_width) {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
|
|
||||||
|
// Characters below 1F000 may be considered single width traditionally,
|
||||||
|
// making them double width causes problems.
|
||||||
|
if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in)
|
|||||||
}
|
}
|
||||||
int cells = utf_char2cells(c);
|
int cells = utf_char2cells(c);
|
||||||
if (cells == 1 && p_emoji
|
if (cells == 1 && p_emoji
|
||||||
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
|
&& prop_is_emojilike(utf8proc_get_property(c))) {
|
||||||
int c2 = utf_ptr2char(p_in + len);
|
int c2 = utf_ptr2char(p_in + len);
|
||||||
if (c2 == 0xFE0F) {
|
if (c2 == 0xFE0F) {
|
||||||
return 2; // emoji presentation
|
return 2; // emoji presentation
|
||||||
@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size)
|
|||||||
}
|
}
|
||||||
int cells = utf_char2cells(c);
|
int cells = utf_char2cells(c);
|
||||||
if (cells == 1 && p_emoji && size > len
|
if (cells == 1 && p_emoji && size > len
|
||||||
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)
|
&& prop_is_emojilike(utf8proc_get_property(c))
|
||||||
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
|
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
|
||||||
int c2 = utf_ptr2char(p + len);
|
int c2 = utf_ptr2char(p + len);
|
||||||
if (c2 == 0xFE0F) {
|
if (c2 == 0xFE0F) {
|
||||||
@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf)
|
|||||||
/// Returns false for negative values.
|
/// Returns false for negative values.
|
||||||
bool utf_iscomposing_legacy(int c)
|
bool utf_iscomposing_legacy(int c)
|
||||||
{
|
{
|
||||||
return intable(combining, ARRAY_SIZE(combining), c);
|
const utf8proc_property_t *prop = utf8proc_get_property(c);
|
||||||
|
return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
@ -1182,6 +1166,33 @@ bool utf_printable(int c)
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
// Return true if "c" is in "table".
|
||||||
|
static bool intable(const struct interval *table, size_t n_items, int c)
|
||||||
|
FUNC_ATTR_PURE
|
||||||
|
{
|
||||||
|
assert(n_items > 0);
|
||||||
|
// first quick check for Latin1 etc. characters
|
||||||
|
if (c < table[0].first) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(n_items <= SIZE_MAX / 2);
|
||||||
|
// binary search in table
|
||||||
|
size_t bot = 0;
|
||||||
|
size_t top = n_items;
|
||||||
|
do {
|
||||||
|
size_t mid = (bot + top) >> 1;
|
||||||
|
if (table[mid].last < c) {
|
||||||
|
bot = mid + 1;
|
||||||
|
} else if (table[mid].first > c) {
|
||||||
|
top = mid;
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} while (top > bot);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// Return true for characters that can be displayed in a normal way.
|
// Return true for characters that can be displayed in a normal way.
|
||||||
// Only for characters of 0x100 and above!
|
// Only for characters of 0x100 and above!
|
||||||
bool utf_printable(int c)
|
bool utf_printable(int c)
|
||||||
@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
|
|||||||
return 1; // punctuation
|
return 1; // punctuation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const utf8proc_property_t *prop = utf8proc_get_property(c);
|
||||||
// emoji
|
// emoji
|
||||||
if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
|
if (prop_is_emojilike(prop)) {
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
|
|||||||
bool utf_ambiguous_width(const char *p)
|
bool utf_ambiguous_width(const char *p)
|
||||||
{
|
{
|
||||||
int c = utf_ptr2char(p);
|
int c = utf_ptr2char(p);
|
||||||
return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
|
if (c < 0x80) {
|
||||||
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const utf8proc_property_t *prop = utf8proc_get_property(c);
|
||||||
|
return prop->ambiguous_width || prop_is_emojilike(prop);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
|
// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
|
||||||
|
37
src/unicode/Copyright.txt
vendored
37
src/unicode/Copyright.txt
vendored
@ -1,37 +0,0 @@
|
|||||||
COPYRIGHT AND PERMISSION NOTICE
|
|
||||||
|
|
||||||
Copyright © 1991-2015 Unicode, Inc. All rights reserved.
|
|
||||||
Distributed under the Terms of Use in
|
|
||||||
https://www.unicode.org/copyright.html.
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of the Unicode data files and any associated documentation
|
|
||||||
(the "Data Files") or Unicode software and any associated documentation
|
|
||||||
(the "Software") to deal in the Data Files or Software
|
|
||||||
without restriction, including without limitation the rights to use,
|
|
||||||
copy, modify, merge, publish, distribute, and/or sell copies of
|
|
||||||
the Data Files or Software, and to permit persons to whom the Data Files
|
|
||||||
or Software are furnished to do so, provided that
|
|
||||||
(a) this copyright and permission notice appear with all copies
|
|
||||||
of the Data Files or Software,
|
|
||||||
(b) this copyright and permission notice appear in associated
|
|
||||||
documentation, and
|
|
||||||
(c) there is clear notice in each modified Data File or in the Software
|
|
||||||
as well as in the documentation associated with the Data File(s) or
|
|
||||||
Software that the data or software has been modified.
|
|
||||||
|
|
||||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
||||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
||||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
||||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
|
||||||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
|
||||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
|
||||||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
||||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
||||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
|
||||||
|
|
||||||
Except as contained in this notice, the name of a copyright holder
|
|
||||||
shall not be used in advertising or otherwise to promote the sale,
|
|
||||||
use or other dealings in these Data Files or Software without prior
|
|
||||||
written authorization of the copyright holder.
|
|
2621
src/unicode/EastAsianWidth.txt
vendored
2621
src/unicode/EastAsianWidth.txt
vendored
File diff suppressed because it is too large
Load Diff
34931
src/unicode/UnicodeData.txt
vendored
34931
src/unicode/UnicodeData.txt
vendored
File diff suppressed because it is too large
Load Diff
1320
src/unicode/emoji-data.txt
vendored
1320
src/unicode/emoji-data.txt
vendored
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user