vim-patch:8.2.1665: cannot do fuzzy string matching

Problem:    Cannot do fuzzy string matching.
Solution:   Add matchfuzzy(). (Yegappan Lakshmanan, closes vim/vim#6932)
635414dd2f

Adjust Test_matchfuzzy's 2nd assert to expect the last error thrown, as
v8.2.1183 hasn't been ported yet (to be honest, the error message is kinda weird
if the 2nd argument is not convertible to string). We can still port this fully
as porting v8.2.1183 would require removing this change to pass CI.
This commit is contained in:
Sean Dewar 2022-01-01 14:58:32 +00:00
parent f02a5a7bda
commit fba00b5e7e
No known key found for this signature in database
GPG Key ID: 08CC2C83AD41B581
5 changed files with 348 additions and 0 deletions

View File

@ -296,6 +296,7 @@ matcharg({nr}) List arguments of |:match|
matchdelete({id} [, {win}]) Number delete match identified by {id} matchdelete({id} [, {win}]) Number delete match identified by {id}
matchend({expr}, {pat}[, {start}[, {count}]]) matchend({expr}, {pat}[, {start}[, {count}]])
Number position where {pat} ends in {expr} Number position where {pat} ends in {expr}
matchfuzzy({list}, {str}) List fuzzy match {str} in {list}
matchlist({expr}, {pat}[, {start}[, {count}]]) matchlist({expr}, {pat}[, {start}[, {count}]])
List match and submatches of {pat} in {expr} List match and submatches of {pat} in {expr}
matchstr({expr}, {pat}[, {start}[, {count}]]) matchstr({expr}, {pat}[, {start}[, {count}]])
@ -4857,6 +4858,27 @@ matchend({expr}, {pat} [, {start} [, {count}]]) *matchend()*
Can also be used as a |method|: > Can also be used as a |method|: >
GetText()->matchend('word') GetText()->matchend('word')
matchfuzzy({list}, {str}) *matchfuzzy()*
Returns a list with all the strings in {list} that fuzzy
match {str}. The strings in the returned list are sorted
based on the matching score. {str} is treated as a literal
string and regular expression matching is NOT supported.
The maximum supported {str} length is 256.
If there are no matching strings or there is an error, then an
empty list is returned. If length of {str} is greater than
256, then returns an empty list.
Example: >
:echo matchfuzzy(["clay", "crow"], "cay")
< results in ["clay"]. >
:echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl")
< results in a list of buffer names fuzzy matching "ndl". >
:echo v:oldfiles->matchfuzzy("test")
< results in a list of file names fuzzy matching "test". >
:let l = readfile("buffer.c")->matchfuzzy("str")
< results in a list of lines in "buffer.c" fuzzy matching "str".
matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()* matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()*
Same as |match()|, but return a |List|. The first item in the Same as |match()|, but return a |List|. The first item in the
list is the matched string, same as what matchstr() would list is the matched string, same as what matchstr() would

View File

@ -608,6 +608,7 @@ String manipulation: *string-functions*
toupper() turn a string to uppercase toupper() turn a string to uppercase
match() position where a pattern matches in a string match() position where a pattern matches in a string
matchend() position where a pattern match ends in a string matchend() position where a pattern match ends in a string
matchfuzzy() fuzzy matches a string in a list of strings
matchstr() match of a pattern in a string matchstr() match of a pattern in a string
matchstrpos() match and positions of a pattern in a string matchstrpos() match and positions of a pattern in a string
matchlist() like matchstr() and also return submatches matchlist() like matchstr() and also return submatches

View File

@ -249,6 +249,7 @@ return {
matcharg={args=1, base=1}, matcharg={args=1, base=1},
matchdelete={args={1, 2}, base=1}, matchdelete={args={1, 2}, base=1},
matchend={args={2, 4}, base=1}, matchend={args={2, 4}, base=1},
matchfuzzy={args=2, base=1},
matchlist={args={2, 4}, base=1}, matchlist={args={2, 4}, base=1},
matchstr={args={2, 4}, base=1}, matchstr={args={2, 4}, base=1},
matchstrpos={args={2,4}, base=1}, matchstrpos={args={2,4}, base=1},

View File

@ -4764,6 +4764,304 @@ the_end:
restore_last_search_pattern(); restore_last_search_pattern();
} }
/// Fuzzy string matching
///
/// Ported from the lib_fts library authored by Forrest Smith.
/// https://github.com/forrestthewoods/lib_fts/tree/master/code
///
/// Blog describing the algorithm:
/// https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
///
/// Each matching string is assigned a score. The following factors are checked:
/// Matched letter
/// Unmatched letter
/// Consecutively matched letters
/// Proximity to start
/// Letter following a separator (space, underscore)
/// Uppercase letter following lowercase (aka CamelCase)
///
/// Matched letters are good. Unmatched letters are bad. Matching near the start
/// is good. Matching the first letter in the middle of a phrase is good.
/// Matching the uppercase letters in camel case entries is good.
///
/// The score assigned for each factor is explained below.
/// File paths are different from file names. File extensions may be ignorable.
/// Single words care about consecutive matches but not separators or camel
/// case.
/// Score starts at 0
/// Matched letter: +0 points
/// Unmatched letter: -1 point
/// Consecutive match bonus: +5 points
/// Separator bonus: +10 points
/// Camel case bonus: +10 points
/// Unmatched leading letter: -3 points (max: -9)
///
/// There is some nuance to this. Scores dont have an intrinsic meaning. The
/// score range isnt 0 to 100. Its roughly [-50, 50]. Longer words have a
/// lower minimum score due to unmatched letter penalty. Longer search patterns
/// have a higher maximum score due to match bonuses.
///
/// Separator and camel case bonus is worth a LOT. Consecutive matches are worth
/// quite a bit.
///
/// There is a penalty if you DONT match the first three letters. Which
/// effectively rewards matching near the start. However theres no difference
/// in matching between the middle and end.
///
/// There is not an explicit bonus for an exact match. Unmatched letters receive
/// a penalty. So shorter strings and closer matches are worth more.
typedef struct {
const listitem_T *item;
int score;
} fuzzyItem_T;
static bool fuzzy_match_recursive(const char_u *fuzpat, const char_u *str, int *const outScore,
const char_u *const strBegin, const char_u *const srcMatches,
char_u *const matches, const int maxMatches, int nextMatch,
int *const recursionCount, const int recursionLimit)
FUNC_ATTR_NONNULL_ARG(1, 2, 3, 4, 6, 9) FUNC_ATTR_WARN_UNUSED_RESULT
{
// Recursion params
bool recursiveMatch = false;
char_u bestRecursiveMatches[256];
int bestRecursiveScore = 0;
// Count recursions
(*recursionCount)++;
if (*recursionCount >= recursionLimit) {
return false;
}
// Detect end of strings
if (*fuzpat == '\0' || *str == '\0') {
return false;
}
// Loop through fuzpat and str looking for a match
bool first_match = true;
while (*fuzpat != '\0' && *str != '\0') {
// Found match
if (mb_tolower(*fuzpat) == mb_tolower(*str)) {
// Supplied matches buffer was too short
if (nextMatch >= maxMatches) {
return false;
}
// "Copy-on-Write" srcMatches into matches
if (first_match && srcMatches) {
memcpy(matches, srcMatches, nextMatch);
first_match = false;
}
// Recursive call that "skips" this match
char_u recursiveMatches[256];
int recursiveScore = 0;
if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, strBegin, matches,
recursiveMatches, sizeof(recursiveMatches), nextMatch,
recursionCount, recursionLimit)) {
// Pick best recursive score
if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
memcpy(bestRecursiveMatches, recursiveMatches, 256);
bestRecursiveScore = recursiveScore;
}
recursiveMatch = true;
}
// Advance
matches[nextMatch++] = (char_u)(str - strBegin);
fuzpat++;
}
str++;
}
// Determine if full fuzpat was matched
const bool matched = *fuzpat == '\0';
// Calculate score
if (matched) {
// bonus for adjacent matches
const int sequential_bonus = 15;
// bonus if match occurs after a separator
const int separator_bonus = 30;
// bonus if match is uppercase and prev is lower
const int camel_bonus = 30;
// bonus if the first letter is matched
const int first_letter_bonus = 15;
// penalty applied for every letter in str before the first match
const int leading_letter_penalty = -5;
// maximum penalty for leading letters
const int max_leading_letter_penalty = -15;
// penalty for every letter that doesn't matter
const int unmatched_letter_penalty = -1;
// Iterate str to end
while (*str != '\0') {
str++;
}
// Initialize score
*outScore = 100;
// Apply leading letter penalty
int penalty = leading_letter_penalty * matches[0];
if (penalty < max_leading_letter_penalty) {
penalty = max_leading_letter_penalty;
}
*outScore += penalty;
// Apply unmatched penalty
const int unmatched = (int)(str - strBegin) - nextMatch;
*outScore += unmatched_letter_penalty * unmatched;
// Apply ordering bonuses
for (int i = 0; i < nextMatch; i++) {
const char_u currIdx = matches[i];
if (i > 0) {
const char_u prevIdx = matches[i - 1];
// Sequential
if (currIdx == (prevIdx + 1)) {
*outScore += sequential_bonus;
}
}
// Check for bonuses based on neighbor character value
if (currIdx > 0) {
// Camel case
const char_u neighbor = strBegin[currIdx - 1];
const char_u curr = strBegin[currIdx];
if (islower(neighbor) && isupper(curr)) {
*outScore += camel_bonus;
}
// Separator
const bool neighborSeparator = neighbor == '_' || neighbor == ' ';
if (neighborSeparator) {
*outScore += separator_bonus;
}
} else {
// First letter
*outScore += first_letter_bonus;
}
}
}
// Return best result
if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) {
// Recursive score is better than "this"
memcpy(matches, bestRecursiveMatches, maxMatches);
*outScore = bestRecursiveScore;
return true;
} else if (matched) {
return true; // "this" score is better than recursive
}
return false; // no match
}
/// fuzzy_match()
///
/// Performs exhaustive search via recursion to find all possible matches and
/// match with highest score.
/// Scores values have no intrinsic meaning. Possible score range is not
/// normalized and varies with pattern.
/// Recursion is limited internally (default=10) to prevent degenerate cases
/// (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
/// Uses char_u for match indices. Therefore patterns are limited to 256
/// characters.
///
/// Returns true if fuzpat is found AND calculates a score.
static bool fuzzy_match(const char_u *const str, const char_u *const fuzpat, int *const outScore)
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
{
char_u matches[256];
int recursionCount = 0;
int recursionLimit = 10;
*outScore = 0;
return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, sizeof(matches), 0,
&recursionCount, recursionLimit);
}
/// Sort the fuzzy matches in the descending order of the match score.
static int fuzzy_item_compare(const void *const s1, const void *const s2)
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE
{
const int v1 = ((const fuzzyItem_T *)s1)->score;
const int v2 = ((const fuzzyItem_T *)s2)->score;
return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
}
/// Fuzzy search the string 'str' in 'strlist' and return the matching strings
/// in 'fmatchlist'.
static void match_fuzzy(const list_T *const strlist, const char_u *const str,
list_T *const fmatchlist)
FUNC_ATTR_NONNULL_ARG(2, 3)
{
const long len = tv_list_len(strlist);
if (len == 0) {
return;
}
fuzzyItem_T *const ptrs = xmalloc(sizeof(fuzzyItem_T) * len);
long i = 0;
bool found_match = false;
// For all the string items in strlist, get the fuzzy matching score
TV_LIST_ITER_CONST(strlist, li, {
ptrs[i].item = li;
ptrs[i].score = -9999;
// ignore non-string items in the list
const typval_T *const tv = TV_LIST_ITEM_TV(li);
if (tv->v_type == VAR_STRING && tv->vval.v_string != NULL) {
int score;
if (fuzzy_match(tv->vval.v_string, str, &score)) {
ptrs[i].score = score;
found_match = true;
}
}
i++;
});
if (found_match) {
// Sort the list by the descending order of the match score
qsort(ptrs, (size_t)len, sizeof(fuzzyItem_T), fuzzy_item_compare);
// Copy the matching strings with 'score != -9999' to the return list
for (i = 0; i < len; i++) {
if (ptrs[i].score == -9999) {
break;
}
const typval_T *const tv = TV_LIST_ITEM_TV(ptrs[i].item);
tv_list_append_string(fmatchlist, (const char *)tv->vval.v_string, -1);
}
}
xfree(ptrs);
}
/// "matchfuzzy()" function
void f_matchfuzzy(typval_T *argvars, typval_T *rettv, FunPtr fptr)
{
if (argvars[0].v_type != VAR_LIST) {
emsg(_(e_listreq));
return;
}
if (argvars[0].vval.v_list == NULL) {
return;
}
if (argvars[1].v_type != VAR_STRING || argvars[1].vval.v_string == NULL) {
semsg(_(e_invarg2), tv_get_string(&argvars[1]));
return;
}
match_fuzzy(argvars[0].vval.v_list, (const char_u *)tv_get_string(&argvars[1]),
tv_list_alloc_ret(rettv, kListLenUnknown));
}
/// Find identifiers or defines in included files. /// Find identifiers or defines in included files.
/// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase. /// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase.
/// ///

View File

@ -1731,6 +1731,32 @@ func Test_nr2char()
call assert_equal("\x80\xfc\b\xfd\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX", eval('"\<M-' .. nr2char(0x40000000) .. '>"')) call assert_equal("\x80\xfc\b\xfd\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX", eval('"\<M-' .. nr2char(0x40000000) .. '>"'))
endfunc endfunc
" Test for matchfuzzy()
func Test_matchfuzzy()
call assert_fails('call matchfuzzy(10, "abc")', 'E714:')
" Needs v8.2.1183.
" call assert_fails('call matchfuzzy(["abc"], [])', 'E730:')
call assert_fails('call matchfuzzy(["abc"], [])', 'E475:')
call assert_equal([], matchfuzzy([], 'abc'))
call assert_equal([], matchfuzzy(['abc'], ''))
call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac'))
call assert_equal([], matchfuzzy([10, 20], 'ac'))
call assert_equal(['abc'], matchfuzzy(['abc'], 'abc'))
call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra'))
call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa'))
call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one'))
call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo'))
call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo'))
call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa'))
call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257)))
%bw!
eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)})
let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl')
call assert_equal(1, len(l))
call assert_match('needle', l[0])
endfunc
" Test for getcurpos() and setpos() " Test for getcurpos() and setpos()
func Test_getcurpos_setpos() func Test_getcurpos_setpos()
new new