Merge pull request #2621 from TimKam/fix-stemming-removes-short-words-from-search-results

Fix: stemming removes short words from search results
This commit is contained in:
Takeshi KOMIYA 2016-07-12 14:05:41 +09:00 committed by GitHub
commit b58b6014c9
5 changed files with 34 additions and 6 deletions

View File

@ -386,14 +386,19 @@ class IndexBuilder(object):
_filter = self.lang.word_filter
for word in visitor.found_title_words:
word = stem(word)
if _filter(word):
stemmed_word = stem(word)
if _filter(stemmed_word):
self._title_mapping.setdefault(stemmed_word, set()).add(docname)
elif _filter(word): # stemmer must not remove words from search index
self._title_mapping.setdefault(word, set()).add(docname)
for word in visitor.found_words:
word = stem(word)
if word not in self._title_mapping and _filter(word):
self._mapping.setdefault(word, set()).add(docname)
stemmed_word = stem(word)
# again, stemmer must not remove words from search index
if not _filter(stemmed_word) and _filter(word):
stemmed_word = word
if stemmed_word not in self._title_mapping and _filter(stemmed_word):
self._mapping.setdefault(stemmed_word, set()).add(docname)
def context_for_searchtool(self):
return dict(

0
sphinx/search/test Normal file
View File

View File

@ -159,6 +159,10 @@ var Search = {
}
// stem the word
var word = stemmer.stemWord(tmp[i].toLowerCase());
// keep the original word if stemming would shorten it to fewer than three chars
if(word.length < 3 && tmp[i].length >= 3) {
word = tmp[i];
}
var toAppend;
// select the correct list
if (word[0] == '-') {

View File

@ -5,4 +5,10 @@ meta keywords
:keywords lang=en: findthiskey, thistoo, notgerman
:keywords: thisonetoo
:keywords lang=de: onlygerman, onlytoogerman
:description: thisnoteither
:description: thisnoteither
Stemmer
=======
zfs
findthisstemmedkey

View File

@ -92,3 +92,16 @@ def test_meta_keys_are_handled_for_language_de(app, status, warning):
assert is_registered_term(searchindex, 'onlygerman')
assert not is_registered_term(searchindex, 'notgerman')
assert is_registered_term(searchindex, 'onlytoogerman')
@with_app(testroot='search')
def test_stemmer_does_not_remove_short_words(app, status, warning):
    """Check that the short word 'zfs' is present in the built search index.

    Runs a full build and then looks for the literal text 'zfs' in the
    contents of ``searchindex.js`` inside the app's output directory.
    """
    app.builder.build_all()
    index_path = app.outdir / 'searchindex.js'
    index_text = index_path.text()
    assert 'zfs' in index_text
@with_app(testroot='search')
def test_stemmer(app, status, warning):
    """Check that a stemmed term is present in the built search index.

    Looks for the literal text 'findthisstemmedkei' in the contents of
    ``searchindex.js`` inside the app's output directory.
    NOTE(review): presumably this is the stemmed form of a word in the
    'search' test root's documents -- confirm against the fixture.
    """
    # Build explicitly so this test does not depend on another test having
    # already written searchindex.js into the same output directory.
    app.builder.build_all()
    searchindex = (app.outdir / 'searchindex.js').text()
    assert 'findthisstemmedkei' in searchindex