#5605 fix Chinese search index (#5611)

generate search index for Latin words correctly if search language is Chinese
This commit is contained in:
Timotheus Kampik 2018-12-25 18:41:52 +01:00 committed by GitHub
parent 920aafaee6
commit 221614654f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 2 deletions

View File

@ -15,6 +15,8 @@ Features added
Bugs fixed
----------
* #5605: If the documentation language is set to Chinese, English words could
not be searched.
Testing
--------

View File

@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
language_name = 'Chinese'
js_stemmer_code = js_porter_stemmer
stopwords = english_stopwords
latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
latin_terms = [] # type: List[unicode]
def init(self, options):
# type: (Dict) -> None
@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
if JIEBA:
chinese = list(jieba.cut_for_search(input))
latin1 = self.latin1_letters.findall(input)
latin1 = \
[term.strip() for term in self.latin1_letters.findall(input)]
self.latin_terms.extend(latin1)
return chinese + latin1
def word_filter(self, stemmed_word):
@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
def stem(self, word):
    # type: (unicode) -> unicode
    """Return the lower-cased search stem for *word*.

    Latin terms recorded in ``self.latin_terms`` that are long enough to be
    relevant for search (three characters or more) are returned unstemmed
    when stemming would shrink them below three characters.  This avoids
    some issues with acronyms.

    :param word: the token to stem
    :return: the stemmed token, or the merely lower-cased token when the
             stem would be too short
    """
    lowered = word.lower()
    # Stem exactly once and reuse the result for both the length check and
    # the return value.
    stemmed = self.stemmer.stem(lowered)
    keep_unstemmed = (
        len(word) >= 3 and
        len(stemmed) < 3 and
        word in self.latin_terms
    )
    if keep_unstemmed:
        return lowered
    return stemmed

View File

@ -8,3 +8,9 @@ textinheading
=============
lorem ipsum
可以查看 FAQ 模块中 Chinesetest 部分
模块中 CAS service部分
可以Chinesetesttwo查看

View File

@ -240,3 +240,18 @@ def test_IndexBuilder_lookup():
# zh_CN
index = IndexBuilder(env, 'zh_CN', {}, None)
assert index.lang.lang == 'zh'
@pytest.mark.sphinx(
    testroot='search',
    srcdir='search_zh',
    confoverrides={'html_search_language': 'zh'}
)
def test_search_index_gen_zh(app, status, warning):
    """Build with the Chinese search language and check Latin index terms."""
    app.builder.build_all()
    # jsdump fails if search language is 'zh', so the generated file is
    # inspected as plain text instead of being parsed.
    index_text = (app.outdir / 'searchindex.js').text()
    # A term must not be stored with trailing whitespace.
    assert 'chinesetest ' not in index_text
    for expected_term in ('chinesetest', 'chinesetesttwo', 'cas'):
        assert expected_term in index_text