mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
generate search index for Latin words correctly if search language is Chinese
This commit is contained in:
parent
920aafaee6
commit
221614654f
2
CHANGES
2
CHANGES
@ -15,6 +15,8 @@ Features added
|
||||
|
||||
Bugs fixed
|
||||
----------
|
||||
* #5605: If the documentation language is set to Chinese, English words could not
|
||||
be searched.
|
||||
|
||||
Testing
|
||||
--------
|
||||
|
@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
|
||||
language_name = 'Chinese'
|
||||
js_stemmer_code = js_porter_stemmer
|
||||
stopwords = english_stopwords
|
||||
latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
|
||||
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
|
||||
latin_terms = [] # type: List[unicode]
|
||||
|
||||
def init(self, options):
|
||||
# type: (Dict) -> None
|
||||
@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
|
||||
if JIEBA:
|
||||
chinese = list(jieba.cut_for_search(input))
|
||||
|
||||
latin1 = self.latin1_letters.findall(input)
|
||||
latin1 = \
|
||||
[term.strip() for term in self.latin1_letters.findall(input)]
|
||||
self.latin_terms.extend(latin1)
|
||||
return chinese + latin1
|
||||
|
||||
def word_filter(self, stemmed_word):
|
||||
@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
|
||||
|
||||
def stem(self, word):
    # type: (unicode) -> unicode
    """Return the lower-cased, stemmed form of *word* for the search index.

    A term recorded in ``self.latin_terms`` that is at least three
    characters long is returned lower-cased but unstemmed when stemming
    would shrink it below three characters. This keeps short Latin words
    and acronyms from being reduced to a stub.

    :param word: the term to normalise
    :returns: the lower-cased term, stemmed unless the exception above applies
    """
    lowered = word.lower()
    # Stem once and reuse the result for both the length check and the
    # return value instead of running the stemmer twice.
    stemmed = self.stemmer.stem(lowered)

    # Cheap length comparisons come first; the membership test scans
    # ``self.latin_terms`` and is only needed when both of them hold.
    keep_unstemmed = (
        len(word) >= 3 and
        len(stemmed) < 3 and
        word in self.latin_terms
    )
    if keep_unstemmed:
        return lowered
    return stemmed
|
||||
|
@ -8,3 +8,9 @@ textinheading
|
||||
=============
|
||||
|
||||
lorem ipsum
|
||||
|
||||
可以查看 FAQ 模块中 Chinesetest 部分
|
||||
|
||||
模块中 CAS service部分
|
||||
|
||||
可以Chinesetesttwo查看
|
||||
|
@ -240,3 +240,18 @@ def test_IndexBuilder_lookup():
|
||||
# zh_CN
|
||||
index = IndexBuilder(env, 'zh_CN', {}, None)
|
||||
assert index.lang.lang == 'zh'
|
||||
|
||||
|
||||
@pytest.mark.sphinx(testroot='search',
                    confoverrides={'html_search_language': 'zh'},
                    srcdir='search_zh')
def test_search_index_gen_zh(app, status, warning):
    """Build the 'search' test root with the Chinese search language and
    check that the Latin terms from the document appear in the index.
    """
    app.builder.build_all()

    # jsdump fails if the search language is 'zh', so the raw text of the
    # generated index is inspected instead of a parsed structure.
    index_text = (app.outdir / 'searchindex.js').text()

    # Presumably guards against a Latin term being indexed together with
    # the whitespace that follows it in the source document.
    assert 'chinesetest ' not in index_text

    for expected_term in ('chinesetest', 'chinesetesttwo', 'cas'):
        assert expected_term in index_text
|
||||
|
Loading…
Reference in New Issue
Block a user