mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
generate search index for Latin words correctly if search language is Chinese
This commit is contained in:
parent
920aafaee6
commit
221614654f
2
CHANGES
2
CHANGES
@ -15,6 +15,8 @@ Features added
|
|||||||
|
|
||||||
Bugs fixed
|
Bugs fixed
|
||||||
----------
|
----------
|
||||||
|
* #5605: If the documentation language is set to Chinese, English words could not
|
||||||
|
be searched.
|
||||||
|
|
||||||
Testing
|
Testing
|
||||||
--------
|
--------
|
||||||
|
@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
|
|||||||
language_name = 'Chinese'
|
language_name = 'Chinese'
|
||||||
js_stemmer_code = js_porter_stemmer
|
js_stemmer_code = js_porter_stemmer
|
||||||
stopwords = english_stopwords
|
stopwords = english_stopwords
|
||||||
latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
|
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
|
||||||
|
latin_terms = [] # type: List[unicode]
|
||||||
|
|
||||||
def init(self, options):
|
def init(self, options):
|
||||||
# type: (Dict) -> None
|
# type: (Dict) -> None
|
||||||
@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
|
|||||||
if JIEBA:
|
if JIEBA:
|
||||||
chinese = list(jieba.cut_for_search(input))
|
chinese = list(jieba.cut_for_search(input))
|
||||||
|
|
||||||
latin1 = self.latin1_letters.findall(input)
|
latin1 = \
|
||||||
|
[term.strip() for term in self.latin1_letters.findall(input)]
|
||||||
|
self.latin_terms.extend(latin1)
|
||||||
return chinese + latin1
|
return chinese + latin1
|
||||||
|
|
||||||
def word_filter(self, stemmed_word):
|
def word_filter(self, stemmed_word):
|
||||||
@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
|
|||||||
|
|
||||||
def stem(self, word):
|
def stem(self, word):
|
||||||
# type: (unicode) -> unicode
|
# type: (unicode) -> unicode
|
||||||
|
|
||||||
|
# Don't stem Latin words that are long enough to be relevant for search
|
||||||
|
# if not stemmed, but would be too short after being stemmed
|
||||||
|
# (this avoids some issues with acronyms)
|
||||||
|
should_not_be_stemmed = (
|
||||||
|
word in self.latin_terms and
|
||||||
|
len(word) >= 3 and
|
||||||
|
len(self.stemmer.stem(word.lower())) < 3
|
||||||
|
)
|
||||||
|
if should_not_be_stemmed:
|
||||||
|
return word.lower()
|
||||||
return self.stemmer.stem(word.lower())
|
return self.stemmer.stem(word.lower())
|
||||||
|
@ -8,3 +8,9 @@ textinheading
|
|||||||
=============
|
=============
|
||||||
|
|
||||||
lorem ipsum
|
lorem ipsum
|
||||||
|
|
||||||
|
可以查看 FAQ 模块中 Chinesetest 部分
|
||||||
|
|
||||||
|
模块中 CAS service部分
|
||||||
|
|
||||||
|
可以Chinesetesttwo查看
|
||||||
|
@ -240,3 +240,18 @@ def test_IndexBuilder_lookup():
|
|||||||
# zh_CN
|
# zh_CN
|
||||||
index = IndexBuilder(env, 'zh_CN', {}, None)
|
index = IndexBuilder(env, 'zh_CN', {}, None)
|
||||||
assert index.lang.lang == 'zh'
|
assert index.lang.lang == 'zh'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.sphinx(
|
||||||
|
testroot='search',
|
||||||
|
confoverrides={'html_search_language': 'zh'},
|
||||||
|
srcdir='search_zh'
|
||||||
|
)
|
||||||
|
def test_search_index_gen_zh(app, status, warning):
|
||||||
|
app.builder.build_all()
|
||||||
|
# jsdump fails if search language is 'zh'; hence we just get the text:
|
||||||
|
searchindex = (app.outdir / 'searchindex.js').text()
|
||||||
|
assert 'chinesetest ' not in searchindex
|
||||||
|
assert 'chinesetest' in searchindex
|
||||||
|
assert 'chinesetesttwo' in searchindex
|
||||||
|
assert 'cas' in searchindex
|
||||||
|
Loading…
Reference in New Issue
Block a user