diff --git a/CHANGES b/CHANGES index 867cdc950..d3f8b0a0e 100644 --- a/CHANGES +++ b/CHANGES @@ -15,6 +15,8 @@ Features added Bugs fixed ---------- +* #5605 If the documentation language is set to Chinese, English words could not + be searched. Testing -------- diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py index 6c5b65d6b..3753bc990 100644 --- a/sphinx/search/zh.py +++ b/sphinx/search/zh.py @@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage): language_name = 'Chinese' js_stemmer_code = js_porter_stemmer stopwords = english_stopwords - latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]') + latin1_letters = re.compile(r'[a-zA-Z0-9_]+') + latin_terms = [] # type: List[unicode] def init(self, options): # type: (Dict) -> None @@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage): if JIEBA: chinese = list(jieba.cut_for_search(input)) - latin1 = self.latin1_letters.findall(input) + latin1 = \ + [term.strip() for term in self.latin1_letters.findall(input)] + self.latin_terms.extend(latin1) return chinese + latin1 def word_filter(self, stemmed_word): @@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage): def stem(self, word): # type: (unicode) -> unicode + + # Don't stem Latin words that are long enough to be relevant for search + # if not stemmed, but would be too short after being stemmed + # avoids some issues with acronyms + should_not_be_stemmed = ( + word in self.latin_terms and + len(word) >= 3 and + len(self.stemmer.stem(word.lower())) < 3 + ) + if should_not_be_stemmed: + return word.lower() return self.stemmer.stem(word.lower()) diff --git a/tests/roots/test-search/tocitem.rst b/tests/roots/test-search/tocitem.rst index f082abf65..5d99f0a66 100644 --- a/tests/roots/test-search/tocitem.rst +++ b/tests/roots/test-search/tocitem.rst @@ -8,3 +8,9 @@ textinheading ============= lorem ipsum + +可以查看 FAQ 模块中 Chinesetest 部分 + +模块中 CAS service部分 + +可以Chinesetesttwo查看 diff --git a/tests/test_search.py b/tests/test_search.py index 886151831..4c7eb8b21 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -240,3 +240,18 @@ def test_IndexBuilder_lookup(): # zh_CN index = IndexBuilder(env, 'zh_CN', {}, None) assert index.lang.lang == 'zh' + + +@pytest.mark.sphinx( + testroot='search', + confoverrides={'html_search_language': 'zh'}, + srcdir='search_zh' +) +def test_search_index_gen_zh(app, status, warning): + app.builder.build_all() + # jsdump fails if search language is 'zh'; hence we just get the text: + searchindex = (app.outdir / 'searchindex.js').text() + assert 'chinesetest ' not in searchindex + assert 'chinesetest' in searchindex + assert 'chinesetesttwo' in searchindex + assert 'cas' in searchindex