#5605 fix Chinese search index (#5611)

Generate the search index for Latin words correctly when the search language is Chinese.
2025-02-25 18:55:22 -06:00 · 2018-12-25 18:41:52 +01:00 · 2018-12-25 18:41:52 +01:00 · 221614654f
commit 221614654f
parent 920aafaee6
4 changed files with 39 additions and 2 deletions
--- a/2
+++ b/2
@@ -15,6 +15,8 @@ Features added

 Bugs fixed
 ----------
+* #5605: with the documentation language set to Chinese, English words could
+  not be searched.

 Testing
 --------
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
    language_name = 'Chinese'
    js_stemmer_code = js_porter_stemmer
    stopwords = english_stopwords
-    latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
+    latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
+    latin_terms = []  # type: List[unicode]

    def init(self, options):
        # type: (Dict) -> None
@@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
        if JIEBA:
            chinese = list(jieba.cut_for_search(input))

-        latin1 = self.latin1_letters.findall(input)
+        latin1 = \
+            [term.strip() for term in self.latin1_letters.findall(input)]
+        self.latin_terms.extend(latin1)
        return chinese + latin1

    def word_filter(self, stemmed_word):
@@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):

    def stem(self, word):
        # type: (unicode) -> unicode
+
+        # Don't stem Latin words that are long enough to be relevant for search
+        # when left unstemmed but would become too short once stemmed; this
+        # avoids some issues with acronyms.
+        should_not_be_stemmed = (
+            word in self.latin_terms and
+            len(word) >= 3 and
+            len(self.stemmer.stem(word.lower())) < 3
+        )
+        if should_not_be_stemmed:
+            return word.lower()
        return self.stemmer.stem(word.lower())
--- a/tests/roots/test-search/tocitem.rst
+++ b/tests/roots/test-search/tocitem.rst
@@ -8,3 +8,9 @@ textinheading
 =============

 lorem ipsum
+
+可以查看 FAQ 模块中 Chinesetest 部分
+
+模块中 CAS service部分
+
+可以Chinesetesttwo查看
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -240,3 +240,18 @@ def test_IndexBuilder_lookup():
    # zh_CN
    index = IndexBuilder(env, 'zh_CN', {}, None)
    assert index.lang.lang == 'zh'
+
+
+@pytest.mark.sphinx(
+    testroot='search',
+    confoverrides={'html_search_language': 'zh'},
+    srcdir='search_zh'
+)
+def test_search_index_gen_zh(app, status, warning):
+    app.builder.build_all()
+    # jsdump fails if search language is 'zh'; hence we just get the text:
+    searchindex = (app.outdir / 'searchindex.js').text()
+    assert 'chinesetest ' not in searchindex
+    assert 'chinesetest' in searchindex
+    assert 'chinesetesttwo' in searchindex
+    assert 'cas' in searchindex