Convert `latin_terms` to a set (#12995)

Adam Turner, 2024-10-10 14:23:03 +01:00, committed by GitHub
parent dcd276d1d8
commit 705d5ddd9f
2 changed files with 12 additions and 5 deletions


@@ -146,6 +146,9 @@ Bugs fixed
   and ensure deterministic resolution of global toctree in parallel builds
   by choosing the lexicographically greatest parent document.
   Patch by A. Rafey Khan
+* #12995: Significantly improve performance when building the search index
+  for Chinese languages.
+  Patch by Adam Turner.
 
 Testing
 -------
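The speed-up claimed in the changelog entry above comes from how `stem()` uses `latin_terms`: it performs a membership test for every word being indexed, and `in` on a list scans linearly while `in` on a set is a constant-time hash lookup on average. A rough, hypothetical micro-benchmark (sizes and names are illustrative, not taken from the commit or from Sphinx) sketches the difference:

# Hypothetical micro-benchmark, not part of the commit: compare membership
# tests against a large list and an equivalent set.
import timeit

terms_list = [f'term{i}' for i in range(50_000)]   # stand-in for latin_terms
terms_set = set(terms_list)

list_time = timeit.timeit(lambda: 'term49999' in terms_list, number=1_000)
set_time = timeit.timeit(lambda: 'term49999' in terms_set, number=1_000)
print(f'list: {list_time:.4f}s  set: {set_time:.6f}s')  # set lookup is far faster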


@@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
     latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
-    latin_terms: list[str] = []
+
+    def __init__(self, options: dict[str, str]) -> None:
+        super().__init__(options)
+        self.latin_terms: set[str] = set()
 
     def init(self, options: dict[str, str]) -> None:
         if JIEBA:
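Beyond the type change, the hunk above also moves `latin_terms` from a mutable class attribute to an instance attribute created in `__init__`, so terms collected by one `SearchChinese` instance can no longer leak into another. A standalone sketch of that pitfall, with illustrative class names rather than Sphinx code:

# Illustrative only: a mutable class attribute is shared by every instance,
# while an attribute assigned in __init__ is private to each instance.
class SharedTerms:
    terms: list[str] = []              # single list shared class-wide

class PerInstanceTerms:
    def __init__(self) -> None:
        self.terms: set[str] = set()   # fresh set per instance

a, b = SharedTerms(), SharedTerms()
a.terms.append('svg')
print(b.terms)    # ['svg'] -- leaked into the other instance

c, d = PerInstanceTerms(), PerInstanceTerms()
c.terms.add('svg')
print(d.terms)    # set() -- isolated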
@@ -238,12 +241,13 @@ class SearchChinese(SearchLanguage):
         self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> list[str]:
-        chinese: list[str] = []
         if JIEBA:
-            chinese = list(jieba.cut_for_search(input))
+            chinese: list[str] = list(jieba.cut_for_search(input))
+        else:
+            chinese = []
 
         latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
-        self.latin_terms.extend(latin1)
+        self.latin_terms.update(latin1)
         return chinese + latin1
 
     def word_filter(self, stemmed_word: str) -> bool:
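With `latin_terms` now a set, `split()` switches from `list.extend` to `set.update`; both accept any iterable, but the set version also deduplicates Latin terms that recur across documents, keeping the collection small. A toy sketch of the same pattern, with made-up data:

# Illustrative only: update() deduplicates, extend() keeps accumulating.
collected_list: list[str] = []
collected_set: set[str] = set()
for doc_terms in (['numpy', 'api'], ['api', 'config'], ['numpy']):
    collected_list.extend(doc_terms)
    collected_set.update(doc_terms)
print(collected_list)  # ['numpy', 'api', 'api', 'config', 'numpy']
print(collected_set)   # {'numpy', 'api', 'config'} (ordering may vary)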
@@ -255,7 +259,7 @@
         # avoids some issues with acronyms
         stemmed = self.stemmer.stemWord(word.lower())
         should_not_be_stemmed = (
-            word in self.latin_terms and len(word) >= 3 > len(stemmed)
+            len(word) >= 3 > len(stemmed) and word in self.latin_terms
         )  # fmt: skip
         if should_not_be_stemmed:
             return word.lower()
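The final hunk swaps the operands of `and` so the cheap chained comparison `len(word) >= 3 > len(stemmed)` short-circuits before the membership lookup runs at all. A minimal standalone sketch of the resulting check, where `fake_stem()` is a crude stand-in for the Snowball stemmer Sphinx actually uses:

# Standalone sketch of the post-change logic; fake_stem() and the sample
# terms are illustrative, not Sphinx code.
latin_terms = {'AWS', 'CSS'}          # Latin tokens gathered by split()

def fake_stem(word: str) -> str:
    return word.rstrip('s')           # crude over-stemming stand-in

def stem(word: str) -> str:
    stemmed = fake_stem(word.lower())
    # Length check first (cheap), set lookup second; both are fast now.
    if len(word) >= 3 > len(stemmed) and word in latin_terms:
        return word.lower()
    return stemmed

print(stem('AWS'))     # 'aws'  -- kept as-is instead of being mangled to 'aw'
print(stem('buses'))   # 'buse' -- not a collected Latin term, stemmed normally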