Convert `latin_terms` to a set (#12995)

Adam Turner, 2024-10-10 14:23:03 +01:00, committed by GitHub
parent dcd276d1d8
commit 705d5ddd9f
2 changed files with 12 additions and 5 deletions


@@ -146,6 +146,9 @@ Bugs fixed
   and ensure deterministic resolution of global toctree in parallel builds
   by choosing the lexicographically greatest parent document.
   Patch by A. Rafey Khan
+* #12995: Significantly improve performance when building the search index
+  for Chinese languages.
+  Patch by Adam Turner.
 
 Testing
 -------
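The speed-up claimed in the changelog entry above comes from how `stem()` uses `latin_terms`: it performs a membership test for every word being indexed, and `in` on a list scans linearly while `in` on a set is a constant-time hash lookup on average. A rough, hypothetical micro-benchmark (sizes and names are illustrative, not taken from the commit or from Sphinx) sketches the difference:

# Hypothetical micro-benchmark, not part of the commit: compare membership
# tests against a large list and an equivalent set.
import timeit

terms_list = [f'term{i}' for i in range(50_000)]   # stand-in for latin_terms
terms_set = set(terms_list)

list_time = timeit.timeit(lambda: 'term49999' in terms_list, number=1_000)
set_time = timeit.timeit(lambda: 'term49999' in terms_set, number=1_000)
print(f'list: {list_time:.4f}s  set: {set_time:.6f}s')  # set lookup is far faster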


@@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
     latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
-    latin_terms: list[str] = []
+
+    def __init__(self, options: dict[str, str]) -> None:
+        super().__init__(options)
+        self.latin_terms: set[str] = set()
 
     def init(self, options: dict[str, str]) -> None:
         if JIEBA:
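Beyond the type change, the hunk above also moves `latin_terms` from a mutable class attribute to an instance attribute created in `__init__`, so terms collected by one `SearchChinese` instance can no longer leak into another. A standalone sketch of that pitfall, with illustrative class names rather than Sphinx code:

# Illustrative only: a mutable class attribute is shared by every instance,
# while an attribute assigned in __init__ is private to each instance.
class SharedTerms:
    terms: list[str] = []              # single list shared class-wide

class PerInstanceTerms:
    def __init__(self) -> None:
        self.terms: set[str] = set()   # fresh set per instance

a, b = SharedTerms(), SharedTerms()
a.terms.append('svg')
print(b.terms)    # ['svg'] -- leaked into the other instance

c, d = PerInstanceTerms(), PerInstanceTerms()
c.terms.add('svg')
print(d.terms)    # set() -- isolated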
@@ -238,12 +241,13 @@ class SearchChinese(SearchLanguage):
         self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> list[str]:
-        chinese: list[str] = []
         if JIEBA:
-            chinese = list(jieba.cut_for_search(input))
+            chinese: list[str] = list(jieba.cut_for_search(input))
+        else:
+            chinese = []
 
         latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
-        self.latin_terms.extend(latin1)
+        self.latin_terms.update(latin1)
         return chinese + latin1
 
     def word_filter(self, stemmed_word: str) -> bool:
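With `latin_terms` now a set, `split()` switches from `list.extend` to `set.update`; both accept any iterable, but the set version also deduplicates Latin terms that recur across documents, keeping the collection small. A toy sketch of the same pattern, with made-up data:

# Illustrative only: update() deduplicates, extend() keeps accumulating.
collected_list: list[str] = []
collected_set: set[str] = set()
for doc_terms in (['numpy', 'api'], ['api', 'config'], ['numpy']):
    collected_list.extend(doc_terms)
    collected_set.update(doc_terms)
print(collected_list)  # ['numpy', 'api', 'api', 'config', 'numpy']
print(collected_set)   # {'numpy', 'api', 'config'} (ordering may vary)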
@@ -255,7 +259,7 @@
         # avoids some issues with acronyms
         stemmed = self.stemmer.stemWord(word.lower())
         should_not_be_stemmed = (
-            word in self.latin_terms and len(word) >= 3 > len(stemmed)
+            len(word) >= 3 > len(stemmed) and word in self.latin_terms
         )  # fmt: skip
         if should_not_be_stemmed:
             return word.lower()
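The final hunk swaps the operands of `and` so the cheap chained comparison `len(word) >= 3 > len(stemmed)` short-circuits before the membership lookup runs at all. A minimal standalone sketch of the resulting check, where `fake_stem()` is a crude stand-in for the Snowball stemmer Sphinx actually uses:

# Standalone sketch of the post-change logic; fake_stem() and the sample
# terms are illustrative, not Sphinx code.
latin_terms = {'AWS', 'CSS'}          # Latin tokens gathered by split()

def fake_stem(word: str) -> str:
    return word.rstrip('s')           # crude over-stemming stand-in

def stem(word: str) -> str:
    stemmed = fake_stem(word.lower())
    # Length check first (cheap), set lookup second; both are fast now.
    if len(word) >= 3 > len(stemmed) and word in latin_terms:
        return word.lower()
    return stemmed

print(stem('AWS'))     # 'aws'  -- kept as-is instead of being mangled to 'aw'
print(stem('buses'))   # 'buse' -- not a collected Latin term, stemmed normally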