mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Convert `latin_terms` to a set (#12995)
This commit is contained in:
parent
dcd276d1d8
commit
705d5ddd9f
@ -146,6 +146,9 @@ Bugs fixed
|
||||
and ensure deterministic resolution of global toctree in parallel builds
|
||||
by choosing the lexicographically greatest parent document.
|
||||
Patch by A. Rafey Khan
|
||||
* #12995: Significantly improve performance when building the search index
|
||||
for Chinese languages.
|
||||
Patch by Adam Turner.
|
||||
|
||||
|
||||
Testing
|
||||
|
@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
|
||||
js_stemmer_code = js_porter_stemmer
|
||||
stopwords = english_stopwords
|
||||
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
|
||||
latin_terms: list[str] = []
|
||||
|
||||
def __init__(self, options: dict[str, str]) -> None:
|
||||
super().__init__(options)
|
||||
self.latin_terms: set[str] = set()
|
||||
|
||||
def init(self, options: dict[str, str]) -> None:
|
||||
if JIEBA:
|
||||
@ -238,12 +241,13 @@ class SearchChinese(SearchLanguage):
|
||||
self.stemmer = snowballstemmer.stemmer('english')
|
||||
|
||||
def split(self, input: str) -> list[str]:
|
||||
chinese: list[str] = []
|
||||
if JIEBA:
|
||||
chinese = list(jieba.cut_for_search(input))
|
||||
chinese: list[str] = list(jieba.cut_for_search(input))
|
||||
else:
|
||||
chinese = []
|
||||
|
||||
latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
|
||||
self.latin_terms.extend(latin1)
|
||||
self.latin_terms.update(latin1)
|
||||
return chinese + latin1
|
||||
|
||||
def word_filter(self, stemmed_word: str) -> bool:
|
||||
@ -255,7 +259,7 @@ class SearchChinese(SearchLanguage):
|
||||
# avoids some issues with acronyms
|
||||
stemmed = self.stemmer.stemWord(word.lower())
|
||||
should_not_be_stemmed = (
|
||||
word in self.latin_terms and len(word) >= 3 > len(stemmed)
|
||||
len(word) >= 3 > len(stemmed) and word in self.latin_terms
|
||||
) # fmt: skip
|
||||
if should_not_be_stemmed:
|
||||
return word.lower()
|
||||
|
Loading…
Reference in New Issue
Block a user