Enable automatic formatting for `sphinx/search/` (#12967)

Adam Turner 2024-10-04 16:28:02 +01:00 committed by GitHub
parent e1c5f6d314
commit 81c6f1e8cc
GPG Key ID: B5690EEEBB952194
19 changed files with 187 additions and 145 deletions

View File

@ -7,7 +7,7 @@ extend-exclude = [
"tests/js/roots/*",
"build/*",
"doc/_build/*",
"sphinx/search/*",
# "sphinx/search/*",
"doc/usage/extensions/example*.py",
]
@ -411,6 +411,8 @@ select = [
"sphinx/ext/autodoc/importer.py" = ["D402"]
"sphinx/util/requests.py" = ["D402"]
"sphinx/search/*" = ["E501"]
"tests/*" = [
"E501",
"ANN", # tests don't need annotations
@ -475,7 +477,6 @@ exclude = [
"sphinx/ext/todo.py",
"sphinx/ext/viewcode.py",
"sphinx/registry.py",
"sphinx/search/*",
"sphinx/testing/*",
"sphinx/transforms/*",
"sphinx/writers/*",

View File

@ -1,4 +1,5 @@
"""Create a full-text search index for offline search."""
from __future__ import annotations
import dataclasses
@ -15,12 +16,13 @@ from docutils import nodes
from docutils.nodes import Element, Node
from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment
from sphinx.util.index_entries import split_index_msg
if TYPE_CHECKING:
from collections.abc import Iterable
from sphinx.environment import BuildEnvironment
class SearchLanguage:
"""
@ -52,10 +54,11 @@ class SearchLanguage:
This class is used to preprocess search word which Sphinx HTML readers
type, before searching index. Default implementation does nothing.
"""
lang: str = ''
language_name: str = ''
stopwords: set[str] = set()
js_splitter_code: str = ""
js_splitter_code: str = ''
js_stemmer_rawcode: str = ''
js_stemmer_code = """
/**
@ -105,16 +108,14 @@ var Stemmer = function() {
Return true if the target word should be registered in the search index.
This method is called after stemming.
"""
return (
len(word) == 0 or not (
((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
(ord(word[0]) < 256 and (
word in self.stopwords
))))
return len(word) == 0 or not (
((len(word) < 3) and (12353 < ord(word[0]) < 12436))
or (ord(word[0]) < 256 and (word in self.stopwords))
)
# SearchEnglish imported after SearchLanguage is defined due to circular import
from sphinx.search.en import SearchEnglish
from sphinx.search.en import SearchEnglish # NoQA: E402
def parse_stop_word(source: str) -> set[str]:
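The language modules changed later in this commit pass parse_stop_word a Snowball-style stop-word list in which everything after a '|' on a line is a comment. The function body is not part of this hunk; a minimal sketch consistent with that format (an assumption, not the verbatim implementation) would be:

# Sketch of a Snowball stop-word parser, assuming the '| comment' convention
# visible in the stop-word strings further down in this commit.
def _parse_stop_word_sketch(source: str) -> set[str]:
    result: set[str] = set()
    for line in source.splitlines():
        line = line.split('|', 1)[0]  # drop the '| ...' comment, if any
        result.update(line.split())   # keep the whitespace-separated words
    return result

assert _parse_stop_word_sketch('og | and\ni | in\n') == {'og', 'i'}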
@ -165,10 +166,10 @@ class _JavaScriptIndex:
return self.PREFIX + json.dumps(data, sort_keys=True) + self.SUFFIX
def loads(self, s: str) -> Any:
data = s[len(self.PREFIX):-len(self.SUFFIX)]
if not data or not s.startswith(self.PREFIX) or not \
s.endswith(self.SUFFIX):
raise ValueError('invalid data')
data = s[len(self.PREFIX) : -len(self.SUFFIX)]
if not data or not s.startswith(self.PREFIX) or not s.endswith(self.SUFFIX):
msg = 'invalid data'
raise ValueError(msg)
return json.loads(data)
def dump(self, data: Any, f: IO[str]) -> None:
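For context, _JavaScriptIndex wraps the frozen index in a JavaScript call so that searchindex.js can be loaded directly by the browser, and loads strips that wrapper again. A small round-trip illustration follows; the PREFIX and SUFFIX values are not shown in this hunk and are assumptions here:

# Illustrative round trip through a _JavaScriptIndex-like wrapper.
# PREFIX and SUFFIX are assumed values for this sketch, not taken from the diff.
import json

PREFIX = 'Search.setIndex('
SUFFIX = ')'

def _dumps(data):
    return PREFIX + json.dumps(data, sort_keys=True) + SUFFIX

def _loads(s):
    data = s[len(PREFIX) : -len(SUFFIX)]
    if not data or not s.startswith(PREFIX) or not s.endswith(SUFFIX):
        msg = 'invalid data'
        raise ValueError(msg)
    return json.loads(data)

payload = _dumps({'docnames': ['index'], 'envversion': 62})
assert _loads(payload) == {'docnames': ['index'], 'envversion': 62}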
@ -187,9 +188,8 @@ def _is_meta_keywords(
) -> bool:
if node.get('name') == 'keywords':
meta_lang = node.get('lang')
if meta_lang is None: # lang not specified
return True
elif meta_lang == lang: # matched to html_search_language
if meta_lang is None or meta_lang == lang:
# lang not specified or matched to html_search_language
return True
return False
@ -222,8 +222,18 @@ class WordCollector(nodes.NodeVisitor):
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'<style.*?</style>', '', node.astext(), flags=re.IGNORECASE|re.DOTALL)
nodetext = re.sub(r'<script.*?</script>', '', nodetext, flags=re.IGNORECASE|re.DOTALL)
nodetext = re.sub(
r'<style.*?</style>',
'',
node.astext(),
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(
r'<script.*?</script>',
'',
nodetext,
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
self.found_words.extend(self.lang.split(nodetext))
raise nodes.SkipNode
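The comment above describes the deliberately rough approach: drop <style> and <script> blocks, strip every remaining tag, and split whatever text is left. A small self-contained illustration of that pipeline on a made-up raw HTML fragment:

# Demonstration of the tag-stripping pipeline used for raw HTML nodes,
# applied to a made-up fragment (not taken from the test suite).
import re

raw = '<style>p {color: red}</style><p>Sphinx <b>search</b> demo</p><script>x()</script>'
text = re.sub(r'<style.*?</style>', '', raw, flags=re.IGNORECASE | re.DOTALL)
text = re.sub(r'<script.*?</script>', '', text, flags=re.IGNORECASE | re.DOTALL)
text = re.sub(r'<[^<]+?>', '', text)
print(text.split())  # ['Sphinx', 'search', 'demo']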
@ -245,12 +255,15 @@ class IndexBuilder:
Helper class that creates a search index based on the doctrees
passed to the `feed` method.
"""
formats = {
'json': json,
'pickle': pickle
'json': json,
'pickle': pickle,
}
def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
def __init__(
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
) -> None:
self.env = env
# docname -> title
self._titles: dict[str, str | None] = env._search_index_titles
@ -261,9 +274,13 @@ class IndexBuilder:
# stemmed words in titles -> set(docname)
self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
# docname -> all titles in document
self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles
self._all_titles: dict[str, list[tuple[str, str | None]]] = (
env._search_index_all_titles
)
# docname -> list(index entry)
self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
self._index_entries: dict[str, list[tuple[str, str, str]]] = (
env._search_index_index_entries
)
# objtype -> index
self._objtypes: dict[tuple[str, str], int] = env._search_index_objtypes
# objtype index -> (domain, type, objname (localized))
@ -290,7 +307,7 @@ class IndexBuilder:
self.js_scorer_code = fp.read().decode()
else:
self.js_scorer_code = ''
self.js_splitter_code = ""
self.js_splitter_code = ''
def load(self, stream: IO, format: Any) -> None:
"""Reconstruct from frozen data."""
@ -298,15 +315,15 @@ class IndexBuilder:
format = self.formats[format]
frozen = format.load(stream)
# if an old index is present, we treat it as not existing.
if not isinstance(frozen, dict) or \
frozen.get('envversion') != self.env.version:
raise ValueError('old format')
if not isinstance(frozen, dict) or frozen.get('envversion') != self.env.version:
msg = 'old format'
raise ValueError(msg)
index2fn = frozen['docnames']
self._filenames = dict(zip(index2fn, frozen['filenames']))
self._titles = dict(zip(index2fn, frozen['titles']))
self._filenames = dict(zip(index2fn, frozen['filenames'], strict=True))
self._titles = dict(zip(index2fn, frozen['titles'], strict=True))
self._all_titles = {}
for docname in self._titles.keys():
for docname in self._titles:
self._all_titles[docname] = []
for title, doc_tuples in frozen['alltitles'].items():
for doc, titleid in doc_tuples:
@ -331,8 +348,9 @@ class IndexBuilder:
format = self.formats[format]
format.dump(self.freeze(), stream)
def get_objects(self, fn2index: dict[str, int]
) -> dict[str, list[tuple[int, int, int, str, str]]]:
def get_objects(
self, fn2index: dict[str, int]
) -> dict[str, list[tuple[int, int, int, str, str]]]:
rv: dict[str, list[tuple[int, int, int, str, str]]] = {}
otypes = self._objtypes
onames = self._objnames
@ -355,8 +373,11 @@ class IndexBuilder:
otype = domain.object_types.get(type)
if otype:
# use str() to fire translation proxies
onames[typeindex] = (domain.name, type,
str(domain.get_type_name(otype)))
onames[typeindex] = (
domain.name,
type,
str(domain.get_type_name(otype)),
)
else:
onames[typeindex] = (domain.name, type, type)
if anchor == fullname:
@ -368,7 +389,9 @@ class IndexBuilder:
plist.append((fn2index[docname], typeindex, prio, shortanchor, name))
return rv
def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
def get_terms(
self, fn2index: dict[str, int]
) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
"""
Return a mapping of document and title terms to their corresponding sorted document IDs.
@ -377,10 +400,10 @@ class IndexBuilder:
of integers.
"""
rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {})
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping), strict=True):
for k, v in mapping.items():
if len(v) == 1:
fn, = v
(fn,) = v
if fn in fn2index:
rv[k] = fn2index[fn]
else:
@ -389,7 +412,7 @@ class IndexBuilder:
def freeze(self) -> dict[str, Any]:
"""Create a usable data structure for serializing."""
docnames, titles = zip(*sorted(self._titles.items()))
docnames, titles = zip(*sorted(self._titles.items()), strict=True)
filenames = [self._filenames.get(docname) for docname in docnames]
fn2index = {f: i for (i, f) in enumerate(docnames)}
terms, title_terms = self.get_terms(fn2index)
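Per the get_terms docstring above, a term that occurs in exactly one document maps to a single document index, while a term that occurs in several maps to a sorted list of indices. Purely illustrative values for the (terms, title_terms) pair consumed by freeze:

# Hypothetical shape of the two mappings returned by get_terms(fn2index):
terms = {
    'search': 0,       # term found only in the document with index 0
    'sphinx': [0, 2],  # term found in the documents with indices 0 and 2
}
title_terms = {'index': 1}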
@ -406,15 +429,28 @@ class IndexBuilder:
index_entries: dict[str, list[tuple[int, str, bool]]] = {}
for docname, entries in self._index_entries.items():
for entry, entry_id, main_entry in entries:
index_entries.setdefault(entry.lower(), []).append((fn2index[docname], entry_id, main_entry == "main"))
index_entries.setdefault(entry.lower(), []).append((
fn2index[docname],
entry_id,
main_entry == 'main',
))
return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version,
alltitles=alltitles, indexentries=index_entries)
return {
'docnames': docnames,
'filenames': filenames,
'titles': titles,
'terms': terms,
'objects': objects,
'objtypes': objtypes,
'objnames': objnames,
'titleterms': title_terms,
'envversion': self.env.version,
'alltitles': alltitles,
'indexentries': index_entries,
}
def label(self) -> str:
return f"{self.lang.language_name} (code: {self.lang.lang})"
return f'{self.lang.language_name} (code: {self.lang.lang})'
def prune(self, docnames: Iterable[str]) -> None:
"""Remove data for all docnames not in the list."""
@ -434,7 +470,9 @@ class IndexBuilder:
for wordnames in self._title_mapping.values():
wordnames.intersection_update(docnames)
def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
def feed(
self, docname: str, filename: str, title: str, doctree: nodes.document
) -> None:
"""Feed a doctree to the index."""
self._titles[docname] = title
self._filenames[docname] = filename
@ -495,15 +533,22 @@ class IndexBuilder:
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'<style.*?</style>', '', node.astext(),
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(r'<script.*?</script>', '', nodetext,
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(
r'<style.*?</style>',
'',
node.astext(),
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(
r'<script.*?</script>',
'',
nodetext,
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
word_store.words.extend(split(nodetext))
return
elif (isinstance(node, nodes.meta)
and _is_meta_keywords(node, language)):
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
keywords = [keyword.strip() for keyword in node['content'].split(',')]
word_store.words.extend(keywords)
elif isinstance(node, nodes.Text):
@ -553,11 +598,16 @@ class IndexBuilder:
"""Returns JS code that will be inserted into language_data.js."""
if self.lang.js_stemmer_rawcode:
js_dir = path.join(package_dir, 'search', 'minified-js')
with open(path.join(js_dir, 'base-stemmer.js'), encoding='utf-8') as js_file:
with open(
path.join(js_dir, 'base-stemmer.js'), encoding='utf-8'
) as js_file:
base_js = js_file.read()
with open(path.join(js_dir, self.lang.js_stemmer_rawcode), encoding='utf-8') as js_file:
with open(
path.join(js_dir, self.lang.js_stemmer_rawcode), encoding='utf-8'
) as js_file:
language_js = js_file.read()
return ('%s\n%s\nStemmer = %sStemmer;' %
(base_js, language_js, self.lang.language_name))
return (
f'{base_js}\n{language_js}\nStemmer = {self.lang.language_name}Stemmer;'
)
else:
return self.lang.js_stemmer_code

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
danish_stopwords = parse_stop_word('''
danish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/danish/stop.txt
og | and
i | in
@ -104,7 +102,7 @@ været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
''')
""")
class SearchDanish(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
german_stopwords = parse_stop_word('''
german_stopwords = parse_stop_word("""
|source: https://snowball.tartarus.org/algorithms/german/stop.txt
aber | but
@ -287,7 +285,7 @@ zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
''')
""")
class SearchGerman(SearchLanguage):

View File

@ -2,13 +2,12 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage
english_stopwords = set("""
english_stopwords = set(
"""
a and are as at
be but by
for
@ -18,7 +17,8 @@ of on or
such
that the their then there these they this to
was will with
""".split())
""".split()
)
js_porter_stemmer = """
/**

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
spanish_stopwords = parse_stop_word('''
spanish_stopwords = parse_stop_word("""
|source: https://snowball.tartarus.org/algorithms/spanish/stop.txt
de | from, of
la | the, her
@ -347,7 +345,7 @@ tenida
tenidos
tenidas
tened
''')
""")
class SearchSpanish(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
finnish_stopwords = parse_stop_word('''
finnish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/finnish/stop.txt
| forms of BE
@ -97,7 +95,7 @@ kun | when
niin | so
nyt | now
itse | self
''')
""")
class SearchFinnish(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
french_stopwords = parse_stop_word('''
french_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/french/stop.txt
au | a + le
aux | a + les
@ -183,7 +181,7 @@ quelle | which
quelles | which
sans | without
soi | oneself
''')
""")
class SearchFrench(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
hungarian_stopwords = parse_stop_word('''
hungarian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/hungarian/stop.txt
| prepared by Anna Tordai
a
@ -210,7 +208,7 @@ vissza
vele
viszont
volna
''')
""")
class SearchHungarian(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
italian_stopwords = parse_stop_word('''
italian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/italian/stop.txt
ad | a (to) before vowel
al | a + il
@ -300,7 +298,7 @@ stessi
stesse
stessimo
stessero
''')
""")
class SearchItalian(SearchLanguage):

View File

@ -17,12 +17,14 @@ from typing import Any
try:
import MeCab # type: ignore[import-not-found]
native_module = True
except ImportError:
native_module = False
try:
import janome.tokenizer # type: ignore[import-not-found]
janome_module = True
except ImportError:
janome_module = False
@ -61,7 +63,8 @@ class MecabSplitter(BaseSplitter):
result = self.native.parse(input)
else:
result = self.ctypes_libmecab.mecab_sparse_tostr(
self.ctypes_mecab, input.encode(self.dict_encode))
self.ctypes_mecab, input.encode(self.dict_encode)
)
return result.split(' ')
def init_native(self, options: dict[str, str]) -> None:
@ -89,7 +92,8 @@ class MecabSplitter(BaseSplitter):
if os.path.exists(lib):
libpath = lib
if libpath is None:
raise RuntimeError('MeCab dynamic library is not available')
msg = 'MeCab dynamic library is not available'
raise RuntimeError(msg)
param = 'mecab -Owakati'
dict = options.get('dict')
@ -101,11 +105,15 @@ class MecabSplitter(BaseSplitter):
self.ctypes_libmecab = ctypes.CDLL(libpath)
self.ctypes_libmecab.mecab_new2.argtypes = (ctypes.c_char_p,)
self.ctypes_libmecab.mecab_new2.restype = ctypes.c_void_p
self.ctypes_libmecab.mecab_sparse_tostr.argtypes = (ctypes.c_void_p, ctypes.c_char_p)
self.ctypes_libmecab.mecab_sparse_tostr.argtypes = (
ctypes.c_void_p,
ctypes.c_char_p,
)
self.ctypes_libmecab.mecab_sparse_tostr.restype = ctypes.c_char_p
self.ctypes_mecab = self.ctypes_libmecab.mecab_new2(param.encode(fs_enc))
if self.ctypes_mecab is None:
raise SphinxError('mecab initialization failed')
msg = 'mecab initialization failed'
raise SphinxError(msg)
def __del__(self) -> None:
if self.ctypes_libmecab:
@ -121,8 +129,11 @@ class JanomeSplitter(BaseSplitter):
def init_tokenizer(self) -> None:
if not janome_module:
raise RuntimeError('Janome is not available')
self.tokenizer = janome.tokenizer.Tokenizer(udic=self.user_dict, udic_enc=self.user_dict_enc)
msg = 'Janome is not available'
raise RuntimeError(msg)
self.tokenizer = janome.tokenizer.Tokenizer(
udic=self.user_dict, udic_enc=self.user_dict_enc
)
def split(self, input: str) -> list[str]:
result = ' '.join(token.surface for token in self.tokenizer.tokenize(input))
@ -130,14 +141,18 @@ class JanomeSplitter(BaseSplitter):
class DefaultSplitter(BaseSplitter):
patterns_ = {re.compile(pattern): value for pattern, value in {
'[一二三四五六七八九十百千万億兆]': 'M',
'[一-龠々〆ヵヶ]': 'H',
'[ぁ-ん]': 'I',
'[ァ-ヴーｱ-ﾝﾞｰ]': 'K',
'[a-zA-Zａ-ｚＡ-Ｚ]': 'A',
'[0-9０-９]': 'N',
}.items()}
patterns_ = {
re.compile(pattern): value
for pattern, value in {
'[一二三四五六七八九十百千万億兆]': 'M',
'[一-龠々〆ヵヶ]': 'H',
'[ぁ-ん]': 'I',
'[ァ-ヴーｱ-ﾝﾞｰ]': 'K',
'[a-zA-Zａ-ｚＡ-Ｚ]': 'A',
'[0-9０-９]': 'N',
}.items()
}
# fmt: off
BIAS__ = -332
BC1__ = {'HH': 6, 'II': 2461, 'KH': 406, 'OH': -1378}
BC2__ = {'AA': -3267, 'AI': 2744, 'AN': -878, 'HH': -4070, 'HM': -1711,
@ -398,6 +413,7 @@ class DefaultSplitter(BaseSplitter):
'': 798, '': -960, '': 887, '': -695, '': 535, '': -697,
'': 753, '': -507, '': 974, '': -822, '': 1811, '': 463,
'': 1082, '': -270, '': 306, '': -673, '': -496}
# fmt: on
# ctype_
def ctype_(self, char: str) -> str:
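The def line above opens ctype_, whose body falls outside this hunk; presumably it walks patterns_ in order and falls back to a catch-all category. A sketch under that assumption:

# Sketch (assumption, body not shown in this hunk): classify a character
# by the first matching pattern in patterns_, defaulting to 'O' (other).
def _ctype_sketch(self, char: str) -> str:
    for pattern, value in self.patterns_.items():
        if pattern.match(char):
            return value
    return 'O'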
@ -427,18 +443,18 @@ class DefaultSplitter(BaseSplitter):
for i in range(4, len(seg) - 3):
score = self.BIAS__
w1 = seg[i-3]
w2 = seg[i-2]
w3 = seg[i-1]
w1 = seg[i - 3]
w2 = seg[i - 2]
w3 = seg[i - 1]
w4 = seg[i]
w5 = seg[i+1]
w6 = seg[i+2]
c1 = ctype[i-3]
c2 = ctype[i-2]
c3 = ctype[i-1]
w5 = seg[i + 1]
w6 = seg[i + 2]
c1 = ctype[i - 3]
c2 = ctype[i - 2]
c3 = ctype[i - 1]
c4 = ctype[i]
c5 = ctype[i+1]
c6 = ctype[i+2]
c5 = ctype[i + 1]
c6 = ctype[i + 2]
score += self.ts_(self.UP1__, p1)
score += self.ts_(self.UP2__, p2)
score += self.ts_(self.UP3__, p3)
@ -470,7 +486,7 @@ class DefaultSplitter(BaseSplitter):
score += self.ts_(self.TC2__, c2 + c3 + c4)
score += self.ts_(self.TC3__, c3 + c4 + c5)
score += self.ts_(self.TC4__, c4 + c5 + c6)
# score += self.ts_(self.TC5__, c4 + c5 + c6)
# score += self.ts_(self.TC5__, c4 + c5 + c6)
score += self.ts_(self.UQ1__, p1 + c1)
score += self.ts_(self.UQ2__, p2 + c2)
score += self.ts_(self.UQ1__, p3 + c3)
@ -501,6 +517,7 @@ class SearchJapanese(SearchLanguage):
Japanese search implementation: uses no stemmer, but word splitting is quite
complicated.
"""
lang = 'ja'
language_name = 'Japanese'

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
dutch_stopwords = parse_stop_word('''
dutch_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/dutch/stop.txt
de | the
en | and
@ -111,7 +109,7 @@ uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other
''')
""")
class SearchDutch(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
norwegian_stopwords = parse_stop_word('''
norwegian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/norwegian/stop.txt
og | and
i | in
@ -186,7 +184,7 @@ verte | become *
vort | become *
varte | became *
vart | became *
''')
""")
class SearchNorwegian(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
portuguese_stopwords = parse_stop_word('''
portuguese_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/portuguese/stop.txt
de | of, from
a | the; to, at; her
@ -245,7 +243,7 @@ terão
teria
teríamos
teriam
''')
""")
class SearchPortuguese(SearchLanguage):

View File

@ -2,8 +2,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, Set
import snowballstemmer
from sphinx.search import SearchLanguage

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
russian_stopwords = parse_stop_word('''
russian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/russian/stop.txt
и | and
в | in/into
@ -235,7 +233,7 @@ russian_stopwords = parse_stop_word('''
| можн
| нужн
| нельзя
''')
""")
class SearchRussian(SearchLanguage):

View File

@ -2,13 +2,11 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict
import snowballstemmer
from sphinx.search import SearchLanguage, parse_stop_word
swedish_stopwords = parse_stop_word('''
swedish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/swedish/stop.txt
och | and
det | it, this/that
@ -124,7 +122,7 @@ våra | our
ert | your
era | your
vilkas | whose
''')
""")
class SearchSwedish(SearchLanguage):

View File

@ -2,8 +2,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, Set
import snowballstemmer
from sphinx.search import SearchLanguage

View File

@ -11,11 +11,13 @@ from sphinx.search import SearchLanguage
try:
import jieba # type: ignore[import-not-found]
JIEBA = True
except ImportError:
JIEBA = False
english_stopwords = set("""
english_stopwords = set(
"""
a and are as at
be but by
for
@ -25,7 +27,8 @@ of on or
such
that the their then there these they this to
was will with
""".split())
""".split()
)
js_porter_stemmer = """
/**
@ -239,8 +242,7 @@ class SearchChinese(SearchLanguage):
if JIEBA:
chinese = list(jieba.cut_for_search(input))
latin1 = \
[term.strip() for term in self.latin1_letters.findall(input)]
latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
self.latin_terms.extend(latin1)
return chinese + latin1
@ -252,9 +254,9 @@ class SearchChinese(SearchLanguage):
# if not stemmed, but would be too short after being stemmed
# avoids some issues with acronyms
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stemWord(word.lower())) < 3
word in self.latin_terms
and len(word) >= 3
and len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
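The condition above protects short Latin terms, typically acronyms, that the stemmer would shrink below three characters. An illustration with a stand-in stemmer, since the exact Snowball output is not shown here:

# Illustration with a stand-in stemmer (not snowballstemmer): a three-letter
# acronym that would stem to fewer than three characters is indexed
# lowercased but unstemmed.
class _FakeStemmer:
    def stemWord(self, word: str) -> str:
        return word.rstrip('s')  # crude stand-in for the real algorithm

latin_terms = ['IDs']
stemmer = _FakeStemmer()
word = 'IDs'
should_not_be_stemmed = (
    word in latin_terms
    and len(word) >= 3
    and len(stemmer.stemWord(word.lower())) < 3
)
print(word.lower() if should_not_be_stemmed else stemmer.stemWord(word.lower()))  # 'ids'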