Mirror of https://github.com/sphinx-doc/sphinx.git (synced 2025-02-25 18:55:22 -06:00)

Enable automatic formatting for `sphinx/search/` (#12967)

parent e1c5f6d314
commit 81c6f1e8cc
.ruff.toml

@@ -7,7 +7,7 @@ extend-exclude = [
"tests/js/roots/*",
"build/*",
"doc/_build/*",
"sphinx/search/*",
# "sphinx/search/*",
"doc/usage/extensions/example*.py",
]

@@ -411,6 +411,8 @@ select = [
"sphinx/ext/autodoc/importer.py" = ["D402"]
"sphinx/util/requests.py" = ["D402"]

"sphinx/search/*" = ["E501"]

"tests/*" = [
"E501",
"ANN", # tests don't need annotations
@@ -475,7 +477,6 @@ exclude = [
"sphinx/ext/todo.py",
"sphinx/ext/viewcode.py",
"sphinx/registry.py",
"sphinx/search/*",
"sphinx/testing/*",
"sphinx/transforms/*",
"sphinx/writers/*",

sphinx/search/__init__.py

@@ -1,4 +1,5 @@
"""Create a full-text search index for offline search."""

from __future__ import annotations

import dataclasses
@@ -15,12 +16,13 @@ from docutils import nodes
from docutils.nodes import Element, Node

from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment
from sphinx.util.index_entries import split_index_msg

if TYPE_CHECKING:
from collections.abc import Iterable

from sphinx.environment import BuildEnvironment


class SearchLanguage:
"""
@@ -52,10 +54,11 @@ class SearchLanguage:
This class is used to preprocess search word which Sphinx HTML readers
type, before searching index. Default implementation does nothing.
"""

lang: str = ''
language_name: str = ''
stopwords: set[str] = set()
js_splitter_code: str = ""
js_splitter_code: str = ''
js_stemmer_rawcode: str = ''
js_stemmer_code = """
/**
@@ -105,16 +108,14 @@ var Stemmer = function() {
Return true if the target word should be registered in the search index.
This method is called after stemming.
"""
return (
len(word) == 0 or not (
((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
(ord(word[0]) < 256 and (
word in self.stopwords
))))
return len(word) == 0 or not (
((len(word) < 3) and (12353 < ord(word[0]) < 12436))
or (ord(word[0]) < 256 and (word in self.stopwords))
)
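A note on the reformatted return expression above: behaviour is unchanged. The predicate can be restated as a small standalone function; keep_word and its parameters are illustrative names, not part of Sphinx:

    def keep_word(word: str, stopwords: set[str]) -> bool:
        # Mirrors SearchLanguage.word_filter's default predicate.
        if len(word) == 0:
            return True
        is_short_hiragana = len(word) < 3 and 12353 < ord(word[0]) < 12436
        is_latin_stopword = ord(word[0]) < 256 and word in stopwords
        return not (is_short_hiragana or is_latin_stopword)

    assert keep_word('python', set())
    assert not keep_word('the', {'the'})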


# SearchEnglish imported after SearchLanguage is defined due to circular import
from sphinx.search.en import SearchEnglish
from sphinx.search.en import SearchEnglish  # NoQA: E402


def parse_stop_word(source: str) -> set[str]:
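Only the signature of parse_stop_word is visible in this hunk. As context for the Snowball stop-word lists reformatted further down (everything after '|' on a line is a comment), a sketch of what the function does; the body here is a paraphrase under that assumption, not necessarily the exact implementation:

    def parse_stop_word(source: str) -> set[str]:
        result: set[str] = set()
        for line in source.splitlines():
            result.update(line.split('|')[0].split())  # drop the trailing comment
        return result

    assert parse_stop_word('og | and\ni | in\n') == {'og', 'i'}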
@@ -165,10 +166,10 @@ class _JavaScriptIndex:
return self.PREFIX + json.dumps(data, sort_keys=True) + self.SUFFIX

def loads(self, s: str) -> Any:
data = s[len(self.PREFIX):-len(self.SUFFIX)]
if not data or not s.startswith(self.PREFIX) or not \
s.endswith(self.SUFFIX):
raise ValueError('invalid data')
data = s[len(self.PREFIX) : -len(self.SUFFIX)]
if not data or not s.startswith(self.PREFIX) or not s.endswith(self.SUFFIX):
msg = 'invalid data'
raise ValueError(msg)
return json.loads(data)
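The loads()/dumps() pair frames the JSON payload with the class-level PREFIX and SUFFIX constants, which are outside this hunk. A minimal round-trip sketch, assuming the 'Search.setIndex(' / ')' framing used for searchindex.js:

    import json

    PREFIX, SUFFIX = 'Search.setIndex(', ')'   # assumed values for this sketch

    def dumps(data):
        return PREFIX + json.dumps(data, sort_keys=True) + SUFFIX

    def loads(s):
        data = s[len(PREFIX) : -len(SUFFIX)]
        if not data or not s.startswith(PREFIX) or not s.endswith(SUFFIX):
            msg = 'invalid data'
            raise ValueError(msg)
        return json.loads(data)

    assert loads(dumps({'docnames': ['index']})) == {'docnames': ['index']}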

def dump(self, data: Any, f: IO[str]) -> None:
@@ -187,9 +188,8 @@ def _is_meta_keywords(
) -> bool:
if node.get('name') == 'keywords':
meta_lang = node.get('lang')
if meta_lang is None:  # lang not specified
return True
elif meta_lang == lang:  # matched to html_search_language
if meta_lang is None or meta_lang == lang:
# lang not specified or matched to html_search_language
return True

return False
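The if/elif pair collapses into a single boolean test; an illustrative check of the equivalence:

    def matches(meta_lang, lang):
        return meta_lang is None or meta_lang == lang

    assert matches(None, 'en')       # lang not specified
    assert matches('en', 'en')       # matched to html_search_language
    assert not matches('de', 'en')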
@@ -222,8 +222,18 @@ class WordCollector(nodes.NodeVisitor):
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'<style.*?</style>', '', node.astext(), flags=re.IGNORECASE|re.DOTALL)
nodetext = re.sub(r'<script.*?</script>', '', nodetext, flags=re.IGNORECASE|re.DOTALL)
nodetext = re.sub(
r'<style.*?</style>',
'',
node.astext(),
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(
r'<script.*?</script>',
'',
nodetext,
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
self.found_words.extend(self.lang.split(nodetext))
raise nodes.SkipNode
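The three substitutions above, applied in order, behave like this on a raw-HTML snippet (a standalone sketch of the same regexes):

    import re

    raw = '<style>p {color: red}</style><script>alert(1)</script><p>indexed <b>text</b></p>'
    text = re.sub(r'<style.*?</style>', '', raw, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<script.*?</script>', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<[^<]+?>', '', text)
    assert text == 'indexed text'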
@@ -245,12 +255,15 @@ class IndexBuilder:
Helper class that creates a search index based on the doctrees
passed to the `feed` method.
"""

formats = {
'json': json,
'pickle': pickle
'json': json,
'pickle': pickle,
}
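Both entries of the formats table expose dump()/load() on a stream, which is how IndexBuilder.dump() and load() pick a serializer by key. A small sketch (json only; the pickle entry would need a binary stream):

    import io
    import json
    import pickle

    formats = {'json': json, 'pickle': pickle}

    buffer = io.StringIO()
    formats['json'].dump({'envversion': 1}, buffer)
    buffer.seek(0)
    assert formats['json'].load(buffer)['envversion'] == 1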

def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
def __init__(
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
) -> None:
self.env = env
# docname -> title
self._titles: dict[str, str | None] = env._search_index_titles
@@ -261,9 +274,13 @@ class IndexBuilder:
# stemmed words in titles -> set(docname)
self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
# docname -> all titles in document
self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles
self._all_titles: dict[str, list[tuple[str, str | None]]] = (
env._search_index_all_titles
)
# docname -> list(index entry)
self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
self._index_entries: dict[str, list[tuple[str, str, str]]] = (
env._search_index_index_entries
)
# objtype -> index
self._objtypes: dict[tuple[str, str], int] = env._search_index_objtypes
# objtype index -> (domain, type, objname (localized))
@@ -290,7 +307,7 @@ class IndexBuilder:
self.js_scorer_code = fp.read().decode()
else:
self.js_scorer_code = ''
self.js_splitter_code = ""
self.js_splitter_code = ''

def load(self, stream: IO, format: Any) -> None:
"""Reconstruct from frozen data."""
@@ -298,15 +315,15 @@ class IndexBuilder:
format = self.formats[format]
frozen = format.load(stream)
# if an old index is present, we treat it as not existing.
if not isinstance(frozen, dict) or \
frozen.get('envversion') != self.env.version:
raise ValueError('old format')
if not isinstance(frozen, dict) or frozen.get('envversion') != self.env.version:
msg = 'old format'
raise ValueError(msg)
index2fn = frozen['docnames']
self._filenames = dict(zip(index2fn, frozen['filenames']))
self._titles = dict(zip(index2fn, frozen['titles']))
self._filenames = dict(zip(index2fn, frozen['filenames'], strict=True))
self._titles = dict(zip(index2fn, frozen['titles'], strict=True))
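The newly added strict=True (Python 3.10+) makes zip() raise instead of silently truncating when the frozen lists have drifted out of sync, for example:

    docnames = ('index', 'usage')
    filenames = ('index.rst', 'usage.rst')

    dict(zip(docnames, filenames, strict=True))      # fine: lengths match
    try:
        dict(zip(docnames, filenames[:1], strict=True))
    except ValueError:
        pass                                         # lengths differ -> error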
self._all_titles = {}

for docname in self._titles.keys():
for docname in self._titles:
self._all_titles[docname] = []
for title, doc_tuples in frozen['alltitles'].items():
for doc, titleid in doc_tuples:
@@ -331,8 +348,9 @@ class IndexBuilder:
format = self.formats[format]
format.dump(self.freeze(), stream)

def get_objects(self, fn2index: dict[str, int]
) -> dict[str, list[tuple[int, int, int, str, str]]]:
def get_objects(
self, fn2index: dict[str, int]
) -> dict[str, list[tuple[int, int, int, str, str]]]:
rv: dict[str, list[tuple[int, int, int, str, str]]] = {}
otypes = self._objtypes
onames = self._objnames
@@ -355,8 +373,11 @@ class IndexBuilder:
otype = domain.object_types.get(type)
if otype:
# use str() to fire translation proxies
onames[typeindex] = (domain.name, type,
str(domain.get_type_name(otype)))
onames[typeindex] = (
domain.name,
type,
str(domain.get_type_name(otype)),
)
else:
onames[typeindex] = (domain.name, type, type)
if anchor == fullname:
@@ -368,7 +389,9 @@ class IndexBuilder:
plist.append((fn2index[docname], typeindex, prio, shortanchor, name))
return rv

def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
def get_terms(
self, fn2index: dict[str, int]
) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
"""
Return a mapping of document and title terms to their corresponding sorted document IDs.

@@ -377,10 +400,10 @@ class IndexBuilder:
of integers.
"""
rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {})
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping), strict=True):
for k, v in mapping.items():
if len(v) == 1:
fn, = v
(fn,) = v
if fn in fn2index:
rv[k] = fn2index[fn]
else:
@@ -389,7 +412,7 @@ class IndexBuilder:

def freeze(self) -> dict[str, Any]:
"""Create a usable data structure for serializing."""
docnames, titles = zip(*sorted(self._titles.items()))
docnames, titles = zip(*sorted(self._titles.items()), strict=True)
filenames = [self._filenames.get(docname) for docname in docnames]
fn2index = {f: i for (i, f) in enumerate(docnames)}
terms, title_terms = self.get_terms(fn2index)
@@ -406,15 +429,28 @@ class IndexBuilder:
index_entries: dict[str, list[tuple[int, str, bool]]] = {}
for docname, entries in self._index_entries.items():
for entry, entry_id, main_entry in entries:
index_entries.setdefault(entry.lower(), []).append((fn2index[docname], entry_id, main_entry == "main"))
index_entries.setdefault(entry.lower(), []).append((
fn2index[docname],
entry_id,
main_entry == 'main',
))

return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version,
alltitles=alltitles, indexentries=index_entries)
return {
'docnames': docnames,
'filenames': filenames,
'titles': titles,
'terms': terms,
'objects': objects,
'objtypes': objtypes,
'objnames': objnames,
'titleterms': title_terms,
'envversion': self.env.version,
'alltitles': alltitles,
'indexentries': index_entries,
}
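For orientation, a toy example (all values invented) of the shape this dict literal returns; _JavaScriptIndex.dumps() then wraps it as Search.setIndex({...}) when writing searchindex.js:

    frozen = {
        'docnames': ('index',),
        'filenames': ['index.rst'],
        'titles': ('Welcome',),
        'terms': {'welcom': 0},
        'objects': {},
        'objtypes': {},
        'objnames': {},
        'titleterms': {'welcom': 0},
        'envversion': 1,            # placeholder; the real value is env.version
        'alltitles': {'Welcome': [(0, None)]},
        'indexentries': {},
    }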

def label(self) -> str:
return f"{self.lang.language_name} (code: {self.lang.lang})"
return f'{self.lang.language_name} (code: {self.lang.lang})'

def prune(self, docnames: Iterable[str]) -> None:
"""Remove data for all docnames not in the list."""
@@ -434,7 +470,9 @@ class IndexBuilder:
for wordnames in self._title_mapping.values():
wordnames.intersection_update(docnames)

def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
def feed(
self, docname: str, filename: str, title: str, doctree: nodes.document
) -> None:
"""Feed a doctree to the index."""
self._titles[docname] = title
self._filenames[docname] = filename
@@ -495,15 +533,22 @@ class IndexBuilder:
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'<style.*?</style>', '', node.astext(),
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(r'<script.*?</script>', '', nodetext,
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(
r'<style.*?</style>',
'',
node.astext(),
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(
r'<script.*?</script>',
'',
nodetext,
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
word_store.words.extend(split(nodetext))
return
elif (isinstance(node, nodes.meta)
and _is_meta_keywords(node, language)):
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
keywords = [keyword.strip() for keyword in node['content'].split(',')]
word_store.words.extend(keywords)
elif isinstance(node, nodes.Text):
@@ -553,11 +598,16 @@ class IndexBuilder:
"""Returns JS code that will be inserted into language_data.js."""
if self.lang.js_stemmer_rawcode:
js_dir = path.join(package_dir, 'search', 'minified-js')
with open(path.join(js_dir, 'base-stemmer.js'), encoding='utf-8') as js_file:
with open(
path.join(js_dir, 'base-stemmer.js'), encoding='utf-8'
) as js_file:
base_js = js_file.read()
with open(path.join(js_dir, self.lang.js_stemmer_rawcode), encoding='utf-8') as js_file:
with open(
path.join(js_dir, self.lang.js_stemmer_rawcode), encoding='utf-8'
) as js_file:
language_js = js_file.read()
return ('%s\n%s\nStemmer = %sStemmer;' %
(base_js, language_js, self.lang.language_name))
return (
f'{base_js}\n{language_js}\nStemmer = {self.lang.language_name}Stemmer;'
)
else:
return self.lang.js_stemmer_code

sphinx/search/da.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

danish_stopwords = parse_stop_word('''
danish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/danish/stop.txt
og | and
i | in
@@ -104,7 +102,7 @@ været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
''')
""")


class SearchDanish(SearchLanguage):

sphinx/search/de.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

german_stopwords = parse_stop_word('''
german_stopwords = parse_stop_word("""
|source: https://snowball.tartarus.org/algorithms/german/stop.txt
aber | but

@@ -287,7 +285,7 @@ zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
''')
""")


class SearchGerman(SearchLanguage):

sphinx/search/en.py

@@ -2,13 +2,12 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage

english_stopwords = set("""
english_stopwords = set(
"""
a and are as at
be but by
for
@@ -18,7 +17,8 @@ of on or
such
that the their then there these they this to
was will with
""".split())
""".split()
)

js_porter_stemmer = """
/**

sphinx/search/es.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

spanish_stopwords = parse_stop_word('''
spanish_stopwords = parse_stop_word("""
|source: https://snowball.tartarus.org/algorithms/spanish/stop.txt
de | from, of
la | the, her
@@ -347,7 +345,7 @@ tenida
tenidos
tenidas
tened
''')
""")


class SearchSpanish(SearchLanguage):

sphinx/search/fi.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

finnish_stopwords = parse_stop_word('''
finnish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/finnish/stop.txt
| forms of BE

@@ -97,7 +95,7 @@ kun | when
niin | so
nyt | now
itse | self
''')
""")


class SearchFinnish(SearchLanguage):

sphinx/search/fr.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

french_stopwords = parse_stop_word('''
french_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/french/stop.txt
au | a + le
aux | a + les
@@ -183,7 +181,7 @@ quelle | which
quelles | which
sans | without
soi | oneself
''')
""")


class SearchFrench(SearchLanguage):

sphinx/search/hu.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

hungarian_stopwords = parse_stop_word('''
hungarian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/hungarian/stop.txt
| prepared by Anna Tordai
a
@@ -210,7 +208,7 @@ vissza
vele
viszont
volna
''')
""")


class SearchHungarian(SearchLanguage):

sphinx/search/it.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

italian_stopwords = parse_stop_word('''
italian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/italian/stop.txt
ad | a (to) before vowel
al | a + il
@@ -300,7 +298,7 @@ stessi
stesse
stessimo
stessero
''')
""")


class SearchItalian(SearchLanguage):

sphinx/search/ja.py

@@ -17,12 +17,14 @@ from typing import Any

try:
import MeCab  # type: ignore[import-not-found]

native_module = True
except ImportError:
native_module = False

try:
import janome.tokenizer  # type: ignore[import-not-found]

janome_module = True
except ImportError:
janome_module = False
@@ -61,7 +63,8 @@ class MecabSplitter(BaseSplitter):
result = self.native.parse(input)
else:
result = self.ctypes_libmecab.mecab_sparse_tostr(
self.ctypes_mecab, input.encode(self.dict_encode))
self.ctypes_mecab, input.encode(self.dict_encode)
)
return result.split(' ')

def init_native(self, options: dict[str, str]) -> None:
@@ -89,7 +92,8 @@ class MecabSplitter(BaseSplitter):
if os.path.exists(lib):
libpath = lib
if libpath is None:
raise RuntimeError('MeCab dynamic library is not available')
msg = 'MeCab dynamic library is not available'
raise RuntimeError(msg)

param = 'mecab -Owakati'
dict = options.get('dict')
@@ -101,11 +105,15 @@ class MecabSplitter(BaseSplitter):
self.ctypes_libmecab = ctypes.CDLL(libpath)
self.ctypes_libmecab.mecab_new2.argtypes = (ctypes.c_char_p,)
self.ctypes_libmecab.mecab_new2.restype = ctypes.c_void_p
self.ctypes_libmecab.mecab_sparse_tostr.argtypes = (ctypes.c_void_p, ctypes.c_char_p)
self.ctypes_libmecab.mecab_sparse_tostr.argtypes = (
ctypes.c_void_p,
ctypes.c_char_p,
)
self.ctypes_libmecab.mecab_sparse_tostr.restype = ctypes.c_char_p
self.ctypes_mecab = self.ctypes_libmecab.mecab_new2(param.encode(fs_enc))
if self.ctypes_mecab is None:
raise SphinxError('mecab initialization failed')
msg = 'mecab initialization failed'
raise SphinxError(msg)

def __del__(self) -> None:
if self.ctypes_libmecab:
@@ -121,8 +129,11 @@ class JanomeSplitter(BaseSplitter):

def init_tokenizer(self) -> None:
if not janome_module:
raise RuntimeError('Janome is not available')
self.tokenizer = janome.tokenizer.Tokenizer(udic=self.user_dict, udic_enc=self.user_dict_enc)
msg = 'Janome is not available'
raise RuntimeError(msg)
self.tokenizer = janome.tokenizer.Tokenizer(
udic=self.user_dict, udic_enc=self.user_dict_enc
)

def split(self, input: str) -> list[str]:
result = ' '.join(token.surface for token in self.tokenizer.tokenize(input))
@@ -130,14 +141,18 @@ class JanomeSplitter(BaseSplitter):


class DefaultSplitter(BaseSplitter):
patterns_ = {re.compile(pattern): value for pattern, value in {
'[一二三四五六七八九十百千万億兆]': 'M',
'[一-龠々〆ヵヶ]': 'H',
'[ぁ-ん]': 'I',
'[ァ-ヴーｱ-ﾝﾞｰ]': 'K',
'[a-zA-Zａ-ｚＡ-Ｚ]': 'A',
'[0-9０-９]': 'N',
}.items()}
patterns_ = {
re.compile(pattern): value
for pattern, value in {
'[一二三四五六七八九十百千万億兆]': 'M',
'[一-龠々〆ヵヶ]': 'H',
'[ぁ-ん]': 'I',
'[ァ-ヴーｱ-ﾝﾞｰ]': 'K',
'[a-zA-Zａ-ｚＡ-Ｚ]': 'A',
'[0-9０-９]': 'N',
}.items()
}
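The patterns_ table maps character-class regexes to the single-letter type codes used by the TinySegmenter-style scoring tables below. A cut-down sketch of how a character gets classified; the 'O' fallback is an assumption about ctype_()'s default:

    import re

    patterns = {
        re.compile('[一二三四五六七八九十百千万億兆]'): 'M',   # kanji numerals
        re.compile('[ぁ-ん]'): 'I',                               # hiragana
    }

    def ctype(char: str) -> str:
        for pattern, value in patterns.items():
            if pattern.match(char):
                return value
        return 'O'   # assumed "other" fallback

    assert ctype('三') == 'M'
    assert ctype('の') == 'I'
    assert ctype('x') == 'O'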
# fmt: off
BIAS__ = -332
BC1__ = {'HH': 6, 'II': 2461, 'KH': 406, 'OH': -1378}
BC2__ = {'AA': -3267, 'AI': 2744, 'AN': -878, 'HH': -4070, 'HM': -1711,
@@ -398,6 +413,7 @@ class DefaultSplitter(BaseSplitter):
'委': 798, '学': -960, '市': 887, '広': -695, '後': 535, '業': -697,
'相': 753, '社': -507, '福': 974, '空': -822, '者': 1811, '連': 463,
'郎': 1082, '1': -270, 'E1': 306, 'ル': -673, 'ン': -496}
# fmt: on

# ctype_
def ctype_(self, char: str) -> str:
@@ -427,18 +443,18 @@ class DefaultSplitter(BaseSplitter):

for i in range(4, len(seg) - 3):
score = self.BIAS__
w1 = seg[i-3]
w2 = seg[i-2]
w3 = seg[i-1]
w1 = seg[i - 3]
w2 = seg[i - 2]
w3 = seg[i - 1]
w4 = seg[i]
w5 = seg[i+1]
w6 = seg[i+2]
c1 = ctype[i-3]
c2 = ctype[i-2]
c3 = ctype[i-1]
w5 = seg[i + 1]
w6 = seg[i + 2]
c1 = ctype[i - 3]
c2 = ctype[i - 2]
c3 = ctype[i - 1]
c4 = ctype[i]
c5 = ctype[i+1]
c6 = ctype[i+2]
c5 = ctype[i + 1]
c6 = ctype[i + 2]
score += self.ts_(self.UP1__, p1)
score += self.ts_(self.UP2__, p2)
score += self.ts_(self.UP3__, p3)
@@ -470,7 +486,7 @@ class DefaultSplitter(BaseSplitter):
score += self.ts_(self.TC2__, c2 + c3 + c4)
score += self.ts_(self.TC3__, c3 + c4 + c5)
score += self.ts_(self.TC4__, c4 + c5 + c6)
# score += self.ts_(self.TC5__, c4 + c5 + c6)
# score += self.ts_(self.TC5__, c4 + c5 + c6)
score += self.ts_(self.UQ1__, p1 + c1)
score += self.ts_(self.UQ2__, p2 + c2)
score += self.ts_(self.UQ1__, p3 + c3)
@@ -501,6 +517,7 @@ class SearchJapanese(SearchLanguage):
Japanese search implementation: uses no stemmer, but word splitting is quite
complicated.
"""

lang = 'ja'
language_name = 'Japanese'

sphinx/search/nl.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

dutch_stopwords = parse_stop_word('''
dutch_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/dutch/stop.txt
de | the
en | and
@@ -111,7 +109,7 @@ uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other
''')
""")


class SearchDutch(SearchLanguage):

sphinx/search/no.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

norwegian_stopwords = parse_stop_word('''
norwegian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/norwegian/stop.txt
og | and
i | in
@@ -186,7 +184,7 @@ verte | become *
vort | become *
varte | became *
vart | became *
''')
""")


class SearchNorwegian(SearchLanguage):

sphinx/search/pt.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

portuguese_stopwords = parse_stop_word('''
portuguese_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/portuguese/stop.txt
de | of, from
a | the; to, at; her
@@ -245,7 +243,7 @@ terão
teria
teríamos
teriam
''')
""")


class SearchPortuguese(SearchLanguage):

sphinx/search/ro.py

@@ -2,8 +2,6 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict, Set

import snowballstemmer

from sphinx.search import SearchLanguage

sphinx/search/ru.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

russian_stopwords = parse_stop_word('''
russian_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/russian/stop.txt
и | and
в | in/into
@@ -235,7 +233,7 @@ russian_stopwords = parse_stop_word('''
| можн
| нужн
| нельзя
''')
""")


class SearchRussian(SearchLanguage):

sphinx/search/sv.py

@@ -2,13 +2,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict

import snowballstemmer

from sphinx.search import SearchLanguage, parse_stop_word

swedish_stopwords = parse_stop_word('''
swedish_stopwords = parse_stop_word("""
| source: https://snowball.tartarus.org/algorithms/swedish/stop.txt
och | and
det | it, this/that
@@ -124,7 +122,7 @@ våra | our
ert | your
era | your
vilkas | whose
''')
""")


class SearchSwedish(SearchLanguage):

sphinx/search/tr.py

@@ -2,8 +2,6 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Dict, Set

import snowballstemmer

from sphinx.search import SearchLanguage

sphinx/search/zh.py

@@ -11,11 +11,13 @@ from sphinx.search import SearchLanguage

try:
import jieba  # type: ignore[import-not-found]

JIEBA = True
except ImportError:
JIEBA = False

english_stopwords = set("""
english_stopwords = set(
"""
a and are as at
be but by
for
@@ -25,7 +27,8 @@ of on or
such
that the their then there these they this to
was will with
""".split())
""".split()
)

js_porter_stemmer = """
/**
@@ -239,8 +242,7 @@ class SearchChinese(SearchLanguage):
if JIEBA:
chinese = list(jieba.cut_for_search(input))

latin1 = \
[term.strip() for term in self.latin1_letters.findall(input)]
latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
self.latin_terms.extend(latin1)
return chinese + latin1

@@ -252,9 +254,9 @@ class SearchChinese(SearchLanguage):
# if not stemmed, but would be too short after being stemmed
# avoids some issues with acronyms
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stemWord(word.lower())) < 3
word in self.latin_terms
and len(word) >= 3
and len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
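The reassociated boolean above guards Latin acronyms against over-stemming. A standalone sketch of the same check, using snowballstemmer directly and assuming latin_terms already contains the word:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    latin_terms = ['AWS']          # populated by split() in the real class

    def keep_verbatim(word: str) -> bool:
        # Mirrors the guard above: a known Latin term whose stem would drop
        # below three characters is returned lower-cased instead of stemmed.
        return (
            word in latin_terms
            and len(word) >= 3
            and len(stemmer.stemWord(word.lower())) < 3
        )

    print(keep_verbatim('AWS'))    # True if 'aws' stems below three characters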