Mirror of https://github.com/sphinx-doc/sphinx.git (synced 2025-02-25 18:55:22 -06:00)
Migrate to py3 style type annotation: sphinx.search
Commit 4ec7fdf24b (parent 28ebe127a8)
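The change is mechanical throughout the package: Python 2 style `# type:` comments are folded into the signatures as PEP 484 annotations, and typing imports move out of `if False:` guards into regular imports. A minimal sketch of the pattern with illustrative names (not taken from the diff):

    from typing import List

    # Before: types lived in a comment that only external checkers read
    # def split(input):
    #     # type: (str) -> List[str]
    #     return input.split()

    # After: the same types live in the signature itself
    def split(input: str) -> List[str]:
        return input.split()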
sphinx/search/__init__.py
@@ -13,21 +13,21 @@ import re
 import warnings
 from importlib import import_module
 from os import path
+from typing import Any, Dict, IO, Iterable, List, Tuple, Set
 
 from docutils import nodes
+from docutils.nodes import Node
 
 from sphinx import addnodes
 from sphinx import package_dir
 from sphinx.deprecation import RemovedInSphinx40Warning
+from sphinx.environment import BuildEnvironment
 from sphinx.search.jssplitter import splitter_code
 from sphinx.util import jsdump, rpartition
 
 if False:
     # For type annotation
-    from typing import Any, Dict, IO, Iterable, List, Tuple, Set  # NOQA
     from typing import Type  # for python3.5.1
-    from docutils import nodes  # NOQA
-    from sphinx.environment import BuildEnvironment  # NOQA
 
 
 class SearchLanguage:
@@ -69,19 +69,16 @@ var Stemmer = function() {
 
     _word_re = re.compile(r'(?u)\w+')
 
-    def __init__(self, options):
-        # type: (Dict) -> None
+    def __init__(self, options: Dict) -> None:
         self.options = options
         self.init(options)
 
-    def init(self, options):
-        # type: (Dict) -> None
+    def init(self, options: Dict) -> None:
         """
         Initialize the class with the options the user has given.
         """
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         """
         This method splits a sentence into words.  Default splitter splits input
         at white spaces, which should be enough for most languages except CJK
@@ -89,8 +86,7 @@ var Stemmer = function() {
         """
         return self._word_re.findall(input)
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         """
         This method implements stemming algorithm of the Python version.
 
@@ -103,8 +99,7 @@ var Stemmer = function() {
         """
         return word
 
-    def word_filter(self, word):
-        # type: (str) -> bool
+    def word_filter(self, word: str) -> bool:
         """
         Return true if the target word should be registered in the search index.
         This method is called after stemming.
@@ -121,8 +116,7 @@ var Stemmer = function() {
 from sphinx.search.en import SearchEnglish
 
 
-def parse_stop_word(source):
-    # type: (str) -> Set[str]
+def parse_stop_word(source: str) -> Set[str]:
     """
     parse snowball style word list like this:
 
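For context: `parse_stop_word` consumes Snowball-style stop word lists, where `|` starts a comment and every remaining token on a line is a stop word. A rough sketch of the equivalent logic, under that assumption (the authoritative version lives in this file):

    from typing import Set

    def parse_stop_word(source: str) -> Set[str]:
        result = set()  # type: Set[str]
        for line in source.splitlines():
            # drop everything after the first '|' (Snowball comments),
            # then collect the remaining whitespace-separated tokens
            result.update(line.split('|')[0].split())
        return result

    assert parse_stop_word('og | and\ni  | in') == {'og', 'i'}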
@@ -166,24 +160,20 @@ class _JavaScriptIndex:
     PREFIX = 'Search.setIndex('
     SUFFIX = ')'
 
-    def dumps(self, data):
-        # type: (Any) -> str
+    def dumps(self, data: Any) -> str:
         return self.PREFIX + jsdump.dumps(data) + self.SUFFIX
 
-    def loads(self, s):
-        # type: (str) -> Any
+    def loads(self, s: str) -> Any:
         data = s[len(self.PREFIX):-len(self.SUFFIX)]
         if not data or not s.startswith(self.PREFIX) or not \
                s.endswith(self.SUFFIX):
             raise ValueError('invalid data')
         return jsdump.loads(data)
 
-    def dump(self, data, f):
-        # type: (Any, IO) -> None
+    def dump(self, data: Any, f: IO) -> None:
         f.write(self.dumps(data))
 
-    def load(self, f):
-        # type: (IO) -> Any
+    def load(self, f: IO) -> Any:
         return self.loads(f.read())
 
 
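The wrapper above only brackets a `jsdump` payload in a `Search.setIndex(...)` call so the browser can evaluate it; `loads()` validates the prefix/suffix and strips them again. A hypothetical round trip, assuming the module-level `js_index` instance this module exposes alongside the class:

    from sphinx.search import js_index

    payload = {'docnames': ['index'], 'titles': ['Index']}
    text = js_index.dumps(payload)   # "Search.setIndex({...})"
    assert js_index.loads(text) == payload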
@@ -195,15 +185,13 @@ class WordCollector(nodes.NodeVisitor):
     A special visitor that collects words for the `IndexBuilder`.
     """
 
-    def __init__(self, document, lang):
-        # type: (nodes.document, SearchLanguage) -> None
+    def __init__(self, document: nodes.document, lang: SearchLanguage) -> None:
         super().__init__(document)
         self.found_words = []  # type: List[str]
         self.found_title_words = []  # type: List[str]
         self.lang = lang
 
-    def is_meta_keywords(self, node, nodetype=None):
-        # type: (addnodes.meta, Any) -> bool
+    def is_meta_keywords(self, node: addnodes.meta, nodetype: Any = None) -> bool:
         if nodetype is not None:
             warnings.warn('"nodetype" argument for WordCollector.is_meta_keywords() '
                           'is deprecated.', RemovedInSphinx40Warning)
@@ -217,8 +205,7 @@ class WordCollector(nodes.NodeVisitor):
 
         return False
 
-    def dispatch_visit(self, node):
-        # type: (nodes.Node) -> None
+    def dispatch_visit(self, node: Node) -> None:
         if isinstance(node, nodes.comment):
             raise nodes.SkipNode
         elif isinstance(node, nodes.raw):
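`dispatch_visit` replaces the usual per-node-type visit methods with a single `isinstance` chain, so comments are skipped wholesale and raw nodes are handled specially. Usage follows `IndexBuilder.feed()` further down: build the visitor from a parsed doctree and walk it. A hedged sketch, assuming Sphinx and docutils are importable:

    from docutils.core import publish_doctree
    from sphinx.search import SearchEnglish, WordCollector

    # parse a tiny document, then let the visitor gather its words
    doctree = publish_doctree('Hello search world.')
    visitor = WordCollector(doctree, SearchEnglish({}))
    doctree.walk(visitor)
    print(visitor.found_words, visitor.found_title_words)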
@@ -251,8 +238,7 @@ class IndexBuilder:
         'pickle': pickle
     }
 
-    def __init__(self, env, lang, options, scoring):
-        # type: (BuildEnvironment, str, Dict, str) -> None
+    def __init__(self, env: BuildEnvironment, lang: str, options: Dict, scoring: str) -> None:
         self.env = env
         self._titles = {}  # type: Dict[str, str]
                            # docname -> title
@@ -292,8 +278,7 @@
         self.js_scorer_code = ''
         self.js_splitter_code = splitter_code
 
-    def load(self, stream, format):
-        # type: (IO, Any) -> None
+    def load(self, stream: IO, format: Any) -> None:
         """Reconstruct from frozen data."""
         if isinstance(format, str):
             format = self.formats[format]
@@ -306,8 +291,7 @@
         self._filenames = dict(zip(index2fn, frozen['filenames']))
         self._titles = dict(zip(index2fn, frozen['titles']))
 
-        def load_terms(mapping):
-            # type: (Dict[str, Any]) -> Dict[str, Set[str]]
+        def load_terms(mapping: Dict[str, Any]) -> Dict[str, Set[str]]:
             rv = {}
             for k, v in mapping.items():
                 if isinstance(v, int):
@@ -320,15 +304,14 @@
         self._title_mapping = load_terms(frozen['titleterms'])
         # no need to load keywords/objtypes
 
-    def dump(self, stream, format):
-        # type: (IO, Any) -> None
+    def dump(self, stream: IO, format: Any) -> None:
         """Dump the frozen index to a stream."""
         if isinstance(format, str):
             format = self.formats[format]
         format.dump(self.freeze(), stream)
 
-    def get_objects(self, fn2index):
-        # type: (Dict[str, int]) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]
+    def get_objects(self, fn2index: Dict[str, int]
+                    ) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]:
         rv = {}  # type: Dict[str, Dict[str, Tuple[int, int, int, str]]]
         otypes = self._objtypes
         onames = self._objnames
@@ -364,8 +347,7 @@
                 pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
         return rv
 
-    def get_terms(self, fn2index):
-        # type: (Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]
+    def get_terms(self, fn2index: Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
         rvs = {}, {}  # type: Tuple[Dict[str, List[str]], Dict[str, List[str]]]
         for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
             for k, v in mapping.items():
@@ -377,8 +359,7 @@
                 rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
         return rvs
 
-    def freeze(self):
-        # type: () -> Dict[str, Any]
+    def freeze(self) -> Dict[str, Any]:
         """Create a usable data structure for serializing."""
         docnames, titles = zip(*sorted(self._titles.items()))
         filenames = [self._filenames.get(docname) for docname in docnames]
@@ -392,12 +373,10 @@
                     objects=objects, objtypes=objtypes, objnames=objnames,
                     titleterms=title_terms, envversion=self.env.version)
 
-    def label(self):
-        # type: () -> str
+    def label(self) -> str:
         return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)
 
-    def prune(self, docnames):
-        # type: (Iterable[str]) -> None
+    def prune(self, docnames: Iterable[str]) -> None:
         """Remove data for all docnames not in the list."""
         new_titles = {}
         new_filenames = {}
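For orientation: `freeze()` bundles the builder's state into one plain dict that either backend in `formats` (jsdump or pickle) can serialize, and `load()` above reads the same keys back. A sketch of the shape, partly reconstructed from the keys visible in these hunks; the example values are invented:

    frozen = {
        'docnames': ['index'],       # sorted document names
        'filenames': ['index.rst'],  # parallel to docnames
        'titles': ['Index'],         # parallel to docnames
        'terms': {},                 # stemmed word -> doc index(es)
        'titleterms': {},            # same, but for words in titles
        'objects': {}, 'objtypes': {}, 'objnames': {},
        'envversion': '2.0',         # build environment version stamp
    }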
@@ -412,8 +391,7 @@
         for wordnames in self._title_mapping.values():
             wordnames.intersection_update(docnames)
 
-    def feed(self, docname, filename, title, doctree):
-        # type: (str, str, str, nodes.document) -> None
+    def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
         """Feed a doctree to the index."""
         self._titles[docname] = title
         self._filenames[docname] = filename
@@ -422,8 +400,7 @@
         doctree.walk(visitor)
 
         # memoize self.lang.stem
-        def stem(word):
-            # type: (str) -> str
+        def stem(word: str) -> str:
             try:
                 return self._stem_cache[word]
             except KeyError:
@@ -447,8 +424,7 @@
             if _filter(stemmed_word) and not already_indexed:
                 self._mapping.setdefault(stemmed_word, set()).add(docname)
 
-    def context_for_searchtool(self):
-        # type: () -> Dict[str, Any]
+    def context_for_searchtool(self) -> Dict[str, Any]:
         return {
             'search_language_stemming_code': self.lang.js_stemmer_code,
             'search_language_stop_words': jsdump.dumps(sorted(self.lang.stopwords)),
@@ -456,8 +432,7 @@
             'search_word_splitter_code': self.js_splitter_code,
         }
 
-    def get_js_stemmer_rawcode(self):
-        # type: () -> str
+    def get_js_stemmer_rawcode(self) -> str:
         if self.lang.js_stemmer_rawcode:
            return path.join(package_dir, 'search', 'non-minified-js',
                             self.lang.js_stemmer_rawcode)
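The nested `stem()` inside `feed()` is a small per-build memo over `self.lang.stem`: each distinct word is stemmed once and cached in `self._stem_cache`. The same pattern in isolation, as a sketch with a toy stemmer standing in for the language object:

    from typing import Dict

    _stem_cache = {}  # type: Dict[str, str]

    def expensive_stem(word: str) -> str:
        return word.lower().rstrip('s')  # stand-in for lang.stem()

    def stem(word: str) -> str:
        try:
            return _stem_cache[word]
        except KeyError:
            _stem_cache[word] = expensive_stem(word)
            return _stem_cache[word]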
sphinx/search/da.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 danish_stopwords = parse_stop_word('''
@@ -128,10 +126,8 @@ class SearchDanish(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = danish_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('danish')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
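The same two-hunk edit repeats for every Snowball-backed language file below (de, es, fi, fr, hu, it, nl, no, pt, ru, sv); only the stemmer name changes. For reference, the snowballstemmer API that each `init`/`stem` pair relies on, in a minimal sketch:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('danish')
    # stemWord() reduces a single lowercased word to its stem
    print(stemmer.stemWord('bøgerne'))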
sphinx/search/de.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 german_stopwords = parse_stop_word('''
@@ -311,10 +309,8 @@ class SearchGerman(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = german_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('german')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/en.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
+from typing import Dict
+
 from sphinx.search import SearchLanguage
 from sphinx.util.stemmer import get_stemmer
 
-if False:
-    # For type annotation
-    from typing import Dict  # NOQA
-
 english_stopwords = set("""
 a and are as at
 be but by
@@ -220,10 +218,8 @@ class SearchEnglish(SearchLanguage):
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
 
-    def init(self, options):
-        # type: (Dict) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = get_stemmer()
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stem(word.lower())
sphinx/search/es.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 spanish_stopwords = parse_stop_word('''
@@ -371,10 +369,8 @@ class SearchSpanish(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = spanish_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('spanish')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/fi.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 finnish_stopwords = parse_stop_word('''
@@ -121,10 +119,8 @@ class SearchFinnish(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = finnish_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('finnish')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/fr.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 french_stopwords = parse_stop_word('''
@@ -207,10 +205,8 @@ class SearchFrench(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = french_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('french')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/hu.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 hungarian_stopwords = parse_stop_word('''
@@ -235,10 +233,8 @@ class SearchHungarian(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = hungarian_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('hungarian')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/it.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 italian_stopwords = parse_stop_word('''
@@ -324,10 +322,8 @@ class SearchItalian(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = italian_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('italian')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/ja.py
@@ -20,6 +20,7 @@ import os
 import re
 import sys
 import warnings
+from typing import Any, Dict, List
 
 try:
     import MeCab
@@ -38,21 +39,13 @@ from sphinx.errors import SphinxError, ExtensionError
 from sphinx.search import SearchLanguage
 from sphinx.util import import_object
 
-if False:
-    # For type annotation
-    from typing import Any, Dict, List  # NOQA
-
 
 class BaseSplitter:
 
-    def __init__(self, options):
-        # type: (Dict) -> None
+    def __init__(self, options: Dict) -> None:
         self.options = options
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         """
-
         :param str input:
         :return:
         :rtype: list[str]
@@ -61,8 +54,7 @@ class BaseSplitter:
 
 
 class MecabSplitter(BaseSplitter):
-    def __init__(self, options):
-        # type: (Dict) -> None
+    def __init__(self, options: Dict) -> None:
         super().__init__(options)
         self.ctypes_libmecab = None  # type: Any
         self.ctypes_mecab = None  # type: Any
@@ -72,8 +64,7 @@ class MecabSplitter(BaseSplitter):
         self.init_native(options)
         self.dict_encode = options.get('dic_enc', 'utf-8')
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         if native_module:
             result = self.native.parse(input)
         else:
@@ -81,16 +72,14 @@ class MecabSplitter(BaseSplitter):
                 self.ctypes_mecab, input.encode(self.dict_encode))
         return result.split(' ')
 
-    def init_native(self, options):
-        # type: (Dict) -> None
+    def init_native(self, options: Dict) -> None:
         param = '-Owakati'
         dict = options.get('dict')
         if dict:
             param += ' -d %s' % dict
         self.native = MeCab.Tagger(param)
 
-    def init_ctypes(self, options):
-        # type: (Dict) -> None
+    def init_ctypes(self, options: Dict) -> None:
         import ctypes.util
 
         lib = options.get('lib')
@@ -126,8 +115,7 @@ class MecabSplitter(BaseSplitter):
         if self.ctypes_mecab is None:
             raise SphinxError('mecab initialization failed')
 
-    def __del__(self):
-        # type: () -> None
+    def __del__(self) -> None:
         if self.ctypes_libmecab:
             self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
 
@@ -135,21 +123,18 @@ MeCabBinder = MecabSplitter  # keep backward compatibility until Sphinx-1.6
 
 
 class JanomeSplitter(BaseSplitter):
-    def __init__(self, options):
-        # type: (Dict) -> None
+    def __init__(self, options: Dict) -> None:
         super().__init__(options)
         self.user_dict = options.get('user_dic')
         self.user_dict_enc = options.get('user_dic_enc', 'utf8')
         self.init_tokenizer()
 
-    def init_tokenizer(self):
-        # type: () -> None
+    def init_tokenizer(self) -> None:
         if not janome_module:
             raise RuntimeError('Janome is not available')
         self.tokenizer = janome.tokenizer.Tokenizer(udic=self.user_dict, udic_enc=self.user_dict_enc)
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         result = ' '.join(token.surface for token in self.tokenizer.tokenize(input))
         return result.split(' ')
 
@@ -425,23 +410,20 @@ class DefaultSplitter(BaseSplitter):
         '郎': 1082, '1': -270, 'E1': 306, 'ル': -673, 'ン': -496}
 
     # ctype_
-    def ctype_(self, char):
-        # type: (str) -> str
+    def ctype_(self, char: str) -> str:
         for pattern, value in self.patterns_.items():
             if pattern.match(char):
                 return value
         return 'O'
 
    # ts_
-    def ts_(self, dict, key):
-        # type: (Dict[str, int], str) -> int
+    def ts_(self, dict: Dict[str, int], key: str) -> int:
         if key in dict:
             return dict[key]
         return 0
 
    # segment
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         if not input:
             return []
 
@@ -549,8 +531,7 @@ class SearchJapanese(SearchLanguage):
         'janome': 'sphinx.search.ja.JanomeSplitter',
     }
 
-    def init(self, options):
-        # type: (Dict) -> None
+    def init(self, options: Dict) -> None:
         type = options.get('type', 'sphinx.search.ja.DefaultSplitter')
         if type in self.splitters:
             dotted_path = self.splitters[type]
@@ -565,14 +546,11 @@ class SearchJapanese(SearchLanguage):
             raise ExtensionError("Splitter module %r can't be imported" %
                                  dotted_path)
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         return self.splitter.split(input)
 
-    def word_filter(self, stemmed_word):
-        # type: (str) -> bool
+    def word_filter(self, stemmed_word: str) -> bool:
         return len(stemmed_word) > 1
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return word
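`SearchJapanese.init()` resolves its word splitter from the `type` search option, accepting either a shorthand key from `splitters` or a dotted path to a `BaseSplitter` subclass. In a project this is typically driven from conf.py via `html_search_options`; a sketch with illustrative values:

    # conf.py (illustrative)
    language = 'ja'
    html_search_options = {
        # a dotted path, or a shorthand key such as 'janome'
        'type': 'sphinx.search.ja.JanomeSplitter',
    }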
sphinx/search/nl.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 dutch_stopwords = parse_stop_word('''
@@ -135,10 +133,8 @@ class SearchDutch(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = dutch_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('dutch')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/no.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 norwegian_stopwords = parse_stop_word('''
@@ -210,10 +208,8 @@ class SearchNorwegian(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = norwegian_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('norwegian')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/pt.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 portuguese_stopwords = parse_stop_word('''
@@ -270,10 +268,8 @@ class SearchPortuguese(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = portuguese_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('portuguese')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
File diff suppressed because one or more lines are too long
sphinx/search/ru.py
@@ -8,13 +8,11 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any  # NOQA
+from sphinx.search import SearchLanguage, parse_stop_word
 
 
 russian_stopwords = parse_stop_word('''
@@ -259,10 +257,8 @@ class SearchRussian(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = russian_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('russian')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
sphinx/search/sv.py
@@ -8,13 +8,12 @@
 :license: BSD, see LICENSE for details.
 """
 
-from sphinx.search import SearchLanguage, parse_stop_word
+from typing import Dict
 
 import snowballstemmer
 
-if False:
-    # For type annotation
-    from typing import Any
+from sphinx.search import SearchLanguage, parse_stop_word
+
 
 swedish_stopwords = parse_stop_word('''
 | source: http://snowball.tartarus.org/algorithms/swedish/stop.txt
@@ -147,10 +146,8 @@ class SearchSwedish(SearchLanguage):
     js_stemmer_code = js_stemmer
     stopwords = swedish_stopwords
 
-    def init(self, options):
-        # type: (Any) -> None
+    def init(self, options: Dict) -> None:
         self.stemmer = snowballstemmer.stemmer('swedish')
 
-    def stem(self, word):
-        # type: (str) -> str
+    def stem(self, word: str) -> str:
         return self.stemmer.stemWord(word.lower())
File diff suppressed because one or more lines are too long
sphinx/search/zh.py
@@ -10,6 +10,7 @@
 
 import os
 import re
+from typing import Dict, List
 
 from sphinx.search import SearchLanguage
 from sphinx.util.stemmer import get_stemmer
@@ -20,10 +21,6 @@ try:
 except ImportError:
     JIEBA = False
 
-if False:
-    # For type annotation
-    from typing import Dict, List  # NOQA
-
 english_stopwords = set("""
 a and are as at
 be but by
@@ -235,8 +232,7 @@ class SearchChinese(SearchLanguage):
     latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
     latin_terms = []  # type: List[str]
 
-    def init(self, options):
-        # type: (Dict) -> None
+    def init(self, options: Dict) -> None:
         if JIEBA:
             dict_path = options.get('dict')
             if dict_path and os.path.isfile(dict_path):
@@ -244,8 +240,7 @@
 
         self.stemmer = get_stemmer()
 
-    def split(self, input):
-        # type: (str) -> List[str]
+    def split(self, input: str) -> List[str]:
         chinese = []  # type: List[str]
         if JIEBA:
             chinese = list(jieba.cut_for_search(input))
@@ -255,13 +250,10 @@
         self.latin_terms.extend(latin1)
         return chinese + latin1
 
-    def word_filter(self, stemmed_word):
-        # type: (str) -> bool
+    def word_filter(self, stemmed_word: str) -> bool:
         return len(stemmed_word) > 1
 
-    def stem(self, word):
-        # type: (str) -> str
-
+    def stem(self, word: str) -> str:
         # Don't stem Latin words that are long enough to be relevant for search
         # if not stemmed, but would be too short after being stemmed
        # avoids some issues with acronyms