Migrate to py3 style type annotation: sphinx.search

Takeshi KOMIYA 2019-12-25 00:52:47 +09:00
parent 28ebe127a8
commit 4ec7fdf24b
18 changed files with 116 additions and 228 deletions
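The pattern repeated through every file below: py2-style "# type:" comments move into the signature as py3 annotations, and the "if False:" guard that hid typing imports from runtime becomes a plain top-level import. A hedged before/after sketch (the function is illustrative, not taken from any one hunk):

# Before: comment-based annotation; typing imports guarded behind `if False`
if False:
    # For type annotation
    from typing import List  # NOQA

def split(input):
    # type: (str) -> List[str]
    return input.split()

# After: annotation in the signature; typing imported unconditionally
from typing import List

def split(input: str) -> List[str]:
    return input.split()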

sphinx/search/__init__.py

@@ -13,21 +13,21 @@ import re
import warnings
from importlib import import_module
from os import path
from typing import Any, Dict, IO, Iterable, List, Tuple, Set
from docutils import nodes
from docutils.nodes import Node
from sphinx import addnodes
from sphinx import package_dir
from sphinx.deprecation import RemovedInSphinx40Warning
from sphinx.environment import BuildEnvironment
from sphinx.search.jssplitter import splitter_code
from sphinx.util import jsdump, rpartition
if False:
# For type annotation
from typing import Any, Dict, IO, Iterable, List, Tuple, Set # NOQA
from typing import Type # for python3.5.1
from docutils import nodes # NOQA
from sphinx.environment import BuildEnvironment # NOQA
class SearchLanguage:
@@ -69,19 +69,16 @@ var Stemmer = function() {
_word_re = re.compile(r'(?u)\w+')
def __init__(self, options):
# type: (Dict) -> None
def __init__(self, options: Dict) -> None:
self.options = options
self.init(options)
def init(self, options):
# type: (Dict) -> None
def init(self, options: Dict) -> None:
"""
Initialize the class with the options the user has given.
"""
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
"""
This method splits a sentence into words. Default splitter splits input
at white spaces, which should be enough for most languages except CJK
@@ -89,8 +86,7 @@ var Stemmer = function() {
"""
return self._word_re.findall(input)
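As a quick sketch of what that default means in practice (the sample string is illustrative):

import re

word_re = re.compile(r'(?u)\w+')        # same pattern as SearchLanguage._word_re
word_re.findall('Hello, world 123')     # -> ['Hello', 'world', '123']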
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
"""
This method implements stemming algorithm of the Python version.
@@ -103,8 +99,7 @@ var Stemmer = function() {
"""
return word
def word_filter(self, word):
# type: (str) -> bool
def word_filter(self, word: str) -> bool:
"""
Return true if the target word should be registered in the search index.
This method is called after stemming.
@@ -121,8 +116,7 @@ var Stemmer = function() {
from sphinx.search.en import SearchEnglish
def parse_stop_word(source):
# type: (str) -> Set[str]
def parse_stop_word(source: str) -> Set[str]:
"""
parse snowball style word list like this:
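A hedged usage sketch of parse_stop_word (the `|`-comment convention is an assumption drawn from the "| source:" line in the Swedish list later in this commit; the words are illustrative):

from sphinx.search import parse_stop_word

stopwords = parse_stop_word('''
| an illustrative snowball-style list
og i jeg det   | trailing comments are dropped too
at en den
''')
# expected: comments stripped, remaining tokens split on whitespace
# stopwords == {'og', 'i', 'jeg', 'det', 'at', 'en', 'den'}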
@@ -166,24 +160,20 @@ class _JavaScriptIndex:
PREFIX = 'Search.setIndex('
SUFFIX = ')'
def dumps(self, data):
# type: (Any) -> str
def dumps(self, data: Any) -> str:
return self.PREFIX + jsdump.dumps(data) + self.SUFFIX
def loads(self, s):
# type: (str) -> Any
def loads(self, s: str) -> Any:
data = s[len(self.PREFIX):-len(self.SUFFIX)]
if not data or not s.startswith(self.PREFIX) or not \
s.endswith(self.SUFFIX):
raise ValueError('invalid data')
return jsdump.loads(data)
def dump(self, data, f):
# type: (Any, IO) -> None
def dump(self, data: Any, f: IO) -> None:
f.write(self.dumps(data))
def load(self, f):
# type: (IO) -> Any
def load(self, f: IO) -> Any:
return self.loads(f.read())
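A round-trip sketch of _JavaScriptIndex (the exact serialized string depends on jsdump, so only the wrapper is asserted):

from sphinx.search import _JavaScriptIndex

js_index = _JavaScriptIndex()
payload = js_index.dumps({'docnames': ['index']})

# dumps() wraps the jsdump output in the PREFIX/SUFFIX pair
assert payload.startswith('Search.setIndex(') and payload.endswith(')')

# loads() strips the wrapper and rejects anything else with ValueError
assert js_index.loads(payload) == {'docnames': ['index']}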
@@ -195,15 +185,13 @@ class WordCollector(nodes.NodeVisitor):
A special visitor that collects words for the `IndexBuilder`.
"""
def __init__(self, document, lang):
# type: (nodes.document, SearchLanguage) -> None
def __init__(self, document: nodes.document, lang: SearchLanguage) -> None:
super().__init__(document)
self.found_words = [] # type: List[str]
self.found_title_words = [] # type: List[str]
self.lang = lang
def is_meta_keywords(self, node, nodetype=None):
# type: (addnodes.meta, Any) -> bool
def is_meta_keywords(self, node: addnodes.meta, nodetype: Any = None) -> bool:
if nodetype is not None:
warnings.warn('"nodetype" argument for WordCollector.is_meta_keywords() '
'is deprecated.', RemovedInSphinx40Warning)
@@ -217,8 +205,7 @@ class WordCollector(nodes.NodeVisitor):
return False
def dispatch_visit(self, node):
# type: (nodes.Node) -> None
def dispatch_visit(self, node: Node) -> None:
if isinstance(node, nodes.comment):
raise nodes.SkipNode
elif isinstance(node, nodes.raw):
@@ -251,8 +238,7 @@ class IndexBuilder:
'pickle': pickle
}
def __init__(self, env, lang, options, scoring):
# type: (BuildEnvironment, str, Dict, str) -> None
def __init__(self, env: BuildEnvironment, lang: str, options: Dict, scoring: str) -> None:
self.env = env
self._titles = {} # type: Dict[str, str]
# docname -> title
@@ -292,8 +278,7 @@ class IndexBuilder:
self.js_scorer_code = ''
self.js_splitter_code = splitter_code
def load(self, stream, format):
# type: (IO, Any) -> None
def load(self, stream: IO, format: Any) -> None:
"""Reconstruct from frozen data."""
if isinstance(format, str):
format = self.formats[format]
@@ -306,8 +291,7 @@ class IndexBuilder:
self._filenames = dict(zip(index2fn, frozen['filenames']))
self._titles = dict(zip(index2fn, frozen['titles']))
def load_terms(mapping):
# type: (Dict[str, Any]) -> Dict[str, Set[str]]
def load_terms(mapping: Dict[str, Any]) -> Dict[str, Set[str]]:
rv = {}
for k, v in mapping.items():
if isinstance(v, int):
@@ -320,15 +304,14 @@ class IndexBuilder:
self._title_mapping = load_terms(frozen['titleterms'])
# no need to load keywords/objtypes
def dump(self, stream, format):
# type: (IO, Any) -> None
def dump(self, stream: IO, format: Any) -> None:
"""Dump the frozen index to a stream."""
if isinstance(format, str):
format = self.formats[format]
format.dump(self.freeze(), stream)
def get_objects(self, fn2index):
# type: (Dict[str, int]) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]
def get_objects(self, fn2index: Dict[str, int]
) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]:
rv = {} # type: Dict[str, Dict[str, Tuple[int, int, int, str]]]
otypes = self._objtypes
onames = self._objnames
@@ -364,8 +347,7 @@ class IndexBuilder:
pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
return rv
def get_terms(self, fn2index):
# type: (Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]
def get_terms(self, fn2index: Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
rvs = {}, {} # type: Tuple[Dict[str, List[str]], Dict[str, List[str]]]
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
for k, v in mapping.items():
@@ -377,8 +359,7 @@ class IndexBuilder:
rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
return rvs
def freeze(self):
# type: () -> Dict[str, Any]
def freeze(self) -> Dict[str, Any]:
"""Create a usable data structure for serializing."""
docnames, titles = zip(*sorted(self._titles.items()))
filenames = [self._filenames.get(docname) for docname in docnames]
@@ -392,12 +373,10 @@ class IndexBuilder:
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version)
def label(self):
# type: () -> str
def label(self) -> str:
return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)
def prune(self, docnames):
# type: (Iterable[str]) -> None
def prune(self, docnames: Iterable[str]) -> None:
"""Remove data for all docnames not in the list."""
new_titles = {}
new_filenames = {}
@@ -412,8 +391,7 @@ class IndexBuilder:
for wordnames in self._title_mapping.values():
wordnames.intersection_update(docnames)
def feed(self, docname, filename, title, doctree):
# type: (str, str, str, nodes.document) -> None
def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
"""Feed a doctree to the index."""
self._titles[docname] = title
self._filenames[docname] = filename
@@ -422,8 +400,7 @@ class IndexBuilder:
doctree.walk(visitor)
# memoize self.lang.stem
def stem(word):
# type: (str) -> str
def stem(word: str) -> str:
try:
return self._stem_cache[word]
except KeyError:
@@ -447,8 +424,7 @@ class IndexBuilder:
if _filter(stemmed_word) and not already_indexed:
self._mapping.setdefault(stemmed_word, set()).add(docname)
def context_for_searchtool(self):
# type: () -> Dict[str, Any]
def context_for_searchtool(self) -> Dict[str, Any]:
return {
'search_language_stemming_code': self.lang.js_stemmer_code,
'search_language_stop_words': jsdump.dumps(sorted(self.lang.stopwords)),
@@ -456,8 +432,7 @@ class IndexBuilder:
'search_word_splitter_code': self.js_splitter_code,
}
def get_js_stemmer_rawcode(self):
# type: () -> str
def get_js_stemmer_rawcode(self) -> str:
if self.lang.js_stemmer_rawcode:
return path.join(package_dir, 'search', 'non-minified-js',
self.lang.js_stemmer_rawcode)

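Taken together, the IndexBuilder methods above form the index lifecycle: feed doctrees, optionally prune, then dump. A hedged sketch of a caller (env and doctree come from a real Sphinx build; the 'jsdump' format key is an assumption alongside the 'pickle' entry shown above):

from sphinx.search import IndexBuilder

builder = IndexBuilder(env, 'en', {}, scoring='')        # env: BuildEnvironment
builder.feed('index', 'index.rst', 'Welcome', doctree)   # docname, filename, title, doctree

with open('searchindex.js', 'w', encoding='utf-8') as f:
    builder.dump(f, 'jsdump')   # freeze() the collected data, then serialize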
sphinx/search/da.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
danish_stopwords = parse_stop_word('''
@@ -128,10 +126,8 @@ class SearchDanish(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = danish_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('danish')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())
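All of the snowball-backed language modules in this commit share this two-method shape; a standalone sketch of what init()/stem() do (output left unasserted since it depends on the snowball data):

import snowballstemmer

stemmer = snowballstemmer.stemmer('danish')
stemmer.stemWord('bøgerne'.lower())   # lowercase first, then stem, as in stem() above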

sphinx/search/de.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
german_stopwords = parse_stop_word('''
@@ -311,10 +309,8 @@ class SearchGerman(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = german_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('german')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/en.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from typing import Dict
from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer
if False:
# For type annotation
from typing import Dict # NOQA
english_stopwords = set("""
a and are as at
be but by
@@ -220,10 +218,8 @@ class SearchEnglish(SearchLanguage):
js_stemmer_code = js_porter_stemmer
stopwords = english_stopwords
def init(self, options):
# type: (Dict) -> None
def init(self, options: Dict) -> None:
self.stemmer = get_stemmer()
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stem(word.lower())
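English is the one language here backed by Sphinx's bundled Porter stemmer via get_stemmer() rather than snowballstemmer; a quick sketch:

from sphinx.util.stemmer import get_stemmer

stemmer = get_stemmer()
stemmer.stem('running'.lower())   # -> 'run' under the classic Porter algorithm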

sphinx/search/es.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
spanish_stopwords = parse_stop_word('''
@@ -371,10 +369,8 @@ class SearchSpanish(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = spanish_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('spanish')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/fi.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
finnish_stopwords = parse_stop_word('''
@@ -121,10 +119,8 @@ class SearchFinnish(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = finnish_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('finnish')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/fr.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
french_stopwords = parse_stop_word('''
@@ -207,10 +205,8 @@ class SearchFrench(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = french_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('french')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/hu.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
hungarian_stopwords = parse_stop_word('''
@@ -235,10 +233,8 @@ class SearchHungarian(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = hungarian_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('hungarian')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/it.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
italian_stopwords = parse_stop_word('''
@@ -324,10 +322,8 @@ class SearchItalian(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = italian_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('italian')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/ja.py

@@ -20,6 +20,7 @@ import os
import re
import sys
import warnings
from typing import Any, Dict, List
try:
import MeCab
@@ -38,21 +39,13 @@ from sphinx.errors import SphinxError, ExtensionError
from sphinx.search import SearchLanguage
from sphinx.util import import_object
if False:
# For type annotation
from typing import Any, Dict, List # NOQA
class BaseSplitter:
def __init__(self, options):
# type: (Dict) -> None
def __init__(self, options: Dict) -> None:
self.options = options
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
"""
:param str input:
:return:
:rtype: list[str]
@@ -61,8 +54,7 @@ class BaseSplitter:
class MecabSplitter(BaseSplitter):
def __init__(self, options):
# type: (Dict) -> None
def __init__(self, options: Dict) -> None:
super().__init__(options)
self.ctypes_libmecab = None # type: Any
self.ctypes_mecab = None # type: Any
@@ -72,8 +64,7 @@ class MecabSplitter(BaseSplitter):
self.init_native(options)
self.dict_encode = options.get('dic_enc', 'utf-8')
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
if native_module:
result = self.native.parse(input)
else:
@@ -81,16 +72,14 @@ class MecabSplitter(BaseSplitter):
self.ctypes_mecab, input.encode(self.dict_encode))
return result.split(' ')
def init_native(self, options):
# type: (Dict) -> None
def init_native(self, options: Dict) -> None:
param = '-Owakati'
dict = options.get('dict')
if dict:
param += ' -d %s' % dict
self.native = MeCab.Tagger(param)
def init_ctypes(self, options):
# type: (Dict) -> None
def init_ctypes(self, options: Dict) -> None:
import ctypes.util
lib = options.get('lib')
@@ -126,8 +115,7 @@ class MecabSplitter(BaseSplitter):
if self.ctypes_mecab is None:
raise SphinxError('mecab initialization failed')
def __del__(self):
# type: () -> None
def __del__(self) -> None:
if self.ctypes_libmecab:
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
@@ -135,21 +123,18 @@ MeCabBinder = MecabSplitter # keep backward compatibility until Sphinx-1.6
class JanomeSplitter(BaseSplitter):
def __init__(self, options):
# type: (Dict) -> None
def __init__(self, options: Dict) -> None:
super().__init__(options)
self.user_dict = options.get('user_dic')
self.user_dict_enc = options.get('user_dic_enc', 'utf8')
self.init_tokenizer()
def init_tokenizer(self):
# type: () -> None
def init_tokenizer(self) -> None:
if not janome_module:
raise RuntimeError('Janome is not available')
self.tokenizer = janome.tokenizer.Tokenizer(udic=self.user_dict, udic_enc=self.user_dict_enc)
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
result = ' '.join(token.surface for token in self.tokenizer.tokenize(input))
return result.split(' ')
@@ -425,23 +410,20 @@ class DefaultSplitter(BaseSplitter):
'': 1082, '': -270, '': 306, '': -673, '': -496}
# ctype_
def ctype_(self, char):
# type: (str) -> str
def ctype_(self, char: str) -> str:
for pattern, value in self.patterns_.items():
if pattern.match(char):
return value
return 'O'
# ts_
def ts_(self, dict, key):
# type: (Dict[str, int], str) -> int
def ts_(self, dict: Dict[str, int], key: str) -> int:
if key in dict:
return dict[key]
return 0
# segment
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
if not input:
return []
@@ -549,8 +531,7 @@ class SearchJapanese(SearchLanguage):
'janome': 'sphinx.search.ja.JanomeSplitter',
}
def init(self, options):
# type: (Dict) -> None
def init(self, options: Dict) -> None:
type = options.get('type', 'sphinx.search.ja.DefaultSplitter')
if type in self.splitters:
dotted_path = self.splitters[type]
@@ -565,14 +546,11 @@ class SearchJapanese(SearchLanguage):
raise ExtensionError("Splitter module %r can't be imported" %
dotted_path)
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
return self.splitter.split(input)
def word_filter(self, stemmed_word):
# type: (str) -> bool
def word_filter(self, stemmed_word: str) -> bool:
return len(stemmed_word) > 1
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return word
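The splitter is selected through the options dict, which Sphinx populates from html_search_options in conf.py (that config path is an assumption; the 'type' key and the dotted default come from the hunk above):

# conf.py — hedged sketch of opting into the Janome splitter
language = 'ja'
html_search_options = {
    'type': 'sphinx.search.ja.JanomeSplitter',
}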

sphinx/search/nl.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
dutch_stopwords = parse_stop_word('''
@@ -135,10 +133,8 @@ class SearchDutch(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = dutch_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('dutch')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/no.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
norwegian_stopwords = parse_stop_word('''
@@ -210,10 +208,8 @@ class SearchNorwegian(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = norwegian_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('norwegian')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/pt.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
portuguese_stopwords = parse_stop_word('''
@@ -270,10 +268,8 @@ class SearchPortuguese(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = portuguese_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('portuguese')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

File diff suppressed because one or more lines are too long

sphinx/search/ru.py

@@ -8,13 +8,11 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any # NOQA
from sphinx.search import SearchLanguage, parse_stop_word
russian_stopwords = parse_stop_word('''
@@ -259,10 +257,8 @@ class SearchRussian(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = russian_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('russian')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

sphinx/search/sv.py

@@ -8,13 +8,12 @@
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage, parse_stop_word
from typing import Dict
import snowballstemmer
if False:
# For type annotation
from typing import Any
from sphinx.search import SearchLanguage, parse_stop_word
swedish_stopwords = parse_stop_word('''
| source: http://snowball.tartarus.org/algorithms/swedish/stop.txt
@@ -147,10 +146,8 @@ class SearchSwedish(SearchLanguage):
js_stemmer_code = js_stemmer
stopwords = swedish_stopwords
def init(self, options):
# type: (Any) -> None
def init(self, options: Dict) -> None:
self.stemmer = snowballstemmer.stemmer('swedish')
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
return self.stemmer.stemWord(word.lower())

File diff suppressed because one or more lines are too long

sphinx/search/zh.py

@@ -10,6 +10,7 @@
import os
import re
from typing import Dict, List
from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer
@@ -20,10 +21,6 @@ try:
except ImportError:
JIEBA = False
if False:
# For type annotation
from typing import Dict, List # NOQA
english_stopwords = set("""
a and are as at
be but by
@@ -235,8 +232,7 @@ class SearchChinese(SearchLanguage):
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
latin_terms = [] # type: List[str]
def init(self, options):
# type: (Dict) -> None
def init(self, options: Dict) -> None:
if JIEBA:
dict_path = options.get('dict')
if dict_path and os.path.isfile(dict_path):
@@ -244,8 +240,7 @@ class SearchChinese(SearchLanguage):
self.stemmer = get_stemmer()
def split(self, input):
# type: (str) -> List[str]
def split(self, input: str) -> List[str]:
chinese = [] # type: List[str]
if JIEBA:
chinese = list(jieba.cut_for_search(input))
@@ -255,13 +250,10 @@ class SearchChinese(SearchLanguage):
self.latin_terms.extend(latin1)
return chinese + latin1
def word_filter(self, stemmed_word):
# type: (str) -> bool
def word_filter(self, stemmed_word: str) -> bool:
return len(stemmed_word) > 1
def stem(self, word):
# type: (str) -> str
def stem(self, word: str) -> str:
# Don't stem Latin words that are long enough to be relevant for search
# if not stemmed, but would be too short after being stemmed
# avoids some issues with acronyms
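SearchChinese.split() shown above merges two term sources: jieba's search-mode segmentation for CJK text and the latin1_letters regex for embedded Latin words. A hedged sketch of that path (segmentation output depends on the jieba dictionary):

import re
import jieba

latin1_letters = re.compile(r'[a-zA-Z0-9_]+')

text = 'Sphinx 全文搜索'
chinese = list(jieba.cut_for_search(text))                  # CJK terms via jieba
latin1 = [t.strip() for t in latin1_letters.findall(text)]  # -> ['Sphinx']
terms = chinese + latin1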