sphinx/sphinx/search/ja.py
Jon Dufresne 22afc77c48 Python-3-only clean ups discovered by pyupgrade
https://github.com/asottile/pyupgrade

> A tool to automatically upgrade syntax for newer versions of the
> language.

- Drop u str prefix
- Drop base object inheritance
- Drop args to super()
- Use set literals
- Use dict comprehension
- Use set comprehension
2019-03-19 01:09:48 +09:00

579 lines
32 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
sphinx.search.ja
~~~~~~~~~~~~~~~~
Japanese search language: includes routine to split words.
:copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
# Python Version of TinySegmenter
# (http://chasen.org/~taku/software/TinySegmenter/)
# TinySegmenter is a super compact Japanese tokenizer.
#
# TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
# Python Version was developed by xnights <programming.magic(at)gmail.com>.
# For details, see http://programming-magic.com/?id=170
import os
import re
import sys
import warnings
try:
import MeCab
native_module = True
except ImportError:
native_module = False
try:
import janome.tokenizer
janome_module = True
except ImportError:
janome_module = False
from sphinx.deprecation import RemovedInSphinx30Warning
from sphinx.errors import SphinxError, ExtensionError
from sphinx.search import SearchLanguage
from sphinx.util import import_object
if False:
# For type annotation
from typing import Any, Dict, List # NOQA
class BaseSplitter:
    """Common interface shared by the word splitters in this module.

    A splitter is constructed with the search options dict and exposes a
    single operation, :meth:`split`, which concrete subclasses override.
    """

    def __init__(self, options):
        # type: (Dict) -> None
        # The dict is stored unchanged so subclasses can read their own keys.
        self.options = options

    def split(self, input):
        # type: (str) -> List[str]
        """Split *input* into a list of words.

        :param str input: text to split
        :rtype: list[str]
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
class MecabSplitter(BaseSplitter):
    """Word splitter backed by MeCab.

    When the ``MeCab`` Python binding was importable at module load time
    (``native_module`` is true) it is used directly; otherwise the MeCab
    shared library is loaded through :mod:`ctypes`.

    Option keys read from ``options``:

    * ``dict``    -- dictionary path, passed to MeCab as ``-d``
    * ``dic_enc`` -- encoding used for the ctypes byte interface
      (default ``'utf-8'``)
    * ``lib``     -- name or path of the shared library (ctypes mode only)
    """

    def __init__(self, options):
        # type: (Dict) -> None
        super().__init__(options)
        # Both stay None in native mode; __del__ relies on that.
        self.ctypes_libmecab = None  # type: Any
        self.ctypes_mecab = None  # type: Any
        self.dict_encode = options.get('dic_enc', 'utf-8')
        if not native_module:
            self.init_ctypes(options)
        else:
            self.init_native(options)

    def split(self, input):
        # type: (str) -> List[str]
        """Run MeCab in wakati mode on *input* and split its output on spaces.

        :raises SphinxError: in ctypes mode, when MeCab returns no result
        """
        if native_module:
            result = self.native.parse(input)
        else:
            raw = self.ctypes_libmecab.mecab_sparse_tostr(
                self.ctypes_mecab, input.encode(self.dict_encode))
            # restype is c_char_p: ctypes hands back ``bytes``, or None for
            # a NULL pointer.  Decode so both modes return str.
            if raw is None:
                raise SphinxError('mecab failed to parse the input')
            result = raw.decode(self.dict_encode)
        return result.split(' ')

    def init_native(self, options):
        # type: (Dict) -> None
        """Create the tagger through the ``MeCab`` Python binding."""
        param = '-Owakati'
        dict_path = options.get('dict')
        if dict_path:
            param += ' -d %s' % dict_path
        self.native = MeCab.Tagger(param)

    def init_ctypes(self, options):
        # type: (Dict) -> None
        """Load the MeCab shared library with ctypes and create a tagger.

        :raises RuntimeError: when the shared library cannot be located
        :raises SphinxError: when ``mecab_new2`` returns NULL
        """
        import ctypes.util

        lib = options.get('lib')

        if lib is None:
            if sys.platform.startswith('win'):
                libname = 'libmecab.dll'
            else:
                libname = 'mecab'
            libpath = ctypes.util.find_library(libname)
        elif os.path.basename(lib) == lib:
            # A bare name: let ctypes search the usual library locations.
            libpath = ctypes.util.find_library(lib)
        else:
            # Something with a directory part: use it only if it exists.
            libpath = None
            if os.path.exists(lib):
                libpath = lib
        if libpath is None:
            raise RuntimeError('MeCab dynamic library is not available')

        param = 'mecab -Owakati'
        dict_path = options.get('dict')
        if dict_path:
            param += ' -d %s' % dict_path

        fs_enc = sys.getfilesystemencoding() or sys.getdefaultencoding()

        libmecab = ctypes.CDLL(libpath)
        libmecab.mecab_new2.argtypes = (ctypes.c_char_p,)
        libmecab.mecab_new2.restype = ctypes.c_void_p
        libmecab.mecab_sparse_tostr.argtypes = (ctypes.c_void_p, ctypes.c_char_p)
        libmecab.mecab_sparse_tostr.restype = ctypes.c_char_p
        # Without argtypes the handle (a Python int) would be passed as a C
        # int and truncated on 64-bit platforms.
        libmecab.mecab_destroy.argtypes = (ctypes.c_void_p,)
        libmecab.mecab_destroy.restype = None
        self.ctypes_libmecab = libmecab

        self.ctypes_mecab = libmecab.mecab_new2(param.encode(fs_enc))
        if self.ctypes_mecab is None:
            raise SphinxError('mecab initialization failed')

    def __del__(self):
        # type: () -> None
        # getattr: __del__ also runs when __init__ failed part-way through.
        libmecab = getattr(self, 'ctypes_libmecab', None)
        handle = getattr(self, 'ctypes_mecab', None)
        if libmecab is not None and handle is not None:
            libmecab.mecab_destroy(handle)


MeCabBinder = MecabSplitter  # keep backward compatibility until Sphinx-1.6
class JanomeSplitter(BaseSplitter):
    """Word splitter backed by the Janome tokenizer.

    Option keys read from ``options``: ``user_dic`` (handed to the
    tokenizer as ``udic``) and ``user_dic_enc`` (handed over as
    ``udic_enc``, default ``'utf8'``).
    """

    def __init__(self, options):
        # type: (Dict) -> None
        super().__init__(options)
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()

    def init_tokenizer(self):
        # type: () -> None
        """Create ``self.tokenizer``.

        :raises RuntimeError: when janome could not be imported at module load
        """
        if janome_module:
            self.tokenizer = janome.tokenizer.Tokenizer(udic=self.user_dict,
                                                        udic_enc=self.user_dict_enc)
            return
        raise RuntimeError('Janome is not available')

    def split(self, input):
        # type: (str) -> List[str]
        """Tokenize *input* and return the token surfaces, split on spaces."""
        surfaces = [token.surface for token in self.tokenizer.tokenize(input)]
        # Join-then-split is kept on purpose: a surface that itself contains
        # a space ends up as several entries.
        joined = ' '.join(surfaces)
        return joined.split(' ')
class DefaultSplitter(BaseSplitter):
# Character-class patterns used by ctype_(): the first pattern (in insertion
# order) that matches a character decides its class letter, so the kanji
# numerals ('M') must stay ahead of the general kanji range ('H').
# Half-width and full-width forms are written as \u escapes so they cannot
# be silently dropped or normalised by tools that process this file:
#   U+FF71-U+FF9D, U+FF9E, U+FF70  half-width katakana, voiced mark, long mark
#   U+FF41-U+FF5A, U+FF21-U+FF3A   full-width a-z, A-Z
#   U+FF10-U+FF19                  full-width 0-9
patterns_ = {re.compile(pattern): value for pattern, value in {
    '[一二三四五六七八九十百千万億兆]': 'M',
    '[一-龠々〆ヵヶ]': 'H',
    '[ぁ-ん]': 'I',
    '[ァ-ヴー\uff71-\uff9d\uff9e\uff70]': 'K',
    '[a-zA-Z\uff41-\uff5a\uff21-\uff3a]': 'A',
    '[0-9\uff10-\uff19]': 'N',
}.items()}
# Weight tables read by split().  For every candidate boundary, split()
# starts from BIAS__, adds the entry found in each table for the current
# context (missing keys count as 0 via ts_()), and emits a boundary when the
# total is positive.
#
# Key alphabets, as built in split():
#   * character classes are the letters returned by ctype_()
#     ('M', 'H', 'I', 'K', 'A', 'N', or 'O' when nothing matches);
#   * boundary states are 'U' (initial value), 'B' (a boundary was emitted)
#     and 'O' (no boundary) for the three previous positions.
#
# NOTE(review): several of the later tables (e.g. UW1__) contain many repeated
# '' keys, which collapse to a single entry in a dict literal -- it looks like
# characters were lost somewhere; verify against the upstream TinySegmenter data.
BIAS__ = -332
# BC1__..BC3__: keyed by two adjacent character classes
# (c2 + c3, c3 + c4 and c4 + c5 respectively).
BC1__ = {'HH': 6, 'II': 2461, 'KH': 406, 'OH': -1378}
BC2__ = {'AA': -3267, 'AI': 2744, 'AN': -878, 'HH': -4070, 'HM': -1711,
'HN': 4012, 'HO': 3761, 'IA': 1327, 'IH': -1184, 'II': -1332,
'IK': 1721, 'IO': 5492, 'KI': 3831, 'KK': -8741, 'MH': -3132,
'MK': 3334, 'OO': -2920}
BC3__ = {'HH': 996, 'HI': 626, 'HK': -721, 'HN': -1307, 'HO': -836, 'IH': -301,
'KK': 2762, 'MK': 1079, 'MM': 4034, 'OA': -1652, 'OH': 266}
# BP1__, BP2__: keyed by two consecutive boundary states (p1 + p2, p2 + p3).
BP1__ = {'BB': 295, 'OB': 304, 'OO': -125, 'UB': 352}
BP2__ = {'BO': 60, 'OO': -1762}
# BQ1__..BQ4__: keyed by one boundary state followed by two character classes
# (p2 + c2 + c3, p2 + c3 + c4, p3 + c2 + c3, p3 + c3 + c4).
BQ1__ = {'BHH': 1150, 'BHM': 1521, 'BII': -1158, 'BIM': 886, 'BMH': 1208,
'BNH': 449, 'BOH': -91, 'BOO': -2597, 'OHI': 451, 'OIH': -296,
'OKA': 1851, 'OKH': -1020, 'OKK': 904, 'OOO': 2965}
BQ2__ = {'BHH': 118, 'BHI': -1159, 'BHM': 466, 'BIH': -919, 'BKK': -1720,
'BKO': 864, 'OHH': -1139, 'OHM': -181, 'OIH': 153, 'UHI': -1146}
BQ3__ = {'BHH': -792, 'BHI': 2664, 'BII': -299, 'BKI': 419, 'BMH': 937,
'BMM': 8335, 'BNN': 998, 'BOH': 775, 'OHH': 2174, 'OHM': 439, 'OII': 280,
'OKH': 1798, 'OKI': -793, 'OKO': -2242, 'OMH': -2402, 'OOO': 11699}
BQ4__ = {'BHH': -3895, 'BIH': 3761, 'BII': -4654, 'BIK': 1348, 'BKK': -1806,
'BMI': -3385, 'BOO': -12396, 'OAH': 926, 'OHH': 266, 'OHK': -2036,
'ONN': -973}
BW1__ = {',と': 660, ',同': 727, 'B1あ': 1404, 'B1同': 542, '、と': 660,
'、同': 727, '」と': 1682, 'あっ': 1505, 'いう': 1743, 'いっ': -2055,
'いる': 672, 'うし': -4817, 'うん': 665, 'から': 3472, 'がら': 600,
'こう': -790, 'こと': 2083, 'こん': -1262, 'さら': -4143, 'さん': 4573,
'した': 2641, 'して': 1104, 'すで': -3399, 'そこ': 1977, 'それ': -871,
'たち': 1122, 'ため': 601, 'った': 3463, 'つい': -802, 'てい': 805,
'てき': 1249, 'でき': 1127, 'です': 3445, 'では': 844, 'とい': -4915,
'とみ': 1922, 'どこ': 3887, 'ない': 5713, 'なっ': 3015, 'など': 7379,
'なん': -1113, 'にし': 2468, 'には': 1498, 'にも': 1671, 'に対': -912,
'の一': -501, 'の中': 741, 'ませ': 2448, 'まで': 1711, 'まま': 2600,
'まる': -2155, 'やむ': -1947, 'よっ': -2565, 'れた': 2369, 'れで': -913,
'をし': 1860, 'を見': 731, '亡く': -1886, '京都': 2558, '取り': -2784,
'大き': -2604, '大阪': 1497, '平方': -2314, '引き': -1336, '日本': -195,
'本当': -2423, '毎日': -2113, '目指': -724, 'B1あ': 1404, 'B1同': 542,
'」と': 1682}
BW2__ = {'..': -11822, '11': -669, '――': -5730, '': -13175, 'いう': -1609,
'うか': 2490, 'かし': -1350, 'かも': -602, 'から': -7194, 'かれ': 4612,
'がい': 853, 'がら': -3198, 'きた': 1941, 'くな': -1597, 'こと': -8392,
'この': -4193, 'させ': 4533, 'され': 13168, 'さん': -3977, 'しい': -1819,
'しか': -545, 'した': 5078, 'して': 972, 'しな': 939, 'その': -3744,
'たい': -1253, 'たた': -662, 'ただ': -3857, 'たち': -786, 'たと': 1224,
'たは': -939, 'った': 4589, 'って': 1647, 'っと': -2094, 'てい': 6144,
'てき': 3640, 'てく': 2551, 'ては': -3110, 'ても': -3065, 'でい': 2666,
'でき': -1528, 'でし': -3828, 'です': -4761, 'でも': -4203, 'とい': 1890,
'とこ': -1746, 'とと': -2279, 'との': 720, 'とみ': 5168, 'とも': -3941,
'ない': -2488, 'なが': -1313, 'など': -6509, 'なの': 2614, 'なん': 3099,
'にお': -1615, 'にし': 2748, 'にな': 2454, 'によ': -7236, 'に対': -14943,
'に従': -4688, 'に関': -11388, 'のか': 2093, 'ので': -7059, 'のに': -6041,
'のの': -6125, 'はい': 1073, 'はが': -1033, 'はず': -2532, 'ばれ': 1813,
'まし': -1316, 'まで': -6621, 'まれ': 5409, 'めて': -3153, 'もい': 2230,
'もの': -10713, 'らか': -944, 'らし': -1611, 'らに': -1897, 'りし': 651,
'りま': 1620, 'れた': 4270, 'れて': 849, 'れば': 4114, 'ろう': 6067,
'われ': 7901, 'を通': -11877, 'んだ': 728, 'んな': -4115, '一人': 602,
'一方': -1375, '一日': 970, '一部': -1051, '上が': -4479, '会社': -1116,
'出て': 2163, '分の': -7758, '同党': 970, '同日': -913, '大阪': -2471,
'委員': -1250, '少な': -1050, '年度': -8669, '年間': -1626, '府県': -2363,
'手権': -1982, '新聞': -4066, '日新': -722, '日本': -7068, '日米': 3372,
'曜日': -601, '朝鮮': -2355, '本人': -2697, '東京': -1543, '然と': -1384,
'社会': -1276, '立て': -990, '第に': -1612, '米国': -4268, '': -669}
BW3__ = {'あた': -2194, 'あり': 719, 'ある': 3846, 'い.': -1185, 'い。': -1185,
'いい': 5308, 'いえ': 2079, 'いく': 3029, 'いた': 2056, 'いっ': 1883,
'いる': 5600, 'いわ': 1527, 'うち': 1117, 'うと': 4798, 'えと': 1454,
'か.': 2857, 'か。': 2857, 'かけ': -743, 'かっ': -4098, 'かに': -669,
'から': 6520, 'かり': -2670, 'が,': 1816, 'が、': 1816, 'がき': -4855,
'がけ': -1127, 'がっ': -913, 'がら': -4977, 'がり': -2064, 'きた': 1645,
'けど': 1374, 'こと': 7397, 'この': 1542, 'ころ': -2757, 'さい': -714,
'さを': 976, 'し,': 1557, 'し、': 1557, 'しい': -3714, 'した': 3562,
'して': 1449, 'しな': 2608, 'しま': 1200, 'す.': -1310, 'す。': -1310,
'する': 6521, 'ず,': 3426, 'ず、': 3426, 'ずに': 841, 'そう': 428,
'た.': 8875, 'た。': 8875, 'たい': -594, 'たの': 812, 'たり': -1183,
'たる': -853, 'だ.': 4098, 'だ。': 4098, 'だっ': 1004, 'った': -4748,
'って': 300, 'てい': 6240, 'てお': 855, 'ても': 302, 'です': 1437,
'でに': -1482, 'では': 2295, 'とう': -1387, 'とし': 2266, 'との': 541,
'とも': -3543, 'どう': 4664, 'ない': 1796, 'なく': -903, 'など': 2135,
'に,': -1021, 'に、': -1021, 'にし': 1771, 'にな': 1906, 'には': 2644,
'の,': -724, 'の、': -724, 'の子': -1000, 'は,': 1337, 'は、': 1337,
'べき': 2181, 'まし': 1113, 'ます': 6943, 'まっ': -1549, 'まで': 6154,
'まれ': -793, 'らし': 1479, 'られ': 6820, 'るる': 3818, 'れ,': 854,
'れ、': 854, 'れた': 1850, 'れて': 1375, 'れば': -3246, 'れる': 1091,
'われ': -605, 'んだ': 606, 'んで': 798, 'カ月': 990, '会議': 860,
'入り': 1232, '大会': 2217, '始め': 1681, '': 965, '新聞': -5055,
'日,': 974, '日、': 974, '社会': 2024, 'カ月': 990}
TC1__ = {'AAA': 1093, 'HHH': 1029, 'HHM': 580, 'HII': 998, 'HOH': -390,
'HOM': -331, 'IHI': 1169, 'IOH': -142, 'IOI': -1015, 'IOM': 467,
'MMH': 187, 'OOI': -1832}
TC2__ = {'HHO': 2088, 'HII': -1023, 'HMM': -1154, 'IHI': -1965,
'KKH': 703, 'OII': -2649}
TC3__ = {'AAA': -294, 'HHH': 346, 'HHI': -341, 'HII': -1088, 'HIK': 731,
'HOH': -1486, 'IHH': 128, 'IHI': -3041, 'IHO': -1935, 'IIH': -825,
'IIM': -1035, 'IOI': -542, 'KHH': -1216, 'KKA': 491, 'KKH': -1217,
'KOK': -1009, 'MHH': -2694, 'MHM': -457, 'MHO': 123, 'MMH': -471,
'NNH': -1689, 'NNO': 662, 'OHO': -3393}
TC4__ = {'HHH': -203, 'HHI': 1344, 'HHK': 365, 'HHM': -122, 'HHN': 182,
'HHO': 669, 'HIH': 804, 'HII': 679, 'HOH': 446, 'IHH': 695,
'IHO': -2324, 'IIH': 321, 'III': 1497, 'IIO': 656, 'IOO': 54,
'KAK': 4845, 'KKA': 3386, 'KKK': 3065, 'MHH': -405, 'MHI': 201,
'MMH': -241, 'MMM': 661, 'MOM': 841}
TQ1__ = {'BHHH': -227, 'BHHI': 316, 'BHIH': -132, 'BIHH': 60, 'BIII': 1595,
'BNHH': -744, 'BOHH': 225, 'BOOO': -908, 'OAKK': 482, 'OHHH': 281,
'OHIH': 249, 'OIHI': 200, 'OIIH': -68}
TQ2__ = {'BIHH': -1401, 'BIII': -1033, 'BKAK': -543, 'BOOO': -5591}
TQ3__ = {'BHHH': 478, 'BHHM': -1073, 'BHIH': 222, 'BHII': -504, 'BIIH': -116,
'BIII': -105, 'BMHI': -863, 'BMHM': -464, 'BOMH': 620, 'OHHH': 346,
'OHHI': 1729, 'OHII': 997, 'OHMH': 481, 'OIHH': 623, 'OIIH': 1344,
'OKAK': 2792, 'OKHH': 587, 'OKKA': 679, 'OOHH': 110, 'OOII': -685}
TQ4__ = {'BHHH': -721, 'BHHM': -3604, 'BHII': -966, 'BIIH': -607, 'BIII': -2181,
'OAAA': -2763, 'OAKK': 180, 'OHHH': -294, 'OHHI': 2446, 'OHHO': 480,
'OHIH': -1573, 'OIHH': 1935, 'OIHI': -493, 'OIIH': 626, 'OIII': -4007,
'OKAK': -8156}
TW1__ = {'につい': -4681, '東京都': 2026}
TW2__ = {'ある程': -2049, 'いった': -1256, 'ころが': -2434, 'しょう': 3873,
'その後': -4430, 'だって': -1049, 'ていた': 1833, 'として': -4657,
'ともに': -4517, 'もので': 1882, '一気に': -792, '初めて': -1512,
'同時に': -8097, '大きな': -1255, '対して': -2721, '社会党': -3216}
TW3__ = {'いただ': -1734, 'してい': 1314, 'として': -4314, 'につい': -5483,
'にとっ': -5989, 'に当た': -6247, 'ので,': -727, 'ので、': -727,
'のもの': -600, 'れから': -3752, '十二月': -2287}
TW4__ = {'いう.': 8576, 'いう。': 8576, 'からな': -2348, 'してい': 2958,
'たが,': 1516, 'たが、': 1516, 'ている': 1538, 'という': 1349,
'ました': 5543, 'ません': 1097, 'ようと': -4258, 'よると': 5865}
UC1__ = {'A': 484, 'K': 93, 'M': 645, 'O': -505}
UC2__ = {'A': 819, 'H': 1059, 'I': 409, 'M': 3987, 'N': 5775, 'O': 646}
UC3__ = {'A': -1370, 'I': 2311}
UC4__ = {'A': -2643, 'H': 1809, 'I': -1032, 'K': -3450, 'M': 3565,
'N': 3876, 'O': 6646}
UC5__ = {'H': 313, 'I': -1238, 'K': -799, 'M': 539, 'O': -831}
UC6__ = {'H': -506, 'I': -253, 'K': 87, 'M': 247, 'O': -387}
UP1__ = {'O': -214}
UP2__ = {'B': 69, 'O': 935}
UP3__ = {'B': 189}
UQ1__ = {'BH': 21, 'BI': -12, 'BK': -99, 'BN': 142, 'BO': -56, 'OH': -95,
'OI': 477, 'OK': 410, 'OO': -2422}
UQ2__ = {'BH': 216, 'BI': 113, 'OK': 1759}
UQ3__ = {'BA': -479, 'BH': 42, 'BI': 1913, 'BK': -7198, 'BM': 3160,
'BN': 6427, 'BO': 14761, 'OI': -827, 'ON': -3212}
UW1__ = {',': 156, '': 156, '': -463, '': -941, '': -127, '': -553,
'': 121, '': 505, '': -201, '': -547, '': -123, '': -789,
'': -185, '': -847, '': -466, '': -470, '': 182, '': -292,
'': 208, '': 169, '': -446, '': -137, '': -135, '': -402,
'': -268, '': -912, '': 871, '': -460, '': 561, '': 729,
'': -411, '': -141, '': 361, '': -408, '': -386, '': -718,
'': -463, '': -135}
UW2__ = {',': -829, '': -829, '': 892, '': -645, '': 3145, '': -538,
'': 505, '': 134, '': -502, '': 1454, '': -856, '': -412,
'': 1141, '': 878, '': 540, '': 1529, '': -675, '': 300,
'': -1011, '': 188, '': 1837, '': -949, '': -291, '': -268,
'': -981, '': 1273, '': 1063, '': -1764, '': 130, '': -409,
'': -1273, '': 1261, '': 600, '': -1263, '': -402, '': 1639,
'': -579, '': -694, '': 571, '': -2516, '': 2095, '': -587,
'': 306, '': 568, '': 831, '': -758, '': -2150, '': -302,
'': -968, '': -861, '': 492, '': -123, '': 978, '': 362,
'': 548, '': -3025, '': -1566, '': -3414, '': -422, '': -1769,
'': -865, '': -483, '': -1519, '': 760, '': 1023, '': -2009,
'': -813, '': -1060, '': 1067, '': -1519, '': -1033, '': 1522,
'': -1355, '': -1682, '': -1815, '': -1462, '': -630, '': -1843,
'': -1650, '': -931, '': -665, '': -2378, '': -180, '': -1740,
'': 752, '': 529, '': -1584, '': -242, '': -1165, '': -763,
'': 810, '': 509, '': -1353, '': 838, '西': -744, '': -3874,
'調': 1010, '': 1198, '': 3041, '': 1758, '': -1257, '': -645,
'': 3145, '': 831, '': -587, '': 306, '': 568}
UW3__ = {',': 4889, '1': -800, '': -1723, '': 4889, '': -2311, '': 5827,
'': 2670, '': -3573, '': -2696, '': 1006, '': 2342, '': 1983,
'': -4864, '': -1163, '': 3271, '': 1004, '': 388, '': 401,
'': -3552, '': -3116, '': -1058, '': -395, '': 584, '': 3685,
'': -5228, '': 842, '': -521, '': -1444, '': -1081, '': 6167,
'': 2318, '': 1691, '': -899, '': -2788, '': 2745, '': 4056,
'': 4555, '': -2171, '': -1798, '': 1199, '': -5516, '': -4384,
'': -120, '': 1205, '': 2323, '': -788, '': -202, '': 727,
'': 649, '': 5905, '': 2773, '': -1207, '': 6620, '': -518,
'': 551, '': 1319, '': 874, '': -1350, '': 521, '': 1109,
'': 1591, '': 2201, '': 278, '': -3794, '': -1619, '': -1759,
'': -2087, '': 3815, '': 653, '': -758, '': -1193, '': 974,
'': 2742, '': 792, '': 1889, '': -1368, '': 811, '': 4265,
'': -361, '': -2439, '': 4858, '': 3593, '': 1574, '': -3030,
'': 755, '': -1880, '': 5807, '': 3095, '': 457, '': 2475,
'': 1129, '': 2286, '': 4437, '': 365, '': -949, '': -1872,
'': 1327, '': -1038, '': 4646, '': -2309, '': -783, '': -1006,
'': 483, '': 1233, '': 3588, '': -241, '': 3906, '': -837,
'': 4513, '': 642, '': 1389, '': 1219, '': -241, '': 2016,
'': -1356, '': -423, '': -1008, '': 1078, '': -513, '': -3102,
'': 1155, '': 3197, '': -1804, '': 2416, '': -1030, '': 1605,
'': 1452, '': -2352, '': -3885, '': 1905, '': -1291, '': 1822,
'': -488, '': -3973, '': -2013, '': -1479, '': 3222, '': -1489,
'': 1764, '': 2099, '': 5792, '': -661, '': -1248, '': -951,
'': -937, '': 4125, '': 360, '': 3094, '': 364, '': -805,
'': 5156, '': 2438, '': 484, '': 2613, '': -1694, '': -1073,
'': 1868, '': -495, '': 979, '': 461, '': -3850, '': -273,
'': 914, '': 1215, '': 7313, '': -1835, '': 792, '': 6293,
'': -1528, '': 4231, '': 401, '': -960, '': 1201, '': 7767,
'': 3066, '': 3663, '': 1384, '': -4229, '': 1163, '': 1255,
'': 6457, '': 725, '': -2869, '': 785, '': 1044, '調': -562,
'': -733, '': 1777, '': 1835, '': 1375, '': -1504, '': -1136,
'': -681, '': 1026, '': 4404, '': 1200, '': 2163, '': 421,
'': -1432, '': 1302, '': -1282, '': 2009, '': -1045, '': 2066,
'': 1620, '': -800, '': 2670, '': -3794, '': -1350, '': 551,
'グ': 1319, '': 874, '': 521, '': 1109, '': 1591, '': 2201, '': 278}
UW4__ = {',': 3930, '.': 3508, '': -4841, '': 3930, '': 3508, '': 4999,
'': 1895, '': 3798, '': -5156, '': 4752, '': -3435, '': -640,
'': -2514, '': 2405, '': 530, '': 6006, '': -4482, '': -3821,
'': -3788, '': -4376, '': -4734, '': 2255, '': 1979, '': 2864,
'': -843, '': -2506, '': -731, '': 1251, '': 181, '': 4091,
'': 5034, '': 5408, '': -3654, '': -5882, '': -1659, '': 3994,
'': 7410, '': 4547, '': 5433, '': 6499, '': 1853, '': 1413,
'': 7396, '': 8578, '': 1940, '': 4249, '': -4134, '': 1345,
'': 6665, '': -744, '': 1464, '': 1051, '': -2082, '': -882,
'': -5046, '': 4169, '': -2666, '': 2795, '': -1544, '': 3351,
'': -2922, '': -9726, '': -14896, '': -2613, '': -4570,
'': -1783, '': 13150, '': -2352, '': 2145, '': 1789, '': 1287,
'': -724, '': -403, '': -1635, '': -881, '': -541, '': -856,
'': -3637, '': -4371, '': -11870, '': -2069, '': 2210, '': 782,
'': -190, '': -1768, '': 1036, '': 544, '': 950, '': -1286,
'': 530, '': 4292, '': 601, '': -2006, '': -1212, '': 584,
'': 788, '': 1347, '': 1623, '': 3879, '': -302, '': -740,
'': -2715, '': 776, '': 4517, '': 1013, '': 1555, '': -1834,
'': -681, '': -910, '': -851, '': 1500, '': -619, '': -1200,
'': 866, '': -1410, '': -2094, '': -1413, '': 1067, '': 571,
'': -4802, '': -1397, '': -1057, '': -809, '': 1910, '': -1328,
'': -1500, '': -2056, '': -2667, '': 2771, '': 374, '': -4556,
'': 456, '': 553, '': 916, '': -1566, '': 856, '': 787,
'': 2182, '': 704, '': 522, '': -856, '': 1798, '': 1829,
'': 845, '': -9066, '': -485, '': -442, '': -360, '': -1043,
'': 5388, '': -2716, '': -910, '': -939, '': -543, '': -735,
'': 672, '': -1267, '': -1286, '': -1101, '': -2900, '': 1826,
'': 2586, '': 922, '': -3485, '': 2997, '': -867, '': -2112,
'': 788, '': 2937, '': 786, '': 2171, '': 1146, '': -1169,
'': 940, '': -994, '': 749, '': 2145, '': -730, '': -852,
'': -792, '': 792, '': -1184, '': -244, '': -1000, '': 730,
'': -1481, '': 1158, '': -1433, '': -3370, '': 929, '': -1291,
'': 2596, '': -4866, '': 1192, '': -1100, '': -2213, '': 357,
'': -2344, '': -2297, '': -2604, '': -878, '': -1659, '': -792,
'': -1984, '': 1749, '': 2120, '': 1895, '': 3798, '': -4371,
'': -724, '': -11870, '': 2145, '': 1789, '': 1287, '': -403,
'': -1635, '': -881, '': -541, '': -856, '': -3637}
UW5__ = {',': 465, '.': -299, '1': -514, 'E2': -32768, ']': -2762, '': 465,
'': -299, '': 363, '': 1655, '': 331, '': -503, '': 1199,
'': 527, '': 647, '': -421, '': 1624, '': 1971, '': 312,
'': -983, '': -1537, '': -1371, '': -852, '': -1186, '': 1093,
'': 52, '': 921, '': -18, '': -850, '': -127, '': 1682,
'': -787, '': -1224, '': -635, '': -578, '': 1001, '': 502,
'': 865, '': 3350, '': 854, '': -208, '': 429, '': 504,
'': 419, '': -1264, '': 327, '': 241, '': 451, '': -343,
'': -871, '': 722, '': -1153, '': -654, '': 3519, '': -901,
'': 848, '': 2104, '': -1296, '': -548, '': 1785, '': -1304,
'': -2991, '': 921, '': 1763, '': 872, '': -814, '': 1618,
'': -1682, '': 218, '': -4353, '': 932, '': 1356, '': -1508,
'': -1347, '': 240, '': -3912, '': -3149, '': 1319, '': -1052,
'': -4003, '': -997, '': -278, '': -813, '': 1955, '': -2233,
'': 663, '': -1073, '': 1219, '': -1018, '': -368, '': 786,
'': 1191, '': 2368, '': -689, '': -514, '': -32768, '': 363,
'': 241, '': 451, '': -343}
UW6__ = {',': 227, '.': 808, '1': -270, 'E1': 306, '': 227, '': 808,
'': -307, '': 189, '': 241, '': -73, '': -121, '': -200,
'': 1782, '': 383, '': -428, '': 573, '': -1014, '': 101,
'': -105, '': -253, '': -149, '': -417, '': -236, '': -206,
'': 187, '': -135, '': 195, '': -673, '': -496, '': -277,
'': 201, '': -800, '': 624, '': 302, '': 1792, '': -1212,
'': 798, '': -960, '': 887, '': -695, '': 535, '': -697,
'': 753, '': -507, '': 974, '': -822, '': 1811, '': 463,
'': 1082, '': -270, '': 306, '': -673, '': -496}
# ctype_
def ctype_(self, char):
    # type: (str) -> str
    """Return the class letter of *char*.

    The entries of ``self.patterns_`` are tried in iteration order and the
    value of the first pattern that matches wins; ``'O'`` is returned when
    none of them matches.
    """
    hits = (letter for regex, letter in self.patterns_.items()
            if regex.match(char))
    return next(hits, 'O')
# ts_
def ts_(self, dict, key):
    # type: (Dict[str, int], str) -> int
    """Return the weight stored under *key* in *dict*, or 0 when it is absent."""
    return dict.get(key, 0)
# segment
def split(self, input):
    # type: (str) -> List[str]
    """Split *input* into words.

    Every position after the first character is a candidate boundary.  Its
    score starts at ``BIAS__`` and receives one contribution per weight
    table, looked up through ``ts_()`` with a key built from the six
    surrounding characters (w1..w6), their classes from ``ctype_()``
    (c1..c6) and the three previous boundary decisions (p1..p3).  A positive
    score closes the current word.

    :param str input: text to split
    :return: the words, each passed through ``str.strip()`` (so an entry
             may be empty); ``[]`` for empty input
    :rtype: list[str]
    """
    if not input:
        return []

    # Three sentinels on each side so the six-wide window never runs off
    # either end; the sentinels all have class 'O'.
    seg = ['B3', 'B2', 'B1']
    seg.extend(input)
    seg.extend(('E1', 'E2', 'E3'))
    ctype = ['O', 'O', 'O']
    ctype.extend(self.ctype_(char) for char in input)
    ctype.extend(('O', 'O', 'O'))

    ts = self.ts_
    result = []
    word = seg[3]
    # 'U': no decision has been made yet for that earlier position.
    p1 = p2 = p3 = 'U'

    for i in range(4, len(seg) - 3):
        # The candidate boundary lies between w3 and w4 (== seg[i]).
        w1, w2, w3, w4, w5, w6 = seg[i - 3:i + 3]
        c1, c2, c3, c4, c5, c6 = ctype[i - 3:i + 3]

        features = (
            (self.UP1__, p1),
            (self.UP2__, p2),
            (self.UP3__, p3),
            (self.BP1__, p1 + p2),
            (self.BP2__, p2 + p3),
            (self.UW1__, w1),
            (self.UW2__, w2),
            (self.UW3__, w3),
            (self.UW4__, w4),
            (self.UW5__, w5),
            (self.UW6__, w6),
            (self.BW1__, w2 + w3),
            (self.BW2__, w3 + w4),
            (self.BW3__, w4 + w5),
            (self.TW1__, w1 + w2 + w3),
            (self.TW2__, w2 + w3 + w4),
            (self.TW3__, w3 + w4 + w5),
            (self.TW4__, w4 + w5 + w6),
            (self.UC1__, c1),
            (self.UC2__, c2),
            (self.UC3__, c3),
            (self.UC4__, c4),
            (self.UC5__, c5),
            (self.UC6__, c6),
            (self.BC1__, c2 + c3),
            (self.BC2__, c3 + c4),
            (self.BC3__, c4 + c5),
            (self.TC1__, c1 + c2 + c3),
            (self.TC2__, c2 + c3 + c4),
            (self.TC3__, c3 + c4 + c5),
            (self.TC4__, c4 + c5 + c6),
            (self.UQ1__, p1 + c1),
            (self.UQ2__, p2 + c2),
            # Was UQ1__ here, which left UQ3__ unused; each position has
            # its own table (p1 -> UQ1__, p2 -> UQ2__, p3 -> UQ3__).
            (self.UQ3__, p3 + c3),
            (self.BQ1__, p2 + c2 + c3),
            (self.BQ2__, p2 + c3 + c4),
            (self.BQ3__, p3 + c2 + c3),
            (self.BQ4__, p3 + c3 + c4),
            (self.TQ1__, p2 + c1 + c2 + c3),
            (self.TQ2__, p2 + c2 + c3 + c4),
            (self.TQ3__, p3 + c1 + c2 + c3),
            (self.TQ4__, p3 + c2 + c3 + c4),
        )
        score = self.BIAS__ + sum(ts(table, key) for table, key in features)

        if score > 0:
            # Boundary: flush the word collected so far.
            result.append(word.strip())
            word = ''
            p = 'B'
        else:
            p = 'O'
        p1, p2, p3 = p2, p3, p
        word += w4

    result.append(word.strip())
    return result
TinySegmenter = DefaultSplitter # keep backward compatibility until Sphinx-1.6
class SearchJapanese(SearchLanguage):
    """
    Search language support for Japanese.  Words are not stemmed; the real
    work is splitting text into words, which is delegated to a splitter
    object selected through the ``type`` search option.
    """
    lang = 'ja'
    language_name = 'Japanese'
    # Deprecated short names and the dotted paths they stand for.
    splitters = {
        'default': 'sphinx.search.ja.DefaultSplitter',
        'mecab': 'sphinx.search.ja.MecabSplitter',
        'janome': 'sphinx.search.ja.JanomeSplitter',
    }

    def init(self, options):
        # type: (Dict) -> None
        """Instantiate the splitter named by ``options['type']``.

        The value is either a dotted path to a splitter class or one of the
        deprecated short names in ``splitters`` (which triggers a
        ``RemovedInSphinx30Warning``).  The class is called with *options*.

        :raises ExtensionError: when the dotted path cannot be imported
        """
        requested = options.get('type', 'sphinx.search.ja.DefaultSplitter')
        if requested not in self.splitters:
            dotted_path = requested
        else:
            dotted_path = self.splitters[requested]
            warnings.warn('html_search_options["type"]: %s is deprecated. '
                          'Please give "%s" instead.' % (requested, dotted_path),
                          RemovedInSphinx30Warning, stacklevel=2)

        try:
            splitter_class = import_object(dotted_path)
            self.splitter = splitter_class(options)
        except ExtensionError:
            raise ExtensionError("Splitter module %r can't be imported" %
                                 dotted_path)

    def split(self, input):
        # type: (str) -> List[str]
        """Delegate to the configured splitter."""
        return self.splitter.split(input)

    def word_filter(self, stemmed_word):
        # type: (str) -> bool
        """Accept only words longer than one character."""
        return len(stemmed_word) > 1

    def stem(self, word):
        # type: (str) -> str
        """Return *word* unchanged; no stemming is applied."""
        return word