Merge pull request #1854 from mocobeta/search_ja_janome

Allow to choose Janome for Japanese splitter.
This commit is contained in:
shimizukawa 2016-02-14 16:46:21 +09:00
commit 81a0f13199
2 changed files with 31 additions and 3 deletions

View File

@ -910,12 +910,16 @@ that use Sphinx's HTMLWriter class.
The Japanese support has these options:
* ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
* ``type`` -- ``'mecab'``, ``'janome'`` or ``'default'`` (selects the MeCab, Janome or
TinySegmenter word splitter algorithm)
* ``dic_enc`` -- the encoding for the MeCab algorithm
* ``dict`` -- the dictionary to use for the MeCab algorithm
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
Python binding is not installed
* ``user_dic`` -- the user dictionary file path for Janome
* ``user_dic_enc`` -- the encoding of the user dictionary file specified by the ``user_dic`` option (default is ``'utf8'``)
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
.. versionadded:: 1.1

View File

@ -29,6 +29,12 @@ try:
except ImportError:
native_module = False
try:
import janome.tokenizer
janome_module = True
except ImportError:
janome_module = False
from sphinx.errors import SphinxError
from sphinx.search import SearchLanguage
@ -103,6 +109,22 @@ class MecabBinder(object):
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
class JanomeBinder(object):
    """Word splitter that delegates tokenization to Janome.

    Reads two entries from the search options mapping:

    * ``user_dic`` -- passed to the Janome tokenizer as ``udic``
      (``None`` when the option is absent).
    * ``user_dic_enc`` -- passed as ``udic_enc``; falls back to ``'utf8'``.
    """

    def __init__(self, options):
        """Store the user dictionary settings and build the tokenizer.

        :param options: mapping of search options; only ``user_dic`` and
            ``user_dic_enc`` are read here.
        :raises RuntimeError: if the ``janome`` package could not be imported.
        """
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()

    def init_tokenizer(self):
        """Create ``self.tokenizer`` from the stored dictionary settings.

        :raises RuntimeError: if the module-level ``janome_module`` flag is
            false, i.e. importing ``janome.tokenizer`` failed.
        """
        if not janome_module:
            raise RuntimeError('Janome is not available')
        tokenizer_kwargs = dict(udic=self.user_dict,
                                udic_enc=self.user_dict_enc)
        self.tokenizer = janome.tokenizer.Tokenizer(**tokenizer_kwargs)

    def split(self, input):
        """Split *input* into a list of token surface strings.

        The surfaces are first joined with single spaces and the joined text
        is then split on a single space again, so a surface that itself
        contains a space contributes extra (possibly empty) items, and an
        input yielding no tokens gives ``[u'']``.
        """
        surfaces = [tok.surface for tok in self.tokenizer.tokenize(input)]
        joined = u' '.join(surfaces)
        return joined.split(u' ')
class TinySegmenter(object):
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
u'[一二三四五六七八九十百千万億兆]': u'M',
@ -489,11 +511,13 @@ class SearchJapanese(SearchLanguage):
def init(self, options):
    """Pick the word splitter named by the ``type`` search option.

    :param options: mapping of search options.  ``type`` may be
        ``'mecab'``, ``'janome'`` or ``'default'`` (used when the key is
        missing); the whole mapping is handed on to the MeCab / Janome
        binders, which read their own settings from it.
    :raises ValueError: if ``type`` is not one of the supported names.
    """
    splitter_type = options.get('type', 'default')
    if splitter_type not in ('mecab', 'janome', 'default'):
        raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                          " or 'default'"))
    # The branches must form a single if/elif/else chain: with two separate
    # ``if`` statements the final ``else`` pairs only with the 'janome' test,
    # so a 'mecab' splitter would be replaced by TinySegmenter right away.
    if splitter_type == 'mecab':
        self.splitter = MecabBinder(options)
    elif splitter_type == 'janome':
        self.splitter = JanomeBinder(options)
    else:
        self.splitter = TinySegmenter()