mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Merge pull request #1854 from mocobeta/search_ja_janome
Allow choosing Janome as the Japanese word splitter.
This commit is contained in:
commit
81a0f13199
@ -910,12 +910,16 @@ that use Sphinx's HTMLWriter class.
|
||||
|
||||
The Japanese support has these options:
|
||||
|
||||
* ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
|
||||
* ``type`` -- ``'mecab'`` or ``'janome'`` or ``'default'`` (selects either MeCab or Janome or
|
||||
TinySegmenter word splitter algorithm)
|
||||
* ``dic_enc`` -- the encoding for the MeCab algorithm
|
||||
* ``dict`` -- the dictionary to use for the MeCab algorithm
|
||||
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
|
||||
Python binding is not installed
|
||||
* ``user_dic`` -- the user dictionary file path for Janome
|
||||
* ``user_dic_enc`` -- the encoding for the user dictionary file specified by ``user_dic`` option (default is 'utf8')
|
||||
|
||||
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
|
@ -29,6 +29,12 @@ try:
|
||||
except ImportError:
|
||||
native_module = False
|
||||
|
||||
# Feature-detect the optional pure-Python Janome tokenizer; consumers
# (JanomeBinder) check this flag before constructing a tokenizer.
try:
    import janome.tokenizer
except ImportError:
    janome_module = False
else:
    janome_module = True
|
||||
|
||||
from sphinx.errors import SphinxError
|
||||
from sphinx.search import SearchLanguage
|
||||
|
||||
@ -103,6 +109,22 @@ class MecabBinder(object):
|
||||
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
|
||||
|
||||
|
||||
class JanomeBinder(object):
    """Japanese word splitter backed by the pure-Python Janome tokenizer.

    Recognized options (from ``html_search_options``):

    * ``user_dic``     -- path of a user dictionary file (optional)
    * ``user_dic_enc`` -- encoding of that file (default: ``'utf8'``)
    """

    def __init__(self, options):
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()

    def init_tokenizer(self):
        """Create the Janome tokenizer, failing fast when Janome is absent."""
        if not janome_module:
            raise RuntimeError('Janome is not available')
        # NOTE(review): user_dict may be None when 'user_dic' is not given;
        # presumably Janome treats a falsy udic as "no user dictionary" --
        # confirm against the installed Janome version.
        self.tokenizer = janome.tokenizer.Tokenizer(
            udic=self.user_dict, udic_enc=self.user_dict_enc)

    def split(self, input):
        """Return the surface form of each token of *input* as a list.

        Collects the surfaces directly instead of the original
        join-with-space/re-split round trip, which returned ``['']`` for
        empty input and mangled any token whose surface contains a space.
        """
        return [token.surface for token in self.tokenizer.tokenize(input)]
|
||||
|
||||
|
||||
class TinySegmenter(object):
|
||||
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
|
||||
u'[一二三四五六七八九十百千万億兆]': u'M',
|
||||
@ -489,11 +511,13 @@ class SearchJapanese(SearchLanguage):
|
||||
|
||||
def init(self, options):
    """Pick the word splitter selected by the ``type`` search option.

    * ``'mecab'``   -> MecabBinder (MeCab library)
    * ``'janome'``  -> JanomeBinder (pure-Python Janome)
    * ``'default'`` -> TinySegmenter (no external dependency)

    Raises ValueError for any other ``type`` value.
    """
    type = options.get('type', 'default')
    if type not in ('mecab', 'janome', 'default'):
        raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                          " or 'default'"))
    if type == 'mecab':
        self.splitter = MecabBinder(options)
    elif type == 'janome':
        # ``elif`` is essential here: with a second plain ``if``, the trailing
        # ``else`` below would overwrite a just-assigned MecabBinder with
        # TinySegmenter whenever type == 'mecab'.
        self.splitter = JanomeBinder(options)
    else:
        self.splitter = TinySegmenter()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user