mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Merge pull request #1854 from mocobeta/search_ja_janome
Allow to choose Janome for Japanese splitter.
This commit is contained in:
commit
81a0f13199
@ -910,12 +910,16 @@ that use Sphinx's HTMLWriter class.
|
|||||||
|
|
||||||
The Japanese support has these options:
|
The Japanese support has these options:
|
||||||
|
|
||||||
* ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
|
* ``type`` -- ``'mecab'``, ``'janome'``, or ``'default'`` (selects the MeCab, Janome, or
|
||||||
TinySegmenter word splitter algorithm)
|
TinySegmenter word splitter algorithm)
|
||||||
* ``dic_enc`` -- the encoding for the MeCab algorithm
|
* ``dic_enc`` -- the encoding for the MeCab algorithm
|
||||||
* ``dict`` -- the dictionary to use for the MeCab algorithm
|
* ``dict`` -- the dictionary to use for the MeCab algorithm
|
||||||
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
|
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
|
||||||
Python binding is not installed
|
Python binding is not installed
|
||||||
|
* ``user_dic`` -- the user dictionary file path for Janome
|
||||||
|
* ``user_dic_enc`` -- the encoding for the user dictionary file specified by the ``user_dic`` option (default is ``'utf8'``)
|
||||||
|
|
||||||
|
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
|
||||||
|
|
||||||
.. versionadded:: 1.1
|
.. versionadded:: 1.1
|
||||||
|
|
||||||
|
@ -29,6 +29,12 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
native_module = False
|
native_module = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
import janome.tokenizer
|
||||||
|
janome_module = True
|
||||||
|
except ImportError:
|
||||||
|
janome_module = False
|
||||||
|
|
||||||
from sphinx.errors import SphinxError
|
from sphinx.errors import SphinxError
|
||||||
from sphinx.search import SearchLanguage
|
from sphinx.search import SearchLanguage
|
||||||
|
|
||||||
@ -103,6 +109,22 @@ class MecabBinder(object):
|
|||||||
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
|
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
|
||||||
|
|
||||||
|
|
||||||
|
class JanomeBinder(object):
    """Word splitter that delegates tokenization to a Janome tokenizer.

    The ``options`` mapping is read for ``user_dic`` (passed to the
    tokenizer as ``udic``) and ``user_dic_enc`` (passed as ``udic_enc``,
    ``'utf8'`` when absent).
    """

    def __init__(self, options):
        """Remember the user dictionary settings and build the tokenizer."""
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()

    def init_tokenizer(self):
        """Create ``self.tokenizer``.

        Raises RuntimeError when the module-level ``janome_module`` flag
        is false, i.e. the guarded ``import janome.tokenizer`` failed.
        """
        if janome_module:
            self.tokenizer = janome.tokenizer.Tokenizer(
                udic=self.user_dict,
                udic_enc=self.user_dict_enc,
            )
            return
        raise RuntimeError('Janome is not available')

    def split(self, input):
        """Return the token surfaces of ``input`` as a list of strings.

        The surfaces are joined with single spaces and the joined text is
        split on single spaces again, so an empty token stream yields
        ``[u'']`` rather than an empty list.
        """
        surfaces = [token.surface for token in self.tokenizer.tokenize(input)]
        joined = u' '.join(surfaces)
        return joined.split(u' ')
|
||||||
|
|
||||||
|
|
||||||
class TinySegmenter(object):
|
class TinySegmenter(object):
|
||||||
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
|
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
|
||||||
u'[一二三四五六七八九十百千万億兆]': u'M',
|
u'[一二三四五六七八九十百千万億兆]': u'M',
|
||||||
@ -489,11 +511,13 @@ class SearchJapanese(SearchLanguage):
|
|||||||
|
|
||||||
def init(self, options):
    """Choose the word splitter named by the ``type`` search option.

    ``options`` is a mapping; its ``type`` entry defaults to
    ``'default'``.  ``'mecab'`` selects ``MecabBinder``, ``'janome'``
    selects ``JanomeBinder`` and ``'default'`` selects ``TinySegmenter``.
    The chosen splitter is stored on ``self.splitter``.

    Raises ValueError when ``type`` is none of the three accepted values.
    """
    splitter_type = options.get('type', 'default')
    if splitter_type not in ('mecab', 'janome', 'default'):
        raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                          " or 'default'"))
    # The branches must form one if/elif/else chain.  With two separate
    # ``if`` statements the trailing ``else`` also runs for 'mecab' and
    # replaces the MecabBinder that was just assigned with TinySegmenter.
    if splitter_type == 'mecab':
        self.splitter = MecabBinder(options)
    elif splitter_type == 'janome':
        self.splitter = JanomeBinder(options)
    else:
        self.splitter = TinySegmenter()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user