Closes #1853: support custom text splitter on html search with language='ja'.

shimizukawa 2016-02-14 19:13:38 +09:00
parent e1394b5427
commit 0992ce542b
3 changed files with 92 additions and 21 deletions


@@ -72,6 +72,7 @@ Features added
* #1286, #2099: Add ``sphinx.ext.autosectionlabel`` extension to allow reference
sections using its title. Thanks to Tadhg O'Higgins.
* #1854: Allow to choose Janome for Japanese splitter.
* #1853: Support custom text splitter on html search with ``language='ja'``.
Bugs fixed
----------


@@ -915,19 +915,61 @@ that use Sphinx's HTMLWriter class.
The Japanese support has these options:
* ``type`` -- ``'mecab'`` or ``'janome'`` or ``'default'`` (selects either MeCab or Janome or
TinySegmenter word splitter algorithm)
* ``dic_enc`` -- the encoding for the MeCab algorithm
* ``dict`` -- the dictionary to use for the MeCab algorithm
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
Python binding is not installed
* ``user_dic`` -- the user dictionary file path for Janome
* ``user_dic_enc`` -- the encoding for the user dictionary file specified by ``user_dic`` option (default is 'utf8')
:type:
_`type` is a dotted module path string that selects the splitter implementation,
which should be derived from :class:`sphinx.search.ja.BaseSplitter`.
If it is not specified or is ``None``, ``'sphinx.search.ja.DefaultSplitter'`` will
be used.
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
You can choose from these modules:
:'sphinx.search.ja.DefaultSplitter':
TinySegmenter algorithm. This is the default splitter.
:'sphinx.search.ja.MecabSplitter':
MeCab binding. To use this splitter, the 'mecab' Python binding or the dynamic
link library ('libmecab.so' on Linux, 'libmecab.dll' on Windows) is required.
:'sphinx.search.ja.JanomeSplitter':
Janome binding. To use this splitter,
`Janome <https://pypi.python.org/pypi/Janome>`_ is required.
For backward compatibility, ``'mecab'``, ``'janome'`` and ``'default'`` are also
accepted; however, these short names will be deprecated in Sphinx-1.6.
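For example, a project-local splitter can be selected by pointing ``type`` at its
dotted path (a sketch; the package and class names here are placeholders)::

    html_search_options = {
        'type': 'mypackage.splitter.MySplitter',
    }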
The other options depend on the splitter you choose.
Options for ``'mecab'``:
:dic_enc:
_`dic_enc option` is the encoding for the MeCab algorithm.
:dict:
_`dict option` is the dictionary to use for the MeCab algorithm.
:lib:
_`lib option` is the library name for finding the MeCab library via ctypes if
the Python binding is not installed.
For example::

    html_search_options = {
        'type': 'mecab',
        'dic_enc': 'utf-8',
        'dict': '/path/to/mecab.dic',
        'lib': '/path/to/libmecab.so',
    }
Options for ``'janome'``:
:user_dic: _`user_dic option` is the user dictionary file path for Janome.
:user_dic_enc:
_`user_dic_enc option` is the encoding for the user dictionary file specified by
the ``user_dic`` option. The default is ``'utf8'``.
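For example (a sketch; the dictionary path is a placeholder)::

    html_search_options = {
        'type': 'janome',
        'user_dic': '/path/to/user_dic.csv',
        'user_dic_enc': 'utf8',
    }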
.. versionadded:: 1.1

.. versionchanged:: 1.4
   ``html_search_options`` for Japanese is re-organized, and any custom splitter
   can be used via the `type`_ setting.
The Chinese support has these options:


@@ -35,12 +35,29 @@ try:
except ImportError:
    janome_module = False

from sphinx.errors import SphinxError
from sphinx.errors import SphinxError, ExtensionError
from sphinx.search import SearchLanguage
from sphinx.util import import_object

class MecabBinder(object):
class BaseSplitter(object):
    def __init__(self, options):
        self.options = options

    def split(self, input):
"""
:param str input:
:return:
:rtype: list[str]
"""
        raise NotImplementedError

class MecabSplitter(BaseSplitter):
    def __init__(self, options):
        super(MecabSplitter, self).__init__(options)
        self.ctypes_libmecab = None
        self.ctypes_mecab = None
        if not native_module:
@@ -108,9 +125,12 @@ class MecabBinder(object):
        if self.ctypes_libmecab:
            self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
MecabBinder = MecabSplitter  # keep backward compatibility until Sphinx-1.6
class JanomeBinder(object):
class JanomeSplitter(BaseSplitter):
    def __init__(self, options):
        super(JanomeSplitter, self).__init__(options)
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()
@@ -125,7 +145,7 @@ class JanomeBinder(object):
        return result.split(u' ')

class TinySegmenter(object):
class DefaultSplitter(BaseSplitter):
    patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
        u'[一二三四五六七八九十百千万億兆]': u'M',
        u'[一-龠々〆ヵヶ]': u'H',
@@ -501,6 +521,9 @@ class TinySegmenter(object):
        return result

TinySegmenter = DefaultSplitter  # keep backward compatibility until Sphinx-1.6

class SearchJapanese(SearchLanguage):
    """
    Japanese search implementation: uses no stemmer, but word splitting is quite
@@ -508,18 +531,23 @@ class SearchJapanese(SearchLanguage):
    """
    lang = 'ja'
    language_name = 'Japanese'
    splitters = {
        'default': 'sphinx.search.ja.DefaultSplitter',
        'mecab': 'sphinx.search.ja.MecabSplitter',
        'janome': 'sphinx.search.ja.JanomeSplitter',
    }
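    # Note: the short names above map to the dotted paths shown; any other
    # ``type`` value is treated as a dotted path itself and imported directly
    # (see ``init`` below).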
    def init(self, options):
        type = options.get('type', 'default')
        if type not in ('mecab', 'janome', 'default'):
            raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                              " or 'default'"))
        if type == 'mecab':
            self.splitter = MecabBinder(options)
        if type == 'janome':
            self.splitter = JanomeBinder(options)
        if type in self.splitters:
            dotted_path = self.splitters[type]
        else:
            self.splitter = TinySegmenter()
            dotted_path = type
        try:
            self.splitter = import_object(dotted_path)(options)
        except ExtensionError:
            raise ExtensionError("Splitter module %r can't be imported" %
                                 dotted_path)

    def split(self, input):
        return self.splitter.split(input)
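With this change, a third-party splitter only needs to derive from
``BaseSplitter``, implement ``split()`` returning a list of words, and be
referenced by its dotted path in ``html_search_options``. A minimal sketch
(the module, class, and ``regex`` option names below are invented for
illustration, not part of this commit):

    # mypackage/splitter.py -- hypothetical example, not shipped with Sphinx
    import re

    from sphinx.search.ja import BaseSplitter

    class RegexSplitter(BaseSplitter):
        """Toy splitter that breaks text on runs of whitespace."""

        def __init__(self, options):
            super(RegexSplitter, self).__init__(options)
            # 'regex' is an invented key read from html_search_options.
            self.pattern = re.compile(options.get('regex', r'\s+'))

        def split(self, input):
            # Must return a list of words, like the built-in splitters.
            return [word for word in self.pattern.split(input) if word]

    # conf.py (sketch):
    # html_search_options = {'type': 'mypackage.splitter.RegexSplitter'}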