mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Closes #1853: support custom text splitter on html search with language='ja'
.
This commit is contained in:
parent
e1394b5427
commit
0992ce542b
1
CHANGES
1
CHANGES
@ -72,6 +72,7 @@ Features added
|
||||
* #1286, #2099: Add ``sphinx.ext.autosectionlabel`` extension to allow reference
|
||||
sections using its title. Thanks to Tadhg O'Higgins.
|
||||
* #1854: Allow to choose Janome for Japanese splitter.
|
||||
* #1853: Support a custom text splitter for HTML search with ``language='ja'``.
|
||||
|
||||
Bugs fixed
|
||||
----------
|
||||
|
@ -915,19 +915,61 @@ that use Sphinx's HTMLWriter class.
|
||||
|
||||
The Japanese support has these options:
|
||||
|
||||
* ``type`` -- ``'mecab'`` or ``'janome'`` or ``'default'`` (selects either MeCab or Janome or
|
||||
TinySegmenter word splitter algorithm)
|
||||
* ``dic_enc`` -- the encoding for the MeCab algorithm
|
||||
* ``dict`` -- the dictionary to use for the MeCab algorithm
|
||||
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
|
||||
Python binding is not installed
|
||||
* ``user_dic`` -- the user dictionary file path for Janome
|
||||
* ``user_dic_enc`` -- the encoding for the user dictionary file specified by ``user_dic`` option (default is 'utf8')
|
||||
:type:
|
||||
_`type` is a dotted module path string specifying the Splitter implementation, which
|
||||
should be derived from :class:`sphinx.search.ja.BaseSplitter`.
|
||||
If not specified or None is specified, ``'sphinx.search.ja.DefaultSplitter'`` will
|
||||
be used.
|
||||
|
||||
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
|
||||
You can choose from these modules:
|
||||
|
||||
:'sphinx.search.ja.DefaultSplitter':
|
||||
TinySegmenter algorithm. This is default splitter.
|
||||
:'sphinx.search.ja.MecabSplitter':
|
||||
MeCab binding. To use this splitter, 'mecab' python binding or dynamic link
|
||||
library ('libmecab.so' for linux, 'libmecab.dll' for windows) is required.
|
||||
:'sphinx.search.ja.JanomeSplitter':
|
||||
Janome binding. To use this splitter,
|
||||
`Janome <https://pypi.python.org/pypi/Janome>`_ is required.
|
||||
|
||||
To keep compatibility, ``'mecab'``, ``'janome'`` and ``'default'`` are also
|
||||
acceptable. However, these short names will be deprecated in Sphinx-1.6.
|
||||
|
||||
|
||||
Other option values depend on the splitter you choose.
|
||||
|
||||
Options for ``'mecab'``:
|
||||
:dic_enc:
|
||||
_`dic_enc option` is the encoding for the MeCab algorithm.
|
||||
:dict:
|
||||
_`dict option` is the dictionary to use for the MeCab algorithm.
|
||||
:lib:
|
||||
_`lib option` is the library name for finding the MeCab library via ctypes if
|
||||
the Python binding is not installed.
|
||||
|
||||
For example::
|
||||
|
||||
html_search_options = {
|
||||
'type': 'mecab',
|
||||
'dic_enc': 'utf-8',
|
||||
'dict': '/path/to/mecab.dic',
|
||||
'lib': '/path/to/libmecab.so',
|
||||
}
|
||||
|
||||
Options for ``'janome'``:
|
||||
:user_dic: _`user_dic option` is the user dictionary file path for Janome.
|
||||
:user_dic_enc:
|
||||
_`user_dic_enc option` is the encoding for the user dictionary file specified by
|
||||
``user_dic`` option. Default is 'utf8'.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
.. versionchanged:: 1.4
|
||||
html_search_options for Japanese is re-organized and any custom splitter can be
|
||||
used by `type`_ settings.
|
||||
|
||||
|
||||
The Chinese support has these options:
|
||||
|
||||
|
@ -35,12 +35,29 @@ try:
|
||||
except ImportError:
|
||||
janome_module = False
|
||||
|
||||
from sphinx.errors import SphinxError
|
||||
from sphinx.errors import SphinxError, ExtensionError
|
||||
from sphinx.search import SearchLanguage
|
||||
from sphinx.util import import_object
|
||||
|
||||
|
||||
class MecabBinder(object):
|
||||
class BaseSplitter(object):
|
||||
|
||||
def __init__(self, options):
|
||||
self.options = options
|
||||
|
||||
def split(self, input):
|
||||
"""
|
||||
|
||||
:param str input:
|
||||
:return:
|
||||
:rtype: list[str]
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class MecabSplitter(BaseSplitter):
|
||||
def __init__(self, options):
|
||||
super(MecabSplitter, self).__init__(options)
|
||||
self.ctypes_libmecab = None
|
||||
self.ctypes_mecab = None
|
||||
if not native_module:
|
||||
@ -108,9 +125,12 @@ class MecabBinder(object):
|
||||
if self.ctypes_libmecab:
|
||||
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
|
||||
|
||||
MeCabBinder = MecabSplitter # keep backward compatibility until Sphinx-1.6
|
||||
|
||||
class JanomeBinder(object):
|
||||
|
||||
class JanomeSplitter(BaseSplitter):
|
||||
def __init__(self, options):
|
||||
super(JanomeSplitter, self).__init__(options)
|
||||
self.user_dict = options.get('user_dic')
|
||||
self.user_dict_enc = options.get('user_dic_enc', 'utf8')
|
||||
self.init_tokenizer()
|
||||
@ -125,7 +145,7 @@ class JanomeBinder(object):
|
||||
return result.split(u' ')
|
||||
|
||||
|
||||
class TinySegmenter(object):
|
||||
class DefaultSplitter(BaseSplitter):
|
||||
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
|
||||
u'[一二三四五六七八九十百千万億兆]': u'M',
|
||||
u'[一-龠々〆ヵヶ]': u'H',
|
||||
@ -501,6 +521,9 @@ class TinySegmenter(object):
|
||||
return result
|
||||
|
||||
|
||||
TinySegmenter = DefaultSplitter # keep backward compatibility until Sphinx-1.6
|
||||
|
||||
|
||||
class SearchJapanese(SearchLanguage):
|
||||
"""
|
||||
Japanese search implementation: uses no stemmer, but word splitting is quite
|
||||
@ -508,18 +531,23 @@ class SearchJapanese(SearchLanguage):
|
||||
"""
|
||||
lang = 'ja'
|
||||
language_name = 'Japanese'
|
||||
# Short aliases accepted for the ``type`` search option, mapped to the dotted
# import path of the splitter class that each alias selects.
splitters = {
    'default': 'sphinx.search.ja.DefaultSplitter',
    'mecab': 'sphinx.search.ja.MecabSplitter',
    'janome': 'sphinx.search.ja.JanomeSplitter',
}
|
||||
|
||||
def init(self, options):
|
||||
type = options.get('type', 'default')
|
||||
if type not in ('mecab', 'janome', 'default'):
|
||||
raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
|
||||
" or 'default'"))
|
||||
if type == 'mecab':
|
||||
self.splitter = MecabBinder(options)
|
||||
if type == 'janome':
|
||||
self.splitter = JanomeBinder(options)
|
||||
if type in self.splitters:
|
||||
dotted_path = self.splitters[type]
|
||||
else:
|
||||
self.splitter = TinySegmenter()
|
||||
dotted_path = type
|
||||
try:
|
||||
self.splitter = import_object(dotted_path)(options)
|
||||
except ExtensionError:
|
||||
raise ExtensionError("Splitter module %r can't be imported" %
|
||||
dotted_path)
|
||||
|
||||
def split(self, input):
|
||||
return self.splitter.split(input)
|
||||
|
Loading…
Reference in New Issue
Block a user