Closes #1853: support custom text splitter on html search with language='ja'.

shimizukawa 2016-02-14 19:13:38 +09:00
parent e1394b5427
commit 0992ce542b
3 changed files with 92 additions and 21 deletions


@@ -72,6 +72,7 @@ Features added
* #1286, #2099: Add ``sphinx.ext.autosectionlabel`` extension to allow reference
sections using its title. Thanks to Tadhg O'Higgins.
* #1854: Allow to choose Janome for Japanese splitter.
* #1853: Support custom text splitter on html search with ``language='ja'``.
Bugs fixed
----------


@@ -915,19 +915,61 @@ that use Sphinx's HTMLWriter class.
The Japanese support has these options:
* ``type`` -- ``'mecab'`` or ``'janome'`` or ``'default'`` (selects either MeCab or Janome or
TinySegmenter word splitter algorithm)
* ``dic_enc`` -- the encoding for the MeCab algorithm
* ``dict`` -- the dictionary to use for the MeCab algorithm
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
Python binding is not installed
* ``user_dic`` -- the user dictionary file path for Janome
* ``user_dic_enc`` -- the encoding for the user dictionary file specified by ``user_dic`` option (default is 'utf8')
:type:
_`type` is a dotted module path string that selects the splitter implementation,
which should be derived from :class:`sphinx.search.ja.BaseSplitter`.
If it is not specified or is ``None``, ``'sphinx.search.ja.DefaultSplitter'`` will
be used.
`Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.
You can choose from these modules:
:'sphinx.search.ja.DefaultSplitter':
TinySegmenter algorithm. This is the default splitter.
:'sphinx.search.ja.MecabSplitter':
MeCab binding. To use this splitter, the 'mecab' Python binding or the dynamic
link library ('libmecab.so' on Linux, 'libmecab.dll' on Windows) is required.
:'sphinx.search.ja.JanomeSplitter':
Janome binding. To use this splitter,
`Janome <https://pypi.python.org/pypi/Janome>`_ is required.
For backward compatibility, ``'mecab'``, ``'janome'`` and ``'default'`` are also
accepted; however, these short names will be deprecated in Sphinx-1.6.
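For example, a project-local splitter can be selected by pointing ``type`` at its
dotted path (a sketch; the package and class names here are placeholders)::

    html_search_options = {
        'type': 'mypackage.splitter.MySplitter',
    }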
The other options depend on the splitter you choose.
Options for ``'mecab'``:
:dic_enc:
_`dic_enc option` is the encoding for the MeCab algorithm.
:dict:
_`dict option` is the dictionary to use for the MeCab algorithm.
:lib:
_`lib option` is the library name for finding the MeCab library via ctypes if
the Python binding is not installed.
For example::

    html_search_options = {
        'type': 'mecab',
        'dic_enc': 'utf-8',
        'dict': '/path/to/mecab.dic',
        'lib': '/path/to/libmecab.so',
    }
Options for ``'janome'``:
:user_dic: _`user_dic option` is the user dictionary file path for Janome.
:user_dic_enc:
_`user_dic_enc option` is the encoding for the user dictionary file specified by
the ``user_dic`` option. The default is ``'utf8'``.
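For example (a sketch; the dictionary path is a placeholder)::

    html_search_options = {
        'type': 'janome',
        'user_dic': '/path/to/user_dic.csv',
        'user_dic_enc': 'utf8',
    }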
.. versionadded:: 1.1

.. versionchanged:: 1.4
   ``html_search_options`` for Japanese is re-organized, and any custom splitter
   can be used via the `type`_ setting.
The Chinese support has these options:


@@ -35,12 +35,29 @@ try:
except ImportError:
    janome_module = False

from sphinx.errors import SphinxError
from sphinx.errors import SphinxError, ExtensionError
from sphinx.search import SearchLanguage
from sphinx.util import import_object

class MecabBinder(object):
class BaseSplitter(object):
    def __init__(self, options):
        self.options = options

    def split(self, input):
"""
:param str input:
:return:
:rtype: list[str]
"""
        raise NotImplementedError

class MecabSplitter(BaseSplitter):
    def __init__(self, options):
        super(MecabSplitter, self).__init__(options)
        self.ctypes_libmecab = None
        self.ctypes_mecab = None
        if not native_module:
@@ -108,9 +125,12 @@ class MecabBinder(object):
        if self.ctypes_libmecab:
            self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
MecabBinder = MecabSplitter  # keep backward compatibility until Sphinx-1.6
class JanomeBinder(object):
class JanomeSplitter(BaseSplitter):
    def __init__(self, options):
        super(JanomeSplitter, self).__init__(options)
        self.user_dict = options.get('user_dic')
        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
        self.init_tokenizer()
@@ -125,7 +145,7 @@ class JanomeBinder(object):
        return result.split(u' ')

class TinySegmenter(object):
class DefaultSplitter(BaseSplitter):
    patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
        u'[一二三四五六七八九十百千万億兆]': u'M',
        u'[一-龠々〆ヵヶ]': u'H',
@@ -501,6 +521,9 @@ class TinySegmenter(object):
        return result

TinySegmenter = DefaultSplitter  # keep backward compatibility until Sphinx-1.6

class SearchJapanese(SearchLanguage):
    """
    Japanese search implementation: uses no stemmer, but word splitting is quite
@@ -508,18 +531,23 @@ class SearchJapanese(SearchLanguage):
    """
    lang = 'ja'
    language_name = 'Japanese'
    splitters = {
        'default': 'sphinx.search.ja.DefaultSplitter',
        'mecab': 'sphinx.search.ja.MecabSplitter',
        'janome': 'sphinx.search.ja.JanomeSplitter',
    }
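    # Note: the short names above map to the dotted paths shown; any other
    # ``type`` value is treated as a dotted path itself and imported directly
    # (see ``init`` below).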
    def init(self, options):
        type = options.get('type', 'default')
        if type not in ('mecab', 'janome', 'default'):
            raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                              " or 'default'"))
        if type == 'mecab':
            self.splitter = MecabBinder(options)
        if type == 'janome':
            self.splitter = JanomeBinder(options)
        if type in self.splitters:
            dotted_path = self.splitters[type]
        else:
            self.splitter = TinySegmenter()
            dotted_path = type
        try:
            self.splitter = import_object(dotted_path)(options)
        except ExtensionError:
            raise ExtensionError("Splitter module %r can't be imported" %
                                 dotted_path)

    def split(self, input):
        return self.splitter.split(input)
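With this change, a third-party splitter only needs to derive from
``BaseSplitter``, implement ``split()`` returning a list of words, and be
referenced by its dotted path in ``html_search_options``. A minimal sketch
(the module, class, and ``regex`` option names below are invented for
illustration, not part of this commit):

    # mypackage/splitter.py -- hypothetical example, not shipped with Sphinx
    import re

    from sphinx.search.ja import BaseSplitter

    class RegexSplitter(BaseSplitter):
        """Toy splitter that breaks text on runs of whitespace."""

        def __init__(self, options):
            super(RegexSplitter, self).__init__(options)
            # 'regex' is an invented key read from html_search_options.
            self.pattern = re.compile(options.get('regex', r'\s+'))

        def split(self, input):
            # Must return a list of words, like the built-in splitters.
            return [word for word in self.pattern.split(input) if word]

    # conf.py (sketch):
    # html_search_options = {'type': 'mypackage.splitter.RegexSplitter'}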