Merge pull request #1854 from mocobeta/search_ja_janome

Allow choosing Janome as the Japanese splitter.
2025-02-25 18:55:22 -06:00 · 2016-02-14 16:46:21 +09:00 · 2016-02-14 16:46:21 +09:00 · 81a0f13199
commit 81a0f13199
parent 7a4f914f91 1d22c20664
2 changed files with 31 additions and 3 deletions
--- a/doc/config.rst
+++ b/doc/config.rst
@ -910,12 +910,16 @@ that use Sphinx's HTMLWriter class.

   The Japanese support has these options:

-   * ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
+   * ``type`` -- ``'mecab'``, ``'janome'`` or ``'default'`` (selects the MeCab, Janome or
     TinySegmenter word splitter algorithm)
   * ``dic_enc`` -- the encoding for the MeCab algorithm
   * ``dict`` -- the dictionary to use for the MeCab algorithm
   * ``lib`` -- the library name for finding the MeCab library via ctypes if the
     Python binding is not installed
+   * ``user_dic`` -- the user dictionary file path for Janome
+   * ``user_dic_enc`` -- the encoding for the user dictionary file specified by the ``user_dic`` option (default is ``'utf8'``)
+
+   `Janome <https://pypi.python.org/pypi/Janome>`_ is required to use type ``'janome'``.

   .. versionadded:: 1.1

--- a/sphinx/search/ja.py
+++ b/sphinx/search/ja.py
@ -29,6 +29,12 @@ try:
 except ImportError:
    native_module = False

+try:
+    import janome.tokenizer
+    janome_module = True
+except ImportError:
+    janome_module = False
+
 from sphinx.errors import SphinxError
 from sphinx.search import SearchLanguage

@ -103,6 +109,22 @@ class MecabBinder(object):
            self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)


+class JanomeBinder(object):
+    """Word splitter that delegates tokenization to Janome.
+
+    Reads two keys from ``options``: ``user_dic`` (user dictionary passed
+    to the tokenizer as ``udic``; ``None`` when absent) and
+    ``user_dic_enc`` (passed as ``udic_enc``; ``'utf8'`` when absent).
+    """
+
+    def __init__(self, options):
+        self.user_dict_enc = options.get('user_dic_enc', 'utf8')
+        self.user_dict = options.get('user_dic')
+        self.init_tokenizer()
+
+    def init_tokenizer(self):
+        """Build ``self.tokenizer``.
+
+        Raises RuntimeError when the ``janome`` import at module level
+        failed (``janome_module`` is false).
+        """
+        if not janome_module:
+            raise RuntimeError('Janome is not available')
+        tokenizer_args = {
+            'udic': self.user_dict,
+            'udic_enc': self.user_dict_enc,
+        }
+        self.tokenizer = janome.tokenizer.Tokenizer(**tokenizer_args)
+
+    def split(self, input):
+        """Return the token surfaces of ``input`` as a list of strings.
+
+        The surfaces are joined with single spaces and the result is split
+        on single spaces again, so a surface that itself contains a space
+        does not survive as one list item.
+        """
+        surfaces = [token.surface for token in self.tokenizer.tokenize(input)]
+        return u' '.join(surfaces).split(u' ')
+
+
 class TinySegmenter(object):
    patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
        u'[一二三四五六七八九十百千万億兆]': u'M',
@ -489,11 +511,13 @@ class SearchJapanese(SearchLanguage):

    def init(self, options):
+        """Pick the word splitter named by ``options['type']``.
+
+        ``type`` falls back to ``'default'`` when the key is missing.
+        ``'mecab'`` selects MecabBinder, ``'janome'`` selects JanomeBinder
+        and ``'default'`` selects TinySegmenter; the two binders receive
+        ``options`` unchanged.
+
+        Raises ValueError for any other ``type`` value.
+        """
        type = options.get('type', 'default')
-        if type not in ('mecab', 'default'):
-            raise ValueError(("Japanese tokenizer's type should be 'mecab'"
+        if type not in ('mecab', 'janome', 'default'):
+            raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
                              " or 'default'"))
        if type == 'mecab':
            self.splitter = MecabBinder(options)
+        # Must be ``elif``: with a second plain ``if`` the ``else`` below
+        # also runs for 'mecab' and replaces MecabBinder with TinySegmenter.
+        elif type == 'janome':
+            self.splitter = JanomeBinder(options)
        else:
            self.splitter = TinySegmenter()