Merge pull request #2170 from daoiqi/zh-search

add Chinese Search into milestones
2025-02-25 18:55:22 -06:00 · 2016-01-06 10:34:36 +01:00 · 2016-01-06 10:34:36 +01:00 · 3bd7d08b6f
commit 3bd7d08b6f
parent 1179bbc96b d76a700fa2
4 changed files with 298 additions and 2 deletions
--- a/doc/config.rst
+++ b/doc/config.rst
@ -874,6 +874,7 @@ that use Sphinx's HTMLWriter class.
   * ``es`` -- Spanish
   * ``sv`` -- Swedish
   * ``tr`` -- Turkish
+   * ``zh`` -- Chinese

   .. admonition:: Accelerating build speed

@ -908,6 +909,12 @@ that use Sphinx's HTMLWriter class.

   .. versionadded:: 1.1

+
+   The Chinese support has these options:
+
+   * ``dict`` -- the path to the ``jieba`` dictionary, if you want to use a
+     custom dictionary.
+
 .. confval:: html_search_scorer

   The name of a JavaScript file (relative to the configuration directory) that
--- a/sphinx/quickstart.py
+++ b/sphinx/quickstart.py
@ -261,11 +261,12 @@ html_static_path = ['%(dot)sstatic']
 # Language to be used for generating the HTML full-text search index.
 # Sphinx supports the following languages:
 #   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
-#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
 #html_search_language = 'en'

 # A dictionary with options for the search language support, empty by default.
-# Now only 'ja' uses this config value
+# 'ja' uses this config value.
+# 'zh' users can customize the `jieba` dictionary path.
 #html_search_options = {'type': 'default'}

 # The name of a javascript file (relative to the configuration directory) that
--- a/sphinx/search/init.py
+++ b/sphinx/search/init.py
@ -135,6 +135,7 @@ languages = {
    'ru': 'sphinx.search.ru.SearchRussian',
    'sv': 'sphinx.search.sv.SearchSwedish',
    'tr': 'sphinx.search.tr.SearchTurkish',
+    'zh': 'sphinx.search.zh.SearchChinese',
 }


--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@ -0,0 +1,287 @@
+# -*- coding: utf-8 -*-
+"""
+    sphinx.search.zh
+    ~~~~~~~~~~~~~~~~
+
+    Chinese search language: includes routine to split words.
+
+    :copyright: Copyright 2015 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import os
+import re
+
+from sphinx.search import SearchLanguage
+
+# Choose a Porter stemmer implementation, in order of preference: the
+# ``porterstemmer`` package, then PyStemmer (module ``Stemmer``), then the
+# PorterStemmer class from sphinx.util.stemmer.  CSTEMMER and PYSTEMMER
+# record which import succeeded; SearchChinese.init() reads them to build
+# its stemmer wrapper.
+try:
+    # http://bitbucket.org/methane/porterstemmer/
+    from porterstemmer import Stemmer as CStemmer
+    CSTEMMER = True
+    PYSTEMMER = False
+except ImportError:
+    CSTEMMER = False
+    try:
+        from Stemmer import Stemmer as PyStemmer
+        PYSTEMMER = True
+    except ImportError:
+        # Neither optional package is importable: fall back to the
+        # PorterStemmer shipped in sphinx.util.stemmer.
+        from sphinx.util.stemmer import PorterStemmer
+        PYSTEMMER = False
+
+# jieba is optional as well; JIEBA records whether it could be imported.
+# SearchChinese only calls into jieba when this flag is true.
+try:
+    import jieba
+    JIEBA = True
+except ImportError:
+    JIEBA = False
+
+# English stop words, built by whitespace-splitting the literal below.
+# SearchChinese uses this set as its ``stopwords`` attribute.
+english_stopwords = set("""
+a  and  are  as  at
+be  but  by
+for
+if  in  into  is  it
+near  no  not
+of  on  or
+such
+that  the  their  then  there  these  they  this  to
+was  will  with
+""".split())
+
+# JavaScript source of a Porter stemmer exposing ``Stemmer.stemWord(w)``.
+# SearchChinese uses this text as its ``js_stemmer_code`` attribute.
+#
+# NOTE: this is a regular (non-raw) Python string, so escapes are processed
+# before the JavaScript is emitted: ``\\\\1`` becomes ``\\1`` in the
+# JavaScript text (a ``\1`` back-reference once the JavaScript string
+# literal is evaluated), and a backslash at the end of a line joins that
+# line with the next one, keeping the long regex literals on one line.
+js_porter_stemmer = """
+/**
+ * Porter Stemmer
+ */
+var Stemmer = function() {
+
+  var step2list = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  };
+
+  var step3list = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  };
+
+  var c = "[^aeiou]";          // consonant
+  var v = "[aeiouy]";          // vowel
+  var C = c + "[^aeiouy]*";    // consonant sequence
+  var V = v + "[aeiou]*";      // vowel sequence
+
+  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
+  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
+  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
+  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem
+
+  this.stemWord = function (w) {
+    var stem;
+    var suffix;
+    var firstch;
+    var origword = w;
+
+    if (w.length < 3)
+      return w;
+
+    var re;
+    var re2;
+    var re3;
+    var re4;
+
+    firstch = w.substr(0,1);
+    if (firstch == "y")
+      w = firstch.toUpperCase() + w.substr(1);
+
+    // Step 1a
+    re = /^(.+?)(ss|i)es$/;
+    re2 = /^(.+?)([^s])s$/;
+
+    if (re.test(w))
+      w = w.replace(re,"$1$2");
+    else if (re2.test(w))
+      w = w.replace(re2,"$1$2");
+
+    // Step 1b
+    re = /^(.+?)eed$/;
+    re2 = /^(.+?)(ed|ing)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      re = new RegExp(mgr0);
+      if (re.test(fp[1])) {
+        re = /.$/;
+        w = w.replace(re,"");
+      }
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1];
+      re2 = new RegExp(s_v);
+      if (re2.test(stem)) {
+        w = stem;
+        re2 = /(at|bl|iz)$/;
+        re3 = new RegExp("([^aeiouylsz])\\\\1$");
+        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+        if (re2.test(w))
+          w = w + "e";
+        else if (re3.test(w)) {
+          re = /.$/;
+          w = w.replace(re,"");
+        }
+        else if (re4.test(w))
+          w = w + "e";
+      }
+    }
+
+    // Step 1c
+    re = /^(.+?)y$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(s_v);
+      if (re.test(stem))
+        w = stem + "i";
+    }
+
+    // Step 2
+    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
+ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step2list[suffix];
+    }
+
+    // Step 3
+    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step3list[suffix];
+    }
+
+    // Step 4
+    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
+iti|ous|ive|ize)$/;
+    re2 = /^(.+?)(s|t)(ion)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      if (re.test(stem))
+        w = stem;
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1] + fp[2];
+      re2 = new RegExp(mgr1);
+      if (re2.test(stem))
+        w = stem;
+    }
+
+    // Step 5
+    re = /^(.+?)e$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      re2 = new RegExp(meq1);
+      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
+        w = stem;
+    }
+    re = /ll$/;
+    re2 = new RegExp(mgr1);
+    if (re.test(w) && re2.test(w)) {
+      re = /.$/;
+      w = w.replace(re,"");
+    }
+
+    // and turn initial Y back to y
+    if (firstch == "y")
+      w = firstch.toLowerCase() + w.substr(1);
+    return w;
+  }
+}
+"""
+
+
+class SearchChinese(SearchLanguage):
+    """
+    Chinese search implementation
+    """
+
+    lang = 'zh'
+    language_name = 'Chinese'
+    js_stemmer_code = js_porter_stemmer
+    stopwords = english_stopwords
+    latin1_letters = re.compile(r'\w+(?u)[\u0000-\u00ff]')
+
+    def init(self, options):
+        if JIEBA:
+            dict_path = options.get('dict')
+            if dict_path and os.path.isfile(dict_path):
+                jieba.set_dictionary(dict_path)
+
+        if CSTEMMER:
+            class Stemmer(CStemmer):
+                def stem(self, word):
+                    return self(word.lower())
+        elif PYSTEMMER:
+            class Stemmer(object):
+                def __init__(self):
+                    self.stemmer = PyStemmer('porter')
+
+                def stem(self, word):
+                    return self.stemmer.stemWord(word)
+        else:
+            class Stemmer(PorterStemmer):
+                """All those porter stemmer implementations look hideous;
+                make at least the stem method nicer.
+                """
+                def stem(self, word):
+                    word = word.lower()
+                    return PorterStemmer.stem(self, word, 0, len(word) - 1)
+
+        self.stemmer = Stemmer()
+
+    def split(self, input):
+        chinese = []
+        if JIEBA:
+            chinese = list(jieba.cut_for_search(input))
+
+        latin1 = self.latin1_letters.findall(input)
+        return chinese + latin1
+
+    def word_filter(self, stemmed_word):
+        return len(stemmed_word) > 1
+
+    def stem(self, word):
+        return self.stemmer.stem(word)