Merge pull request #2170 from daoiqi/zh-search

add Chinese Search into milestones
This commit is contained in:
Georg Brandl 2016-01-06 10:34:36 +01:00
commit 3bd7d08b6f
4 changed files with 298 additions and 2 deletions

View File

@ -874,6 +874,7 @@ that use Sphinx's HTMLWriter class.
* ``es`` -- Spanish
* ``sv`` -- Swedish
* ``tr`` -- Turkish
* ``zh`` -- Chinese
.. admonition:: Accelerating build speed
@ -908,6 +909,12 @@ that use Sphinx's HTMLWriter class.
.. versionadded:: 1.1
The Chinese support has these options:
* ``dict`` -- the ``jieba`` dictionary path if want to use
custom dictionary.
.. confval:: html_search_scorer
The name of a JavaScript file (relative to the configuration directory) that

View File

@ -261,11 +261,12 @@ html_static_path = ['%(dot)sstatic']
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
#html_search_language = 'en'
# A dictionary with options for the search language support, empty by default.
# Now only 'ja' uses this config value
# 'ja' uses this config value.
# 'zh' user can custom change `jieba` dictionary path.
#html_search_options = {'type': 'default'}
# The name of a javascript file (relative to the configuration directory) that

View File

@ -135,6 +135,7 @@ languages = {
'ru': 'sphinx.search.ru.SearchRussian',
'sv': 'sphinx.search.sv.SearchSwedish',
'tr': 'sphinx.search.tr.SearchTurkish',
'zh': 'sphinx.search.zh.SearchChinese',
}

287
sphinx/search/zh.py Normal file
View File

@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
"""
sphinx.search.zh
~~~~~~~~~~~~~~~~
Chinese search language: includes routine to split words.
:copyright: Copyright 2015 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import os
import re
from sphinx.search import SearchLanguage
try:
# http://bitbucket.org/methane/porterstemmer/
from porterstemmer import Stemmer as CStemmer
CSTEMMER = True
PYSTEMMER = False
except ImportError:
CSTEMMER = False
try:
from Stemmer import Stemmer as PyStemmer
PYSTEMMER = True
except ImportError:
from sphinx.util.stemmer import PorterStemmer
PYSTEMMER = False
try:
import jieba
JIEBA = True
except ImportError:
JIEBA = False
english_stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())
js_porter_stemmer = """
/**
* Porter Stemmer
*/
var Stemmer = function() {
var step2list = {
ational: 'ate',
tional: 'tion',
enci: 'ence',
anci: 'ance',
izer: 'ize',
bli: 'ble',
alli: 'al',
entli: 'ent',
eli: 'e',
ousli: 'ous',
ization: 'ize',
ation: 'ate',
ator: 'ate',
alism: 'al',
iveness: 'ive',
fulness: 'ful',
ousness: 'ous',
aliti: 'al',
iviti: 'ive',
biliti: 'ble',
logi: 'log'
};
var step3list = {
icate: 'ic',
ative: '',
alize: 'al',
iciti: 'ic',
ical: 'ic',
ful: '',
ness: ''
};
var c = "[^aeiou]"; // consonant
var v = "[aeiouy]"; // vowel
var C = c + "[^aeiouy]*"; // consonant sequence
var V = v + "[aeiou]*"; // vowel sequence
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem
this.stemWord = function (w) {
var stem;
var suffix;
var firstch;
var origword = w;
if (w.length < 3)
return w;
var re;
var re2;
var re3;
var re4;
firstch = w.substr(0,1);
if (firstch == "y")
w = firstch.toUpperCase() + w.substr(1);
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w))
w = w.replace(re,"$1$2");
else if (re2.test(w))
w = w.replace(re2,"$1$2");
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w))
w = w + "e";
else if (re3.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
else if (re4.test(w))
w = w + "e";
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem))
w = stem + "i";
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step2list[suffix];
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step3list[suffix];
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem))
w = stem;
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem))
w = stem;
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
w = stem;
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y")
w = firstch.toLowerCase() + w.substr(1);
return w;
}
}
"""
class SearchChinese(SearchLanguage):
"""
Chinese search implementation
"""
lang = 'zh'
language_name = 'Chinese'
js_stemmer_code = js_porter_stemmer
stopwords = english_stopwords
latin1_letters = re.compile(r'\w+(?u)[\u0000-\u00ff]')
def init(self, options):
if JIEBA:
dict_path = options.get('dict')
if dict_path and os.path.isfile(dict_path):
jieba.set_dictionary(dict_path)
if CSTEMMER:
class Stemmer(CStemmer):
def stem(self, word):
return self(word.lower())
elif PYSTEMMER:
class Stemmer(object):
def __init__(self):
self.stemmer = PyStemmer('porter')
def stem(self, word):
return self.stemmer.stemWord(word)
else:
class Stemmer(PorterStemmer):
"""All those porter stemmer implementations look hideous;
make at least the stem method nicer.
"""
def stem(self, word):
word = word.lower()
return PorterStemmer.stem(self, word, 0, len(word) - 1)
self.stemmer = Stemmer()
def split(self, input):
chinese = []
if JIEBA:
chinese = list(jieba.cut_for_search(input))
latin1 = self.latin1_letters.findall(input)
return chinese + latin1
def word_filter(self, stemmed_word):
return len(stemmed_word) > 1
def stem(self, word):
return self.stemmer.stem(word)