From 765f035ec1b8674092a48cc589de54911bd328d0 Mon Sep 17 00:00:00 2001
From: Enhao Hu
Date: Thu, 23 Apr 2015 01:25:42 +0800
Subject: [PATCH] add Traditional Chinese search

---
 sphinx/search/zh_TW.py | 320 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 320 insertions(+)
 create mode 100644 sphinx/search/zh_TW.py

diff --git a/sphinx/search/zh_TW.py b/sphinx/search/zh_TW.py
new file mode 100644
index 000000000..ba99d2d8f
--- /dev/null
+++ b/sphinx/search/zh_TW.py
@@ -0,0 +1,320 @@
+# -*- coding: utf-8 -*-
+"""
+    sphinx.search.zh_TW
+    ~~~~~~~~~~~~~~~~~~~
+
+    Traditional Chinese search language: includes routine to split words.
+
+    :copyright: Copyright 2015 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import os
+import re
+
+from sphinx.search import SearchLanguage
+
+try:
+    # http://bitbucket.org/methane/porterstemmer/
+    from porterstemmer import Stemmer as CStemmer
+    CSTEMMER = True
+    PYSTEMMER = False
+except ImportError:
+    CSTEMMER = False
+    try:
+        from Stemmer import Stemmer as PyStemmer
+        PYSTEMMER = True
+    except ImportError:
+        from sphinx.util.stemmer import PorterStemmer
+        PYSTEMMER = False
+
+try:
+    import jieba
+    JIEBA = True
+except ImportError:
+    JIEBA = False
+
+english_stopwords = set("""
+a  and  are  as  at
+be  but  by
+for
+if  in  into  is  it
+near  no  not
+of  on  or
+such
+that  the  their  then  there  these  they  this  to
+was  will  with
+""".split())
+
+js_porter_stemmer = """
+/**
+ * Porter Stemmer
+ */
+var Stemmer = function() {
+
+  var step2list = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  };
+
+  var step3list = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  };
+
+  var c = "[^aeiou]";          // consonant
+  var v = "[aeiouy]";          // vowel
+  var C = c + "[^aeiouy]*";    // consonant sequence
+  var V = v + "[aeiou]*";      // vowel sequence
+
+  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
+  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
+  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
+  var s_v   = "^(" + C + ")?" + v;                         // vowel in stem
+
+  this.stemWord = function (w) {
+    var stem;
+    var suffix;
+    var firstch;
+    var origword = w;
+
+    if (w.length < 3)
+      return w;
+
+    var re;
+    var re2;
+    var re3;
+    var re4;
+
+    firstch = w.substr(0,1);
+    if (firstch == "y")
+      w = firstch.toUpperCase() + w.substr(1);
+
+    // Step 1a
+    re = /^(.+?)(ss|i)es$/;
+    re2 = /^(.+?)([^s])s$/;
+
+    if (re.test(w))
+      w = w.replace(re,"$1$2");
+    else if (re2.test(w))
+      w = w.replace(re2,"$1$2");
+
+    // Step 1b
+    re = /^(.+?)eed$/;
+    re2 = /^(.+?)(ed|ing)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      re = new RegExp(mgr0);
+      if (re.test(fp[1])) {
+        re = /.$/;
+        w = w.replace(re,"");
+      }
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1];
+      re2 = new RegExp(s_v);
+      if (re2.test(stem)) {
+        w = stem;
+        re2 = /(at|bl|iz)$/;
+        re3 = new RegExp("([^aeiouylsz])\\\\1$");
+        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+        if (re2.test(w))
+          w = w + "e";
+        else if (re3.test(w)) {
+          re = /.$/;
+          w = w.replace(re,"");
+        }
+        else if (re4.test(w))
+          w = w + "e";
+      }
+    }
+
+    // Step 1c
+    re = /^(.+?)y$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(s_v);
+      if (re.test(stem))
+        w = stem + "i";
+    }
+
+    // Step 2
+    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
+ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step2list[suffix];
+    }
+
+    // Step 3
+    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      suffix = fp[2];
+      re = new RegExp(mgr0);
+      if (re.test(stem))
+        w = stem + step3list[suffix];
+    }
+
+    // Step 4
+    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
+iti|ous|ive|ize)$/;
+    re2 = /^(.+?)(s|t)(ion)$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      if (re.test(stem))
+        w = stem;
+    }
+    else if (re2.test(w)) {
+      var fp = re2.exec(w);
+      stem = fp[1] + fp[2];
+      re2 = new RegExp(mgr1);
+      if (re2.test(stem))
+        w = stem;
+    }
+
+    // Step 5
+    re = /^(.+?)e$/;
+    if (re.test(w)) {
+      var fp = re.exec(w);
+      stem = fp[1];
+      re = new RegExp(mgr1);
+      re2 = new RegExp(meq1);
+      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
+        w = stem;
+    }
+    re = /ll$/;
+    re2 = new RegExp(mgr1);
+    if (re.test(w) && re2.test(w)) {
+      re = /.$/;
+      w = w.replace(re,"");
+    }
+
+    // and turn initial Y back to y
+    if (firstch == "y")
+      w = firstch.toLowerCase() + w.substr(1);
+    return w;
+  }
+}
+"""
+
+
+class SearchChinese(SearchLanguage):
+    """Search language for Traditional Chinese (``zh_TW``).
+
+    Chinese text is segmented with ``jieba`` when that package is importable;
+    Latin-script terms are additionally picked up with ``latin1_letters``.
+    Stemming reuses the English Porter stemmer, on both the Python side and
+    the JavaScript side (``js_porter_stemmer``).
+    """
+
+    lang = 'zh_TW'
+    language_name = 'Traditional Chinese'
+    js_stemmer_code = js_porter_stemmer
+    stopwords = english_stopwords
+    # The UNICODE flag is passed as an argument: an inline ``(?u)`` in the
+    # middle of a pattern is rejected by newer versions of the ``re`` module.
+    # ``\x00-\xff`` is understood by ``re`` itself on Python 2 and 3 alike,
+    # whereas ``\uXXXX`` escapes are only understood from Python 3.3 on.
+    # NOTE(review): the pattern requires one trailing Latin-1 character after
+    # ``\w+``, so a match may end in a separator -- confirm this is intended.
+    latin1_letters = re.compile(r'\w+[\x00-\xff]', re.UNICODE)
+
+    def init(self, options):
+        """Configure jieba (if available) and select a stemmer backend.
+
+        :param options: search options mapping; the optional ``'dict'`` entry
+                        is the path of a custom jieba dictionary file.
+        """
+        if JIEBA:
+            dict_path = options.get('dict')
+            # ``dict`` is optional: os.path.isfile(None) would raise.
+            if dict_path and os.path.isfile(dict_path):
+                jieba.set_dictionary(dict_path)
+                # Single-argument call form prints the same text on
+                # Python 2 and Python 3.
+                print("\nDictionary path: %s" % dict_path)
+
+        if CSTEMMER:
+            class Stemmer(CStemmer):
+                def stem(self, word):
+                    return self(word.lower())
+        elif PYSTEMMER:
+            class Stemmer(object):
+                def __init__(self):
+                    self.stemmer = PyStemmer('porter')
+
+                def stem(self, word):
+                    return self.stemmer.stemWord(word)
+        else:
+            class Stemmer(PorterStemmer):
+                """All those porter stemmer implementations look hideous;
+                make at least the stem method nicer.
+                """
+                def stem(self, word):
+                    word = word.lower()
+                    return PorterStemmer.stem(self, word, 0, len(word) - 1)
+
+        self.stemmer = Stemmer()
+
+    def split(self, input):
+        """Split *input* into search terms.
+
+        :returns: the jieba search-mode segments (an empty list when jieba is
+                  not installed) followed by the ``latin1_letters`` matches.
+        """
+        # Start from an empty list so a missing jieba does not leave
+        # ``chinese`` unbound.
+        chinese = []
+        if JIEBA:
+            chinese = list(jieba.cut_for_search(input))
+
+        latin1 = self.latin1_letters.findall(input)
+        return chinese + latin1
+
+    def word_filter(self, stemmed_word):
+        """Keep only stemmed words longer than one character."""
+        return len(stemmed_word) > 1
+
+    def stem(self, word):
+        """Return the stem of *word* using the stemmer chosen in ``init``."""
+        return self.stemmer.stem(word)
+
+
+def setup(app):
+    """Register :class:`SearchChinese` as a search language on *app*."""
+    app.add_search_language(SearchChinese)