#273: Add an API for adding full-text search support for languages other than English. Add support for Japanese.

Based on the implementation by SHIBUKAWA Yoshiki in https://bitbucket.org/shibu/sphinx/.
This commit is contained in:
Georg Brandl 2011-01-04 12:40:19 +01:00
parent c5b5c16cb3
commit 7beb8533b5
11 changed files with 681 additions and 241 deletions

View File

@ -27,6 +27,7 @@ Other contributors, listed alphabetically, are:
* Benjamin Peterson -- unittests
* T. Powers -- HTML output improvements
* Stefan Seefeld -- toctree improvements
* Shibukawa Yoshiki -- pluggable search API and Japanese search
* Antonio Valentino -- qthelp builder
* Pauli Virtanen -- autodoc improvements, autosummary extension
* Stefan van der Walt -- autosummary extension

View File

@ -28,6 +28,9 @@ Release 1.1 (in development)
requests and allow configuring the timeout. New config values:
:confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
* #273: Add an API for adding full-text search support for languages
other than English. Add support for Japanese.
* #221: Add Swedish locale.
* Added ``inline`` option to graphviz directives, and fixed the

View File

@ -694,6 +694,38 @@ that use Sphinx' HTMLWriter class.
.. versionadded:: 1.0
.. confval:: html_search_language
Language to be used for generating the HTML full-text search index. This
defaults to the global language selected with :confval:`language`. If there
is no support for this language, ``"en"`` is used which selects the English
language.
Support is present for these languages:
* ``en`` -- English
* ``ja`` -- Japanese
.. versionadded:: 1.1
.. confval:: html_search_options
A dictionary with options for the search language support, empty by default.
The meaning of these options depends on the language selected.
The English support has no options.
The Japanese support has these options:
* ``type`` -- ``'mecab'`` or ``'default'`` (selects either MeCab or
TinySegmenter word splitter algorithm)
* ``dic_enc`` -- the encoding for the MeCab algorithm
* ``dict`` -- the dictionary to use for the MeCab algorithm
* ``lib`` -- the library name for finding the MeCab library via ctypes if the
Python binding is not installed
.. versionadded:: 1.1
.. confval:: htmlhelp_basename
Output file base name for HTML help builder. Default is ``'pydoc'``.

View File

@ -286,6 +286,15 @@ the following public API:
.. versionadded:: 0.6
.. method:: Sphinx.add_search_language(cls)
Add *cls*, which must be a subclass of :class:`sphinx.search.SearchLanguage`,
as a support language for building the HTML full-text search index. The
class must have a *lang* attribute that indicates the language it should be
used for. See :confval:`html_search_language`.
.. versionadded:: 1.1
.. method:: Sphinx.connect(event, callback)
Register *callback* to be called when *event* is emitted. For details on

View File

@ -487,6 +487,11 @@ class Sphinx(object):
from sphinx.ext import autodoc
autodoc.AutoDirective._special_attrgetters[type] = getter
def add_search_language(self, cls):
    """Register *cls* as a search language for the HTML full-text search index.

    *cls* must be a subclass of :class:`sphinx.search.SearchLanguage` and
    must carry a ``lang`` attribute naming the language code it handles
    (see :confval:`html_search_language`).
    """
    from sphinx.search import languages, SearchLanguage
    # *cls* is a class object, not an instance, so the correct check is
    # issubclass(); isinstance(cls, SearchLanguage) would always be False
    # for a class and the assertion would always fail.
    assert issubclass(cls, SearchLanguage)
    languages[cls.lang] = cls
class TemplateBridge(object):
"""

View File

@ -539,13 +539,18 @@ class StandaloneHTMLBuilder(Builder):
if jsfile:
copyfile(jsfile, path.join(self.outdir, '_static',
'translations.js'))
# add context items for search function used in searchtools.js_t
ctx = self.globalcontext.copy()
ctx.update(self.indexer.globalcontext_for_searchtool())
# then, copy over theme-supplied static files
if self.theme:
themeentries = [path.join(themepath, 'static')
for themepath in self.theme.get_dirchain()[::-1]]
for entry in themeentries:
copy_static_entry(entry, path.join(self.outdir, '_static'),
self, self.globalcontext)
self, ctx)
# then, copy over all user-supplied static files
staticentries = [path.join(self.confdir, spath)
for spath in self.config.html_static_path]
@ -558,7 +563,7 @@ class StandaloneHTMLBuilder(Builder):
self.warn('html_static_path entry %r does not exist' % entry)
continue
copy_static_entry(entry, path.join(self.outdir, '_static'), self,
self.globalcontext, exclude_matchers=matchers)
ctx, exclude_matchers=matchers)
# copy logo and favicon files if not already in static path
if self.config.html_logo:
logobase = path.basename(self.config.html_logo)

View File

@ -106,6 +106,8 @@ class Config(object):
html_output_encoding = ('utf-8', 'html'),
html_compact_lists = (True, 'html'),
html_secnumber_suffix = ('. ', 'html'),
html_search_language = (None, 'html'),
html_search_options = ({}, 'html'),
# HTML help only options
htmlhelp_basename = (lambda self: make_filename(self.project), None),

View File

@ -3,7 +3,7 @@
sphinx.search
~~~~~~~~~~~~~
Create a search index for offline search.
Create a full-text search index for offline search.
:copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
@ -14,28 +14,90 @@ import cPickle as pickle
from docutils.nodes import comment, Text, NodeVisitor, SkipNode
from sphinx.util import jsdump, rpartition
try:
# http://bitbucket.org/methane/porterstemmer/
from porterstemmer import Stemmer as CStemmer
CSTEMMER = True
except ImportError:
from sphinx.util.stemmer import PorterStemmer
CSTEMMER = False
word_re = re.compile(r'\w+(?u)')
class SearchLanguage(object):
    """
    Base class for search natural-language preprocessors.  To add support
    for a new language, subclass this and override its methods.

    Subclasses should also override the `lang` class attribute with the
    language code ('en', 'fr', and so on).

    .. attribute:: stopwords

       Set of stop words for the target language, empty by default.  These
       words are excluded when building the index and are also embedded in
       the generated JavaScript.

    .. attribute:: js_stemmer_code

       JavaScript source of the client-side stemmer.  The class it defines
       must be named ``Stemmer`` and provide a ``stemWord`` method; the
       string is embedded as-is in searchtools.js.  It is used to
       preprocess the words readers type into the HTML search box, and must
       stem compatibly with the Python-side :meth:`stem`.
    """
    lang = None
    stopwords = set()
    js_stemmer_code = """
/**
 * Dummy stemmer for languages without stemming rules.
 */
var Stemmer = function() {
  this.stemWord = function(w) {
    return w;
  }
}
"""

    # Unicode-aware word matcher.  The (?u) flag is placed at the start of
    # the pattern: inline flags elsewhere are an error on modern Python.
    _word_re = re.compile(r'(?u)\w+')

    def __init__(self, options):
        self.options = options
        self.init(options)

    def init(self, options):
        """Initialize the language support with the options the user gave."""

    def split(self, input):
        """
        Split a sentence into words.  The default implementation splits at
        word boundaries / whitespace, which is sufficient for most languages
        except CJK ones.
        """
        return self._word_re.findall(input)

    def stem(self, word):
        """
        Stem a word (Python side) before it is registered in the search
        index.  The default implementation does nothing; override it if the
        language has stemming rules.  The stemming must be compatible with
        the JavaScript stemmer given in `js_stemmer_code`.
        """
        return word

    def word_filter(self, word):
        """
        Return True if *word* should be registered in the search index.
        This method is called after stemming.
        """
        # Reject two-character-or-shorter words whose first character falls
        # in U+3041..U+3093 (presumably the hiragana block -- TODO confirm),
        # and, for words starting with a Latin-1 character, anything too
        # short, a stop word, or a pure number.
        return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
                    (ord(word[0]) < 256 and (len(word) < 3 or
                                             word in self.stopwords or
                                             word.isdigit())))
from sphinx.search import en, ja
languages = {
'en': en.SearchEnglish,
'ja': ja.SearchJapanese,
}
class _JavaScriptIndex(object):
@ -67,39 +129,21 @@ class _JavaScriptIndex(object):
js_index = _JavaScriptIndex()
if CSTEMMER:
class Stemmer(CStemmer):
def stem(self, word):
return self(word.lower())
else:
class Stemmer(PorterStemmer):
"""
All those porter stemmer implementations look hideous.
make at least the stem method nicer.
"""
def stem(self, word):
word = word.lower()
return PorterStemmer.stem(self, word, 0, len(word) - 1)
class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document, lang):
        NodeVisitor.__init__(self, document)
        self.found_words = []
        # SearchLanguage instance used to split text into words.
        self.lang = lang

    def dispatch_visit(self, node):
        # Skip comment nodes entirely (including their children).
        if node.__class__ is comment:
            raise SkipNode
        if node.__class__ is Text:
            self.found_words.extend(self.lang.split(node.astext()))
class IndexBuilder(object):
@ -114,7 +158,6 @@ class IndexBuilder(object):
def __init__(self, env):
self.env = env
self._stemmer = Stemmer()
# filename -> title
self._titles = {}
# stemmed word -> set(filenames)
@ -123,6 +166,11 @@ class IndexBuilder(object):
self._objtypes = {}
# objtype index -> objname (localized)
self._objnames = {}
# add language-specific SearchLanguage instance
search_language = env.config.html_search_language or env.config.language
if not search_language or search_language not in languages:
search_language = 'en'
self.lang = languages[search_language](env.config.html_search_options)
def load(self, stream, format):
"""Reconstruct from frozen data."""
@ -215,17 +263,22 @@ class IndexBuilder(object):
"""Feed a doctree to the index."""
self._titles[filename] = title
visitor = WordCollector(doctree)
visitor = WordCollector(doctree, self.lang)
doctree.walk(visitor)
def add_term(word, stem=self._stemmer.stem):
def add_term(word, stem=self.lang.stem):
word = stem(word)
if len(word) < 3 or word in stopwords or word.isdigit():
return
self._mapping.setdefault(word, set()).add(filename)
if self.lang.word_filter(word):
self._mapping.setdefault(word, set()).add(filename)
for word in word_re.findall(title):
for word in self.lang.split(title):
add_term(word)
for word in visitor.found_words:
add_term(word)
def globalcontext_for_searchtool(self):
return dict(
search_language_stemming_code = self.lang.js_stemmer_code,
search_language_stop_words = jsdump.dumps(self.lang.stopwords),
)

242
sphinx/search/en.py Normal file
View File

@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-
"""
sphinx.search.en
~~~~~~~~~~~~~~~~
English search language: includes the JS porter stemmer.
:copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from sphinx.search import SearchLanguage
try:
# http://bitbucket.org/methane/porterstemmer/
from porterstemmer import Stemmer as CStemmer
CSTEMMER = True
except ImportError:
from sphinx.util.stemmer import PorterStemmer
CSTEMMER = False
english_stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())
js_porter_stemmer = """
/**
* Porter Stemmer
*/
var Stemmer = function() {
var step2list = {
ational: 'ate',
tional: 'tion',
enci: 'ence',
anci: 'ance',
izer: 'ize',
bli: 'ble',
alli: 'al',
entli: 'ent',
eli: 'e',
ousli: 'ous',
ization: 'ize',
ation: 'ate',
ator: 'ate',
alism: 'al',
iveness: 'ive',
fulness: 'ful',
ousness: 'ous',
aliti: 'al',
iviti: 'ive',
biliti: 'ble',
logi: 'log'
};
var step3list = {
icate: 'ic',
ative: '',
alize: 'al',
iciti: 'ic',
ical: 'ic',
ful: '',
ness: ''
};
var c = "[^aeiou]"; // consonant
var v = "[aeiouy]"; // vowel
var C = c + "[^aeiouy]*"; // consonant sequence
var V = v + "[aeiou]*"; // vowel sequence
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem
this.stemWord = function (w) {
var stem;
var suffix;
var firstch;
var origword = w;
if (w.length < 3)
return w;
var re;
var re2;
var re3;
var re4;
firstch = w.substr(0,1);
if (firstch == "y")
w = firstch.toUpperCase() + w.substr(1);
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w))
w = w.replace(re,"$1$2");
else if (re2.test(w))
w = w.replace(re2,"$1$2");
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w))
w = w + "e";
else if (re3.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
else if (re4.test(w))
w = w + "e";
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem))
w = stem + "i";
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step2list[suffix];
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step3list[suffix];
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem))
w = stem;
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem))
w = stem;
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
w = stem;
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y")
w = firstch.toLowerCase() + w.substr(1);
return w;
}
}
"""
class SearchEnglish(SearchLanguage):
    """English search language, using the Porter stemming algorithm."""
    lang = 'en'
    js_stemmer_code = js_porter_stemmer
    stopwords = english_stopwords

    def init(self, options):
        self.stemmer = self._make_stemmer()

    def _make_stemmer(self):
        # Prefer the C implementation when available; otherwise fall back
        # to the pure-Python Porter stemmer.  Both are wrapped so that
        # stem() lower-cases its input first.
        if CSTEMMER:
            class Stemmer(CStemmer):
                def stem(self, word):
                    return self(word.lower())
        else:
            class Stemmer(PorterStemmer):
                """All those porter stemmer implementations look hideous;
                make at least the stem method nicer.
                """
                def stem(self, word):
                    word = word.lower()
                    return PorterStemmer.stem(self, word, 0, len(word) - 1)
        return Stemmer()

    def stem(self, word):
        return self.stemmer.stem(word)

273
sphinx/search/ja.py Normal file
View File

@ -0,0 +1,273 @@
# -*- coding: utf-8 -*-
"""
sphinx.search.ja
~~~~~~~~~~~~~~~~
Japanese search language: includes routine to split words.
:copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
# Python Version of TinySegmenter
# (http://chasen.org/~taku/software/TinySegmenter/)
# TinySegmenter is super compact Japanese tokenizer.
#
# TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
# Python Version was developed by xnights <programming.magic(at)gmail.com>.
# For details, see http://programming-magic.com/?id=170
import os
import re
import sys
try:
import MeCab
native_module = True
except ImportError:
native_module = False
from sphinx.search import SearchLanguage
class MecabBinder(object):
    """Word splitter driving the MeCab morphological analyzer.

    Uses the native Python binding when the ``MeCab`` module is importable,
    otherwise loads the MeCab shared library through ctypes.

    Options (see :confval:`html_search_options`):

    * ``dic_enc`` -- encoding of the MeCab dictionary (default ``utf-8``)
    * ``dict``    -- path of the dictionary to use
    * ``lib``     -- name or path of the MeCab library for the ctypes path
    """

    def __init__(self, options):
        self.ctypes_libmecab = None
        self.ctypes_mecab = None
        if not native_module:
            self.init_ctypes(options)
        else:
            self.init_native(options)
        self.dict_encode = options.get('dic_enc', 'utf-8')

    def split(self, input):
        """Split *input* into words using MeCab's wakati output."""
        input2 = input.encode(self.dict_encode)
        if native_module:
            result = self.native.parse(input2)
        else:
            # Bug fix: pass the *encoded* bytes (input2); the original
            # passed the unencoded unicode string to the C function.
            result = self.ctypes_libmecab.mecab_sparse_tostr(
                self.ctypes_mecab, input2)
        return result.decode(self.dict_encode).split(' ')

    def init_native(self, options):
        param = '-Owakati'
        dic = options.get('dict')  # renamed: don't shadow the dict builtin
        if dic:
            param += ' -d %s' % dic
        self.native = MeCab.Tagger(param)

    def init_ctypes(self, options):
        import ctypes.util
        lib = options.get('lib')
        if lib is None:
            # No explicit library given; search the standard locations.
            if sys.platform.startswith('win'):
                libname = 'libmecab.dll'
            else:
                libname = 'mecab'
            libpath = ctypes.util.find_library(libname)
        elif os.path.basename(lib) == lib:
            # A bare library name: let ctypes locate it.
            libpath = ctypes.util.find_library(lib)
        else:
            # An explicit path: use it if it exists.
            libpath = None
            if os.path.exists(lib):
                libpath = lib
        if libpath is None:
            raise RuntimeError('MeCab dynamic library is not available')

        param = 'mecab -Owakati'
        dic = options.get('dict')
        if dic:
            param += ' -d %s' % dic

        self.ctypes_libmecab = ctypes.CDLL(libpath)
        self.ctypes_libmecab.mecab_sparse_tostr.restype = ctypes.c_char_p
        # Bug fix: the handle lives in self.ctypes_libmecab; the original
        # referenced a nonexistent self.libmecab (AttributeError).
        self.ctypes_mecab = self.ctypes_libmecab.mecab_new2(param)

    def __del__(self):
        # Guard both handles: mecab_new2 may have failed (or never run),
        # and mecab_destroy(None) must not be called.
        if self.ctypes_libmecab and self.ctypes_mecab:
            self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
class TinySegmenter(object):
patterns_ = dict([(re.compile(pattern), value) for pattern, value in {
u'[一二三四五六七八九十百千万億兆]': u'M',
u'[一-龠々〆ヵヶ]': u'H',
u'[ぁ-ん]': u'I',
u'[ァ-ヴーア-ン゙ー]': u'K',
u'[a-zA-Z--]': u'A',
u'[0-9-]': u'N',
}.iteritems()])
BIAS__ = -332
BC1__ = {u'HH':6,u'II':2461,u'KH':406,u'OH':-1378}
BC2__ = {u'AA':-3267,u'AI':2744,u'AN':-878,u'HH':-4070,u'HM':-1711,u'HN':4012,u'HO':3761,u'IA':1327,u'IH':-1184,u'II':-1332,u'IK':1721,u'IO':5492,u'KI':3831,u'KK':-8741,u'MH':-3132,u'MK':3334,u'OO':-2920}
BC3__ = {u'HH':996,u'HI':626,u'HK':-721,u'HN':-1307,u'HO':-836,u'IH':-301,u'KK':2762,u'MK':1079,u'MM':4034,u'OA':-1652,u'OH':266}
BP1__ = {u'BB':295,u'OB':304,u'OO':-125,u'UB':352}
BP2__ = {u'BO':60,u'OO':-1762}
BQ1__ = {u'BHH':1150,u'BHM':1521,u'BII':-1158,u'BIM':886,u'BMH':1208,u'BNH':449,u'BOH':-91,u'BOO':-2597,u'OHI':451,u'OIH':-296,u'OKA':1851,u'OKH':-1020,u'OKK':904,u'OOO':2965}
BQ2__ = {u'BHH':118,u'BHI':-1159,u'BHM':466,u'BIH':-919,u'BKK':-1720,u'BKO':864,u'OHH':-1139,u'OHM':-181,u'OIH':153,u'UHI':-1146}
BQ3__ = {u'BHH':-792,u'BHI':2664,u'BII':-299,u'BKI':419,u'BMH':937,u'BMM':8335,u'BNN':998,u'BOH':775,u'OHH':2174,u'OHM':439,u'OII':280,u'OKH':1798,u'OKI':-793,u'OKO':-2242,u'OMH':-2402,u'OOO':11699}
BQ4__ = {u'BHH':-3895,u'BIH':3761,u'BII':-4654,u'BIK':1348,u'BKK':-1806,u'BMI':-3385,u'BOO':-12396,u'OAH':926,u'OHH':266,u'OHK':-2036,u'ONN':-973}
BW1__ = {u',と':660,u',同':727,u'B1あ':1404,u'B1同':542,u'、と':660,u'、同':727,u'」と':1682,u'あっ':1505,u'いう':1743,u'いっ':-2055,u'いる':672,u'うし':-4817,u'うん':665,u'から':3472,u'がら':600,u'こう':-790,u'こと':2083,u'こん':-1262,u'さら':-4143,u'さん':4573,u'した':2641,u'して':1104,u'すで':-3399,u'そこ':1977,u'それ':-871,u'たち':1122,u'ため':601,u'った':3463,u'つい':-802,u'てい':805,u'てき':1249,u'でき':1127,u'です':3445,u'では':844,u'とい':-4915,u'とみ':1922,u'どこ':3887,u'ない':5713,u'なっ':3015,u'など':7379,u'なん':-1113,u'にし':2468,u'には':1498,u'にも':1671,u'に対':-912,u'の一':-501,u'の中':741,u'ませ':2448,u'まで':1711,u'まま':2600,u'まる':-2155,u'やむ':-1947,u'よっ':-2565,u'れた':2369,u'れで':-913,u'をし':1860,u'を見':731,u'亡く':-1886,u'京都':2558,u'取り':-2784,u'大き':-2604,u'大阪':1497,u'平方':-2314,u'引き':-1336,u'日本':-195,u'本当':-2423,u'毎日':-2113,u'目指':-724,u'B1あ':1404,u'B1同':542,u'」と':1682}
BW2__ = {u'..':-11822,u'11':-669,u'――':-5730,u'':-13175,u'いう':-1609,u'うか':2490,u'かし':-1350,u'かも':-602,u'から':-7194,u'かれ':4612,u'がい':853,u'がら':-3198,u'きた':1941,u'くな':-1597,u'こと':-8392,u'この':-4193,u'させ':4533,u'され':13168,u'さん':-3977,u'しい':-1819,u'しか':-545,u'した':5078,u'して':972,u'しな':939,u'その':-3744,u'たい':-1253,u'たた':-662,u'ただ':-3857,u'たち':-786,u'たと':1224,u'たは':-939,u'った':4589,u'って':1647,u'っと':-2094,u'てい':6144,u'てき':3640,u'てく':2551,u'ては':-3110,u'ても':-3065,u'でい':2666,u'でき':-1528,u'でし':-3828,u'です':-4761,u'でも':-4203,u'とい':1890,u'とこ':-1746,u'とと':-2279,u'との':720,u'とみ':5168,u'とも':-3941,u'ない':-2488,u'なが':-1313,u'など':-6509,u'なの':2614,u'なん':3099,u'にお':-1615,u'にし':2748,u'にな':2454,u'によ':-7236,u'に対':-14943,u'に従':-4688,u'に関':-11388,u'のか':2093,u'ので':-7059,u'のに':-6041,u'のの':-6125,u'はい':1073,u'はが':-1033,u'はず':-2532,u'ばれ':1813,u'まし':-1316,u'まで':-6621,u'まれ':5409,u'めて':-3153,u'もい':2230,u'もの':-10713,u'らか':-944,u'らし':-1611,u'らに':-1897,u'りし':651,u'りま':1620,u'れた':4270,u'れて':849,u'れば':4114,u'ろう':6067,u'われ':7901,u'を通':-11877,u'んだ':728,u'んな':-4115,u'一人':602,u'一方':-1375,u'一日':970,u'一部':-1051,u'上が':-4479,u'会社':-1116,u'出て':2163,u'分の':-7758,u'同党':970,u'同日':-913,u'大阪':-2471,u'委員':-1250,u'少な':-1050,u'年度':-8669,u'年間':-1626,u'府県':-2363,u'手権':-1982,u'新聞':-4066,u'日新':-722,u'日本':-7068,u'日米':3372,u'曜日':-601,u'朝鮮':-2355,u'本人':-2697,u'東京':-1543,u'然と':-1384,u'社会':-1276,u'立て':-990,u'第に':-1612,u'米国':-4268,u'':-669}
BW3__ = {u'あた':-2194,u'あり':719,u'ある':3846,u'い.':-1185,u'い。':-1185,u'いい':5308,u'いえ':2079,u'いく':3029,u'いた':2056,u'いっ':1883,u'いる':5600,u'いわ':1527,u'うち':1117,u'うと':4798,u'えと':1454,u'か.':2857,u'か。':2857,u'かけ':-743,u'かっ':-4098,u'かに':-669,u'から':6520,u'かり':-2670,u'が,':1816,u'が、':1816,u'がき':-4855,u'がけ':-1127,u'がっ':-913,u'がら':-4977,u'がり':-2064,u'きた':1645,u'けど':1374,u'こと':7397,u'この':1542,u'ころ':-2757,u'さい':-714,u'さを':976,u'し,':1557,u'し、':1557,u'しい':-3714,u'した':3562,u'して':1449,u'しな':2608,u'しま':1200,u'す.':-1310,u'す。':-1310,u'する':6521,u'ず,':3426,u'ず、':3426,u'ずに':841,u'そう':428,u'た.':8875,u'た。':8875,u'たい':-594,u'たの':812,u'たり':-1183,u'たる':-853,u'だ.':4098,u'だ。':4098,u'だっ':1004,u'った':-4748,u'って':300,u'てい':6240,u'てお':855,u'ても':302,u'です':1437,u'でに':-1482,u'では':2295,u'とう':-1387,u'とし':2266,u'との':541,u'とも':-3543,u'どう':4664,u'ない':1796,u'なく':-903,u'など':2135,u'に,':-1021,u'に、':-1021,u'にし':1771,u'にな':1906,u'には':2644,u'の,':-724,u'の、':-724,u'の子':-1000,u'は,':1337,u'は、':1337,u'べき':2181,u'まし':1113,u'ます':6943,u'まっ':-1549,u'まで':6154,u'まれ':-793,u'らし':1479,u'られ':6820,u'るる':3818,u'れ,':854,u'れ、':854,u'れた':1850,u'れて':1375,u'れば':-3246,u'れる':1091,u'われ':-605,u'んだ':606,u'んで':798,u'カ月':990,u'会議':860,u'入り':1232,u'大会':2217,u'始め':1681,u'':965,u'新聞':-5055,u'日,':974,u'日、':974,u'社会':2024,u'カ月':990}
TC1__ = {u'AAA':1093,u'HHH':1029,u'HHM':580,u'HII':998,u'HOH':-390,u'HOM':-331,u'IHI':1169,u'IOH':-142,u'IOI':-1015,u'IOM':467,u'MMH':187,u'OOI':-1832}
TC2__ = {u'HHO':2088,u'HII':-1023,u'HMM':-1154,u'IHI':-1965,u'KKH':703,u'OII':-2649}
TC3__ = {u'AAA':-294,u'HHH':346,u'HHI':-341,u'HII':-1088,u'HIK':731,u'HOH':-1486,u'IHH':128,u'IHI':-3041,u'IHO':-1935,u'IIH':-825,u'IIM':-1035,u'IOI':-542,u'KHH':-1216,u'KKA':491,u'KKH':-1217,u'KOK':-1009,u'MHH':-2694,u'MHM':-457,u'MHO':123,u'MMH':-471,u'NNH':-1689,u'NNO':662,u'OHO':-3393}
TC4__ = {u'HHH':-203,u'HHI':1344,u'HHK':365,u'HHM':-122,u'HHN':182,u'HHO':669,u'HIH':804,u'HII':679,u'HOH':446,u'IHH':695,u'IHO':-2324,u'IIH':321,u'III':1497,u'IIO':656,u'IOO':54,u'KAK':4845,u'KKA':3386,u'KKK':3065,u'MHH':-405,u'MHI':201,u'MMH':-241,u'MMM':661,u'MOM':841}
TQ1__ = {u'BHHH':-227,u'BHHI':316,u'BHIH':-132,u'BIHH':60,u'BIII':1595,u'BNHH':-744,u'BOHH':225,u'BOOO':-908,u'OAKK':482,u'OHHH':281,u'OHIH':249,u'OIHI':200,u'OIIH':-68}
TQ2__ = {u'BIHH':-1401,u'BIII':-1033,u'BKAK':-543,u'BOOO':-5591}
TQ3__ = {u'BHHH':478,u'BHHM':-1073,u'BHIH':222,u'BHII':-504,u'BIIH':-116,u'BIII':-105,u'BMHI':-863,u'BMHM':-464,u'BOMH':620,u'OHHH':346,u'OHHI':1729,u'OHII':997,u'OHMH':481,u'OIHH':623,u'OIIH':1344,u'OKAK':2792,u'OKHH':587,u'OKKA':679,u'OOHH':110,u'OOII':-685}
TQ4__ = {u'BHHH':-721,u'BHHM':-3604,u'BHII':-966,u'BIIH':-607,u'BIII':-2181,u'OAAA':-2763,u'OAKK':180,u'OHHH':-294,u'OHHI':2446,u'OHHO':480,u'OHIH':-1573,u'OIHH':1935,u'OIHI':-493,u'OIIH':626,u'OIII':-4007,u'OKAK':-8156}
TW1__ = {u'につい':-4681,u'東京都':2026}
TW2__ = {u'ある程':-2049,u'いった':-1256,u'ころが':-2434,u'しょう':3873,u'その後':-4430,u'だって':-1049,u'ていた':1833,u'として':-4657,u'ともに':-4517,u'もので':1882,u'一気に':-792,u'初めて':-1512,u'同時に':-8097,u'大きな':-1255,u'対して':-2721,u'社会党':-3216}
TW3__ = {u'いただ':-1734,u'してい':1314,u'として':-4314,u'につい':-5483,u'にとっ':-5989,u'に当た':-6247,u'ので,':-727,u'ので、':-727,u'のもの':-600,u'れから':-3752,u'十二月':-2287}
TW4__ = {u'いう.':8576,u'いう。':8576,u'からな':-2348,u'してい':2958,u'たが,':1516,u'たが、':1516,u'ている':1538,u'という':1349,u'ました':5543,u'ません':1097,u'ようと':-4258,u'よると':5865}
UC1__ = {u'A':484,u'K':93,u'M':645,u'O':-505}
UC2__ = {u'A':819,u'H':1059,u'I':409,u'M':3987,u'N':5775,u'O':646}
UC3__ = {u'A':-1370,u'I':2311}
UC4__ = {u'A':-2643,u'H':1809,u'I':-1032,u'K':-3450,u'M':3565,u'N':3876,u'O':6646}
UC5__ = {u'H':313,u'I':-1238,u'K':-799,u'M':539,u'O':-831}
UC6__ = {u'H':-506,u'I':-253,u'K':87,u'M':247,u'O':-387}
UP1__ = {u'O':-214}
UP2__ = {u'B':69,u'O':935}
UP3__ = {u'B':189}
UQ1__ = {u'BH':21,u'BI':-12,u'BK':-99,u'BN':142,u'BO':-56,u'OH':-95,u'OI':477,u'OK':410,u'OO':-2422}
UQ2__ = {u'BH':216,u'BI':113,u'OK':1759}
UQ3__ = {u'BA':-479,u'BH':42,u'BI':1913,u'BK':-7198,u'BM':3160,u'BN':6427,u'BO':14761,u'OI':-827,u'ON':-3212}
UW1__ = {u',':156,u'':156,u'':-463,u'':-941,u'':-127,u'':-553,u'':121,u'':505,u'':-201,u'':-547,u'':-123,u'':-789,u'':-185,u'':-847,u'':-466,u'':-470,u'':182,u'':-292,u'':208,u'':169,u'':-446,u'':-137,u'':-135,u'':-402,u'':-268,u'':-912,u'':871,u'':-460,u'':561,u'':729,u'':-411,u'':-141,u'':361,u'':-408,u'':-386,u'':-718,u'':-463,u'':-135}
UW2__ = {u',':-829,u'':-829,u'':892,u'':-645,u'':3145,u'':-538,u'':505,u'':134,u'':-502,u'':1454,u'':-856,u'':-412,u'':1141,u'':878,u'':540,u'':1529,u'':-675,u'':300,u'':-1011,u'':188,u'':1837,u'':-949,u'':-291,u'':-268,u'':-981,u'':1273,u'':1063,u'':-1764,u'':130,u'':-409,u'':-1273,u'':1261,u'':600,u'':-1263,u'':-402,u'':1639,u'':-579,u'':-694,u'':571,u'':-2516,u'':2095,u'':-587,u'':306,u'':568,u'':831,u'':-758,u'':-2150,u'':-302,u'':-968,u'':-861,u'':492,u'':-123,u'':978,u'':362,u'':548,u'':-3025,u'':-1566,u'':-3414,u'':-422,u'':-1769,u'':-865,u'':-483,u'':-1519,u'':760,u'':1023,u'':-2009,u'':-813,u'':-1060,u'':1067,u'':-1519,u'':-1033,u'':1522,u'':-1355,u'':-1682,u'':-1815,u'':-1462,u'':-630,u'':-1843,u'':-1650,u'':-931,u'':-665,u'':-2378,u'':-180,u'':-1740,u'':752,u'':529,u'':-1584,u'':-242,u'':-1165,u'':-763,u'':810,u'':509,u'':-1353,u'':838,u'西':-744,u'':-3874,u'調':1010,u'':1198,u'':3041,u'':1758,u'':-1257,u'':-645,u'':3145,u'':831,u'':-587,u'':306,u'':568}
UW3__ = {u',':4889,u'1':-800,u'':-1723,u'':4889,u'':-2311,u'':5827,u'':2670,u'':-3573,u'':-2696,u'':1006,u'':2342,u'':1983,u'':-4864,u'':-1163,u'':3271,u'':1004,u'':388,u'':401,u'':-3552,u'':-3116,u'':-1058,u'':-395,u'':584,u'':3685,u'':-5228,u'':842,u'':-521,u'':-1444,u'':-1081,u'':6167,u'':2318,u'':1691,u'':-899,u'':-2788,u'':2745,u'':4056,u'':4555,u'':-2171,u'':-1798,u'':1199,u'':-5516,u'':-4384,u'':-120,u'':1205,u'':2323,u'':-788,u'':-202,u'':727,u'':649,u'':5905,u'':2773,u'':-1207,u'':6620,u'':-518,u'':551,u'':1319,u'':874,u'':-1350,u'':521,u'':1109,u'':1591,u'':2201,u'':278,u'':-3794,u'':-1619,u'':-1759,u'':-2087,u'':3815,u'':653,u'':-758,u'':-1193,u'':974,u'':2742,u'':792,u'':1889,u'':-1368,u'':811,u'':4265,u'':-361,u'':-2439,u'':4858,u'':3593,u'':1574,u'':-3030,u'':755,u'':-1880,u'':5807,u'':3095,u'':457,u'':2475,u'':1129,u'':2286,u'':4437,u'':365,u'':-949,u'':-1872,u'':1327,u'':-1038,u'':4646,u'':-2309,u'':-783,u'':-1006,u'':483,u'':1233,u'':3588,u'':-241,u'':3906,u'':-837,u'':4513,u'':642,u'':1389,u'':1219,u'':-241,u'':2016,u'':-1356,u'':-423,u'':-1008,u'':1078,u'':-513,u'':-3102,u'':1155,u'':3197,u'':-1804,u'':2416,u'':-1030,u'':1605,u'':1452,u'':-2352,u'':-3885,u'':1905,u'':-1291,u'':1822,u'':-488,u'':-3973,u'':-2013,u'':-1479,u'':3222,u'':-1489,u'':1764,u'':2099,u'':5792,u'':-661,u'':-1248,u'':-951,u'':-937,u'':4125,u'':360,u'':3094,u'':364,u'':-805,u'':5156,u'':2438,u'':484,u'':2613,u'':-1694,u'':-1073,u'':1868,u'':-495,u'':979,u'':461,u'':-3850,u'':-273,u'':914,u'':1215,u'':7313,u'':-1835,u'':792,u'':6293,u'':-1528,u'':4231,u'':401,u'':-960,u'':1201,u'':7767,u'':3066,u'':3663,u'':1384,u'':-4229,u'':1163,u'':1255,u'':6457,u'':725,u'':-2869,u'':785,u'':1044,u'調':-562,u'':-733,u'':1777,u'':1835,u'':1375,u'':-1504,u'':-1136,u'':-681,u'':1026,u'':4404,u'':1200,u'':2163,u'':421,u'':-1432,u'':1302,u'':-1282,u'':2009,u'':-1045,u'':2066,u'':1620,u'':-800,u'':2670,u'':-3794,u'':-1350,u'':551,u'グ':1319,u'':874,u'':521,u'':1109,u'':1591,u'':2201,u'':278}
UW4__ = {u',':3930,u'.':3508,u'':-4841,u'':3930,u'':3508,u'':4999,u'':1895,u'':3798,u'':-5156,u'':4752,u'':-3435,u'':-640,u'':-2514,u'':2405,u'':530,u'':6006,u'':-4482,u'':-3821,u'':-3788,u'':-4376,u'':-4734,u'':2255,u'':1979,u'':2864,u'':-843,u'':-2506,u'':-731,u'':1251,u'':181,u'':4091,u'':5034,u'':5408,u'':-3654,u'':-5882,u'':-1659,u'':3994,u'':7410,u'':4547,u'':5433,u'':6499,u'':1853,u'':1413,u'':7396,u'':8578,u'':1940,u'':4249,u'':-4134,u'':1345,u'':6665,u'':-744,u'':1464,u'':1051,u'':-2082,u'':-882,u'':-5046,u'':4169,u'':-2666,u'':2795,u'':-1544,u'':3351,u'':-2922,u'':-9726,u'':-14896,u'':-2613,u'':-4570,u'':-1783,u'':13150,u'':-2352,u'':2145,u'':1789,u'':1287,u'':-724,u'':-403,u'':-1635,u'':-881,u'':-541,u'':-856,u'':-3637,u'':-4371,u'':-11870,u'':-2069,u'':2210,u'':782,u'':-190,u'':-1768,u'':1036,u'':544,u'':950,u'':-1286,u'':530,u'':4292,u'':601,u'':-2006,u'':-1212,u'':584,u'':788,u'':1347,u'':1623,u'':3879,u'':-302,u'':-740,u'':-2715,u'':776,u'':4517,u'':1013,u'':1555,u'':-1834,u'':-681,u'':-910,u'':-851,u'':1500,u'':-619,u'':-1200,u'':866,u'':-1410,u'':-2094,u'':-1413,u'':1067,u'':571,u'':-4802,u'':-1397,u'':-1057,u'':-809,u'':1910,u'':-1328,u'':-1500,u'':-2056,u'':-2667,u'':2771,u'':374,u'':-4556,u'':456,u'':553,u'':916,u'':-1566,u'':856,u'':787,u'':2182,u'':704,u'':522,u'':-856,u'':1798,u'':1829,u'':845,u'':-9066,u'':-485,u'':-442,u'':-360,u'':-1043,u'':5388,u'':-2716,u'':-910,u'':-939,u'':-543,u'':-735,u'':672,u'':-1267,u'':-1286,u'':-1101,u'':-2900,u'':1826,u'':2586,u'':922,u'':-3485,u'':2997,u'':-867,u'':-2112,u'':788,u'':2937,u'':786,u'':2171,u'':1146,u'':-1169,u'':940,u'':-994,u'':749,u'':2145,u'':-730,u'':-852,u'':-792,u'':792,u'':-1184,u'':-244,u'':-1000,u'':730,u'':-1481,u'':1158,u'':-1433,u'':-3370,u'':929,u'':-1291,u'':2596,u'':-4866,u'':1192,u'':-1100,u'':-2213,u'':357,u'':-2344,u'':-2297,u'':-2604,u'':-878,u'':-1659,u'':-792,u'':-1984,u'':1749,u'':2120,u'':1895,u'':3798,u'':-4371,u'':-724,u'':-11870,u'':2145,u'':1789,u'':1287,u'':-403,u'':-1
635,u'':-881,u'':-541,u'':-856,u'':-3637}
UW5__ = {u',':465,u'.':-299,u'1':-514,u'E2':-32768,u']':-2762,u'':465,u'':-299,u'':363,u'':1655,u'':331,u'':-503,u'':1199,u'':527,u'':647,u'':-421,u'':1624,u'':1971,u'':312,u'':-983,u'':-1537,u'':-1371,u'':-852,u'':-1186,u'':1093,u'':52,u'':921,u'':-18,u'':-850,u'':-127,u'':1682,u'':-787,u'':-1224,u'':-635,u'':-578,u'':1001,u'':502,u'':865,u'':3350,u'':854,u'':-208,u'':429,u'':504,u'':419,u'':-1264,u'':327,u'':241,u'':451,u'':-343,u'':-871,u'':722,u'':-1153,u'':-654,u'':3519,u'':-901,u'':848,u'':2104,u'':-1296,u'':-548,u'':1785,u'':-1304,u'':-2991,u'':921,u'':1763,u'':872,u'':-814,u'':1618,u'':-1682,u'':218,u'':-4353,u'':932,u'':1356,u'':-1508,u'':-1347,u'':240,u'':-3912,u'':-3149,u'':1319,u'':-1052,u'':-4003,u'':-997,u'':-278,u'':-813,u'':1955,u'':-2233,u'':663,u'':-1073,u'':1219,u'':-1018,u'':-368,u'':786,u'':1191,u'':2368,u'':-689,u'':-514,u'':-32768,u'':363,u'':241,u'':451,u'':-343}
UW6__ = {u',':227,u'.':808,u'1':-270,u'E1':306,u'':227,u'':808,u'':-307,u'':189,u'':241,u'':-73,u'':-121,u'':-200,u'':1782,u'':383,u'':-428,u'':573,u'':-1014,u'':101,u'':-105,u'':-253,u'':-149,u'':-417,u'':-236,u'':-206,u'':187,u'':-135,u'':195,u'':-673,u'':-496,u'':-277,u'':201,u'':-800,u'':624,u'':302,u'':1792,u'':-1212,u'':798,u'':-960,u'':887,u'':-695,u'':535,u'':-697,u'':753,u'':-507,u'':974,u'':-822,u'':1811,u'':463,u'':1082,u'':-270,u'':306,u'':-673,u'':-496}
# ctype_
def ctype_(self, char):
for pattern, value in self.patterns_.iteritems():
if pattern.match(char):
return value
return u'O'
# ts_
def ts_(self, dict, key):
if key in dict:
return dict[key]
return 0
# segment
def split(self, input):
    """Split the Japanese text *input* into a list of word strings.

    Port of Taku Kudo's TinySegmenter: walks the characters with a
    sliding window and decides, position by position, whether a word
    boundary lies before ``seg[i]``.  The decision is the sign of a
    linear score built from the pretrained unigram/bigram/trigram
    word-, character-type- and previous-decision feature tables stored
    on the class (``UP*__``, ``BW*__``, ``UQ*__``, ...).
    """
    if not input:
        return []
    result = []
    # B1-B3 / E1-E3 are sentinel markers padding both ends of the text;
    # 'O' is the neutral character type for the sentinels.
    seg = [u'B3', u'B2', u'B1']
    ctype = [u'O', u'O', u'O']
    for t in input:
        seg.append(t)
        ctype.append(self.ctype_(t))
    seg.append(u'E1')
    seg.append(u'E2')
    seg.append(u'E3')
    ctype.append(u'O')
    ctype.append(u'O')
    ctype.append(u'O')
    word = seg[3]
    # p1..p3 hold the three previous boundary decisions:
    # U = unknown (start), B = boundary placed, O = no boundary.
    p1 = u'U'
    p2 = u'U'
    p3 = u'U'
    for i in range(4, len(seg) - 3):
        score = self.BIAS__
        w1 = seg[i-3]
        w2 = seg[i-2]
        w3 = seg[i-1]
        w4 = seg[i]
        w5 = seg[i+1]
        w6 = seg[i+2]
        c1 = ctype[i-3]
        c2 = ctype[i-2]
        c3 = ctype[i-1]
        c4 = ctype[i]
        c5 = ctype[i+1]
        c6 = ctype[i+2]
        score += self.ts_(self.UP1__, p1)
        score += self.ts_(self.UP2__, p2)
        score += self.ts_(self.UP3__, p3)
        score += self.ts_(self.BP1__, p1 + p2)
        score += self.ts_(self.BP2__, p2 + p3)
        score += self.ts_(self.UW1__, w1)
        score += self.ts_(self.UW2__, w2)
        score += self.ts_(self.UW3__, w3)
        score += self.ts_(self.UW4__, w4)
        score += self.ts_(self.UW5__, w5)
        score += self.ts_(self.UW6__, w6)
        score += self.ts_(self.BW1__, w2 + w3)
        score += self.ts_(self.BW2__, w3 + w4)
        score += self.ts_(self.BW3__, w4 + w5)
        score += self.ts_(self.TW1__, w1 + w2 + w3)
        score += self.ts_(self.TW2__, w2 + w3 + w4)
        score += self.ts_(self.TW3__, w3 + w4 + w5)
        score += self.ts_(self.TW4__, w4 + w5 + w6)
        score += self.ts_(self.UC1__, c1)
        score += self.ts_(self.UC2__, c2)
        score += self.ts_(self.UC3__, c3)
        score += self.ts_(self.UC4__, c4)
        score += self.ts_(self.UC5__, c5)
        score += self.ts_(self.UC6__, c6)
        score += self.ts_(self.BC1__, c2 + c3)
        score += self.ts_(self.BC2__, c3 + c4)
        score += self.ts_(self.BC3__, c4 + c5)
        score += self.ts_(self.TC1__, c1 + c2 + c3)
        score += self.ts_(self.TC2__, c2 + c3 + c4)
        score += self.ts_(self.TC3__, c3 + c4 + c5)
        score += self.ts_(self.TC4__, c4 + c5 + c6)
        # score += self.ts_(self.TC5__, c4 + c5 + c6)
        score += self.ts_(self.UQ1__, p1 + c1)
        score += self.ts_(self.UQ2__, p2 + c2)
        # bug fix: the p3+c3 feature belongs to the UQ3 table; the
        # original line looked it up in UQ1__ (copy-paste typo), cf. the
        # UQ1/UQ2 lines above and TinySegmenter's reference model.
        score += self.ts_(self.UQ3__, p3 + c3)
        score += self.ts_(self.BQ1__, p2 + c2 + c3)
        score += self.ts_(self.BQ2__, p2 + c3 + c4)
        score += self.ts_(self.BQ3__, p3 + c2 + c3)
        score += self.ts_(self.BQ4__, p3 + c3 + c4)
        score += self.ts_(self.TQ1__, p2 + c1 + c2 + c3)
        score += self.ts_(self.TQ2__, p2 + c2 + c3 + c4)
        score += self.ts_(self.TQ3__, p3 + c1 + c2 + c3)
        score += self.ts_(self.TQ4__, p3 + c2 + c3 + c4)
        p = u'O'
        if score > 0:
            # positive score: a word boundary precedes seg[i]
            result.append(word)
            word = u''
            p = u'B'
        p1 = p2
        p2 = p3
        p3 = p
        word += seg[i]
    result.append(word)
    return result
class SearchJapanese(SearchLanguage):
    """
    Japanese search implementation: uses no stemmer, but word splitting is
    quite complicated.
    """
    lang = 'ja'

    def init(self, options):
        """Select the word-splitter backend from the ``type`` option.

        Supported values: ``'mecab'`` (MeCab-based splitter, configured
        via *options*) and ``'default'`` (pure-Python TinySegmenter).
        Raises ValueError for anything else.
        """
        # local renamed from ``type`` to avoid shadowing the builtin
        splitter_type = options.get('type', 'default')
        if splitter_type not in ('mecab', 'default'):
            raise ValueError(("Japanese tokenizer's type should be 'mecab'"
                              " or 'default'"))
        # NOTE(review): nothing visible here reads ``libmecab``; looks like
        # a leftover attribute -- confirm against MecabBinder before removing.
        self.libmecab = None
        if splitter_type == 'mecab':
            self.splitter = MecabBinder(options)
        else:
            self.splitter = TinySegmenter()

    def split(self, input):
        """Delegate word splitting to the selected splitter backend."""
        return self.splitter.split(input)

    def word_filter(self, stemmed_word):
        """Index only words longer than one character."""
        return len(stemmed_word) > 1

View File

@ -1,6 +1,6 @@
/*
* searchtools.js
* ~~~~~~~~~~~~~~
* searchtools.js_t
* ~~~~~~~~~~~~~~~~
*
* Sphinx JavaScript utilties for the full-text search.
*
@ -36,188 +36,7 @@ jQuery.makeSearchSummary = function(text, keywords, hlwords) {
return rv;
}
/**
* Porter Stemmer
*/
var PorterStemmer = function() {

  // Step 2: map derivational suffixes to their shorter equivalents.
  var step2list = {
    ational: 'ate',
    tional: 'tion',
    enci: 'ence',
    anci: 'ance',
    izer: 'ize',
    bli: 'ble',
    alli: 'al',
    entli: 'ent',
    eli: 'e',
    ousli: 'ous',
    ization: 'ize',
    ation: 'ate',
    ator: 'ate',
    alism: 'al',
    iveness: 'ive',
    fulness: 'ful',
    ousness: 'ous',
    aliti: 'al',
    iviti: 'ive',
    biliti: 'ble',
    logi: 'log'
  };

  // Step 3: strip or shorten the remaining derivational endings.
  var step3list = {
    icate: 'ic',
    ative: '',
    alize: 'al',
    iciti: 'ic',
    ical: 'ic',
    ful: '',
    ness: ''
  };

  // Regex fragments for the Porter consonant/vowel "measure" patterns.
  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
  var s_v = "^(" + C + ")?" + v;                           // vowel in stem

  // Return the Porter stem of the word w; words shorter than 3
  // characters are returned unchanged.
  this.stemWord = function (w) {
    var stem;
    var suffix;
    var firstch;
    var origword = w;

    if (w.length < 3)
      return w;

    var re;
    var re2;
    var re3;
    var re4;

    // A leading y is treated as a consonant: temporarily uppercase it
    // so the vowel classes above do not match it.
    firstch = w.substr(0,1);
    if (firstch == "y")
      w = firstch.toUpperCase() + w.substr(1);

    // Step 1a: plural endings (-sses, -ies, -s)
    re = /^(.+?)(ss|i)es$/;
    re2 = /^(.+?)([^s])s$/;

    if (re.test(w))
      w = w.replace(re,"$1$2");
    else if (re2.test(w))
      w = w.replace(re2,"$1$2");

    // Step 1b: -eed, -ed, -ing
    re = /^(.+?)eed$/;
    re2 = /^(.+?)(ed|ing)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      re = new RegExp(mgr0);
      if (re.test(fp[1])) {
        re = /.$/;
        w = w.replace(re,"");
      }
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1];
      re2 = new RegExp(s_v);
      if (re2.test(stem)) {
        w = stem;
        re2 = /(at|bl|iz)$/;
        re3 = new RegExp("([^aeiouylsz])\\1$");   // doubled consonant
        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");  // cvc pattern
        if (re2.test(w))
          w = w + "e";
        else if (re3.test(w)) {
          re = /.$/;
          w = w.replace(re,"");
        }
        else if (re4.test(w))
          w = w + "e";
      }
    }

    // Step 1c: terminal y -> i when the stem contains a vowel
    re = /^(.+?)y$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(s_v);
      if (re.test(stem))
        w = stem + "i";
    }

    // Step 2: replace double suffixes (see step2list) when m>0
    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step2list[suffix];
    }

    // Step 3: replace -ic-, -full, -ness etc. (see step3list) when m>0
    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step3list[suffix];
    }

    // Step 4: drop -ant, -ence etc. when m>1
    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    re2 = /^(.+?)(s|t)(ion)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      if (re.test(stem))
        w = stem;
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1] + fp[2];
      re2 = new RegExp(mgr1);
      if (re2.test(stem))
        w = stem;
    }

    // Step 5: remove a final -e when m>1 (or m=1 and no cvc ending),
    // and reduce a final -ll to -l when m>1.
    re = /^(.+?)e$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      re2 = new RegExp(meq1);
      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
        w = stem;
    }
    re = /ll$/;
    re2 = new RegExp(mgr1);
    if (re.test(w) && re2.test(w)) {
      re = /.$/;
      w = w.replace(re,"");
    }

    // and turn initial Y back to y
    if (firstch == "y")
      w = firstch.toLowerCase() + w.substr(1);
    return w;
  }
}
{{ search_language_stemming_code|safe }}
/**
* Search Module
@ -300,14 +119,10 @@ var Search = {
},
query : function(query) {
var stopwords = ['and', 'then', 'into', 'it', 'as', 'are', 'in',
'if', 'for', 'no', 'there', 'their', 'was', 'is',
'be', 'to', 'that', 'but', 'they', 'not', 'such',
'with', 'by', 'a', 'on', 'these', 'of', 'will',
'this', 'near', 'the', 'or', 'at'];
var stopwords = {{ search_language_stop_words }};
// stem the searchterms and add them to the correct list
var stemmer = new PorterStemmer();
// Stem the searchterms and add them to the correct list
var stemmer = new Stemmer();
var searchterms = [];
var excluded = [];
var hlterms = [];