From 6bda4586bd55ff751bd8ad034d539d05ae353a53 Mon Sep 17 00:00:00 2001 From: shibukawa yoshiki Date: Wed, 22 Jan 2014 02:12:04 -0800 Subject: [PATCH] Add development memo about stemming JS code, acceleration tips about stemming, small bug fix --- doc/config.rst | 9 +++++++++ doc/devguide.rst | 12 ++++++++++++ sphinx/search/__init__.py | 2 +- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/config.rst b/doc/config.rst index 260ccddd4..2fd1cf66b 100644 --- a/doc/config.rst +++ b/doc/config.rst @@ -747,6 +747,15 @@ that use Sphinx' HTMLWriter class. * ``sv`` -- Swedish * ``tr`` -- Turkish + .. admonition:: Accelerate build speed + + Each language (except Japanese) provides its own stemming algorithm. + Sphinx uses a Python implementation by default. You can use + a C implementation to accelerate building the index file. + + * `PorterStemmer `_ (`en`) + * `PyStemmer `_ (all languages) + .. versionadded:: 1.1 .. versionchanged:: 1.3 diff --git a/doc/devguide.rst b/doc/devguide.rst index fccdd3fa7..666822d6b 100644 --- a/doc/devguide.rst +++ b/doc/devguide.rst @@ -243,3 +243,15 @@ Debugging Tips * Set the debugging options in the `Docutils configuration file `_. + +* JavaScript stemming algorithms in `sphinx/search/*.py` (except `en.py`) are + generated by + `a modified snowballcode generator `_. + Generated `JSX `_ files are + in `this repository `_. + You can get the resulting JavaScript files by the following command: + + .. code-block:: bash + + $ npm install + $ node_modules/.bin/grunt build # -> dest/*.global.js diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py index 14fe16dd5..03a1f9df9 100644 --- a/sphinx/search/__init__.py +++ b/sphinx/search/__init__.py @@ -89,7 +89,7 @@ var Stemmer = function() { Return true if the target word should be registered in the search index. This method is called after stemming. 
""" - return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or + return len(word) == 0 or not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or (ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or word.isdigit())))