From a24671bafb7fbb9c9a8c95845dcbabaf591983b8 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Sat, 4 Apr 2009 21:45:22 +0900 Subject: [PATCH 1/4] Use PyStemmer instead of PorterStemmer. The pure-Python PorterStemmer consumes a lot of build time, so use PyStemmer, which is implemented in C, when it is available. --- sphinx/search.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/sphinx/search.py b/sphinx/search.py index fe20c24a5..4aec41836 100644 --- a/sphinx/search.py +++ b/sphinx/search.py @@ -14,8 +14,14 @@ from cStringIO import StringIO from docutils.nodes import Text, NodeVisitor -from sphinx.util.stemmer import PorterStemmer from sphinx.util import jsdump, rpartition +try: + # PyStemmer is wrapper for stemmer in c + import Stemmer as PyStemmer + PYSTEMMER = True +except ImportError: + from sphinx.util.stemmer import PorterStemmer + PYSTEMMER = False word_re = re.compile(r'\w+(?u)') @@ -62,15 +68,29 @@ class _JavaScriptIndex(object): js_index = _JavaScriptIndex() -class Stemmer(PorterStemmer): - """ - All those porter stemmer implementations look hideous. - make at least the stem method nicer. - """ +if PYSTEMMER: + class Stemmer(object): + + def __init__(self): + self._stemmer = PyStemmer.Stemmer('english') + + def stem(self, word): + return self._stemmer.stemWord(word.lower()) + + def stemWords(self, iter): + import itertools + return self._stemmer.stemWords(itertools.imap(lambda x: x.lower(), iter)) +else: + class Stemmer(PorterStemmer): + """ + All those porter stemmer implementations look hideous. + make at least the stem method nicer. 
+ """ + + def stem(self, word): + word = word.lower() + return PorterStemmer.stem(self, word, 0, len(word) - 1) - def stem(self, word): - word = word.lower() - return PorterStemmer.stem(self, word, 0, len(word) - 1) class WordCollector(NodeVisitor): @@ -196,11 +216,11 @@ class IndexBuilder(object): visitor = WordCollector(doctree) doctree.walk(visitor) - def add_term(word, prefix='', stem=self._stemmer.stem): + def add_term(word, stem=self._stemmer.stem): word = stem(word) if len(word) < 3 or word in stopwords or word.isdigit(): return - self._mapping.setdefault(prefix + word, set()).add(filename) + self._mapping.setdefault(word, set()).add(filename) for word in word_re.findall(title): add_term(word) From f04fd3d902fb49322e8178b0f2e2de117d28505c Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Sat, 4 Apr 2009 21:47:05 +0900 Subject: [PATCH 2/4] Don't use publish_parts to avoid building OptionParser many times. --- sphinx/builders/html.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/sphinx/builders/html.py b/sphinx/builders/html.py index 365cf5f96..274836576 100644 --- a/sphinx/builders/html.py +++ b/sphinx/builders/html.py @@ -23,7 +23,7 @@ except ImportError: from docutils import nodes from docutils.io import DocTreeInput, StringOutput -from docutils.core import publish_parts +from docutils.core import Publisher, publish_parts from docutils.utils import new_document from docutils.frontend import OptionParser from docutils.readers.doctree import Reader as DoctreeReader @@ -181,14 +181,26 @@ class StandaloneHTMLBuilder(Builder): """Utility: Render a lone doctree node.""" doc = new_document('') doc.append(node) - return publish_parts( - doc, - source_class=DocTreeInput, - reader=DoctreeReader(), - writer=HTMLWriter(self), - settings_overrides={'output_encoding': 'unicode'} - ) + # cache publisher object. 
+ if 'publisher' not in self.__dict__: + self.publisher = Publisher( + source_class = DocTreeInput, + destination_class=StringOutput) + self.publisher.set_components('standalone', + 'restructuredtext', 'pseudoxml') + + pub = self.publisher + + pub.reader = DoctreeReader() + pub.writer = HTMLWriter(self) + pub.process_programmatic_settings( + None, {'output_encoding': 'unicode'}, None) + pub.set_source(doc, None) + pub.set_destination(None, None) + pub.publish() + return pub.writer.parts + def prepare_writing(self, docnames): from sphinx.search import IndexBuilder From aa05225f5c3191bf06e2ce5e2bb7466252c9ea5d Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Sun, 5 Apr 2009 05:53:49 +0900 Subject: [PATCH 3/4] Use porterstemmer instead of PyStemmer. --- sphinx/search.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sphinx/search.py b/sphinx/search.py index 4aec41836..8bde13266 100644 --- a/sphinx/search.py +++ b/sphinx/search.py @@ -16,12 +16,12 @@ from docutils.nodes import Text, NodeVisitor from sphinx.util import jsdump, rpartition try: - # PyStemmer is wrapper for stemmer in c - import Stemmer as PyStemmer - PYSTEMMER = True + # http://bitbucket.org/methane/porterstemmer/ + from porterstemmer import Stemmer as CStemmer + CSTEMMER = True except ImportError: from sphinx.util.stemmer import PorterStemmer - PYSTEMMER = False + CSTEMMER = False word_re = re.compile(r'\w+(?u)') @@ -68,18 +68,12 @@ class _JavaScriptIndex(object): js_index = _JavaScriptIndex() -if PYSTEMMER: - class Stemmer(object): - - def __init__(self): - self._stemmer = PyStemmer.Stemmer('english') +if CSTEMMER: + class Stemmer(CStemmer): def stem(self, word): - return self._stemmer.stemWord(word.lower()) + return self(word.lower()) - def stemWords(self, iter): - import itertools - return self._stemmer.stemWords(itertools.imap(lambda x: x.lower(), iter)) else: class Stemmer(PorterStemmer): """ From ea0ba249e3c6d091cf9c60714f9684afbcf0477e Mon Sep 17 00:00:00 
2001 From: INADA Naoki Date: Sun, 5 Apr 2009 23:30:33 +0900 Subject: [PATCH 4/4] tune traversing. 29s->24s --- sphinx/util/__init__.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sphinx/util/__init__.py b/sphinx/util/__init__.py index 7e4b1092b..1eb8383aa 100644 --- a/sphinx/util/__init__.py +++ b/sphinx/util/__init__.py @@ -416,30 +416,28 @@ def copy_static_entry(source, target, builder, context={}): # traverse() is called so many times during a build that it saves # on average 20-25% overall build time! -def _all_traverse(self): +def _all_traverse(self, result): """Version of Node.traverse() that doesn't need a condition.""" - result = [] result.append(self) for child in self.children: - result.extend(child._all_traverse()) + child._all_traverse(result) return result -def _fast_traverse(self, cls): +def _fast_traverse(self, cls, result): """Version of Node.traverse() that only supports instance checks.""" - result = [] if isinstance(self, cls): result.append(self) for child in self.children: - result.extend(child._fast_traverse(cls)) + child._fast_traverse(cls, result) return result def _new_traverse(self, condition=None, include_self=1, descend=1, siblings=0, ascend=0): if include_self and descend and not siblings and not ascend: if condition is None: - return self._all_traverse() + return self._all_traverse([]) elif isinstance(condition, (types.ClassType, type)): - return self._fast_traverse(condition) + return self._fast_traverse(condition, []) return self._old_traverse(condition, include_self, descend, siblings, ascend)