# -*- coding: utf-8 -*-
"""
    sphinx.search
    ~~~~~~~~~~~~~

    Create a search index for offline search.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD.
"""
|
|
|
|
import re
|
2008-06-26 06:11:20 -05:00
|
|
|
import cPickle as pickle
|
2008-09-10 04:11:56 -05:00
|
|
|
from cStringIO import StringIO
|
2007-12-29 04:58:10 -06:00
|
|
|
|
2007-07-23 04:02:25 -05:00
|
|
|
from docutils.nodes import Text, NodeVisitor
|
2007-12-29 04:58:10 -06:00
|
|
|
|
2008-01-16 14:27:25 -06:00
|
|
|
from sphinx.util.stemmer import PorterStemmer
|
2008-09-24 07:01:16 -05:00
|
|
|
from sphinx.util import jsdump, rpartition
|
2007-07-23 04:02:25 -05:00
|
|
|
|
|
|
|
|
|
|
|
# Matches runs of word characters.  The ``(?u)`` flag is placed at the
# *start* of the pattern: Python 3.11+ rejects global inline flags that
# appear anywhere else (earlier versions only deprecate it), and the
# position does not change the pattern's meaning.
word_re = re.compile(r'(?u)\w+')

# Very common English words that carry no search value; words in this
# set are never added to the search index (see IndexBuilder.feed).
stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())
|
2007-07-23 04:02:25 -05:00
|
|
|
|
2008-09-24 06:51:02 -05:00
|
|
|
|
2008-09-10 04:11:56 -05:00
|
|
|
class _JavaScriptIndex(object):
    """
    The search index as JavaScript file that calls a function
    on the documentation search object to register the index.
    """

    PREFIX = 'Search.setIndex('
    SUFFIX = ')'

    def dumps(self, data):
        # Wrap the serialized index in the registration call.
        return ''.join([self.PREFIX, jsdump.dumps(data), self.SUFFIX])

    def loads(self, s):
        # The payload sits between the fixed call prefix and suffix.
        wrapped = s.startswith(self.PREFIX) and s.endswith(self.SUFFIX)
        data = s[len(self.PREFIX):-len(self.SUFFIX)]
        if not wrapped or not data:
            raise ValueError('invalid data')
        return jsdump.loads(data)

    def dump(self, data, f):
        # Serialize *data* and write it to the file-like object *f*.
        f.write(self.dumps(data))

    def load(self, f):
        # Read the whole stream and deserialize it.
        return self.loads(f.read())


js_index = _JavaScriptIndex()
|
|
|
|
|
|
|
|
|
2007-07-23 04:02:25 -05:00
|
|
|
class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous.
    make at least the stem method nicer.
    """

    def stem(self, word):
        # Stemming is case-insensitive, so lowercase first.
        lowered = word.lower()
        # The base class wants explicit start/end character indices.
        return PorterStemmer.stem(self, lowered, 0, len(lowered) - 1)
|
|
|
|
|
|
|
|
|
|
|
|
class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # Every word found in the document's text nodes, in order.
        self.found_words = []

    def dispatch_visit(self, node):
        # Only plain Text nodes carry indexable words; everything else
        # is structural markup and is skipped.
        if node.__class__ is not Text:
            return
        self.found_words.extend(word_re.findall(node.astext()))
|
|
|
|
|
|
|
|
|
|
|
|
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # Serialization backends, selectable by name in load()/dump().
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env):
        self.env = env
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # desctype name -> index (interned on first use in get_descrefs)
        self._desctypes = {}

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, basestring):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        for word, fileids in frozen['terms'].iteritems():
            # A bare int is the compact form for a single occurrence.
            if isinstance(fileids, int):
                self._mapping[word] = set([index2fn[fileids]])
            else:
                self._mapping[word] = set(index2fn[i] for i in fileids)
        # no need to load keywords/desctypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_modules(self, fn2index):
        # Map each module name to the index of the file documenting it.
        return dict((name, fn2index[doc]) for (name, (doc, _, _, _))
                    in self.env.modules.iteritems())

    def get_descrefs(self, fn2index):
        rv = {}
        dt = self._desctypes
        for fullname, (doc, desctype) in self.env.descrefs.iteritems():
            prefix, name = rpartition(fullname, '.')
            pdict = rv.setdefault(prefix, {})
            # Intern the desctype: assign the next free index on first use,
            # reuse the existing one otherwise.
            pdict[name] = (fn2index[doc], dt.setdefault(desctype, len(dt)))
        return rv

    def get_terms(self, fn2index):
        rv = {}
        for word, files in self._mapping.iteritems():
            if len(files) == 1:
                # Single occurrence: store the bare file index.
                only, = files
                rv[word] = fn2index[only]
            else:
                rv[word] = [fn2index[fn] for fn in files]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = {}
        for i, fn in enumerate(filenames):
            fn2index[fn] = i
        frozen = dict(filenames=filenames, titles=titles)
        frozen['terms'] = self.get_terms(fn2index)
        frozen['descrefs'] = self.get_descrefs(fn2index)
        frozen['modules'] = self.get_modules(fn2index)
        # invert the interning map for consumers: index -> desctype name
        frozen['desctypes'] = dict((i, name) for (name, i)
                                   in self._desctypes.items())
        return frozen

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        self._titles = dict((fn, self._titles[fn]) for fn in filenames
                            if fn in self._titles)
        for files in self._mapping.itervalues():
            files.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)

        stem = self._stemmer.stem

        def add_term(word, prefix=''):
            word = stem(word)
            # Very short words, stop words and pure numbers are noise.
            if len(word) < 3 or word in stopwords or word.isdigit():
                return
            self._mapping.setdefault(prefix + word, set()).add(filename)

        # The title's words are indexed as well as the body's.
        for word in word_re.findall(title):
            add_term(word)
        for word in visitor.found_words:
            add_term(word)
|