sphinx/sphinx/search.py

206 lines
5.8 KiB
Python
Raw Normal View History

2007-07-23 04:02:25 -05:00
# -*- coding: utf-8 -*-
"""
sphinx.search
~~~~~~~~~~~~~
Create a search index for offline search.
:copyright: Copyright 2007-2009 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
2007-07-23 04:02:25 -05:00
"""
import re
import cPickle as pickle
from cStringIO import StringIO
2007-07-23 04:02:25 -05:00
from docutils.nodes import Text, NodeVisitor
2008-01-16 14:27:25 -06:00
from sphinx.util.stemmer import PorterStemmer
from sphinx.util import jsdump, rpartition
2007-07-23 04:02:25 -05:00
# Matches one run of "word" characters (letters, digits, underscore).
# The UNICODE flag is passed as an argument rather than as a trailing inline
# ``(?u)``: inline global flags anywhere but the start of a pattern are
# deprecated since Python 3.6 and rejected since 3.11.  Matching behavior
# is unchanged.
word_re = re.compile(r'\w+', re.UNICODE)
2008-09-24 04:06:31 -05:00
# Very common English words; `IndexBuilder.feed` does not index a stemmed
# word that appears in this set.
stopwords = set((
    'a and are as at '
    'be but by '
    'for '
    'if in into is it '
    'near no not '
    'of on or '
    'such '
    'that the their then there these they this to '
    'was will with'
).split())
2007-07-23 04:02:25 -05:00
class _JavaScriptIndex(object):
    """
    Serializer that wraps the `jsdump` output in a JavaScript call
    (``Search.setIndex(...)``) so that loading the file registers the
    index on the documentation search object.
    """

    PREFIX = 'Search.setIndex('
    SUFFIX = ')'

    def dumps(self, data):
        """Return *data* serialized by `jsdump`, wrapped in PREFIX/SUFFIX."""
        payload = jsdump.dumps(data)
        return self.PREFIX + payload + self.SUFFIX

    def loads(self, s):
        """Strip the PREFIX/SUFFIX wrapper from *s* and parse the payload.

        Raises ValueError if the wrapper is missing or the payload is empty.
        """
        head, tail = self.PREFIX, self.SUFFIX
        payload = s[len(head):-len(tail)]
        if not (payload and s.startswith(head) and s.endswith(tail)):
            raise ValueError('invalid data')
        return jsdump.loads(payload)

    def dump(self, data, f):
        """Write the wrapped serialization of *data* to file object *f*."""
        text = self.dumps(data)
        f.write(text)

    def load(self, f):
        """Read all of file object *f* and parse it with `loads`."""
        text = f.read()
        return self.loads(text)


js_index = _JavaScriptIndex()
2007-07-23 04:02:25 -05:00
class Stemmer(PorterStemmer):
    """
    Convenience wrapper around `PorterStemmer` whose `stem` method takes
    just the word.
    """

    def stem(self, word):
        """Lowercase *word* and return what the base stemmer yields for it."""
        lowered = word.lower()
        # the base method also wants the first and last character index
        last = len(lowered) - 1
        return PorterStemmer.stem(self, lowered, 0, last)
class WordCollector(NodeVisitor):
    """
    Node visitor that gathers the words of all text nodes, for use by
    the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # every word matched so far, in visiting order (duplicates kept)
        self.found_words = []

    def dispatch_visit(self, node):
        """Record the words of *node* if it is exactly a `Text` node."""
        # exact class comparison: instances of Text subclasses are skipped
        if node.__class__ is not Text:
            return
        self.found_words += word_re.findall(node.astext())
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """

    # format name -> object providing ``load(stream)`` and
    # ``dump(data, stream)``
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }

    def __init__(self, env):
        self.env = env
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # desctypes -> index
        self._desctypes = {}

    def load(self, stream, format):
        """Reconstruct from frozen data.

        *format* is either a key of `formats` or an object with a
        ``load(stream)`` method.  Raises ValueError('old format') if the
        loaded data is not a dict.
        """
        if isinstance(format, basestring):
            format = self.formats[format]
        # NOTE: with the 'pickle' format this unpickles the stream, which
        # can execute arbitrary code -- only load index files you trust.
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        for k, v in frozen['terms'].iteritems():
            # a term found in a single file is stored as a bare index,
            # otherwise as a list of indices (see `get_terms`)
            if isinstance(v, int):
                self._mapping[k] = set([index2fn[v]])
            else:
                self._mapping[k] = set(index2fn[i] for i in v)
        # no need to load keywords/desctypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream.

        *format* is either a key of `formats` or an object with a
        ``dump(data, stream)`` method.
        """
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_modules(self, fn2index):
        """Return a dict mapping each module name of the environment to
        the index of its document."""
        rv = {}
        for name, (doc, _, _, _) in self.env.modules.iteritems():
            rv[name] = fn2index[doc]
        return rv

    def get_descrefs(self, fn2index):
        """Return ``{prefix: {name: (file index, desctype index)}}`` for
        all descrefs of the environment.

        The full name is split at its last dot into prefix and name.
        Desctypes not seen before are registered in ``self._desctypes``
        with the next free index.
        """
        rv = {}
        dt = self._desctypes
        for fullname, (doc, desctype) in self.env.descrefs.iteritems():
            prefix, name = rpartition(fullname, '.')
            pdict = rv.setdefault(prefix, {})
            # len(dt) is evaluated before a possible insertion, so a new
            # desctype receives the next unused index
            i = dt.setdefault(desctype, len(dt))
            pdict[name] = (fn2index[doc], i)
        return rv

    def get_terms(self, fn2index):
        """Return a dict mapping each stemmed word to the index of its
        file (if there is exactly one) or to a list of file indices.

        Words that are not associated with any file are left out.
        """
        rv = {}
        for k, v in self._mapping.iteritems():
            if not v:
                # nothing to find under this word, don't bloat the index
                continue
            if len(v) == 1:
                fn, = v
                rv[k] = fn2index[fn]
            else:
                rv[k] = [fn2index[fn] for fn in v]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        # keys() and values() of the same unmodified dict correspond
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        # keyword arguments are evaluated in order: `descrefs` must stay
        # before `desctypes`, since get_descrefs() fills self._desctypes
        return dict(
            filenames=filenames,
            titles=titles,
            terms=self.get_terms(fn2index),
            descrefs=self.get_descrefs(fn2index),
            modules=self.get_modules(fn2index),
            desctypes=dict((v, k) for (k, v) in self._desctypes.items()),
        )

    def prune(self, filenames):
        """Remove data for all filenames not in the list.

        Words that are left without any file are dropped from the
        mapping altogether.
        """
        # build the lookup set once; this also makes a one-shot iterable
        # safe to pass in
        wanted = set(filenames)
        self._titles = dict((fn, self._titles[fn]) for fn in wanted
                            if fn in self._titles)
        # iterate over a snapshot, entries are deleted while looping
        for word, wordnames in list(self._mapping.items()):
            wordnames.intersection_update(wanted)
            if not wordnames:
                del self._mapping[word]

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index.

        Records *title* for *filename* and associates the file with every
        indexable word of the title and of the doctree's text nodes.

        NOTE: words are only ever added here.  Feeding a file again does
        not remove the words it was associated with before.
        """
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)

        def add_term(word, prefix='', stem=self._stemmer.stem):
            word = stem(word)
            # skip very short words, stopwords and plain numbers
            if len(word) < 3 or word in stopwords or word.isdigit():
                return
            self._mapping.setdefault(prefix + word, set()).add(filename)

        for word in word_re.findall(title):
            add_term(word)

        for word in visitor.found_words:
            add_term(word)