# -*- coding: utf-8 -*-
"""
    sphinx.search
    ~~~~~~~~~~~~~

    Create a search index for offline search.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD.
"""
|
|
|
|
import re
|
2008-06-26 06:11:20 -05:00
|
|
|
import cPickle as pickle
|
2008-09-10 04:11:56 -05:00
|
|
|
from cStringIO import StringIO
|
2007-12-29 04:58:10 -06:00
|
|
|
|
2007-07-23 04:02:25 -05:00
|
|
|
from docutils.nodes import Text, NodeVisitor
|
2007-12-29 04:58:10 -06:00
|
|
|
|
2008-01-16 14:27:25 -06:00
|
|
|
from sphinx.util.stemmer import PorterStemmer
|
2008-09-24 07:01:16 -05:00
|
|
|
from sphinx.util import jsdump, rpartition
|
2007-07-23 04:02:25 -05:00
|
|
|
|
|
|
|
|
|
|
|
# Matches runs of word characters.  The ``(?u)`` flag is placed at the
# *start* of the pattern: Python 3.11+ rejects global inline flags that
# appear anywhere else (earlier versions only deprecate it), and the
# position does not change the pattern's meaning.
word_re = re.compile(r'(?u)\w+')

# Very common English words that carry no search value; words in this
# set are never added to the search index (see IndexBuilder.feed).
stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())
|
2007-07-23 04:02:25 -05:00
|
|
|
|
2008-09-24 06:51:02 -05:00
|
|
|
|
2008-09-10 04:11:56 -05:00
|
|
|
class _JavaScriptIndex(object):
    """
    The search index as JavaScript file that calls a function
    on the documentation search object to register the index.
    """

    PREFIX = 'Search.setIndex('
    SUFFIX = ')'

    def dumps(self, data):
        # Wrap the serialized index in the registration call.
        return ''.join([self.PREFIX, jsdump.dumps(data), self.SUFFIX])

    def loads(self, s):
        # The payload sits between the fixed call prefix and suffix.
        wrapped = s.startswith(self.PREFIX) and s.endswith(self.SUFFIX)
        data = s[len(self.PREFIX):-len(self.SUFFIX)]
        if not wrapped or not data:
            raise ValueError('invalid data')
        return jsdump.loads(data)

    def dump(self, data, f):
        # Serialize *data* and write it to the file-like object *f*.
        f.write(self.dumps(data))

    def load(self, f):
        # Read the whole stream and deserialize it.
        return self.loads(f.read())


js_index = _JavaScriptIndex()
|
|
|
|
|
|
|
|
|
2007-07-23 04:02:25 -05:00
|
|
|
class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous.
    make at least the stem method nicer.
    """

    def stem(self, word):
        # Stemming is case-insensitive, so lowercase first.
        lowered = word.lower()
        # The base class wants explicit start/end character indices.
        return PorterStemmer.stem(self, lowered, 0, len(lowered) - 1)
|
|
|
|
|
|
|
|
|
|
|
|
class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # Every word found in the document's text nodes, in order.
        self.found_words = []

    def dispatch_visit(self, node):
        # Only plain Text nodes carry indexable words; everything else
        # is structural markup and is skipped.
        if node.__class__ is not Text:
            return
        self.found_words.extend(word_re.findall(node.astext()))
|
|
|
|
|
|
|
|
|
|
|
|
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # Serialization backends, selectable by name in load()/dump().
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env):
        self.env = env
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # desctype name -> index (interned on first use in get_descrefs)
        self._desctypes = {}

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, basestring):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        for word, fileids in frozen['terms'].iteritems():
            # A bare int is the compact form for a single occurrence.
            if isinstance(fileids, int):
                self._mapping[word] = set([index2fn[fileids]])
            else:
                self._mapping[word] = set(index2fn[i] for i in fileids)
        # no need to load keywords/desctypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_modules(self, fn2index):
        # Map each module name to the index of the file documenting it.
        return dict((name, fn2index[doc]) for (name, (doc, _, _, _))
                    in self.env.modules.iteritems())

    def get_descrefs(self, fn2index):
        rv = {}
        dt = self._desctypes
        for fullname, (doc, desctype) in self.env.descrefs.iteritems():
            prefix, name = rpartition(fullname, '.')
            pdict = rv.setdefault(prefix, {})
            # Intern the desctype: assign the next free index on first use,
            # reuse the existing one otherwise.
            pdict[name] = (fn2index[doc], dt.setdefault(desctype, len(dt)))
        return rv

    def get_terms(self, fn2index):
        rv = {}
        for word, files in self._mapping.iteritems():
            if len(files) == 1:
                # Single occurrence: store the bare file index.
                only, = files
                rv[word] = fn2index[only]
            else:
                rv[word] = [fn2index[fn] for fn in files]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = {}
        for i, fn in enumerate(filenames):
            fn2index[fn] = i
        frozen = dict(filenames=filenames, titles=titles)
        frozen['terms'] = self.get_terms(fn2index)
        frozen['descrefs'] = self.get_descrefs(fn2index)
        frozen['modules'] = self.get_modules(fn2index)
        # invert the interning map for consumers: index -> desctype name
        frozen['desctypes'] = dict((i, name) for (name, i)
                                   in self._desctypes.items())
        return frozen

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        self._titles = dict((fn, self._titles[fn]) for fn in filenames
                            if fn in self._titles)
        for files in self._mapping.itervalues():
            files.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)

        stem = self._stemmer.stem

        def add_term(word, prefix=''):
            word = stem(word)
            # Very short words, stop words and pure numbers are noise.
            if len(word) < 3 or word in stopwords or word.isdigit():
                return
            self._mapping.setdefault(prefix + word, set()).add(filename)

        # The title's words are indexed as well as the body's.
        for word in word_re.findall(title):
            add_term(word)
        for word in visitor.found_words:
            add_term(word)
|