# -*- coding: utf-8 -*-
"""
    sphinx.search
    ~~~~~~~~~~~~~

    Create a search index for offline search.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD.
"""
import re
import cPickle as pickle

from docutils.nodes import Text, NodeVisitor

from sphinx.util.stemmer import PorterStemmer
from sphinx.util import json


word_re = re.compile(r'\w+(?u)')


class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous;
    make at least the stem method nicer.
    """

    def stem(self, word):
        return PorterStemmer.stem(self, word, 0, len(word) - 1)


class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        self.found_words = []

    def dispatch_visit(self, node):
        # only text nodes contribute words to the index
        if node.__class__ is Text:
            self.found_words.extend(word_re.findall(node.astext()))


class IndexBuilder(object):
    """
    Helper class that creates a search index based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'json': json,
        'pickle': pickle
    }

    def __init__(self):
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, basestring):
            format = self.formats[format]
        # frozen data layout: [filenames, titles, {word: [file indices]}]
        frozen = format.load(stream)
        index2fn = frozen[0]
        self._titles = dict(zip(frozen[0], frozen[1]))
        self._mapping = dict((k, set(index2fn[i] for i in v))
                             for (k, v) in frozen[2].iteritems())

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def freeze(self):
        """
        Create a usable data structure.  You can pass this output
        to the `SearchFrontend` to search the index.
        """
        fns, titles = self._titles.keys(), self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(fns))
        return [
            fns,
            titles,
            dict((k, [fn2index[fn] for fn in v])
                 for (k, v) in self._mapping.iteritems()),
        ]

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in self._mapping.itervalues():
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)
        for word in word_re.findall(title) + visitor.found_words:
            self._mapping.setdefault(self._stemmer.stem(word.lower()),
                                     set()).add(filename)


class SearchFrontend(object):
    """
    This class acts as a frontend for the search index.  It can search
    a search index as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        self.filenames, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded):
        file_map = {}
        for word in required:
            if word not in self.words:
                # a required word occurs nowhere; no file can match all
                # required words, so the filter below yields no results
                break
            for fid in self.words[word]:
                file_map.setdefault(fid, set()).add(word)

        return sorted(((self.filenames[fid], self.titles[fid])
                       for fid, words in file_map.iteritems()
                       if len(words) == len(required) and not
                       any(fid in self.words.get(word, ()) for word in excluded)
                       ), key=lambda x: x[1].lower())

    def search(self, searchstring):
        required = set()
        excluded = set()
        for word in searchstring.split():
            # words prefixed with "-" must not occur in matching files
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
            else:
                storage = required
            storage.add(self._stemmer.stem(word.lower()))

        return self.query(required, excluded)
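

# A minimal usage sketch of the classes above: it builds a one-document
# index from a small reST snippet and queries it.  It assumes docutils'
# `publish_doctree` is available for parsing; the filename 'intro', the
# snippet text, and the query strings are made up for illustration only.
if __name__ == '__main__':
    from docutils.core import publish_doctree

    builder = IndexBuilder()
    doctree = publish_doctree('Offline search works without a server.')
    builder.feed('intro', 'Introduction', doctree)

    # freeze() returns [filenames, titles, {word: [file indices]}], which is
    # exactly the structure SearchFrontend expects
    frontend = SearchFrontend(builder.freeze())

    # query words are stemmed and lowercased, so "searching" matches "search";
    # a leading "-" excludes files containing that word
    print(frontend.search('searching'))
    print(frontend.search('searching -server'))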