Added xapian search

This commit is contained in:
jacob 2010-06-24 14:30:02 -05:00
parent 00f841be2a
commit 8c4e535170
5 changed files with 204 additions and 6 deletions

View File

@ -23,12 +23,26 @@ class WebSupportBuilder(StandaloneHTMLBuilder):
name = 'websupport' name = 'websupport'
out_suffix = '.fpickle' out_suffix = '.fpickle'
def init(self):
    """Prepare the search index, then run the standard HTML builder setup."""
    # Search is wired up first; the base class initialisation follows.
    self.init_search()
    StandaloneHTMLBuilder.init(self)
def init_search(self):
    """Take the search adapter from the application and create its index.

    ``self.search`` is set to ``self.app.search``; when that is ``None``
    no index is created.
    """
    adapter = self.app.search
    self.search = adapter
    if adapter is None:
        return
    adapter.create_index()
def init_translator_class(self):
    """Select ``WebSupportTranslator`` as this builder's translator class."""
    self.translator_class = WebSupportTranslator
def write_doc(self, docname, doctree):
    """Index the document for search (if enabled) and then write it out.

    :param docname: name of the document being written.
    :param doctree: the document tree; its ``astext()`` output is what
                    gets indexed.
    """
    # The translator needs the docname to generate ids.
    self.docname = docname

    search = self.search
    if search is not None:
        text = doctree.astext()
        # The first 20 characters of the plain text stand in for a title.
        search.add_document(docname, text[:20], text)

    StandaloneHTMLBuilder.write_doc(self, docname, doctree)
def get_target_uri(self, docname, typ=None): def get_target_uri(self, docname, typ=None):
@ -59,7 +73,8 @@ class WebSupportBuilder(StandaloneHTMLBuilder):
ctx, event_arg) ctx, event_arg)
# Create a dict that will be pickled and used by webapps. # Create a dict that will be pickled and used by webapps.
doc_ctx = {'body': ctx.get('body', '')} doc_ctx = {'body': ctx.get('body', ''),
'title': ctx.get('title', '')}
# Partially render the html template to provide a more useful ctx. # Partially render the html template to provide a more useful ctx.
template = self.templates.environment.get_template(templatename) template = self.templates.environment.get_template(templatename)
template_module = template.make_module(ctx) template_module = template.make_module(ctx)
@ -86,4 +101,3 @@ class WebSupportBuilder(StandaloneHTMLBuilder):
os_path(ctx['sourcename'])) os_path(ctx['sourcename']))
ensuredir(path.dirname(source_name)) ensuredir(path.dirname(source_name))
copyfile(self.env.doc2path(pagename), source_name) copyfile(self.env.doc2path(pagename), source_name)

View File

@ -0,0 +1,36 @@
{#
basic/searchresults.html
~~~~~~~~~~~~~~~~~
Template for the body of the search results page.
:copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
#}
<h1 id="search-documentation">Search</h1>
<p>
From here you can search these documents. Enter your search
words into the box below and click "search".
</p>
<form action="" method="get">
<input type="text" name="q" value="" />
<input type="submit" value="search" />
<span id="search-progress" style="padding-left: 10px"></span>
</form>
{% if search_performed %}
<h2>Search Results</h2>
{% if not search_results %}
<p>Your search did not match any results.</p>
{% endif %}
{% endif %}
<div id="search-results">
{% if search_results %}
<ul>
{% for href, caption, context in search_results %}
<li><a href="{{ href }}">{{ caption }}</a>
<div class="context">{{ context|e }}</div>
</li>
{% endfor %}
</ul>
{% endif %}
</div>

View File

@ -12,20 +12,47 @@
import cPickle as pickle import cPickle as pickle
from os import path from os import path
from jinja2 import Environment, FileSystemLoader
from sphinx.application import Sphinx from sphinx.application import Sphinx
from sphinx.websupport.search import search_adapters
class WebSupportApp(Sphinx):
    """Sphinx application that additionally carries a ``search`` attribute.

    The optional ``search`` keyword argument is stored on the instance
    (``None`` when omitted); every other argument is forwarded to
    ``Sphinx.__init__`` unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Remove our own keyword before delegating, so that
        # Sphinx.__init__ only receives the remaining arguments.
        search = kwargs.pop('search', None)
        self.search = search
        Sphinx.__init__(self, *args, **kwargs)
class WebSupport(object): class WebSupport(object):
def init(self, srcdir='', outdir='', search=None):
    """Configure directories, templating and (optionally) search.

    :param srcdir: directory holding the documentation sources.
    :param outdir: output directory; defaults to
                   ``<srcdir>/_build/websupport`` when empty.
    :param search: name of a search adapter to initialise, or ``None``
                   to run without search.
    """
    self.srcdir = srcdir
    self.outdir = outdir or path.join(self.srcdir, '_build',
                                      'websupport')
    # Always define the attribute: build() passes ``self.search`` on
    # unconditionally, which raised AttributeError when no adapter was
    # requested and init_search() therefore never ran.
    self.search = None
    self.init_templating()
    if search is not None:
        self.init_search(search)
def init_templating(self):
    """Create a Jinja2 environment rooted at Sphinx's ``basic`` theme.

    The environment is stored as ``self.template_env``.
    """
    import sphinx
    package_dir = path.dirname(sphinx.__file__)
    template_path = path.join(package_dir, 'themes', 'basic')
    self.template_env = Environment(loader=FileSystemLoader(template_path))
def init_search(self, search):
    """Instantiate the named search adapter and load the results template.

    :param search: key into ``search_adapters``; an unknown name raises
                   ``KeyError``.

    Sets ``self.search`` to an adapter instance whose storage path is
    ``<outdir>/search`` and ``self.results_template`` to the
    ``searchresults.html`` template.
    """
    mod, cls = search_adapters[search]
    # A non-empty fromlist makes __import__ return the submodule itself.
    module = __import__('sphinx.websupport.search.' + mod,
                        None, None, [cls])
    search_class = getattr(module, cls)
    self.search = search_class(path.join(self.outdir, 'search'))
    template_env = self.template_env
    self.results_template = template_env.get_template('searchresults.html')
def build(self, **kwargs):
    """Run a ``websupport`` build of the documentation.

    :param doctreedir: optional keyword; where doctrees are stored.
                       Defaults to ``<outdir>/doctrees``.

    Other keyword arguments are accepted but not used here.
    """
    default_doctreedir = path.join(self.outdir, 'doctrees')
    doctreedir = kwargs.pop('doctreedir', default_doctreedir)
    # srcdir is passed twice, as the first two positional arguments.
    app = WebSupportApp(self.srcdir, self.srcdir,
                        self.outdir, doctreedir, 'websupport',
                        search=self.search)
    app.build()
def get_document(self, docname): def get_document(self, docname):
@ -33,3 +60,12 @@ class WebSupport(object):
f = open(infilename, 'rb') f = open(infilename, 'rb')
document = pickle.load(f) document = pickle.load(f)
return document return document
def get_search_results(self, q):
    """Run the query *q* and return the ``search`` document with results.

    The document loaded for ``'search'`` gets its ``body`` replaced by
    the rendered results template and its ``title`` set to
    ``'Search Results'``.
    """
    # query() also reports two hit counts; only the rows are rendered.
    results, _found, _displayed = self.search.query(q)
    template_ctx = {
        'search_performed': True,
        'search_results': results,
    }
    page = self.get_document('search')
    page['body'] = self.results_template.render(template_ctx)
    page['title'] = 'Search Results'
    return page

View File

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""
sphinx.websupport.search
~~~~~~~~~~~~~~~~~~~~~~~~
Server side search support for the web support package.
:copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
class BaseSearch(object):
    """Interface for search adapters, plus a shared context extractor.

    Subclasses override :meth:`create_index`, :meth:`add_document` and
    :meth:`query`; the versions here only raise ``NotImplementedError``.
    """

    def create_index(self, path=None):
        """Create or open the search index.

        *path* is optional so the signature agrees with adapters and
        callers that invoke ``create_index()`` without arguments.
        """
        raise NotImplementedError

    def add_document(self, path, title, text):
        """Add one document, given its path, title and text, to the index."""
        raise NotImplementedError

    def query(self, q):
        """Run the query string *q* against the index."""
        raise NotImplementedError

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            # Undecodable bytes are dropped rather than raising.
            return value.decode('utf-8', 'ignore')
        return value

    def extract_context(self, text, query_string):
        """Return the text surrounding the query string in *text*.

        :param text: the document text (byte string or unicode).
        :param query_string: the literal string to look for; matching is
                             case-insensitive.
        :returns: a 3-tuple ``(before, match, after)`` where *before* and
                  *after* hold up to 80 characters each, or
                  ``('', '', '')`` when the query string does not occur.
        """
        # From GSOC 2009
        text = self._to_text(text)
        query_string = self._to_text(query_string)
        # The query is user input: escape it so that characters such as
        # "(", "+" or "$" are matched literally instead of being read as
        # regular expression syntax.
        with_context_re = (r'([\W\w]{0,80})(%s)([\W\w]{0,80})'
                           % re.escape(query_string))
        match = re.search(with_context_re, text, re.I | re.U)
        if match is None:
            return '', '', ''
        return match.groups()
# Maps an adapter name to a ``(module name, class name)`` pair. The module
# name is relative to the ``sphinx.websupport.search`` package.
search_adapters = {
'xapian': ('xapiansearch', 'XapianSearch'),
'whoosh': ('whooshsearch', 'WhooshSearch'),
}

View File

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
sphinx.websupport.search.xapian
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Xapian search adapter.
:copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from os import path
import xapian
from sphinx.util.osutil import ensuredir
from sphinx.websupport.search import BaseSearch
class XapianSearch(BaseSearch):
    """Search adapter that stores and queries documents with Xapian."""
    # Adapted from the GSOC 2009 webapp project.

    # Xapian metadata constants
    DOC_PATH = 0
    DOC_TITLE = 1

    def __init__(self, db_path):
        """Remember *db_path*, the directory of the Xapian database."""
        self.db_path = db_path

    def create_index(self):
        """Open (creating if needed) the writable database and indexer."""
        ensuredir(self.db_path)
        self.database = xapian.WritableDatabase(self.db_path,
                                                xapian.DB_CREATE_OR_OPEN)
        self.indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("english")
        self.indexer.set_stemmer(stemmer)

    def add_document(self, path, title, text):
        """Index one document inside its own transaction.

        :param path: stored in value slot ``DOC_PATH``.
        :param title: stored in value slot ``DOC_TITLE``.
        :param text: stored as the document data and indexed.

        If anything fails before the commit, the transaction is cancelled
        and the exception re-raised, so that the database is not left with
        an open transaction that would make later calls fail.
        """
        self.database.begin_transaction()
        try:
            doc = xapian.Document()
            doc.set_data(text)
            doc.add_value(self.DOC_PATH, path)
            doc.add_value(self.DOC_TITLE, title)
            self.indexer.set_document(doc)
            self.indexer.index_text(text)
            # Besides the terms produced by the indexer, every
            # whitespace-separated word is added verbatim at position 1.
            for word in text.split():
                doc.add_posting(word, 1)
            self.database.add_document(doc)
        except Exception:
            self.database.cancel_transaction()
            raise
        self.database.commit_transaction()

    def query(self, q):
        """Search the database for *q*.

        :returns: ``(results, results_found, results_displayed)`` where
                  *results* is a list of ``(path, title, context)`` tuples,
                  *results_found* is the estimated number of matches and
                  *results_displayed* the number of matches returned.
        """
        database = xapian.Database(self.db_path)
        enquire = xapian.Enquire(database)
        qp = xapian.QueryParser()
        stemmer = xapian.Stem("english")
        qp.set_stemmer(stemmer)
        qp.set_database(database)
        qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        query = qp.parse_query(q)

        # Find the top 100 results for the query.
        enquire.set_query(query)
        matches = enquire.get_mset(0, 100)

        results_found = matches.get_matches_estimated()
        results_displayed = matches.size()

        results = []
        for m in matches:
            context = self.extract_context(m.document.get_data(), q)
            results.append((m.document.get_value(self.DOC_PATH),
                            m.document.get_value(self.DOC_TITLE),
                            ''.join(context)))

        return results, results_found, results_displayed