Fix searching and search index creation for incremental builds.

Georg Brandl
2007-08-09 19:22:20 +00:00
parent be36a0f85d
commit b1271fa623
6 changed files with 121 additions and 85 deletions

View File

@@ -26,11 +26,13 @@ def usage(argv, msg=None):
print >>sys.stderr, """\
usage: %s [options] sourcedir outdir [filenames...]
options: -b <builder> -- builder to use (one of %s)
-a -- write all files; default is to only write new and changed files
-d <path> -- path for the cached doctree files (default outdir/.doctrees)
-a -- write all files; default is to only write new and changed files
-E -- don't use a saved environment, always read all files
-d <path> -- path for the cached environment and doctree files
(default outdir/.doctrees)
-O <option[=value]> -- give option to the builder (-O help for list)
-D <setting=value> -- override a setting in sourcedir/conf.py
-N -- do not do colored output
-N -- do not do colored output
modi:
* without -a and without filenames, write new and changed files.
* with -a, write all files.
@@ -39,7 +41,7 @@ modi:
def main(argv):
try:
opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:N')
opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:NE')
srcdirname = path.abspath(args[0])
if not path.isdir(srcdirname):
print >>sys.stderr, 'Error: Cannot find source directory.'
@@ -65,7 +67,7 @@ def main(argv):
return 1
builder = all_files = None
opt_help = False
opt_help = freshenv = False
options = {}
confoverrides = {}
doctreedir = path.join(outdirname, '.doctrees')
@@ -102,6 +104,8 @@ def main(argv):
confoverrides[key] = val
elif opt == '-N':
nocolor()
elif opt == '-E':
freshenv = True
if not sys.stdout.isatty() or sys.platform == 'win32':
# Windows' cmd box doesn't understand ANSI sequences
@@ -122,7 +126,8 @@ def main(argv):
builderobj = builderobj(srcdirname, outdirname, doctreedir, options,
status_stream=sys.stdout,
warning_stream=sys.stderr,
confoverrides=confoverrides)
confoverrides=confoverrides,
freshenv=freshenv)
if all_files:
builderobj.build_all()
elif filenames:
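
For reference, the command-line behavior after this change (the entry-point name sphinx-build.py is assumed here; the diff itself does not show the script's filename):

python sphinx-build.py -b html sourcedir outdir           # default: reuse the pickled environment, write new and changed files
python sphinx-build.py -a -b html sourcedir outdir        # reuse the pickled environment, but rewrite all output files
python sphinx-build.py -E -b html sourcedir outdir        # ignore any saved environment, re-read every source file
python sphinx-build.py -d cache -b html sourcedir outdir  # keep the environment pickle and doctrees in ./cache instead of outdir/.doctrees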

View File

@@ -72,18 +72,18 @@ class Builder(object):
Builds target formats from the reST sources.
"""
option_spec = {
'freshenv': 'Don\'t use a pickled environment',
}
option_spec = {}
def __init__(self, srcdirname, outdirname, doctreedirname,
options, confoverrides=None, env=None,
status_stream=None, warning_stream=None):
status_stream=None, warning_stream=None,
freshenv=False):
self.srcdir = srcdirname
self.outdir = outdirname
self.doctreedir = doctreedirname
if not path.isdir(doctreedirname):
os.mkdir(doctreedirname)
self.freshenv = freshenv
self.options = attrdict(options)
self.validate_options()
@@ -161,7 +161,7 @@ class Builder(object):
successfully loaded, False if a new environment had to be created."""
if self.env:
return
if not self.options.freshenv:
if not self.freshenv:
try:
self.msg('trying to load pickled env...', nonl=True)
self.env = BuildEnvironment.frompickle(
@@ -223,8 +223,6 @@ class Builder(object):
self.msg('creating index...')
self.env.create_index(self)
self.prepare_writing()
if filenames:
# add all TOC files that may have changed
filenames_set = set(filenames)
@@ -236,6 +234,8 @@ class Builder(object):
# build all
filenames_set = set(self.env.all_files)
self.prepare_writing(filenames)
# write target files
with collect_env_warnings(self):
self.msg('writing output...')
@@ -249,7 +249,7 @@ class Builder(object):
self.finish()
self.msg('done!')
def prepare_writing(self):
def prepare_writing(self, filenames):
raise NotImplementedError
def write_file(self, filename, doctree):
@@ -265,12 +265,6 @@ class StandaloneHTMLBuilder(Builder):
"""
name = 'html'
option_spec = Builder.option_spec
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a JSON search index for offline search',
})
copysource = True
def init(self):
@@ -301,12 +295,10 @@ class StandaloneHTMLBuilder(Builder):
settings_overrides={'output_encoding': 'unicode'}
)
def prepare_writing(self):
if not self.options.nosearchindex:
from .search import IndexBuilder
self.indexer = IndexBuilder()
else:
self.indexer = None
def prepare_writing(self, filenames):
from .search import IndexBuilder
self.indexer = IndexBuilder()
self.load_indexer(filenames)
self.docwriter = HTMLWriter(self.config)
self.docsettings = OptionParser(
defaults=self.env.settings,
@@ -463,20 +455,19 @@ class StandaloneHTMLBuilder(Builder):
)
self.handle_file('search.rst', searchcontext, 'search')
if not self.options.nostyle:
self.msg('copying style files...')
# copy style files
styledirname = path.join(path.dirname(__file__), 'style')
ensuredir(path.join(self.outdir, 'style'))
for filename in os.listdir(styledirname):
if not filename.startswith('.'):
shutil.copyfile(path.join(styledirname, filename),
path.join(self.outdir, 'style', filename))
# add pygments style file
f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
if pygments:
f.write(get_stylesheet())
f.close()
# copy style files
self.msg('copying style files...')
styledirname = path.join(path.dirname(__file__), 'style')
ensuredir(path.join(self.outdir, 'style'))
for filename in os.listdir(styledirname):
if not filename.startswith('.'):
shutil.copyfile(path.join(styledirname, filename),
path.join(self.outdir, 'style', filename))
# add pygments style file
f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
if pygments:
f.write(get_stylesheet())
f.close()
# dump the search index
self.handle_finish()
@@ -497,6 +488,16 @@ class StandaloneHTMLBuilder(Builder):
if path.getmtime(path.join(self.srcdir, filename)) > targetmtime:
yield filename
def load_indexer(self, filenames):
try:
with open(path.join(self.outdir, 'searchindex.json'), 'r') as f:
self.indexer.load(f, 'json')
except (IOError, OSError):
pass
# delete all entries for files that will be rebuilt
self.indexer.prune(set(self.env.all_files) - set(filenames))
def index_file(self, filename, doctree, title):
# only index pages with title
if self.indexer is not None and title:
@@ -522,11 +523,10 @@ class StandaloneHTMLBuilder(Builder):
path.join(self.outdir, context['sourcename']))
def handle_finish(self):
if self.indexer is not None:
self.msg('dumping search index...')
f = open(path.join(self.outdir, 'searchindex.json'), 'w')
self.msg('dumping search index...')
self.indexer.prune([self.get_target_uri(fn)[:-5] for fn in self.env.all_files])
with open(path.join(self.outdir, 'searchindex.json'), 'w') as f:
self.indexer.dump(f, 'json')
f.close()
class WebHTMLBuilder(StandaloneHTMLBuilder):
@@ -535,13 +535,6 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
"""
name = 'web'
# doesn't use the standalone specific options
option_spec = Builder.option_spec.copy()
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a search index for the online search',
})
def init(self):
# Nothing to do here.
pass
@@ -564,6 +557,15 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
return source_filename[:-9] # up to /
return source_filename[:-4] + '/'
def load_indexer(self, filenames):
try:
with open(path.join(self.outdir, 'searchindex.pickle'), 'r') as f:
self.indexer.load(f, 'pickle')
except (IOError, OSError):
pass
# delete all entries for files that will be rebuilt
self.indexer.prune(set(self.env.all_files) - set(filenames))
def index_file(self, filename, doctree, title):
# only index pages with title and category
if self.indexer is not None and title:
@@ -590,11 +592,11 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
with file(outfilename, 'wb') as fp:
pickle.dump(self.globalcontext, fp, 2)
if self.indexer is not None:
self.msg('dumping search index...')
f = open(path.join(self.outdir, 'searchindex.pickle'), 'w')
self.msg('dumping search index...')
self.indexer.prune(self.env.all_files)
with open(path.join(self.outdir, 'searchindex.pickle'), 'wb') as f:
self.indexer.dump(f, 'pickle')
f.close()
# touch 'last build' file, used by the web application to determine
# when to reload its environment and clear the cache
open(path.join(self.outdir, LAST_BUILD_FILENAME), 'w').close()
@@ -611,10 +613,9 @@ class HTMLHelpBuilder(StandaloneHTMLBuilder):
"""
name = 'htmlhelp'
option_spec = Builder.option_spec.copy()
option_spec.update({
option_spec = {
'outname': 'Output file base name (default "pydoc")'
})
}
# don't copy the reST source
copysource = False
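
Taken together, the builder hooks above give the HTML builders an incremental search index: prepare_writing() now receives the filenames that are about to be (re)written, load_indexer() pulls in the index dumped by the previous build and drops the entries for exactly those files, index_file() re-feeds them while they are written, and handle_finish() prunes once more against the current set of source files before dumping. A rough sketch of that flow as a single function (illustrative only, not part of the commit; the function name and the to_rebuild/docs parameters are invented, IndexBuilder is the class from the search module changed below):

from os import path

def rebuild_search_index(indexer, env, outdir, to_rebuild, docs):
    # load_indexer(): reuse the index dumped by the previous build, if any.
    try:
        f = open(path.join(outdir, 'searchindex.json'), 'r')
        try:
            indexer.load(f, 'json')
        finally:
            f.close()
    except (IOError, OSError):
        pass  # first build or unreadable index: start from an empty one
    # Delete all entries for files that will be rebuilt; prune() keeps
    # only the filenames it is given.
    indexer.prune(set(env.all_files) - set(to_rebuild))
    # index_file(): re-feed the rebuilt files as they are written.
    for filename, (category, title, doctree) in docs.iteritems():
        indexer.feed(filename, category, title, doctree)
    # handle_finish(): drop entries for deleted sources and dump everything
    # (the standalone builder prunes on target URIs here, the web builder
    # on filenames, as shown in the hunks above).
    indexer.prune(env.all_files)
    f = open(path.join(outdir, 'searchindex.json'), 'w')
    try:
        indexer.dump(f, 'json')
    finally:
        f.close()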

View File

@@ -14,7 +14,7 @@ import pickle
from collections import defaultdict
from docutils.nodes import Text, NodeVisitor
from .util.stemmer import PorterStemmer
from .util.json import dump_json
from .util.json import dump_json, load_json
word_re = re.compile(r'\w+(?u)')
@@ -50,47 +50,71 @@ class IndexBuilder(object):
passed to the `feed` method.
"""
formats = {
'json': dump_json,
'pickle': pickle.dumps
'json': (dump_json, load_json),
'pickle': (pickle.dumps, pickle.loads),
}
def __init__(self):
self._filenames = {}
self._mapping = {}
self._titles = {}
self._categories = {}
self._stemmer = Stemmer()
# filename -> title
self._titles = {}
# stemmed word -> set(filenames)
self._mapping = {}
# category -> set(filenames)
self._categories = {}
def load(self, stream, format):
"""Reconstruct from frozen data."""
frozen = self.formats[format][1](stream.read())
index2fn = frozen[0]
self._titles = dict(zip(frozen[0], frozen[2]))
self._categories = dict((k, set(index2fn[i] for i in v))
for (k, v) in frozen[1].iteritems())
self._mapping = dict((k, set(index2fn[i] for i in v))
for (k, v) in frozen[3].iteritems())
def dump(self, stream, format):
"""Dump the freezed index to a stream."""
stream.write(self.formats[format](self.freeze()))
"""Dump the frozen index to a stream."""
stream.write(self.formats[format][0](self.freeze()))
def freeze(self):
"""
Create a useable data structure. You can pass this output
to the `SearchFrontend` to search the index.
"""
fns, titles = self._titles.keys(), self._titles.values()
fn2index = dict((f, i) for (i, f) in enumerate(fns))
return [
[k for k, v in sorted(self._filenames.items(),
key=lambda x: x[1])],
dict(item for item in sorted(self._categories.items(),
key=lambda x: x[0])),
[v for k, v in sorted(self._titles.items(),
key=lambda x: x[0])],
dict(item for item in sorted(self._mapping.items(),
key=lambda x: x[0])),
fns,
dict((k, [fn2index[fn] for fn in v])
for (k, v) in self._categories.iteritems()),
titles,
dict((k, [fn2index[fn] for fn in v])
for (k, v) in self._mapping.iteritems()),
]
def prune(self, filenames):
"""Remove data for all filenames not in the list."""
new_titles = {}
for filename in filenames:
if filename in self._titles:
new_titles[filename] = self._titles[filename]
self._titles = new_titles
for wordnames in self._mapping.itervalues():
wordnames.intersection_update(filenames)
for catnames in self._categories.itervalues():
catnames.intersection_update(filenames)
def feed(self, filename, category, title, doctree):
"""Feed a doctree to the index."""
file_id = self._filenames.setdefault(filename, len(self._filenames))
self._titles[file_id] = title
self._titles[filename] = title
self._categories.setdefault(category, set()).add(filename)
visitor = WordCollector(doctree)
doctree.walk(visitor)
self._categories.setdefault(category, set()).add(file_id)
for word in word_re.findall(title) + visitor.found_words:
self._mapping.setdefault(self._stemmer.stem(word.lower()),
set()).add(file_id)
set()).add(filename)
class SearchFrontend(object):
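
The frozen form produced by freeze(), consumed both by load() above and by the JavaScript search frontend, is a four-element list: the filenames, a category-to-file-index mapping, the titles (parallel to the filenames), and a stemmed-word-to-file-index mapping. For two made-up files it looks roughly like this (data invented for illustration; the stemmed keys are not meant to be exact Porter stemmer output):

frozen = [
    ['tutorial/introduction', 'library/itertools'],     # filenames (index 0, 1)
    {'tutorial': [0], 'library': [1]},                   # category -> file indices
    ['An Informal Introduction', 'Iterator Functions'],  # titles, parallel to the filenames
    {'loop': [0, 1], 'python': [0]},                     # stemmed word -> file indices
]
# load() turns the index lists back into filename-keyed sets, e.g.
# self._mapping['loop'] == set(['tutorial/introduction', 'library/itertools'])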

View File

@@ -424,5 +424,5 @@ var Search = {
}
$(document).ready(function() {
Documentation.Search.init();
Search.init();
});

View File

@@ -1,6 +1,6 @@
{% extends "layout.html" %}
{% set title = 'Search Documentation' %}
{% block header %}
{% block head %}
<script type="text/javascript" src="{{ pathto('style/searchtools.js', 1) }}"></script>
{% endblock %}
{% block body %}
@@ -26,13 +26,13 @@
('tutorial', 'Python Tutorial', true),
('library', 'Library Reference', true),
('maclib', 'Macintosh Library Modules', false),
('reference', 'Language Reference', false),
('extending', 'Extending and Embedding', false),
('c-api', 'Python/C API', false),
('install', 'Installing Python Modules', true),
('distutils', 'Distributing Python Modules', true),
('documenting', 'Documenting Python', false),
('whatsnew', 'What\'s new in Python?', false),
('reference', 'Language Reference', false)
] -%}
<li><input type="checkbox" name="area" id="area-{{ id }}" value="{{ id
}}"{% if checked %} checked{% endif %}>

View File

@@ -16,7 +16,7 @@
import re
ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]')
# escape \, ", control characters and everything outside ASCII
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
ESCAPE_DICT = {
'\\': '\\\\',
@@ -27,8 +27,6 @@ ESCAPE_DICT = {
'\r': '\\r',
'\t': '\\t',
}
for i in range(0x20):
ESCAPE_DICT.setdefault(chr(i), '\\u%04x' % (i,))
def encode_basestring_ascii(s):
@@ -70,3 +68,11 @@ def dump_json(obj, key=False):
elif isinstance(obj, basestring):
return encode_basestring_ascii(obj)
raise TypeError(type(obj))
STRING = re.compile(r'("(\\\\|\\"|[^"])*")')
def load_json(s):
d = {'null': None, 'true': True, 'false': False}
s = STRING.sub(r'u\1', s)
return eval(s, d)
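
load_json only needs to read back what dump_json writes for the search index: it prefixes every string literal with u, so eval() yields unicode strings, and evaluates the result with null, true and false bound to their Python equivalents. A round-trip example (illustrative; it assumes the parts of dump_json elided above handle dicts, lists and ints, which is what the frozen search index consists of):

frozen_str = dump_json({'titles': ['Library Reference'], 'loop': [0, 1]})
# frozen_str is a plain JSON string such as '{"titles": ["Library Reference"], "loop": [0, 1]}'
data = load_json(frozen_str)
# data == {u'titles': [u'Library Reference'], u'loop': [0, 1]}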