Fix searching and search index creation for incremental builds.

This commit is contained in:
Georg Brandl
2007-08-09 19:22:20 +00:00
parent be36a0f85d
commit b1271fa623
6 changed files with 121 additions and 85 deletions

View File

@@ -26,11 +26,13 @@ def usage(argv, msg=None):
print >>sys.stderr, """\ print >>sys.stderr, """\
usage: %s [options] sourcedir outdir [filenames...]" usage: %s [options] sourcedir outdir [filenames...]"
options: -b <builder> -- builder to use (one of %s) options: -b <builder> -- builder to use (one of %s)
-a -- write all files; default is to only write new and changed files -a -- write all files; default is to only write new and changed files
-d <path> -- path for the cached doctree files (default outdir/.doctrees) -E -- don't use a saved environment, always read all files
-d <path> -- path for the cached environment and doctree files
(default outdir/.doctrees)
-O <option[=value]> -- give option to to the builder (-O help for list) -O <option[=value]> -- give option to to the builder (-O help for list)
-D <setting=value> -- override a setting in sourcedir/conf.py -D <setting=value> -- override a setting in sourcedir/conf.py
-N -- do not do colored output -N -- do not do colored output
modi: modi:
* without -a and without filenames, write new and changed files. * without -a and without filenames, write new and changed files.
* with -a, write all files. * with -a, write all files.
@@ -39,7 +41,7 @@ modi:
def main(argv): def main(argv):
try: try:
opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:N') opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:NE')
srcdirname = path.abspath(args[0]) srcdirname = path.abspath(args[0])
if not path.isdir(srcdirname): if not path.isdir(srcdirname):
print >>sys.stderr, 'Error: Cannot find source directory.' print >>sys.stderr, 'Error: Cannot find source directory.'
@@ -65,7 +67,7 @@ def main(argv):
return 1 return 1
builder = all_files = None builder = all_files = None
opt_help = False opt_help = freshenv = False
options = {} options = {}
confoverrides = {} confoverrides = {}
doctreedir = path.join(outdirname, '.doctrees') doctreedir = path.join(outdirname, '.doctrees')
@@ -102,6 +104,8 @@ def main(argv):
confoverrides[key] = val confoverrides[key] = val
elif opt == '-N': elif opt == '-N':
nocolor() nocolor()
elif opt == '-E':
freshenv = True
if not sys.stdout.isatty() or sys.platform == 'win32': if not sys.stdout.isatty() or sys.platform == 'win32':
# Windows' cmd box doesn't understand ANSI sequences # Windows' cmd box doesn't understand ANSI sequences
@@ -122,7 +126,8 @@ def main(argv):
builderobj = builderobj(srcdirname, outdirname, doctreedir, options, builderobj = builderobj(srcdirname, outdirname, doctreedir, options,
status_stream=sys.stdout, status_stream=sys.stdout,
warning_stream=sys.stderr, warning_stream=sys.stderr,
confoverrides=confoverrides) confoverrides=confoverrides,
freshenv=freshenv)
if all_files: if all_files:
builderobj.build_all() builderobj.build_all()
elif filenames: elif filenames:

View File

@@ -72,18 +72,18 @@ class Builder(object):
Builds target formats from the reST sources. Builds target formats from the reST sources.
""" """
option_spec = { option_spec = {}
'freshenv': 'Don\'t use a pickled environment',
}
def __init__(self, srcdirname, outdirname, doctreedirname, def __init__(self, srcdirname, outdirname, doctreedirname,
options, confoverrides=None, env=None, options, confoverrides=None, env=None,
status_stream=None, warning_stream=None): status_stream=None, warning_stream=None,
freshenv=False):
self.srcdir = srcdirname self.srcdir = srcdirname
self.outdir = outdirname self.outdir = outdirname
self.doctreedir = doctreedirname self.doctreedir = doctreedirname
if not path.isdir(doctreedirname): if not path.isdir(doctreedirname):
os.mkdir(doctreedirname) os.mkdir(doctreedirname)
self.freshenv = freshenv
self.options = attrdict(options) self.options = attrdict(options)
self.validate_options() self.validate_options()
@@ -161,7 +161,7 @@ class Builder(object):
successfully loaded, False if a new environment had to be created.""" successfully loaded, False if a new environment had to be created."""
if self.env: if self.env:
return return
if not self.options.freshenv: if not self.freshenv:
try: try:
self.msg('trying to load pickled env...', nonl=True) self.msg('trying to load pickled env...', nonl=True)
self.env = BuildEnvironment.frompickle( self.env = BuildEnvironment.frompickle(
@@ -223,8 +223,6 @@ class Builder(object):
self.msg('creating index...') self.msg('creating index...')
self.env.create_index(self) self.env.create_index(self)
self.prepare_writing()
if filenames: if filenames:
# add all TOC files that may have changed # add all TOC files that may have changed
filenames_set = set(filenames) filenames_set = set(filenames)
@@ -236,6 +234,8 @@ class Builder(object):
# build all # build all
filenames_set = set(self.env.all_files) filenames_set = set(self.env.all_files)
self.prepare_writing(filenames)
# write target files # write target files
with collect_env_warnings(self): with collect_env_warnings(self):
self.msg('writing output...') self.msg('writing output...')
@@ -249,7 +249,7 @@ class Builder(object):
self.finish() self.finish()
self.msg('done!') self.msg('done!')
def prepare_writing(self): def prepare_writing(self, filenames):
raise NotImplementedError raise NotImplementedError
def write_file(self, filename, doctree): def write_file(self, filename, doctree):
@@ -265,12 +265,6 @@ class StandaloneHTMLBuilder(Builder):
""" """
name = 'html' name = 'html'
option_spec = Builder.option_spec
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a JSON search index for offline search',
})
copysource = True copysource = True
def init(self): def init(self):
@@ -301,12 +295,10 @@ class StandaloneHTMLBuilder(Builder):
settings_overrides={'output_encoding': 'unicode'} settings_overrides={'output_encoding': 'unicode'}
) )
def prepare_writing(self): def prepare_writing(self, filenames):
if not self.options.nosearchindex: from .search import IndexBuilder
from .search import IndexBuilder self.indexer = IndexBuilder()
self.indexer = IndexBuilder() self.load_indexer(filenames)
else:
self.indexer = None
self.docwriter = HTMLWriter(self.config) self.docwriter = HTMLWriter(self.config)
self.docsettings = OptionParser( self.docsettings = OptionParser(
defaults=self.env.settings, defaults=self.env.settings,
@@ -463,20 +455,19 @@ class StandaloneHTMLBuilder(Builder):
) )
self.handle_file('search.rst', searchcontext, 'search') self.handle_file('search.rst', searchcontext, 'search')
if not self.options.nostyle: # copy style files
self.msg('copying style files...') self.msg('copying style files...')
# copy style files styledirname = path.join(path.dirname(__file__), 'style')
styledirname = path.join(path.dirname(__file__), 'style') ensuredir(path.join(self.outdir, 'style'))
ensuredir(path.join(self.outdir, 'style')) for filename in os.listdir(styledirname):
for filename in os.listdir(styledirname): if not filename.startswith('.'):
if not filename.startswith('.'): shutil.copyfile(path.join(styledirname, filename),
shutil.copyfile(path.join(styledirname, filename), path.join(self.outdir, 'style', filename))
path.join(self.outdir, 'style', filename)) # add pygments style file
# add pygments style file f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w') if pygments:
if pygments: f.write(get_stylesheet())
f.write(get_stylesheet()) f.close()
f.close()
# dump the search index # dump the search index
self.handle_finish() self.handle_finish()
@@ -497,6 +488,16 @@ class StandaloneHTMLBuilder(Builder):
if path.getmtime(path.join(self.srcdir, filename)) > targetmtime: if path.getmtime(path.join(self.srcdir, filename)) > targetmtime:
yield filename yield filename
def load_indexer(self, filenames):
    """Seed ``self.indexer`` with the search index of the previous build.

    Reads ``searchindex.json`` from the output directory (if present) and
    then drops the entries of every file that is about to be rebuilt, so
    that only data for untouched files is carried over.

    :param filenames: source files that will be rewritten in this run.
        A falsy value (``None`` or empty) is treated as "everything is
        rebuilt", matching the ``if filenames: ... # build all`` branches
        of ``build()``, which forwards the value unchanged.
    """
    index_path = path.join(self.outdir, 'searchindex.json')
    try:
        with open(index_path, 'r') as f:
            self.indexer.load(f, 'json')
    except (IOError, OSError):
        # No readable index from an earlier build: start with an empty one.
        pass
    # delete all entries for files that will be rebuilt
    all_files = set(self.env.all_files)
    # set(None) would raise TypeError, and an empty selection must not keep
    # stale entries for files that are all going to be re-indexed anyway.
    rebuilt = set(filenames) if filenames else all_files
    # NOTE(review): handle_finish() prunes with get_target_uri(fn)[:-5] keys
    # while this uses the raw names from env.all_files -- confirm that both
    # match the keys index_file() feeds into the indexer.
    self.indexer.prune(all_files - rebuilt)
def index_file(self, filename, doctree, title): def index_file(self, filename, doctree, title):
# only index pages with title # only index pages with title
if self.indexer is not None and title: if self.indexer is not None and title:
@@ -522,11 +523,10 @@ class StandaloneHTMLBuilder(Builder):
path.join(self.outdir, context['sourcename'])) path.join(self.outdir, context['sourcename']))
def handle_finish(self): def handle_finish(self):
if self.indexer is not None: self.msg('dumping search index...')
self.msg('dumping search index...') self.indexer.prune([self.get_target_uri(fn)[:-5] for fn in self.env.all_files])
f = open(path.join(self.outdir, 'searchindex.json'), 'w') with open(path.join(self.outdir, 'searchindex.json'), 'w') as f:
self.indexer.dump(f, 'json') self.indexer.dump(f, 'json')
f.close()
class WebHTMLBuilder(StandaloneHTMLBuilder): class WebHTMLBuilder(StandaloneHTMLBuilder):
@@ -535,13 +535,6 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
""" """
name = 'web' name = 'web'
# doesn't use the standalone specific options
option_spec = Builder.option_spec.copy()
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a search index for the online search',
})
def init(self): def init(self):
# Nothing to do here. # Nothing to do here.
pass pass
@@ -564,6 +557,15 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
return source_filename[:-9] # up to / return source_filename[:-9] # up to /
return source_filename[:-4] + '/' return source_filename[:-4] + '/'
def load_indexer(self, filenames):
    """Seed ``self.indexer`` with the pickled index of the previous build.

    Reads ``searchindex.pickle`` from the output directory (if present) and
    then drops the entries of every file that is about to be rebuilt.

    :param filenames: source files that will be rewritten in this run.
        A falsy value (``None`` or empty) is treated as "everything is
        rebuilt", matching the ``if filenames: ... # build all`` branches
        of ``build()``, which forwards the value unchanged.
    """
    index_path = path.join(self.outdir, 'searchindex.pickle')
    try:
        # The index is written in binary mode, so it must be read back in
        # binary mode as well; text mode can corrupt or fail to decode it.
        # NOTE(security): the 'pickle' format is decoded with pickle.loads,
        # which can execute code -- only ever point outdir at trusted data.
        with open(index_path, 'rb') as f:
            self.indexer.load(f, 'pickle')
    except (IOError, OSError):
        # No readable index from an earlier build: start with an empty one.
        pass
    # delete all entries for files that will be rebuilt
    all_files = set(self.env.all_files)
    # set(None) would raise TypeError, and an empty selection must not keep
    # stale entries for files that are all going to be re-indexed anyway.
    rebuilt = set(filenames) if filenames else all_files
    self.indexer.prune(all_files - rebuilt)
def index_file(self, filename, doctree, title): def index_file(self, filename, doctree, title):
# only index pages with title and category # only index pages with title and category
if self.indexer is not None and title: if self.indexer is not None and title:
@@ -590,11 +592,11 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
with file(outfilename, 'wb') as fp: with file(outfilename, 'wb') as fp:
pickle.dump(self.globalcontext, fp, 2) pickle.dump(self.globalcontext, fp, 2)
if self.indexer is not None: self.msg('dumping search index...')
self.msg('dumping search index...') self.indexer.prune(self.env.all_files)
f = open(path.join(self.outdir, 'searchindex.pickle'), 'w') with open(path.join(self.outdir, 'searchindex.pickle'), 'wb') as f:
self.indexer.dump(f, 'pickle') self.indexer.dump(f, 'pickle')
f.close()
# touch 'last build' file, used by the web application to determine # touch 'last build' file, used by the web application to determine
# when to reload its environment and clear the cache # when to reload its environment and clear the cache
open(path.join(self.outdir, LAST_BUILD_FILENAME), 'w').close() open(path.join(self.outdir, LAST_BUILD_FILENAME), 'w').close()
@@ -611,10 +613,9 @@ class HTMLHelpBuilder(StandaloneHTMLBuilder):
""" """
name = 'htmlhelp' name = 'htmlhelp'
option_spec = Builder.option_spec.copy() option_spec = {
option_spec.update({
'outname': 'Output file base name (default "pydoc")' 'outname': 'Output file base name (default "pydoc")'
}) }
# don't copy the reST source # don't copy the reST source
copysource = False copysource = False

View File

@@ -14,7 +14,7 @@ import pickle
from collections import defaultdict from collections import defaultdict
from docutils.nodes import Text, NodeVisitor from docutils.nodes import Text, NodeVisitor
from .util.stemmer import PorterStemmer from .util.stemmer import PorterStemmer
from .util.json import dump_json from .util.json import dump_json, load_json
word_re = re.compile(r'\w+(?u)') word_re = re.compile(r'\w+(?u)')
@@ -50,47 +50,71 @@ class IndexBuilder(object):
passed to the `feed` method. passed to the `feed` method.
""" """
formats = { formats = {
'json': dump_json, 'json': (dump_json, load_json),
'pickle': pickle.dumps 'pickle': (pickle.dumps, pickle.loads),
} }
def __init__(self): def __init__(self):
self._filenames = {}
self._mapping = {}
self._titles = {}
self._categories = {}
self._stemmer = Stemmer() self._stemmer = Stemmer()
# filename -> title
self._titles = {}
# stemmed word -> set(filenames)
self._mapping = {}
# category -> set(filenames)
self._categories = {}
def load(self, stream, format):
    """Restore the index tables from data previously written by `dump`.

    The whole stream is read and decoded with the loader registered for
    `format` in ``self.formats``.  The decoded value is indexed as
    ``[filenames, categories, titles, mapping]``, where the two tables map
    a key to a list of positions in ``filenames``.
    """
    decode = self.formats[format][1]
    data = decode(stream.read())
    names = data[0]

    def expand(table):
        # {key: [file index, ...]}  ->  {key: set([filename, ...])}
        expanded = {}
        for key, positions in table.iteritems():
            expanded[key] = set(names[pos] for pos in positions)
        return expanded

    # titles are stored in the same order as the filename list
    self._titles = dict(zip(names, data[2]))
    self._categories = expand(data[1])
    self._mapping = expand(data[3])
def dump(self, stream, format): def dump(self, stream, format):
"""Dump the freezed index to a stream.""" """Dump the frozen index to a stream."""
stream.write(self.formats[format](self.freeze())) stream.write(self.formats[format][0](self.freeze()))
def freeze(self): def freeze(self):
""" """
Create a useable data structure. You can pass this output Create a useable data structure. You can pass this output
to the `SearchFrontend` to search the index. to the `SearchFrontend` to search the index.
""" """
fns, titles = self._titles.keys(), self._titles.values()
fn2index = dict((f, i) for (i, f) in enumerate(fns))
return [ return [
[k for k, v in sorted(self._filenames.items(), fns,
key=lambda x: x[1])], dict((k, [fn2index[fn] for fn in v])
dict(item for item in sorted(self._categories.items(), for (k, v) in self._categories.iteritems()),
key=lambda x: x[0])), titles,
[v for k, v in sorted(self._titles.items(), dict((k, [fn2index[fn] for fn in v])
key=lambda x: x[0])], for (k, v) in self._mapping.iteritems()),
dict(item for item in sorted(self._mapping.items(),
key=lambda x: x[0])),
] ]
def prune(self, filenames):
    """Remove data for all filenames not in `filenames`.

    Titles of other files are dropped, and every word and category set is
    reduced in place to the filenames that are kept.  Keys whose set
    becomes empty stay in their table.

    :param filenames: any iterable of the filenames to keep.
    """
    # Build the lookup set once: callers pass lists as well as sets, and a
    # one-shot iterable would otherwise be exhausted after the first pass,
    # silently emptying every word and category set.
    keep = set(filenames)
    self._titles = dict((filename, title)
                        for (filename, title) in self._titles.items()
                        if filename in keep)
    for wordnames in self._mapping.values():
        wordnames.intersection_update(keep)
    for catnames in self._categories.values():
        catnames.intersection_update(keep)
def feed(self, filename, category, title, doctree): def feed(self, filename, category, title, doctree):
"""Feed a doctree to the index.""" """Feed a doctree to the index."""
file_id = self._filenames.setdefault(filename, len(self._filenames)) self._titles[filename] = title
self._titles[file_id] = title self._categories.setdefault(category, set()).add(filename)
visitor = WordCollector(doctree) visitor = WordCollector(doctree)
doctree.walk(visitor) doctree.walk(visitor)
self._categories.setdefault(category, set()).add(file_id)
for word in word_re.findall(title) + visitor.found_words: for word in word_re.findall(title) + visitor.found_words:
self._mapping.setdefault(self._stemmer.stem(word.lower()), self._mapping.setdefault(self._stemmer.stem(word.lower()),
set()).add(file_id) set()).add(filename)
class SearchFrontend(object): class SearchFrontend(object):

View File

@@ -424,5 +424,5 @@ var Search = {
} }
$(document).ready(function() { $(document).ready(function() {
Documentation.Search.init(); Search.init();
}); });

View File

@@ -1,6 +1,6 @@
{% extends "layout.html" %} {% extends "layout.html" %}
{% set title = 'Search Documentation' %} {% set title = 'Search Documentation' %}
{% block header %} {% block head %}
<script type="text/javascript" src="{{ pathto('style/searchtools.js', 1) }}"></script> <script type="text/javascript" src="{{ pathto('style/searchtools.js', 1) }}"></script>
{% endblock %} {% endblock %}
{% block body %} {% block body %}
@@ -26,13 +26,13 @@
('tutorial', 'Python Tutorial', true), ('tutorial', 'Python Tutorial', true),
('library', 'Library Reference', true), ('library', 'Library Reference', true),
('maclib', 'Macintosh Library Modules', false), ('maclib', 'Macintosh Library Modules', false),
('reference', 'Language Reference', false),
('extending', 'Extending and Embedding', false), ('extending', 'Extending and Embedding', false),
('c-api', 'Python/C API', false), ('c-api', 'Python/C API', false),
('install', 'Installing Python Modules', true), ('install', 'Installing Python Modules', true),
('distutils', 'Distributing Python Modules', true), ('distutils', 'Distributing Python Modules', true),
('documenting', 'Documenting Python', false), ('documenting', 'Documenting Python', false),
('whatsnew', 'What\'s new in Python?', false), ('whatsnew', 'What\'s new in Python?', false),
('reference', 'Language Reference', false)
] -%} ] -%}
<li><input type="checkbox" name="area" id="area-{{ id }}" value="{{ id <li><input type="checkbox" name="area" id="area-{{ id }}" value="{{ id
}}"{% if checked %} checked{% endif %}> }}"{% if checked %} checked{% endif %}>

View File

@@ -16,7 +16,7 @@
import re import re
ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]') # escape \, ", control characters and everything outside ASCII
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
ESCAPE_DICT = { ESCAPE_DICT = {
'\\': '\\\\', '\\': '\\\\',
@@ -27,8 +27,6 @@ ESCAPE_DICT = {
'\r': '\\r', '\r': '\\r',
'\t': '\\t', '\t': '\\t',
} }
for i in range(0x20):
ESCAPE_DICT.setdefault(chr(i), '\\u%04x' % (i,))
def encode_basestring_ascii(s): def encode_basestring_ascii(s):
@@ -70,3 +68,11 @@ def dump_json(obj, key=False):
elif isinstance(obj, basestring): elif isinstance(obj, basestring):
return encode_basestring_ascii(obj) return encode_basestring_ascii(obj)
raise TypeError(type(obj)) raise TypeError(type(obj))
# A double-quoted JSON string literal; ``\\`` and ``\"`` are consumed as
# escape pairs so that an escaped quote does not end the match.
STRING = re.compile(r'("(\\\\|\\"|[^"])*")')


def load_json(s):
    """Parse the JSON text `s` and return the corresponding Python object.

    Every string literal is prefixed with ``u`` and the text is then
    evaluated as a Python expression in which ``null``, ``true`` and
    ``false`` are bound to ``None``, ``True`` and ``False``.

    Raises whatever ``eval`` raises for text that is not a valid expression
    (e.g. ``SyntaxError``) or that refers to any other name (``NameError``).
    """
    # NOTE(security): this relies on eval() and must only be fed data that
    # this project wrote itself.  Builtins are blanked out so that a
    # tampered file cannot simply call __import__(), open() and friends;
    # that is a mitigation, not a sandbox.
    namespace = {
        'null': None,
        'true': True,
        'false': False,
        '__builtins__': {},
    }
    s = STRING.sub(r'u\1', s)
    return eval(s, namespace)