Fix searching and search index creation for incremental builds.

2025-02-25 18:55:22 -06:00 · 2007-08-09 19:22:20 +00:00
parent be36a0f85d
commit b1271fa623
6 changed files with 121 additions and 85 deletions
--- a/sphinx/init.py
+++ b/sphinx/init.py
@@ -26,11 +26,13 @@ def usage(argv, msg=None):
    print >>sys.stderr, """\
 usage: %s [options] sourcedir outdir [filenames...]"
 options: -b <builder> -- builder to use (one of %s)
-         -a -- write all files; default is to only write new and changed files
+         -a        -- write all files; default is to only write new and changed files
-         -d <path> -- path for the cached doctree files (default outdir/.doctrees)
+         -E        -- don't use a saved environment, always read all files
         -d <path> -- path for the cached environment and doctree files
                      (default outdir/.doctrees)
         -O <option[=value]> -- give option to to the builder (-O help for list)
         -D <setting=value> -- override a setting in sourcedir/conf.py
-         -N -- do not do colored output
+         -N        -- do not do colored output
 modi:
 * without -a and without filenames, write new and changed files.
 * with -a, write all files.
@@ -39,7 +41,7 @@ modi:
 def main(argv):
    try:
-        opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:N')
+        opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:NE')
        srcdirname = path.abspath(args[0])
        if not path.isdir(srcdirname):
            print >>sys.stderr, 'Error: Cannot find source directory.'
@@ -65,7 +67,7 @@ def main(argv):
        return 1
    builder = all_files = None
-    opt_help = False
+    opt_help = freshenv = False
    options = {}
    confoverrides = {}
    doctreedir = path.join(outdirname, '.doctrees')
@@ -102,6 +104,8 @@ def main(argv):
            confoverrides[key] = val
        elif opt == '-N':
            nocolor()
        elif opt == '-E':
            freshenv = True
    if not sys.stdout.isatty() or sys.platform == 'win32':
        # Windows' cmd box doesn't understand ANSI sequences
@@ -122,7 +126,8 @@ def main(argv):
    builderobj = builderobj(srcdirname, outdirname, doctreedir, options,
                            status_stream=sys.stdout,
                            warning_stream=sys.stderr,
-                            confoverrides=confoverrides)
+                            confoverrides=confoverrides,
                            freshenv=freshenv)
    if all_files:
        builderobj.build_all()
    elif filenames:
--- a/sphinx/builder.py
+++ b/sphinx/builder.py
@@ -72,18 +72,18 @@ class Builder(object):
    Builds target formats from the reST sources.
    """
-    option_spec = {
+    option_spec = {}
        'freshenv': 'Don\'t use a pickled environment',
    }
    def __init__(self, srcdirname, outdirname, doctreedirname,
                 options, confoverrides=None, env=None,
-                 status_stream=None, warning_stream=None):
+                 status_stream=None, warning_stream=None,
                 freshenv=False):
        self.srcdir = srcdirname
        self.outdir = outdirname
        self.doctreedir = doctreedirname
        if not path.isdir(doctreedirname):
            os.mkdir(doctreedirname)
        self.freshenv = freshenv
        self.options = attrdict(options)
        self.validate_options()
@@ -161,7 +161,7 @@ class Builder(object):
           successfully loaded, False if a new environment had to be created."""
        if self.env:
            return
-        if not self.options.freshenv:
+        if not self.freshenv:
            try:
                self.msg('trying to load pickled env...', nonl=True)
                self.env = BuildEnvironment.frompickle(
@@ -223,8 +223,6 @@ class Builder(object):
        self.msg('creating index...')
        self.env.create_index(self)
        self.prepare_writing()
        if filenames:
            # add all TOC files that may have changed
            filenames_set = set(filenames)
@@ -236,6 +234,8 @@ class Builder(object):
            # build all
            filenames_set = set(self.env.all_files)
        self.prepare_writing(filenames)
        # write target files
        with collect_env_warnings(self):
            self.msg('writing output...')
@@ -249,7 +249,7 @@ class Builder(object):
        self.finish()
        self.msg('done!')
-    def prepare_writing(self):
+    def prepare_writing(self, filenames):
        raise NotImplementedError
    def write_file(self, filename, doctree):
@@ -265,12 +265,6 @@ class StandaloneHTMLBuilder(Builder):
    """
    name = 'html'
    option_spec = Builder.option_spec
    option_spec.update({
        'nostyle': 'Don\'t copy style and script files',
        'nosearchindex': 'Don\'t create a JSON search index for offline search',
    })
    copysource = True
    def init(self):
@@ -301,12 +295,10 @@ class StandaloneHTMLBuilder(Builder):
            settings_overrides={'output_encoding': 'unicode'}
        )
-    def prepare_writing(self):
+    def prepare_writing(self, filenames):
-        if not self.options.nosearchindex:
+        from .search import IndexBuilder
-            from .search import IndexBuilder
+        self.indexer = IndexBuilder()
-            self.indexer = IndexBuilder()
+        self.load_indexer(filenames)
        else:
            self.indexer = None
        self.docwriter = HTMLWriter(self.config)
        self.docsettings = OptionParser(
            defaults=self.env.settings,
@@ -463,20 +455,19 @@ class StandaloneHTMLBuilder(Builder):
        )
        self.handle_file('search.rst', searchcontext, 'search')
-        if not self.options.nostyle:
+        # copy style files
-            self.msg('copying style files...')
+        self.msg('copying style files...')
-            # copy style files
+        styledirname = path.join(path.dirname(__file__), 'style')
-            styledirname = path.join(path.dirname(__file__), 'style')
+        ensuredir(path.join(self.outdir, 'style'))
-            ensuredir(path.join(self.outdir, 'style'))
+        for filename in os.listdir(styledirname):
-            for filename in os.listdir(styledirname):
+            if not filename.startswith('.'):
-                if not filename.startswith('.'):
+                shutil.copyfile(path.join(styledirname, filename),
-                    shutil.copyfile(path.join(styledirname, filename),
+                                path.join(self.outdir, 'style', filename))
-                                    path.join(self.outdir, 'style', filename))
+        # add pygments style file
-            # add pygments style file
+        f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
-            f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
+        if pygments:
-            if pygments:
+            f.write(get_stylesheet())
-                f.write(get_stylesheet())
+        f.close()
            f.close()
        # dump the search index
        self.handle_finish()
@@ -497,6 +488,16 @@ class StandaloneHTMLBuilder(Builder):
            if path.getmtime(path.join(self.srcdir, filename)) > targetmtime:
                yield filename
    def load_indexer(self, filenames):
        """Seed the search index from the previous build's ``searchindex.json``.

        `filenames` are the source files that are about to be rewritten;
        their old entries are removed so they can be indexed afresh.  A
        falsy value (``None`` or an empty sequence) means that everything
        is rebuilt -- the calling build code falls back to all files in
        that case -- so no entry of the old index is kept.
        """
        rebuilt = set(filenames or ())
        if not rebuilt:
            # full rebuild: every file gets re-indexed, keep no stale entries
            self.indexer.prune(set())
            return
        try:
            with open(path.join(self.outdir, 'searchindex.json'), 'r') as f:
                self.indexer.load(f, 'json')
        except (IOError, OSError):
            # no readable index from an earlier build; start from scratch
            pass
        # delete all entries for files that will be rebuilt
        self.indexer.prune(set(self.env.all_files) - rebuilt)
    def index_file(self, filename, doctree, title):
        # only index pages with title
        if self.indexer is not None and title:
@@ -522,11 +523,10 @@ class StandaloneHTMLBuilder(Builder):
                            path.join(self.outdir, context['sourcename']))
    def handle_finish(self):
-        if self.indexer is not None:
+        self.msg('dumping search index...')
-            self.msg('dumping search index...')
+        self.indexer.prune([self.get_target_uri(fn)[:-5] for fn in self.env.all_files])
-            f = open(path.join(self.outdir, 'searchindex.json'), 'w')
+        with open(path.join(self.outdir, 'searchindex.json'), 'w') as f:
            self.indexer.dump(f, 'json')
            f.close()
 class WebHTMLBuilder(StandaloneHTMLBuilder):
@@ -535,13 +535,6 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
    """
    name = 'web'
    # doesn't use the standalone specific options
    option_spec = Builder.option_spec.copy()
    option_spec.update({
        'nostyle': 'Don\'t copy style and script files',
        'nosearchindex': 'Don\'t create a search index for the online search',
    })
    def init(self):
        """Deliberate no-op: there is nothing to do here."""
        return None
@@ -564,6 +557,15 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
            return source_filename[:-9] # up to /
        return source_filename[:-4] + '/'
    def load_indexer(self, filenames):
        """Seed the search index from the previous build's ``searchindex.pickle``.

        `filenames` are the source files that are about to be rewritten;
        their old entries are removed so they can be indexed afresh.  A
        falsy value (``None`` or an empty sequence) means that everything
        is rebuilt -- the calling build code falls back to all files in
        that case -- so no entry of the old index is kept.
        """
        rebuilt = set(filenames or ())
        if not rebuilt:
            # full rebuild: every file gets re-indexed, keep no stale entries
            self.indexer.prune(set())
            return
        try:
            # The index is dumped in binary mode, so it must be read back in
            # binary mode as well; text mode can mangle pickle data.
            # NOTE(review): unpickling is only safe if the output directory
            # is trusted -- confirm nothing else can write this file.
            with open(path.join(self.outdir, 'searchindex.pickle'), 'rb') as f:
                self.indexer.load(f, 'pickle')
        except (IOError, OSError):
            # no readable index from an earlier build; start from scratch
            pass
        # delete all entries for files that will be rebuilt
        self.indexer.prune(set(self.env.all_files) - rebuilt)
    def index_file(self, filename, doctree, title):
        # only index pages with title and category
        if self.indexer is not None and title:
@@ -590,11 +592,11 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
        with file(outfilename, 'wb') as fp:
            pickle.dump(self.globalcontext, fp, 2)
-        if self.indexer is not None:
+        self.msg('dumping search index...')
-            self.msg('dumping search index...')
+        self.indexer.prune(self.env.all_files)
-            f = open(path.join(self.outdir, 'searchindex.pickle'), 'w')
+        with open(path.join(self.outdir, 'searchindex.pickle'), 'wb') as f:
            self.indexer.dump(f, 'pickle')
-            f.close()
+
        # touch 'last build' file, used by the web application to determine
        # when to reload its environment and clear the cache
        open(path.join(self.outdir, LAST_BUILD_FILENAME), 'w').close()
@@ -611,10 +613,9 @@ class HTMLHelpBuilder(StandaloneHTMLBuilder):
    """
    name = 'htmlhelp'
-    option_spec = Builder.option_spec.copy()
+    option_spec = {
    option_spec.update({
        'outname': 'Output file base name (default "pydoc")'
-    })
+    }
    # don't copy the reST source
    copysource = False
--- a/sphinx/search.py
+++ b/sphinx/search.py
@@ -14,7 +14,7 @@ import pickle
 from collections import defaultdict
 from docutils.nodes import Text, NodeVisitor
 from .util.stemmer import PorterStemmer
-from .util.json import dump_json
+from .util.json import dump_json, load_json
 word_re = re.compile(r'\w+(?u)')
@@ -50,47 +50,71 @@ class IndexBuilder(object):
    passed to the `feed` method.
    """
    formats = {
-        'json':     dump_json,
+        'json':     (dump_json, load_json),
-        'pickle':   pickle.dumps
+        'pickle':   (pickle.dumps, pickle.loads),
    }
    def __init__(self):
        """Create an empty index."""
        self._stemmer = Stemmer()
        # NOTE(review): looks like a leftover filename -> file id table;
        # confirm whether anything still reads it
        self._filenames = {}
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # category -> set(filenames)
        self._categories = {}
    def load(self, stream, format):
        """Reconstruct from frozen data."""
        frozen = self.formats[format][1](stream.read())
        index2fn = frozen[0]
        self._titles = dict(zip(frozen[0], frozen[2]))
        self._categories = dict((k, set(index2fn[i] for i in v))
                                for (k, v) in frozen[1].iteritems())
        self._mapping = dict((k, set(index2fn[i] for i in v))
                             for (k, v) in frozen[3].iteritems())
    def dump(self, stream, format):
-        """Dump the freezed index to a stream."""
+        """Dump the frozen index to a stream."""
-        stream.write(self.formats[format](self.freeze()))
+        stream.write(self.formats[format][0](self.freeze()))
    def freeze(self):
        """
        Create a useable data structure. You can pass this output
        to the `SearchFrontend` to search the index.
        """
        fns, titles = self._titles.keys(), self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(fns))
        return [
-            [k for k, v in sorted(self._filenames.items(),
+            fns,
-                                  key=lambda x: x[1])],
+            dict((k, [fn2index[fn] for fn in v])
-            dict(item for item in sorted(self._categories.items(),
+                 for (k, v) in self._categories.iteritems()),
-                                         key=lambda x: x[0])),
+            titles,
-            [v for k, v in sorted(self._titles.items(),
+            dict((k, [fn2index[fn] for fn in v])
-                                  key=lambda x: x[0])],
+                 for (k, v) in self._mapping.iteritems()),
            dict(item for item in sorted(self._mapping.items(),
                                         key=lambda x: x[0])),
        ]
    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in self._mapping.itervalues():
            wordnames.intersection_update(filenames)
        for catnames in self._categories.itervalues():
            catnames.intersection_update(filenames)
    def feed(self, filename, category, title, doctree):
        """Feed a doctree to the index."""
-        file_id = self._filenames.setdefault(filename, len(self._filenames))
+        self._titles[filename] = title
-        self._titles[file_id] = title
+        self._categories.setdefault(category, set()).add(filename)
        visitor = WordCollector(doctree)
        doctree.walk(visitor)
        self._categories.setdefault(category, set()).add(file_id)
        for word in word_re.findall(title) + visitor.found_words:
            self._mapping.setdefault(self._stemmer.stem(word.lower()),
-                                     set()).add(file_id)
+                                     set()).add(filename)
 class SearchFrontend(object):
--- a/sphinx/style/searchtools.js
+++ b/sphinx/style/searchtools.js
@@ -424,5 +424,5 @@ var Search = {
 }
 $(document).ready(function() {
-        Documentation.Search.init();
+        Search.init();
    });
--- a/sphinx/templates/search.html
+++ b/sphinx/templates/search.html
@@ -1,6 +1,6 @@
 {% extends "layout.html" %}
 {% set title = 'Search Documentation' %}
-{% block header %}
+{% block head %}
    <script type="text/javascript" src="{{ pathto('style/searchtools.js', 1) }}"></script>
 {% endblock %}
 {% block body %}
@@ -26,13 +26,13 @@
      ('tutorial', 'Python Tutorial', true),
      ('library', 'Library Reference', true),
      ('maclib', 'Macintosh Library Modules', false),
      ('reference', 'Language Reference', false),
      ('extending', 'Extending and Embedding', false),
      ('c-api', 'Python/C API', false),
      ('install', 'Installing Python Modules', true),
      ('distutils', 'Distributing Python Modules', true),
      ('documenting', 'Documenting Python', false),
      ('whatsnew', 'What\'s new in Python?', false),
      ('reference', 'Language Reference', false)
    ] -%}
      <li><input type="checkbox" name="area" id="area-{{ id }}" value="{{ id
          }}"{% if checked %} checked{% endif %}>
--- a/sphinx/util/json.py
+++ b/sphinx/util/json.py
@@ -16,7 +16,7 @@
 import re
-ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]')
+# escape \, ", control characters and everything outside ASCII
 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
 ESCAPE_DICT = {
    '\\': '\\\\',
@@ -27,8 +27,6 @@ ESCAPE_DICT = {
    '\r': '\\r',
    '\t': '\\t',
 }
 for i in range(0x20):
    ESCAPE_DICT.setdefault(chr(i), '\\u%04x' % (i,))
 def encode_basestring_ascii(s):
@@ -70,3 +68,11 @@ def dump_json(obj, key=False):
    elif isinstance(obj, basestring):
        return encode_basestring_ascii(obj)
    raise TypeError(type(obj))
 # matches a double-quoted string literal, allowing \\ and \" escapes inside
 STRING = re.compile(r'("(\\\\|\\"|[^"])*")')
 def load_json(s):
    """Parse the JSON text `s` and return the corresponding Python objects.

    String literals are turned into unicode literals and the JSON
    constants ``null``, ``true`` and ``false`` are mapped to ``None``,
    ``True`` and ``False``; the result is then evaluated as a Python
    expression.
    """
    # NOTE(security): this is eval(), not a real JSON parser.  Builtins are
    # emptied so the input cannot simply call __import__() or open(), but
    # this is not a sandbox -- only feed it files written by dump_json.
    namespace = {'null': None, 'true': True, 'false': False,
                 '__builtins__': {}}
    s = STRING.sub(r'u\1', s)
    return eval(s, namespace)