Fix searching and search index creation for incremental builds.

Georg Brandl
2007-08-09 19:22:20 +00:00
parent be36a0f85d
commit b1271fa623
6 changed files with 121 additions and 85 deletions

View File

@@ -26,11 +26,13 @@ def usage(argv, msg=None):
print >>sys.stderr, """\
usage: %s [options] sourcedir outdir [filenames...]
options: -b <builder> -- builder to use (one of %s)
-a -- write all files; default is to only write new and changed files
-d <path> -- path for the cached doctree files (default outdir/.doctrees)
-a -- write all files; default is to only write new and changed files
-E -- don't use a saved environment, always read all files
-d <path> -- path for the cached environment and doctree files
(default outdir/.doctrees)
-O <option[=value]> -- give option to the builder (-O help for list)
-D <setting=value> -- override a setting in sourcedir/conf.py
-N -- do not do colored output
-N -- do not do colored output
modi:
* without -a and without filenames, write new and changed files.
* with -a, write all files.
@@ -39,7 +41,7 @@ modi:
def main(argv):
try:
opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:N')
opts, args = getopt.getopt(argv[1:], 'ab:d:O:D:NE')
srcdirname = path.abspath(args[0])
if not path.isdir(srcdirname):
print >>sys.stderr, 'Error: Cannot find source directory.'
@@ -65,7 +67,7 @@ def main(argv):
return 1
builder = all_files = None
opt_help = False
opt_help = freshenv = False
options = {}
confoverrides = {}
doctreedir = path.join(outdirname, '.doctrees')
@@ -102,6 +104,8 @@ def main(argv):
confoverrides[key] = val
elif opt == '-N':
nocolor()
elif opt == '-E':
freshenv = True
if not sys.stdout.isatty() or sys.platform == 'win32':
# Windows' cmd box doesn't understand ANSI sequences
@@ -122,7 +126,8 @@ def main(argv):
builderobj = builderobj(srcdirname, outdirname, doctreedir, options,
status_stream=sys.stdout,
warning_stream=sys.stderr,
confoverrides=confoverrides)
confoverrides=confoverrides,
freshenv=freshenv)
if all_files:
builderobj.build_all()
elif filenames:
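
For reference, the command-line behavior after this change (the entry-point name sphinx-build.py is assumed here; the diff itself does not show the script's filename):

python sphinx-build.py -b html sourcedir outdir           # default: reuse the pickled environment, write new and changed files
python sphinx-build.py -a -b html sourcedir outdir        # reuse the pickled environment, but rewrite all output files
python sphinx-build.py -E -b html sourcedir outdir        # ignore any saved environment, re-read every source file
python sphinx-build.py -d cache -b html sourcedir outdir  # keep the environment pickle and doctrees in ./cache instead of outdir/.doctrees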

View File

@@ -72,18 +72,18 @@ class Builder(object):
Builds target formats from the reST sources.
"""
option_spec = {
'freshenv': 'Don\'t use a pickled environment',
}
option_spec = {}
def __init__(self, srcdirname, outdirname, doctreedirname,
options, confoverrides=None, env=None,
status_stream=None, warning_stream=None):
status_stream=None, warning_stream=None,
freshenv=False):
self.srcdir = srcdirname
self.outdir = outdirname
self.doctreedir = doctreedirname
if not path.isdir(doctreedirname):
os.mkdir(doctreedirname)
self.freshenv = freshenv
self.options = attrdict(options)
self.validate_options()
@@ -161,7 +161,7 @@ class Builder(object):
successfully loaded, False if a new environment had to be created."""
if self.env:
return
if not self.options.freshenv:
if not self.freshenv:
try:
self.msg('trying to load pickled env...', nonl=True)
self.env = BuildEnvironment.frompickle(
@@ -223,8 +223,6 @@ class Builder(object):
self.msg('creating index...')
self.env.create_index(self)
self.prepare_writing()
if filenames:
# add all TOC files that may have changed
filenames_set = set(filenames)
@@ -236,6 +234,8 @@ class Builder(object):
# build all
filenames_set = set(self.env.all_files)
self.prepare_writing(filenames)
# write target files
with collect_env_warnings(self):
self.msg('writing output...')
@@ -249,7 +249,7 @@ class Builder(object):
self.finish()
self.msg('done!')
def prepare_writing(self):
def prepare_writing(self, filenames):
raise NotImplementedError
def write_file(self, filename, doctree):
@@ -265,12 +265,6 @@ class StandaloneHTMLBuilder(Builder):
"""
name = 'html'
option_spec = Builder.option_spec
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a JSON search index for offline search',
})
copysource = True
def init(self):
@@ -301,12 +295,10 @@ class StandaloneHTMLBuilder(Builder):
settings_overrides={'output_encoding': 'unicode'}
)
def prepare_writing(self):
if not self.options.nosearchindex:
from .search import IndexBuilder
self.indexer = IndexBuilder()
else:
self.indexer = None
def prepare_writing(self, filenames):
from .search import IndexBuilder
self.indexer = IndexBuilder()
self.load_indexer(filenames)
self.docwriter = HTMLWriter(self.config)
self.docsettings = OptionParser(
defaults=self.env.settings,
@@ -463,20 +455,19 @@ class StandaloneHTMLBuilder(Builder):
)
self.handle_file('search.rst', searchcontext, 'search')
if not self.options.nostyle:
self.msg('copying style files...')
# copy style files
styledirname = path.join(path.dirname(__file__), 'style')
ensuredir(path.join(self.outdir, 'style'))
for filename in os.listdir(styledirname):
if not filename.startswith('.'):
shutil.copyfile(path.join(styledirname, filename),
path.join(self.outdir, 'style', filename))
# add pygments style file
f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
if pygments:
f.write(get_stylesheet())
f.close()
# copy style files
self.msg('copying style files...')
styledirname = path.join(path.dirname(__file__), 'style')
ensuredir(path.join(self.outdir, 'style'))
for filename in os.listdir(styledirname):
if not filename.startswith('.'):
shutil.copyfile(path.join(styledirname, filename),
path.join(self.outdir, 'style', filename))
# add pygments style file
f = open(path.join(self.outdir, 'style', 'pygments.css'), 'w')
if pygments:
f.write(get_stylesheet())
f.close()
# dump the search index
self.handle_finish()
@@ -497,6 +488,16 @@ class StandaloneHTMLBuilder(Builder):
if path.getmtime(path.join(self.srcdir, filename)) > targetmtime:
yield filename
def load_indexer(self, filenames):
try:
with open(path.join(self.outdir, 'searchindex.json'), 'r') as f:
self.indexer.load(f, 'json')
except (IOError, OSError):
pass
# delete all entries for files that will be rebuilt
self.indexer.prune(set(self.env.all_files) - set(filenames))
def index_file(self, filename, doctree, title):
# only index pages with title
if self.indexer is not None and title:
@@ -522,11 +523,10 @@ class StandaloneHTMLBuilder(Builder):
path.join(self.outdir, context['sourcename']))
def handle_finish(self):
if self.indexer is not None:
self.msg('dumping search index...')
f = open(path.join(self.outdir, 'searchindex.json'), 'w')
self.msg('dumping search index...')
self.indexer.prune([self.get_target_uri(fn)[:-5] for fn in self.env.all_files])
with open(path.join(self.outdir, 'searchindex.json'), 'w') as f:
self.indexer.dump(f, 'json')
f.close()
class WebHTMLBuilder(StandaloneHTMLBuilder):
@@ -535,13 +535,6 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
"""
name = 'web'
# doesn't use the standalone specific options
option_spec = Builder.option_spec.copy()
option_spec.update({
'nostyle': 'Don\'t copy style and script files',
'nosearchindex': 'Don\'t create a search index for the online search',
})
def init(self):
# Nothing to do here.
pass
@@ -564,6 +557,15 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
return source_filename[:-9] # up to /
return source_filename[:-4] + '/'
def load_indexer(self, filenames):
try:
with open(path.join(self.outdir, 'searchindex.pickle'), 'r') as f:
self.indexer.load(f, 'pickle')
except (IOError, OSError):
pass
# delete all entries for files that will be rebuilt
self.indexer.prune(set(self.env.all_files) - set(filenames))
def index_file(self, filename, doctree, title):
# only index pages with title and category
if self.indexer is not None and title:
@@ -590,11 +592,11 @@ class WebHTMLBuilder(StandaloneHTMLBuilder):
with file(outfilename, 'wb') as fp:
pickle.dump(self.globalcontext, fp, 2)
if self.indexer is not None:
self.msg('dumping search index...')
f = open(path.join(self.outdir, 'searchindex.pickle'), 'w')
self.msg('dumping search index...')
self.indexer.prune(self.env.all_files)
with open(path.join(self.outdir, 'searchindex.pickle'), 'wb') as f:
self.indexer.dump(f, 'pickle')
f.close()
# touch 'last build' file, used by the web application to determine
# when to reload its environment and clear the cache
open(path.join(self.outdir, LAST_BUILD_FILENAME), 'w').close()
@@ -611,10 +613,9 @@ class HTMLHelpBuilder(StandaloneHTMLBuilder):
"""
name = 'htmlhelp'
option_spec = Builder.option_spec.copy()
option_spec.update({
option_spec = {
'outname': 'Output file base name (default "pydoc")'
})
}
# don't copy the reST source
copysource = False
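
Taken together, the builder hooks above give the HTML builders an incremental search index: prepare_writing() now receives the filenames that are about to be (re)written, load_indexer() pulls in the index dumped by the previous build and drops the entries for exactly those files, index_file() re-feeds them while they are written, and handle_finish() prunes once more against the current set of source files before dumping. A rough sketch of that flow as a single function (illustrative only, not part of the commit; the function name and the to_rebuild/docs parameters are invented, IndexBuilder is the class from the search module changed below):

from os import path

def rebuild_search_index(indexer, env, outdir, to_rebuild, docs):
    # load_indexer(): reuse the index dumped by the previous build, if any.
    try:
        f = open(path.join(outdir, 'searchindex.json'), 'r')
        try:
            indexer.load(f, 'json')
        finally:
            f.close()
    except (IOError, OSError):
        pass  # first build or unreadable index: start from an empty one
    # Delete all entries for files that will be rebuilt; prune() keeps
    # only the filenames it is given.
    indexer.prune(set(env.all_files) - set(to_rebuild))
    # index_file(): re-feed the rebuilt files as they are written.
    for filename, (category, title, doctree) in docs.iteritems():
        indexer.feed(filename, category, title, doctree)
    # handle_finish(): drop entries for deleted sources and dump everything
    # (the standalone builder prunes on target URIs here, the web builder
    # on filenames, as shown in the hunks above).
    indexer.prune(env.all_files)
    f = open(path.join(outdir, 'searchindex.json'), 'w')
    try:
        indexer.dump(f, 'json')
    finally:
        f.close()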

View File

@@ -14,7 +14,7 @@ import pickle
from collections import defaultdict
from docutils.nodes import Text, NodeVisitor
from .util.stemmer import PorterStemmer
from .util.json import dump_json
from .util.json import dump_json, load_json
word_re = re.compile(r'\w+(?u)')
@@ -50,47 +50,71 @@ class IndexBuilder(object):
passed to the `feed` method.
"""
formats = {
'json': dump_json,
'pickle': pickle.dumps
'json': (dump_json, load_json),
'pickle': (pickle.dumps, pickle.loads),
}
def __init__(self):
self._filenames = {}
self._mapping = {}
self._titles = {}
self._categories = {}
self._stemmer = Stemmer()
# filename -> title
self._titles = {}
# stemmed word -> set(filenames)
self._mapping = {}
# category -> set(filenames)
self._categories = {}
def load(self, stream, format):
"""Reconstruct from frozen data."""
frozen = self.formats[format][1](stream.read())
index2fn = frozen[0]
self._titles = dict(zip(frozen[0], frozen[2]))
self._categories = dict((k, set(index2fn[i] for i in v))
for (k, v) in frozen[1].iteritems())
self._mapping = dict((k, set(index2fn[i] for i in v))
for (k, v) in frozen[3].iteritems())
def dump(self, stream, format):
"""Dump the freezed index to a stream."""
stream.write(self.formats[format](self.freeze()))
"""Dump the frozen index to a stream."""
stream.write(self.formats[format][0](self.freeze()))
def freeze(self):
"""
Create a useable data structure. You can pass this output
to the `SearchFrontend` to search the index.
"""
fns, titles = self._titles.keys(), self._titles.values()
fn2index = dict((f, i) for (i, f) in enumerate(fns))
return [
[k for k, v in sorted(self._filenames.items(),
key=lambda x: x[1])],
dict(item for item in sorted(self._categories.items(),
key=lambda x: x[0])),
[v for k, v in sorted(self._titles.items(),
key=lambda x: x[0])],
dict(item for item in sorted(self._mapping.items(),
key=lambda x: x[0])),
fns,
dict((k, [fn2index[fn] for fn in v])
for (k, v) in self._categories.iteritems()),
titles,
dict((k, [fn2index[fn] for fn in v])
for (k, v) in self._mapping.iteritems()),
]
def prune(self, filenames):
"""Remove data for all filenames not in the list."""
new_titles = {}
for filename in filenames:
if filename in self._titles:
new_titles[filename] = self._titles[filename]
self._titles = new_titles
for wordnames in self._mapping.itervalues():
wordnames.intersection_update(filenames)
for catnames in self._categories.itervalues():
catnames.intersection_update(filenames)
def feed(self, filename, category, title, doctree):
"""Feed a doctree to the index."""
file_id = self._filenames.setdefault(filename, len(self._filenames))
self._titles[file_id] = title
self._titles[filename] = title
self._categories.setdefault(category, set()).add(filename)
visitor = WordCollector(doctree)
doctree.walk(visitor)
self._categories.setdefault(category, set()).add(file_id)
for word in word_re.findall(title) + visitor.found_words:
self._mapping.setdefault(self._stemmer.stem(word.lower()),
set()).add(file_id)
set()).add(filename)
class SearchFrontend(object):
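
The frozen form produced by freeze(), consumed both by load() above and by the JavaScript search frontend, is a four-element list: the filenames, a category-to-file-index mapping, the titles (parallel to the filenames), and a stemmed-word-to-file-index mapping. For two made-up files it looks roughly like this (data invented for illustration; the stemmed keys are not meant to be exact Porter stemmer output):

frozen = [
    ['tutorial/introduction', 'library/itertools'],     # filenames (index 0, 1)
    {'tutorial': [0], 'library': [1]},                   # category -> file indices
    ['An Informal Introduction', 'Iterator Functions'],  # titles, parallel to the filenames
    {'loop': [0, 1], 'python': [0]},                     # stemmed word -> file indices
]
# load() turns the index lists back into filename-keyed sets, e.g.
# self._mapping['loop'] == set(['tutorial/introduction', 'library/itertools'])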

View File

@@ -424,5 +424,5 @@ var Search = {
}
$(document).ready(function() {
Documentation.Search.init();
Search.init();
});

View File

@@ -1,6 +1,6 @@
{% extends "layout.html" %}
{% set title = 'Search Documentation' %}
{% block header %}
{% block head %}
<script type="text/javascript" src="{{ pathto('style/searchtools.js', 1) }}"></script>
{% endblock %}
{% block body %}
@@ -26,13 +26,13 @@
('tutorial', 'Python Tutorial', true),
('library', 'Library Reference', true),
('maclib', 'Macintosh Library Modules', false),
('reference', 'Language Reference', false),
('extending', 'Extending and Embedding', false),
('c-api', 'Python/C API', false),
('install', 'Installing Python Modules', true),
('distutils', 'Distributing Python Modules', true),
('documenting', 'Documenting Python', false),
('whatsnew', 'What\'s new in Python?', false),
('reference', 'Language Reference', false)
] -%}
<li><input type="checkbox" name="area" id="area-{{ id }}" value="{{ id
}}"{% if checked %} checked{% endif %}>

View File

@@ -16,7 +16,7 @@
import re
ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]')
# escape \, ", control characters and everything outside ASCII
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
ESCAPE_DICT = {
'\\': '\\\\',
@@ -27,8 +27,6 @@ ESCAPE_DICT = {
'\r': '\\r',
'\t': '\\t',
}
for i in range(0x20):
ESCAPE_DICT.setdefault(chr(i), '\\u%04x' % (i,))
def encode_basestring_ascii(s):
@@ -70,3 +68,11 @@ def dump_json(obj, key=False):
elif isinstance(obj, basestring):
return encode_basestring_ascii(obj)
raise TypeError(type(obj))
STRING = re.compile(r'("(\\\\|\\"|[^"])*")')
def load_json(s):
d = {'null': None, 'true': True, 'false': False}
s = STRING.sub(r'u\1', s)
return eval(s, d)
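
load_json only needs to read back what dump_json writes for the search index: it prefixes every string literal with u, so eval() yields unicode strings, and evaluates the result with null, true and false bound to their Python equivalents. A round-trip example (illustrative; it assumes the parts of dump_json elided above handle dicts, lists and ints, which is what the frozen search index consists of):

frozen_str = dump_json({'titles': ['Library Reference'], 'loop': [0, 1]})
# frozen_str is a plain JSON string such as '{"titles": ["Library Reference"], "loop": [0, 1]}'
data = load_json(frozen_str)
# data == {u'titles': [u'Library Reference'], u'loop': [0, 1]}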