Rewrite IndexBuilder loading

Adam Turner 2023-01-02 15:02:25 +00:00
parent 4c44f22009
commit 087522cf79
3 changed files with 159 additions and 65 deletions

View File

@@ -228,6 +228,25 @@ class BuildEnvironment:
# attributes of "any" cross references
self.ref_context: dict[str, Any] = {}
# search index data
# docname -> title
self._search_index_titles: dict[str, str] = {}
# docname -> filename
self._search_index_filenames: dict[str, str] = {}
# stemmed words -> set(docname)
self._search_index_mapping: dict[str, set[str]] = {}
# stemmed words in titles -> set(docname)
self._search_index_title_mapping: dict[str, set[str]] = {}
# docname -> all titles in document
self._search_index_all_titles: dict[str, list[tuple[str, str]]] = {}
# docname -> list(index entry)
self._search_index_index_entries: dict[str, list[tuple[str, str, str]]] = {}
# objtype -> index
self._search_index_objtypes: dict[tuple[str, str], int] = {}
# objtype index -> (domain, type, objname (localized))
self._search_index_objnames: dict[int, tuple[str, str, str]] = {}
# set up environment
self.setup(app)
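
These environment-level dicts exist so that IndexBuilder (changed below) can bind to them directly instead of keeping private copies; the search-index state then survives whenever the environment is pickled between incremental builds. A minimal sketch of that sharing, using hypothetical stand-in classes rather than Sphinx's real ones:

    import pickle

    class Env:
        """Stands in for BuildEnvironment."""
        def __init__(self) -> None:
            self._search_index_mapping: dict[str, set[str]] = {}

    class Builder:
        """Stands in for IndexBuilder: it aliases the env's dict, not a copy."""
        def __init__(self, env: Env) -> None:
            self._mapping = env._search_index_mapping

    env = Env()
    Builder(env)._mapping.setdefault('word', set()).add('docname')
    # Pickling the environment carries the builder's accumulated state with it.
    restored = pickle.loads(pickle.dumps(env))
    assert restored._search_index_mapping == {'word': {'docname'}}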

View File

@@ -1,6 +1,8 @@
"""Create a full-text search index for offline search."""
from __future__ import annotations
import dataclasses
import functools
import html
import json
import pickle
@@ -66,7 +68,7 @@ var Stemmer = function() {
}
"""
_word_re = re.compile(r'(?u)\w+')
_word_re = re.compile(r'\w+')
def __init__(self, options: dict) -> None:
self.options = options
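
The regex change in this hunk drops the `(?u)` flag: in Python 3, `\w` in a str pattern is already Unicode-aware, so the two spellings compile to identical behaviour:

    import re

    assert re.findall(r'\w+', 'héllo wörld') == ['héllo', 'wörld']
    assert re.findall(r'(?u)\w+', 'héllo wörld') == ['héllo', 'wörld']  # same matches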
@@ -179,6 +181,27 @@ class _JavaScriptIndex:
js_index = _JavaScriptIndex()
def _is_meta_keywords(
node: nodes.meta, # type: ignore[name-defined]
lang: str | None,
) -> bool:
if node.get('name') == 'keywords':
meta_lang = node.get('lang')
if meta_lang is None: # lang not specified
return True
elif meta_lang == lang: # matched to html_search_language
return True
return False
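
Since the new module-level helper only calls `.get()` on the node, its rule can be shown with a plain mapping standing in for a docutils meta node (a sketch; Sphinx passes real nodes.meta instances):

    assert _is_meta_keywords({'name': 'keywords'}, 'en')                # no lang: accept
    assert _is_meta_keywords({'name': 'keywords', 'lang': 'en'}, 'en')  # lang matches: accept
    assert not _is_meta_keywords({'name': 'keywords', 'lang': 'de'}, 'en')
    assert not _is_meta_keywords({'name': 'description'}, 'en')         # wrong meta name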
@dataclasses.dataclass
class WordStore:
words: list[str] = dataclasses.field(default_factory=list)
titles: list[tuple[str, str]] = dataclasses.field(default_factory=list)
title_words: list[str] = dataclasses.field(default_factory=list)
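
WordStore is a plain data container for everything `_word_collector` gathers from one document; a populated instance might look like this (illustrative values only):

    store = WordStore()
    store.titles.append(('Installation', 'installation'))  # (title, anchor id)
    store.title_words.append('Installation')
    store.words.extend(['pip', 'install', 'sphinx'])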
class WordCollector(nodes.NodeVisitor):
"""
A special visitor that collects words for the `IndexBuilder`.
@@ -191,17 +214,6 @@ class WordCollector(nodes.NodeVisitor):
self.found_title_words: list[str] = []
self.lang = lang
def is_meta_keywords(self, node: Element) -> bool:
if (isinstance(node, nodes.meta) # type: ignore
and node.get('name') == 'keywords'):
meta_lang = node.get('lang')
if meta_lang is None: # lang not specified
return True
elif meta_lang == self.lang.lang: # matched to html_search_language
return True
return False
def dispatch_visit(self, node: Node) -> None:
if isinstance(node, nodes.comment):
raise nodes.SkipNode
@@ -222,7 +234,7 @@ class WordCollector(nodes.NodeVisitor):
ids = node.parent['ids']
self.found_titles.append((title, ids[0] if ids else None))
self.found_title_words.extend(self.lang.split(title))
elif isinstance(node, Element) and self.is_meta_keywords(node):
elif isinstance(node, Element) and _is_meta_keywords(node, self.lang.lang):
keywords = node['content']
keywords = [keyword.strip() for keyword in keywords.split(',')]
self.found_words.extend(keywords)
@@ -240,17 +252,22 @@ class IndexBuilder:
def __init__(self, env: BuildEnvironment, lang: str, options: dict, scoring: str) -> None:
self.env = env
self._titles: dict[str, str] = {} # docname -> title
self._filenames: dict[str, str] = {} # docname -> filename
self._mapping: dict[str, set[str]] = {} # stemmed word -> set(docname)
# docname -> title
self._titles: dict[str, str] = env._search_index_titles
# docname -> filename
self._filenames: dict[str, str] = env._search_index_filenames
# stemmed words -> set(docname)
self._mapping: dict[str, set[str]] = env._search_index_mapping
# stemmed words in titles -> set(docname)
self._title_mapping: dict[str, set[str]] = {}
self._all_titles: dict[str, list[tuple[str, str]]] = {} # docname -> all titles
self._index_entries: dict[str, list[tuple[str, str, str]]] = {} # docname -> index entry
self._stem_cache: dict[str, str] = {} # word -> stemmed word
self._objtypes: dict[tuple[str, str], int] = {} # objtype -> index
self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
# docname -> all titles in document
self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles
# docname -> list(index entry)
self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
# objtype -> index
self._objtypes: dict[tuple[str, str], int] = env._search_index_objtypes
# objtype index -> (domain, type, objname (localized))
self._objnames: dict[int, tuple[str, str, str]] = {}
self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
# add language-specific SearchLanguage instance
lang_class = languages.get(lang)
@@ -423,68 +440,81 @@ class IndexBuilder:
self._titles[docname] = title
self._filenames[docname] = filename
visitor = WordCollector(doctree, self.lang)
doctree.walk(visitor)
word_store = self._word_collector(doctree)
# memoize self.lang.stem
def stem(word: str) -> str:
try:
return self._stem_cache[word]
except KeyError:
self._stem_cache[word] = self.lang.stem(word).lower()
return self._stem_cache[word]
_filter = self.lang.word_filter
_stem = self.lang.stem
self._all_titles[docname] = visitor.found_titles
# memoise self.lang.stem
@functools.lru_cache(maxsize=None)
def stem(word_to_stem: str) -> str:
return _stem(word_to_stem).lower()
for word in visitor.found_title_words:
self._all_titles[docname] = word_store.titles
for word in word_store.title_words:
# add stemmed and unstemmed as the stemmer must not remove words
# from the search index.
stemmed_word = stem(word)
if _filter(stemmed_word):
self._title_mapping.setdefault(stemmed_word, set()).add(docname)
elif _filter(word): # stemmer must not remove words from search index
elif _filter(word):
self._title_mapping.setdefault(word, set()).add(docname)
for word in visitor.found_words:
for word in word_store.words:
# add stemmed and unstemmed as the stemmer must not remove words
# from the search index.
stemmed_word = stem(word)
# again, stemmer must not remove words from search index
if not _filter(stemmed_word) and _filter(word):
stemmed_word = word
already_indexed = docname in self._title_mapping.get(stemmed_word, set())
already_indexed = docname in self._title_mapping.get(stemmed_word, ())
if _filter(stemmed_word) and not already_indexed:
self._mapping.setdefault(stemmed_word, set()).add(docname)
# find explicit entries within index directives
_index_entries: set[tuple[str, str, str]] = set()
for node in doctree.findall(addnodes.index):
for entry_type, value, tid, main, *index_key in node['entries']:
tid = tid or ''
try:
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
except ValueError:
entry, = split_into(1, 'single', value)
subentry = ''
_index_entries.add((entry, tid, main))
if subentry:
_index_entries.add((subentry, tid, main))
elif entry_type == 'pair':
first, second = split_into(2, 'pair', value)
_index_entries.add((first, tid, main))
_index_entries.add((second, tid, main))
elif entry_type == 'triple':
first, second, third = split_into(3, 'triple', value)
_index_entries.add((first, tid, main))
_index_entries.add((second, tid, main))
_index_entries.add((third, tid, main))
elif entry_type in {'see', 'seealso'}:
first, second = split_into(2, 'see', value)
_index_entries.add((first, tid, main))
except ValueError:
pass
for entry_type, value, target_id, main, *index_key in node['entries']:
_index_entries |= _parse_index_entry(entry_type, value, target_id, main)
self._index_entries[docname] = sorted(_index_entries)
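
The two loops above apply the same rule: index the stemmed form when it passes the language's word filter, fall back to the unstemmed word when stemming would filter out an otherwise indexable word, and skip body words already indexed via a title. A condensed sketch with a toy stemmer and filter standing in for `self.lang`:

    mapping: dict[str, set[str]] = {}

    def stem(word: str) -> str:          # toy stemmer
        return word.rstrip('s').lower()

    def word_filter(word: str) -> bool:  # toy filter: drop very short words
        return len(word) > 2

    for word in ['cats', 'AIs', 'IT']:
        stemmed = stem(word)
        if not word_filter(stemmed) and word_filter(word):
            stemmed = word  # stemming must never remove an indexable word
        if word_filter(stemmed):
            mapping.setdefault(stemmed, set()).add('doc')

    # 'cats' is indexed under its stem; 'AIs' stems to the filtered-out 'ai',
    # so the unstemmed form is kept; 'IT' fails the filter either way.
    assert mapping == {'cat': {'doc'}, 'AIs': {'doc'}}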
def _word_collector(self, doctree: nodes.document) -> WordStore:
def _visit_nodes(node):
if isinstance(node, nodes.comment):
return
elif isinstance(node, nodes.raw):
if 'html' in node.get('format', '').split():
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'<style.*?</style>', '', node.astext(),
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(r'<script.*?</script>', '', nodetext,
flags=re.IGNORECASE | re.DOTALL)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
word_store.words.extend(split(nodetext))
return
elif (isinstance(node, nodes.meta) # type: ignore[attr-defined]
and _is_meta_keywords(node, language)):
keywords = [keyword.strip() for keyword in node['content'].split(',')]
word_store.words.extend(keywords)
elif isinstance(node, nodes.Text):
word_store.words.extend(split(node.astext()))
elif isinstance(node, nodes.title):
title = node.astext()
ids = node.parent['ids']
word_store.titles.append((title, ids[0] if ids else None))
word_store.title_words.extend(split(title))
for child in node.children:
_visit_nodes(child)
return
word_store = WordStore()
split = self.lang.split
language = self.lang.lang
_visit_nodes(doctree)
return word_store
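
The raw-HTML branch above uses three regex passes to keep searchable text while discarding markup; the same substitutions in isolation:

    import re

    raw = '<style>p {color: red}</style><p>Searchable <b>text</b></p><script>x()</script>'
    text = re.sub(r'<style.*?</style>', '', raw, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<script.*?</script>', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<[^<]+?>', '', text)
    assert text == 'Searchable text'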
def context_for_searchtool(self) -> dict[str, Any]:
if self.lang.js_splitter_code:
js_splitter_code = self.lang.js_splitter_code
@@ -523,3 +553,41 @@ class IndexBuilder:
(base_js, language_js, self.lang.language_name))
else:
return self.lang.js_stemmer_code
def _parse_index_entry(
entry_type: str,
value: str,
target_id: str,
main: str
) -> set[tuple[str, str, str]]:
target_id = target_id or ''
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
if subentry:
return {(entry, target_id, main), (subentry, target_id, main)}
except ValueError:
entry, = split_into(1, 'single', value)
return {(entry, target_id, main)}
elif entry_type == 'pair':
try:
first, second = split_into(2, 'pair', value)
return {(first, target_id, main), (second, target_id, main)}
except ValueError:
pass
elif entry_type == 'triple':
try:
first, second, third = split_into(3, 'triple', value)
return {(first, target_id, main),
(second, target_id, main),
(third, target_id, main)}
except ValueError:
pass
elif entry_type in {'see', 'seealso'}:
try:
first, second = split_into(2, 'see', value)
return {(first, target_id, main)}
except ValueError:
pass
return set()
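
Assuming `split_into` keeps its usual semantics (split the value on semicolons, raise ValueError when too few non-empty parts are found), the helper's behaviour per entry type looks like this:

    assert _parse_index_entry('single', 'spam', 'id1', '') == {('spam', 'id1', '')}
    assert _parse_index_entry('single', 'spam; eggs', 'id1', '') == {
        ('spam', 'id1', ''), ('eggs', 'id1', '')}
    assert _parse_index_entry('pair', 'spam; eggs', 'id2', '') == {
        ('spam', 'id2', ''), ('eggs', 'id2', '')}
    assert _parse_index_entry('see', 'spam; eggs', 'id3', '') == {('spam', 'id3', '')}
    assert _parse_index_entry('pair', 'malformed', 'id4', '') == set()  # swallowed ValueError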

View File

@@ -11,7 +11,12 @@ from docutils.parsers import rst
from sphinx.search import IndexBuilder
DummyEnvironment = namedtuple('DummyEnvironment', ['version', 'domains'])
class DummyEnvironment(namedtuple('DummyEnvironment', ['version', 'domains'])):
def __getattr__(self, name):
if name.startswith('_search_index_'):
setattr(self, name, {})
return getattr(self, name, {})
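
The overridden `__getattr__` lazily creates an empty dict the first time any `_search_index_*` attribute is touched, so the test double satisfies the new IndexBuilder constructor without declaring all eight attributes. The trick in isolation (a minimal stand-in class):

    class Lazy:
        def __getattr__(self, name):
            if name.startswith('_search_index_'):
                setattr(self, name, {})      # created on first access
            return getattr(self, name, {})

    env = Lazy()
    env._search_index_titles['doc'] = 'Title'
    assert env._search_index_titles == {'doc': 'Title'}  # same dict on later access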
class DummyDomain:
@@ -185,6 +190,8 @@ def test_IndexBuilder():
assert index._objnames == {0: ('dummy1', 'objtype1', 'objtype1'),
1: ('dummy2', 'objtype1', 'objtype1')}
env = DummyEnvironment('1.0', {'dummy1': domain1, 'dummy2': domain2})
# dump / load
stream = BytesIO()
index.dump(stream, 'pickle')
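
The excerpt ends mid-test; a plausible continuation of the round-trip check (assumed here, not shown in this hunk) loads the dump into a fresh builder and compares state:

    stream.seek(0)
    index2 = IndexBuilder(env, 'en', {}, None)
    index2.load(stream, 'pickle')
    assert index2._titles == index._titles  # state survives the round trip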