Re-structure index entry processing (#11505)

Adam Turner, 2023-07-24 03:17:29 +01:00 (committed by GitHub)
parent 480630c649
commit 24b4d65a02
7 changed files with 160 additions and 180 deletions

@@ -156,8 +156,8 @@ class I18nBuilder(Builder):
if 'index' in self.env.config.gettext_additional_targets:
# Extract translatable messages from index entries.
for node, entries in traverse_translatable_index(doctree):
for typ, msg, _tid, _main, _key in entries:
for m in split_index_msg(typ, msg):
for entry_type, value, _target_id, _main, _category_key in entries:
for m in split_index_msg(entry_type, value):
catalog.add(m, node)
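
For reference, index nodes carry 5-tuples of (entry_type, value, target_id, main, category_key); a small sketch of what the renamed loop above extracts for the message catalog, assuming split_index_msg is importable from sphinx.util as elsewhere in this commit (entry values are illustrative):

from sphinx.util import split_index_msg

entries = [('single', 'installation', 'index-0', '', None),
           ('pair', 'module; search path', 'index-1', '', None)]
for entry_type, value, _target_id, _main, _category_key in entries:
    for message in split_index_msg(entry_type, value):
        print(message)  # installation, module, search path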

@@ -29,7 +29,7 @@ class IndexDomain(Domain):
label = 'index'
@property
def entries(self) -> dict[str, list[tuple[str, str, str, str, str]]]:
def entries(self) -> dict[str, list[tuple[str, str, str, str, str | None]]]:
return self.data.setdefault('entries', {})
def clear_doc(self, docname: str) -> None:
@@ -44,8 +44,8 @@ class IndexDomain(Domain):
entries = self.entries.setdefault(env.docname, [])
for node in list(document.findall(addnodes.index)):
try:
for entry in node['entries']:
split_index_msg(entry[0], entry[1])
for (entry_type, value, _target_id, _main, _category_key) in node['entries']:
split_index_msg(entry_type, value)
except ValueError as exc:
logger.warning(str(exc), location=node)
node.parent.remove(node)
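
Here split_index_msg is called purely for validation: a malformed entry raises ValueError, the warning is logged, and the whole index node is dropped. A tiny sketch of that failure path (the entry value is made up):

from sphinx.util import split_index_msg

try:
    split_index_msg('pair', 'only one part')  # 'pair' needs two ';'-separated parts
except ValueError as exc:
    print(exc)  # invalid pair index entry 'only one part'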

@@ -5,10 +5,9 @@ from __future__ import annotations
import re
import unicodedata
from itertools import groupby
from typing import Any, cast
from typing import Any, Literal
from sphinx.builders import Builder
from sphinx.domains.index import IndexDomain
from sphinx.environment import BuildEnvironment
from sphinx.errors import NoUri
from sphinx.locale import _, __
@@ -20,6 +19,7 @@ logger = logging.getLogger(__name__)
class IndexEntries:
def __init__(self, env: BuildEnvironment) -> None:
self.env = env
self.builder: Builder
def create_index(self, builder: Builder, group_entries: bool = True,
_fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'),
@@ -27,89 +27,60 @@ class IndexEntries:
"""Create the real index from the collected index entries."""
new: dict[str, list] = {}
def add_entry(word: str, subword: str, main: str | None, link: bool = True,
dic: dict[str, list] = new, key: str | None = None) -> None:
# Force the word to be unicode if it's a ASCII bytestring.
# This will solve problems with unicode normalization later.
# For instance the RFC role will add bytestrings at the moment
word = str(word)
entry = dic.get(word)
if not entry:
dic[word] = entry = [[], {}, key]
if subword:
add_entry(subword, '', main, link=link, dic=entry[1], key=key)
elif link:
rel_uri: str | Literal[False]
index_domain = self.env.domains['index']
for docname, entries in index_domain.entries.items():
try:
uri = builder.get_relative_uri('genindex', fn) + '#' + tid
rel_uri = builder.get_relative_uri('genindex', docname)
except NoUri:
pass
else:
entry[0].append((main, uri))
rel_uri = False
domain = cast(IndexDomain, self.env.get_domain('index'))
for fn, entries in domain.entries.items():
# new entry types must be listed in directives/other.py!
for type, value, tid, main, index_key in entries: # noqa: B007
for entry_type, value, target_id, main, category_key in entries:
uri = rel_uri is not False and f'{rel_uri}#{target_id}'
try:
if type == 'single':
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
entry, sub_entry = split_into(2, 'single', value)
except ValueError:
entry, = split_into(1, 'single', value)
subentry = ''
add_entry(entry, subentry, main, key=index_key)
elif type == 'pair':
sub_entry = ''
_add_entry(entry, sub_entry, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'pair':
first, second = split_into(2, 'pair', value)
add_entry(first, second, main, key=index_key)
add_entry(second, first, main, key=index_key)
elif type == 'triple':
_add_entry(first, second, main,
dic=new, link=uri, key=category_key)
_add_entry(second, first, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'triple':
first, second, third = split_into(3, 'triple', value)
add_entry(first, second + ' ' + third, main, key=index_key)
add_entry(second, third + ', ' + first, main, key=index_key)
add_entry(third, first + ' ' + second, main, key=index_key)
elif type == 'see':
_add_entry(first, second + ' ' + third, main,
dic=new, link=uri, key=category_key)
_add_entry(second, third + ', ' + first, main,
dic=new, link=uri, key=category_key)
_add_entry(third, first + ' ' + second, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'see':
first, second = split_into(2, 'see', value)
add_entry(first, _('see %s') % second, None,
link=False, key=index_key)
elif type == 'seealso':
_add_entry(first, _('see %s') % second, None,
dic=new, link=False, key=category_key)
elif entry_type == 'seealso':
first, second = split_into(2, 'see', value)
add_entry(first, _('see also %s') % second, None,
link=False, key=index_key)
_add_entry(first, _('see also %s') % second, None,
dic=new, link=False, key=category_key)
else:
logger.warning(__('unknown index entry type %r'), type, location=fn)
logger.warning(__('unknown index entry type %r'), entry_type,
location=docname)
except ValueError as err:
logger.warning(str(err), location=fn)
logger.warning(str(err), location=docname)
# sort the index entries for same keyword.
def keyfunc0(entry: tuple[str, str]) -> tuple[bool, str]:
main, uri = entry
return (not main, uri) # show main entries at first
for (targets, sub_items, _category_key) in new.values():
targets.sort(key=_key_func_0)
for (sub_targets, _0, _sub_category_key) in sub_items.values():
sub_targets.sort(key=_key_func_0)
for indexentry in new.values():
indexentry[0].sort(key=keyfunc0)
for subentry in indexentry[1].values():
subentry[0].sort(key=keyfunc0) # type: ignore
# sort the index entries
def keyfunc(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
key, (void, void, category_key) = entry
if category_key:
# using specified category key to sort
key = category_key
lckey = unicodedata.normalize('NFD', key.lower())
if lckey.startswith('\N{RIGHT-TO-LEFT MARK}'):
lckey = lckey[1:]
if lckey[0:1].isalpha() or lckey.startswith('_'):
# put non-symbol characters at the following group (1)
sortkey = (1, lckey)
else:
# put symbols at the front of the index (0)
sortkey = (0, lckey)
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (sortkey, entry[0])
newlist = sorted(new.items(), key=keyfunc)
new_list = sorted(new.items(), key=_key_func_1)
if group_entries:
# fixup entries: transform
@@ -119,30 +90,70 @@ class IndexEntries:
# func()
# (in module foo)
# (in module bar)
oldkey = ''
oldsubitems: dict[str, list] = {}
old_key = ''
old_sub_items: dict[str, list] = {}
i = 0
while i < len(newlist):
key, (targets, subitems, _key) = newlist[i]
# cannot move if it has subitems; structure gets too complex
if not subitems:
while i < len(new_list):
key, (targets, sub_items, category_key) = new_list[i]
# cannot move if it has sub_items; structure gets too complex
if not sub_items:
m = _fixre.match(key)
if m:
if oldkey == m.group(1):
if old_key == m.group(1):
# prefixes match: add entry as subitem of the
# previous entry
oldsubitems.setdefault(m.group(2), [[], {}, _key])[0].\
extend(targets)
del newlist[i]
old_sub_items.setdefault(
m.group(2), [[], {}, category_key])[0].extend(targets)
del new_list[i]
continue
oldkey = m.group(1)
old_key = m.group(1)
else:
oldkey = key
oldsubitems = subitems
old_key = key
old_sub_items = sub_items
i += 1
# sort the sub-index entries
def keyfunc2(entry: tuple[str, list]) -> str:
return [(key_, list(group))
for (key_, group) in groupby(new_list, _key_func_3)]
def _add_entry(word: str, subword: str, main: str | None, *,
dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
entry = dic.setdefault(word, [[], {}, key])
if subword:
entry = entry[1].setdefault(subword, [[], {}, key])
if link:
entry[0].append((main, link))
def _key_func_0(entry: tuple[str, str]) -> tuple[bool, str]:
"""sort the index entries for same keyword."""
main, uri = entry
return not main, uri # show main entries at first
def _key_func_1(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
"""Sort the index entries"""
key, (_targets, _sub_items, category_key) = entry
if category_key:
# using the specified category key to sort
key = category_key
lc_key = unicodedata.normalize('NFD', key.lower())
if lc_key.startswith('\N{RIGHT-TO-LEFT MARK}'):
lc_key = lc_key[1:]
if not lc_key[0:1].isalpha() and not lc_key.startswith('_'):
# put symbols at the front of the index (0)
group = 0
else:
# put non-symbol characters at the following group (1)
group = 1
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (group, lc_key), entry[0]
def _key_func_2(entry: tuple[str, list]) -> str:
"""sort the sub-index entries"""
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
@@ -150,23 +161,24 @@ class IndexEntries:
key = chr(127) + key
return key
# group the entries by letter
def keyfunc3(item: tuple[str, list]) -> str:
# hack: mutating the subitems dicts to a list in the keyfunc
k, v = item
v[1] = sorted(((si, se) for (si, (se, void, void)) in v[1].items()),
key=keyfunc2)
if v[2] is None:
def _key_func_3(entry: tuple[str, list]) -> str:
"""Group the entries by letter"""
key, (targets, sub_items, category_key) = entry
# hack: mutating the sub_items dicts to a list in the key_func
entry[1][1] = sorted(((sub_key, sub_targets)
for (sub_key, (sub_targets, _0, _sub_category_key))
in sub_items.items()), key=_key_func_2)
if category_key is not None:
return category_key
# now calculate the key
if k.startswith('\N{RIGHT-TO-LEFT MARK}'):
k = k[1:]
letter = unicodedata.normalize('NFD', k[0])[0].upper()
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
letter = unicodedata.normalize('NFD', key[0])[0].upper()
if letter.isalpha() or letter == '_':
return letter
else:
# get all other symbols under one heading
return _('Symbols')
else:
return v[2]
return [(key_, list(group))
for (key_, group) in groupby(newlist, keyfunc3)]
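
To make the intermediate data structure concrete: create_index() now populates a plain dict through the module-level _add_entry helper above. A standalone sketch, with the helper copied verbatim from this diff and illustrative words and URIs:

from __future__ import annotations

from typing import Literal


def _add_entry(word: str, subword: str, main: str | None, *,
               dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
    entry = dic.setdefault(word, [[], {}, key])
    if subword:
        entry = entry[1].setdefault(subword, [[], {}, key])
    if link:
        entry[0].append((main, link))


new: dict[str, list] = {}
_add_entry('Python', 'interpreter', '', dic=new, link='usage.html#index-0', key=None)
_add_entry('Python', '', 'main', dic=new, link='intro.html#index-1', key=None)

# Each value is [targets, sub_entries, category_key]:
# {'Python': [[('main', 'intro.html#index-1')],
#             {'interpreter': [[('', 'usage.html#index-0')], {}, None]},
#             None]}

From there, _key_func_0 puts main targets first, _key_func_1 sorts symbol keywords (group 0) ahead of alphabetic ones (group 1), and _key_func_3 groups entries under their first letter unless a category_key supplies the heading.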

@@ -31,7 +31,7 @@ from docutils.nodes import Element, Node
from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment
from sphinx.util import split_into
from sphinx.util import split_index_msg
class SearchLanguage:
@@ -478,8 +478,17 @@ class IndexBuilder:
# find explicit entries within index directives
_index_entries: set[tuple[str, str, str]] = set()
for node in doctree.findall(addnodes.index):
for entry_type, value, target_id, main, *index_key in node['entries']:
_index_entries |= _parse_index_entry(entry_type, value, target_id, main)
for entry_type, value, target_id, main, _category_key in node['entries']:
try:
result = split_index_msg(entry_type, value)
except ValueError:
pass
else:
target_id = target_id or ''
if entry_type in {'see', 'seealso'}:
_index_entries.add((result[0], target_id, main))
_index_entries |= {(x, target_id, main) for x in result}
self._index_entries[docname] = sorted(_index_entries)
def _word_collector(self, doctree: nodes.document) -> WordStore:
@@ -557,41 +566,3 @@ class IndexBuilder:
(base_js, language_js, self.lang.language_name))
else:
return self.lang.js_stemmer_code
def _parse_index_entry(
entry_type: str,
value: str,
target_id: str,
main: str
) -> set[tuple[str, str, str]]:
target_id = target_id or ''
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
if subentry:
return {(entry, target_id, main), (subentry, target_id, main)}
except ValueError:
entry, = split_into(1, 'single', value)
return {(entry, target_id, main)}
elif entry_type == 'pair':
try:
first, second = split_into(2, 'pair', value)
return {(first, target_id, main), (second, target_id, main)}
except ValueError:
pass
elif entry_type == 'triple':
try:
first, second, third = split_into(3, 'triple', value)
return {(first, target_id, main),
(second, target_id, main),
(third, target_id, main)}
except ValueError:
pass
elif entry_type in {'see', 'seealso'}:
try:
first, second = split_into(2, 'see', value)
return {(first, target_id, main)}
except ValueError:
pass
return set()
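
With the removed _parse_index_entry helper gone, the collection loop above leans on split_index_msg directly; as before, 'see'/'seealso' entries contribute only the referring term (result[0]) to the search index. A quick sketch of the values involved (entry strings are illustrative):

from sphinx.util import split_index_msg

# A 'triple' entry contributes every part to the search index ...
print(split_index_msg('triple', 'module; search; path'))  # ['module', 'search', 'path']
# ... while 'see'/'seealso' entries contribute only the referring term.
print(split_index_msg('see', 'argument; parameter')[0])   # argument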

@@ -498,8 +498,8 @@ class Locale(SphinxTransform):
# Extract and translate messages for index entries.
for node, entries in traverse_translatable_index(self.document):
new_entries: list[tuple[str, str, str, str, str | None]] = []
for type, msg, tid, main, _key in entries:
msg_parts = split_index_msg(type, msg)
for entry_type, value, target_id, main, _category_key in entries:
msg_parts = split_index_msg(entry_type, value)
msgstr_parts = []
for part in msg_parts:
msgstr = catalog.gettext(part)
@@ -507,7 +507,8 @@ class Locale(SphinxTransform):
msgstr = part
msgstr_parts.append(msgstr)
new_entries.append((type, ';'.join(msgstr_parts), tid, main, None))
new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
new_entries.append(new_entry)
node['raw_entries'] = entries
node['entries'] = new_entries
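
A sketch of the translation step above, with a plain dict standing in for catalog.gettext and made-up message strings:

from sphinx.util import split_index_msg

catalog = {'module': 'Modul', 'search path': 'Suchpfad'}  # stand-in for the real gettext catalog

entry_type, value, target_id, main = 'pair', 'module; search path', 'index-0', ''
msgstr_parts = [catalog.get(part, part) for part in split_index_msg(entry_type, value)]
new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
print(new_entry)  # ('pair', 'Modul;Suchpfad', 'index-0', '', None)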

@@ -247,28 +247,25 @@ def parselinenos(spec: str, total: int) -> list[int]:
def split_into(n: int, type: str, value: str) -> list[str]:
"""Split an index entry into a given number of parts at semicolons."""
parts = [x.strip() for x in value.split(';', n - 1)]
if sum(1 for part in parts if part) < n:
if len(list(filter(None, parts))) < n:
raise ValueError(f'invalid {type} index entry {value!r}')
return parts
def split_index_msg(type: str, value: str) -> list[str]:
# new entry types must be listed in directives/other.py!
if type == 'single':
def split_index_msg(entry_type: str, value: str) -> list[str]:
# new entry types must be listed in util/nodes.py!
if entry_type == 'single':
try:
result = split_into(2, 'single', value)
return split_into(2, 'single', value)
except ValueError:
result = split_into(1, 'single', value)
elif type == 'pair':
result = split_into(2, 'pair', value)
elif type == 'triple':
result = split_into(3, 'triple', value)
elif type in {'see', 'seealso'}:
result = split_into(2, 'see', value)
else:
raise ValueError(f'invalid {type} index entry {value!r}')
return result
return split_into(1, 'single', value)
if entry_type == 'pair':
return split_into(2, 'pair', value)
if entry_type == 'triple':
return split_into(3, 'triple', value)
if entry_type in {'see', 'seealso'}:
return split_into(2, 'see', value)
raise ValueError(f'invalid {entry_type} index entry {value!r}')
def import_object(objname: str, source: str | None = None) -> Any:
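
Behaviour is unchanged by the early-return rewrite; a few illustrative calls (entry strings are made up):

from sphinx.util import split_index_msg  # location as of this commit

print(split_index_msg('single', 'execution; frame'))  # ['execution', 'frame']
print(split_index_msg('single', 'interpreter'))       # ['interpreter']
print(split_index_msg('pair', 'loop; statement'))     # ['loop', 'statement']
print(split_index_msg('seealso', 'dict; mapping'))    # ['dict', 'mapping']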

@@ -1326,9 +1326,8 @@ class TexinfoTranslator(SphinxTranslator):
self.ensure_eol()
else:
self.body.append('\n')
for entry in node['entries']:
typ, text, tid, text2, key_ = entry
text = self.escape_menu(text)
for (_entry_type, value, _target_id, _main, _category_key) in node['entries']:
text = self.escape_menu(value)
self.body.append('@geindex %s\n' % text)
def visit_versionmodified(self, node: Element) -> None:
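
Only the entry value (the tuple's second element) matters to the Texinfo writer; a rough sketch of what the loop emits, with the escape_menu escaping elided and made-up entry values:

entries = [('single', 'interpreter', 'index-0', '', None),
           ('pair', 'module; search path', 'index-1', '', None)]
body = []
for (_entry_type, value, _target_id, _main, _category_key) in entries:
    body.append('@geindex %s\n' % value)  # the real writer first escapes via self.escape_menu(value)
print(''.join(body), end='')
# @geindex interpreter
# @geindex module; search path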