Re-structure index entry processing (#11505)

This commit is contained in:
Adam Turner 2023-07-24 03:17:29 +01:00 committed by GitHub
parent 480630c649
commit 24b4d65a02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 160 additions and 180 deletions

View File

@ -156,8 +156,8 @@ class I18nBuilder(Builder):
if 'index' in self.env.config.gettext_additional_targets: if 'index' in self.env.config.gettext_additional_targets:
# Extract translatable messages from index entries. # Extract translatable messages from index entries.
for node, entries in traverse_translatable_index(doctree): for node, entries in traverse_translatable_index(doctree):
for typ, msg, _tid, _main, _key in entries: for entry_type, value, _target_id, _main, _category_key in entries:
for m in split_index_msg(typ, msg): for m in split_index_msg(entry_type, value):
catalog.add(m, node) catalog.add(m, node)

View File

@ -29,7 +29,7 @@ class IndexDomain(Domain):
label = 'index' label = 'index'
@property @property
def entries(self) -> dict[str, list[tuple[str, str, str, str, str]]]: def entries(self) -> dict[str, list[tuple[str, str, str, str, str | None]]]:
return self.data.setdefault('entries', {}) return self.data.setdefault('entries', {})
def clear_doc(self, docname: str) -> None: def clear_doc(self, docname: str) -> None:
@ -44,8 +44,8 @@ class IndexDomain(Domain):
entries = self.entries.setdefault(env.docname, []) entries = self.entries.setdefault(env.docname, [])
for node in list(document.findall(addnodes.index)): for node in list(document.findall(addnodes.index)):
try: try:
for entry in node['entries']: for (entry_type, value, _target_id, _main, _category_key) in node['entries']:
split_index_msg(entry[0], entry[1]) split_index_msg(entry_type, value)
except ValueError as exc: except ValueError as exc:
logger.warning(str(exc), location=node) logger.warning(str(exc), location=node)
node.parent.remove(node) node.parent.remove(node)

View File

@ -5,10 +5,9 @@ from __future__ import annotations
import re import re
import unicodedata import unicodedata
from itertools import groupby from itertools import groupby
from typing import Any, cast from typing import Any, Literal
from sphinx.builders import Builder from sphinx.builders import Builder
from sphinx.domains.index import IndexDomain
from sphinx.environment import BuildEnvironment from sphinx.environment import BuildEnvironment
from sphinx.errors import NoUri from sphinx.errors import NoUri
from sphinx.locale import _, __ from sphinx.locale import _, __
@ -20,6 +19,7 @@ logger = logging.getLogger(__name__)
class IndexEntries: class IndexEntries:
def __init__(self, env: BuildEnvironment) -> None: def __init__(self, env: BuildEnvironment) -> None:
self.env = env self.env = env
self.builder: Builder
def create_index(self, builder: Builder, group_entries: bool = True, def create_index(self, builder: Builder, group_entries: bool = True,
_fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'), _fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'),
@ -27,89 +27,60 @@ class IndexEntries:
"""Create the real index from the collected index entries.""" """Create the real index from the collected index entries."""
new: dict[str, list] = {} new: dict[str, list] = {}
def add_entry(word: str, subword: str, main: str | None, link: bool = True, rel_uri: str | Literal[False]
dic: dict[str, list] = new, key: str | None = None) -> None: index_domain = self.env.domains['index']
# Force the word to be unicode if it's a ASCII bytestring. for docname, entries in index_domain.entries.items():
# This will solve problems with unicode normalization later. try:
# For instance the RFC role will add bytestrings at the moment rel_uri = builder.get_relative_uri('genindex', docname)
word = str(word) except NoUri:
entry = dic.get(word) rel_uri = False
if not entry:
dic[word] = entry = [[], {}, key]
if subword:
add_entry(subword, '', main, link=link, dic=entry[1], key=key)
elif link:
try:
uri = builder.get_relative_uri('genindex', fn) + '#' + tid
except NoUri:
pass
else:
entry[0].append((main, uri))
domain = cast(IndexDomain, self.env.get_domain('index'))
for fn, entries in domain.entries.items():
# new entry types must be listed in directives/other.py! # new entry types must be listed in directives/other.py!
for type, value, tid, main, index_key in entries: # noqa: B007 for entry_type, value, target_id, main, category_key in entries:
uri = rel_uri is not False and f'{rel_uri}#{target_id}'
try: try:
if type == 'single': if entry_type == 'single':
try: try:
entry, subentry = split_into(2, 'single', value) entry, sub_entry = split_into(2, 'single', value)
except ValueError: except ValueError:
entry, = split_into(1, 'single', value) entry, = split_into(1, 'single', value)
subentry = '' sub_entry = ''
add_entry(entry, subentry, main, key=index_key) _add_entry(entry, sub_entry, main,
elif type == 'pair': dic=new, link=uri, key=category_key)
elif entry_type == 'pair':
first, second = split_into(2, 'pair', value) first, second = split_into(2, 'pair', value)
add_entry(first, second, main, key=index_key) _add_entry(first, second, main,
add_entry(second, first, main, key=index_key) dic=new, link=uri, key=category_key)
elif type == 'triple': _add_entry(second, first, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'triple':
first, second, third = split_into(3, 'triple', value) first, second, third = split_into(3, 'triple', value)
add_entry(first, second + ' ' + third, main, key=index_key) _add_entry(first, second + ' ' + third, main,
add_entry(second, third + ', ' + first, main, key=index_key) dic=new, link=uri, key=category_key)
add_entry(third, first + ' ' + second, main, key=index_key) _add_entry(second, third + ', ' + first, main,
elif type == 'see': dic=new, link=uri, key=category_key)
_add_entry(third, first + ' ' + second, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'see':
first, second = split_into(2, 'see', value) first, second = split_into(2, 'see', value)
add_entry(first, _('see %s') % second, None, _add_entry(first, _('see %s') % second, None,
link=False, key=index_key) dic=new, link=False, key=category_key)
elif type == 'seealso': elif entry_type == 'seealso':
first, second = split_into(2, 'see', value) first, second = split_into(2, 'see', value)
add_entry(first, _('see also %s') % second, None, _add_entry(first, _('see also %s') % second, None,
link=False, key=index_key) dic=new, link=False, key=category_key)
else: else:
logger.warning(__('unknown index entry type %r'), type, location=fn) logger.warning(__('unknown index entry type %r'), entry_type,
location=docname)
except ValueError as err: except ValueError as err:
logger.warning(str(err), location=fn) logger.warning(str(err), location=docname)
# sort the index entries for same keyword. for (targets, sub_items, _category_key) in new.values():
def keyfunc0(entry: tuple[str, str]) -> tuple[bool, str]: targets.sort(key=_key_func_0)
main, uri = entry for (sub_targets, _0, _sub_category_key) in sub_items.values():
return (not main, uri) # show main entries at first sub_targets.sort(key=_key_func_0)
for indexentry in new.values(): new_list = sorted(new.items(), key=_key_func_1)
indexentry[0].sort(key=keyfunc0)
for subentry in indexentry[1].values():
subentry[0].sort(key=keyfunc0) # type: ignore
# sort the index entries
def keyfunc(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
key, (void, void, category_key) = entry
if category_key:
# using specified category key to sort
key = category_key
lckey = unicodedata.normalize('NFD', key.lower())
if lckey.startswith('\N{RIGHT-TO-LEFT MARK}'):
lckey = lckey[1:]
if lckey[0:1].isalpha() or lckey.startswith('_'):
# put non-symbol characters at the following group (1)
sortkey = (1, lckey)
else:
# put symbols at the front of the index (0)
sortkey = (0, lckey)
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (sortkey, entry[0])
newlist = sorted(new.items(), key=keyfunc)
if group_entries: if group_entries:
# fixup entries: transform # fixup entries: transform
@ -119,54 +90,95 @@ class IndexEntries:
# func() # func()
# (in module foo) # (in module foo)
# (in module bar) # (in module bar)
oldkey = '' old_key = ''
oldsubitems: dict[str, list] = {} old_sub_items: dict[str, list] = {}
i = 0 i = 0
while i < len(newlist): while i < len(new_list):
key, (targets, subitems, _key) = newlist[i] key, (targets, sub_items, category_key) = new_list[i]
# cannot move if it has subitems; structure gets too complex # cannot move if it has sub_items; structure gets too complex
if not subitems: if not sub_items:
m = _fixre.match(key) m = _fixre.match(key)
if m: if m:
if oldkey == m.group(1): if old_key == m.group(1):
# prefixes match: add entry as subitem of the # prefixes match: add entry as subitem of the
# previous entry # previous entry
oldsubitems.setdefault(m.group(2), [[], {}, _key])[0].\ old_sub_items.setdefault(
extend(targets) m.group(2), [[], {}, category_key])[0].extend(targets)
del newlist[i] del new_list[i]
continue continue
oldkey = m.group(1) old_key = m.group(1)
else: else:
oldkey = key old_key = key
oldsubitems = subitems old_sub_items = sub_items
i += 1 i += 1
# sort the sub-index entries
def keyfunc2(entry: tuple[str, list]) -> str:
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
if key[0:1].isalpha() or key.startswith('_'):
key = chr(127) + key
return key
# group the entries by letter
def keyfunc3(item: tuple[str, list]) -> str:
# hack: mutating the subitems dicts to a list in the keyfunc
k, v = item
v[1] = sorted(((si, se) for (si, (se, void, void)) in v[1].items()),
key=keyfunc2)
if v[2] is None:
# now calculate the key
if k.startswith('\N{RIGHT-TO-LEFT MARK}'):
k = k[1:]
letter = unicodedata.normalize('NFD', k[0])[0].upper()
if letter.isalpha() or letter == '_':
return letter
else:
# get all other symbols under one heading
return _('Symbols')
else:
return v[2]
return [(key_, list(group)) return [(key_, list(group))
for (key_, group) in groupby(newlist, keyfunc3)] for (key_, group) in groupby(new_list, _key_func_3)]
def _add_entry(word: str, subword: str, main: str | None, *,
dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
entry = dic.setdefault(word, [[], {}, key])
if subword:
entry = entry[1].setdefault(subword, [[], {}, key])
if link:
entry[0].append((main, link))
def _key_func_0(entry: tuple[str, str]) -> tuple[bool, str]:
"""sort the index entries for same keyword."""
main, uri = entry
return not main, uri # show main entries at first
def _key_func_1(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
"""Sort the index entries"""
key, (_targets, _sub_items, category_key) = entry
if category_key:
# using the specified category key to sort
key = category_key
lc_key = unicodedata.normalize('NFD', key.lower())
if lc_key.startswith('\N{RIGHT-TO-LEFT MARK}'):
lc_key = lc_key[1:]
if not lc_key[0:1].isalpha() and not lc_key.startswith('_'):
# put symbols at the front of the index (0)
group = 0
else:
# put non-symbol characters at the following group (1)
group = 1
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (group, lc_key), entry[0]
def _key_func_2(entry: tuple[str, list]) -> str:
"""sort the sub-index entries"""
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
if key[0:1].isalpha() or key.startswith('_'):
key = chr(127) + key
return key
def _key_func_3(entry: tuple[str, list]) -> str:
    """Group key for index entries: the heading letter, or the explicit category key.

    Side effect: replaces the entry's sub-items mapping with a list sorted by
    ``_key_func_2``, the structure the index writers expect downstream.
    """
    word, (_targets, sub_items, category_key) = entry
    # hack: mutate the sub-items dict into a sorted list while grouping
    entry[1][1] = sorted(
        ((name, links) for name, (links, _subs, _cat) in sub_items.items()),
        key=_key_func_2,
    )
    if category_key is not None:
        return category_key
    # derive the heading from the entry's first character
    word = word.removeprefix('\N{RIGHT-TO-LEFT MARK}')
    heading = unicodedata.normalize('NFD', word[0])[0].upper()
    if heading.isalpha() or heading == '_':
        return heading
    # everything else is collected under a single 'Symbols' heading
    return _('Symbols')

View File

@ -31,7 +31,7 @@ from docutils.nodes import Element, Node
from sphinx import addnodes, package_dir from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment from sphinx.environment import BuildEnvironment
from sphinx.util import split_into from sphinx.util import split_index_msg
class SearchLanguage: class SearchLanguage:
@ -478,8 +478,17 @@ class IndexBuilder:
# find explicit entries within index directives # find explicit entries within index directives
_index_entries: set[tuple[str, str, str]] = set() _index_entries: set[tuple[str, str, str]] = set()
for node in doctree.findall(addnodes.index): for node in doctree.findall(addnodes.index):
for entry_type, value, target_id, main, *index_key in node['entries']: for entry_type, value, target_id, main, _category_key in node['entries']:
_index_entries |= _parse_index_entry(entry_type, value, target_id, main) try:
result = split_index_msg(entry_type, value)
except ValueError:
pass
else:
target_id = target_id or ''
if entry_type in {'see', 'seealso'}:
_index_entries.add((result[0], target_id, main))
_index_entries |= {(x, target_id, main) for x in result}
self._index_entries[docname] = sorted(_index_entries) self._index_entries[docname] = sorted(_index_entries)
def _word_collector(self, doctree: nodes.document) -> WordStore: def _word_collector(self, doctree: nodes.document) -> WordStore:
@ -557,41 +566,3 @@ class IndexBuilder:
(base_js, language_js, self.lang.language_name)) (base_js, language_js, self.lang.language_name))
else: else:
return self.lang.js_stemmer_code return self.lang.js_stemmer_code
def _parse_index_entry(
    entry_type: str,
    value: str,
    target_id: str,
    main: str
) -> set[tuple[str, str, str]]:
    """Return the searchable ``(keyword, target_id, main)`` triples for one entry.

    Malformed values (``split_into`` raising ``ValueError``) and unknown entry
    types produce an empty set, except the one-part fallback for ``single``,
    which propagates any error from the second split.
    """
    target_id = target_id or ''

    def as_triple(keyword: str) -> tuple[str, str, str]:
        return (keyword, target_id, main)

    if entry_type == 'single':
        try:
            entry, subentry = split_into(2, 'single', value)
            if subentry:
                return {as_triple(entry), as_triple(subentry)}
        except ValueError:
            entry, = split_into(1, 'single', value)
            return {as_triple(entry)}
    elif entry_type == 'pair':
        try:
            first, second = split_into(2, 'pair', value)
        except ValueError:
            pass
        else:
            return {as_triple(first), as_triple(second)}
    elif entry_type == 'triple':
        try:
            first, second, third = split_into(3, 'triple', value)
        except ValueError:
            pass
        else:
            return {as_triple(first), as_triple(second), as_triple(third)}
    elif entry_type in {'see', 'seealso'}:
        try:
            first, _second = split_into(2, 'see', value)
        except ValueError:
            pass
        else:
            return {as_triple(first)}
    return set()

View File

@ -498,8 +498,8 @@ class Locale(SphinxTransform):
# Extract and translate messages for index entries. # Extract and translate messages for index entries.
for node, entries in traverse_translatable_index(self.document): for node, entries in traverse_translatable_index(self.document):
new_entries: list[tuple[str, str, str, str, str | None]] = [] new_entries: list[tuple[str, str, str, str, str | None]] = []
for type, msg, tid, main, _key in entries: for entry_type, value, target_id, main, _category_key in entries:
msg_parts = split_index_msg(type, msg) msg_parts = split_index_msg(entry_type, value)
msgstr_parts = [] msgstr_parts = []
for part in msg_parts: for part in msg_parts:
msgstr = catalog.gettext(part) msgstr = catalog.gettext(part)
@ -507,7 +507,8 @@ class Locale(SphinxTransform):
msgstr = part msgstr = part
msgstr_parts.append(msgstr) msgstr_parts.append(msgstr)
new_entries.append((type, ';'.join(msgstr_parts), tid, main, None)) new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
new_entries.append(new_entry)
node['raw_entries'] = entries node['raw_entries'] = entries
node['entries'] = new_entries node['entries'] = new_entries

View File

@ -247,28 +247,25 @@ def parselinenos(spec: str, total: int) -> list[int]:
def split_into(n: int, type: str, value: str) -> list[str]: def split_into(n: int, type: str, value: str) -> list[str]:
"""Split an index entry into a given number of parts at semicolons.""" """Split an index entry into a given number of parts at semicolons."""
parts = [x.strip() for x in value.split(';', n - 1)] parts = [x.strip() for x in value.split(';', n - 1)]
if sum(1 for part in parts if part) < n: if len(list(filter(None, parts))) < n:
raise ValueError(f'invalid {type} index entry {value!r}') raise ValueError(f'invalid {type} index entry {value!r}')
return parts return parts
def split_index_msg(type: str, value: str) -> list[str]: def split_index_msg(entry_type: str, value: str) -> list[str]:
# new entry types must be listed in directives/other.py! # new entry types must be listed in util/nodes.py!
if type == 'single': if entry_type == 'single':
try: try:
result = split_into(2, 'single', value) return split_into(2, 'single', value)
except ValueError: except ValueError:
result = split_into(1, 'single', value) return split_into(1, 'single', value)
elif type == 'pair': if entry_type == 'pair':
result = split_into(2, 'pair', value) return split_into(2, 'pair', value)
elif type == 'triple': if entry_type == 'triple':
result = split_into(3, 'triple', value) return split_into(3, 'triple', value)
elif type in {'see', 'seealso'}: if entry_type in {'see', 'seealso'}:
result = split_into(2, 'see', value) return split_into(2, 'see', value)
else: raise ValueError(f'invalid {entry_type} index entry {value!r}')
raise ValueError(f'invalid {type} index entry {value!r}')
return result
def import_object(objname: str, source: str | None = None) -> Any: def import_object(objname: str, source: str | None = None) -> Any:

View File

@ -1326,9 +1326,8 @@ class TexinfoTranslator(SphinxTranslator):
self.ensure_eol() self.ensure_eol()
else: else:
self.body.append('\n') self.body.append('\n')
for entry in node['entries']: for (_entry_type, value, _target_id, _main, _category_key) in node['entries']:
typ, text, tid, text2, key_ = entry text = self.escape_menu(value)
text = self.escape_menu(text)
self.body.append('@geindex %s\n' % text) self.body.append('@geindex %s\n' % text)
def visit_versionmodified(self, node: Element) -> None: def visit_versionmodified(self, node: Element) -> None: