Re-structure index entry processing (#11505)

This commit is contained in:
Adam Turner 2023-07-24 03:17:29 +01:00 committed by GitHub
parent 480630c649
commit 24b4d65a02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 160 additions and 180 deletions

View File

@ -156,8 +156,8 @@ class I18nBuilder(Builder):
if 'index' in self.env.config.gettext_additional_targets: if 'index' in self.env.config.gettext_additional_targets:
# Extract translatable messages from index entries. # Extract translatable messages from index entries.
for node, entries in traverse_translatable_index(doctree): for node, entries in traverse_translatable_index(doctree):
for typ, msg, _tid, _main, _key in entries: for entry_type, value, _target_id, _main, _category_key in entries:
for m in split_index_msg(typ, msg): for m in split_index_msg(entry_type, value):
catalog.add(m, node) catalog.add(m, node)

View File

@ -29,7 +29,7 @@ class IndexDomain(Domain):
label = 'index' label = 'index'
@property @property
def entries(self) -> dict[str, list[tuple[str, str, str, str, str]]]: def entries(self) -> dict[str, list[tuple[str, str, str, str, str | None]]]:
return self.data.setdefault('entries', {}) return self.data.setdefault('entries', {})
def clear_doc(self, docname: str) -> None: def clear_doc(self, docname: str) -> None:
@ -44,8 +44,8 @@ class IndexDomain(Domain):
entries = self.entries.setdefault(env.docname, []) entries = self.entries.setdefault(env.docname, [])
for node in list(document.findall(addnodes.index)): for node in list(document.findall(addnodes.index)):
try: try:
for entry in node['entries']: for (entry_type, value, _target_id, _main, _category_key) in node['entries']:
split_index_msg(entry[0], entry[1]) split_index_msg(entry_type, value)
except ValueError as exc: except ValueError as exc:
logger.warning(str(exc), location=node) logger.warning(str(exc), location=node)
node.parent.remove(node) node.parent.remove(node)

View File

@ -5,10 +5,9 @@ from __future__ import annotations
import re import re
import unicodedata import unicodedata
from itertools import groupby from itertools import groupby
from typing import Any, cast from typing import Any, Literal
from sphinx.builders import Builder from sphinx.builders import Builder
from sphinx.domains.index import IndexDomain
from sphinx.environment import BuildEnvironment from sphinx.environment import BuildEnvironment
from sphinx.errors import NoUri from sphinx.errors import NoUri
from sphinx.locale import _, __ from sphinx.locale import _, __
@ -20,6 +19,7 @@ logger = logging.getLogger(__name__)
class IndexEntries: class IndexEntries:
def __init__(self, env: BuildEnvironment) -> None: def __init__(self, env: BuildEnvironment) -> None:
self.env = env self.env = env
self.builder: Builder
def create_index(self, builder: Builder, group_entries: bool = True, def create_index(self, builder: Builder, group_entries: bool = True,
_fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'), _fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'),
@ -27,89 +27,60 @@ class IndexEntries:
"""Create the real index from the collected index entries.""" """Create the real index from the collected index entries."""
new: dict[str, list] = {} new: dict[str, list] = {}
def add_entry(word: str, subword: str, main: str | None, link: bool = True, rel_uri: str | Literal[False]
dic: dict[str, list] = new, key: str | None = None) -> None: index_domain = self.env.domains['index']
# Force the word to be unicode if it's a ASCII bytestring. for docname, entries in index_domain.entries.items():
# This will solve problems with unicode normalization later. try:
# For instance the RFC role will add bytestrings at the moment rel_uri = builder.get_relative_uri('genindex', docname)
word = str(word) except NoUri:
entry = dic.get(word) rel_uri = False
if not entry:
dic[word] = entry = [[], {}, key]
if subword:
add_entry(subword, '', main, link=link, dic=entry[1], key=key)
elif link:
try:
uri = builder.get_relative_uri('genindex', fn) + '#' + tid
except NoUri:
pass
else:
entry[0].append((main, uri))
domain = cast(IndexDomain, self.env.get_domain('index'))
for fn, entries in domain.entries.items():
# new entry types must be listed in directives/other.py! # new entry types must be listed in directives/other.py!
for type, value, tid, main, index_key in entries: # noqa: B007 for entry_type, value, target_id, main, category_key in entries:
uri = rel_uri is not False and f'{rel_uri}#{target_id}'
try: try:
if type == 'single': if entry_type == 'single':
try: try:
entry, subentry = split_into(2, 'single', value) entry, sub_entry = split_into(2, 'single', value)
except ValueError: except ValueError:
entry, = split_into(1, 'single', value) entry, = split_into(1, 'single', value)
subentry = '' sub_entry = ''
add_entry(entry, subentry, main, key=index_key) _add_entry(entry, sub_entry, main,
elif type == 'pair': dic=new, link=uri, key=category_key)
elif entry_type == 'pair':
first, second = split_into(2, 'pair', value) first, second = split_into(2, 'pair', value)
add_entry(first, second, main, key=index_key) _add_entry(first, second, main,
add_entry(second, first, main, key=index_key) dic=new, link=uri, key=category_key)
elif type == 'triple': _add_entry(second, first, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'triple':
first, second, third = split_into(3, 'triple', value) first, second, third = split_into(3, 'triple', value)
add_entry(first, second + ' ' + third, main, key=index_key) _add_entry(first, second + ' ' + third, main,
add_entry(second, third + ', ' + first, main, key=index_key) dic=new, link=uri, key=category_key)
add_entry(third, first + ' ' + second, main, key=index_key) _add_entry(second, third + ', ' + first, main,
elif type == 'see': dic=new, link=uri, key=category_key)
_add_entry(third, first + ' ' + second, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'see':
first, second = split_into(2, 'see', value) first, second = split_into(2, 'see', value)
add_entry(first, _('see %s') % second, None, _add_entry(first, _('see %s') % second, None,
link=False, key=index_key) dic=new, link=False, key=category_key)
elif type == 'seealso': elif entry_type == 'seealso':
first, second = split_into(2, 'see', value) first, second = split_into(2, 'see', value)
add_entry(first, _('see also %s') % second, None, _add_entry(first, _('see also %s') % second, None,
link=False, key=index_key) dic=new, link=False, key=category_key)
else: else:
logger.warning(__('unknown index entry type %r'), type, location=fn) logger.warning(__('unknown index entry type %r'), entry_type,
location=docname)
except ValueError as err: except ValueError as err:
logger.warning(str(err), location=fn) logger.warning(str(err), location=docname)
# sort the index entries for same keyword. for (targets, sub_items, _category_key) in new.values():
def keyfunc0(entry: tuple[str, str]) -> tuple[bool, str]: targets.sort(key=_key_func_0)
main, uri = entry for (sub_targets, _0, _sub_category_key) in sub_items.values():
return (not main, uri) # show main entries at first sub_targets.sort(key=_key_func_0)
for indexentry in new.values(): new_list = sorted(new.items(), key=_key_func_1)
indexentry[0].sort(key=keyfunc0)
for subentry in indexentry[1].values():
subentry[0].sort(key=keyfunc0) # type: ignore
# sort the index entries
def keyfunc(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
key, (void, void, category_key) = entry
if category_key:
# using specified category key to sort
key = category_key
lckey = unicodedata.normalize('NFD', key.lower())
if lckey.startswith('\N{RIGHT-TO-LEFT MARK}'):
lckey = lckey[1:]
if lckey[0:1].isalpha() or lckey.startswith('_'):
# put non-symbol characters at the following group (1)
sortkey = (1, lckey)
else:
# put symbols at the front of the index (0)
sortkey = (0, lckey)
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (sortkey, entry[0])
newlist = sorted(new.items(), key=keyfunc)
if group_entries: if group_entries:
# fixup entries: transform # fixup entries: transform
@ -119,54 +90,95 @@ class IndexEntries:
# func() # func()
# (in module foo) # (in module foo)
# (in module bar) # (in module bar)
oldkey = '' old_key = ''
oldsubitems: dict[str, list] = {} old_sub_items: dict[str, list] = {}
i = 0 i = 0
while i < len(newlist): while i < len(new_list):
key, (targets, subitems, _key) = newlist[i] key, (targets, sub_items, category_key) = new_list[i]
# cannot move if it has subitems; structure gets too complex # cannot move if it has sub_items; structure gets too complex
if not subitems: if not sub_items:
m = _fixre.match(key) m = _fixre.match(key)
if m: if m:
if oldkey == m.group(1): if old_key == m.group(1):
# prefixes match: add entry as subitem of the # prefixes match: add entry as subitem of the
# previous entry # previous entry
oldsubitems.setdefault(m.group(2), [[], {}, _key])[0].\ old_sub_items.setdefault(
extend(targets) m.group(2), [[], {}, category_key])[0].extend(targets)
del newlist[i] del new_list[i]
continue continue
oldkey = m.group(1) old_key = m.group(1)
else: else:
oldkey = key old_key = key
oldsubitems = subitems old_sub_items = sub_items
i += 1 i += 1
# sort the sub-index entries
def keyfunc2(entry: tuple[str, list]) -> str:
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
if key[0:1].isalpha() or key.startswith('_'):
key = chr(127) + key
return key
# group the entries by letter
def keyfunc3(item: tuple[str, list]) -> str:
# hack: mutating the subitems dicts to a list in the keyfunc
k, v = item
v[1] = sorted(((si, se) for (si, (se, void, void)) in v[1].items()),
key=keyfunc2)
if v[2] is None:
# now calculate the key
if k.startswith('\N{RIGHT-TO-LEFT MARK}'):
k = k[1:]
letter = unicodedata.normalize('NFD', k[0])[0].upper()
if letter.isalpha() or letter == '_':
return letter
else:
# get all other symbols under one heading
return _('Symbols')
else:
return v[2]
return [(key_, list(group)) return [(key_, list(group))
for (key_, group) in groupby(newlist, keyfunc3)] for (key_, group) in groupby(new_list, _key_func_3)]
def _add_entry(word: str, subword: str, main: str | None, *,
dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
entry = dic.setdefault(word, [[], {}, key])
if subword:
entry = entry[1].setdefault(subword, [[], {}, key])
if link:
entry[0].append((main, link))
def _key_func_0(entry: tuple[str, str]) -> tuple[bool, str]:
"""sort the index entries for same keyword."""
main, uri = entry
return not main, uri # show main entries at first
def _key_func_1(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
"""Sort the index entries"""
key, (_targets, _sub_items, category_key) = entry
if category_key:
# using the specified category key to sort
key = category_key
lc_key = unicodedata.normalize('NFD', key.lower())
if lc_key.startswith('\N{RIGHT-TO-LEFT MARK}'):
lc_key = lc_key[1:]
if not lc_key[0:1].isalpha() and not lc_key.startswith('_'):
# put symbols at the front of the index (0)
group = 0
else:
# put non-symbol characters at the following group (1)
group = 1
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (group, lc_key), entry[0]
def _key_func_2(entry: tuple[str, list]) -> str:
"""sort the sub-index entries"""
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
if key[0:1].isalpha() or key.startswith('_'):
key = chr(127) + key
return key
def _key_func_3(entry: tuple[str, list]) -> str:
    """Group key for index entries: the heading letter, or the explicit category key.

    Side effect: replaces the entry's sub-items mapping with a list sorted by
    ``_key_func_2``, the structure the index writers expect downstream.
    """
    word, (_targets, sub_items, category_key) = entry
    # hack: mutate the sub-items dict into a sorted list while grouping
    entry[1][1] = sorted(
        ((name, links) for name, (links, _subs, _cat) in sub_items.items()),
        key=_key_func_2,
    )
    if category_key is not None:
        return category_key
    # derive the heading from the entry's first character
    word = word.removeprefix('\N{RIGHT-TO-LEFT MARK}')
    heading = unicodedata.normalize('NFD', word[0])[0].upper()
    if heading.isalpha() or heading == '_':
        return heading
    # everything else is collected under a single 'Symbols' heading
    return _('Symbols')

View File

@ -31,7 +31,7 @@ from docutils.nodes import Element, Node
from sphinx import addnodes, package_dir from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment from sphinx.environment import BuildEnvironment
from sphinx.util import split_into from sphinx.util import split_index_msg
class SearchLanguage: class SearchLanguage:
@ -478,8 +478,17 @@ class IndexBuilder:
# find explicit entries within index directives # find explicit entries within index directives
_index_entries: set[tuple[str, str, str]] = set() _index_entries: set[tuple[str, str, str]] = set()
for node in doctree.findall(addnodes.index): for node in doctree.findall(addnodes.index):
for entry_type, value, target_id, main, *index_key in node['entries']: for entry_type, value, target_id, main, _category_key in node['entries']:
_index_entries |= _parse_index_entry(entry_type, value, target_id, main) try:
result = split_index_msg(entry_type, value)
except ValueError:
pass
else:
target_id = target_id or ''
if entry_type in {'see', 'seealso'}:
_index_entries.add((result[0], target_id, main))
_index_entries |= {(x, target_id, main) for x in result}
self._index_entries[docname] = sorted(_index_entries) self._index_entries[docname] = sorted(_index_entries)
def _word_collector(self, doctree: nodes.document) -> WordStore: def _word_collector(self, doctree: nodes.document) -> WordStore:
@ -557,41 +566,3 @@ class IndexBuilder:
(base_js, language_js, self.lang.language_name)) (base_js, language_js, self.lang.language_name))
else: else:
return self.lang.js_stemmer_code return self.lang.js_stemmer_code
def _parse_index_entry(
    entry_type: str,
    value: str,
    target_id: str,
    main: str
) -> set[tuple[str, str, str]]:
    """Return the searchable ``(keyword, target_id, main)`` triples for one entry.

    Malformed values (``split_into`` raising ``ValueError``) and unknown entry
    types produce an empty set, except the one-part fallback for ``single``,
    which propagates any error from the second split.
    """
    target_id = target_id or ''

    def as_triple(keyword: str) -> tuple[str, str, str]:
        return (keyword, target_id, main)

    if entry_type == 'single':
        try:
            entry, subentry = split_into(2, 'single', value)
            if subentry:
                return {as_triple(entry), as_triple(subentry)}
        except ValueError:
            entry, = split_into(1, 'single', value)
            return {as_triple(entry)}
    elif entry_type == 'pair':
        try:
            first, second = split_into(2, 'pair', value)
        except ValueError:
            pass
        else:
            return {as_triple(first), as_triple(second)}
    elif entry_type == 'triple':
        try:
            first, second, third = split_into(3, 'triple', value)
        except ValueError:
            pass
        else:
            return {as_triple(first), as_triple(second), as_triple(third)}
    elif entry_type in {'see', 'seealso'}:
        try:
            first, _second = split_into(2, 'see', value)
        except ValueError:
            pass
        else:
            return {as_triple(first)}
    return set()

View File

@ -498,8 +498,8 @@ class Locale(SphinxTransform):
# Extract and translate messages for index entries. # Extract and translate messages for index entries.
for node, entries in traverse_translatable_index(self.document): for node, entries in traverse_translatable_index(self.document):
new_entries: list[tuple[str, str, str, str, str | None]] = [] new_entries: list[tuple[str, str, str, str, str | None]] = []
for type, msg, tid, main, _key in entries: for entry_type, value, target_id, main, _category_key in entries:
msg_parts = split_index_msg(type, msg) msg_parts = split_index_msg(entry_type, value)
msgstr_parts = [] msgstr_parts = []
for part in msg_parts: for part in msg_parts:
msgstr = catalog.gettext(part) msgstr = catalog.gettext(part)
@ -507,7 +507,8 @@ class Locale(SphinxTransform):
msgstr = part msgstr = part
msgstr_parts.append(msgstr) msgstr_parts.append(msgstr)
new_entries.append((type, ';'.join(msgstr_parts), tid, main, None)) new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
new_entries.append(new_entry)
node['raw_entries'] = entries node['raw_entries'] = entries
node['entries'] = new_entries node['entries'] = new_entries

View File

@ -247,28 +247,25 @@ def parselinenos(spec: str, total: int) -> list[int]:
def split_into(n: int, type: str, value: str) -> list[str]: def split_into(n: int, type: str, value: str) -> list[str]:
"""Split an index entry into a given number of parts at semicolons.""" """Split an index entry into a given number of parts at semicolons."""
parts = [x.strip() for x in value.split(';', n - 1)] parts = [x.strip() for x in value.split(';', n - 1)]
if sum(1 for part in parts if part) < n: if len(list(filter(None, parts))) < n:
raise ValueError(f'invalid {type} index entry {value!r}') raise ValueError(f'invalid {type} index entry {value!r}')
return parts return parts
def split_index_msg(type: str, value: str) -> list[str]: def split_index_msg(entry_type: str, value: str) -> list[str]:
# new entry types must be listed in directives/other.py! # new entry types must be listed in util/nodes.py!
if type == 'single': if entry_type == 'single':
try: try:
result = split_into(2, 'single', value) return split_into(2, 'single', value)
except ValueError: except ValueError:
result = split_into(1, 'single', value) return split_into(1, 'single', value)
elif type == 'pair': if entry_type == 'pair':
result = split_into(2, 'pair', value) return split_into(2, 'pair', value)
elif type == 'triple': if entry_type == 'triple':
result = split_into(3, 'triple', value) return split_into(3, 'triple', value)
elif type in {'see', 'seealso'}: if entry_type in {'see', 'seealso'}:
result = split_into(2, 'see', value) return split_into(2, 'see', value)
else: raise ValueError(f'invalid {entry_type} index entry {value!r}')
raise ValueError(f'invalid {type} index entry {value!r}')
return result
def import_object(objname: str, source: str | None = None) -> Any: def import_object(objname: str, source: str | None = None) -> Any:

View File

@ -1326,9 +1326,8 @@ class TexinfoTranslator(SphinxTranslator):
self.ensure_eol() self.ensure_eol()
else: else:
self.body.append('\n') self.body.append('\n')
for entry in node['entries']: for (_entry_type, value, _target_id, _main, _category_key) in node['entries']:
typ, text, tid, text2, key_ = entry text = self.escape_menu(value)
text = self.escape_menu(text)
self.body.append('@geindex %s\n' % text) self.body.append('@geindex %s\n' % text)
def visit_versionmodified(self, node: Element) -> None: def visit_versionmodified(self, node: Element) -> None: