Re-structure index entry processing (#11505)

Adam Turner, 2023-07-24 03:17:29 +01:00 (committed by GitHub)
parent 480630c649
commit 24b4d65a02
7 changed files with 160 additions and 180 deletions

@@ -156,8 +156,8 @@ class I18nBuilder(Builder):
if 'index' in self.env.config.gettext_additional_targets:
# Extract translatable messages from index entries.
for node, entries in traverse_translatable_index(doctree):
for typ, msg, _tid, _main, _key in entries:
for m in split_index_msg(typ, msg):
for entry_type, value, _target_id, _main, _category_key in entries:
for m in split_index_msg(entry_type, value):
catalog.add(m, node)
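
For reference, index nodes carry 5-tuples of (entry_type, value, target_id, main, category_key); a small sketch of what the renamed loop above extracts for the message catalog, assuming split_index_msg is importable from sphinx.util as elsewhere in this commit (entry values are illustrative):

from sphinx.util import split_index_msg

entries = [('single', 'installation', 'index-0', '', None),
           ('pair', 'module; search path', 'index-1', '', None)]
for entry_type, value, _target_id, _main, _category_key in entries:
    for message in split_index_msg(entry_type, value):
        print(message)  # installation, module, search path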

@@ -29,7 +29,7 @@ class IndexDomain(Domain):
label = 'index'
@property
def entries(self) -> dict[str, list[tuple[str, str, str, str, str]]]:
def entries(self) -> dict[str, list[tuple[str, str, str, str, str | None]]]:
return self.data.setdefault('entries', {})
def clear_doc(self, docname: str) -> None:
@@ -44,8 +44,8 @@ class IndexDomain(Domain):
entries = self.entries.setdefault(env.docname, [])
for node in list(document.findall(addnodes.index)):
try:
for entry in node['entries']:
split_index_msg(entry[0], entry[1])
for (entry_type, value, _target_id, _main, _category_key) in node['entries']:
split_index_msg(entry_type, value)
except ValueError as exc:
logger.warning(str(exc), location=node)
node.parent.remove(node)
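
Here split_index_msg is called purely for validation: a malformed entry raises ValueError, the warning is logged, and the whole index node is dropped. A tiny sketch of that failure path (the entry value is made up):

from sphinx.util import split_index_msg

try:
    split_index_msg('pair', 'only one part')  # 'pair' needs two ';'-separated parts
except ValueError as exc:
    print(exc)  # invalid pair index entry 'only one part'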

@@ -5,10 +5,9 @@ from __future__ import annotations
import re
import unicodedata
from itertools import groupby
from typing import Any, cast
from typing import Any, Literal
from sphinx.builders import Builder
from sphinx.domains.index import IndexDomain
from sphinx.environment import BuildEnvironment
from sphinx.errors import NoUri
from sphinx.locale import _, __
@@ -20,6 +19,7 @@ logger = logging.getLogger(__name__)
class IndexEntries:
def __init__(self, env: BuildEnvironment) -> None:
self.env = env
self.builder: Builder
def create_index(self, builder: Builder, group_entries: bool = True,
_fixre: re.Pattern = re.compile(r'(.*) ([(][^()]*[)])'),
@@ -27,89 +27,60 @@ class IndexEntries:
"""Create the real index from the collected index entries."""
new: dict[str, list] = {}
def add_entry(word: str, subword: str, main: str | None, link: bool = True,
dic: dict[str, list] = new, key: str | None = None) -> None:
# Force the word to be unicode if it's a ASCII bytestring.
# This will solve problems with unicode normalization later.
# For instance the RFC role will add bytestrings at the moment
word = str(word)
entry = dic.get(word)
if not entry:
dic[word] = entry = [[], {}, key]
if subword:
add_entry(subword, '', main, link=link, dic=entry[1], key=key)
elif link:
rel_uri: str | Literal[False]
index_domain = self.env.domains['index']
for docname, entries in index_domain.entries.items():
try:
uri = builder.get_relative_uri('genindex', fn) + '#' + tid
rel_uri = builder.get_relative_uri('genindex', docname)
except NoUri:
pass
else:
entry[0].append((main, uri))
rel_uri = False
domain = cast(IndexDomain, self.env.get_domain('index'))
for fn, entries in domain.entries.items():
# new entry types must be listed in directives/other.py!
for type, value, tid, main, index_key in entries: # noqa: B007
for entry_type, value, target_id, main, category_key in entries:
uri = rel_uri is not False and f'{rel_uri}#{target_id}'
try:
if type == 'single':
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
entry, sub_entry = split_into(2, 'single', value)
except ValueError:
entry, = split_into(1, 'single', value)
subentry = ''
add_entry(entry, subentry, main, key=index_key)
elif type == 'pair':
sub_entry = ''
_add_entry(entry, sub_entry, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'pair':
first, second = split_into(2, 'pair', value)
add_entry(first, second, main, key=index_key)
add_entry(second, first, main, key=index_key)
elif type == 'triple':
_add_entry(first, second, main,
dic=new, link=uri, key=category_key)
_add_entry(second, first, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'triple':
first, second, third = split_into(3, 'triple', value)
add_entry(first, second + ' ' + third, main, key=index_key)
add_entry(second, third + ', ' + first, main, key=index_key)
add_entry(third, first + ' ' + second, main, key=index_key)
elif type == 'see':
_add_entry(first, second + ' ' + third, main,
dic=new, link=uri, key=category_key)
_add_entry(second, third + ', ' + first, main,
dic=new, link=uri, key=category_key)
_add_entry(third, first + ' ' + second, main,
dic=new, link=uri, key=category_key)
elif entry_type == 'see':
first, second = split_into(2, 'see', value)
add_entry(first, _('see %s') % second, None,
link=False, key=index_key)
elif type == 'seealso':
_add_entry(first, _('see %s') % second, None,
dic=new, link=False, key=category_key)
elif entry_type == 'seealso':
first, second = split_into(2, 'see', value)
add_entry(first, _('see also %s') % second, None,
link=False, key=index_key)
_add_entry(first, _('see also %s') % second, None,
dic=new, link=False, key=category_key)
else:
logger.warning(__('unknown index entry type %r'), type, location=fn)
logger.warning(__('unknown index entry type %r'), entry_type,
location=docname)
except ValueError as err:
logger.warning(str(err), location=fn)
logger.warning(str(err), location=docname)
# sort the index entries for same keyword.
def keyfunc0(entry: tuple[str, str]) -> tuple[bool, str]:
main, uri = entry
return (not main, uri) # show main entries at first
for (targets, sub_items, _category_key) in new.values():
targets.sort(key=_key_func_0)
for (sub_targets, _0, _sub_category_key) in sub_items.values():
sub_targets.sort(key=_key_func_0)
for indexentry in new.values():
indexentry[0].sort(key=keyfunc0)
for subentry in indexentry[1].values():
subentry[0].sort(key=keyfunc0) # type: ignore
# sort the index entries
def keyfunc(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
key, (void, void, category_key) = entry
if category_key:
# using specified category key to sort
key = category_key
lckey = unicodedata.normalize('NFD', key.lower())
if lckey.startswith('\N{RIGHT-TO-LEFT MARK}'):
lckey = lckey[1:]
if lckey[0:1].isalpha() or lckey.startswith('_'):
# put non-symbol characters at the following group (1)
sortkey = (1, lckey)
else:
# put symbols at the front of the index (0)
sortkey = (0, lckey)
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (sortkey, entry[0])
newlist = sorted(new.items(), key=keyfunc)
new_list = sorted(new.items(), key=_key_func_1)
if group_entries:
# fixup entries: transform
@@ -119,30 +90,70 @@ class IndexEntries:
# func()
# (in module foo)
# (in module bar)
oldkey = ''
oldsubitems: dict[str, list] = {}
old_key = ''
old_sub_items: dict[str, list] = {}
i = 0
while i < len(newlist):
key, (targets, subitems, _key) = newlist[i]
# cannot move if it has subitems; structure gets too complex
if not subitems:
while i < len(new_list):
key, (targets, sub_items, category_key) = new_list[i]
# cannot move if it has sub_items; structure gets too complex
if not sub_items:
m = _fixre.match(key)
if m:
if oldkey == m.group(1):
if old_key == m.group(1):
# prefixes match: add entry as subitem of the
# previous entry
oldsubitems.setdefault(m.group(2), [[], {}, _key])[0].\
extend(targets)
del newlist[i]
old_sub_items.setdefault(
m.group(2), [[], {}, category_key])[0].extend(targets)
del new_list[i]
continue
oldkey = m.group(1)
old_key = m.group(1)
else:
oldkey = key
oldsubitems = subitems
old_key = key
old_sub_items = sub_items
i += 1
# sort the sub-index entries
def keyfunc2(entry: tuple[str, list]) -> str:
return [(key_, list(group))
for (key_, group) in groupby(new_list, _key_func_3)]
def _add_entry(word: str, subword: str, main: str | None, *,
dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
entry = dic.setdefault(word, [[], {}, key])
if subword:
entry = entry[1].setdefault(subword, [[], {}, key])
if link:
entry[0].append((main, link))
def _key_func_0(entry: tuple[str, str]) -> tuple[bool, str]:
"""sort the index entries for same keyword."""
main, uri = entry
return not main, uri # show main entries at first
def _key_func_1(entry: tuple[str, list]) -> tuple[tuple[int, str], str]:
"""Sort the index entries"""
key, (_targets, _sub_items, category_key) = entry
if category_key:
# using the specified category key to sort
key = category_key
lc_key = unicodedata.normalize('NFD', key.lower())
if lc_key.startswith('\N{RIGHT-TO-LEFT MARK}'):
lc_key = lc_key[1:]
if not lc_key[0:1].isalpha() and not lc_key.startswith('_'):
# put symbols at the front of the index (0)
group = 0
else:
# put non-symbol characters at the following group (1)
group = 1
# ensure a deterministic order *within* letters by also sorting on
# the entry itself
return (group, lc_key), entry[0]
def _key_func_2(entry: tuple[str, list]) -> str:
"""sort the sub-index entries"""
key = unicodedata.normalize('NFD', entry[0].lower())
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
@@ -150,23 +161,24 @@ class IndexEntries:
key = chr(127) + key
return key
# group the entries by letter
def keyfunc3(item: tuple[str, list]) -> str:
# hack: mutating the subitems dicts to a list in the keyfunc
k, v = item
v[1] = sorted(((si, se) for (si, (se, void, void)) in v[1].items()),
key=keyfunc2)
if v[2] is None:
def _key_func_3(entry: tuple[str, list]) -> str:
"""Group the entries by letter"""
key, (targets, sub_items, category_key) = entry
# hack: mutating the sub_items dicts to a list in the key_func
entry[1][1] = sorted(((sub_key, sub_targets)
for (sub_key, (sub_targets, _0, _sub_category_key))
in sub_items.items()), key=_key_func_2)
if category_key is not None:
return category_key
# now calculate the key
if k.startswith('\N{RIGHT-TO-LEFT MARK}'):
k = k[1:]
letter = unicodedata.normalize('NFD', k[0])[0].upper()
if key.startswith('\N{RIGHT-TO-LEFT MARK}'):
key = key[1:]
letter = unicodedata.normalize('NFD', key[0])[0].upper()
if letter.isalpha() or letter == '_':
return letter
else:
# get all other symbols under one heading
return _('Symbols')
else:
return v[2]
return [(key_, list(group))
for (key_, group) in groupby(newlist, keyfunc3)]
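
To make the intermediate data structure concrete: create_index() now populates a plain dict through the module-level _add_entry helper above. A standalone sketch, with the helper copied verbatim from this diff and illustrative words and URIs:

from __future__ import annotations

from typing import Literal


def _add_entry(word: str, subword: str, main: str | None, *,
               dic: dict[str, list], link: str | Literal[False], key: str | None) -> None:
    entry = dic.setdefault(word, [[], {}, key])
    if subword:
        entry = entry[1].setdefault(subword, [[], {}, key])
    if link:
        entry[0].append((main, link))


new: dict[str, list] = {}
_add_entry('Python', 'interpreter', '', dic=new, link='usage.html#index-0', key=None)
_add_entry('Python', '', 'main', dic=new, link='intro.html#index-1', key=None)

# Each value is [targets, sub_entries, category_key]:
# {'Python': [[('main', 'intro.html#index-1')],
#             {'interpreter': [[('', 'usage.html#index-0')], {}, None]},
#             None]}

From there, _key_func_0 puts main targets first, _key_func_1 sorts symbol keywords (group 0) ahead of alphabetic ones (group 1), and _key_func_3 groups entries under their first letter unless a category_key supplies the heading.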

@@ -31,7 +31,7 @@ from docutils.nodes import Element, Node
from sphinx import addnodes, package_dir
from sphinx.environment import BuildEnvironment
from sphinx.util import split_into
from sphinx.util import split_index_msg
class SearchLanguage:
@@ -478,8 +478,17 @@ class IndexBuilder:
# find explicit entries within index directives
_index_entries: set[tuple[str, str, str]] = set()
for node in doctree.findall(addnodes.index):
for entry_type, value, target_id, main, *index_key in node['entries']:
_index_entries |= _parse_index_entry(entry_type, value, target_id, main)
for entry_type, value, target_id, main, _category_key in node['entries']:
try:
result = split_index_msg(entry_type, value)
except ValueError:
pass
else:
target_id = target_id or ''
if entry_type in {'see', 'seealso'}:
_index_entries.add((result[0], target_id, main))
_index_entries |= {(x, target_id, main) for x in result}
self._index_entries[docname] = sorted(_index_entries)
def _word_collector(self, doctree: nodes.document) -> WordStore:
@@ -557,41 +566,3 @@ class IndexBuilder:
(base_js, language_js, self.lang.language_name))
else:
return self.lang.js_stemmer_code
def _parse_index_entry(
entry_type: str,
value: str,
target_id: str,
main: str
) -> set[tuple[str, str, str]]:
target_id = target_id or ''
if entry_type == 'single':
try:
entry, subentry = split_into(2, 'single', value)
if subentry:
return {(entry, target_id, main), (subentry, target_id, main)}
except ValueError:
entry, = split_into(1, 'single', value)
return {(entry, target_id, main)}
elif entry_type == 'pair':
try:
first, second = split_into(2, 'pair', value)
return {(first, target_id, main), (second, target_id, main)}
except ValueError:
pass
elif entry_type == 'triple':
try:
first, second, third = split_into(3, 'triple', value)
return {(first, target_id, main),
(second, target_id, main),
(third, target_id, main)}
except ValueError:
pass
elif entry_type in {'see', 'seealso'}:
try:
first, second = split_into(2, 'see', value)
return {(first, target_id, main)}
except ValueError:
pass
return set()
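
With the removed _parse_index_entry helper gone, the collection loop above leans on split_index_msg directly; as before, 'see'/'seealso' entries contribute only the referring term (result[0]) to the search index. A quick sketch of the values involved (entry strings are illustrative):

from sphinx.util import split_index_msg

# A 'triple' entry contributes every part to the search index ...
print(split_index_msg('triple', 'module; search; path'))  # ['module', 'search', 'path']
# ... while 'see'/'seealso' entries contribute only the referring term.
print(split_index_msg('see', 'argument; parameter')[0])   # argument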

@@ -498,8 +498,8 @@ class Locale(SphinxTransform):
# Extract and translate messages for index entries.
for node, entries in traverse_translatable_index(self.document):
new_entries: list[tuple[str, str, str, str, str | None]] = []
for type, msg, tid, main, _key in entries:
msg_parts = split_index_msg(type, msg)
for entry_type, value, target_id, main, _category_key in entries:
msg_parts = split_index_msg(entry_type, value)
msgstr_parts = []
for part in msg_parts:
msgstr = catalog.gettext(part)
@@ -507,7 +507,8 @@ class Locale(SphinxTransform):
msgstr = part
msgstr_parts.append(msgstr)
new_entries.append((type, ';'.join(msgstr_parts), tid, main, None))
new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
new_entries.append(new_entry)
node['raw_entries'] = entries
node['entries'] = new_entries
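
A sketch of the translation step above, with a plain dict standing in for catalog.gettext and made-up message strings:

from sphinx.util import split_index_msg

catalog = {'module': 'Modul', 'search path': 'Suchpfad'}  # stand-in for the real gettext catalog

entry_type, value, target_id, main = 'pair', 'module; search path', 'index-0', ''
msgstr_parts = [catalog.get(part, part) for part in split_index_msg(entry_type, value)]
new_entry = entry_type, ';'.join(msgstr_parts), target_id, main, None
print(new_entry)  # ('pair', 'Modul;Suchpfad', 'index-0', '', None)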

@@ -247,28 +247,25 @@ def parselinenos(spec: str, total: int) -> list[int]:
def split_into(n: int, type: str, value: str) -> list[str]:
"""Split an index entry into a given number of parts at semicolons."""
parts = [x.strip() for x in value.split(';', n - 1)]
if sum(1 for part in parts if part) < n:
if len(list(filter(None, parts))) < n:
raise ValueError(f'invalid {type} index entry {value!r}')
return parts
def split_index_msg(type: str, value: str) -> list[str]:
# new entry types must be listed in directives/other.py!
if type == 'single':
def split_index_msg(entry_type: str, value: str) -> list[str]:
# new entry types must be listed in util/nodes.py!
if entry_type == 'single':
try:
result = split_into(2, 'single', value)
return split_into(2, 'single', value)
except ValueError:
result = split_into(1, 'single', value)
elif type == 'pair':
result = split_into(2, 'pair', value)
elif type == 'triple':
result = split_into(3, 'triple', value)
elif type in {'see', 'seealso'}:
result = split_into(2, 'see', value)
else:
raise ValueError(f'invalid {type} index entry {value!r}')
return result
return split_into(1, 'single', value)
if entry_type == 'pair':
return split_into(2, 'pair', value)
if entry_type == 'triple':
return split_into(3, 'triple', value)
if entry_type in {'see', 'seealso'}:
return split_into(2, 'see', value)
raise ValueError(f'invalid {entry_type} index entry {value!r}')
def import_object(objname: str, source: str | None = None) -> Any:
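
Behaviour is unchanged by the early-return rewrite; a few illustrative calls (entry strings are made up):

from sphinx.util import split_index_msg  # location as of this commit

print(split_index_msg('single', 'execution; frame'))  # ['execution', 'frame']
print(split_index_msg('single', 'interpreter'))       # ['interpreter']
print(split_index_msg('pair', 'loop; statement'))     # ['loop', 'statement']
print(split_index_msg('seealso', 'dict; mapping'))    # ['dict', 'mapping']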

@@ -1326,9 +1326,8 @@ class TexinfoTranslator(SphinxTranslator):
self.ensure_eol()
else:
self.body.append('\n')
for entry in node['entries']:
typ, text, tid, text2, key_ = entry
text = self.escape_menu(text)
for (_entry_type, value, _target_id, _main, _category_key) in node['entries']:
text = self.escape_menu(value)
self.body.append('@geindex %s\n' % text)
def visit_versionmodified(self, node: Element) -> None:
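
Only the entry value (the tuple's second element) matters to the Texinfo writer; a rough sketch of what the loop emits, with the escape_menu escaping elided and made-up entry values:

entries = [('single', 'interpreter', 'index-0', '', None),
           ('pair', 'module; search path', 'index-1', '', None)]
body = []
for (_entry_type, value, _target_id, _main, _category_key) in entries:
    body.append('@geindex %s\n' % value)  # the real writer first escapes via self.escape_menu(value)
print(''.join(body), end='')
# @geindex interpreter
# @geindex module; search path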