Move XML Name pattern to `epub3`

2025-02-25 18:55:22 -06:00 · 2022-09-10 09:04:35 +01:00 · 2022-09-10 09:04:35 +01:00 · f4ab9adf77
commit f4ab9adf77
parent 5eb79c126a
4 changed files with 31 additions and 35 deletions
--- a/sphinx/builders/epub3.py
+++ b/sphinx/builders/epub3.py
@@ -6,6 +6,7 @@ Originally derived from epub.py.
 from __future__ import annotations

 import html
+import re
 from os import path
 from typing import Any, NamedTuple

@@ -14,7 +15,7 @@ from sphinx.application import Sphinx
 from sphinx.builders import _epub_base
 from sphinx.config import ENUM, Config
 from sphinx.locale import __
-from sphinx.util import logging, xmlname_checker
+from sphinx.util import logging
 from sphinx.util.fileutil import copy_asset_file
 from sphinx.util.i18n import format_date
 from sphinx.util.osutil import make_filename
@@ -50,6 +51,19 @@ HTML_TAG = (
    'xmlns:epub="http://www.idpf.org/2007/ops">'
 )

+# https://www.w3.org/TR/REC-xml/#NT-Name  (NameStartChar and NameChar productions)
+_xml_name_start_char = (
+    ':|[A-Z]|_|[a-z]|[\u00C0-\u00D6]'
+    '|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]'
+    '|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]'
+    '|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]'
+    '|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF]'
+)
+_xml_name_char = (  # leading '|' keeps '-' a separate alternative from the last start-char range
+    _xml_name_start_char + r'|\-|\.' '|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]'
+)
+_XML_NAME_PATTERN = re.compile(f'({_xml_name_start_char})({_xml_name_char})*')
+

 class Epub3Builder(_epub_base.EpubBuilder):
    """
@@ -187,7 +201,7 @@ def validate_config_values(app: Sphinx) -> None:
        logger.warning(__('conf value "epub_language" (or "language") '
                          'should not be empty for EPUB3'))
    # <package> unique-identifier attribute
-    if not xmlname_checker().match(app.config.epub_uid):
+    if not _XML_NAME_PATTERN.match(app.config.epub_uid):
        logger.warning(__('conf value "epub_uid" should be XML NAME for EPUB3'))
    # dc:title
    if not app.config.epub_title:
--- a/sphinx/util/__init__.py
+++ b/sphinx/util/__init__.py
@@ -371,32 +371,11 @@ def isurl(url: str) -> bool:
    return bool(url) and '://' in url


-def xmlname_checker() -> re.Pattern:
-    # https://www.w3.org/TR/REC-xml/#NT-Name
-    name_start_chars = [
-        ':', ['A', 'Z'], '_', ['a', 'z'], ['\u00C0', '\u00D6'],
-        ['\u00D8', '\u00F6'], ['\u00F8', '\u02FF'], ['\u0370', '\u037D'],
-        ['\u037F', '\u1FFF'], ['\u200C', '\u200D'], ['\u2070', '\u218F'],
-        ['\u2C00', '\u2FEF'], ['\u3001', '\uD7FF'], ['\uF900', '\uFDCF'],
-        ['\uFDF0', '\uFFFD'], ['\U00010000', '\U000EFFFF']]
+def _xml_name_checker():
+    # deferred import: epub3 imports sphinx.util, so a top-level import would be circular
+    from sphinx.builders.epub3 import _XML_NAME_PATTERN

-    name_chars = [
-        "\\-", "\\.", ['0', '9'], '\u00B7', ['\u0300', '\u036F'],
-        ['\u203F', '\u2040']
-    ]
-
-    def convert(entries: Any, splitter: str = '|') -> str:
-        results = []
-        for entry in entries:
-            if isinstance(entry, list):
-                results.append('[%s]' % convert(entry, '-'))
-            else:
-                results.append(entry)
-        return splitter.join(results)
-
-    start_chars_regex = convert(name_start_chars)
-    name_chars_regex = convert(name_chars)
-    return re.compile(f'({start_chars_regex})({start_chars_regex}|{name_chars_regex})*')
+    return _XML_NAME_PATTERN


 deprecated_alias('sphinx.util',
@@ -410,6 +389,7 @@ deprecated_alias('sphinx.util',
                     'rfc1123_to_epoch': _http_date.rfc1123_to_epoch,
                     'save_traceback': _exceptions.save_traceback,
                     'format_exception_cut_frames': _exceptions.format_exception_cut_frames,
+                     'xmlname_checker': _xml_name_checker,
                 },
                 RemovedInSphinx70Warning,
                 {
@@ -422,4 +402,5 @@ deprecated_alias('sphinx.util',
                     'rfc1123_to_epoch': 'sphinx.http_date.rfc1123_to_epoch',
                     'save_traceback': 'sphinx.exceptions.save_traceback',
                     'format_exception_cut_frames': 'sphinx.exceptions.format_exception_cut_frames',  # NoQA: E501
+                     'xmlname_checker': 'sphinx.builders.epub3._XML_NAME_PATTERN',
                 })
--- a/tests/test_build_epub.py
+++ b/tests/test_build_epub.py
@@ -7,6 +7,8 @@ from xml.etree import ElementTree

 import pytest

+from sphinx.builders.epub3 import _XML_NAME_PATTERN
+

 # check given command is runnable
 def runnable(command):
@@ -382,3 +384,9 @@ def test_run_epubcheck(app):
            print(exc.stdout.decode('utf-8'))
            print(exc.stderr.decode('utf-8'))
            raise AssertionError('epubcheck exited with return code %s' % exc.returncode)
+
+
+def test_xml_name_pattern_check():  # NOTE: match() anchors only at the start of the string
+    assert _XML_NAME_PATTERN.match('id-pub')
+    assert _XML_NAME_PATTERN.match('webpage')
+    assert not _XML_NAME_PATTERN.match('1bfda21')
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -6,7 +6,7 @@ import tempfile
 import pytest

 from sphinx.errors import ExtensionError
-from sphinx.util import encode_uri, ensuredir, import_object, parselinenos, xmlname_checker
+from sphinx.util import encode_uri, ensuredir, import_object, parselinenos


 def test_encode_uri():
@@ -75,10 +75,3 @@ def test_parselinenos():
        parselinenos('-', 10)
    with pytest.raises(ValueError):
        parselinenos('3-1', 10)
-
-
-def test_xmlname_check():
-    checker = xmlname_checker()
-    assert checker.match('id-pub')
-    assert checker.match('webpage')
-    assert not checker.match('1bfda21')