Merge pull request #4294 from tk0miya/refactor_parser

Refactor docutils components of Sphinx (reader, parser, FileInput and so on).
This commit is contained in:
Takeshi KOMIYA 2017-12-14 23:13:28 +09:00 committed by GitHub
commit acf5eaae84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 341 additions and 115 deletions

View File

@ -83,6 +83,7 @@ builtin_extensions = (
'sphinx.directives.code',
'sphinx.directives.other',
'sphinx.directives.patches',
'sphinx.io',
'sphinx.parsers',
'sphinx.roles',
'sphinx.transforms.post_transforms',

View File

@ -8,13 +8,15 @@
:copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
import codecs
from docutils.io import FileInput, NullOutput
from docutils.core import Publisher
from docutils.readers import standalone
from docutils.statemachine import StringList, string2lines
from docutils.writers import UnfilteredWriter
from six import string_types, text_type, iteritems
from six import text_type
from typing import Any, Union # NOQA
from sphinx.transforms import (
@ -28,7 +30,6 @@ from sphinx.transforms.i18n import (
PreserveTranslatableMessages, Locale, RemoveTranslatableInline,
)
from sphinx.util import logging
from sphinx.util import import_object, split_docinfo
from sphinx.util.docutils import LoggingReporter
if False:
@ -42,44 +43,18 @@ if False:
from sphinx.builders import Builder # NOQA
from sphinx.environment import BuildEnvironment # NOQA
docinfo_re = re.compile(':\\w+:.*?')
logger = logging.getLogger(__name__)
class SphinxBaseReader(standalone.Reader):
"""
Add our source parsers
A base class of readers for Sphinx.
This replaces reporter by Sphinx's on generating document.
"""
def __init__(self, app, parsers={}, *args, **kwargs):
# type: (Sphinx, Dict[unicode, Parser], Any, Any) -> None
standalone.Reader.__init__(self, *args, **kwargs)
self.parser_map = {} # type: Dict[unicode, Parser]
for suffix, parser_class in parsers.items():
if isinstance(parser_class, string_types):
parser_class = import_object(parser_class, 'source parser') # type: ignore
parser = parser_class()
if hasattr(parser, 'set_application'):
parser.set_application(app)
self.parser_map[suffix] = parser
def read(self, source, parser, settings):
# type: (Input, Parser, Dict) -> nodes.document
self.source = source
for suffix in self.parser_map:
if source.source_path.endswith(suffix):
self.parser = self.parser_map[suffix]
break
else:
# use special parser for unknown file-extension '*' (if exists)
self.parser = self.parser_map.get('*')
if not self.parser:
self.parser = parser
self.settings = settings
self.input = self.source.read()
self.parse()
return self.document
def get_transforms(self):
# type: () -> List[Transform]
@ -87,17 +62,19 @@ class SphinxBaseReader(standalone.Reader):
def new_document(self):
# type: () -> nodes.document
"""Creates a new document object which having a special reporter object good
for logging.
"""
document = standalone.Reader.new_document(self)
reporter = document.reporter
document.reporter = LoggingReporter(reporter.source, reporter.report_level,
reporter.halt_level, reporter.debug_flag,
reporter.error_handler)
document.reporter = LoggingReporter.from_reporter(reporter)
document.reporter.set_source(self.source)
return document
class SphinxStandaloneReader(SphinxBaseReader):
"""
Add our own transforms.
A basic document reader for Sphinx.
"""
transforms = [ApplySourceWorkaround, ExtraTranslatableNodes, PreserveTranslatableMessages,
Locale, CitationReferences, DefaultSubstitutions, MoveModuleTargets,
@ -108,29 +85,30 @@ class SphinxStandaloneReader(SphinxBaseReader):
class SphinxI18nReader(SphinxBaseReader):
"""
Replacer for document.reporter.get_source_and_line method.
A document reader for i18n.
reST text lines for translation do not have the original source line number.
This class provides the correct line numbers when reporting.
This returns the source line number of original text as current source line number
to let users know where the error happened.
Because the translated texts are partial and they don't have correct line numbers.
"""
lineno = None # type: int
transforms = [ApplySourceWorkaround, ExtraTranslatableNodes, CitationReferences,
DefaultSubstitutions, MoveModuleTargets, HandleCodeBlocks,
AutoNumbering, SortIds, RemoveTranslatableInline,
FilterSystemMessages, RefOnlyBulletListTransform,
UnreferencedFootnotesDetector]
def __init__(self, *args, **kwargs):
# type: (Any, Any) -> None
SphinxBaseReader.__init__(self, *args, **kwargs)
self.lineno = None # type: int
def set_lineno_for_reporter(self, lineno):
# type: (int) -> None
"""Stores the source line number of original text."""
self.lineno = lineno
def new_document(self):
# type: () -> nodes.document
"""Creates a new document object which having a special reporter object for
translation.
"""
document = SphinxBaseReader.new_document(self)
reporter = document.reporter
@ -143,6 +121,8 @@ class SphinxI18nReader(SphinxBaseReader):
class SphinxDummyWriter(UnfilteredWriter):
"""Dummy writer module used for generating doctree."""
supported = ('html',) # needed to keep "meta" nodes
def translate(self):
@ -155,7 +135,13 @@ def SphinxDummySourceClass(source, *args, **kwargs):
return source
class SphinxFileInput(FileInput):
class SphinxBaseFileInput(FileInput):
"""A base class of SphinxFileInput.
It supports to replace unknown Unicode characters to '?'. And it also emits
Sphinx events ``source-read`` on reading.
"""
def __init__(self, app, env, *args, **kwds):
# type: (Sphinx, BuildEnvironment, Any, Any) -> None
self.app = app
@ -175,27 +161,16 @@ class SphinxFileInput(FileInput):
def read(self):
# type: () -> unicode
def get_parser_type(source_path):
# type: (unicode) -> Tuple[unicode]
for suffix, parser_class in iteritems(self.app.registry.get_source_parsers()):
if source_path.endswith(suffix):
if isinstance(parser_class, string_types):
parser_class = import_object(parser_class, 'source parser') # type: ignore # NOQA
return parser_class.supported
return ('restructuredtext',)
"""Reads the contents from file.
After reading, it emits Sphinx event ``source-read``.
"""
data = FileInput.read(self)
if self.app:
arg = [data]
self.app.emit('source-read', self.env.docname, arg)
data = arg[0]
docinfo, data = split_docinfo(data)
if 'restructuredtext' in get_parser_type(self.source_path):
if self.env.config.rst_epilog:
data = data + '\n' + self.env.config.rst_epilog + '\n'
if self.env.config.rst_prolog:
data = self.env.config.rst_prolog + '\n' + data
return docinfo + data
# emit source-read event
arg = [data]
self.app.emit('source-read', self.env.docname, arg)
return arg[0]
def warn_and_replace(self, error):
# type: (Any) -> Tuple
@ -213,14 +188,84 @@ class SphinxFileInput(FileInput):
return (u'?', error.end)
class SphinxFileInput(SphinxBaseFileInput):
"""A basic FileInput for Sphinx."""
pass
class SphinxRSTFileInput(SphinxBaseFileInput):
    """A reST FileInput for Sphinx.

    This FileInput automatically prepends and appends the text given by
    :confval:`rst_prolog` and :confval:`rst_epilog`.

    .. important::

       This FileInput uses an instance of ``StringList`` as the return value of
       ``read()`` to keep the original source filename and line number of every
       line after prepending and appending.

       For that reason, ``sphinx.parsers.RSTParser`` should be used with this to
       parse the content correctly.
    """

    def prepend_prolog(self, text, prolog):
        # type: (StringList, unicode) -> None
        """Insert *prolog* into *text* in place, after any leading docinfo lines.

        Prolog lines are tagged with the source ``'<rst_prolog>'`` and their own
        line offsets; the separating blank lines are tagged ``'<generated>'``.
        """
        docinfo = self.count_docinfo_lines(text)
        if docinfo:
            # insert a blank line after docinfo
            text.insert(docinfo, '', '<generated>', 0)
            docinfo += 1

        # insert prolog (after docinfo if exists)
        prolog_lines = prolog.splitlines()
        for lineno, line in enumerate(prolog_lines):
            text.insert(docinfo + lineno, line, '<rst_prolog>', lineno)

        # Blank separator after the prolog.  The position is computed from the
        # number of prolog lines instead of the leaked loop variable, so a
        # prolog without any line no longer raises NameError.
        text.insert(docinfo + len(prolog_lines), '', '<generated>', 0)

    def append_epilog(self, text, epilog):
        # type: (StringList, unicode) -> None
        """Append a blank line and then *epilog* to *text* in place.

        Epilog lines are tagged with the source ``'<rst_epilog>'``.
        """
        # append a blank line and rst_epilog
        text.append('', '<generated>', 0)
        for lineno, line in enumerate(epilog.splitlines()):
            text.append(line, '<rst_epilog>', lineno)

    def read(self):
        # type: () -> StringList
        """Read the file and return it as a ``StringList``.

        Every line carries ``self.source_path`` and its 0-based line number.
        ``rst_prolog`` / ``rst_epilog`` from the config are applied when set.
        """
        inputstring = SphinxBaseFileInput.read(self)
        lines = string2lines(inputstring, convert_whitespace=True)
        content = StringList()
        for lineno, line in enumerate(lines):
            content.append(line, self.source_path, lineno)

        if self.env.config.rst_prolog:
            self.prepend_prolog(content, self.env.config.rst_prolog)

        if self.env.config.rst_epilog:
            self.append_epilog(content, self.env.config.rst_epilog)

        return content

    def count_docinfo_lines(self, content):
        # type: (StringList) -> int
        """Return the number of leading lines of *content* matching ``docinfo_re``.

        Counting stops at the first non-matching line.  When *every* line
        matches (e.g. a file that only contains ``:orphan:``) the full length is
        returned; the previous implementation returned the last index in that
        case, which placed the prolog in front of the final docinfo line.
        """
        count = 0
        for line in content.data:
            if not docinfo_re.match(line):
                break
            count += 1
        return count
def read_doc(app, env, filename):
# type: (Sphinx, BuildEnvironment, unicode) -> nodes.document
"""Parse a document and convert to doctree."""
reader = SphinxStandaloneReader(app, parsers=app.registry.get_source_parsers())
source = SphinxFileInput(app, env, source=None, source_path=filename,
encoding=env.config.source_encoding)
input_class = app.registry.get_source_input(filename)
reader = SphinxStandaloneReader()
source = input_class(app, env, source=None, source_path=filename,
encoding=env.config.source_encoding)
parser = app.registry.create_source_parser(app, filename)
pub = Publisher(reader=reader,
parser=parser,
writer=SphinxDummyWriter(),
source_class=SphinxDummySourceClass,
destination=NullOutput())
@ -229,3 +274,8 @@ def read_doc(app, env, filename):
pub.set_source(source, filename)
pub.publish()
return pub.document
def setup(app):
# Register the source-input classes on the application's registry.
# '*' is the key the registry's get_source_input() falls back to when no
# file type of the selected parser has its own entry; 'restructuredtext'
# maps to the input class that applies rst_prolog / rst_epilog.
app.registry.add_source_input('*', SphinxFileInput)
app.registry.add_source_input('restructuredtext', SphinxRSTFileInput)

View File

@ -11,6 +11,8 @@
import docutils.parsers
import docutils.parsers.rst
from docutils.parsers.rst import states
from docutils.statemachine import StringList
from docutils.transforms.universal import SmartQuotes
from sphinx.transforms import SphinxSmartQuotes
@ -18,6 +20,7 @@ from sphinx.transforms import SphinxSmartQuotes
if False:
# For type annotation
from typing import Any, Dict, List, Type # NOQA
from docutils import nodes # NOQA
from docutils.transforms import Transform # NOQA
from sphinx.application import Sphinx # NOQA
@ -56,7 +59,7 @@ class Parser(docutils.parsers.Parser):
class RSTParser(docutils.parsers.rst.Parser):
"""A reST parser customized for Sphinx."""
"""A reST parser for Sphinx."""
def get_transforms(self):
# type: () -> List[Type[Transform]]
@ -66,6 +69,26 @@ class RSTParser(docutils.parsers.rst.Parser):
transforms.append(SphinxSmartQuotes)
return transforms
def parse(self, inputstring, document):
# type: (Any, nodes.document) -> None
"""Parse text and generate a document tree.

This accepts a ``StringList`` as the *inputstring* parameter, which makes it
possible to handle mixed contents (cf. :confval:`rst_prolog`) correctly.
Any other input is delegated unchanged to ``docutils.parsers.rst.Parser.parse``.

:param inputstring: a ``StringList`` or whatever the docutils parser accepts
:param document: the document the state machine is run against
"""
if isinstance(inputstring, StringList):
self.setup_parse(inputstring, document)
# Build the state machine from this parser's own configuration.
self.statemachine = states.RSTStateMachine(
state_classes=self.state_classes,
initial_state=self.initial_state,
debug=document.reporter.debug_flag)
# Give inputstring directly to statemachine.
self.statemachine.run(inputstring, document, inliner=self.inliner)
self.finish_parse()
else:
# otherwise, inputstring might be a string. It will be handled by superclass.
docutils.parsers.rst.Parser.parse(self, inputstring, document)
def setup(app):
# type: (Sphinx) -> Dict[unicode, Any]

View File

@ -13,21 +13,24 @@ from __future__ import print_function
import traceback
from pkg_resources import iter_entry_points
from six import itervalues
from six import iteritems, itervalues, string_types
from sphinx.errors import ExtensionError, SphinxError, VersionRequirementError
from sphinx.extension import Extension
from sphinx.domains import ObjType
from sphinx.domains.std import GenericObject, Target
from sphinx.locale import __
from sphinx.parsers import Parser as SphinxParser
from sphinx.roles import XRefRole
from sphinx.util import logging
from sphinx.util import import_object
from sphinx.util.docutils import directive_helper
if False:
# For type annotation
from typing import Any, Callable, Dict, Iterator, List, Type # NOQA
from docutils import nodes # NOQA
from docutils.io import Input # NOQA
from docutils.parsers import Parser # NOQA
from sphinx.application import Sphinx # NOQA
from sphinx.builders import Builder # NOQA
@ -48,6 +51,7 @@ class SphinxComponentRegistry(object):
self.builders = {} # type: Dict[unicode, Type[Builder]]
self.domains = {} # type: Dict[unicode, Type[Domain]]
self.source_parsers = {} # type: Dict[unicode, Parser]
self.source_inputs = {} # type: Dict[unicode, Input]
self.translators = {} # type: Dict[unicode, nodes.NodeVisitor]
def add_builder(self, builder):
@ -155,15 +159,61 @@ class SphinxComponentRegistry(object):
stddomain.object_types[directivename] = ObjType(objname or directivename, rolename)
def add_source_parser(self, suffix, parser):
# type: (unicode, Parser) -> None
# type: (unicode, Type[Parser]) -> None
if suffix in self.source_parsers:
raise ExtensionError(__('source_parser for %r is already registered') % suffix)
self.source_parsers[suffix] = parser
def get_source_parser(self, filename):
# type: (unicode) -> Type[Parser]
"""Return the parser class registered for *filename*.

The first registered suffix that *filename* ends with is used; if none
matches, the entry registered under ``'*'`` is used when present.

:raises SphinxError: if neither a matching suffix nor ``'*'`` is registered
"""
for suffix, parser_class in iteritems(self.source_parsers):
if filename.endswith(suffix):
# keep the matching parser_class; the for/else branch is skipped
break
else:
# use special parser for unknown file-extension '*' (if exists)
parser_class = self.source_parsers.get('*')
if parser_class is None:
raise SphinxError(__('Source parser for %s not registered') % filename)
else:
# an entry given as a string is resolved through import_object() on use
if isinstance(parser_class, string_types):
parser_class = import_object(parser_class, 'source parser') # type: ignore
return parser_class
def get_source_parsers(self):
# type: () -> Dict[unicode, Parser]
"""Return the suffix-to-parser mapping itself (not a copy)."""
return self.source_parsers
def create_source_parser(self, app, filename):
# type: (Sphinx, unicode) -> Parser
"""Instantiate the parser class selected by ``get_source_parser(filename)``.

The class is called without arguments.  Instances of ``sphinx.parsers.Parser``
additionally receive *app* through ``set_application()``.
"""
parser_class = self.get_source_parser(filename)
parser = parser_class()
# only Sphinx's own Parser base class is given the application object
if isinstance(parser, SphinxParser):
parser.set_application(app)
return parser
def add_source_input(self, filetype, input_class):
# type: (unicode, Type[Input]) -> None
"""Register *input_class* as the source-input class for *filetype*.

:raises ExtensionError: if *filetype* already has a registered input class
"""
if filetype in self.source_inputs:
raise ExtensionError(__('source_input for %r is already registered') % filetype)
self.source_inputs[filetype] = input_class
def get_source_input(self, filename):
# type: (unicode) -> Type[Input]
"""Return the source-input class to use for *filename*.

The parser class for *filename* is looked up first; the first entry of its
``supported`` sequence that has a registered input class is used.  If none
has one, the entry registered under ``'*'`` is used when present.

:raises SphinxError: if no input class can be found (``get_source_parser()``
    may also raise it when no parser is registered)
"""
parser = self.get_source_parser(filename)
for filetype in parser.supported:
if filetype in self.source_inputs:
input_class = self.source_inputs[filetype]
break
else:
# use special source_input for unknown file-type '*' (if exists)
input_class = self.source_inputs.get('*')
if input_class is None:
raise SphinxError(__('source_input for %s not registered') % filename)
else:
return input_class
def add_translator(self, name, translator):
# type: (unicode, Type[nodes.NodeVisitor]) -> None
"""Store *translator* under *name*, replacing any existing entry."""
self.translators[name] = translator

View File

@ -50,15 +50,12 @@ def publish_msgstr(app, source, source_path, source_line, config, settings):
:rtype: docutils.nodes.document
"""
from sphinx.io import SphinxI18nReader
reader = SphinxI18nReader(
app=app,
parsers=app.registry.get_source_parsers(),
parser_name='restructuredtext', # default parser
)
reader = SphinxI18nReader()
reader.set_lineno_for_reporter(source_line)
parser = app.registry.create_source_parser(app, '')
doc = reader.read(
source=StringInput(source=source, source_path=source_path),
parser=reader.parser,
parser=parser,
settings=settings,
)
try:

View File

@ -564,16 +564,6 @@ def encode_uri(uri):
return urlunsplit(split)
def split_docinfo(text):
# type: (unicode) -> Sequence[unicode]
docinfo_re = re.compile('\\A((?:\\s*:\\w+:.*?\n(?:[ \\t]+.*?\n)*)+)', re.M)
result = docinfo_re.split(text, 1) # type: ignore
if len(result) == 1:
return '', result[0]
else:
return result[1:]
def display_chunk(chunk):
# type: (Any) -> unicode
if isinstance(chunk, (list, tuple)):

View File

@ -18,8 +18,9 @@ from contextlib import contextmanager
import docutils
from docutils.languages import get_language
from docutils.utils import Reporter
from docutils.statemachine import ViewList
from docutils.parsers.rst import directives, roles, convert_directive_function
from docutils.utils import Reporter
from sphinx.errors import ExtensionError
from sphinx.locale import __
@ -33,6 +34,7 @@ if False:
from typing import Any, Callable, Iterator, List, Tuple # NOQA
from docutils import nodes # NOQA
from sphinx.environment import BuildEnvironment # NOQA
from sphinx.io import SphinxFileInput # NOQA
__version_info__ = tuple(LooseVersion(docutils.__version__).version)
@ -167,16 +169,34 @@ class WarningStream(object):
class LoggingReporter(Reporter):
# A Reporter that is constructed with a WarningStream as its output stream
# and that can rewrite the source/line of a system message from a stored
# ViewList (see system_message()).
@classmethod
def from_reporter(cls, reporter):
# type: (Reporter) -> LoggingReporter
"""Create an instance of LoggingReporter from other reporter object."""
# copies source, report/halt levels, debug flag and error handler
return cls(reporter.source, reporter.report_level, reporter.halt_level,
reporter.debug_flag, reporter.error_handler)
def __init__(self, source, report_level, halt_level,
debug=False, error_handler='backslashreplace'):
# type: (unicode, int, int, bool, unicode) -> None
stream = WarningStream()
Reporter.__init__(self, source, report_level, halt_level,
stream, debug, error_handler=error_handler)
# NOTE(review): annotated as SphinxFileInput, but system_message() only
# uses this value when it is a ViewList - confirm the intended type.
self.source_and_line = None # type: SphinxFileInput
def set_conditions(self, category, report_level, halt_level, debug=False):
# type: (unicode, int, int, bool) -> None
# Same as Reporter.set_conditions(), but without a stream argument.
Reporter.set_conditions(self, category, report_level, halt_level, debug=debug)
def set_source(self, source):
# type: (SphinxFileInput) -> None
# Remember the object later consulted by system_message().
self.source_and_line = source
def system_message(self, *args, **kwargs):
# type: (Any, Any) -> Any
# Only remaps when a truthy 'line' keyword is given (None and 0 are
# skipped) and a ViewList has been stored through set_source().
if kwargs.get('line') and isinstance(self.source_and_line, ViewList):
# replace source parameter if source is set
source, lineno = self.source_and_line.info(kwargs.get('line'))
kwargs['source'] = source
kwargs['line'] = lineno
return Reporter.system_message(self, *args, **kwargs)
def is_html5_writer_available():

118
tests/test_io.py Normal file
View File

@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
"""
test_sphinx_io
~~~~~~~~~~~~~~
Tests io modules.
:copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import pytest
from six import StringIO
from sphinx.io import SphinxRSTFileInput
@pytest.mark.sphinx(testroot='basic')
def test_SphinxRSTFileInput(app):
app.env.temp_data['docname'] = 'index'
# normal case
text = ('hello Sphinx world\n'
'Sphinx is a document generator')
source = SphinxRSTFileInput(app, app.env, source=StringIO(text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == ['hello Sphinx world',
'Sphinx is a document generator']
assert result.info(0) == ('dummy.rst', 0)
assert result.info(1) == ('dummy.rst', 1)
assert result.info(2) == ('dummy.rst', None) # out of range
# having rst_prolog ends without CR
app.env.config.rst_prolog = 'this is rst_prolog\nhello reST!'
source = SphinxRSTFileInput(app, app.env, source=StringIO(text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == ['this is rst_prolog',
'hello reST!',
'',
'hello Sphinx world',
'Sphinx is a document generator']
assert result.info(0) == ('<rst_prolog>', 0)
assert result.info(1) == ('<rst_prolog>', 1)
assert result.info(2) == ('<generated>', 0)
assert result.info(3) == ('dummy.rst', 0)
assert result.info(4) == ('dummy.rst', 1)
# having rst_prolog ends with CR
app.env.config.rst_prolog = 'this is rst_prolog\nhello reST!\n'
source = SphinxRSTFileInput(app, app.env, source=StringIO(text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == ['this is rst_prolog',
'hello reST!',
'',
'hello Sphinx world',
'Sphinx is a document generator']
# having docinfo and rst_prolog
docinfo_text = (':title: test of SphinxFileInput\n'
':author: Sphinx team\n'
'\n'
'hello Sphinx world\n'
'Sphinx is a document generator\n')
app.env.config.rst_prolog = 'this is rst_prolog\nhello reST!'
source = SphinxRSTFileInput(app, app.env, source=StringIO(docinfo_text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == [':title: test of SphinxFileInput',
':author: Sphinx team',
'',
'this is rst_prolog',
'hello reST!',
'',
'',
'hello Sphinx world',
'Sphinx is a document generator']
assert result.info(0) == ('dummy.rst', 0)
assert result.info(1) == ('dummy.rst', 1)
assert result.info(2) == ('<generated>', 0)
assert result.info(3) == ('<rst_prolog>', 0)
assert result.info(4) == ('<rst_prolog>', 1)
assert result.info(5) == ('<generated>', 0)
assert result.info(6) == ('dummy.rst', 2)
assert result.info(7) == ('dummy.rst', 3)
assert result.info(8) == ('dummy.rst', 4)
assert result.info(9) == ('dummy.rst', None) # out of range
# having rst_epilog
app.env.config.rst_prolog = None
app.env.config.rst_epilog = 'this is rst_epilog\ngood-bye reST!'
source = SphinxRSTFileInput(app, app.env, source=StringIO(text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == ['hello Sphinx world',
'Sphinx is a document generator',
'',
'this is rst_epilog',
'good-bye reST!']
assert result.info(0) == ('dummy.rst', 0)
assert result.info(1) == ('dummy.rst', 1)
assert result.info(2) == ('<generated>', 0)
assert result.info(3) == ('<rst_epilog>', 0)
assert result.info(4) == ('<rst_epilog>', 1)
assert result.info(5) == ('<rst_epilog>', None) # out of range
# expandtabs / convert whitespaces
app.env.config.rst_prolog = None
app.env.config.rst_epilog = None
text = ('\thello Sphinx world\n'
'\v\fSphinx is a document generator')
source = SphinxRSTFileInput(app, app.env, source=StringIO(text),
source_path='dummy.rst', encoding='utf-8')
result = source.read()
assert result.data == [' hello Sphinx world',
' Sphinx is a document generator']

View File

@ -14,8 +14,7 @@ from mock import patch
from sphinx.util import logging
from sphinx.util import (
display_chunk, encode_uri, parselinenos, split_docinfo, status_iterator,
xmlname_checker
display_chunk, encode_uri, parselinenos, status_iterator, xmlname_checker
)
from sphinx.testing.util import strip_escseq
@ -36,28 +35,6 @@ def test_encode_uri():
assert expected, encode_uri(uri)
def test_splitdocinfo():
source = "Hello world.\n"
docinfo, content = split_docinfo(source)
assert docinfo == ''
assert content == 'Hello world.\n'
source = ":orphan:\n\nHello world.\n"
docinfo, content = split_docinfo(source)
assert docinfo == ':orphan:\n'
assert content == '\nHello world.\n'
source = ":author: Georg Brandl\n:title: Manual of Sphinx\n\nHello world.\n"
docinfo, content = split_docinfo(source)
assert docinfo == ':author: Georg Brandl\n:title: Manual of Sphinx\n'
assert content == '\nHello world.\n'
source = ":multiline: one\n\ttwo\n\tthree\n\nHello world.\n"
docinfo, content = split_docinfo(source)
assert docinfo == ":multiline: one\n\ttwo\n\tthree\n"
assert content == '\nHello world.\n'
def test_display_chunk():
assert display_chunk('hello') == 'hello'
assert display_chunk(['hello']) == 'hello'