Fix #705: read module source in ModuleAnalyzer in binary mode, decode afterwards.

2025-02-25 18:55:22 -06:00 · 2011-09-19 09:03:07 +02:00 · 2011-09-19 09:03:07 +02:00 · 7fa67682ac
commit 7fa67682ac
parent 28609cc9b9
2 changed files with 21 additions and 8 deletions
--- a/sphinx/pycode/__init__.py
+++ b/sphinx/pycode/__init__.py
@@ -10,13 +10,12 @@
 """

 from os import path
-from cStringIO import StringIO

 from sphinx.errors import PycodeError
 from sphinx.pycode import nodes
 from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
 from sphinx.util import get_module_source, detect_encoding
-from sphinx.util.pycompat import next
+from sphinx.util.pycompat import next, StringIO, BytesIO, TextIOWrapper
 from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc


@@ -170,14 +169,16 @@ class ModuleAnalyzer(object):

    @classmethod
    def for_string(cls, string, modname, srcname='<string>'):
-        return cls(StringIO(string), modname, srcname)
+        if isinstance(string, bytes):
+            return cls(BytesIO(string), modname, srcname)
+        return cls(StringIO(string), modname, srcname, decoded=True)

    @classmethod
    def for_file(cls, filename, modname):
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
        try:
-            fileobj = open(filename, 'r')
+            fileobj = open(filename, 'rb')
        except Exception, err:
            raise PycodeError('error opening %r' % filename, err)
        obj = cls(fileobj, modname, filename)
@@ -204,7 +205,7 @@ class ModuleAnalyzer(object):
        cls.cache['module', modname] = obj
        return obj

-    def __init__(self, source, modname, srcname):
+    def __init__(self, source, modname, srcname, decoded=False):
        # name of the module
        self.modname = modname
        # name of the source file
@@ -214,9 +215,15 @@ class ModuleAnalyzer(object):

        # cache the source code as well
        pos = self.source.tell()
-        self.encoding = detect_encoding(self.source.readline)
-        self.code = self.source.read()
-        self.source.seek(pos)
+        if not decoded:
+            self.encoding = detect_encoding(self.source.readline)
+            self.code = self.source.read().decode(self.encoding)
+            self.source.seek(pos)
+            self.source = TextIOWrapper(self.source, self.encoding)
+        else:
+            self.encoding = None
+            self.code = self.source.read()
+            self.source.seek(pos)

        # will be filled by tokenize()
        self.tokens = None
--- a/sphinx/util/pycompat.py
+++ b/sphinx/util/pycompat.py
@@ -25,6 +25,8 @@ if sys.version_info >= (3, 0):
    bytes = bytes
    # prefix for Unicode strings
    u = ''
+    # StringIO/BytesIO classes
+    from io import StringIO, BytesIO, TextIOWrapper
    # support for running 2to3 over config files
    def convert_with_2to3(filepath):
        from lib2to3.refactor import RefactoringTool, get_fixers_from_package
@@ -48,8 +50,12 @@ else:
    b = str
    bytes = str
    u = 'u'
+    from StringIO import StringIO
+    BytesIO = StringIO
    # no need to refactor on 2.x versions
    convert_with_2to3 = None
+    def TextIOWrapper(stream, encoding):
+        return codecs.lookup(encoding or 'ascii')[2](stream)


 # ------------------------------------------------------------------------------