Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.

This commit is contained in:
Georg Brandl 2011-05-15 13:31:39 +02:00
parent 8965cf1095
commit 40c294f0c8
3 changed files with 59 additions and 14 deletions

View File

@ -1,6 +1,9 @@
Release 1.0.8 (in development)
==============================
* #657: viewcode now works correctly with source files that have
non-ASCII encoding.
* #669: Respect the ``noindex`` flag option in py:module directives.
* #675: Fix IndexErrors when including nonexisting lines with

View File

@ -17,7 +17,7 @@ from cStringIO import StringIO
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
from sphinx.util import get_module_source
from sphinx.util import get_module_source, detect_encoding
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
@ -37,10 +37,6 @@ for k, v in token.tok_name.iteritems():
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
# a regex to recognize coding cookies
_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
_eq = nodes.Leaf(token.EQUAL, '=')
@ -195,11 +191,10 @@ class ModuleAnalyzer(object):
self.srcname = srcname
# file-like object yielding source lines
self.source = source
# will be changed when found by parse()
self.encoding = sys.getdefaultencoding()
# cache the source code as well
pos = self.source.tell()
self.encoding = detect_encoding(self.source.readline)
self.code = self.source.read()
self.source.seek(pos)
@ -229,13 +224,6 @@ class ModuleAnalyzer(object):
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError, err:
raise PycodeError('parsing failed', err)
# find the source code encoding, if present
comments = self.parsetree.get_prefix()
for line in comments.splitlines()[:2]:
match = _coding_re.search(line)
if match is not None:
self.encoding = match.group(1)
break
def find_attr_docs(self, scope=''):
"""Find class and module-level attributes and their documentation."""

View File

@ -18,6 +18,7 @@ import tempfile
import posixpath
import traceback
from os import path
from codecs import BOM_UTF8
import docutils
from docutils.utils import relative_path
@ -211,6 +212,59 @@ def get_module_source(modname):
return 'file', filename
# Recognizes an Emacs/Vim-style "coding" cookie in a source line.
_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')


def detect_encoding(readline):
    """Like tokenize.detect_encoding() from Py3k, but a bit simplified."""

    def next_line():
        # Treat an exhausted line iterator the same as EOF.
        try:
            return readline()
        except StopIteration:
            return None

    def normalize(name):
        """Imitates get_normal_name in tokenizer.c."""
        # Only the first 12 characters matter for recognition.
        lowered = name[:12].lower().replace('_', '-')
        if lowered == 'utf-8' or lowered.startswith('utf-8-'):
            return 'utf-8'
        if lowered in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
           lowered.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
            return 'iso-8859-1'
        return name

    def cookie_in(line):
        # A coding cookie must be pure ASCII; anything else means no cookie.
        try:
            text = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        found = _coding_re.findall(text)
        if found:
            return normalize(found[0])
        return None

    fallback = sys.getdefaultencoding()
    line = next_line()
    if line and line.startswith(BOM_UTF8):
        # Strip the UTF-8 BOM; remember it via the "-sig" fallback encoding.
        line = line[3:]
        fallback = 'utf-8-sig'
    # A cookie may appear on the first or second line only; stop at the
    # first empty/missing line, and never read past what is needed.
    for attempt in range(2):
        if not line:
            break
        found = cookie_in(line)
        if found:
            return found
        line = next_line() if attempt == 0 else None
    return fallback
# Low-level utility functions and classes.
class Tee(object):