Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.

This commit is contained in:
Georg Brandl 2011-05-15 13:31:39 +02:00
parent 8965cf1095
commit 40c294f0c8
3 changed files with 59 additions and 14 deletions

View File

@ -1,6 +1,9 @@
Release 1.0.8 (in development)
==============================
* #657: viewcode now works correctly with source files that have
non-ASCII encoding.
* #669: Respect the ``noindex`` flag option in py:module directives.
* #675: Fix IndexErrors when including nonexisting lines with

View File

@ -17,7 +17,7 @@ from cStringIO import StringIO
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
from sphinx.util import get_module_source
from sphinx.util import get_module_source, detect_encoding
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
@ -37,10 +37,6 @@ for k, v in token.tok_name.iteritems():
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
# a regex to recognize coding cookies
_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
_eq = nodes.Leaf(token.EQUAL, '=')
@ -195,11 +191,10 @@ class ModuleAnalyzer(object):
self.srcname = srcname
# file-like object yielding source lines
self.source = source
# will be changed when found by parse()
self.encoding = sys.getdefaultencoding()
# cache the source code as well
pos = self.source.tell()
self.encoding = detect_encoding(self.source.readline)
self.code = self.source.read()
self.source.seek(pos)
@ -229,13 +224,6 @@ class ModuleAnalyzer(object):
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError, err:
raise PycodeError('parsing failed', err)
# find the source code encoding, if present
comments = self.parsetree.get_prefix()
for line in comments.splitlines()[:2]:
match = _coding_re.search(line)
if match is not None:
self.encoding = match.group(1)
break
def find_attr_docs(self, scope=''):
"""Find class and module-level attributes and their documentation."""

View File

@ -18,6 +18,7 @@ import tempfile
import posixpath
import traceback
from os import path
from codecs import BOM_UTF8
import docutils
from docutils.utils import relative_path
@ -211,6 +212,59 @@ def get_module_source(modname):
return 'file', filename
# Recognizes an Emacs/Vim-style "coding" cookie in a source line.
_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')


def detect_encoding(readline):
    """Like tokenize.detect_encoding() from Py3k, but a bit simplified."""

    def next_line():
        # Treat an exhausted line iterator the same as EOF.
        try:
            return readline()
        except StopIteration:
            return None

    def normalize(name):
        """Imitates get_normal_name in tokenizer.c."""
        # Only the first 12 characters matter for recognition.
        lowered = name[:12].lower().replace('_', '-')
        if lowered == 'utf-8' or lowered.startswith('utf-8-'):
            return 'utf-8'
        if lowered in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
           lowered.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
            return 'iso-8859-1'
        return name

    def cookie_in(line):
        # A coding cookie must be pure ASCII; anything else means no cookie.
        try:
            text = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        found = _coding_re.findall(text)
        if found:
            return normalize(found[0])
        return None

    fallback = sys.getdefaultencoding()
    line = next_line()
    if line and line.startswith(BOM_UTF8):
        # Strip the UTF-8 BOM; remember it via the "-sig" fallback encoding.
        line = line[3:]
        fallback = 'utf-8-sig'
    # A cookie may appear on the first or second line only; stop at the
    # first empty/missing line, and never read past what is needed.
    for attempt in range(2):
        if not line:
            break
        found = cookie_in(line)
        if found:
            return found
        line = next_line() if attempt == 0 else None
    return fallback
# Low-level utility functions and classes.
class Tee(object):