sphinx/sphinx/pycode/__init__.py
(commit cc7509966c "merge heads" by Takayuki Shimizukawa, 2014-01-15)

# -*- coding: utf-8 -*-
"""
sphinx.pycode
~~~~~~~~~~~~~
Utilities parsing and analyzing Python code.
:copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import sys
from os import path
from sphinx import package_dir
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
from sphinx.util import get_module_source, detect_encoding
from sphinx.util.pycompat import StringIO, BytesIO, TextIOWrapper
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
# load the Python grammar
_grammarfile = path.join(package_dir, 'pycode',
'Grammar-py%d.txt' % sys.version_info[0])
pygrammar = driver.load_grammar(_grammarfile)
pydriver = driver.Driver(pygrammar, convert=nodes.convert)
# an object with attributes corresponding to token and symbol names
class sym: pass
for k, v in pygrammar.symbol2number.iteritems():
setattr(sym, k, v)
for k, v in token.tok_name.iteritems():
setattr(sym, v, k)
# a dict mapping terminal and nonterminal numbers to their names
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
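# For example (illustrative only -- the actual numbers depend on the loaded
# grammar): ``sym.expr_stmt`` is the grammar number of the "expr_stmt"
# nonterminal, and ``number2name[token.NAME]`` maps back to the readable
# name 'NAME', which is handy when inspecting parse trees by hand.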
_eq = nodes.Leaf(token.EQUAL, '=')
class AttrDocVisitor(nodes.NodeVisitor):
"""
Visitor that collects docstrings for attribute assignments on toplevel and
in classes (class attributes and attributes set in __init__).
The docstrings can either be in special '#:' comments before the assignment
or in a docstring after it.
"""
def init(self, scope, encoding):
self.scope = scope
self.in_init = 0
self.encoding = encoding
self.namespace = []
self.collected = {}
self.tagnumber = 0
self.tagorder = {}
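        # tagorder maps a dotted name to its position in the source; consumers
        # (e.g. autodoc's "bysource" member ordering) can use it for sorting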
def add_tag(self, name):
name = '.'.join(self.namespace + [name])
self.tagorder[name] = self.tagnumber
self.tagnumber += 1
def visit_classdef(self, node):
"""Visit a class."""
self.add_tag(node[1].value)
self.namespace.append(node[1].value)
self.generic_visit(node)
self.namespace.pop()
def visit_funcdef(self, node):
"""Visit a function (or method)."""
# usually, don't descend into functions -- nothing interesting there
self.add_tag(node[1].value)
if node[1].value == '__init__':
# however, collect attributes set in __init__ methods
self.in_init += 1
self.generic_visit(node)
self.in_init -= 1
def visit_expr_stmt(self, node):
"""Visit an assignment which may have a special comment before (or
after) it.
"""
if _eq not in node.children:
            # not an assignment (we don't care about augmented assignments)
return
# look *after* the node; there may be a comment prefixing the NEWLINE
# of the simple_stmt
parent = node.parent
idx = parent.children.index(node) + 1
while idx < len(parent):
if parent[idx].type == sym.SEMI:
idx += 1
continue # skip over semicolon
if parent[idx].type == sym.NEWLINE:
prefix = parent[idx].get_prefix()
if not isinstance(prefix, unicode):
prefix = prefix.decode(self.encoding)
docstring = prepare_commentdoc(prefix)
if docstring:
self.add_docstring(node, docstring)
return # don't allow docstrings both before and after
break
# now look *before* the node
pnode = node[0]
prefix = pnode.get_prefix()
        # if the assignment is the first statement on a new indentation
        # level, its preceding whitespace and comments are not assigned
        # to that token, but to the first INDENT or DEDENT token
while not prefix:
pnode = pnode.get_prev_leaf()
if not pnode or pnode.type not in (token.INDENT, token.DEDENT):
break
prefix = pnode.get_prefix()
if not isinstance(prefix, unicode):
prefix = prefix.decode(self.encoding)
docstring = prepare_commentdoc(prefix)
self.add_docstring(node, docstring)
def visit_simple_stmt(self, node):
"""Visit a docstring statement which may have an assignment before."""
if node[0].type != token.STRING:
# not a docstring; but still need to visit children
return self.generic_visit(node)
prev = node.get_prev_sibling()
if not prev:
return
if prev.type == sym.simple_stmt and \
prev[0].type == sym.expr_stmt and _eq in prev[0].children:
# need to "eval" the string because it's returned in its
# original form
docstring = literals.evalString(node[0].value, self.encoding)
docstring = prepare_docstring(docstring)
self.add_docstring(prev[0], docstring)
def add_docstring(self, node, docstring):
# add an item for each assignment target
for i in range(0, len(node) - 1, 2):
target = node[i]
if self.in_init and self.number2name[target.type] == 'power':
# maybe an attribute assignment -- check necessary conditions
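                # (for ``self.attr = value`` the target is a ``power`` node of
                # the shape ``NAME('self') trailer(DOT NAME('attr'))``, which
                # is exactly what the checks below enforce)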
if (# node must have two children
len(target) != 2 or
# first child must be "self"
target[0].type != token.NAME or target[0].value != 'self' or
# second child must be a "trailer" with two children
self.number2name[target[1].type] != 'trailer' or
len(target[1]) != 2 or
# first child must be a dot, second child a name
target[1][0].type != token.DOT or
target[1][1].type != token.NAME):
continue
name = target[1][1].value
elif target.type != token.NAME:
# don't care about other complex targets
continue
else:
name = target.value
self.add_tag(name)
if docstring:
namespace = '.'.join(self.namespace)
if namespace.startswith(self.scope):
self.collected[namespace, name] = docstring
class ModuleAnalyzer(object):
# cache for analyzer objects -- caches both by module and file name
cache = {}
@classmethod
def for_string(cls, string, modname, srcname='<string>'):
if isinstance(string, bytes):
return cls(BytesIO(string), modname, srcname)
return cls(StringIO(string), modname, srcname, decoded=True)
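    # note: for_string() results are not cached; only for_file() and
    # for_module() populate the class-level ``cache``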
@classmethod
def for_file(cls, filename, modname):
if ('file', filename) in cls.cache:
return cls.cache['file', filename]
try:
fileobj = open(filename, 'rb')
except Exception, err:
raise PycodeError('error opening %r' % filename, err)
obj = cls(fileobj, modname, filename)
cls.cache['file', filename] = obj
return obj
@classmethod
def for_module(cls, modname):
if ('module', modname) in cls.cache:
entry = cls.cache['module', modname]
if isinstance(entry, PycodeError):
raise entry
return entry
try:
type, source = get_module_source(modname)
if type == 'string':
obj = cls.for_string(source, modname)
else:
obj = cls.for_file(source, modname)
except PycodeError, err:
cls.cache['module', modname] = err
raise
cls.cache['module', modname] = obj
return obj
def __init__(self, source, modname, srcname, decoded=False):
# name of the module
self.modname = modname
# name of the source file
self.srcname = srcname
# file-like object yielding source lines
self.source = source
# cache the source code as well
pos = self.source.tell()
if not decoded:
self.encoding = detect_encoding(self.source.readline)
self.source.seek(pos)
self.code = self.source.read().decode(self.encoding)
self.source.seek(pos)
self.source = TextIOWrapper(self.source, self.encoding)
else:
self.encoding = None
self.code = self.source.read()
self.source.seek(pos)
# will be filled by tokenize()
self.tokens = None
# will be filled by parse()
self.parsetree = None
# will be filled by find_attr_docs()
self.attr_docs = None
self.tagorder = None
# will be filled by find_tags()
self.tags = None
def tokenize(self):
"""Generate tokens from the source."""
if self.tokens is not None:
return
try:
self.tokens = list(tokenize.generate_tokens(self.source.readline))
except tokenize.TokenError, err:
raise PycodeError('tokenizing failed', err)
self.source.close()
def parse(self):
"""Parse the generated source tokens."""
if self.parsetree is not None:
return
self.tokenize()
try:
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError, err:
raise PycodeError('parsing failed', err)
def find_attr_docs(self, scope=''):
"""Find class and module-level attributes and their documentation."""
if self.attr_docs is not None:
return self.attr_docs
self.parse()
attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
attr_visitor.visit(self.parsetree)
self.attr_docs = attr_visitor.collected
self.tagorder = attr_visitor.tagorder
# now that we found everything we could in the tree, throw it away
# (it takes quite a bit of memory for large modules)
self.parsetree = None
return attr_visitor.collected
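    # find_attr_docs() returns {(dotted namespace in module, attribute name):
    # list of docstring lines}, e.g. {('SomeClass', 'attr'): [u'doc', u'']}
    # (names here are made up for illustration)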
def find_tags(self):
"""Find class, function and method definitions and their location."""
if self.tags is not None:
return self.tags
self.tokenize()
result = {}
namespace = []
stack = []
indent = 0
defline = False
expect_indent = False
        def tokeniter(ignore=(token.COMMENT, token.NL)):
for tokentup in self.tokens:
if tokentup[0] not in ignore:
yield tokentup
tokeniter = tokeniter()
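        # each token is a standard ``tokenize`` 5-tuple:
        # (type, string, (startrow, startcol), (endrow, endcol), physical line)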
for type, tok, spos, epos, line in tokeniter:
if expect_indent:
if type != token.INDENT:
# no suite -- one-line definition
assert stack
dtype, fullname, startline, _ = stack.pop()
endline = epos[0]
namespace.pop()
result[fullname] = (dtype, startline, endline)
expect_indent = False
if tok in ('def', 'class'):
name = next(tokeniter)[1]
namespace.append(name)
fullname = '.'.join(namespace)
stack.append((tok, fullname, spos[0], indent))
defline = True
elif type == token.INDENT:
expect_indent = False
indent += 1
elif type == token.DEDENT:
indent -= 1
# if the stacklevel is the same as it was before the last
# def/class block, this dedent closes that block
if stack and indent == stack[-1][3]:
dtype, fullname, startline, _ = stack.pop()
endline = spos[0]
namespace.pop()
result[fullname] = (dtype, startline, endline)
elif type == token.NEWLINE:
# if this line contained a definition, expect an INDENT
# to start the suite; if there is no such INDENT
# it's a one-line definition
if defline:
defline = False
expect_indent = True
self.tags = result
return result
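# Typical use of ModuleAnalyzer (a sketch; ``some.module`` is a placeholder):
#
#   analyzer = ModuleAnalyzer.for_module('some.module')
#   attr_docs = analyzer.find_attr_docs()  # {(namespace, attr): doc lines}
#   tags = analyzer.find_tags()            # {fullname: (dtype, start, end)}
#
# Analyzer instances are cached per file/module on the class, and the computed
# tables are memoized on the instance, so repeated lookups are cheap.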
if __name__ == '__main__':
import time, pprint
x0 = time.time()
#ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), 'sphinx.builders.html')
ma = ModuleAnalyzer.for_file('sphinx/environment.py',
'sphinx.environment')
ma.tokenize()
x1 = time.time()
ma.parse()
x2 = time.time()
#for (ns, name), doc in ma.find_attr_docs().iteritems():
# print '>>', ns, name
# print '\n'.join(doc)
pprint.pprint(ma.find_tags())
x3 = time.time()
#print nodes.nice_repr(ma.parsetree, number2name)
print "tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2)