pycode: Replace pgen2 by own parser

Takeshi KOMIYA 2017-07-10 01:20:19 +09:00
parent 6900d19b71
commit 8683823536
18 changed files with 74 additions and 7064 deletions

View File

@@ -19,13 +19,11 @@ include sphinx/locale/.tx/config
recursive-include sphinx/templates *
recursive-include sphinx/texinputs *
recursive-include sphinx/themes *
recursive-include sphinx/pycode/pgen2 *.c *.pyx
recursive-include sphinx/locale *.js *.pot *.po *.mo
recursive-include sphinx/search/non-minified-js *.js
recursive-include sphinx/ext/autosummary/templates *
recursive-include tests *
recursive-include utils *
include sphinx/pycode/Grammar-py*
recursive-include doc *
prune doc/_build

View File

@@ -256,7 +256,7 @@ class LiteralIncludeReader(object):
else:
start = tags[pyobject][1]
end = tags[pyobject][2]
lines = lines[start - 1:end - 1]
lines = lines[start - 1:end]
if 'lineno-match' in self.options:
self.lineno_start = start
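A worked example of the slicing fix above (a minimal sketch; it assumes, as the new slice implies, that tags[pyobject] is a (type, startline, endline) tuple with 1-based line numbers and an inclusive end line):

# hypothetical tag for a three-line class definition starting at source line 3
start, end = 3, 5
lines = ['import os\n', '\n', 'class MyClass(object):\n', '    def meth(self):\n', '        pass\n']
assert lines[start - 1:end] == lines[2:5]        # new slice: all three lines of the class
assert lines[start - 1:end - 1] == lines[2:4]    # old slice silently dropped the last line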

View File

@@ -1,135 +0,0 @@
# Grammar for Python 2.x
# IMPORTANT: when copying over a new Grammar file, make sure file_input
# is the first nonterminal in the file!
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() and input() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
file_input: (NEWLINE | stmt)* ENDMARKER
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: 'def' NAME parameters ':' suite
parameters: '(' [varargslist] ')'
varargslist: ((fpdef ['=' test] ',')*
('*' NAME [',' '**' NAME] | '**' NAME) |
fpdef ['=' test] (',' fpdef ['=' test])* [','])
fpdef: NAME | '(' fplist ')'
fplist: fpdef (',' fpdef)* [',']
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | exec_stmt | assert_stmt)
expr_stmt: testlist (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist))*)
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
print_stmt: 'print' ( [ test (',' test)* [','] ] |
'>>' test [ (',' test)+ [','] ] )
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test [',' test [',' test]]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
import_from: ('from' ('.'* dotted_name | '.'+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
exec_stmt: 'exec' expr ['in' test [',' test]]
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test [('as' | ',') test]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
# Backward compatibility cruft to support:
# [ x for x in lambda: True, lambda: False if x() ]
# even while also allowing:
# lambda x: 5 if x else 2
# (But not a mix of the two)
testlist_safe: old_test [(',' old_test)+ [',']]
old_test: or_test | old_lambdef
old_lambdef: 'lambda' [varargslist] ':' old_test
test: or_test ['if' or_test 'else' test] | lambdef
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [listmaker] ']' |
'{' [dictorsetmaker] '}' |
'`' testlist1 '`' |
NAME | NUMBER | STRING+)
listmaker: test ( list_for | (',' test)* [','] )
testlist_comp: test ( comp_for | (',' test)* [','] )
lambdef: 'lambda' [varargslist] ':' test
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: expr (',' expr)* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) |
(test (comp_for | (',' test)* [','])) )
classdef: 'class' NAME ['(' [testlist] ')'] ':' suite
arglist: (argument ',')* (argument [',']
|'*' test (',' argument)* [',' '**' test]
|'**' test)
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [comp_for] | test '=' test
list_iter: list_for | list_if
list_for: 'for' exprlist 'in' testlist_safe [list_iter]
list_if: 'if' old_test [list_iter]
comp_iter: comp_for | comp_if
comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' old_test [comp_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [testlist]
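As the header comments note, file_input must come first so that it becomes the grammar's default start symbol; the other start symbols can still be selected explicitly. A minimal sketch using the pgen2 modules removed later in this commit:

from sphinx.pycode.pgen2 import driver, parse

g = driver.load_grammar('Grammar-py2.txt')       # grammar file shipped next to the package
p = parse.Parser(g)
p.setup()                                        # defaults to g.start, i.e. file_input
p.setup(g.symbol2number['eval_input'])           # or start from a single expression instead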

View File

@@ -1,143 +0,0 @@
# Grammar for Python 3.x (with at least x <= 5)
# IMPORTANT: when copying over a new Grammar file, make sure file_input
# is the first nonterminal in the file!
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
file_input: (NEWLINE | stmt)* ENDMARKER
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: ASYNC funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [','
['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [','
['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef)
vfpdef: NAME
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: ASYNC (funcdef | with_stmt | for_stmt)
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: [AWAIT] atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | STRING+ | '...' | 'None' | 'True' | 'False')
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist

View File

@@ -11,175 +11,20 @@
from __future__ import print_function
import re
import sys
from os import path
from six import iteritems, text_type, BytesIO, StringIO
from six import iteritems, BytesIO, StringIO
from sphinx import package_dir
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
from sphinx.pycode.parser import Parser
from sphinx.util import get_module_source, detect_encoding
from sphinx.util.pycompat import TextIOWrapper
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
if False:
# For type annotation
from typing import Any, Dict, List, Tuple # NOQA
# load the Python grammar
_grammarfile = path.join(package_dir, 'pycode',
'Grammar-py%d.txt' % sys.version_info[0])
pygrammar = driver.load_grammar(_grammarfile)
pydriver = driver.Driver(pygrammar, convert=nodes.convert)
# an object with attributes corresponding to token and symbol names
class sym(object):
pass
for k, v in iteritems(pygrammar.symbol2number):
setattr(sym, k, v)
for k, v in iteritems(token.tok_name):
setattr(sym, v, k)
# a dict mapping terminal and nonterminal numbers to their names
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
_eq = nodes.Leaf(token.EQUAL, '=')
from typing import Any, Dict, IO, List, Tuple # NOQA
emptyline_re = re.compile(r'^\s*(#.*)?$')
class AttrDocVisitor(nodes.NodeVisitor):
"""
Visitor that collects docstrings for attribute assignments on toplevel and
in classes (class attributes and attributes set in __init__).
The docstrings can either be in special '#:' comments before the assignment
or in a docstring after it.
"""
def init(self, scope, encoding):
self.scope = scope
self.in_init = 0
self.encoding = encoding
self.namespace = [] # type: List[unicode]
self.collected = {} # type: Dict[Tuple[unicode, unicode], unicode]
self.tagnumber = 0
self.tagorder = {} # type: Dict[unicode, int]
def add_tag(self, name):
name = '.'.join(self.namespace + [name])
self.tagorder[name] = self.tagnumber
self.tagnumber += 1
def visit_classdef(self, node):
"""Visit a class."""
self.add_tag(node[1].value)
self.namespace.append(node[1].value)
self.generic_visit(node)
self.namespace.pop()
def visit_funcdef(self, node):
"""Visit a function (or method)."""
# usually, don't descend into functions -- nothing interesting there
self.add_tag(node[1].value)
if node[1].value == '__init__':
# however, collect attributes set in __init__ methods
self.in_init += 1
self.generic_visit(node)
self.in_init -= 1
def visit_expr_stmt(self, node):
"""Visit an assignment which may have a special comment before (or
after) it.
"""
if _eq not in node.children:
# not an assignment (we don't care for augmented assignments)
return
# look *after* the node; there may be a comment prefixing the NEWLINE
# of the simple_stmt
parent = node.parent
idx = parent.children.index(node) + 1
while idx < len(parent):
if parent[idx].type == sym.SEMI: # type: ignore
idx += 1
continue # skip over semicolon
if parent[idx].type == sym.NEWLINE: # type: ignore
prefix = parent[idx].get_prefix()
if not isinstance(prefix, text_type):
prefix = prefix.decode(self.encoding)
docstring = prepare_commentdoc(prefix)
if docstring:
self.add_docstring(node, docstring)
return # don't allow docstrings both before and after
break
# now look *before* the node
pnode = node[0]
prefix = pnode.get_prefix()
# if the assignment is the first statement on a new indentation
# level, its preceding whitespace and comments are not assigned
# to that token, but the first INDENT or DEDENT token
while not prefix:
pnode = pnode.get_prev_leaf()
if not pnode or pnode.type not in (token.INDENT, token.DEDENT):
break
prefix = pnode.get_prefix()
if not isinstance(prefix, text_type):
prefix = prefix.decode(self.encoding)
docstring = prepare_commentdoc(prefix)
self.add_docstring(node, docstring)
def visit_simple_stmt(self, node):
"""Visit a docstring statement which may have an assignment before."""
if node[0].type != token.STRING:
# not a docstring; but still need to visit children
return self.generic_visit(node)
prev = node.get_prev_sibling()
if not prev:
return
if (prev.type == sym.simple_stmt and # type: ignore
prev[0].type == sym.expr_stmt and _eq in prev[0].children): # type: ignore
# need to "eval" the string because it's returned in its
# original form
docstring = literals.evalString(node[0].value, self.encoding)
docstring = prepare_docstring(docstring)
self.add_docstring(prev[0], docstring)
def add_docstring(self, node, docstring):
# add an item for each assignment target
for i in range(0, len(node) - 1, 2):
target = node[i]
if self.in_init and self.number2name[target.type] == 'power':
# maybe an attribute assignment -- check necessary conditions
if ( # node must have two children
len(target) != 2 or
# first child must be "self"
target[0].type != token.NAME or target[0].value != 'self' or
# second child must be a "trailer" with two children
self.number2name[target[1].type] != 'trailer' or
len(target[1]) != 2 or
# first child must be a dot, second child a name
target[1][0].type != token.DOT or
target[1][1].type != token.NAME):
continue
name = target[1][1].value
elif target.type != token.NAME:
# don't care about other complex targets
continue
else:
name = target.value
self.add_tag(name)
if docstring:
namespace = '.'.join(self.namespace)
if namespace.startswith(self.scope):
self.collected[namespace, name] = docstring
class ModuleAnalyzer(object):
# cache for analyzer objects -- caches both by module and file name
cache = {} # type: Dict[Tuple[unicode, unicode], Any]
@@ -223,137 +68,59 @@ class ModuleAnalyzer(object):
return obj
def __init__(self, source, modname, srcname, decoded=False):
# name of the module
self.modname = modname
# name of the source file
self.srcname = srcname
# file-like object yielding source lines
self.source = source
# type: (IO, unicode, unicode, bool) -> None
self.modname = modname # name of the module
self.srcname = srcname # name of the source file
# cache the source code as well
pos = self.source.tell()
pos = source.tell()
if not decoded:
self.encoding = detect_encoding(self.source.readline)
self.source.seek(pos)
self.code = self.source.read().decode(self.encoding)
self.source.seek(pos)
self.source = TextIOWrapper(self.source, self.encoding)
self.encoding = detect_encoding(source.readline)
source.seek(pos)
self.code = source.read().decode(self.encoding)
else:
self.encoding = None
self.code = self.source.read()
self.source.seek(pos)
self.code = source.read()
# will be filled by tokenize()
self.tokens = None # type: List[unicode]
# will be filled by parse()
self.parsetree = None # type: Any
# will be filled by find_attr_docs()
self.attr_docs = None # type: List[unicode]
self.attr_docs = None # type: Dict[Tuple[unicode, unicode], List[unicode]]
self.tagorder = None # type: Dict[unicode, int]
# will be filled by find_tags()
self.tags = None # type: List[unicode]
def tokenize(self):
"""Generate tokens from the source."""
if self.tokens is not None:
return
try:
self.tokens = list(tokenize.generate_tokens(self.source.readline))
except tokenize.TokenError as err:
raise PycodeError('tokenizing failed', err)
self.source.close()
self.tags = None # type: Dict[unicode, Tuple[unicode, int, int]]
def parse(self):
"""Parse the generated source tokens."""
if self.parsetree is not None:
return
self.tokenize()
# type: () -> None
"""Parse the source code."""
try:
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError as err:
raise PycodeError('parsing failed', err)
parser = Parser(self.code, self.encoding)
parser.parse()
def find_attr_docs(self, scope=''):
self.attr_docs = {}
for (scope, comment) in iteritems(parser.comments):
if comment:
self.attr_docs[scope] = comment.splitlines() + ['']
else:
self.attr_docs[scope] = ['']
self.tags = parser.definitions
self.tagorder = parser.deforders
except Exception as exc:
raise PycodeError('parsing failed: %r' % exc)
def find_attr_docs(self):
# type: () -> Dict[Tuple[unicode, unicode], List[unicode]]
"""Find class and module-level attributes and their documentation."""
if self.attr_docs is not None:
return self.attr_docs
self.parse()
attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
attr_visitor.visit(self.parsetree)
self.attr_docs = attr_visitor.collected
self.tagorder = attr_visitor.tagorder
# now that we found everything we could in the tree, throw it away
# (it takes quite a bit of memory for large modules)
self.parsetree = None
return attr_visitor.collected
if self.attr_docs is None:
self.parse()
return self.attr_docs
def find_tags(self):
# type: () -> Dict[unicode, Tuple[unicode, int, int]]
"""Find class, function and method definitions and their location."""
if self.tags is not None:
return self.tags
self.tokenize()
result = {}
namespace = [] # type: List[unicode]
stack = [] # type: List[Tuple[unicode, unicode, unicode, int]]
indent = 0
decopos = None
defline = False
expect_indent = False
emptylines = 0
if self.tags is None:
self.parse()
def tokeniter(ignore = (token.COMMENT,)):
for tokentup in self.tokens:
if tokentup[0] not in ignore:
yield tokentup
tokeniter = tokeniter()
for type, tok, spos, epos, line in tokeniter: # type: ignore
if expect_indent and type != token.NL:
if type != token.INDENT:
# no suite -- one-line definition
assert stack
dtype, fullname, startline, _ = stack.pop()
endline = epos[0]
namespace.pop()
result[fullname] = (dtype, startline, endline - emptylines)
expect_indent = False
if tok in ('def', 'class'):
name = next(tokeniter)[1] # type: ignore
namespace.append(name)
fullname = '.'.join(namespace)
stack.append((tok, fullname, decopos or spos[0], indent))
defline = True
decopos = None
elif type == token.OP and tok == '@':
if decopos is None:
decopos = spos[0]
elif type == token.INDENT:
expect_indent = False
indent += 1
elif type == token.DEDENT:
indent -= 1
# if the stacklevel is the same as it was before the last
# def/class block, this dedent closes that block
if stack and indent == stack[-1][3]:
dtype, fullname, startline, _ = stack.pop()
endline = spos[0]
namespace.pop()
result[fullname] = (dtype, startline, endline - emptylines)
elif type == token.NEWLINE:
# if this line contained a definition, expect an INDENT
# to start the suite; if there is no such INDENT
# it's a one-line definition
if defline:
defline = False
expect_indent = True
emptylines = 0
elif type == token.NL:
# count up if line is empty or comment only
if emptyline_re.match(line):
emptylines += 1
else:
emptylines = 0
self.tags = result
return result
return self.tags
if __name__ == '__main__':
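For orientation, a minimal usage sketch of the parser-backed ModuleAnalyzer described above (for_module is assumed from the analyzer's caching constructors, which lie outside this hunk; the dict shapes follow the type comments in the diff):

from sphinx.pycode import ModuleAnalyzer

analyzer = ModuleAnalyzer.for_module('sphinx.config')   # any importable module
attr_docs = analyzer.find_attr_docs()   # {(namespace, attrname): [doc lines]}
tags = analyzer.find_tags()             # e.g. {'Config.read': ('def', startline, endline)}, key is illustrative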

View File

@@ -1,212 +0,0 @@
# -*- coding: utf-8 -*-
"""
sphinx.pycode.nodes
~~~~~~~~~~~~~~~~~~~
Parse tree node implementations.
:copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
if False:
# For type annotation
from typing import Callable # NOQA
class BaseNode(object):
"""
Node superclass for both terminal and nonterminal nodes.
"""
parent = None # type: BaseNode
def _eq(self, other):
raise NotImplementedError
def __eq__(self, other):
if self.__class__ is not other.__class__:
return NotImplemented
return self._eq(other)
def __ne__(self, other):
if self.__class__ is not other.__class__:
return NotImplemented
return not self._eq(other)
__hash__ = None # type: Callable[[object], int]
def get_prev_sibling(self):
"""Return previous child in parent's children, or None."""
if self.parent is None:
return None
for i, child in enumerate(self.parent.children):
if child is self:
if i == 0:
return None
return self.parent.children[i - 1]
def get_next_sibling(self):
"""Return next child in parent's children, or None."""
if self.parent is None:
return None
for i, child in enumerate(self.parent.children):
if child is self:
try:
return self.parent.children[i + 1]
except IndexError:
return None
def get_prev_leaf(self):
"""Return the leaf node that precedes this node in the parse tree."""
def last_child(node):
if isinstance(node, Leaf):
return node
elif not node.children:
return None
else:
return last_child(node.children[-1])
if self.parent is None:
return None
prev = self.get_prev_sibling()
if isinstance(prev, Leaf):
return prev
elif prev is not None:
return last_child(prev)
return self.parent.get_prev_leaf()
def get_next_leaf(self):
"""Return self if leaf, otherwise the leaf node that succeeds this
node in the parse tree.
"""
node = self
while not isinstance(node, Leaf):
assert node.children
node = node.children[0]
return node
def get_lineno(self):
"""Return the line number which generated the invocant node."""
return self.get_next_leaf().lineno
def get_prefix(self):
"""Return the prefix of the next leaf node."""
# only leaves carry a prefix
return self.get_next_leaf().prefix
class Node(BaseNode):
"""
Node implementation for nonterminals.
"""
def __init__(self, type, children, context=None):
# type of nonterminals is >= 256
# assert type >= 256, type
self.type = type
self.children = list(children)
for ch in self.children:
# assert ch.parent is None, repr(ch)
ch.parent = self
def __repr__(self):
return '%s(%s, %r)' % (self.__class__.__name__,
self.type, self.children)
def __str__(self):
"""This reproduces the input source exactly."""
return ''.join(map(str, self.children))
def _eq(self, other):
return (self.type, self.children) == (other.type, other.children)
# support indexing the node directly instead of .children
def __getitem__(self, index):
return self.children[index]
def __iter__(self):
return iter(self.children)
def __len__(self):
return len(self.children)
class Leaf(BaseNode):
"""
Node implementation for leaf nodes (terminals).
"""
prefix = '' # Whitespace and comments preceding this token in the input
lineno = 0 # Line where this token starts in the input
column = 0 # Column where this token starts in the input
def __init__(self, type, value, context=None):
# type of terminals is below 256
# assert 0 <= type < 256, type
self.type = type
self.value = value
if context is not None:
self.prefix, (self.lineno, self.column) = context
def __repr__(self):
return '%s(%r, %r, %r)' % (self.__class__.__name__,
self.type, self.value, self.prefix)
def __str__(self):
"""This reproduces the input source exactly."""
return self.prefix + str(self.value)
def _eq(self, other):
"""Compares two nodes for equality."""
return (self.type, self.value) == (other.type, other.value)
def convert(grammar, raw_node):
"""Convert raw node to a Node or Leaf instance."""
type, value, context, children = raw_node
if children or type in grammar.number2symbol:
# If there's exactly one child, return that child instead of
# creating a new node.
if len(children) == 1:
return children[0]
return Node(type, children, context=context)
else:
return Leaf(type, value, context=context)
def nice_repr(node, number2name, prefix=False):
def _repr(node):
if isinstance(node, Leaf):
return "%s(%r)" % (number2name[node.type], node.value)
else:
return "%s(%s)" % (number2name[node.type],
', '.join(map(_repr, node.children)))
def _prepr(node):
if isinstance(node, Leaf):
return "%s(%r, %r)" % (number2name[node.type],
node.prefix, node.value)
else:
return "%s(%s)" % (number2name[node.type],
', '.join(map(_prepr, node.children)))
return (prefix and _prepr or _repr)(node)
class NodeVisitor(object):
def __init__(self, number2name, *args):
self.number2name = number2name
self.init(*args)
def init(self, *args):
pass
def visit(self, node):
"""Visit a node."""
method = 'visit_' + self.number2name[node.type]
visitor = getattr(self, method, self.generic_visit)
return visitor(node)
def generic_visit(self, node):
"""Called if no explicit visitor function exists for a node."""
if isinstance(node, Node):
for child in node: # type: ignore
self.visit(child)
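A small sketch of how these (now removed) node classes round-trip source text; the numeric type codes are placeholders, not real grammar numbers:

from sphinx.pycode.nodes import Node, Leaf

# each Leaf context is (prefix, (lineno, column)); the prefix carries the
# whitespace and comments that preceded the token in the input
tree = Node(300, [Leaf(1, 'answer', ('', (1, 0))),
                  Leaf(22, '=', (' ', (1, 7))),
                  Leaf(2, '42', (' ', (1, 9)))])
print(str(tree))          # -> 'answer = 42', reproducing the input exactly
print(tree.get_lineno())  # -> 1, taken from the first leaf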

View File

@@ -1,4 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""The pgen2 package."""

View File

@@ -1,154 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser driver.
This provides a high-level interface to parse a file into a syntax tree.
"""
__author__ = "Guido van Rossum <guido@python.org>"
__all__ = ["Driver", "load_grammar"]
# Python imports
import os
import logging
import sphinx
# Pgen imports
from sphinx.pycode.pgen2 import grammar, parse, token, tokenize, pgen
class Driver(object):
def __init__(self, grammar, convert=None, logger=None):
self.grammar = grammar
if logger is None:
logger = logging.getLogger()
self.logger = logger
self.convert = convert
def parse_tokens(self, tokens, debug=False):
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
p = parse.Parser(self.grammar, self.convert)
p.setup()
lineno = 1
column = 0
type = value = start = end = line_text = None
prefix = ""
opmap = grammar.opmap
for type, value, start, end, line_text in tokens:
if start != (lineno, column):
assert (lineno, column) <= start, ((lineno, column), start)
s_lineno, s_column = start
if lineno < s_lineno:
prefix += "\n" * (s_lineno - lineno)
lineno = s_lineno
column = 0
if column < s_column:
prefix += line_text[column:s_column]
column = s_column
if type in (tokenize.COMMENT, tokenize.NL):
prefix += value
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
continue
if type == token.OP:
type = opmap[value]
# if debug:
# self.logger.debug("%s %r (prefix=%r)",
# token.tok_name[type], value, prefix)
if p.addtoken(type, value, (prefix, start)):
# if debug:
# self.logger.debug("Stop.")
break
prefix = ""
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
else:
# We never broke out -- EOF is too soon (how can this happen???)
raise parse.ParseError("incomplete input", type, value, line_text)
return p.rootnode
def parse_stream_raw(self, stream, debug=False):
"""Parse a stream and return the syntax tree."""
tokens = tokenize.generate_tokens(stream.readline)
return self.parse_tokens(tokens, debug)
def parse_stream(self, stream, debug=False):
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
def parse_file(self, filename, debug=False):
"""Parse a file and return the syntax tree."""
with open(filename) as stream:
return self.parse_stream(stream, debug)
def parse_string(self, text, debug=False):
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(generate_lines(text).next)
return self.parse_tokens(tokens, debug)
def generate_lines(text):
"""Generator that behaves like readline without using StringIO."""
for line in text.splitlines(True):
yield line
while True:
yield ""
def get_compiled_path(filename):
head, tail = os.path.splitext(filename)
if tail == ".txt":
tail = ""
return "%s%s.pickle" % (head, tail)
def compile_grammar(gt='Grammar.txt', logger=None):
"""Compile the grammer."""
if logger is None:
logger = logging.getLogger()
logger.info("Generating grammar tables from %s", gt)
g = pgen.generate_grammar(gt)
gp = get_compiled_path(gt)
logger.info("Writing grammar tables to %s", gp)
try:
g.dump(gp)
except IOError as e:
logger.info("Writing failed:"+str(e))
def load_grammar(gt="Grammar.txt", logger=None):
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger()
gp = get_compiled_path(gt)
if not os.path.exists(gp):
logger.info("Generating grammar tables from %s", gt)
g = pgen.generate_grammar(gt)
else:
g = grammar.Grammar()
g.load(gp)
return g
def _newer(a, b):
"""Inquire whether file a was written since file b."""
if not os.path.exists(a):
return False
if not os.path.exists(b):
return True
return os.path.getmtime(a) >= os.path.getmtime(b)
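For reference, a minimal sketch of how this (now removed) driver was wired up, mirroring the load_grammar/Driver calls deleted from sphinx/pycode/__init__.py above:

from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver

grammar = driver.load_grammar('Grammar-py3.txt')     # loads or regenerates the pickle cache
pydriver = driver.Driver(grammar, convert=nodes.convert)
tree = pydriver.parse_string('x = 1\n')              # a nodes.Node / nodes.Leaf parse tree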

View File

@@ -1,177 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""This module defines the data structures used to represent a grammar.
These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.
There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""
from __future__ import print_function
# Python imports
import pickle
# Local imports
from sphinx.pycode.pgen2 import token
if False:
# For type annotation
from typing import Dict, List, Tuple # NOQA
class Grammar(object):
"""Pgen parsing tables tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The load() method reads the tables from a pickle file, which is
much faster than the other ways offered by subclasses. The pickle
file is written by calling dump() (after loading the grammar
tables using a subclass). The report() method prints a readable
representation of the tables to stdout, for debugging.
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self):
self.symbol2number = {} # type: Dict[unicode, int]
self.number2symbol = {} # type: Dict[int, unicode]
self.states = [] # type: List[List[List[Tuple[int, int]]]]
self.dfas = {} # type: Dict[int, Tuple[List[List[Tuple[int, int]]], unicode]]
self.labels = [(0, "EMPTY")]
self.keywords = {} # type: Dict[unicode, unicode]
self.tokens = {} # type: Dict[unicode, unicode]
self.symbol2label = {} # type: Dict[unicode, unicode]
self.start = 256
def dump(self, filename):
"""Dump the grammar tables to a pickle file."""
with open(filename, "wb") as f:
pickle.dump(self.__dict__, f, 2)
def load(self, filename):
"""Load the grammar tables from a pickle file."""
f = open(filename, "rb")
d = pickle.load(f)
f.close()
self.__dict__.update(d)
def report(self):
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
print("n2s")
pprint(self.number2symbol)
print("states")
pprint(self.states)
print("dfas")
pprint(self.dfas)
print("labels")
pprint(self.labels)
print("start", self.start)
# Map from operator to number (since tokenize doesn't do this)
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
... ELLIPSIS
"""
opmap = {}
for line in opmap_raw.splitlines():
if line:
op, name = line.split()
opmap[op] = getattr(token, name)
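A quick sketch of how the table above is consumed: tokenize reports every operator as the generic OP token, and opmap resolves it to the exact code the parser expects.

from sphinx.pycode.pgen2 import grammar, token

assert grammar.opmap['+='] == token.PLUSEQUAL
assert grammar.opmap['**'] == token.DOUBLESTAR
assert grammar.opmap['->'] == token.RARROW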

View File

@@ -1,100 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Extended to handle raw and unicode literals by Georg Brandl.
"""Safely evaluate Python string literals without using eval()."""
from __future__ import print_function
import re
from six import text_type
simple_escapes = {"a": "\a",
"b": "\b",
"f": "\f",
"n": "\n",
"r": "\r",
"t": "\t",
"v": "\v",
"'": "'",
'"': '"',
"\\": "\\"}
def convert_hex(x, n):
if len(x) < n+1:
raise ValueError("invalid hex string escape ('\\%s')" % x)
try:
return int(x[1:], 16)
except ValueError:
raise ValueError("invalid hex string escape ('\\%s')" % x)
def escape(m):
all, tail = m.group(0, 1)
assert all.startswith("\\")
esc = simple_escapes.get(tail)
if esc is not None:
return esc
elif tail.startswith("x"):
return chr(convert_hex(tail, 2))
elif tail.startswith('u'):
return unichr(convert_hex(tail, 4))
elif tail.startswith('U'):
return unichr(convert_hex(tail, 8))
elif tail.startswith('N'):
import unicodedata
try:
return unicodedata.lookup(tail[1:-1])
except KeyError:
raise ValueError("undefined character name %r" % tail[1:-1])
else:
try:
return chr(int(tail, 8))
except ValueError:
raise ValueError("invalid octal string escape ('\\%s')" % tail)
def escaperaw(m):
all, tail = m.group(0, 1)
if tail.startswith('u'):
return unichr(convert_hex(tail, 4))
elif tail.startswith('U'):
return unichr(convert_hex(tail, 8))
else:
return all
escape_re = re.compile(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})")
uni_escape_re = re.compile(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3}|"
r"u[0-9a-fA-F]{0,4}|U[0-9a-fA-F]{0,8}|N\{.+?\})")
def evalString(s, encoding=None):
regex = escape_re
repl = escape
if encoding and not isinstance(s, text_type):
s = s.decode(encoding)
if s.startswith('u') or s.startswith('U'):
regex = uni_escape_re
s = s[1:]
if s.startswith('r') or s.startswith('R'):
repl = escaperaw
s = s[1:]
assert s.startswith("'") or s.startswith('"'), repr(s[:1])
q = s[0]
if s[:3] == q*3:
q = q*3
assert s.endswith(q), repr(s[-len(q):])
assert len(s) >= 2*len(q)
s = s[len(q):-len(q)]
return regex.sub(repl, s)
def test():
for i in range(256):
c = chr(i)
s = repr(c)
e = evalString(s)
if e != c:
print(i, c, s, e)
if __name__ == "__main__":
test()
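A few illustrative calls (a sketch; this module is removed by the commit). evalString expects the literal exactly as written in source, including quotes and any u/r prefix:

from sphinx.pycode.pgen2.literals import evalString

assert evalString(r"'a\nb'") == 'a\nb'              # backslash escapes are decoded
assert evalString("'''triple-quoted'''") == 'triple-quoted'
assert evalString(r"r'raw\n'") == 'raw\\n'          # raw literals keep the backslash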

File diff suppressed because it is too large

View File

@@ -1,206 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
# Local imports
from sphinx.pycode.pgen2 import token
if False:
# For type annotation
from typing import Any, List, Set, Tuple # NOQA
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(self, msg, type, value, context):
Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
(msg, type, value, context))
self.msg = msg
self.type = type
self.value = value
self.context = context
class Parser(object):
"""Parser engine.
The proper usage sequence is:
p = Parser(grammar, [converter]) # create instance
p.setup([start]) # prepare for parsing
<for each input token>:
if p.addtoken(...): # parse a token; may raise ParseError
break
root = p.rootnode # root of abstract syntax tree
A Parser instance may be reused by calling setup() repeatedly.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See driver.py for how to get input tokens by tokenizing a file or
string.
Parsing is complete when addtoken() returns True; the root of the
abstract syntax tree can then be retrieved from the rootnode
instance variable. When a syntax error occurs, addtoken() raises
the ParseError exception. There is no error recovery; the parser
cannot be used after a syntax error was reported (but it can be
reinitialized by calling setup()).
"""
def __init__(self, grammar, convert=None):
"""Constructor.
The grammar argument is a grammar.Grammar instance; see the
grammar module for more information.
The parser is not ready yet for parsing; you must call the
setup() method to get it started.
The optional convert argument is a function mapping concrete
syntax tree nodes to abstract syntax tree nodes. If not
given, no conversion is done and the syntax tree produced is
the concrete syntax tree. If given, it must be a function of
two arguments, the first being the grammar (a grammar.Grammar
instance), and the second being the concrete syntax tree node
to be converted. The syntax tree is converted from the bottom
up.
A concrete syntax tree node is a (type, value, context, nodes)
tuple, where type is the node type (a token or symbol number),
value is None for symbols and a string for tokens, context is
None or an opaque value used for error reporting (typically a
(lineno, offset) pair), and nodes is a list of children for
symbols, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
"""
self.grammar = grammar
self.convert = convert or (lambda grammar, node: node)
def setup(self, start=None):
"""Prepare for parsing.
This *must* be called before starting to parse.
The optional argument is an alternative start symbol; it
defaults to the grammar's start symbol.
You can use a Parser instance to parse any number of programs;
each time you call setup() the parser is reset to an initial
state determined by the (implicit or explicit) start symbol.
"""
if start is None:
start = self.grammar.start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, value, context, children),
# where children is a list of nodes or None, and context may be None.
newnode = (start, None, None, []) # type: Tuple[unicode, unicode, unicode, List]
stackentry = (self.grammar.dfas[start], 0, newnode)
self.stack = [stackentry]
self.rootnode = None # type: Any
self.used_names = set() # type: Set[unicode]
# Aliased to self.rootnode.used_names in pop()
def addtoken(self, type, value, context):
"""Add a token; return True iff this is the end of the program."""
# Map from token to label
ilabel = self.classify(type, value, context)
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = self.grammar.labels[i]
if ilabel == i:
# Look it up in the list of labels
assert t < 256
# Shift a token; we're done with it
self.shift(type, value, newstate, context)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self.grammar.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, self.grammar.dfas[t], newstate, context)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input",
type, value, context)
else:
# No success finding a transition
raise ParseError("bad input", type, value, context)
def classify(self, type, value, context):
"""Turn a token into a label. (Internal)"""
if type == token.NAME:
# Keep a listing of all used names
self.used_names.add(value)
# Check for reserved words
ilabel = self.grammar.keywords.get(value)
if ilabel is not None:
return ilabel
ilabel = self.grammar.tokens.get(type)
if ilabel is None:
raise ParseError("bad token", type, value, context)
return ilabel
def shift(self, type, value, newstate, context):
"""Shift a token. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, value, context, None) # type: Tuple[unicode, unicode, unicode, List]
newnode = self.convert(self.grammar, newnode)
if newnode is not None:
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
def push(self, type, newdfa, newstate, context):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, None, context, []) # type: Tuple[unicode, unicode, unicode, List]
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
def pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, popnode = self.stack.pop()
newnode = self.convert(self.grammar, popnode)
if newnode is not None:
if self.stack:
dfa, state, node = self.stack[-1]
node[-1].append(newnode)
else:
self.rootnode = newnode
self.rootnode.used_names = self.used_names
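Expanding the usage sequence from the class docstring into a runnable sketch (assumptions: a Grammar-py3.txt file is available on disk, and operator tokens are remapped through grammar.opmap exactly as driver.py does above):

from sphinx.pycode.pgen2 import driver, grammar, parse, token, tokenize

g = driver.load_grammar('Grammar-py3.txt')
p = parse.Parser(g)                    # no converter: nodes stay raw (type, value, context, children) tuples
p.setup()
lines = iter(['x = 1\n'])
for type, value, start, end, line in tokenize.generate_tokens(lambda: next(lines, '')):
    if type in (tokenize.COMMENT, tokenize.NL):
        continue                       # the parser never sees comments or blank-line NLs
    if type == token.OP:
        type = grammar.opmap[value]    # tokenize reports operators as OP; remap them
    if p.addtoken(type, value, (start, line)):
        break                          # ENDMARKER accepted: parsing is complete
root = p.rootnode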

View File

@@ -1,165 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Adapted from parse.py to be compiled with Cython by Georg Brandl.
"""Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
from sphinx.pycode.nodes import Node, Leaf
DEF NAME = 1
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(self, msg, type, value, context):
Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
(msg, type, value, context))
self.msg = msg
self.type = type
self.value = value
self.context = context
cdef class Parser:
cdef public object grammar
cdef public object rootnode
cdef public list stack
cdef public set used_names
cdef int _grammar_start
cdef list _grammar_labels
cdef dict _grammar_dfas
cdef dict _grammar_keywords
cdef dict _grammar_tokens
cdef dict _grammar_number2symbol
def __init__(self, grammar, convert=None):
self.grammar = grammar
#self.convert = convert or noconvert
self._grammar_dfas = grammar.dfas
self._grammar_labels = grammar.labels
self._grammar_keywords = grammar.keywords
self._grammar_tokens = grammar.tokens
self._grammar_number2symbol = grammar.number2symbol
self._grammar_start = grammar.start
def setup(self, start=None):
if start is None:
start = self._grammar_start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, value, context, children),
# where children is a list of nodes or None, and context may be None.
newnode = (start, None, None, [])
stackentry = (self._grammar_dfas[start], 0, newnode)
self.stack = [stackentry]
self.rootnode = None
self.used_names = set() # Aliased to self.rootnode.used_names in pop()
def addtoken(self, int type, value, context):
"""Add a token; return True iff this is the end of the program."""
cdef int ilabel, i, t, state, newstate
# Map from token to label
ilabel = self.classify(type, value, context)
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = self._grammar_labels[i]
if ilabel == i:
# Look it up in the list of labels
## assert t < 256
# Shift a token; we're done with it
self.shift(type, value, newstate, context)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self._grammar_dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, itsdfa, newstate, context)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input",
type, value, context)
else:
# No success finding a transition
raise ParseError("bad input", type, value, context)
cdef int classify(self, int type, value, context):
"""Turn a token into a label. (Internal)"""
if type == NAME:
# Keep a listing of all used names
self.used_names.add(value)
# Check for reserved words
if value in self._grammar_keywords:
return self._grammar_keywords[value]
if type not in self._grammar_tokens:
raise ParseError("bad token", type, value, context)
return self._grammar_tokens[type]
cdef void shift(self, type, value, newstate, context):
"""Shift a token. (Internal)"""
cdef tuple node
dfa, state, node = self.stack[-1]
newnode = (type, value, context, None)
newnode = self.convert(newnode)
if newnode is not None:
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
cdef void push(self, type, newdfa, newstate, context):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, None, context, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
cdef void pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, popnode = self.stack.pop()
newnode = self.convert(popnode)
if newnode is not None:
if self.stack:
dfa, state, node = self.stack[-1]
node[-1].append(newnode)
else:
self.rootnode = newnode
self.rootnode.used_names = self.used_names
cdef convert(self, tuple raw_node):
type, value, context, children = raw_node
if children or type in self._grammar_number2symbol:
# If there's exactly one child, return that child instead of
# creating a new node.
if len(children) == 1:
return children[0]
return Node(type, children, context=context)
else:
return Leaf(type, value, context=context)

View File

@@ -1,403 +0,0 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
from __future__ import print_function
from six import iteritems
from collections import OrderedDict
# Pgen imports
from sphinx.pycode.pgen2 import grammar, token, tokenize
if False:
# For type annotation
from typing import Any, Dict, List, Tuple # NOQA
class PgenGrammar(grammar.Grammar):
pass
class ParserGenerator(object):
def __init__(self, filename, stream=None):
close_stream = None
if stream is None:
stream = open(filename)
close_stream = stream.close
self.filename = filename
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:
close_stream()
self.first = {} # type: Dict[unicode, List[unicode]]
# map from symbol name to set of tokens
self.addfirstsets()
def make_grammar(self):
c = PgenGrammar()
names = list(self.dfas.keys())
names.sort()
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
c.symbol2number[name] = i
c.number2symbol[i] = name
for name in names:
dfa = self.dfas[name]
states = [] # type: List[List[Tuple[int, int]]]
for state in dfa:
arcs = []
for label, next in iteritems(state.arcs):
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
states.append(arcs)
c.states.append(states)
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
c.start = c.symbol2number[self.startsymbol]
return c
def make_first(self, c, name):
rawfirst = self.first[name]
first = {}
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
return first
def make_label(self, c, label):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(c.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in c.symbol2number:
# A symbol name (a non-terminal)
if label in c.symbol2label:
return c.symbol2label[label]
else:
c.labels.append((c.symbol2number[label], None))
c.symbol2label[label] = ilabel
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = getattr(token, label, None)
assert isinstance(itoken, int), label
assert itoken in token.tok_name, label
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
value = eval(label)
if value[0].isalpha():
# A keyword
if value in c.keywords:
return c.keywords[value]
else:
c.labels.append((token.NAME, value))
c.keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
itoken = grammar.opmap[value] # Fails if unknown token
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
def addfirstsets(self):
names = list(self.dfas.keys())
names.sort()
for name in names:
if name not in self.first:
self.calcfirst(name)
#print name, self.first[name].keys()
def calcfirst(self, name):
dfa = self.dfas[name]
self.first[name] = None # dummy to detect left recursion
state = dfa[0]
totalset = {} # type: Dict[unicode, int]
overlapcheck = {}
for label, next in iteritems(state.arcs):
if label in self.dfas:
if label in self.first:
fset = self.first[label]
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
self.calcfirst(label)
fset = self.first[label]
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset[label] = 1
overlapcheck[label] = {label: 1}
inverse = {} # type: Dict[unicode, unicode]
for label, itsfirst in sorted(overlapcheck.items()):
for symbol in sorted(itsfirst):
if symbol in inverse:
raise ValueError("rule %s is ambiguous; %s is in the"
" first sets of %s as well as %s" %
(name, symbol, label, inverse[symbol]))
inverse[symbol] = label
self.first[name] = totalset
def parse(self):
dfas = {}
startsymbol = None
# MSTART: (NEWLINE | RULE)* ENDMARKER
while self.type != token.ENDMARKER:
while self.type == token.NEWLINE:
self.gettoken()
# RULE: NAME ':' RHS NEWLINE
name = self.expect(token.NAME)
self.expect(token.OP, ":")
a, z = self.parse_rhs()
self.expect(token.NEWLINE)
#self.dump_nfa(name, a, z)
dfa = self.make_dfa(a, z)
#self.dump_dfa(name, dfa)
#oldlen = len(dfa)
self.simplify_dfa(dfa)
#newlen = len(dfa)
dfas[name] = dfa
#print name, oldlen, newlen
if startsymbol is None:
startsymbol = name
return dfas, startsymbol
def make_dfa(self, start, finish):
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction. Let's represent sets as dicts with 1 for
# values.
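# Illustration (not from the original source): for a rule like
#   item: '[' rhs ']' | atom ['+' | '*']
# the NFA contains label-less (epsilon) arcs for the optional parts;
# closure() below folds every NFA state reachable through those epsilon
# arcs into one dict, and each distinct dict becomes a single DFA state.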
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def closure(state):
base = {} # type: Dict
addclosure(state, base)
return base
def addclosure(state, base):
assert isinstance(state, NFAState)
if state in base:
return
base[state] = 1
for label, next in state.arcs:
if label is None:
addclosure(next, base)
states = [DFAState(closure(start), finish)]
for state in states: # NB states grows while we're iterating
arcs = {} # type: Dict[unicode, Dict]
for nfastate in state.nfaset:
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in iteritems(arcs):
for st in states:
if st.nfaset == nfaset:
break
else:
st = DFAState(nfaset, finish)
states.append(st)
state.addarc(st, label)
return states # List of DFAState instances; first one is start
def dump_nfa(self, name, start, finish):
print("Dump of NFA for", name)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next in state.arcs:
if next in todo:
j = todo.index(next)
else:
j = len(todo)
todo.append(next)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def dump_dfa(self, name, dfa):
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in iteritems(state.arcs):
print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa):
# This is not theoretically optimal, but works well enough.
# Algorithm: repeatedly look for two states that have the same
# set of arcs (same labels pointing to the same nodes) and
# unify them, until things stop changing.
# dfa is a list of DFAState instances
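# Illustration (not from the original source): if two states agree on
# finality and have identical arcs, e.g. both map 'NAME' -> state 3 and
# ',' -> state 1, the later one is deleted and every arc pointing at it
# is redirected to the survivor via unifystate().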
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfa):
for j in range(i+1, len(dfa)):
state_j = dfa[j]
if state_i == state_j:
#print " unify", i, j
del dfa[j]
for state in dfa:
state.unifystate(state_j, state_i)
changes = True
break
def parse_rhs(self):
# RHS: ALT ('|' ALT)*
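# Added note (illustrative): for  rule: A | B  this builds a fresh start
# state with epsilon (label None) arcs to each alternative's start, and
# epsilon arcs from each alternative's end to a shared final state.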
a, z = self.parse_alt()
if self.value != "|":
return a, z
else:
aa = NFAState()
zz = NFAState()
aa.addarc(a)
z.addarc(zz)
while self.value == "|":
self.gettoken()
a, z = self.parse_alt()
aa.addarc(a)
z.addarc(zz)
return aa, zz
def parse_alt(self):
# ALT: ITEM+
a, b = self.parse_item()
while (self.value in ("(", "[") or
self.type in (token.NAME, token.STRING)):
c, d = self.parse_item()
b.addarc(c)
b = d
return a, b
def parse_item(self):
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
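# Added note (illustrative): '[X]' gets an extra epsilon arc that skips X
# (optional), 'X+' gets a back arc so X repeats at least once, and 'X*'
# returns the start state for both ends so X may be skipped or repeated.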
if self.value == "[":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, "]")
a.addarc(z)
return a, z
else:
a, z = self.parse_atom()
value = self.value
if value not in ("+", "*"):
return a, z
self.gettoken()
z.addarc(a)
if value == "+":
return a, z
else:
return a, a
def parse_atom(self):
# ATOM: '(' RHS ')' | NAME | STRING
if self.value == "(":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, ")")
return a, z
elif self.type in (token.NAME, token.STRING):
a = NFAState()
z = NFAState()
a.addarc(z, self.value)
self.gettoken()
return a, z
else:
self.raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def expect(self, type, value=None):
if self.type != type or (value is not None and self.value != value):
self.raise_error("expected %s/%s, got %s/%s",
type, value, self.type, self.value)
value = self.value
self.gettoken()
return value
def gettoken(self):
tup = next(self.generator)
while tup[0] in (tokenize.COMMENT, tokenize.NL):
tup = next(self.generator)
self.type, self.value, self.begin, self.end, self.line = tup
#print token.tok_name[self.type], repr(self.value)
def raise_error(self, msg, *args):
if args:
try:
msg = msg % args
except:
msg = " ".join([msg] + [str(x) for x in args])
raise SyntaxError(msg, (self.filename, self.end[0],
self.end[1], self.line))
class NFAState(object):
def __init__(self):
self.arcs = [] # type: List[Tuple[unicode, Any]]
# list of (label, NFAState) pairs
def addarc(self, next, label=None):
assert label is None or isinstance(label, str)
assert isinstance(next, NFAState)
self.arcs.append((label, next))
def __hash__(self):
return hash(tuple(x[0] for x in self.arcs))
class DFAState(object):
def __init__(self, nfaset, final):
assert isinstance(nfaset, dict)
assert isinstance(next(iter(nfaset)), NFAState)
assert isinstance(final, NFAState)
self.nfaset = nfaset
self.isfinal = final in nfaset
self.arcs = OrderedDict() # type: OrderedDict
# map from label to DFAState
def __hash__(self):
return hash(tuple(self.arcs))
def addarc(self, next, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next, DFAState)
self.arcs[label] = next
def unifystate(self, old, new):
for label, next in iteritems(self.arcs):
if next is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfaset instance variable
assert isinstance(other, DFAState)
if self.isfinal != other.isfinal:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next in iteritems(self.arcs):
if next is not other.arcs.get(label):
return False
return True
def generate_grammar(filename="Grammar.txt"):
p = ParserGenerator(filename)
return p.make_grammar()

View File

@@ -1,86 +0,0 @@
#! /usr/bin/env python
"""Token constants (from "token.h")."""
# Taken from Python (r53757) and modified to include some tokens
# originally monkeypatched in by pgen2.tokenize
#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
STRING = 3
NEWLINE = 4
INDENT = 5
DEDENT = 6
LPAR = 7
RPAR = 8
LSQB = 9
RSQB = 10
COLON = 11
COMMA = 12
SEMI = 13
PLUS = 14
MINUS = 15
STAR = 16
SLASH = 17
VBAR = 18
AMPER = 19
LESS = 20
GREATER = 21
EQUAL = 22
DOT = 23
PERCENT = 24
BACKQUOTE = 25
LBRACE = 26
RBRACE = 27
EQEQUAL = 28
NOTEQUAL = 29
LESSEQUAL = 30
GREATEREQUAL = 31
TILDE = 32
CIRCUMFLEX = 33
LEFTSHIFT = 34
RIGHTSHIFT = 35
DOUBLESTAR = 36
PLUSEQUAL = 37
MINEQUAL = 38
STAREQUAL = 39
SLASHEQUAL = 40
PERCENTEQUAL = 41
AMPEREQUAL = 42
VBAREQUAL = 43
CIRCUMFLEXEQUAL = 44
LEFTSHIFTEQUAL = 45
RIGHTSHIFTEQUAL = 46
DOUBLESTAREQUAL = 47
DOUBLESLASH = 48
DOUBLESLASHEQUAL = 49
AT = 50
ATEQUAL = 51
RARROW = 52
ELLIPSIS = 53
OP = 54
AWAIT = 55
ASYNC = 56
COMMENT = 57
NL = 58
ERRORTOKEN = 59
N_TOKENS = 60
NT_OFFSET = 256
#--end constants--
tok_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
tok_name[_value] = _name
def ISTERMINAL(x):
return x < NT_OFFSET
def ISNONTERMINAL(x):
return x >= NT_OFFSET
def ISEOF(x):
return x == ENDMARKER

View File

@@ -1,441 +0,0 @@
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:
the token type (see token.py)
the token (a string)
the starting (row, column) indices of the token (a 2-tuple of ints)
the ending (row, column) indices of the token (a 2-tuple of ints)
the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators
Older entry points
tokenize_loop(readline, tokeneater)
tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found.
"""
from __future__ import print_function
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
import string, re
from six import PY3
from sphinx.pycode.pgen2.token import *
from sphinx.pycode.pgen2 import token
if False:
# For type annotation
from typing import List # NOQA
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
"generate_tokens", "untokenize"]
del token
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
r"//=?", r"->",
r"[+\-*/%&|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
if PY3:
Ellipsis_ = r'\.{3}'
Special = group(Ellipsis_, Special)
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog, pseudoprog, single3prog, double3prog = [
re.compile(x) for x in (Token, PseudoToken, Single3, Double3)
]
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog,
"u'''": single3prog, 'u"""': double3prog,
"b'''": single3prog, 'b"""': double3prog,
"ur'''": single3prog, 'ur"""': double3prog,
"br'''": single3prog, 'br"""': double3prog,
"R'''": single3prog, 'R"""': double3prog,
"U'''": single3prog, 'U"""': double3prog,
"B'''": single3prog, 'B"""': double3prog,
"uR'''": single3prog, 'uR"""': double3prog,
"Ur'''": single3prog, 'Ur"""': double3prog,
"UR'''": single3prog, 'UR"""': double3prog,
"bR'''": single3prog, 'bR"""': double3prog,
"Br'''": single3prog, 'Br"""': double3prog,
"BR'''": single3prog, 'BR"""': double3prog,
'r': None, 'R': None,
'u': None, 'U': None,
'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
"r'''", 'r"""', "R'''", 'R"""',
"u'''", 'u"""', "U'''", 'U"""',
"b'''", 'b"""', "B'''", 'B"""',
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
"uR'''", 'uR"""', "UR'''", 'UR"""',
"br'''", 'br"""', "Br'''", 'Br"""',
"bR'''", 'bR"""', "BR'''", 'BR"""',):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"u'", 'u"', "U'", 'U"',
"b'", 'b"', "B'", 'B"',
"ur'", 'ur"', "Ur'", 'Ur"',
"uR'", 'uR"', "UR'", 'UR"',
"br'", 'br"', "Br'", 'Br"',
"bR'", 'bR"', "BR'", 'BR"', ):
single_quoted[t] = t
tabsize = 8
class TokenError(Exception): pass
class StopTokenizing(Exception): pass
def printtoken(type, token, scell, ecell, line): # for testing
srow, scol = scell
erow, ecol = ecell
print("%d,%d-%d,%d:\t%s\t%s" %
(srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
The first parameter, readline, must be a callable object which provides
the same interface as the readline() method of built-in file objects.
Each call to the function should return one line of input as a string.
The second parameter, tokeneater, must also be a callable object. It is
called once for each token, with five arguments, corresponding to the
tuples generated by generate_tokens().
"""
try:
tokenize_loop(readline, tokeneater)
except StopTokenizing:
pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
class Untokenizer:
def __init__(self):
self.tokens = [] # type: List[unicode]
self.prev_row = 1
self.prev_col = 0
def add_whitespace(self, start):
row, col = start
assert row <= self.prev_row
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
def untokenize(self, iterable):
for t in iterable:
if len(t) == 2:
self.compat(t, iterable)
break
tok_type, token, start, end, line = t
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
if tok_type in (NEWLINE, NL):
self.prev_row += 1
self.prev_col = 0
return "".join(self.tokens)
def compat(self, token, iterable):
startline = False
indents = []
toks_append = self.tokens.append
toknum, tokval = token
if toknum in (NAME, NUMBER):
tokval += ' '
if toknum in (NEWLINE, NL):
startline = True
for tok in iterable:
toknum, tokval = tok[:2]
if toknum in (NAME, NUMBER):
tokval += ' '
if toknum == INDENT:
indents.append(tokval)
continue
elif toknum == DEDENT:
indents.pop()
continue
elif toknum in (NEWLINE, NL):
startline = True
elif startline and indents:
toks_append(indents[-1])
startline = False
toks_append(tokval)
def untokenize(iterable):
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
with at least two elements, a token number and token value. If
only two tokens are passed, the resulting output is poor.
Round-trip invariant for full input:
Untokenized source will match input source exactly
Round-trip invariant for limited input:
    # Output text will tokenize back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tok in generate_tokens(readline)]
assert t1 == t2
"""
ut = Untokenizer()
return ut.untokenize(iterable)
def generate_tokens(readline):
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
readline() method of built-in file objects. Each call to the function
should return one line of input as a string. Alternately, readline
can be a callable function terminating with StopIteration:
readline = open(myfile).next # Example of alternate readline
The generator produces 5-tuples with these members: the token type; the
token string; a 2-tuple (srow, scol) of ints specifying the row and
column where the token begins in the source; a 2-tuple (erow, ecol) of
ints specifying the row and column where the token ends in the source;
and the line on which the token was found. The line passed is the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
namechars, numchars = string.ascii_letters + '_', '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]
while 1: # loop over lines in stream
try:
line = readline()
except StopIteration:
line = ''
# if we are not at the end of the file make sure the
# line ends with a newline because the parser depends
# on that.
if line:
line = line.rstrip() + '\n'
lnum = lnum + 1
pos, max = 0, len(line)
if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart) # type: ignore
endmatch = endprog.match(line) # type: ignore
if endmatch:
pos = end = endmatch.end(0)
yield (STRING, contstr + line[:end],
strstart, (lnum, end), contline + line) # type: ignore
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
yield (ERRORTOKEN, contstr + line,
strstart, (lnum, len(line)), contline) # type: ignore
contstr = ''
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue
elif parenlev == 0 and not continued: # new statement
if not line: break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == ' ': column = column + 1
elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
elif line[pos] == '\f': column = 0
else: break
pos = pos + 1
if pos == max: break
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
nl_pos = pos + len(comment_token)
yield (COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
yield (NL, line[nl_pos:],
(lnum, nl_pos), (lnum, len(line)), line)
else:
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
if column > indents[-1]: # count indents or dedents
indents.append(column)
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]:
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
if end < max:
next_pseudomatch = pseudoprog.match(line, end)
if next_pseudomatch:
n_start, n_end = next_pseudomatch.span(1)
n_token = line[n_start:n_end]
else:
n_token = None
else:
n_token = None
if initial in numchars or (
initial == '.' and token not in ('.', '...')
): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
newline = NEWLINE
if parenlev > 0:
newline = NL
yield (newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield (STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
elif initial in single_quoted or \
token[:2] in single_quoted or \
token[:3] in single_quoted:
if token[-1] == '\n': # continued string
strstart = (lnum, start)
endprog = (endprogs[initial] or endprogs[token[1]] or
endprogs[token[2]])
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield (STRING, token, spos, epos, line)
elif token == 'await' and n_token:
yield (AWAIT, token, spos, epos, line)
elif token == 'async' and n_token in ('def', 'for', 'with'):
yield (ASYNC, token, spos, epos, line)
elif initial in namechars: # ordinary name
yield (NAME, token, spos, epos, line)
elif token in ('...',): # ordinary name
yield (NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
# This yield is new; needed for better idempotency:
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{': parenlev = parenlev + 1
elif initial in ')]}': parenlev = parenlev - 1
yield (OP, token, spos, epos, line)
else:
yield (ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos = pos + 1
for _ in indents[1:]: # pop remaining indent levels
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__': # testing
import sys
if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
else: tokenize(sys.stdin.readline)

View File

@@ -63,20 +63,30 @@ def test_ModuleAnalyzer_find_tags():
' """function baz"""\n'
' pass\n'
'\n'
'@decorator\n'
'@decorator1\n'
'@decorator2\n'
'def quux():\n'
' pass\n')
' pass\n' # line: 21
'\n'
'class Corge(object):\n'
' @decorator1\n'
' @decorator2\n'
' def grault(self):\n'
' pass\n')
analyzer = ModuleAnalyzer.for_string(code, 'module')
tags = analyzer.find_tags()
assert set(tags.keys()) == {'Foo', 'Foo.__init__', 'Foo.bar',
'Foo.Baz', 'Foo.Baz.__init__', 'qux', 'quux'}
assert tags['Foo'] == ('class', 1, 13) # type, start, end
assert tags['Foo.__init__'] == ('def', 3, 5)
assert tags['Foo.bar'] == ('def', 6, 9)
assert tags['Foo.Baz'] == ('class', 10, 13)
assert tags['Foo.Baz.__init__'] == ('def', 11, 13)
assert tags['qux'] == ('def', 14, 17)
assert tags['quux'] == ('def', 18, 21) # decorator
'Foo.Baz', 'Foo.Baz.__init__', 'qux', 'quux',
'Corge', 'Corge.grault'}
assert tags['Foo'] == ('class', 1, 12) # type, start, end
assert tags['Foo.__init__'] == ('def', 3, 4)
assert tags['Foo.bar'] == ('def', 6, 8)
assert tags['Foo.Baz'] == ('class', 10, 12)
assert tags['Foo.Baz.__init__'] == ('def', 11, 12)
assert tags['qux'] == ('def', 14, 16)
assert tags['quux'] == ('def', 18, 21)
assert tags['Corge'] == ('class', 23, 27)
assert tags['Corge.grault'] == ('def', 24, 27)
def test_ModuleAnalyzer_find_attr_docs():
@@ -114,6 +124,8 @@ def test_ModuleAnalyzer_find_attr_docs():
('Foo', 'attr3'),
('Foo', 'attr4'),
('Foo', 'attr5'),
('Foo', 'attr6'),
('Foo', 'attr7'),
('Foo', 'attr8'),
('Foo', 'attr9')}
assert docs[('Foo', 'attr1')] == ['comment before attr1', '']
@@ -121,19 +133,23 @@ def test_ModuleAnalyzer_find_attr_docs():
assert docs[('Foo', 'attr4')] == ['long attribute comment', '']
assert docs[('Foo', 'attr4')] == ['long attribute comment', '']
assert docs[('Foo', 'attr5')] == ['attribute comment for attr5', '']
assert docs[('Foo', 'attr6')] == ['this comment is ignored', '']
assert docs[('Foo', 'attr7')] == ['this comment is ignored', '']
assert docs[('Foo', 'attr8')] == ['attribute comment for attr8', '']
assert docs[('Foo', 'attr9')] == ['string after attr9', '']
assert analyzer.tagorder == {'Foo': 0,
'Foo.__init__': 6,
'Foo.__init__': 8,
'Foo.attr1': 1,
'Foo.attr2': 2,
'Foo.attr3': 3,
'Foo.attr4': 4,
'Foo.attr5': 5,
'Foo.attr8': 8,
'Foo.attr9': 10,
'Foo.bar': 11,
'baz': 12,
'Qux': 13,
'Qux.attr1': 14,
'Qux.attr2': 15}
'Foo.attr6': 6,
'Foo.attr7': 7,
'Foo.attr8': 10,
'Foo.attr9': 12,
'Foo.bar': 13,
'baz': 14,
'Qux': 15,
'Qux.attr1': 16,
'Qux.attr2': 17}

View File

@@ -20,7 +20,6 @@ Release checklist
* Check diff by `git diff`
* `git commit -am 'Bump to x.y.z final'`
* `make clean`
* `python setup.py compile_grammar`
* `python setup.py release bdist_wheel sdist upload --identity=[your key]`
* open https://pypi.python.org/pypi/Sphinx and check there are no obvious errors
* `git tag x.y.z` with version number