Some speedups in pytree.

Add Cython parse.py replacement, yielding a 2x speedup in parsing.
This commit is contained in:
Georg Brandl 2008-12-30 01:29:49 +01:00
parent eb66f15544
commit eb7d654cb0
2 changed files with 165 additions and 14 deletions

View File

@ -0,0 +1,156 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
from sphinx.pycode.pytree import Node, Leaf
DEF NAME = 1
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(self, msg, type, value, context):
Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
(msg, type, value, context))
self.msg = msg
self.type = type
self.value = value
self.context = context
cdef class Parser:
cdef public grammar, stack, rootnode, used_names
cdef _grammar_dfas, _grammar_labels, _grammar_keywords, _grammar_tokens
cdef _grammar_number2symbol
def __init__(self, grammar, convert=None):
self.grammar = grammar
#self.convert = convert or noconvert
self._grammar_dfas = grammar.dfas
self._grammar_labels = grammar.labels
self._grammar_keywords = grammar.keywords
self._grammar_tokens = grammar.tokens
self._grammar_number2symbol = grammar.number2symbol
def setup(self, start=None):
if start is None:
start = self.grammar.start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, value, context, children),
# where children is a list of nodes or None, and context may be None.
newnode = (start, None, None, [])
stackentry = (self._grammar_dfas[start], 0, newnode)
self.stack = [stackentry]
self.rootnode = None
self.used_names = set() # Aliased to self.rootnode.used_names in pop()
def addtoken(self, type, value, context):
"""Add a token; return True iff this is the end of the program."""
cdef int ilabel, i, t, state, newstate
# Map from token to label
ilabel = self.classify(type, value, context)
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t, v = self._grammar_labels[i]
if ilabel == i:
# Look it up in the list of labels
## assert t < 256
# Shift a token; we're done with it
self.shift(type, value, newstate, context)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
elif t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self._grammar_dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, itsdfa, newstate, context)
break # To continue the outer while loop
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input",
type, value, context)
else:
# No success finding a transition
raise ParseError("bad input", type, value, context)
cdef int classify(self, type, value, context):
"""Turn a token into a label. (Internal)"""
if type == NAME:
# Keep a listing of all used names
self.used_names.add(value)
# Check for reserved words
ilabel = self._grammar_keywords.get(value)
if ilabel is not None:
return ilabel
ilabel = self._grammar_tokens.get(type)
if ilabel is None:
raise ParseError("bad token", type, value, context)
return ilabel
cdef void shift(self, type, value, newstate, context):
"""Shift a token. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, value, context, None)
newnode = self.convert(newnode)
if newnode is not None:
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
cdef void push(self, type, newdfa, newstate, context):
"""Push a nonterminal. (Internal)"""
dfa, state, node = self.stack[-1]
newnode = (type, None, context, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
cdef void pop(self):
"""Pop a nonterminal. (Internal)"""
popdfa, popstate, popnode = self.stack.pop()
newnode = self.convert(popnode)
if newnode is not None:
if self.stack:
dfa, state, node = self.stack[-1]
node[-1].append(newnode)
else:
self.rootnode = newnode
self.rootnode.used_names = self.used_names
cdef convert(self, raw_node):
type, value, context, children = raw_node
if children or type in self._grammar_number2symbol:
# If there's exactly one child, return that child instead of
# creating a new node.
if len(children) == 1:
return children[0]
return Node(type, children, context=context)
else:
return Leaf(type, value, context=context)

View File

@ -29,11 +29,6 @@ class Base(object):
children = () # Tuple of subnodes
was_changed = False
def __new__(cls, *args, **kwds):
"""Constructor that prevents Base from being instantiated."""
assert cls is not Base, "Cannot instantiate Base"
return object.__new__(cls)
def __eq__(self, other):
"""Compares two nodes for equality.
@ -132,7 +127,7 @@ class Node(Base):
"""Concrete implementation for interior nodes."""
def __init__(self, type, children, context=None, prefix=None):
def __init__(self, type, children, context=None):
"""Initializer.
Takes a type constant (a symbol number >= 256), a sequence of
@ -140,14 +135,14 @@ class Node(Base):
As a side effect, the parent pointers of the children are updated.
"""
assert type >= 256, type
# assert type >= 256, type
self.type = type
self.children = list(children)
for ch in self.children:
assert ch.parent is None, repr(ch)
# assert ch.parent is None, repr(ch)
ch.parent = self
if prefix is not None:
self.set_prefix(prefix)
# if prefix is not None:
# self.set_prefix(prefix)
def __repr__(self):
return "%s(%s, %r)" % (self.__class__.__name__,
@ -206,19 +201,19 @@ class Leaf(Base):
lineno = 0 # Line where this token starts in the input
column = 0 # Column where this token tarts in the input
def __init__(self, type, value, context=None, prefix=None):
def __init__(self, type, value, context=None):
"""Initializer.
Takes a type constant (a token number < 256), a string value,
and an optional context keyword argument.
"""
assert 0 <= type < 256, type
# assert 0 <= type < 256, type
if context is not None:
self.prefix, (self.lineno, self.column) = context
self.type = type
self.value = value
if prefix is not None:
self.prefix = prefix
# if prefix is not None:
# self.prefix = prefix
def __repr__(self):
return "%s(%r, %r, %r)" % (self.__class__.__name__,