Replaced the merging algorithm with one that handles similarities better, it's awfully slow though, if anybody has a better idea please implement it

This commit is contained in:
Daniel Neuhäuser 2010-08-15 20:34:08 +02:00
parent 0cf175e0b2
commit 85b8a451a6

View File

@ -10,12 +10,9 @@
:license: BSD, see LICENSE for details. :license: BSD, see LICENSE for details.
""" """
from uuid import uuid4 from uuid import uuid4
from operator import itemgetter
from collections import defaultdict
from itertools import product from itertools import product
try:
from itertools import izip_longest as zip_longest
except ImportError:
from itertools import zip_longest
from difflib import SequenceMatcher
from sphinx.util import PeekableIterator from sphinx.util import PeekableIterator
@ -34,19 +31,6 @@ def add_uids(doctree, condition):
node.uid = uuid4().hex node.uid = uuid4().hex
yield node yield node
def merge_node(old, new):
"""
Merges the `old` node with the `new` one, if it's successful the `new` node
get's the unique identifier of the `new` one and ``True`` is returned. If
the merge is unsuccesful ``False`` is returned.
"""
equals, changed, replaced = make_diff(old.rawsource,
new.rawsource)
if equals or changed:
new.uid = old.uid
return True
return False
def merge_doctrees(old, new, condition): def merge_doctrees(old, new, condition):
""" """
Merges the `old` doctree with the `new` one while looking at nodes matching Merges the `old` doctree with the `new` one while looking at nodes matching
@ -58,78 +42,34 @@ def merge_doctrees(old, new, condition):
:param condition: :param condition:
A callable which returns either ``True`` or ``False`` for a given node. A callable which returns either ``True`` or ``False`` for a given node.
""" """
old_iter = PeekableIterator(old.traverse(condition)) old_nodes = old.traverse(condition)
new_iter = PeekableIterator(new.traverse(condition)) new_nodes = new.traverse(condition)
old_nodes = [] ratios = defaultdict(list)
new_nodes = [] for old_node, new_node in product(old_nodes, new_nodes):
for old_node, new_node in zip_longest(old_iter, new_iter): ratios[old_node, new_node] = get_ratio(old_node.rawsource,
if old_node is None: new_node.rawsource)
new_nodes.append(new_node) ratios = sorted(ratios.iteritems(), key=itemgetter(1))
seen = set()
for (old_node, new_node), ratio in ratios:
if new_node in seen:
continue continue
if new_node is None: else:
old_nodes.append(old_node) seen.add(new_node)
continue if ratio < 65:
if not merge_node(old_node, new_node): new_node.uid = old_node.uid
if old_nodes: else:
for i, very_old_node in enumerate(old_nodes): new_node.uid = uuid4().hex
if merge_node(very_old_node, new_node): yield new_node
del old_nodes[i]
# If the last identified node which has not matched the
# unidentified node matches the current one, we have to
# assume that the last unidentified one has been
# inserted.
#
# As the required time multiplies with each insert, we
# want to avoid that by checking if the next
# unidentified node matches the current identified one
# and if so we make a shift.
if i == len(old_nodes):
next_new_node = new_iter.next()
if not merge_node(old_node, next_new_node):
new_iter.push(next_new_node)
break
else:
old_nodes.append(old_node)
new_nodes.append(new_node)
for (i, new_node), (j, old_node) in product(enumerate(new_nodes),
enumerate(old_nodes)):
if merge_node(old_node, new_node):
del new_nodes[i]
del old_nodes[j]
for node in new_nodes:
node.uid = uuid4().hex
# Yielding the new nodes here makes it possible to use this generator
# like add_uids
yield node
def make_diff(old, new): def get_ratio(old, new):
""" """
Takes two strings `old` and `new` and returns a :class:`tuple` of boolean Returns a "similiarity ratio" representing the similarity between the two
values ``(equals, changed, replaced)``. strings where 0 is equal and anything above less than equal.
equals
``True`` if the `old` string and the `new` one are equal.
changed
``True`` if the `new` string is a changed version of the `old` one.
replaced
``True`` if the `new` string and the `old` string are totally
different.
.. note:: This assumes the two strings are human readable text or at least
something very similar to that, otherwise it can not detect if
the string has been changed or replaced. In any case the
detection should not be considered reliable.
""" """
if old == new: if old == new:
return True, False, False return 0
if new in old or levenshtein_distance(old, new) / (len(old) / 100.0) < 70: ratio = levenshtein_distance(old, new) / (len(old) / 100.0)
return False, True, False return ratio
return False, False, True
def levenshtein_distance(a, b): def levenshtein_distance(a, b):
if len(a) < len(b): if len(a) < len(b):