Replaced the merging algorithm with one that handles similarities better, it's awfully slow though, if anybody has a better idea please implement it

2025-02-25 18:55:22 -06:00 · 2010-08-15 20:34:08 +02:00 · 2010-08-15 20:34:08 +02:00 · 85b8a451a6
commit 85b8a451a6
parent 0cf175e0b2
1 changed files with 25 additions and 85 deletions
--- a/sphinx/versioning.py
+++ b/sphinx/versioning.py
@ -10,12 +10,9 @@
    :license: BSD, see LICENSE for details.
 """
 from uuid import uuid4
+from operator import itemgetter
+from collections import defaultdict
 from itertools import product
-try:
-    from itertools import izip_longest as zip_longest
-except ImportError:
-    from itertools import zip_longest
-from difflib import SequenceMatcher

 from sphinx.util import PeekableIterator

@ -34,19 +31,6 @@ def add_uids(doctree, condition):
        node.uid = uuid4().hex
        yield node

-def merge_node(old, new):
-    """
-    Merges the `old` node with the `new` one, if it's successful the `new` node
-    get's the unique identifier of the `new` one and ``True`` is returned. If
-    the merge is unsuccesful ``False`` is returned.
-    """
-    equals, changed, replaced = make_diff(old.rawsource,
-                                          new.rawsource)
-    if equals or changed:
-        new.uid = old.uid
-        return True
-    return False
-
 def merge_doctrees(old, new, condition):
    """
    Merges the `old` doctree with the `new` one while looking at nodes matching
@ -58,78 +42,34 @@ def merge_doctrees(old, new, condition):
    :param condition:
        A callable which returns either ``True`` or ``False`` for a given node.
    """
-    old_iter = PeekableIterator(old.traverse(condition))
-    new_iter = PeekableIterator(new.traverse(condition))
-    old_nodes = []
-    new_nodes = []
-    for old_node, new_node in zip_longest(old_iter, new_iter):
-        if old_node is None:
-            new_nodes.append(new_node)
+    old_nodes = old.traverse(condition)
+    new_nodes = new.traverse(condition)
+    ratios = defaultdict(list)
+    for old_node, new_node in product(old_nodes, new_nodes):
+        ratios[old_node, new_node] = get_ratio(old_node.rawsource,
+                                               new_node.rawsource)
+    ratios = sorted(ratios.iteritems(), key=itemgetter(1))
+    seen = set()
+    for (old_node, new_node), ratio in ratios:
+        if new_node in seen:
            continue
-        if new_node is None:
-            old_nodes.append(old_node)
-            continue
-        if not merge_node(old_node, new_node):
-            if old_nodes:
-                for i, very_old_node in enumerate(old_nodes):
-                    if merge_node(very_old_node, new_node):
-                        del old_nodes[i]
-                        # If the last identified node which has not matched the
-                        # unidentified node matches the current one, we have to
-                        # assume that the last unidentified one has been
-                        # inserted.
-                        #
-                        # As the required time multiplies with each insert, we
-                        # want to avoid that by checking if the next
-                        # unidentified node matches the current identified one
-                        # and if so we make a shift.
-                        if i == len(old_nodes):
-                            next_new_node = new_iter.next()
-                            if not merge_node(old_node, next_new_node):
-                                new_iter.push(next_new_node)
-                        break
-            else:
-                old_nodes.append(old_node)
-                new_nodes.append(new_node)
-    for (i, new_node), (j, old_node) in product(enumerate(new_nodes),
-                                                enumerate(old_nodes)):
-        if merge_node(old_node, new_node):
-            del new_nodes[i]
-            del old_nodes[j]
-    for node in new_nodes:
-        node.uid = uuid4().hex
-        # Yielding the new nodes here makes it possible to use this generator
-        # like add_uids
-        yield node
+        else:
+            seen.add(new_node)
+        if ratio < 65:
+            new_node.uid = old_node.uid
+        else:
+            new_node.uid = uuid4().hex
+            yield new_node

-def make_diff(old, new):
+def get_ratio(old, new):
    """
-    Takes two strings `old` and `new` and returns a :class:`tuple` of boolean
-    values ``(equals, changed, replaced)``.
-
-    equals
-
-        ``True`` if the `old` string and the `new` one are equal.
-
-    changed
-
-        ``True`` if the `new` string is a changed version of the `old` one.
-
-    replaced
-
-        ``True`` if the `new` string and the `old` string are totally
-        different.
-
-    .. note:: This assumes the two strings are human readable text or at least
-              something very similar to that, otherwise it can not detect if
-              the string has been changed or replaced. In any case the
-              detection should not be considered reliable.
+    Returns a "similiarity ratio" representing the similarity between the two
+    strings where 0 is equal and anything above less than equal.
    """
    if old == new:
-        return True, False, False
-    if new in old or levenshtein_distance(old, new) / (len(old) / 100.0) < 70:
-        return False, True, False
-    return False, False, True
+        return 0
+    ratio = levenshtein_distance(old, new) / (len(old) / 100.0)
+    return ratio

 def levenshtein_distance(a, b):
    if len(a) < len(b):