Mirror of https://github.com/sphinx-doc/sphinx.git
Replaced the merging algorithm with one that handles similarities better. It's awfully slow, though; if anybody has a better idea, please implement it.
commit 85b8a451a6
parent 0cf175e0b2
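The gist of the change, as the diff below shows: instead of walking the two doctrees in lockstep and patching up mismatches with merge_node/make_diff, every (old, new) node pair is scored with a Levenshtein-based ratio, the pairs are sorted by that score, and each new node greedily takes the old node's uid when the ratio is below 65. A minimal, self-contained sketch of that idea on plain strings rather than doctree nodes — the helper names here (levenshtein, match_by_ratio) are illustrative, not Sphinx APIs:

from itertools import product
from operator import itemgetter

def levenshtein(a, b):
    # plain dynamic-programming edit distance
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                # deletion
                               current[j - 1] + 1,             # insertion
                               previous[j - 1] + (ca != cb)))  # substitution
        previous = current
    return previous[-1]

def get_ratio(old, new):
    # edits per 100 characters of the old string; 0 means identical
    if old == new:
        return 0
    return levenshtein(old, new) / (len(old) / 100.0)

def match_by_ratio(old_items, new_items, threshold=65):
    # Score every (old, new) pair -- the O(n*m) part that makes the commit
    # "awfully slow" -- then walk the pairs from most to least similar.
    ratios = sorted(((get_ratio(o, n), o, n)
                     for o, n in product(old_items, new_items)),
                    key=itemgetter(0))
    matched = {}
    for ratio, old_item, new_item in ratios:
        if new_item in matched:
            continue
        # Below the threshold the new item counts as a changed version of the
        # old one (in Sphinx it would inherit the old node's uid); otherwise
        # it is treated as new content and would get a fresh uuid4().hex.
        matched[new_item] = old_item if ratio < threshold else None
    return matched

For instance, match_by_ratio(["foo bar", "spam"], ["foo baz", "eggs"]) would pair "foo baz" with "foo bar" and leave "eggs" unmatched, mirroring how the new merge_doctrees decides which nodes keep their identifiers.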
@@ -10,12 +10,9 @@
     :license: BSD, see LICENSE for details.
 """
 from uuid import uuid4
+from operator import itemgetter
+from collections import defaultdict
 from itertools import product
-try:
-    from itertools import izip_longest as zip_longest
-except ImportError:
-    from itertools import zip_longest
-from difflib import SequenceMatcher
 
 from sphinx.util import PeekableIterator
 
@@ -34,19 +31,6 @@ def add_uids(doctree, condition):
         node.uid = uuid4().hex
         yield node
 
-def merge_node(old, new):
-    """
-    Merges the `old` node with the `new` one, if it's successful the `new` node
-    get's the unique identifier of the `new` one and ``True`` is returned. If
-    the merge is unsuccesful ``False`` is returned.
-    """
-    equals, changed, replaced = make_diff(old.rawsource,
-                                          new.rawsource)
-    if equals or changed:
-        new.uid = old.uid
-        return True
-    return False
-
 def merge_doctrees(old, new, condition):
     """
     Merges the `old` doctree with the `new` one while looking at nodes matching
@@ -58,78 +42,34 @@ def merge_doctrees(old, new, condition):
     :param condition:
         A callable which returns either ``True`` or ``False`` for a given node.
     """
-    old_iter = PeekableIterator(old.traverse(condition))
-    new_iter = PeekableIterator(new.traverse(condition))
-    old_nodes = []
-    new_nodes = []
-    for old_node, new_node in zip_longest(old_iter, new_iter):
-        if old_node is None:
-            new_nodes.append(new_node)
+    old_nodes = old.traverse(condition)
+    new_nodes = new.traverse(condition)
+    ratios = defaultdict(list)
+    for old_node, new_node in product(old_nodes, new_nodes):
+        ratios[old_node, new_node] = get_ratio(old_node.rawsource,
+                                               new_node.rawsource)
+    ratios = sorted(ratios.iteritems(), key=itemgetter(1))
+    seen = set()
+    for (old_node, new_node), ratio in ratios:
+        if new_node in seen:
             continue
-        if new_node is None:
-            old_nodes.append(old_node)
-            continue
-        if not merge_node(old_node, new_node):
-            if old_nodes:
-                for i, very_old_node in enumerate(old_nodes):
-                    if merge_node(very_old_node, new_node):
-                        del old_nodes[i]
-                        # If the last identified node which has not matched the
-                        # unidentified node matches the current one, we have to
-                        # assume that the last unidentified one has been
-                        # inserted.
-                        #
-                        # As the required time multiplies with each insert, we
-                        # want to avoid that by checking if the next
-                        # unidentified node matches the current identified one
-                        # and if so we make a shift.
-                        if i == len(old_nodes):
-                            next_new_node = new_iter.next()
-                            if not merge_node(old_node, next_new_node):
-                                new_iter.push(next_new_node)
-                        break
-            else:
-                old_nodes.append(old_node)
-                new_nodes.append(new_node)
-    for (i, new_node), (j, old_node) in product(enumerate(new_nodes),
-                                                enumerate(old_nodes)):
-        if merge_node(old_node, new_node):
-            del new_nodes[i]
-            del old_nodes[j]
-    for node in new_nodes:
-        node.uid = uuid4().hex
-        # Yielding the new nodes here makes it possible to use this generator
-        # like add_uids
-        yield node
+        else:
+            seen.add(new_node)
+        if ratio < 65:
+            new_node.uid = old_node.uid
+        else:
+            new_node.uid = uuid4().hex
+        yield new_node
 
-def make_diff(old, new):
+def get_ratio(old, new):
     """
-    Takes two strings `old` and `new` and returns a :class:`tuple` of boolean
-    values ``(equals, changed, replaced)``.
-
-    equals
-
-        ``True`` if the `old` string and the `new` one are equal.
-
-    changed
-
-        ``True`` if the `new` string is a changed version of the `old` one.
-
-    replaced
-
-        ``True`` if the `new` string and the `old` string are totally
-        different.
-
-    .. note:: This assumes the two strings are human readable text or at least
-              something very similar to that, otherwise it can not detect if
-              the string has been changed or replaced. In any case the
-              detection should not be considered reliable.
+    Returns a "similiarity ratio" representing the similarity between the two
+    strings where 0 is equal and anything above less than equal.
     """
     if old == new:
-        return True, False, False
-    if new in old or levenshtein_distance(old, new) / (len(old) / 100.0) < 70:
-        return False, True, False
-    return False, False, True
+        return 0
+    ratio = levenshtein_distance(old, new) / (len(old) / 100.0)
+    return ratio
 
 def levenshtein_distance(a, b):
     if len(a) < len(b):
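The ratio get_ratio returns is the Levenshtein distance expressed as edits per 100 characters of the old string (0 means identical), so the ``ratio < 65`` test in merge_doctrees roughly reads "fewer than 65 edits per 100 characters of the old text". A small worked example, assuming a standard edit-distance implementation since the rest of levenshtein_distance is cut off above:

old = "Sphinx is a documentation generator."        # 36 characters
new = "Sphinx is a great documentation generator."  # " great" inserted
# levenshtein_distance(old, new) == 6 (six inserted characters)
ratio = 6 / (len(old) / 100.0)  # 6 / 0.36 ~= 16.7
# 16.7 < 65, so merge_doctrees would treat `new` as a changed version of
# `old` and let the new node keep the old node's uid; a fully rewritten
# paragraph would typically score well above 65 and get a fresh uuid4().hex.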