Skip to content

Commit

Permalink
Made accurate and standard matching a bit more accurate (#122)
Browse files Browse the repository at this point in the history
Seems to affect fast matching negatively, but that should be acceptable.
  • Loading branch information
regebro authored Oct 16, 2023
1 parent 6217983 commit d82e1fe
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 21 deletions.
3 changes: 2 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ Changes
2.6.4 (unreleased)
------------------

- Nothing changed yet.
- Changed the comparison to make accurate and standard more accurate,
although fast gets less accurate as a result.


2.6.3 (2023-05-21)
Expand Down
38 changes: 19 additions & 19 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,12 @@ def test_compare_different_leafs(self):
left = lefttree.xpath("/document/story/section[2]/para")[0]
right = righttree.xpath("/document/story/section[2]/para")[0]

self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.75)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.80952380)

# These nodes should not be very similar
left = lefttree.xpath("/document/story/section[1]/para")[0]
right = righttree.xpath("/document/story/section[1]/para")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45614035087719)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.53731343)

def test_compare_different_nodes(self):
left = """<document>
Expand Down Expand Up @@ -293,7 +293,7 @@ def test_compare_with_xmlid(self):
right = differ.right.xpath("/document/story/section[1]")[0]

# These are very similar
self.assertEqual(differ.leaf_ratio(left, right), 0.9)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9210526)
# And one out of two children in common
self.assertEqual(differ.child_ratio(left, right), 0.5)
# But different ids, hence 0 as match
Expand All @@ -312,7 +312,7 @@ def test_compare_with_xmlid(self):
# has an xml:id, so they do not match.
left = differ.left.xpath("/document/story/section[3]")[0]
right = differ.right.xpath("/document/story/section[3]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.81818181818)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8666667)
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

Expand Down Expand Up @@ -376,7 +376,7 @@ def test_compare_with_uniqueattrs(self):
right = differ.right.xpath("/document/story/section[1]")[0]

# These are very similar
self.assertEqual(differ.leaf_ratio(left, right), 0.90625)
self.assertEqual(differ.leaf_ratio(left, right), 0.925)
# And one out of two children in common
self.assertEqual(differ.child_ratio(left, right), 0.5)
# But different names, hence 0 as match
Expand All @@ -395,17 +395,17 @@ def test_compare_with_uniqueattrs(self):
# has a name, so they do not match.
left = differ.left.xpath("/document/story/section[3]")[0]
right = differ.right.xpath("/document/story/section[3]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8387097)
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

# Now these are structurally similar, have the same name, but
# one of them is not a section, so the uniqueattr does not match
left = differ.left.xpath("/document/story/section[1]")[0]
right = differ.right.xpath("/document/story/subsection[1]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9638554)
self.assertEqual(differ.child_ratio(left, right), 0.5)
self.assertAlmostEqual(differ.node_ratio(left, right), 0.7905694150420949)
self.assertAlmostEqual(differ.node_ratio(left, right), 0.7677947)

def test_compare_node_rename(self):
left = """<document>
Expand All @@ -430,24 +430,24 @@ def test_compare_node_rename(self):
left = differ.left.xpath("/document/para[1]")[0]
right = differ.right.xpath("/document/section[1]")[0]

# These have different tags, but should still match
self.assertEqual(differ.leaf_ratio(left, right), 1.0)
# These have different tags, so don't match that great any more
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.7441860)

# These have different tags, and different attribute value,
# but still similar enough
left = differ.left.xpath("/document/para[2]")[0]
right = differ.right.xpath("/document/section[2]")[0]

# These have different tags, but should still match
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.76190476190476)
# These have different tags, so don't match that great any more
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.6578947)

# These have different tags, and different attribute values,
# and are no longer similar enough
left = differ.left.xpath("/document/para[3]")[0]
right = differ.right.xpath("/document/section[3]")[0]

# These are too different
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45161290322580)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.4)

def test_compare_namespaces(self):
left = """<document>
Expand All @@ -472,8 +472,8 @@ def test_compare_namespaces(self):
"/document/foo:para[1]", namespaces={"foo": "otheruri"}
)[0]

# These have different namespaces, but should still match
self.assertEqual(differ.leaf_ratio(left, right), 1.0)
# These have different namespaces, but should still match OK
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.915254237)

def test_different_ratio_modes(self):
node1 = etree.Element("para")
Expand All @@ -485,19 +485,19 @@ def test_different_ratio_modes(self):

# These texts are very different
differ = Differ(ratio_mode="accurate")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.24)
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.3666667)
# However, the quick_ratio doesn't catch that, and thinks they match
differ = Differ(ratio_mode="fast")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.64)
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.7)
# It still realizes these sentences are different, though.
differ = Differ(ratio_mode="fast")
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.4561403508)
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.53731343)
# Faster thinks the first two are the same!
differ = Differ(ratio_mode="faster")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 1.0)
# And that the third is almost the same
differ = Differ(ratio_mode="faster")
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.8771929824)
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.89552238)

# Invalid modes raise error:
with self.assertRaises(ValueError):
Expand Down
2 changes: 1 addition & 1 deletion xmldiff/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def node_text(self, node):
if node in self._text_cache:
return self._text_cache[node]
# Get the texts and the tag as a start
texts = node.xpath("text()")
texts = [node.tag] + node.xpath("text()")

# Then add attributes and values
for tag, value in sorted(self.node_attribs(node).items()):
Expand Down

0 comments on commit d82e1fe

Please sign in to comment.