Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Made accurate and standard matching a bit more accurate #122

Merged
merged 1 commit into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ Changes
2.6.4 (unreleased)
------------------

- Nothing changed yet.
- Changed the comparison to make accurate and standard more accurate,
although fast gets less accurate as a result.


2.6.3 (2023-05-21)
Expand Down
38 changes: 19 additions & 19 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,12 @@ def test_compare_different_leafs(self):
left = lefttree.xpath("/document/story/section[2]/para")[0]
right = righttree.xpath("/document/story/section[2]/para")[0]

self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.75)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.80952380)

# These nodes should not be very similar
left = lefttree.xpath("/document/story/section[1]/para")[0]
right = righttree.xpath("/document/story/section[1]/para")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45614035087719)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.53731343)

def test_compare_different_nodes(self):
left = """<document>
Expand Down Expand Up @@ -293,7 +293,7 @@ def test_compare_with_xmlid(self):
right = differ.right.xpath("/document/story/section[1]")[0]

# These are very similar
self.assertEqual(differ.leaf_ratio(left, right), 0.9)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9210526)
# And one out of two children in common
self.assertEqual(differ.child_ratio(left, right), 0.5)
# But different ids, hence 0 as match
Expand All @@ -312,7 +312,7 @@ def test_compare_with_xmlid(self):
# has an xml:id, so they do not match.
left = differ.left.xpath("/document/story/section[3]")[0]
right = differ.right.xpath("/document/story/section[3]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.81818181818)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8666667)
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

Expand Down Expand Up @@ -376,7 +376,7 @@ def test_compare_with_uniqueattrs(self):
right = differ.right.xpath("/document/story/section[1]")[0]

# These are very similar
self.assertEqual(differ.leaf_ratio(left, right), 0.90625)
self.assertEqual(differ.leaf_ratio(left, right), 0.925)
# And one out of two children in common
self.assertEqual(differ.child_ratio(left, right), 0.5)
# But different names, hence 0 as match
Expand All @@ -395,17 +395,17 @@ def test_compare_with_uniqueattrs(self):
# has a name, so they do not match.
left = differ.left.xpath("/document/story/section[3]")[0]
right = differ.right.xpath("/document/story/section[3]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8387097)
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

# Now these are structurally similar, have the same name, but
# one of them is not a section, so the uniqueattr does not match
left = differ.left.xpath("/document/story/section[1]")[0]
right = differ.right.xpath("/document/story/subsection[1]")[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9638554)
self.assertEqual(differ.child_ratio(left, right), 0.5)
self.assertAlmostEqual(differ.node_ratio(left, right), 0.7905694150420949)
self.assertAlmostEqual(differ.node_ratio(left, right), 0.7677947)

def test_compare_node_rename(self):
left = """<document>
Expand All @@ -430,24 +430,24 @@ def test_compare_node_rename(self):
left = differ.left.xpath("/document/para[1]")[0]
right = differ.right.xpath("/document/section[1]")[0]

# These have different tags, but should still match
self.assertEqual(differ.leaf_ratio(left, right), 1.0)
# These have different tags, so don't match that great any more
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.7441860)

# These have different tags, and different attribute value,
# but still similar enough
left = differ.left.xpath("/document/para[2]")[0]
right = differ.right.xpath("/document/section[2]")[0]

# These have different tags, but should still match
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.76190476190476)
# These have different tags, so don't match that great any more
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.6578947)

# These have different tags, and different attribute value,
# but still similar enough
left = differ.left.xpath("/document/para[3]")[0]
right = differ.right.xpath("/document/section[3]")[0]

# These are too different
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45161290322580)
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.4)

def test_compare_namespaces(self):
left = """<document>
Expand All @@ -472,8 +472,8 @@ def test_compare_namespaces(self):
"/document/foo:para[1]", namespaces={"foo": "otheruri"}
)[0]

# These have different namespaces, but should still match
self.assertEqual(differ.leaf_ratio(left, right), 1.0)
# These have different namespaces, but should still match OK
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.915254237)

def test_different_ratio_modes(self):
node1 = etree.Element("para")
Expand All @@ -485,19 +485,19 @@ def test_different_ratio_modes(self):

# These texts are very different
differ = Differ(ratio_mode="accurate")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.24)
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.3666667)
# However, the quick_ratio doesn't catch that, and thinks they match
differ = Differ(ratio_mode="fast")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.64)
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.7)
# It still realizes these sentences are different, though.
differ = Differ(ratio_mode="fast")
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.4561403508)
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.53731343)
# Faster thinks the first two are the same!
differ = Differ(ratio_mode="faster")
self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 1.0)
# And that the third is almost the same
differ = Differ(ratio_mode="faster")
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.8771929824)
self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.89552238)

# Invalid modes raise error:
with self.assertRaises(ValueError):
Expand Down
2 changes: 1 addition & 1 deletion xmldiff/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def node_text(self, node):
if node in self._text_cache:
return self._text_cache[node]
# Get the texts and the tag as a start
texts = node.xpath("text()")
texts = [node.tag] + node.xpath("text()")

# Then add attributes and values
for tag, value in sorted(self.node_attribs(node).items()):
Expand Down