From 621798331fbbc8b6ce48eff31d27042e7922ea3b Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 16 Oct 2023 10:06:53 +0200 Subject: [PATCH 1/4] Update versions (#123) --- .github/workflows/test.yml | 9 ++------- setup.cfg | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ca0be98..7d85693 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,15 +16,10 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.7', '3.11', 'pypy-3.7'] + python-version: ['3.8', '3.12', 'pypy-3.10'] exclude: - os: windows-latest - python-version: '3.7' - - os: windows-latest - python-version: 'pypy-3.7' - include: - - os: windows-latest - python-version: '3.8' + python-version: 'pypy-3.10' steps: - name: Checkout uses: actions/checkout@v3 diff --git a/setup.cfg b/setup.cfg index e218707..b63980b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,11 +12,11 @@ classifiers = Operating System :: OS Independent Programming Language :: Python Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Programming Language :: Python :: 3 :: Only Programming Language :: Python :: Implementation :: PyPy keywords = xml, html, diff @@ -28,7 +28,7 @@ project_urls = Source Code = https://github.com/Shoobx/xmldiff [options] -python_requires = >=3.7 +python_requires = >=3.8 zip_safe = True include_package_data = True packages = find: From d82e1fef461c4f0b36c3a83936558c9a28842090 Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 16 Oct 2023 10:17:38 +0200 Subject: [PATCH 2/4] Made accurate and standard matching a bit more accurate (#122) Seems to affect fast matching negatively, but that should be acceptable. --- CHANGES.rst | 3 ++- tests/test_diff.py | 38 +++++++++++++++++++------------------- xmldiff/diff.py | 2 +- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7e117d8..57ed36b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changes 2.6.4 (unreleased) ------------------ -- Nothing changed yet. +- Changed the comparison to make accurate and standard more accurate, + although fast gets less accurate as a result. 2.6.3 (2023-05-21) diff --git a/tests/test_diff.py b/tests/test_diff.py index 2947eb9..7433e7e 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -183,12 +183,12 @@ def test_compare_different_leafs(self): left = lefttree.xpath("/document/story/section[2]/para")[0] right = righttree.xpath("/document/story/section[2]/para")[0] - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.75) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.80952380) # These nodes should not be very similar left = lefttree.xpath("/document/story/section[1]/para")[0] right = righttree.xpath("/document/story/section[1]/para")[0] - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45614035087719) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.53731343) def test_compare_different_nodes(self): left = """ @@ -293,7 +293,7 @@ def test_compare_with_xmlid(self): right = differ.right.xpath("/document/story/section[1]")[0] # These are very similar - self.assertEqual(differ.leaf_ratio(left, right), 0.9) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9210526) # And one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0.5) # But different id's, hence 0 as match @@ -312,7 +312,7 @@ def test_compare_with_xmlid(self): # has an xml:id, so they do not match. left = differ.left.xpath("/document/story/section[3]")[0] right = differ.right.xpath("/document/story/section[3]")[0] - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.81818181818) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8666667) self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) @@ -376,7 +376,7 @@ def test_compare_with_uniqueattrs(self): right = differ.right.xpath("/document/story/section[1]")[0] # These are very similar - self.assertEqual(differ.leaf_ratio(left, right), 0.90625) + self.assertEqual(differ.leaf_ratio(left, right), 0.925) # And one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0.5) # But different names, hence 0 as match @@ -395,7 +395,7 @@ def test_compare_with_uniqueattrs(self): # has an name, so they do not match. left = differ.left.xpath("/document/story/section[3]")[0] right = differ.right.xpath("/document/story/section[3]")[0] - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.8387097) self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) @@ -403,9 +403,9 @@ def test_compare_with_uniqueattrs(self): # one of them is not a section, so the uniqueattr does not match left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/subsection[1]")[0] - self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.9638554) self.assertEqual(differ.child_ratio(left, right), 0.5) - self.assertAlmostEqual(differ.node_ratio(left, right), 0.7905694150420949) + self.assertAlmostEqual(differ.node_ratio(left, right), 0.7677947) def test_compare_node_rename(self): left = """ @@ -430,16 +430,16 @@ def test_compare_node_rename(self): left = differ.left.xpath("/document/para[1]")[0] right = differ.right.xpath("/document/section[1]")[0] - # These have different tags, but should still match - self.assertEqual(differ.leaf_ratio(left, right), 1.0) + # These have different tags, so don't match that great any more + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.7441860) # These have different tags, and different attribute value, # but still similar enough left = differ.left.xpath("/document/para[2]")[0] right = differ.right.xpath("/document/section[2]")[0] - # These have different tags, but should still match - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.76190476190476) + # These have different tags, so don't match that great any more + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.6578947) # These have different tags, and different attribute value, # but still similar enough @@ -447,7 +447,7 @@ def test_compare_node_rename(self): right = differ.right.xpath("/document/section[3]")[0] # These are too different - self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45161290322580) + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.4) def test_compare_namespaces(self): left = """ @@ -472,8 +472,8 @@ def test_compare_namespaces(self): "/document/foo:para[1]", namespaces={"foo": "otheruri"} )[0] - # These have different namespaces, but should still match - self.assertEqual(differ.leaf_ratio(left, right), 1.0) + # These have different namespaces, but should still match OK + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.915254237) def test_different_ratio_modes(self): node1 = etree.Element("para") @@ -485,19 +485,19 @@ def test_different_ratio_modes(self): # These texts are very different differ = Differ(ratio_mode="accurate") - self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.24) + self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.3666667) # However, the quick_ratio doesn't catch that, and think they match differ = Differ(ratio_mode="fast") - self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.64) + self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.7) # It still realizes these sentences are different, though. differ = Differ(ratio_mode="fast") - self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.4561403508) + self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.53731343) # Faster thinks the first two are the same! differ = Differ(ratio_mode="faster") self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 1.0) # And that the third is almost the same differ = Differ(ratio_mode="faster") - self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.8771929824) + self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.89552238) # Invalid modes raise error: with self.assertRaises(ValueError): diff --git a/xmldiff/diff.py b/xmldiff/diff.py index 81ac63b..f14e322 100644 --- a/xmldiff/diff.py +++ b/xmldiff/diff.py @@ -223,7 +223,7 @@ def node_text(self, node): if node in self._text_cache: return self._text_cache[node] # Get the texts and the tag as a start - texts = node.xpath("text()") + texts = [node.tag] + node.xpath("text()") # Then add attributes and values for tag, value in sorted(self.node_attribs(node).items()): From b1f29ac976d6c74259f75102b255cc17be8ae188 Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 16 Oct 2023 10:24:28 +0200 Subject: [PATCH 3/4] We can use importlib.metadata now (#124) --- CHANGES.rst | 2 ++ README.rst | 2 ++ xmldiff/main.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 57ed36b..cdf4ec3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,8 @@ Changes - Changed the comparison to make accurate and standard more accurate, although fast gets less accurate as a result. +- Changed usage of deprecated `pkg_resources` package to `importlib.metadata`. + 2.6.3 (2023-05-21) ------------------ diff --git a/README.rst b/README.rst index 200438b..077575e 100644 --- a/README.rst +++ b/README.rst @@ -88,5 +88,7 @@ Contributors * Filip Demski, glamhoth@protonmail.com + * Jacek Chałupka, krunchfrompoland@gmail.com + The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm. diff --git a/xmldiff/main.py b/xmldiff/main.py index 0288d86..7849705 100644 --- a/xmldiff/main.py +++ b/xmldiff/main.py @@ -1,11 +1,11 @@ """All major API points and command-line tools""" -import pkg_resources +from importlib import metadata from argparse import ArgumentParser, ArgumentTypeError from lxml import etree from xmldiff import diff, formatting, patch -__version__ = pkg_resources.require("xmldiff")[0].version +__version__ = metadata.version("xmldiff") FORMATTERS = { "diff": formatting.DiffFormatter, From 681b4f954365ed5958a2e84f3eb098deb10980cc Mon Sep 17 00:00:00 2001 From: Jingren Wang <34462943+wjrforcyber@users.noreply.github.com> Date: Sat, 30 Dec 2023 18:22:33 +0800 Subject: [PATCH 4/4] Link failed, update the reference link to a valid one (#127) --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 077575e..ccf9a4d 100644 --- a/README.rst +++ b/README.rst @@ -90,5 +90,5 @@ Contributors * Jacek Chałupka, krunchfrompoland@gmail.com -The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", +The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm.