diff --git a/.gitignore b/.gitignore index 15a841e..40ab963 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ build docs/build venv build -dist \ No newline at end of file +dist +test.ipynb +.venv/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 082c582..0a2b5e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,18 +1,18 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-yaml - id: trailing-whitespace - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 23.12.1 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8 - repo: https://github.com/regebro/pyroma - rev: '4.1' + rev: '4.2' hooks: - id: pyroma diff --git a/CHANGES.rst b/CHANGES.rst index cdf4ec3..ee81d06 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -9,6 +9,11 @@ Changes - Changed usage of deprecated `pkg_resources` package to `importlib.metadata`. +- A `use_replace` flag was added to the `XMLFormatter` by Thomas Pfitzinger. + It changes text replacement from delete and insert tags to a replace tag. + It's not currently accessible through the CLI; the question is whether it is better + to add a new formatter name, or an option to pass in formatter flags. 
+ 2.6.3 (2023-05-21) ------------------ diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 0000000..56d1156 --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,7 @@ +- Added option to XMLFormatter to use replace tags + +- in _make_diff_tags after diffing, neighboring delete/insert diffs are joined to a replace tag + +- the deleted text is added as an attribute ("old-text") + +- the inserted text is the element's text \ No newline at end of file diff --git a/README.rst b/README.rst index ccf9a4d..f5d6bd5 100644 --- a/README.rst +++ b/README.rst @@ -90,5 +90,8 @@ Contributors * Jacek Chałupka, krunchfrompoland@gmail.com -The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", + * Thomas Pfitzinger, thpfitzinger@web.de + +The diff algorithm is based on +"`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm. diff --git a/tests/test_diff.py b/tests/test_diff.py index 7433e7e..ab5e2d0 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -640,7 +640,6 @@ def test_with_xmlid(self): ) def test_change_attribs(self): - left = """
diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 76f0830..9ae8edf 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -16,13 +16,13 @@ def test_get_placeholder(self): replacer = formatting.PlaceholderMaker() # Get a placeholder: ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_OPEN, None) - self.assertEqual(ph, "\ue005") + self.assertEqual(ph, "\ue007") # Do it again: ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_OPEN, None) - self.assertEqual(ph, "\ue005") + self.assertEqual(ph, "\ue007") # Get another one ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_CLOSE, ph) - self.assertEqual(ph, "\ue006") + self.assertEqual(ph, "\ue008") def test_do_element(self): replacer = formatting.PlaceholderMaker(["p"], ["b"]) @@ -34,7 +34,7 @@ def test_do_element(self): self.assertEqual( etree.tounicode(element), - "

This is a tag with \ue006formatted\ue005 text.

", + "

This is a tag with \ue008formatted\ue007 text.

", ) replacer.undo_element(element) @@ -45,14 +45,14 @@ def test_do_element(self): element = etree.fromstring(text) replacer.do_element(element) result = etree.tounicode(element) - self.assertEqual(result, "

This is a tag with \ue007 text.

") + self.assertEqual(result, "

This is a tag with \ue009 text.

") # Single formatting tags still get two placeholders. text = "

This is a with text.

" element = etree.fromstring(text) replacer.do_element(element) result = etree.tounicode(element) - self.assertEqual(result, "

This is a \ue009\ue008 with \ue00a text.

") + self.assertEqual(result, "

This is a \ue00b\ue00a with \ue00c text.

") def test_do_undo_element(self): replacer = formatting.PlaceholderMaker(["p"], ["b"]) @@ -63,7 +63,7 @@ def test_do_undo_element(self): replacer.do_element(element) self.assertEqual( - element.text, "This \ue005 a \ue006 with \ue008formatted" "\ue007 text." + element.text, "This \ue007 a \ue008 with \ue00aformatted" "\ue009 text." ) replacer.undo_element(element) @@ -79,7 +79,7 @@ def test_do_undo_element_double_format(self): replacer.do_element(element) self.assertEqual( - element.text, "This is \ue006doubly \ue008formatted\ue007" "\ue005 text." + element.text, "This is \ue008doubly \ue00aformatted\ue009" "\ue007 text." ) replacer.undo_element(element) @@ -110,8 +110,8 @@ def test_rml_bug(self): after_diff = """
- \ue005. - \ue007\ue009At Will Employment\ue008\ue006 + \ue007. + \ue009\ue00bAt Will Employment\ue00a\ue008 .\u201cNew Text\u201d
@@ -119,8 +119,8 @@ def test_rml_bug(self): # The diff formatting will find some text to insert. delete_attrib = "{%s}delete-format" % formatting.DIFF_NS - replacer.placeholder2tag["\ue006"].element.attrib[delete_attrib] = "" - replacer.placeholder2tag["\ue007"].element.attrib[delete_attrib] = "" + replacer.placeholder2tag["\ue008"].element.attrib[delete_attrib] = "" + replacer.placeholder2tag["\ue009"].element.attrib[delete_attrib] = "" tree = etree.fromstring(after_diff) replacer.undo_tree(tree) result = etree.tounicode(tree) @@ -153,7 +153,7 @@ def test_placeholder_overflow(self): # self.assertEqual( - element.text, "This \uf904 a \uf905 with \uf907some" "\uf906 text." + element.text, "This \uf906 a \uf907 with \uf909some" "\uf908 text." ) try: @@ -172,8 +172,8 @@ def test_placeholder_overflow(self): # This should raise an error on a narrow build self.assertEqual( element.text, - "This \U00010004 a \U00010005 with \U00010007some" - "\U00010006 text.", + "This \U00010006 a \U00010007 with \U00010009some" + "\U00010008 text.", ) except ValueError: if sys.maxunicode > 0x10000: @@ -186,8 +186,8 @@ def test_placeholder_overflow(self): class XMLFormatTests(unittest.TestCase): - def _format_test(self, left, action, expected): - formatter = formatting.XMLFormatter(pretty_print=False) + def _format_test(self, left, action, expected, use_replace=False): + formatter = formatting.XMLFormatter(pretty_print=False, use_replace=use_replace) result = formatter.format([action], etree.fromstring(left)) self.assertEqual(result, expected) @@ -317,6 +317,51 @@ def test_update_text_after_2(self): self._format_test(left, action, expected) + def test_replace_text_in(self): + left = '' + action = actions.UpdateTextIn("/document/node", "Text") + expected = START + ' attr="val">Text' + END + + self._format_test(left, action, expected, use_replace=True) + + left = "This is a bit of text, right" + END + action = actions.UpdateTextIn("/document/node", "Also a bit of text, rick") + expected = 
( + START + '>Also' + ' a bit of text, rick' + "" + END + ) + + self._format_test(left, action, expected, use_replace=True) + + def test_replace_text_after_1(self): + left = "" + action = actions.UpdateTextAfter("/document/node[1]", "Text") + expected = START + "/>Text" "" + + self._format_test(left, action, expected, use_replace=True) + + def test_replace_text_after_2(self): + left = "This is a bit of text, right" + action = actions.UpdateTextAfter("/document/node", "Also a bit of text, rick") + expected = ( + START + '/>Also' + ' a bit of text, rick' + "
" + ) + + self._format_test(left, action, expected, use_replace=True) + + def test_replace_complete_text(self): + left = "aaaaaaa bbbbbb" + action = actions.UpdateTextIn("/document/node", "ccccc dddd eee") + expected = ( + START + '>ccccc dddd eee' + "" + END + ) + + self._format_test(left, action, expected, use_replace=True) + class DiffFormatTests(unittest.TestCase): def _format_test(self, action, expected): @@ -515,7 +560,6 @@ def test_all_actions(self): class FormatterFileTests(unittest.TestCase): - formatter = None # Override this maxDiff = None @@ -524,7 +568,6 @@ def process(self, left, right): class XMLFormatterFileTests(FormatterFileTests): - # The XMLFormatter has no text or formatting tags, so formatter = formatting.XMLFormatter( pretty_print=False, normalize=formatting.WS_TEXT @@ -535,7 +578,6 @@ class XMLFormatterFileTests(FormatterFileTests): class HTMLFormatterFileTests(FormatterFileTests): - # We use a few tags for the placeholder tests. #
is intentionally left out, to test an edge case # with empty non-formatting tags in text. diff --git a/tests/test_patch.py b/tests/test_patch.py index 62ff76e..702c1fe 100644 --- a/tests/test_patch.py +++ b/tests/test_patch.py @@ -23,7 +23,6 @@ class PatcherTests(unittest.TestCase): - patcher = Patcher() def _test(self, start, action, end): diff --git a/tests/test_utils.py b/tests/test_utils.py index d8f60a5..1089f8a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -111,7 +111,6 @@ def _diff(self, left, right, result): self.assertEqual("".join(res), result) def test_lcs(self): - self._diff("ABCDEF", "ABCDEF", "ABCDEF") self._diff("ABCDEF", "GHIJKL", "") diff --git a/xmldiff/diff_match_patch.py b/xmldiff/diff_match_patch.py index 971761d..bfe59e7 100644 --- a/xmldiff/diff_match_patch.py +++ b/xmldiff/diff_match_patch.py @@ -74,6 +74,7 @@ def __init__(self): DIFF_DELETE = -1 DIFF_INSERT = 1 DIFF_EQUAL = 0 + DIFF_REPLACE = 2 def diff_main(self, text1, text2, checklines=True, deadline=None): """Find the differences between two texts. Simplifies the problem by @@ -1135,7 +1136,7 @@ def diff_prettyHtml(self, diffs): HTML representation. """ html = [] - for (op, data) in diffs: + for op, data in diffs: text = ( data.replace("&", "&") .replace("<", "<") @@ -1160,7 +1161,7 @@ def diff_text1(self, diffs): Source text. """ text = [] - for (op, data) in diffs: + for op, data in diffs: if op != self.DIFF_INSERT: text.append(data) return "".join(text) @@ -1175,7 +1176,7 @@ def diff_text2(self, diffs): Destination text. """ text = [] - for (op, data) in diffs: + for op, data in diffs: if op != self.DIFF_DELETE: text.append(data) return "".join(text) @@ -1193,7 +1194,7 @@ def diff_levenshtein(self, diffs): levenshtein = 0 insertions = 0 deletions = 0 - for (op, data) in diffs: + for op, data in diffs: if op == self.DIFF_INSERT: insertions += len(data) elif op == self.DIFF_DELETE: @@ -1219,7 +1220,7 @@ def diff_toDelta(self, diffs): Delta text. 
""" text = [] - for (op, data) in diffs: + for op, data in diffs: if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. data = data.encode("utf-8") @@ -1707,7 +1708,7 @@ def patch_apply(self, patches, text): else: self.diff_cleanupSemanticLossless(diffs) index1 = 0 - for (op, data) in patch.diffs: + for op, data in patch.diffs: if op != self.DIFF_EQUAL: index2 = self.diff_xIndex(diffs, index1) if op == self.DIFF_INSERT: # Insertion @@ -2006,7 +2007,7 @@ def __str__(self): coords2 = str(self.start2 + 1) + "," + str(self.length2) text = ["@@ -", coords1, " +", coords2, " @@\n"] # Escape the body of the patch with %xx notation. - for (op, data) in self.diffs: + for op, data in self.diffs: if op == diff_match_patch.DIFF_INSERT: text.append("+") elif op == diff_match_patch.DIFF_DELETE: diff --git a/xmldiff/formatting.py b/xmldiff/formatting.py index e7c19a3..640ec15 100644 --- a/xmldiff/formatting.py +++ b/xmldiff/formatting.py @@ -13,6 +13,7 @@ INSERT_NAME = "{%s}insert" % DIFF_NS DELETE_NAME = "{%s}delete" % DIFF_NS +REPLACE_NAME = "{%s}replace" % DIFF_NS RENAME_NAME = "{%s}rename" % DIFF_NS # Flags for whitespace handling in the text aware formatters: @@ -104,9 +105,14 @@ def __init__(self, text_tags=(), formatting_tags=()): delete_close = self.get_placeholder(delete_elem, T_CLOSE, None) delete_open = self.get_placeholder(delete_elem, T_OPEN, delete_close) + replace_elem = etree.Element(REPLACE_NAME) + replace_close = self.get_placeholder(replace_elem, T_CLOSE, None) + replace_open = self.get_placeholder(replace_elem, T_OPEN, replace_close) + self.diff_tags = { "insert": (insert_open, insert_close), "delete": (delete_open, delete_close), + "replace": (replace_open, replace_close), } def get_placeholder(self, element, ttype, close_ph): @@ -228,7 +234,7 @@ def undo_element(self, elem): def undo_tree(self, tree): self.undo_element(tree) - def mark_diff(self, ph, action): + def mark_diff(self, ph, action, attributes=None): entry = 
self.placeholder2tag[ph] if entry.ttype == T_CLOSE: # Close tag, nothing to mark @@ -242,12 +248,22 @@ def mark_diff(self, ph, action): # Formatting element, add a diff attribute action += "-formatting" elem.attrib[f"{{{DIFF_NS}}}{action}"] = "" + if attributes is not None: + for attrib, value in attributes.items(): + elem.attrib[attrib] = value # And make a new placeholder for this new entry: return self.get_placeholder(elem, entry.ttype, entry.close_ph) - def wrap_diff(self, text, action): + def wrap_diff(self, text, action, attributes=None): open_ph, close_ph = self.diff_tags[action] + if attributes is not None and len(attributes) > 0: + entry = self.placeholder2tag[open_ph] + elem = entry.element + elem = deepcopy(elem) + for attrib, value in attributes.items(): + elem.attrib[attrib] = value + open_ph = self.get_placeholder(elem, entry.ttype, entry.close_ph) return open_ph + text + close_ph @@ -295,16 +311,25 @@ class XMLFormatter(BaseFormatter): WS_TEXT normalizes only inside text_tags, WS_TAGS will remove ignorable whitespace between tags, WS_BOTH do both, and WS_NONE will preserve all whitespace. + + The ``use_replace`` flag decides, if a replace tag (with the old text + as an attribute) should be used instead of one delete and one insert tag. """ def __init__( - self, normalize=WS_NONE, pretty_print=True, text_tags=(), formatting_tags=() + self, + normalize=WS_NONE, + pretty_print=True, + text_tags=(), + formatting_tags=(), + use_replace=False, ): # Mapping from placeholders -> structural content and vice versa. 
self.normalize = normalize self.pretty_print = pretty_print self.text_tags = text_tags self.formatting_tags = formatting_tags + self.use_replace = use_replace self.placeholderer = PlaceholderMaker( text_tags=text_tags, formatting_tags=formatting_tags ) @@ -570,6 +595,36 @@ def _stack_pop(): new_diff.append((op, seg)) return new_diff + def _join_delete_insert(self, diffs): + new_diffs = [] + skip_next = False + for i in range(len(diffs) - 1): + if skip_next: + skip_next = False + continue + op, text = diffs[i] + next_op, next_text = diffs[i + 1] + # insert, then delete + if ( + op == diff_match_patch.DIFF_INSERT + and next_op == diff_match_patch.DIFF_DELETE + ): + new_diffs.append((diff_match_patch.DIFF_REPLACE, text, next_text)) + skip_next = True # also skip upcoming delete + # delete, then insert + elif ( + next_op == diff_match_patch.DIFF_INSERT + and op == diff_match_patch.DIFF_DELETE + ): + new_diffs.append((diff_match_patch.DIFF_REPLACE, next_text, text)) + skip_next = True # also skip upcoming insert + else: + new_diffs.append(diffs[i]) + # append last diff, if it shouldn't be skipped + if not skip_next: + new_diffs.append(diffs[-1]) + return new_diffs + def _make_diff_tags(self, left_value, right_value, node, target=None): if bool(self.normalize & WS_TEXT): left_value = utils.cleanup_whitespace(left_value or "").strip() @@ -578,36 +633,46 @@ def _make_diff_tags(self, left_value, right_value, node, target=None): text_diff = diff_match_patch() diff = text_diff.diff_main(left_value or "", right_value or "") text_diff.diff_cleanupSemantic(diff) - diff = self._realign_placeholders(diff) + if self.use_replace: + diff = self._join_delete_insert(diff) cur_child = None if target is None: target = node else: cur_child = node - for op, text in diff: - if op == 0: + for d in diff: + op = d[0] + text = d[1] + if op == diff_match_patch.DIFF_REPLACE: + old_text = d[2] + + if op == diff_match_patch.DIFF_EQUAL: if cur_child is None: node.text = (node.text or "") + text 
else: cur_child.tail = (cur_child.tail or "") + text continue - if op == -1: + attributes = {} + if op == diff_match_patch.DIFF_DELETE: action = "delete" - elif op == 1: + elif op == diff_match_patch.DIFF_INSERT: action = "insert" + elif op == diff_match_patch.DIFF_REPLACE: + action = "replace" + attributes["old-text"] = old_text if self.placeholderer.is_placeholder(text): - ph = self.placeholderer.mark_diff(text, action) + ph = self.placeholderer.mark_diff(text, action, attributes) if cur_child is None: node.text = (node.text or "") + ph else: - new_text = self.placeholderer.wrap_diff(text, action) + new_text = self.placeholderer.wrap_diff(text, action, attributes) if cur_child is None: node.text = (node.text or "") + new_text diff --git a/xmldiff/utils.py b/xmldiff/utils.py index 1b8f943..354f5e3 100644 --- a/xmldiff/utils.py +++ b/xmldiff/utils.py @@ -37,7 +37,6 @@ def breadth_first_traverse(node): # It also skips any items that are equal in the beginning and end, speeding # up the search, and using even less memory. def longest_common_subsequence(left_sequence, right_sequence, eqfn=eq): - start = 0 lend = lslen = len(left_sequence) rend = rslen = len(right_sequence)