Skip to content

Commit

Permalink
Merge pull request #128 from Shoobx/Tomp0801-pfitzinger-xmlformatter-…
Browse files Browse the repository at this point in the history
…replacing

use_replace tag
  • Loading branch information
regebro authored Jan 4, 2024
2 parents 681b4f9 + 3d0363c commit 7434865
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 47 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ build
docs/build
venv
build
dist
dist
test.ipynb
.venv/
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-yaml
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 22.12.0
rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.0.0
rev: 6.1.0
hooks:
- id: flake8
- repo: https://github.com/regebro/pyroma
rev: '4.1'
rev: '4.2'
hooks:
- id: pyroma
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Changes

- Changed usage of deprecated `pkg_resources` package to `importlib.metadata`.

- A `use_replace` flag was added to the `XMLFormatter` by Thomas Pfitzinger.
It changes text replacement from delete and insert tags to a replace tag.
It's not currently accessible through the CLI; the open question is whether it
is better to add a new formatter name or an option to pass in formatter flags.


2.6.3 (2023-05-21)
------------------
Expand Down
7 changes: 7 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- Added option to XMLFormatter to use replace tags

- In _make_diff_tags, after diffing, neighboring delete/insert diffs are joined into a single replace tag

- the deleted text is added as an attribute ("old-text")

- the inserted text is the element's text
5 changes: 4 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,8 @@ Contributors

* Jacek Chałupka, [email protected]

The diff algorithm is based on "`Change Detection in Hierarchically Structured Information <http://infolab.stanford.edu/c3/papers/html/tdiff3-8/tdiff3-8.html>`_",
* Thomas Pfitzinger, [email protected]

The diff algorithm is based on
"`Change Detection in Hierarchically Structured Information <http://infolab.stanford.edu/c3/papers/html/tdiff3-8/tdiff3-8.html>`_",
and the text diff is using Google's ``diff_match_patch`` algorithm.
1 change: 0 additions & 1 deletion tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,6 @@ def test_with_xmlid(self):
)

def test_change_attribs(self):

left = """<document>
<story firstPageTemplate="FirstPage">
<section xml:id="oldfirst" ref="3" single-ref="3">
Expand Down
82 changes: 62 additions & 20 deletions tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ def test_get_placeholder(self):
replacer = formatting.PlaceholderMaker()
# Get a placeholder:
ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_OPEN, None)
self.assertEqual(ph, "\ue005")
self.assertEqual(ph, "\ue007")
# Do it again:
ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_OPEN, None)
self.assertEqual(ph, "\ue005")
self.assertEqual(ph, "\ue007")
# Get another one
ph = replacer.get_placeholder(etree.Element("tag"), formatting.T_CLOSE, ph)
self.assertEqual(ph, "\ue006")
self.assertEqual(ph, "\ue008")

def test_do_element(self):
replacer = formatting.PlaceholderMaker(["p"], ["b"])
Expand All @@ -34,7 +34,7 @@ def test_do_element(self):

self.assertEqual(
etree.tounicode(element),
"<p>This is a tag with \ue006formatted\ue005 text.</p>",
"<p>This is a tag with \ue008formatted\ue007 text.</p>",
)

replacer.undo_element(element)
Expand All @@ -45,14 +45,14 @@ def test_do_element(self):
element = etree.fromstring(text)
replacer.do_element(element)
result = etree.tounicode(element)
self.assertEqual(result, "<p>This is a tag with \ue007 text.</p>")
self.assertEqual(result, "<p>This is a tag with \ue009 text.</p>")

# Single formatting tags still get two placeholders.
text = "<p>This is a <b/> with <foo/> text.</p>"
element = etree.fromstring(text)
replacer.do_element(element)
result = etree.tounicode(element)
self.assertEqual(result, "<p>This is a \ue009\ue008 with \ue00a text.</p>")
self.assertEqual(result, "<p>This is a \ue00b\ue00a with \ue00c text.</p>")

def test_do_undo_element(self):
replacer = formatting.PlaceholderMaker(["p"], ["b"])
Expand All @@ -63,7 +63,7 @@ def test_do_undo_element(self):
replacer.do_element(element)

self.assertEqual(
element.text, "This \ue005 a \ue006 with \ue008formatted" "\ue007 text."
element.text, "This \ue007 a \ue008 with \ue00aformatted" "\ue009 text."
)

replacer.undo_element(element)
Expand All @@ -79,7 +79,7 @@ def test_do_undo_element_double_format(self):
replacer.do_element(element)

self.assertEqual(
element.text, "This is \ue006doubly \ue008formatted\ue007" "\ue005 text."
element.text, "This is \ue008doubly \ue00aformatted\ue009" "\ue007 text."
)

replacer.undo_element(element)
Expand Down Expand Up @@ -110,17 +110,17 @@ def test_rml_bug(self):
after_diff = """<document xmlns:diff="http://namespaces.shoobx.com/diff">
<section>
<para>
<insert>\ue005</insert>.
\ue007\ue009At Will Employment\ue008\ue006
<insert>\ue007</insert>.
\ue009\ue00bAt Will Employment\ue00a\ue008
.\u201c<insert>New </insert>Text\u201d
</para>
</section>
</document>"""

# The diff formatting will find some text to insert.
delete_attrib = "{%s}delete-format" % formatting.DIFF_NS
replacer.placeholder2tag["\ue006"].element.attrib[delete_attrib] = ""
replacer.placeholder2tag["\ue007"].element.attrib[delete_attrib] = ""
replacer.placeholder2tag["\ue008"].element.attrib[delete_attrib] = ""
replacer.placeholder2tag["\ue009"].element.attrib[delete_attrib] = ""
tree = etree.fromstring(after_diff)
replacer.undo_tree(tree)
result = etree.tounicode(tree)
Expand Down Expand Up @@ -153,7 +153,7 @@ def test_placeholder_overflow(self):

#
self.assertEqual(
element.text, "This \uf904 a \uf905 with \uf907some" "\uf906 text."
element.text, "This \uf906 a \uf907 with \uf909some" "\uf908 text."
)

try:
Expand All @@ -172,8 +172,8 @@ def test_placeholder_overflow(self):
# This should raise an error on a narrow build
self.assertEqual(
element.text,
"This \U00010004 a \U00010005 with \U00010007some"
"\U00010006 text.",
"This \U00010006 a \U00010007 with \U00010009some"
"\U00010008 text.",
)
except ValueError:
if sys.maxunicode > 0x10000:
Expand All @@ -186,8 +186,8 @@ def test_placeholder_overflow(self):


class XMLFormatTests(unittest.TestCase):
def _format_test(self, left, action, expected):
formatter = formatting.XMLFormatter(pretty_print=False)
def _format_test(self, left, action, expected, use_replace=False):
formatter = formatting.XMLFormatter(pretty_print=False, use_replace=use_replace)
result = formatter.format([action], etree.fromstring(left))
self.assertEqual(result, expected)

Expand Down Expand Up @@ -317,6 +317,51 @@ def test_update_text_after_2(self):

self._format_test(left, action, expected)

def test_replace_text_in(self):
left = '<document><node attr="val"/></document>'
action = actions.UpdateTextIn("/document/node", "Text")
expected = START + ' attr="val"><diff:insert>Text</diff:insert>' + END

self._format_test(left, action, expected, use_replace=True)

left = "<document><node>This is a bit of text, right" + END
action = actions.UpdateTextIn("/document/node", "Also a bit of text, rick")
expected = (
START + '><diff:replace old-text="This is">Also</diff:replace>'
' a bit of text, ri<diff:replace old-text="ght">ck'
"</diff:replace>" + END
)

self._format_test(left, action, expected, use_replace=True)

def test_replace_text_after_1(self):
left = "<document><node/><node/></document>"
action = actions.UpdateTextAfter("/document/node[1]", "Text")
expected = START + "/><diff:insert>Text</diff:insert>" "<node/></document>"

self._format_test(left, action, expected, use_replace=True)

def test_replace_text_after_2(self):
left = "<document><node/>This is a bit of text, right</document>"
action = actions.UpdateTextAfter("/document/node", "Also a bit of text, rick")
expected = (
START + '/><diff:replace old-text="This is">Also</diff:replace>'
' a bit of text, ri<diff:replace old-text="ght">ck'
"</diff:replace></document>"
)

self._format_test(left, action, expected, use_replace=True)

def test_replace_complete_text(self):
left = "<document><node>aaaaaaa bbbbbb</node></document>"
action = actions.UpdateTextIn("/document/node", "ccccc dddd eee")
expected = (
START + '><diff:replace old-text="aaaaaaa bbbbbb">ccccc dddd eee'
"</diff:replace>" + END
)

self._format_test(left, action, expected, use_replace=True)


class DiffFormatTests(unittest.TestCase):
def _format_test(self, action, expected):
Expand Down Expand Up @@ -515,7 +560,6 @@ def test_all_actions(self):


class FormatterFileTests(unittest.TestCase):

formatter = None # Override this
maxDiff = None

Expand All @@ -524,7 +568,6 @@ def process(self, left, right):


class XMLFormatterFileTests(FormatterFileTests):

# The XMLFormatter has no text or formatting tags, so
formatter = formatting.XMLFormatter(
pretty_print=False, normalize=formatting.WS_TEXT
Expand All @@ -535,7 +578,6 @@ class XMLFormatterFileTests(FormatterFileTests):


class HTMLFormatterFileTests(FormatterFileTests):

# We use a few tags for the placeholder tests.
# <br/> is intentionally left out, to test an edge case
# with empty non-formatting tags in text.
Expand Down
1 change: 0 additions & 1 deletion tests/test_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@


class PatcherTests(unittest.TestCase):

patcher = Patcher()

def _test(self, start, action, end):
Expand Down
1 change: 0 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def _diff(self, left, right, result):
self.assertEqual("".join(res), result)

def test_lcs(self):

self._diff("ABCDEF", "ABCDEF", "ABCDEF")

self._diff("ABCDEF", "GHIJKL", "")
Expand Down
15 changes: 8 additions & 7 deletions xmldiff/diff_match_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __init__(self):
DIFF_DELETE = -1
DIFF_INSERT = 1
DIFF_EQUAL = 0
DIFF_REPLACE = 2

def diff_main(self, text1, text2, checklines=True, deadline=None):
"""Find the differences between two texts. Simplifies the problem by
Expand Down Expand Up @@ -1135,7 +1136,7 @@ def diff_prettyHtml(self, diffs):
HTML representation.
"""
html = []
for (op, data) in diffs:
for op, data in diffs:
text = (
data.replace("&", "&amp;")
.replace("<", "&lt;")
Expand All @@ -1160,7 +1161,7 @@ def diff_text1(self, diffs):
Source text.
"""
text = []
for (op, data) in diffs:
for op, data in diffs:
if op != self.DIFF_INSERT:
text.append(data)
return "".join(text)
Expand All @@ -1175,7 +1176,7 @@ def diff_text2(self, diffs):
Destination text.
"""
text = []
for (op, data) in diffs:
for op, data in diffs:
if op != self.DIFF_DELETE:
text.append(data)
return "".join(text)
Expand All @@ -1193,7 +1194,7 @@ def diff_levenshtein(self, diffs):
levenshtein = 0
insertions = 0
deletions = 0
for (op, data) in diffs:
for op, data in diffs:
if op == self.DIFF_INSERT:
insertions += len(data)
elif op == self.DIFF_DELETE:
Expand All @@ -1219,7 +1220,7 @@ def diff_toDelta(self, diffs):
Delta text.
"""
text = []
for (op, data) in diffs:
for op, data in diffs:
if op == self.DIFF_INSERT:
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
data = data.encode("utf-8")
Expand Down Expand Up @@ -1707,7 +1708,7 @@ def patch_apply(self, patches, text):
else:
self.diff_cleanupSemanticLossless(diffs)
index1 = 0
for (op, data) in patch.diffs:
for op, data in patch.diffs:
if op != self.DIFF_EQUAL:
index2 = self.diff_xIndex(diffs, index1)
if op == self.DIFF_INSERT: # Insertion
Expand Down Expand Up @@ -2006,7 +2007,7 @@ def __str__(self):
coords2 = str(self.start2 + 1) + "," + str(self.length2)
text = ["@@ -", coords1, " +", coords2, " @@\n"]
# Escape the body of the patch with %xx notation.
for (op, data) in self.diffs:
for op, data in self.diffs:
if op == diff_match_patch.DIFF_INSERT:
text.append("+")
elif op == diff_match_patch.DIFF_DELETE:
Expand Down
Loading

0 comments on commit 7434865

Please sign in to comment.