Skip to content

Commit

Permalink
Fix stringifying lines
Browse files Browse the repository at this point in the history
etree.tostring(chunk, encoding='unicode', method='text') is magic
  • Loading branch information
pletcher committed Jul 26, 2024
1 parent 6e37690 commit e302f1a
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 31 deletions.
62 changes: 32 additions & 30 deletions ajmc/nlp/lemlink/data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, text_url: str, chunk_by: str = "tei:l"):
str_offset = 0

for chunk in self.tree.iterfind(f".//{self.chunk_by}", namespaces=NAMESPACES):
chunk_text = chunk.text
chunk_text = etree.tostring(chunk, encoding='unicode', method='text') # type: ignore

if chunk_text is not None:
t = unicodedata.normalize("NFC", chunk_text)
Expand All @@ -47,6 +47,8 @@ def __init__(self, text_url: str, chunk_by: str = "tei:l"):
)
)
str_offset += len(t)
else:
print(chunk.get('n'))

self.text = "".join(c.text for c in self.chunks)

Expand Down Expand Up @@ -101,35 +103,35 @@ def offsets_to_selector(self, offsets: list[int]):


# %%
from pathlib import Path
from ajmc.nlp.token_classification.data_preparation.hipe_iob import read_lemlink_tsv
from ajmc.nlp.lemlink.data_preparation import TEI2TextMapper

sample_tsv_path = Path("~/Downloads/lemlink-v1.0.beta-test_NOCOMMENT.tsv")

data = read_lemlink_tsv(sample_tsv_path)
data = data.to_dict(orient="list")

mapper = TEI2TextMapper(
"http://raw.githubusercontent.com/gregorycrane/Wolf1807/master/ajax-2019/ajax-lj.xml"
)

for i in range(len(data["ANCHOR_TARGET"])):
if data["ANCHOR_TARGET"][i] != "_":
sample_selector = data["ANCHOR_TARGET"][i]
sample_text = data["ANCHOR_TEXT"][i]
# break

if sample_selector is not None:
offsets = mapper.selector_to_offsets(data["ANCHOR_TARGET"][i])

if offsets is not None:
lines = mapper.lines_for_offsets(offsets)
text = " ".join(l.text for l in lines)
if text != data["ANCHOR_TEXT"][i]:
print(i, text, " |||| ", data["ANCHOR_TEXT"][i])
else:
print(i, "OK")
# from pathlib import Path
# from ajmc.nlp.token_classification.data_preparation.hipe_iob import read_lemlink_tsv
# from ajmc.nlp.lemlink.data_preparation import TEI2TextMapper

# sample_tsv_path = Path("~/Downloads/lemlink-v1.0.beta-test_NOCOMMENT.tsv")

# data = read_lemlink_tsv(sample_tsv_path)
# data = data.to_dict(orient="list")

# mapper = TEI2TextMapper(
# "http://raw.githubusercontent.com/gregorycrane/Wolf1807/master/ajax-2019/ajax-lj.xml"
# )

# for i in range(len(data["ANCHOR_TARGET"])):
# if data["ANCHOR_TARGET"][i] != "_":
# sample_selector = data["ANCHOR_TARGET"][i]
# sample_text = data["ANCHOR_TEXT"][i]
# # break

# if sample_selector is not None:
# offsets = mapper.selector_to_offsets(data["ANCHOR_TARGET"][i])

# if offsets is not None:
# lines = mapper.lines_for_offsets(offsets)
# text = " ".join(l.text for l in lines)
# if text != data["ANCHOR_TEXT"][i]:
# print(i, text, " |||| ", data["ANCHOR_TEXT"][i])
# else:
# print(i, "OK")
# # break

# offsets = mapper.selector_to_offsets(sample_selector)
Expand Down
8 changes: 7 additions & 1 deletion tests/test_nlp/test_lemlink_data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ def test_init(self, mapper):
assert isinstance(mapper.text, str)
assert unicodedata.is_normalized('NFC', mapper.text)

def test_line_799(self, mapper):
l_799 = [chunk for chunk in mapper.chunks if chunk.n == '799'][0]

assert l_799.text is not None
assert l_799.text == 'τὴν ὀλεθρίαν Αἴαντος ἐλπίζει φέρειν.'

def test_lines_for_offsets(self, mapper):
offsets = mapper.selector_to_offsets('tei-l@n=9[0]:tei-l@n=24[34]')
lines = mapper.lines_for_offsets(offsets)
Expand All @@ -25,7 +31,7 @@ def test_lines_for_offsets(self, mapper):
lines = mapper.lines_for_offsets(offsets)

assert len(lines) == 2

for idx, line in enumerate(lines, start=208):
assert line.n == str(idx)

Expand Down

0 comments on commit e302f1a

Please sign in to comment.