Skip to content

Commit

Permalink
Fix stringifying lines
Browse files Browse the repository at this point in the history
etree.tostring(chunk, encoding='unicode', method='text') is magic
  • Loading branch information
pletcher committed Jul 26, 2024
1 parent 6e37690 commit e302f1a
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 31 deletions.
62 changes: 32 additions & 30 deletions ajmc/nlp/lemlink/data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, text_url: str, chunk_by: str = "tei:l"):
str_offset = 0

for chunk in self.tree.iterfind(f".//{self.chunk_by}", namespaces=NAMESPACES):
chunk_text = chunk.text
chunk_text = etree.tostring(chunk, encoding='unicode', method='text') # type: ignore

if chunk_text is not None:
t = unicodedata.normalize("NFC", chunk_text)
Expand All @@ -47,6 +47,8 @@ def __init__(self, text_url: str, chunk_by: str = "tei:l"):
)
)
str_offset += len(t)
else:
print(chunk.get('n'))

self.text = "".join(c.text for c in self.chunks)

Expand Down Expand Up @@ -101,35 +103,35 @@ def offsets_to_selector(self, offsets: list[int]):


# %%
from pathlib import Path
from ajmc.nlp.token_classification.data_preparation.hipe_iob import read_lemlink_tsv
from ajmc.nlp.lemlink.data_preparation import TEI2TextMapper

sample_tsv_path = Path("~/Downloads/lemlink-v1.0.beta-test_NOCOMMENT.tsv")

data = read_lemlink_tsv(sample_tsv_path)
data = data.to_dict(orient="list")

mapper = TEI2TextMapper(
"http://raw.githubusercontent.com/gregorycrane/Wolf1807/master/ajax-2019/ajax-lj.xml"
)

for i in range(len(data["ANCHOR_TARGET"])):
if data["ANCHOR_TARGET"][i] != "_":
sample_selector = data["ANCHOR_TARGET"][i]
sample_text = data["ANCHOR_TEXT"][i]
# break

if sample_selector is not None:
offsets = mapper.selector_to_offsets(data["ANCHOR_TARGET"][i])

if offsets is not None:
lines = mapper.lines_for_offsets(offsets)
text = " ".join(l.text for l in lines)
if text != data["ANCHOR_TEXT"][i]:
print(i, text, " |||| ", data["ANCHOR_TEXT"][i])
else:
print(i, "OK")
# from pathlib import Path
# from ajmc.nlp.token_classification.data_preparation.hipe_iob import read_lemlink_tsv
# from ajmc.nlp.lemlink.data_preparation import TEI2TextMapper

# sample_tsv_path = Path("~/Downloads/lemlink-v1.0.beta-test_NOCOMMENT.tsv")

# data = read_lemlink_tsv(sample_tsv_path)
# data = data.to_dict(orient="list")

# mapper = TEI2TextMapper(
# "http://raw.githubusercontent.com/gregorycrane/Wolf1807/master/ajax-2019/ajax-lj.xml"
# )

# for i in range(len(data["ANCHOR_TARGET"])):
# if data["ANCHOR_TARGET"][i] != "_":
# sample_selector = data["ANCHOR_TARGET"][i]
# sample_text = data["ANCHOR_TEXT"][i]
# # break

# if sample_selector is not None:
# offsets = mapper.selector_to_offsets(data["ANCHOR_TARGET"][i])

# if offsets is not None:
# lines = mapper.lines_for_offsets(offsets)
# text = " ".join(l.text for l in lines)
# if text != data["ANCHOR_TEXT"][i]:
# print(i, text, " |||| ", data["ANCHOR_TEXT"][i])
# else:
# print(i, "OK")
# # break

# offsets = mapper.selector_to_offsets(sample_selector)
Expand Down
8 changes: 7 additions & 1 deletion tests/test_nlp/test_lemlink_data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ def test_init(self, mapper):
assert isinstance(mapper.text, str)
assert unicodedata.is_normalized('NFC', mapper.text)

def test_line_799(self, mapper):
l_799 = [chunk for chunk in mapper.chunks if chunk.n == '799'][0]

assert l_799.text is not None
assert l_799.text == 'τὴν ὀλεθρίαν Αἴαντος ἐλπίζει φέρειν.'

def test_lines_for_offsets(self, mapper):
offsets = mapper.selector_to_offsets('tei-l@n=9[0]:tei-l@n=24[34]')
lines = mapper.lines_for_offsets(offsets)
Expand All @@ -25,7 +31,7 @@ def test_lines_for_offsets(self, mapper):
lines = mapper.lines_for_offsets(offsets)

assert len(lines) == 2

for idx, line in enumerate(lines, start=208):
assert line.n == str(idx)

Expand Down

0 comments on commit e302f1a

Please sign in to comment.