-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathunzip_aocp.py
250 lines (195 loc) · 8.29 KB
/
unzip_aocp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
"""
Unzip the OpenITI AOCP zip files
Each zip file in turn contains two zip files:
one containing the teixml export from eScriptorium,
the other containing the openitimarkdown export.
The `main()` function unzips this zip file
and merges all OpenITI mARkdown page transcriptions
into one single OpenITI mARkdown text file.
"""
import os
import re
import shutil
import zipfile
from openiti.helper.funcs import natural_sort
yml_template = """\
00#VERS#LENGTH###:
00#VERS#CLENGTH##:
00#VERS#URI######: {}
80#VERS#BASED####: permalink, permalink, permalink
80#VERS#COLLATED#: permalink, permalink, permalink
80#VERS#LINKS####: permalink, permalink, permalink
90#VERS#ANNOTATOR: the name of the annotator (latin characters; please
use consistently)
90#VERS#COMMENT##: This text was OCR'ed as part of the first phase of the OpenITI AOCP project,
generously funded by the Andrew W. Mellon Foundation.
This file was created on {}
by eScriptorium version {}.
90#VERS#DATE#####: YYYY-MM-DD
90#VERS#ISSUES###: UNCORRECTED_OCR
"""
def merge_md(zip_fp, uri_folder):
"""Merge all page transcription OpenITI mARkdown files into a single mARkdown file
Args:
zip_fp (str): path to the zip file containing all page transcriptions
uri_folder (str): path to the destination folder (named after the text's version URI)
Returns:
None
"""
# unzip the zip file:
folder = unzip_text_folder(zip_fp, uri_folder, file_type="md")
# merge all separate markdown files (one file per page):
full_text = ""
for fn in natural_sort(os.listdir(folder)):
if not fn.endswith(".mARkdown"):
continue
#print(fn)
# read the text file
fp = os.path.join(folder, fn)
with open(fp, mode="r", encoding="utf-8") as file:
page_text = file.read()
# extract the page number from the filename:
page_no = re.findall("\d+", fn)[-1]
page_no = "\nPageV00P{:03d}\n".format(int(page_no))
# remove the metadata header from the text:
header, splitter, page_text = re.split("(#META#Header#End#[\r\n]+)", page_text, maxsplit=1)
# extract the source image filename from the filename:
img_fn = re.findall("#META# IMAGE FILENAME: (.+)", header)[0]
img_link = "\n![page image](img/{})\n\n".format(img_fn)
# add the page text to the full_text:
full_text += img_link + page_text + page_no
# extract the page-specific sections from the header:
header = re.sub("#META# IMAGE.+[\r\n]+", "", header)
header += splitter
# add the metadata header to the text
full_text = header + full_text
# stor the text file with temporary ".txt" extension
# (in order not to overwrite the original folder):
outfp = uri_folder+".txt"
with open(outfp, mode="w", encoding="utf-8") as file:
file.write(full_text)
# Create the yml file:
yml_fp = uri_folder+".yml"
with open(yml_fp, mode="w", encoding="utf-8") as file:
uri = os.path.basename(uri_folder)
created_date = re.findall("CREATED: (.+)", header)[0]
created_by = re.findall("CREATED BY: (.+)", header)[0]
file.write(yml_template.format(uri, created_date, created_by))
def merge_tei(zip_fp, uri_folder):
"""Merge all page transcription TEI XML files into a single TEI XML file
Args:
zip_fp (str): path to the zip file containing all page transcriptions
uri_folder (str): path to the destination folder (named after the text's version URI)
Returns:
None
"""
# unzip the zip file:
folder = unzip_text_folder(zip_fp, uri_folder, file_type="tei")
print("write code to unzip tei xml zips")
# merge all separate tei xml files (one file per page):
full_text = ""
for fn in natural_sort(os.listdir(folder)):
if not fn.endswith(".xml"):
continue
print(fn)
# read the text file
fp = os.path.join(folder, fn)
with open(fp, mode="r", encoding="utf-8") as file:
page_text = file.read()
# extract the page number from the filename:
page_no = re.findall("\d+", fn)[-1]
page_no = '\n <pb n="{}"/>\n'.format(page_no)
# remove the metadata header from the text:
split_regex = "(<body>[\r\n ]+<div>[\r\n ]+<p> *)"
header, splitter, page_text = re.split(split_regex, page_text, maxsplit=1)
# strip off the closing p, div, TEI, body and text tags:
footer_regex = " *</p>\n[\r\n ]+</div>[\r\n ]+</body>[\r\n ]+</text>[\r\n ]+</TEI>[\r\n ]*"
footer = re.findall(footer_regex, page_text)[0]
page_text = page_text[:-len(footer)]
# extract the source image filename from the filename:
img_fn = re.findall("IMAGE FILENAME: (.+)", header)[0]
img_width = re.findall("IMAGE WIDTH: (.+)", header)[0]
img_height = re.findall("IMAGE HEIGHT: (.+)", header)[0]
img_link = '\n <graphic url="{}" width={} height={} />\n'.format(img_fn, img_width, img_height)
# add the page text to the full_text:
full_text += page_no + img_link + page_text
# remove the page-specific sections from the header:
header = re.sub("IMAGE.+[\r\n]+", "", header)
header += splitter
# add the metadata header to the text
full_text = header + full_text + footer
# store the xml file:
outfp = uri_folder+".xml"
with open(outfp, mode="w", encoding="utf-8") as file:
file.write(full_text)
def unzip_text_folder(fp, uri_folder, file_type="tei"):
"""Unzip the folder containing the page transcriptions
The folder will get the name of the file type ("tei", "md")
Args:
fp (str): path to the zip file
uri_folder (str): path to the parent folder of the output folder
file_type (str): file type of the transcription files ("tei", "md")
Returns:
str
"""
dest_folder = os.path.join(uri_folder, file_type)
if not os.path.exists(dest_folder):
os.mkdir(dest_folder)
with zipfile.ZipFile(fp, mode="r") as archive:
archive.extractall(path=dest_folder)
return dest_folder
def unzip_container(fp):
"""Unzip the main zip file to a folder with the URI as name
Returns:
str
"""
uri_folder = fp[:-4]
if os.path.exists(uri_folder):
os.remove(uri_folder)
with zipfile.ZipFile(fp, mode="r") as archive:
#for fn in archive.namelist():
# print(fn)
archive.extractall(path=uri_folder)
#os.remove(fp)
return uri_folder
def main(zip_fp, delete_zip=False):
"""Extract the OpenITI mARkdown page transcriptions and merge them into one file.
Args:
zip_fp (str): path to the zip file
delete_zip (bool): if True, the original zip file will be deleted
after the text was extracted from it
Returns:
None
"""
# if a folder is passed as first argument instead of a zip file, convert all zip files in that folder:
if not zip_fp.endswith(".zip"):
for fn in os.listdir(zip_fp):
print(fn)
fp = os.path.join(zip_fp, fn)
if fp.endswith(".zip"):
main(fp, delete_zip=delete_zip)
return
# unzip the container zip file, which contains two nested zips
# (the container will be the URI of the text):
uri_folder = unzip_container(zip_fp)
# find the zip file containing the openiti markdown page transcriptions:
for root, dirs, files in os.walk(uri_folder):
for fn in files:
fp = os.path.join(root, fn)
print(fp)
if fp.endswith(".zip"):
if "openitimarkdown" in fn:
# merge all openiti markdown page transcriptions into one text file:
merge_md(fp, uri_folder)
# better to convert tei from OpenITI mARkdown directly
#elif "teixml" in fn:
# merge_tei(fp, uri_folder)
# remove the original zip file and the temporary unzipped folder:
shutil.rmtree(uri_folder)
if delete_zip:
os.remove(zip_fp)
# remove the temporary extension of the OpenITI mARkdown file:
os.rename(uri_folder+".txt", uri_folder)
if __name__ == "__main__":
fp = r"0650QaysRazi.Mucjam.AOCP1017-per1.zip"
main(".")