#!/usr/bin/env python3
#
# lextract.py -- A small script to (pre-)process TEI XML files
#
# Copyright (c) 2017 Henning Gebhard
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
lextract.py

A library as well as a script to (pre-)process TEI XML files. It mainly
provides tokenization and text extraction, but it can also fix some
common whitespace problems beforehand.

It is tailored specifically to the needs of the course "Praxis der DH"
and I doubt that it is very useful for anything else without at least
some custom modifications.
"""
import argparse
from os import path, walk

from lxml import etree
import nltk
from nltk.stem.snowball import SnowballStemmer

###############################################################################
# Configuration
PROJECT_DIR = path.dirname(path.abspath(__file__))
NLTK_DIR = path.join(PROJECT_DIR, "nltk_data")
MOD_DIR = path.join(PROJECT_DIR, "edited")
SOURCE_DIR = path.join(PROJECT_DIR, "source")
PLAIN_DIR = path.join(PROJECT_DIR, "plaintext")
REF_DIR = path.join(PROJECT_DIR, "reference")
STE_DIR = path.join(PROJECT_DIR, "stemmed")
XML_DIR = path.join(PROJECT_DIR, "results")
# Tokens consisting solely of one of these strings are discarded.
PUNCTUATION = [",", ".", ";", "!", "?", "-", '"', "'", "``", "''",
               ":", "(", ")", "–"]
###############################################################################

nltk.data.path.append(NLTK_DIR)
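
# Example invocation (illustrative, not part of the original file): with
# TEI files in ./source, the following would fix whitespace, drop <note>
# elements, and write plain text versions to ./plaintext:
#
#     python3 lextract.py source --fix-whitespace --no-notes --write-plaintext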


def load_file(filepath):
    """Load XML source file with lxml and parse it into an etree."""
    with open(filepath, "r") as f:
        tree = etree.parse(f)
    return tree


def fix_whitespace(tree):
    """Fix some common whitespace problems to make the text suitable
    for tokenization.
    """
    text = tree.getroot().find("text")
    for p in text.iter("note", "lb"):
        if p.tag == "note":
            # Insert whitespace at the start of every <note>.
            try:
                p.text = " " + p.text
            except TypeError:
                # If p.text is None, because the <note> immediately
                # contains another tag, just insert a single space.
                p.text = " "
        else:
            # Insert whitespace after every <lb/>.
            try:
                p.tail = " " + p.tail
            except TypeError:
                p.tail = " "
    return tree
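
# Illustrative example (assumed input, not from the original file):
# "<p>Zeile eins<lb/>Zeile zwei</p>" becomes
# "<p>Zeile eins<lb/> Zeile zwei</p>", so the words on either side of
# the line break remain separate tokens after text extraction.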


def remove_notes(tree):
    """Exclude all <note> tags from the tree, keeping their tail text."""
    for note in tree.getroot().iter("note"):
        parent = note.getparent()
        prev = note.getprevious()
        tail = note.tail or ""
        # Re-attach the note's tail text to the preceding sibling (or to
        # the parent's text if the note is the first child), so that no
        # text outside the <note> itself is lost.
        if prev is not None:
            prev.tail = (prev.tail or "") + tail
        else:
            parent.text = (parent.text or "") + tail
        parent.remove(note)
    return tree
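
# Illustrative example (assumed input):
# "<p>Haupttext<note>Anm.</note> und weiter</p>" becomes
# "<p>Haupttext und weiter</p>" -- the note body is dropped, its tail
# text is preserved.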


def extract_ps(root):
    """Extract all the paragraphs from the tree and return their
    text content as a list of token lists (one list per paragraph)."""
    ps = []
    text = root.find("text")
    for p in text.iter("p"):
        s = etree.tostring(p, method="text", encoding="unicode")
        ps.append(tokenize_paragraph(s))
    return ps


def tokenize_paragraph(p):
    """Tokenize a paragraph with nltk and remove redundant whitespace.
    'Tokens' which end up being empty or only punctuation are removed."""
    tokenizer = nltk.data.load("tokenizers/punkt/german.pickle")
    sentences = tokenizer.tokenize(p)
    tokens = []
    for s in sentences:
        tokens.extend(nltk.tokenize.word_tokenize(s))
    # Note: a stripped token can never satisfy isspace(), so test for
    # emptiness instead.
    tokens = [t.strip() for t in tokens
              if t.strip() and t.strip() not in PUNCTUATION]
    return tokens
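
# Illustrative example (assumed input):
#     tokenize_paragraph("Das ist ein Satz. Noch einer!")
# should yield ['Das', 'ist', 'ein', 'Satz', 'Noch', 'einer'],
# with sentence-final punctuation filtered out.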


def write_reference(ps, filename):
    """Write a plain text file with one line per paragraph
    and an index number in parentheses after each token.
    """
    with open(filename, "w") as f:
        for stuff in ps:
            for idx, s in enumerate(stuff):
                if idx != 0:
                    f.write(" ")
                f.write(str(s) + "(" + str(idx) + ")")
            f.write("\n\n")
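
# Illustrative output line for one paragraph (assumed input):
#     Das(0) ist(1) ein(2) Satz(3)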


def write_plain_text(tree, filepath):
    """Take any XML file, get its <text> tag and write its plain text
    content into a corresponding file in the PLAIN_DIR.
    """
    el = tree.getroot().find("text")
    with open(filepath, "w") as f:
        f.write(etree.tostring(el, method="text", encoding="unicode"))


def write_stemmed_plain_text(tree, filename):
    """Write a plain text file with each token of the text replaced
    by its stemmed version.
    """
    stemmer = SnowballStemmer("german")
    plaintext = etree.tostring(
        tree.getroot().find("text"),
        method="text", encoding="utf-8"
    )
    tokenizer = nltk.data.load("tokenizers/punkt/german.pickle")
    sentences = tokenizer.tokenize(plaintext.decode())
    tokens = []
    for s in sentences:
        for word in nltk.tokenize.word_tokenize(s):
            # As in tokenize_paragraph: skip empty and punctuation-only
            # tokens, then stem the rest.
            if word.strip() and word.strip() not in PUNCTUATION:
                tokens.append(stemmer.stem(word))
    with open(filename, "w") as f:
        f.write(" ".join(tokens))
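
# Illustrative example (assumed behavior of the German Snowball
# stemmer): a token like "laufen" would be written out as "lauf".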


def write_xml(ps, tree, filename):
    """Wrap every token of the tree in a <w> tag and write the result
    into a new file.
    """
    raise NotImplementedError
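
# A minimal sketch of what an implementation might look like (untested,
# assumes each <p> can be reduced to its plain text and reuses
# tokenize_paragraph):
#
#     def write_xml(ps, tree, filename):
#         for p in tree.getroot().find("text").iter("p"):
#             tokens = tokenize_paragraph(
#                 etree.tostring(p, method="text", encoding="unicode"))
#             for child in list(p):
#                 p.remove(child)
#             p.text = None
#             for token in tokens:
#                 w = etree.SubElement(p, "w")
#                 w.text = token
#         tree.write(filename)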


# This is executed when the script is called directly via `python3 lextract.py`.
if __name__ == "__main__":
    # Process all the command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("source", help="directory with source xml files")
    parser.add_argument(
        "-n", "--no-notes", action="store_true",
        help="exclude all <note> tags from the source files")
    parser.add_argument(
        "-f", "--fix-whitespace", action="store_true",
        help="preprocess the xml before tokenization")
    parser.add_argument(
        "-s", "--use-stemming", action="store_true",
        help="use stemming on plaintext")
    parser.add_argument(
        "-r", "--write-reference", action="store_true",
        help="write reference text files")
    parser.add_argument(
        "-m", "--write-modified", action="store_true",
        help="write preprocessed xml into a file")
    parser.add_argument(
        "-p", "--write-plaintext", action="store_true",
        help="write plaintext files")
    parser.add_argument(
        "-x", "--write-xml", action="store_true",
        help="write tokenized xml files")
    # Location where to write files
    parser.add_argument(
        "--stemm-dir", help="where to write stemmed plain text files. Implies -s option.")
    parser.add_argument(
        "--mod-dir",
        help="where to create intermediate xml files with fixed whitespace. Implies -f option.")
    parser.add_argument(
        "--ref-dir", help="where to create reference files. Implies -r option.")
    parser.add_argument(
        "--plain-dir", help="where to create plaintext files. Implies -p option.")
    parser.add_argument(
        "--xml-dir", help="where to create tokenized xml files. Implies -x option.")
    args = parser.parse_args()
    opts = {
        "no_notes": args.no_notes,
        # --mod-dir implies -f and -m, as documented in the help text above.
        "fix_whitespace": args.fix_whitespace or bool(args.mod_dir),
        "write_mod": args.write_modified or bool(args.mod_dir),
        "write_stemm": args.use_stemming or bool(args.stemm_dir),
        "write_ref": args.write_reference or bool(args.ref_dir),
        "write_plain": args.write_plaintext or bool(args.plain_dir),
        "write_xml": args.write_xml or bool(args.xml_dir),
        "src_dir": args.source,
        "mod_dir": args.mod_dir or MOD_DIR,
        "ste_dir": args.stemm_dir or STE_DIR,
        "ref_dir": args.ref_dir or REF_DIR,
        "xml_dir": args.xml_dir or XML_DIR,
        "plain_dir": args.plain_dir or PLAIN_DIR,
    }
    # Determine files to process.
    files = []
    filepath = None
    for (dirpath, dirnames, filenames) in walk(opts["src_dir"]):
        files.extend(filenames)
        filepath = dirpath
        break
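    # Note: the early break above means only the top level of the source
    # directory is scanned; subdirectories are ignored.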
    # Process files.
    for filename in files:
        tree = load_file(path.join(filepath, filename))
        # Preprocessing steps
        if opts["no_notes"]:
            tree = remove_notes(tree)
        if opts["fix_whitespace"]:
            tree = fix_whitespace(tree)
        # opts["mod_dir"] always has a default value, so check the
        # write flag rather than the directory.
        if opts["write_mod"]:
            tree.write(path.join(opts["mod_dir"], filename))
        # Processing and output generation
        paragraphs = None
        # Do we actually need to tokenize?
        if opts["write_ref"] or opts["write_xml"]:
            paragraphs = extract_ps(tree)
        if opts["write_plain"]:
            write_plain_text(tree, path.join(opts["plain_dir"], filename + ".txt"))
        if opts["write_stemm"]:
            write_stemmed_plain_text(tree, path.join(opts["ste_dir"], filename + ".txt"))
        if opts["write_ref"] and paragraphs is not None:
            write_reference(paragraphs, path.join(opts["ref_dir"], filename + ".txt"))
        if opts["write_xml"] and paragraphs is not None:
            # Argument order matches the write_xml(ps, tree, filename) signature.
            write_xml(paragraphs, tree, path.join(opts["xml_dir"], filename))