-
Notifications
You must be signed in to change notification settings - Fork 0
/
Doc.py
97 lines (80 loc) · 3.07 KB
/
Doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
class Doc:
    """A treebank file parsed into a list of ``Sentence`` objects.

    The file is read as UTF-8 and cut into sentence blocks at runs of blank
    lines. Within each block, metadata is pulled from ``sent_id = ...``,
    ``text = ...`` and ``newdoc id = ...`` lines, and every remaining
    non-comment line is handed to ``Sentence`` as a raw token row. The layout
    appears to follow the CoNLL-U convention (TODO confirm against the data).

    Attributes:
        filepath: the path given to the constructor; ``write`` saves back to it.
        sentences: the parsed ``Sentence`` objects, in file order.
    """

    # Placeholder stored when a metadata line is absent from a block.
    _NOT_FOUND = 'not_found'

    # Compiled once; MULTILINE lets ``$`` anchor at the end of each line.
    _SENTENCE_SPLIT = re.compile(r'\n\n+')
    # ``\S+`` (instead of ``\w+``) so that ids containing '-' or '.' are
    # captured too; every id the narrower pattern matched is matched the same.
    _DOC_ID = re.compile(r'newdoc id\s*=\s*(\S+)\s*$', re.MULTILINE)
    _SENT_ID = re.compile(r'sent_id\s*=\s*(.+)\s*$', re.MULTILINE)
    _TEXT = re.compile(r'text\s*=\s*(.+)\s*$', re.MULTILINE)

    def __init__(self, filepath):
        """Read ``filepath`` and parse every sentence block it contains.

        Args:
            filepath: path of the UTF-8 encoded file to load.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        self.filepath = filepath
        self.sentences = []

        # Context manager so the handle is closed even if reading fails.
        with open(filepath, 'r', encoding='utf-8') as handle:
            content = handle.read()

        for block in self._SENTENCE_SPLIT.split(content):
            # Splitting leaves empty chunks at the start/end of the file.
            if not block.strip():
                continue

            sent_id = self._first_match(self._SENT_ID, block)
            doc_id = self._first_match(self._DOC_ID, block)
            text = self._first_match(self._TEXT, block)

            # Token rows are whatever is left once comment ('#') lines and
            # empty lines are dropped. Filtering the whole block (not just
            # its leading lines) keeps a stray blank first line or a comment
            # placed after the first token from being parsed as a token row.
            rows = [
                line for line in block.splitlines()
                if line and not line.startswith('#')
            ]
            self.sentences.append(Sentence(doc_id, sent_id, text, rows))

    @classmethod
    def _first_match(cls, pattern, block):
        """Return group 1 of the first ``pattern`` hit in ``block``, else the placeholder."""
        found = pattern.search(block)
        return found.group(1) if found else cls._NOT_FOUND

    def write(self):
        """Serialise all sentences and overwrite ``self.filepath`` with the result.

        Each sentence is written as a ``# sent_id`` line, a ``# text`` line,
        one tab-joined line per word and a terminating blank line.
        NOTE: ``doc_id`` is not written back, so ``newdoc id`` lines present
        in the input do not survive a load/write round trip.

        Raises:
            OSError: if the file cannot be opened or written.
        """
        pieces = []
        for sentence in self.sentences:
            pieces.append('# sent_id = ' + sentence.sent_id + '\n')
            pieces.append('# text = ' + sentence.text + '\n')
            pieces.extend(
                '\t'.join(word.get_list()) + '\n' for word in sentence.words
            )
            pieces.append('\n')
        # Build the full text before opening the file: opening in 'w' mode
        # truncates it, so a failure while serialising must happen first.
        content = ''.join(pieces)

        with open(self.filepath, 'w', encoding='utf-8') as handle:
            handle.write(content)
class Sentence:
    """One sentence block: its metadata plus its token rows as ``Word`` objects.

    Attributes:
        text: the sentence text passed in by the caller.
        sent_id: the sentence identifier passed in by the caller.
        sent_address: ``'n'`` followed by the string form of ``sent_id``.
        doc_id: the document identifier passed in by the caller.
        words: one ``Word`` per entry of the ``words`` argument, in order.
    """

    def __init__(self, doc_id, sent_id, text, words):
        """Store the metadata and wrap each raw token row in a ``Word``.

        Args:
            doc_id: identifier of the enclosing document.
            sent_id: identifier of this sentence.
            text: raw sentence text.
            words: iterable of raw token rows, each passed to ``Word``.
        """
        self.text = text
        self.sent_id = sent_id
        self.sent_address = 'n' + str(sent_id)
        self.doc_id = doc_id
        # Every word carries the sentence address so it can build its own.
        self.words = [Word(row, self.sent_address) for row in words]

    def get_head(self):
        """Return the address of the first word whose head is ``'0'``.

        Returns:
            That word's ``address()`` value, or the string ``'Null'`` when no
            word has ``'0'`` as its head.
        """
        root_addresses = (
            token.address() for token in self.words if token.head == '0'
        )
        return next(root_addresses, 'Null')

    def get_raw(self):
        """Return the sentence in its textual file form.

        The result is a ``# sent_id`` line, a ``# text`` line, one tab-joined
        line per word, and a trailing blank line.
        """
        lines = ['# sent_id = ' + self.sent_id, '# text = ' + self.text]
        lines.extend('\t'.join(token.get_list()) for token in self.words)
        # One newline ends the last line, the second is the blank separator.
        return '\n'.join(lines) + '\n\n'
class Word:
    """A single token row, split on tabs into ten named columns.

    Attributes:
        sent_add: address of the owning sentence, as passed to the constructor.
        id, form, lemma, upos, xpos, feats, head, deprel, deps, misc:
            the first ten tab-separated fields of the row, all kept as strings.
        unitword: ``True`` when ``id`` contains a ``'-'`` (presumably a
            multiword-token range such as ``1-2`` -- TODO confirm).
    """

    # Attribute names for the columns, in the order they appear in a row.
    _COLUMNS = (
        'id', 'form', 'lemma', 'upos', 'xpos',
        'feats', 'head', 'deprel', 'deps', 'misc',
    )

    def __init__(self, word, sa):
        """Parse one raw row.

        Args:
            word: the tab-separated token row.
            sa: address of the sentence this word belongs to.

        Raises:
            IndexError: if the row has fewer than ten tab-separated fields.
        """
        cells = word.split('\t')
        self.sent_add = sa
        # Indexing (rather than unpacking) ignores any fields past the tenth.
        for position, column in enumerate(self._COLUMNS):
            setattr(self, column, cells[position])
        self.unitword = '-' in self.id

    def get_list(self):
        """Return the ten column values as a list, in row order."""
        return [getattr(self, column) for column in self._COLUMNS]

    def address(self):
        """Return the sentence address and this word's id joined by ``'-'``."""
        return '-'.join((self.sent_add, self.id))