-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_dataset.py
50 lines (40 loc) · 1.4 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
import spacy
from TexSoup import TexSoup
from tqdm import tqdm
# Transformer-based English pipeline (requires a prior
# `python -m spacy download en_core_web_trf`). Loaded once at import time
# because model loading is expensive and `main()` reuses it per article.
nlp = spacy.load('en_core_web_trf')
def main():
    """Build a token-level dependency dataset from TeX articles.

    Reads articles from 'tac.json' (a JSON list of objects with a 'text'
    field containing TeX source), strips TeX markup with TexSoup, parses
    each article with the module-level spaCy pipeline, and writes one
    record per sentence to 'tac_data.json'. Each record holds the
    sentence's tokens, sentence-relative dependency head indices (-1 for
    the root), POS tags, and placeholder 'O'/"none" labels with an empty
    'dep_path' (filled in by downstream tooling, presumably — confirm).

    Sentences longer than 100 tokens are skipped.
    """
    with open('tac.json') as infile:
        data = json.load(infile)

    results = []
    for article in tqdm(data):
        text = article['text']
        try:
            soup = TexSoup(text, tolerance=10)
        except EOFError:
            # Unbalanced '$' math delimiters can still break tolerant
            # parsing; drop them entirely and retry.
            text = text.replace('$', '')
            soup = TexSoup(text)

        # soup.text yields the plain-text fragments of the TeX document.
        text_doc = nlp(''.join(soup.text))
        for sentence in text_doc.sents:
            # Skip overlong sentences BEFORE doing any per-token work
            # (the original built the head-index list first, wasting it).
            if len(sentence) > 100:
                continue
            start = sentence.start
            # Head index of each token relative to the sentence start;
            # a token that heads itself is the syntactic root -> -1.
            indices = [
                -1 if token.head.i == token.i else token.head.i - start
                for token in sentence
            ]
            results.append({
                'tokens': [token.text for token in sentence],
                'labels': ['O' for token in sentence],
                'heads': indices,
                'pos': [token.pos_ for token in sentence],
                'label': "none",
                'dep_path': [],
            })

    with open('tac_data.json', 'w') as outfile:
        json.dump(results, outfile)


if __name__ == "__main__":
    main()