This repository was archived by the owner on Sep 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathtest_parser.py
124 lines (89 loc) · 3.16 KB
/
test_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Run unit tests for the ZPar constituency parser.
:author: Nitin Madnani ([email protected])
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import glob
import os
from io import open
from itertools import product
from os.path import abspath, dirname, join
from nose.tools import assert_equal
from zpar import ZPar
_my_dir = abspath(dirname(__file__))
z = None
parser = None
def setUp():
    """
    Set up things we need for the tests.

    Instantiate a ``ZPar`` wrapper pointing at the model directory named
    by the ``ZPAR_MODEL_DIR`` environment variable and load the parser
    model, storing both in module-level globals for the test functions.
    """
    global z, parser
    # Fail early with an explicit message rather than a bare AssertionError
    # so the required environment setup is obvious from the test output.
    assert 'ZPAR_MODEL_DIR' in os.environ, \
        'the ZPAR_MODEL_DIR environment variable must point to the ZPar model directory'
    model_dir = os.environ['ZPAR_MODEL_DIR']
    z = ZPar(model_dir)
    parser = z.get_parser()
def tearDown():
    """
    Clean up after the tests.

    Close the ZPar wrapper (if one was created), reset the module-level
    globals, and remove any ``test*.parse`` output files the tests may
    have written to the examples directory.
    """
    global z, parser
    if z:
        z.close()
    # Reset to None instead of `del`: deleting the module globals would
    # make a later tearDown (e.g. after a failed setUp) raise NameError.
    parser = None
    z = None
    # delete all the files we may have created
    data_dir = abspath(join(_my_dir, '..', 'examples'))
    for f in glob.glob(join(data_dir, 'test*.parse')):
        os.unlink(f)
def check_parse_sentence(tokenize=False, tagged=False):
    """
    Check the ``parse_sentence`` method with and without tokenization
    and with and without pre-tagged input.
    """
    global parser
    # Choose the input form that matches the requested scenario:
    # pre-tagged tokens, raw text to be tokenized, or pre-tokenized text.
    if tagged:
        sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
    elif tokenize:
        sentence = "I'm going to the market."
    else:
        sentence = "I 'm going to the market ."
    correct_output = "(S (NP (PRP I)) (VP (VBP 'm) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. .))"
    # Pre-tagged input goes through the dedicated tagged-sentence API.
    if tagged:
        parsed_sentence = parser.parse_tagged_sentence(sentence)
    else:
        parsed_sentence = parser.parse_sentence(sentence, tokenize=tokenize)
    assert_equal(parsed_sentence, correct_output)
def test_parse_sentence():
    """Yield one parse_sentence check per (tokenize, tagged) combination."""
    # Same order as itertools.product([True, False], [True, False]).
    for do_tokenize in (True, False):
        for is_tagged in (True, False):
            yield check_parse_sentence, do_tokenize, is_tagged
def check_parse_file(tokenize=False, tagged=False):
    """
    Check the ``parse_file`` method with and without tokenization
    and with and without pre-tagged input.

    Parses the matching example input file into a ``.parse`` file in the
    examples directory and compares its lines against the expected trees.
    """
    global parser
    # Pick the example file matching the scenario: pre-tagged input,
    # raw text to be tokenized, or already-tokenized text.
    if tagged:
        prefix = 'test_tagged'
    else:
        if tokenize:
            prefix = 'test'
        else:
            prefix = 'test_tokenized'
    correct_output = ["(S (NP (PRP I)) (VP (VBP am) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. .))",
                      "(SQ (VBP Are) (NP (PRP you)) (VP (VBG going) (S (VP (TO to) (VP (VB come) (PP (IN with) (NP (PRP me))))))) (. ?))"]
    input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
    output_file = abspath(join(_my_dir, '..', 'examples', '{}.parse'.format(prefix)))
    # parse the file
    if not tagged:
        parser.parse_file(input_file, output_file, tokenize=tokenize)
    else:
        parser.parse_tagged_file(input_file, output_file)
    # read the output file and make sure we have the expected output;
    # specify the encoding explicitly rather than relying on the locale
    # default (this module imports io.open for py2/3-compatible I/O).
    with open(output_file, 'r', encoding='utf-8') as outf:
        output = [l.strip() for l in outf]
    assert_equal(output, correct_output)
def test_parse_file():
    """Yield one parse_file check per (tokenize, tagged) combination."""
    # Same order as itertools.product([True, False], [True, False]).
    for do_tokenize in (True, False):
        for is_tagged in (True, False):
            yield check_parse_file, do_tokenize, is_tagged