forked from andreasvc/disco-dop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsample.prm
119 lines (113 loc) · 5.01 KB
/
sample.prm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Parsing stages, listed in the order they appear; a later stage may name an
# earlier one in `prune` to use that stage's chart for pruning.
stages=[
    dict(
        name='pcfg',  # an identifier, used as filename when writing results
        mode='pcfg',  # use the PCFG CKY parser
        split=True,  # split discontinuous nodes to get a PCFG: VP_2 => {VP*, VP*}
        markorigin=True,  # when splitting nodes, mark origin: VP_2 => {VP*1, VP*2}
    ),
    dict(
        name='plcfrs',
        mode='plcfrs',  # use the agenda-based PLCFRS parser
        prune='pcfg',  # name of previous chart to use to prune parsing of this stage
        k=50,  # number of coarse pcfg derivations to prune with; k=0 => filter only
    ),
    dict(
        name='dop',
        mode='plcfrs',
        prune='plcfrs',  # name of previous chart to use to prune parsing of this stage
        # pruning parameter for the coarse plcfrs chart; k=0 => filter only.
        # NOTE(review): 1e-5 is a fraction, so it cannot be a number of
        # derivations as with k=50 in the previous stage; presumably it is
        # meant as a probability threshold -- confirm against the parser's
        # documentation of `k`.
        k=1e-5,
        dop='doubledop',  # enable DOP mode
        m=1000,  # number of derivations to sample/enumerate
        estimator='rfe',  # choices: rfe, ewe, bon
        objective='mpp',  # choices: mpp, mpd, shortest, sl-dop[-simple]
        # NB: w/shortest derivation, estimator only affects tie breaking.
    ),
],
evalparam='proper.prm', # file name of an EVALB-style parameter file
# train / test sets
# format of the corpus files named in traincorpus / testcorpus:
corpusfmt='export', # choices: export, bracket, discbracket, alpino, tiger
# training corpus: which file to read and how much of it to use
traincorpus={
    'path': 'alpinosample.export',
    'encoding': 'utf-8',
    # sentences in the train corpus may have at most this many words
    'maxwords': 100,
    # length of the training corpus, counted in sentences
    'numsents': 3,
},
# test corpus: which file to read and which sentences of it to parse
testcorpus={
    'path': 'alpinosample.export',
    'encoding': 'utf-8',
    # sentences in the test corpus may have at most this many words
    'maxwords': 100,
    # upper bound on the number of test sentences to parse
    'numsents': 3,
    # when the train & test set are read from the same file, enable this
    # to skip the training sentences to get to the test set.
    'skiptrain': False,
    # number of (additional) sentences to skip between train & test set
    'skip': 0,
},
# handling of punctuation, grammatical function labels and morphology
punct='move', # options:
#   None: leave punctuation as-is
#   'move': re-attach punctuation to appropriate constituents
#   'remove': remove all punctuation
#   'root': attach punctuation under root node
functions=None, # options:
#   None: leave syntactic labels as is
#   'add': concatenate grammatical function to syntactic label,
#       separated by a hyphen: e.g., NP => NP-SBJ
#   'remove': strip away hyphen-separated grammatical function
#       from syntactic label, e.g., NP-SBJ => NP
#   'replace': replace syntactic label with grammatical function,
#       e.g., NP => SBJ
morphology=None, # options:
#   None: use POS tags as preterminals
#   'add': concatenate morphological information to POS tags,
#       e.g., DET/sg.def
#   'replace': use morphological information as preterminal label
#   'between': add node with morphological information between
#       POS tag and word, e.g., (DET (sg.def the))
# apply treebank-specific transformations (state splits) to training set trees
# see source of `discodop.treebanktransforms.transform`
# (example, commented out:)
# transformations=('lassy-func', ),
# set to None here; an example configuration for Negra/Tiger is kept
# commented out directly below.
relationalrealizational=None,
#relationalrealizational=dict( # Negra/Tiger
# # the function labels to treat as adjunction
# adjunctionlabel='MO',
# # functions that are ignored for argument structures
# ignorefunctions=('--', 'CD', 'CP', 'NK'),
# # auxiliaries:
# ignorecategories=(
# 'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF', 'VMPP'),
# morphlevels=1, # percolate morph. features this many levels up
# # percolate only the following features:
# percolatefeatures=set('1 2 3 Sg Pl Akk Acc Dat Nom Gen'.split()),
#),
# postagging: pass None to use tags from treebank.
postagging={
    # tagging method; choices:
    #   unknownword (tags are assigned during parsing),
    #   treetagger, stanford (external taggers)
    'method': 'unknownword',
    # model; with method unknownword the choices are: 4, 6, base;
    # with treetagger / stanford: [filename of external tagger model]
    'model': '4',
    # options for unknown word models:
    # use probs of rare words for unknown words
    'unknownthreshold': 1,
    # add unseen tags for known words. 0 to disable.
    'openclassthreshold': 50,
},
# binarization options
binarization={
    # choices: default, optimal, optimalhead
    'method': 'default',
    # right factored binarization
    # (applicable for non-optimal binarizations)
    'factor': 'right',
    # file with rules for head assignment
    'headrules': 'alpino.headrules',
    # horizontal Markovization: number of siblings of context
    'h': 1,
    # vertical Markovization; v=1 means no additional parent annotation.
    'v': 1,
    # horizontal Markovization: number of siblings of preceding context
    'revh': 0,
    # when v > 1, add parent annotation to POS tags?
    'pospa': False,
    # prepend label of head node to siblings
    'markhead': True,
    # start binarization with unary node
    'leftmostunary': False,
    # end binarization with unary node
    'rightmostunary': False,
    # symbol to add to last node in a binarization, to mark head node
    'tailmarker': '',
    # reverse order for horizontal Markovization
    'revmarkov': False,
    # whether to add fanout markers before binarization, to distinguish
    # them for markovization,
    # e.g., VP|<NP_2-VVFIN> instead of VP|<NP-VVFIN>
    'fanout_marks_before_bin': False,
},
# misc: amount of output and number of CPUs to use
verbosity=2, # 0=silent; 1=summary report; 2=per sentence results; 3=dump derivations/parse trees.
numproc=1, # increase to use multiple CPUs. Set to None to use all CPUs.