# sample.prm

# Parsing stages, one dict per stage. A stage refers to the chart of an
# earlier stage by that stage's `name`, through its `prune` option.
stages=[
  dict(
    name='pcfg',  # an identifier, used as filename when writing results
    mode='pcfg',  # use the PCFG CKY parser
    split=True,  # split discontinuous nodes to get a PCFG: VP_2 => {VP*, VP*}
    markorigin=True,  # when splitting nodes, mark origin: VP_2 => {VP*1, VP*2}
  ),
  dict(
    name='plcfrs',
    mode='plcfrs',  # use the agenda-based PLCFRS parser
    prune='pcfg',  # name of previous chart to use to prune parsing of this stage
    k=50,  # number of coarse pcfg derivations to prune with; k=0 => filter only
  ),
  dict(
    name='dop',
    mode='plcfrs',
    prune='plcfrs',  # name of previous chart to use to prune parsing of this stage
    k=1e-5,  # pruning parameter w.r.t. the coarse plcfrs chart; k=0 => filter only.
        # NB: 1e-5 is a fraction, not a count of derivations; presumably
        # values 0 < k < 1 act as a probability threshold -- confirm in the
        # discodop parameter documentation.
    dop='doubledop',  # enable DOP mode
    m=1000,  # number of derivations to sample/enumerate
    estimator='rfe',  # choices: rfe, ewe, bon
    objective = 'mpp',  # choices: mpp, mpd, shortest, sl-dop[-simple]
        # NB: w/shortest derivation, estimator only affects tie breaking.
  ),
],

evalparam='proper.prm',  # EVALB-style parameter file
# train / test sets
corpusfmt='export',  # choices: export, bracket, discbracket, alpino, tiger
traincorpus=dict(
	path='alpinosample.export',
	encoding='utf-8',
	maxwords=100,  # max number of words for sentences in train corpus
	numsents=3,  # length (sents) of training corpus
),
# NOTE(review): the test corpus below reads the same file as the train corpus
# while skiptrain=False and skip=0, so the sentences that get parsed appear to
# be the training sentences themselves -- presumably intended for this small
# sample; enable skiptrain for a held-out test set.
testcorpus=dict(
	path='alpinosample.export',
	encoding='utf-8',
	maxwords=100,  # max number of words for sentences in test corpus
	numsents=3,  # (max) number of test sentences to parse
	skiptrain=False,  # when the train & test set are read from the same file,
		# enable this to skip the training sentences to get to the test set.
	skip=0,  # skip (additional) sentences between train & test set
),

punct='move',  # options:
#	None: leave punctuation as-is
#	'move': re-attach punctuation to appropriate constituents
#	'remove': remove all punctuation
#	'root': attach punctuation under root node
functions=None,  # options:
#	None: leave syntactic labels as-is
#	'add': concatenate grammatical function to syntactic label,
#		separated by a hyphen: e.g., NP => NP-SBJ
#	'remove': strip away hyphen-separated grammatical function
#		from syntactic label, e.g., NP-SBJ => NP
#	'replace': replace syntactic label with grammatical function,
#		e.g., NP => SBJ
morphology=None,  # options:
#	None: use POS tags as preterminals
#	'add': concatenate morphological information to POS tags,
#		e.g., DET/sg.def
#	'replace': use morphological information as preterminal label
#	'between': add node with morphological information between
#		POS tag and word, e.g., (DET (sg.def the))

# apply treebank-specific transformations (state splits) to training set trees;
# see the source of `discodop.treebanktransforms.transform`.
# (example, commented out; uncomment to use it)
# transformations=('lassy-func', ),
# NOTE(review): None presumably turns the relational-realizational
# transformation off; the commented-out dict below is an example
# configuration for Negra/Tiger.
relationalrealizational=None,
#relationalrealizational=dict(  # Negra/Tiger
#	# the function labels to treat as adjunction
#	adjunctionlabel='MO',
#	# functions that are ignored for argument structures
#	ignorefunctions=('--', 'CD', 'CP', 'NK'),
#		# auxiliaries:
#	ignorecategories=(
#		'VAFIN', 'VAIMP', 'VAINF', 'VAPP', 'VMFIN', 'VMINF', 'VMPP'),
#	morphlevels=1,  # percolate morph. features this many levels up
#	# percolate only the following features:
#	percolatefeatures=set('1 2 3 Sg Pl Akk Acc Dat Nom Gen'.split()),
#),

# postagging: pass None to use tags from treebank.
postagging=dict(
    # choices: unknownword (assign tags during parsing),
    #    treetagger, stanford (external taggers)
    method='unknownword',
    # choices when method is unknownword: 4, 6, base;
    # for treetagger / stanford: [filename of external tagger model]
    model='4',
    # options for unknown word models:
    unknownthreshold=1,  # use probs of rare words for unknown words
    openclassthreshold=50,  # add unseen tags for known words. 0 to disable.
),

# binarization options
binarization=dict(
	method='default',  # choices: default, optimal, optimalhead
	factor='right',  # right-factored binarization
		# (applicable for non-optimal binarizations)
	headrules='alpino.headrules',  # file with rules for head assignment
	h=1,  # horizontal Markovization: number of siblings of context
	v=1,  # vertical Markovization; v=1 means no additional parent annotation.
	revh=0,  # horizontal Markovization: number of siblings of preceding context
	pospa=False,  # when v > 1, add parent annotation to POS tags?
	markhead=True,  # prepend label of head node to siblings
	leftmostunary=False,  # start binarization with unary node
	rightmostunary=False,  # end binarization with unary node
	tailmarker='',  # symbol to add to last node in a binarization,
		# to mark head node
	revmarkov=False,  # reverse order for horizontal Markovization
	fanout_marks_before_bin=False,  # whether to add fanout markers before
		# binarization, to distinguish them for markovization,
		# e.g., VP|<NP_2-VVFIN> instead of VP|<NP-VVFIN>
),

# misc
# verbosity levels: 0=silent; 1=summary report; 2=per sentence results;
# 3=dump derivations/parse trees.
verbosity=2,
numproc=1,  # increase to use multiple CPUs. Set to None to use all CPUs.