# params.py (forked from ymfa/seq2seq-summarizer)
from typing import Optional


class Params:
    # Model architecture
    vocab_size: int = 30000
    hidden_size: int = 100  # hidden size of the encoder; by default the decoder size is doubled if the encoder is bidirectional
    dec_hidden_size: Optional[int] = 100  # if set, a matrix will transform enc state into dec state
    embed_size: int = 100
    enc_bidi: bool = True
    enc_attn: bool = True  # decoder has attention over encoder states?
    dec_attn: bool = False  # decoder has attention over previous decoder states?
    pointer: bool = True  # use pointer network (copy mechanism) in addition to word generator?
    out_embed_size: Optional[int] = None  # if set, use an additional layer before decoder output
    tie_embed: bool = True  # tie the decoder output layer to the input embedding layer?

    # Coverage (to turn on/off, change both `enc_attn_cover` and `cover_loss`)
    enc_attn_cover: bool = True  # provide coverage as input when computing enc attn?
    cover_func: str = 'max'  # how to aggregate previous attention distributions: 'sum' or 'max' (sketch below)
    cover_loss: float = 1  # add coverage loss if > 0; weight of the coverage loss relative to the NLL loss
    show_cover_loss: bool = False  # include coverage loss in the loss shown in the progress bar?

    # Regularization
    enc_rnn_dropout: float = 0
    dec_in_dropout: float = 0
    dec_rnn_dropout: float = 0
    dec_out_dropout: float = 0

    # Training
    optimizer: str = 'adam'  # 'adam' or 'adagrad'
    lr: float = 0.001  # learning rate
    adagrad_accumulator: float = 0.1
    lr_decay_step: int = 5  # decay the learning rate every this many epochs
    lr_decay: Optional[float] = None  # if set, multiply the learning rate by this factor at each decay step (sketch below)
    batch_size: int = 64
    n_batches: int = 1000  # how many batches per epoch
    val_batch_size: int = 64
    n_val_batches: int = 20  # how many validation batches per epoch
    n_epochs: int = 75
    pack_seq: bool = True  # use packed sequence to skip PAD inputs?
    forcing_ratio: float = 0.75  # initial probability of using teacher forcing
    partial_forcing: bool = True  # within a sequence, can some steps be teacher forced and others not?
    forcing_decay_type: Optional[str] = 'linear'  # 'linear', 'exp', 'sigmoid', or None (sketch below)
    forcing_decay: float = 0.0001  # decay rate of the teacher forcing schedule
    sample: bool = True  # if True, non-teacher-forced inputs are sampled from the output distribution; otherwise chosen greedily
    grad_norm: float = 1  # use gradient clipping if > 0; max gradient norm
    # note: enabling reinforcement learning can significantly slow down training
    rl_ratio: float = 0  # use mixed objective if > 0; ratio of RL in the loss function
    rl_ratio_power: float = 1  # after each epoch, rl_ratio **= rl_ratio_power; value in (0, 1] (sketch below)
    rl_start_epoch: int = 1  # epoch at which RL starts (a later start can ensure a strong baseline)

    # Data
    embed_file: Optional[str] = 'data/.vector_cache/glove.6B.100d.txt'  # use pre-trained embeddings
    data_path: str = 'data/cnndm.gz'
    val_data_path: Optional[str] = 'data/cnndm.val.gz'
    max_src_len: int = 400  # exclusive of special tokens such as EOS
    max_tgt_len: int = 100  # exclusive of special tokens such as EOS
    truncate_src: bool = True  # truncate to max_src_len? if False, drop examples that are too long
    truncate_tgt: bool = True  # truncate to max_tgt_len? if False, drop examples that are too long

    # Saving model automatically during training
    model_path_prefix: Optional[str] = 'checkpoints/cnndm'
    keep_every_epoch: bool = False  # save all epochs, or only the best and the latest one?

    # Testing
    beam_size: int = 4
    min_out_len: int = 1
    max_out_len: Optional[int] = None
    out_len_in_words: bool = True
    test_data_path: str = 'data/cnndm.test.gz'
    test_sample_ratio: float = 1  # what portion of the test data is used? (1 for all data)
    test_save_results: bool = True
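

# ---------------------------------------------------------------------------
# Illustrative sketches. These are editorial additions, not part of the
# original training code: they show one plausible reading of the parameters
# above. Function names and signatures below are assumptions, not this
# repo's API.
# ---------------------------------------------------------------------------

# A minimal sketch of what `cover_func` controls: how previous attention
# distributions over the source tokens could be aggregated into a coverage
# vector, either element-wise 'sum' or element-wise 'max'.
def _coverage_sketch(prev_attn_dists, cover_func='max'):
    """Aggregate previous attention distributions into a coverage vector."""
    if not prev_attn_dists:
        return None
    n = len(prev_attn_dists[0])
    if cover_func == 'sum':
        return [sum(dist[i] for dist in prev_attn_dists) for i in range(n)]
    if cover_func == 'max':
        return [max(dist[i] for dist in prev_attn_dists) for i in range(n)]
    raise ValueError("cover_func must be 'sum' or 'max'")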
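
# A minimal sketch of the learning-rate schedule described by `lr_decay_step`
# and `lr_decay`, assuming a step-wise decay; the actual training loop is not
# reproduced here.
def _lr_at_epoch_sketch(p: Params, epoch: int) -> float:
    """Learning rate after `epoch` epochs under step-wise decay."""
    if p.lr_decay is None:
        return p.lr
    return p.lr * p.lr_decay ** (epoch // p.lr_decay_step)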
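
# A minimal sketch of the teacher-forcing schedule named by
# `forcing_decay_type`. The exact formulas used by the trainer may differ;
# these are common choices (linear decay, exponential decay, inverse sigmoid).
def _forcing_ratio_sketch(p: Params, step: int) -> float:
    """Teacher forcing probability at a given training step."""
    import math
    if p.forcing_decay_type is None:
        return p.forcing_ratio
    if p.forcing_decay_type == 'linear':
        return max(0.0, p.forcing_ratio - p.forcing_decay * step)
    if p.forcing_decay_type == 'exp':
        return p.forcing_ratio * p.forcing_decay ** step
    if p.forcing_decay_type == 'sigmoid':
        k = 1.0 / p.forcing_decay  # inverse sigmoid decay, as in scheduled sampling
        return p.forcing_ratio * k / (k + math.exp(step / k))
    raise ValueError("forcing_decay_type must be 'linear', 'exp', 'sigmoid', or None")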
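
# A minimal sketch of the RL mixing schedule described in the comments above:
# after each epoch past `rl_start_epoch`, rl_ratio is raised to the power
# `rl_ratio_power`, which moves a ratio in (0, 1) toward 1 over time.
def _rl_ratio_at_epoch_sketch(p: Params, epoch: int) -> float:
    """RL loss weight at a given epoch under the power schedule."""
    rl_ratio = p.rl_ratio
    for _ in range(max(0, epoch - p.rl_start_epoch)):
        rl_ratio **= p.rl_ratio_power
    return rl_ratio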
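
# Minimal usage sketch: the training and decoding scripts are assumed to read
# these attributes directly; overriding a default is plain attribute assignment.
if __name__ == '__main__':
    p = Params()
    p.batch_size = 32  # e.g. shrink batches for a quick local run
    p.rl_ratio = 0.5   # e.g. turn on the mixed RL objective
    print(p.vocab_size, p.batch_size, p.rl_ratio)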