# Workflow file for the run "draft feature for max length padding/truncating #326".
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open the file
# in an editor that reveals hidden Unicode characters (see GitHub's "bidirectional Unicode characters" notice).
# CI workflow: installs the package and its dependencies, lints the tree with
# flake8, and runs the pytest suite on every push and pull request.
name: Lint & Tests

on: [push, pull_request]

jobs:
  lint-and-tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML keeps the version as a string; an unquoted 3.10
        # would be parsed as the float 3.1.
        python-version: ["3.8"]  # build only for 3.8 for now

    steps:
      # v2 of these actions targets the retired Node 12 runtime; use the
      # currently maintained major versions.
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade setuptools
          pip install -e .
          pip install -r requirements.opt.txt
          # Lint/test tooling is pinned to exact versions.
          pip install flake8==4.0.1
          pip install pytest==7.0.1 pytest-flake8==1.1.1
          # requirements.txt is optional; install it only when present.
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          flake8 --max-line-length 120 .
      - name: Unit tests
        run: |
          python -m pytest
# NOTE: all of the steps below are commented out because they are broken in FoTraNMT.
# - name: Test vocabulary build | |
# run: | | |
# python onmt/bin/build_vocab.py \ | |
# -config data/data.yaml \ | |
# -save_data /tmp/onmt \ | |
# -n_sample 5000 \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# && rm -rf /tmp/sample | |
# - name: Test vocabulary build with features | |
# run: | | |
# python onmt/bin/build_vocab.py \ | |
# -config data/features_data.yaml \ | |
# -save_data /tmp/onmt_feat \ | |
# -src_vocab /tmp/onmt_feat.vocab.src \ | |
# -tgt_vocab /tmp/onmt_feat.vocab.tgt \ | |
# -src_feats_vocab '{"feat0": "/tmp/onmt_feat.vocab.feat0"}' \ | |
# -n_sample -1 \ | |
# && rm -rf /tmp/sample | |
# - name: Test field/transform dump | |
# run: | | |
# # The dumped fields are used later when testing tools | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -save_data /tmp/onmt.train.check \ | |
# -dump_fields \ | |
# -dump_transforms \ | |
# -n_sample 30 \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 | |
# - name: Test RNN training | |
# run: | | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -rnn_size 2 \ | |
# -batch_size 10 \ | |
# -word_vec_size 5 \ | |
# -report_every 5\ | |
# -rnn_size 10 \ | |
# -train_steps 10 | |
# - name: Test RNN training with copy | |
# run: | | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -rnn_size 2 \ | |
# -batch_size 10 \ | |
# -word_vec_size 5 \ | |
# -report_every 5 \ | |
# -rnn_size 10 \ | |
# -train_steps 10 \ | |
# -copy_attn | |
# - name: Test RNN training with coverage | |
# run: | | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -rnn_size 2 -batch_size 10 \ | |
# -word_vec_size 5 -report_every 5 \ | |
# -coverage_attn true -lambda_coverage 0.1 \ | |
# -rnn_size 10 -train_steps 10 | |
# - name: Test Transformer training with align | |
# run: | | |
# python train.py \ | |
# -config data/align_data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -max_generator_batches 0 \ | |
# -encoder_type transformer \ | |
# -decoder_type transformer \ | |
# -layers 4 \ | |
# -word_vec_size 16 \ | |
# -rnn_size 16 \ | |
# -heads 2 \ | |
# -transformer_ff 64 \ | |
# -lambda_align 0.05 \ | |
# -alignment_layer 2 \ | |
# -alignment_heads 0 \ | |
# -report_every 5 \ | |
# -train_steps 10 | |
# - name: Test LM training | |
# run: | | |
# python train.py \ | |
# -config data/lm_data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.src \ | |
# -model_task lm \ | |
# -encoder_type transformer_lm \ | |
# -decoder_type transformer_lm \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -dec_layers 2 -batch_size 10 \ | |
# -heads 4 -transformer_ff 64 \ | |
# -word_vec_size 16 -report_every 5 \ | |
# -rnn_size 16 -train_steps 10 | |
# - name: Test LM training with copy | |
# run: | | |
# python train.py \ | |
# -config data/lm_data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.src \ | |
# -model_task lm \ | |
# -encoder_type transformer_lm \ | |
# -decoder_type transformer_lm \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -dec_layers 2 -batch_size 10 \ | |
# -heads 4 -transformer_ff 64 \ | |
# -word_vec_size 16 -report_every 5 \ | |
# -rnn_size 16 -train_steps 10 \ | |
# -copy_attn | |
# - name: Test Graph neural network training | |
# run: | | |
# python train.py \ | |
# -config data/ggnn_data.yaml \ | |
# -src_seq_length 1000 \ | |
# -tgt_seq_length 30 \ | |
# -encoder_type ggnn \ | |
# -layers 2 \ | |
# -decoder_type rnn \ | |
# -rnn_size 256 \ | |
# -learning_rate 0.1 \ | |
# -learning_rate_decay 0.8 \ | |
# -global_attention general \ | |
# -batch_size 32 \ | |
# -word_vec_size 256 \ | |
# -bridge \ | |
# -train_steps 10 \ | |
# -n_edge_types 9 \ | |
# -state_dim 256 \ | |
# -n_steps 10 \ | |
# -n_node 64 | |
# - name: Testing training with features | |
# run: | | |
# python onmt/bin/train.py \ | |
# -config data/features_data.yaml \ | |
# -src_vocab /tmp/onmt_feat.vocab.src \ | |
# -tgt_vocab /tmp/onmt_feat.vocab.tgt \ | |
# -src_feats_vocab '{"feat0": "/tmp/onmt_feat.vocab.feat0"}' \ | |
# -src_vocab_size 1000 -tgt_vocab_size 1000 \ | |
# -rnn_size 2 -batch_size 10 \ | |
# -word_vec_size 5 -rnn_size 10 \ | |
# -report_every 5 -train_steps 10 \ | |
# -save_model /tmp/onmt.model \ | |
# -save_checkpoint_steps 10 | |
# - name: Testing translation with features | |
# run: | | |
# python translate.py \ | |
# -model /tmp/onmt.model_step_10.pt \ | |
# -src data/data_features/src-test.txt \ | |
# -src_feats "{'feat0': 'data/data_features/src-test.feat0'}" \ | |
# -verbose | |
# - name: Test RNN translation | |
# run: | | |
# head data/src-test.txt > /tmp/src-test.txt | |
# python translate.py \ | |
# -model onmt/tests/test_model.pt \ | |
# -src /tmp/src-test.txt \ | |
# -verbose | |
# - name: Test RNN ensemble translation | |
# run: | | |
# head data/src-test.txt > /tmp/src-test.txt | |
# python translate.py \ | |
# -model onmt/tests/test_model.pt \ | |
# onmt/tests/test_model.pt \ | |
# -src /tmp/src-test.txt \ | |
# -verbose | |
# - name: Test RNN translation with beam search | |
# run: | | |
# python translate.py \ | |
# -model onmt/tests/test_model2.pt \ | |
# -src data/morph/src.valid \ | |
# -verbose \ | |
# -batch_size 10 \ | |
# -beam_size 10 \ | |
# -tgt data/morph/tgt.valid \ | |
# -out /tmp/trans | |
# diff data/morph/tgt.valid /tmp/trans && rm /tmp/trans | |
# - name: Test RNN translation with random sampling | |
# run: | | |
# python translate.py \ | |
# -model onmt/tests/test_model2.pt \ | |
# -src data/morph/src.valid \ | |
# -verbose \ | |
# -batch_size 10 \ | |
# -beam_size 1 \ | |
# -seed 1 \ | |
# -random_sampling_topk "-1" \ | |
# -random_sampling_temp 0.0001 \ | |
# -tgt data/morph/tgt.valid \ | |
# -out /tmp/trans | |
# diff data/morph/tgt.valid /tmp/trans && rm /tmp/trans | |
# - name: Test LM generation | |
# run: | | |
# head data/src-test.txt > /tmp/src-test.txt | |
# python translate.py \ | |
# -model onmt/tests/test_model_lm.pt \ | |
# -src data/src-test.txt \ | |
# -verbose | |
# - name: Test LM generation with beam search | |
# run: | | |
# python translate.py \ | |
# -model onmt/tests/test_model_lm.pt \ | |
# -src data/data_lm/src-gen.txt \ | |
# -verbose -batch_size 10 \ | |
# -beam_size 10 \ | |
# -ban_unk_token \ | |
# -out /tmp/gen | |
# diff data/data_lm/gen-beam-sol.txt /tmp/gen && rm /tmp/gen | |
# - name: Test LM generation with random sampling | |
# run: | | |
# python translate.py -model onmt/tests/test_model_lm.pt \ | |
# -src data/data_lm/src-gen.txt \ | |
# -verbose -batch_size 10 \ | |
# -beam_size 1 \ | |
# -seed 1 \ | |
# -random_sampling_topk -1 \ | |
# -random_sampling_temp 0.0001 \ | |
# -ban_unk_token \ | |
# -out /tmp/gen | |
# diff data/data_lm/gen-sampling-sol.txt /tmp/gen && rm /tmp/gen | |
# - name: Test LM generation with random top-k/nucleus sampling | |
# run: | | |
# python translate.py -model onmt/tests/test_model_lm.pt \ | |
# -src data/data_lm/src-gen.txt \ | |
# -verbose -batch_size 10 \ | |
# -beam_size 1 \ | |
# -seed 3 \ | |
# -random_sampling_topk -1 \ | |
# -random_sampling_topp 0.95 \ | |
# -random_sampling_temp 1 \ | |
# -ban_unk_token \ | |
# -out /tmp/gen | |
# diff data/data_lm/gen-nucleus-sampling-sol.txt /tmp/gen && rm /tmp/gen | |
# - name: Test LM generation with random sampling multi-beams | |
# run: | | |
# python translate.py -model onmt/tests/test_model_lm.pt \ | |
# -src data/data_lm/src-gen.txt \ | |
# -verbose -batch_size 10 \ | |
# -beam_size 10 \ | |
# -seed 2 \ | |
# -random_sampling_topk 50 \ | |
# -random_sampling_topp 0.95 \ | |
# -random_sampling_temp 1 \ | |
# -length_penalty avg \ | |
# -ban_unk_token \ | |
# -min_length 5 \ | |
# -out /tmp/gen | |
# diff data/data_lm/gen-sampling-beams-sol.txt /tmp/gen && rm /tmp/gen | |
# - name: Test extract_vocabulary tool | |
# run: | | |
# python tools/extract_vocabulary.py \ | |
# -file /tmp/onmt.train.check.vocab.pt \ | |
# -file_type field \ | |
# -side src \ | |
# -out_file /tmp/onmt.vocab.txt | |
# if ! wc -l /tmp/onmt.vocab.txt | grep -qF "1002" | |
# then echo "wrong word count" && exit 1 | |
# else | |
# echo "create vocabulary pass" | |
# fi | |
# - name: Test embeddings_to_torch tool | |
# run: | | |
# python tools/embeddings_to_torch.py \ | |
# -emb_file_enc onmt/tests/sample_glove.txt \ | |
# -emb_file_dec onmt/tests/sample_glove.txt \ | |
# -dict_file /tmp/onmt.train.check.vocab.pt \ | |
# -output_file /tmp/q_gloveembeddings \ | |
# && rm /tmp/q_gloveembeddings* | |
# rm /tmp/onmt.train.check.*.pt | |
# - name: Test extract_embeddings tool | |
# run: | | |
# python tools/extract_embeddings.py \ | |
# -model onmt/tests/test_model.pt | |
# - name: Test checkpoint vocabulary update | |
# run: | | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -rnn_size 2 \ | |
# -batch_size 10 \ | |
# -word_vec_size 5 \ | |
# -report_every 5\ | |
# -rnn_size 10 \ | |
# -train_steps 10 \ | |
# -save_model /tmp/onmt.model \ | |
# -save_checkpoint_steps 10 | |
# sed -i '1s/^/new_tok\t100000000\n/' /tmp/onmt.vocab.src | |
# python train.py \ | |
# -config data/data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -rnn_size 2 \ | |
# -batch_size 10 \ | |
# -word_vec_size 5 \ | |
# -report_every 5\ | |
# -rnn_size 10 \ | |
# -train_steps 20 \ | |
# -update_vocab \ | |
# -reset_optim "states" \ | |
# -train_from /tmp/onmt.model_step_10.pt | |
# - name: Test checkpoint vocabulary update with LM | |
# run: | | |
# python train.py \ | |
# -config data/lm_data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -model_task lm \ | |
# -encoder_type transformer_lm \ | |
# -decoder_type transformer_lm \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -dec_layers 2 -batch_size 10 \ | |
# -heads 4 -transformer_ff 64 \ | |
# -word_vec_size 16 -report_every 5 \ | |
# -save_model /tmp/lm.onmt.model \ | |
# -save_checkpoint_steps 10 \ | |
# -rnn_size 16 -train_steps 10 | |
# sed -i '1s/^/new_tok\t100000000\n/' /tmp/onmt.vocab.src | |
# python train.py \ | |
# -config data/lm_data.yaml \ | |
# -src_vocab /tmp/onmt.vocab.src \ | |
# -tgt_vocab /tmp/onmt.vocab.tgt \ | |
# -model_task lm \ | |
# -encoder_type transformer_lm \ | |
# -decoder_type transformer_lm \ | |
# -src_vocab_size 1000 \ | |
# -tgt_vocab_size 1000 \ | |
# -dec_layers 2 -batch_size 10 \ | |
# -heads 4 -transformer_ff 64 \ | |
# -word_vec_size 16 -report_every 5 \ | |
# -rnn_size 16 -train_steps 20 \ | |
# -update_vocab -reset_optim "states" \ | |
# -train_from /tmp/lm.onmt.model_step_10.pt |