diff --git a/2024-dynamic-encoder-size/README.md b/2024-dynamic-encoder-size/README.md index d1326a17..28f6fd28 100644 --- a/2024-dynamic-encoder-size/README.md +++ b/2024-dynamic-encoder-size/README.md @@ -12,4 +12,6 @@ We use models parts from [i6-models](https://github.com/rwth-i6/i6_models/tree/j ConformerCTCModel, ConformerCTCConfig and train_step in returnn config is defined in [here](https://github.com/rwth-i6/i6_experiments/blob/main/users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py) -### TED-LIUM-v2 Iterative-Zero-Out \ No newline at end of file +### TED-LIUM-v2 Iterative-Zero-Out + +ConformerCTCModel, ConformerCTCConfig and train_step in returnn config is defined in [here](https://github.com/rwth-i6/i6_experiments/blob/main/users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py) diff --git a/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_1_subnet/returnn.config b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_1_subnet/returnn.config new file mode 100644 index 00000000..fb92d59b --- /dev/null +++ b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_1_subnet/returnn.config @@ -0,0 +1,425 @@ +#!rnn.py + + +import numpy as np + +backend = "torch" +batch_size = 15000 +batching = "random" +cache_size = "0" +cleanup_old_models = True +dev = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.U8uxDBlFQCdV/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.H5yw1ICx8RsS/output/targets.hdf" + ], + "partition_epoch": 1, + "seq_ordering": "sorted", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +device = "gpu" +extern_data = {"data": {"dim": 50}, "targets": {"dim": 79, "sparse": True}} +gradient_clip = 0.0 +gradient_noise = 0.0 +learning_rate_file = "learning_rates" +learning_rates = [ + 4e-06, + 7.6e-06, + 1.1200000000000001e-05, + 1.48e-05, + 1.84e-05, + 2.2e-05, + 2.5600000000000002e-05, + 2.92e-05, + 3.2800000000000004e-05, + 3.6400000000000004e-05, + 4e-05, + 4.36e-05, + 4.720000000000001e-05, + 5.080000000000001e-05, + 5.440000000000001e-05, + 5.800000000000001e-05, + 6.16e-05, + 6.520000000000001e-05, + 6.88e-05, + 7.240000000000001e-05, + 7.6e-05, + 7.960000000000001e-05, + 8.32e-05, + 8.680000000000001e-05, + 9.040000000000002e-05, + 9.400000000000001e-05, + 9.760000000000001e-05, + 0.00010120000000000001, + 0.00010480000000000001, + 0.0001084, + 0.00011200000000000001, + 0.0001156, + 0.00011920000000000001, + 0.0001228, + 0.0001264, + 0.00013, + 0.0001336, + 0.0001372, + 0.0001408, + 0.00014439999999999999, + 0.000148, + 0.0001516, + 0.0001552, + 0.0001588, + 0.0001624, + 0.000166, + 0.0001696, + 0.0001732, + 0.00017680000000000001, + 0.0001804, + 0.000184, + 0.0001876, + 0.0001912, + 0.0001948, + 0.0001984, + 0.000202, + 0.0002056, + 0.00020920000000000002, + 0.0002128, + 0.0002164, + 0.00022, + 0.00022360000000000001, + 0.0002272, + 0.0002308, + 0.0002344, + 0.000238, + 0.00024160000000000002, + 0.0002452, + 0.00024880000000000003, + 0.0002524, + 0.000256, + 0.0002596, + 0.0002632, + 0.00026680000000000003, + 0.0002704, + 0.000274, + 0.0002776, + 0.0002812, + 0.0002848, + 0.0002884, + 0.000292, + 0.00029560000000000003, + 0.0002992, + 0.0003028, + 0.0003064, + 0.00031, + 0.00031360000000000003, + 0.0003172, + 0.0003208, + 0.0003244, + 0.000328, + 0.00033160000000000004, + 0.0003352, + 0.0003388, + 0.00034240000000000003, + 0.000346, + 0.00034960000000000004, + 0.0003532, + 0.0003568, + 0.00036040000000000003, + 0.000364, + 0.0003676, + 0.0003712, + 0.0003748, + 0.00037840000000000004, + 0.000382, + 0.0003856, + 0.00038920000000000003, + 0.0003928, + 0.00039640000000000004, + 0.0004, + 0.00039640000000000004, + 0.0003928, + 0.00038920000000000003, + 0.0003856, + 0.000382, + 0.00037840000000000004, + 0.0003748, + 0.0003712, + 0.00036760000000000004, + 0.000364, + 0.00036040000000000003, + 0.0003568, + 0.0003532, + 0.00034960000000000004, + 0.000346, + 0.00034240000000000003, + 0.0003388, + 0.0003352, + 0.00033160000000000004, + 0.000328, + 0.0003244, + 0.0003208, + 0.0003172, + 0.00031360000000000003, + 0.00031, + 0.0003064, + 0.0003028, + 0.0002992, + 0.00029560000000000003, + 0.000292, + 0.0002884, + 0.0002848, + 0.0002812, + 0.0002776, + 0.00027400000000000005, + 0.0002704, + 0.0002668, + 0.0002632, + 0.0002596, + 0.00025600000000000004, + 0.0002524, + 0.0002488, + 0.0002452, + 0.00024160000000000002, + 0.000238, + 0.0002344, + 0.0002308, + 0.0002272, + 0.00022360000000000001, + 0.00022, + 0.0002164, + 0.0002128, + 0.00020920000000000002, + 0.0002056, + 0.000202, + 0.0001984, + 0.0001948, + 0.0001912, + 0.0001876, + 0.000184, + 0.0001804, + 0.00017680000000000001, + 0.0001732, + 0.0001696, + 0.000166, + 0.0001624, + 0.0001588, + 0.00015519999999999998, + 0.0001516, + 0.00014800000000000002, + 0.00014439999999999999, + 0.0001408, + 0.00013719999999999997, + 0.0001336, + 0.00013000000000000002, + 0.00012639999999999998, + 0.0001228, + 0.00011920000000000002, + 0.00011559999999999999, + 0.00011200000000000001, + 0.00010839999999999998, + 0.0001048, + 0.00010120000000000002, + 9.759999999999999e-05, + 9.400000000000001e-05, + 9.039999999999997e-05, + 8.68e-05, + 8.320000000000002e-05, + 7.959999999999998e-05, + 7.6e-05, + 7.239999999999997e-05, + 6.879999999999999e-05, + 6.520000000000001e-05, + 6.159999999999998e-05, + 5.8e-05, + 5.439999999999997e-05, + 5.079999999999999e-05, + 4.720000000000001e-05, + 4.3599999999999976e-05, + 3.9999999999999996e-05, + 3.640000000000002e-05, + 3.2799999999999984e-05, + 2.9200000000000005e-05, + 2.5599999999999972e-05, + 2.1999999999999993e-05, + 1.8400000000000014e-05, + 1.479999999999998e-05, + 1.1200000000000001e-05, + 7.599999999999968e-06, + 4e-06, + 3.862413793103448e-06, + 3.7248275862068965e-06, + 3.587241379310345e-06, + 3.449655172413793e-06, + 3.3120689655172415e-06, + 3.1744827586206894e-06, + 3.0368965517241378e-06, + 2.899310344827586e-06, + 2.7617241379310344e-06, + 2.6241379310344828e-06, + 2.4865517241379307e-06, + 2.348965517241379e-06, + 2.2113793103448274e-06, + 2.0737931034482757e-06, + 1.936206896551724e-06, + 1.7986206896551724e-06, + 1.6610344827586207e-06, + 1.523448275862069e-06, + 1.3858620689655174e-06, + 1.2482758620689657e-06, + 1.1106896551724136e-06, + 9.73103448275862e-07, + 8.355172413793103e-07, + 6.979310344827586e-07, + 5.60344827586207e-07, + 4.227586206896553e-07, + 2.8517241379310365e-07, + 1.4758620689655157e-07, + 1e-08, +] +log = ["./returnn.log"] +log_batch_size = True +log_verbosity = 5 +max_seqs = 128 +model = "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/training/ReturnnTrainingJob.IOudLdcvvkUP/output/models/epoch" +num_epochs = 250 +num_inputs = 50 +num_outputs = {"targets": 79} +optimizer = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 0.001} +save_interval = 1 +target = "targets" +task = "train" +tf_log_memory_usage = True +train = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.jxsVEaCgJLqB/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.cEWylWI7BnXk/output/targets.hdf" + ], + "partition_epoch": 5, + "seq_ordering": "laplace:.1000", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +update_on_device = True +window = 1 +config = {} + +locals().update(**config) + +import os +import sys + +sys.path.insert(0, "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/recipe") +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.iterative_zero_out_refactored.jointly_train_iterative_zero_out_layerwise import ( + ConformerCTCModel, +) +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.iterative_zero_out_refactored.jointly_train_iterative_zero_out_layerwise import ( + ConformerCTCConfig, +) +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerEncoderConfig, +) +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from torch.nn.modules.activation import ReLU +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerBlockConfig, +) +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, +) +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.norm import LayerNormNC +from torch.nn.modules.activation import Sigmoid + +cfg = ConformerCTCConfig( + conformer_cfg=ConformerEncoderConfig( + num_layers=12, + frontend=ModuleFactoryV1( + module_class=VGG4LayerActFrontendV1, + cfg=VGG4LayerActFrontendV1Config( + in_features=50, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=ReLU(), + out_features=384, + ), + ), + block_cfg=ConformerBlockConfig( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=384, hidden_dim=1536, dropout=0.2, activation=SiLU() + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=384, num_att_heads=6, att_weights_dropout=0.1, dropout=0.2 + ), + conv_cfg=ConformerConvolutionV1Config( + channels=384, + kernel_size=31, + dropout=0.2, + activation=SiLU(), + norm=LayerNormNC((384,), eps=1e-05, elementwise_affine=True), + ), + layer_dropout=0, + modules=["ff", "conv", "mhsa", "ff"], + scales=[0.5, 1.0, 1.0, 0.5], + ), + num_layers_set=[24, 48], + layer_dropout_kwargs={"layer_dropout_stage_1": 0, "layer_dropout_stage_2": 0.3}, + layer_gate_activation=Sigmoid(), + ), + target_size=79, + recog_num_layers=48, + zero_out_kwargs={ + "num_steps_per_iter": [63000, 21000, 21000, 21000, 21000, 21000, 21000, 21000], + "num_zeroout_elements_per_iter": [3, 6, 9, 12, 15, 18, 21, 24], + "zeroout_val": -5, + }, +) +model_kwargs = {"cfg": cfg} + + +def get_model(epoch, step, **kwargs): + return ConformerCTCModel(epoch=epoch, step=step, **model_kwargs, **kwargs) + + +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.iterative_zero_out_refactored.jointly_train_iterative_zero_out_layerwise import ( + train_step, +) diff --git a/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_2_subnets/returnn.config b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_2_subnets/returnn.config new file mode 100644 index 00000000..6c4941c3 --- /dev/null +++ b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_2_subnets/returnn.config @@ -0,0 +1,425 @@ +#!rnn.py + + +import numpy as np + +backend = "torch" +batch_size = 15000 +batching = "random" +cache_size = "0" +cleanup_old_models = True +dev = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.U8uxDBlFQCdV/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.H5yw1ICx8RsS/output/targets.hdf" + ], + "partition_epoch": 1, + "seq_ordering": "sorted", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +device = "gpu" +extern_data = {"data": {"dim": 50}, "targets": {"dim": 79, "sparse": True}} +gradient_clip = 0.0 +gradient_noise = 0.0 +learning_rate_file = "learning_rates" +learning_rates = [ + 4e-06, + 7.6e-06, + 1.1200000000000001e-05, + 1.48e-05, + 1.84e-05, + 2.2e-05, + 2.5600000000000002e-05, + 2.92e-05, + 3.2800000000000004e-05, + 3.6400000000000004e-05, + 4e-05, + 4.36e-05, + 4.720000000000001e-05, + 5.080000000000001e-05, + 5.440000000000001e-05, + 5.800000000000001e-05, + 6.16e-05, + 6.520000000000001e-05, + 6.88e-05, + 7.240000000000001e-05, + 7.6e-05, + 7.960000000000001e-05, + 8.32e-05, + 8.680000000000001e-05, + 9.040000000000002e-05, + 9.400000000000001e-05, + 9.760000000000001e-05, + 0.00010120000000000001, + 0.00010480000000000001, + 0.0001084, + 0.00011200000000000001, + 0.0001156, + 0.00011920000000000001, + 0.0001228, + 0.0001264, + 0.00013, + 0.0001336, + 0.0001372, + 0.0001408, + 0.00014439999999999999, + 0.000148, + 0.0001516, + 0.0001552, + 0.0001588, + 0.0001624, + 0.000166, + 0.0001696, + 0.0001732, + 0.00017680000000000001, + 0.0001804, + 0.000184, + 0.0001876, + 0.0001912, + 0.0001948, + 0.0001984, + 0.000202, + 0.0002056, + 0.00020920000000000002, + 0.0002128, + 0.0002164, + 0.00022, + 0.00022360000000000001, + 0.0002272, + 0.0002308, + 0.0002344, + 0.000238, + 0.00024160000000000002, + 0.0002452, + 0.00024880000000000003, + 0.0002524, + 0.000256, + 0.0002596, + 0.0002632, + 0.00026680000000000003, + 0.0002704, + 0.000274, + 0.0002776, + 0.0002812, + 0.0002848, + 0.0002884, + 0.000292, + 0.00029560000000000003, + 0.0002992, + 0.0003028, + 0.0003064, + 0.00031, + 0.00031360000000000003, + 0.0003172, + 0.0003208, + 0.0003244, + 0.000328, + 0.00033160000000000004, + 0.0003352, + 0.0003388, + 0.00034240000000000003, + 0.000346, + 0.00034960000000000004, + 0.0003532, + 0.0003568, + 0.00036040000000000003, + 0.000364, + 0.0003676, + 0.0003712, + 0.0003748, + 0.00037840000000000004, + 0.000382, + 0.0003856, + 0.00038920000000000003, + 0.0003928, + 0.00039640000000000004, + 0.0004, + 0.00039640000000000004, + 0.0003928, + 0.00038920000000000003, + 0.0003856, + 0.000382, + 0.00037840000000000004, + 0.0003748, + 0.0003712, + 0.00036760000000000004, + 0.000364, + 0.00036040000000000003, + 0.0003568, + 0.0003532, + 0.00034960000000000004, + 0.000346, + 0.00034240000000000003, + 0.0003388, + 0.0003352, + 0.00033160000000000004, + 0.000328, + 0.0003244, + 0.0003208, + 0.0003172, + 0.00031360000000000003, + 0.00031, + 0.0003064, + 0.0003028, + 0.0002992, + 0.00029560000000000003, + 0.000292, + 0.0002884, + 0.0002848, + 0.0002812, + 0.0002776, + 0.00027400000000000005, + 0.0002704, + 0.0002668, + 0.0002632, + 0.0002596, + 0.00025600000000000004, + 0.0002524, + 0.0002488, + 0.0002452, + 0.00024160000000000002, + 0.000238, + 0.0002344, + 0.0002308, + 0.0002272, + 0.00022360000000000001, + 0.00022, + 0.0002164, + 0.0002128, + 0.00020920000000000002, + 0.0002056, + 0.000202, + 0.0001984, + 0.0001948, + 0.0001912, + 0.0001876, + 0.000184, + 0.0001804, + 0.00017680000000000001, + 0.0001732, + 0.0001696, + 0.000166, + 0.0001624, + 0.0001588, + 0.00015519999999999998, + 0.0001516, + 0.00014800000000000002, + 0.00014439999999999999, + 0.0001408, + 0.00013719999999999997, + 0.0001336, + 0.00013000000000000002, + 0.00012639999999999998, + 0.0001228, + 0.00011920000000000002, + 0.00011559999999999999, + 0.00011200000000000001, + 0.00010839999999999998, + 0.0001048, + 0.00010120000000000002, + 9.759999999999999e-05, + 9.400000000000001e-05, + 9.039999999999997e-05, + 8.68e-05, + 8.320000000000002e-05, + 7.959999999999998e-05, + 7.6e-05, + 7.239999999999997e-05, + 6.879999999999999e-05, + 6.520000000000001e-05, + 6.159999999999998e-05, + 5.8e-05, + 5.439999999999997e-05, + 5.079999999999999e-05, + 4.720000000000001e-05, + 4.3599999999999976e-05, + 3.9999999999999996e-05, + 3.640000000000002e-05, + 3.2799999999999984e-05, + 2.9200000000000005e-05, + 2.5599999999999972e-05, + 2.1999999999999993e-05, + 1.8400000000000014e-05, + 1.479999999999998e-05, + 1.1200000000000001e-05, + 7.599999999999968e-06, + 4e-06, + 3.862413793103448e-06, + 3.7248275862068965e-06, + 3.587241379310345e-06, + 3.449655172413793e-06, + 3.3120689655172415e-06, + 3.1744827586206894e-06, + 3.0368965517241378e-06, + 2.899310344827586e-06, + 2.7617241379310344e-06, + 2.6241379310344828e-06, + 2.4865517241379307e-06, + 2.348965517241379e-06, + 2.2113793103448274e-06, + 2.0737931034482757e-06, + 1.936206896551724e-06, + 1.7986206896551724e-06, + 1.6610344827586207e-06, + 1.523448275862069e-06, + 1.3858620689655174e-06, + 1.2482758620689657e-06, + 1.1106896551724136e-06, + 9.73103448275862e-07, + 8.355172413793103e-07, + 6.979310344827586e-07, + 5.60344827586207e-07, + 4.227586206896553e-07, + 2.8517241379310365e-07, + 1.4758620689655157e-07, + 1e-08, +] +log = ["./returnn.log"] +log_batch_size = True +log_verbosity = 5 +max_seqs = 128 +model = "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/training/ReturnnTrainingJob.TryLPoyqOkV6/output/models/epoch" +num_epochs = 250 +num_inputs = 50 +num_outputs = {"targets": 79} +optimizer = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 0.001} +save_interval = 1 +target = "targets" +task = "train" +tf_log_memory_usage = True +train = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.jxsVEaCgJLqB/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.cEWylWI7BnXk/output/targets.hdf" + ], + "partition_epoch": 5, + "seq_ordering": "laplace:.1000", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +update_on_device = True +window = 1 +config = {} + +locals().update(**config) + +import os +import sys + +sys.path.insert(0, "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/recipe") +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + ConformerCTCModel, +) +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + ConformerCTCConfig, +) +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerEncoderConfig, +) +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from torch.nn.modules.activation import ReLU +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerBlockConfig, +) +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, +) +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.norm import LayerNormNC +from torch.nn.modules.activation import Sigmoid + +cfg = ConformerCTCConfig( + conformer_cfg=ConformerEncoderConfig( + num_layers=12, + frontend=ModuleFactoryV1( + module_class=VGG4LayerActFrontendV1, + cfg=VGG4LayerActFrontendV1Config( + in_features=50, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=ReLU(), + out_features=384, + ), + ), + block_cfg=ConformerBlockConfig( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=384, hidden_dim=1536, dropout=0.2, activation=SiLU() + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=384, num_att_heads=6, att_weights_dropout=0.1, dropout=0.2 + ), + conv_cfg=ConformerConvolutionV1Config( + channels=384, + kernel_size=31, + dropout=0.2, + activation=SiLU(), + norm=LayerNormNC((384,), eps=1e-05, elementwise_affine=True), + ), + layer_dropout=0, + modules=["ff", "conv", "mhsa", "ff"], + scales=[0.5, 1.0, 1.0, 0.5], + ), + num_layers_set=[16, 32, 48], + layer_dropout_kwargs={"layer_dropout_stage_1": 0, "layer_dropout_stage_2": 0.3}, + layer_gate_activation=Sigmoid(), + ), + target_size=79, + recog_num_layers=48, + zero_out_kwargs={ + "num_steps_per_iter": [42000, 25200, 25200, 25200, 25200, 25200, 25200, 25200], + "num_zeroout_elements_per_iter": [4, 8, 12, 16, 20, 24, 28, 32], + "zeroout_val": -5, + }, +) +model_kwargs = {"cfg": cfg} + + +def get_model(epoch, step, **kwargs): + return ConformerCTCModel(epoch=epoch, step=step, **model_kwargs, **kwargs) + + +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + train_step, +) diff --git a/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_3_subnets/returnn.config b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_3_subnets/returnn.config new file mode 100644 index 00000000..cbcade00 --- /dev/null +++ b/2024-dynamic-encoder-size/TED-LIUM-v2/iterative_zero_out/1_supernet_3_subnets/returnn.config @@ -0,0 +1,499 @@ +#!rnn.py + + +import numpy as np + +backend = "torch" +batch_size = 15000 +batching = "random" +cache_size = "0" +cleanup_old_models = True +dev = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.U8uxDBlFQCdV/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.H5yw1ICx8RsS/output/targets.hdf" + ], + "partition_epoch": 1, + "seq_ordering": "sorted", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +device = "gpu" +extern_data = {"data": {"dim": 50}, "targets": {"dim": 79, "sparse": True}} +gradient_clip = 0.0 +gradient_noise = 0.0 +learning_rate_file = "learning_rates" +learning_rates = [ + 4e-06, + 7.6e-06, + 1.1200000000000001e-05, + 1.48e-05, + 1.84e-05, + 2.2e-05, + 2.5600000000000002e-05, + 2.92e-05, + 3.2800000000000004e-05, + 3.6400000000000004e-05, + 4e-05, + 4.36e-05, + 4.720000000000001e-05, + 5.080000000000001e-05, + 5.440000000000001e-05, + 5.800000000000001e-05, + 6.16e-05, + 6.520000000000001e-05, + 6.88e-05, + 7.240000000000001e-05, + 7.6e-05, + 7.960000000000001e-05, + 8.32e-05, + 8.680000000000001e-05, + 9.040000000000002e-05, + 9.400000000000001e-05, + 9.760000000000001e-05, + 0.00010120000000000001, + 0.00010480000000000001, + 0.0001084, + 0.00011200000000000001, + 0.0001156, + 0.00011920000000000001, + 0.0001228, + 0.0001264, + 0.00013, + 0.0001336, + 0.0001372, + 0.0001408, + 0.00014439999999999999, + 0.000148, + 0.0001516, + 0.0001552, + 0.0001588, + 0.0001624, + 0.000166, + 0.0001696, + 0.0001732, + 0.00017680000000000001, + 0.0001804, + 0.000184, + 0.0001876, + 0.0001912, + 0.0001948, + 0.0001984, + 0.000202, + 0.0002056, + 0.00020920000000000002, + 0.0002128, + 0.0002164, + 0.00022, + 0.00022360000000000001, + 0.0002272, + 0.0002308, + 0.0002344, + 0.000238, + 0.00024160000000000002, + 0.0002452, + 0.00024880000000000003, + 0.0002524, + 0.000256, + 0.0002596, + 0.0002632, + 0.00026680000000000003, + 0.0002704, + 0.000274, + 0.0002776, + 0.0002812, + 0.0002848, + 0.0002884, + 0.000292, + 0.00029560000000000003, + 0.0002992, + 0.0003028, + 0.0003064, + 0.00031, + 0.00031360000000000003, + 0.0003172, + 0.0003208, + 0.0003244, + 0.000328, + 0.00033160000000000004, + 0.0003352, + 0.0003388, + 0.00034240000000000003, + 0.000346, + 0.00034960000000000004, + 0.0003532, + 0.0003568, + 0.00036040000000000003, + 0.000364, + 0.0003676, + 0.0003712, + 0.0003748, + 0.00037840000000000004, + 0.000382, + 0.0003856, + 0.00038920000000000003, + 0.0003928, + 0.00039640000000000004, + 0.0004, + 0.00039640000000000004, + 0.0003928, + 0.00038920000000000003, + 0.0003856, + 0.000382, + 0.00037840000000000004, + 0.0003748, + 0.0003712, + 0.00036760000000000004, + 0.000364, + 0.00036040000000000003, + 0.0003568, + 0.0003532, + 0.00034960000000000004, + 0.000346, + 0.00034240000000000003, + 0.0003388, + 0.0003352, + 0.00033160000000000004, + 0.000328, + 0.0003244, + 0.0003208, + 0.0003172, + 0.00031360000000000003, + 0.00031, + 0.0003064, + 0.0003028, + 0.0002992, + 0.00029560000000000003, + 0.000292, + 0.0002884, + 0.0002848, + 0.0002812, + 0.0002776, + 0.00027400000000000005, + 0.0002704, + 0.0002668, + 0.0002632, + 0.0002596, + 0.00025600000000000004, + 0.0002524, + 0.0002488, + 0.0002452, + 0.00024160000000000002, + 0.000238, + 0.0002344, + 0.0002308, + 0.0002272, + 0.00022360000000000001, + 0.00022, + 0.0002164, + 0.0002128, + 0.00020920000000000002, + 0.0002056, + 0.000202, + 0.0001984, + 0.0001948, + 0.0001912, + 0.0001876, + 0.000184, + 0.0001804, + 0.00017680000000000001, + 0.0001732, + 0.0001696, + 0.000166, + 0.0001624, + 0.0001588, + 0.00015519999999999998, + 0.0001516, + 0.00014800000000000002, + 0.00014439999999999999, + 0.0001408, + 0.00013719999999999997, + 0.0001336, + 0.00013000000000000002, + 0.00012639999999999998, + 0.0001228, + 0.00011920000000000002, + 0.00011559999999999999, + 0.00011200000000000001, + 0.00010839999999999998, + 0.0001048, + 0.00010120000000000002, + 9.759999999999999e-05, + 9.400000000000001e-05, + 9.039999999999997e-05, + 8.68e-05, + 8.320000000000002e-05, + 7.959999999999998e-05, + 7.6e-05, + 7.239999999999997e-05, + 6.879999999999999e-05, + 6.520000000000001e-05, + 6.159999999999998e-05, + 5.8e-05, + 5.439999999999997e-05, + 5.079999999999999e-05, + 4.720000000000001e-05, + 4.3599999999999976e-05, + 3.9999999999999996e-05, + 3.640000000000002e-05, + 3.2799999999999984e-05, + 2.9200000000000005e-05, + 2.5599999999999972e-05, + 2.1999999999999993e-05, + 1.8400000000000014e-05, + 1.479999999999998e-05, + 1.1200000000000001e-05, + 7.599999999999968e-06, + 4e-06, + 3.862413793103448e-06, + 3.7248275862068965e-06, + 3.587241379310345e-06, + 3.449655172413793e-06, + 3.3120689655172415e-06, + 3.1744827586206894e-06, + 3.0368965517241378e-06, + 2.899310344827586e-06, + 2.7617241379310344e-06, + 2.6241379310344828e-06, + 2.4865517241379307e-06, + 2.348965517241379e-06, + 2.2113793103448274e-06, + 2.0737931034482757e-06, + 1.936206896551724e-06, + 1.7986206896551724e-06, + 1.6610344827586207e-06, + 1.523448275862069e-06, + 1.3858620689655174e-06, + 1.2482758620689657e-06, + 1.1106896551724136e-06, + 9.73103448275862e-07, + 8.355172413793103e-07, + 6.979310344827586e-07, + 5.60344827586207e-07, + 4.227586206896553e-07, + 2.8517241379310365e-07, + 1.4758620689655157e-07, + 1e-08, +] +log = ["./returnn.log"] +log_batch_size = True +log_verbosity = 5 +max_seqs = 128 +model = "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/training/ReturnnTrainingJob.aLCjBKpJEw1L/output/models/epoch" +num_epochs = 250 +num_inputs = 50 +num_outputs = {"targets": 79} +optimizer = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 0.001} +save_interval = 1 +target = "targets" +task = "train" +tf_log_memory_usage = True +train = { + "class": "MetaDataset", + "data_map": {"data": ("features", "data"), "targets": ("targets", "data")}, + "datasets": { + "features": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_core/returnn/hdf/ReturnnDumpHDFJob.jxsVEaCgJLqB/output/data.hdf" + ], + "use_cache_manager": True, + }, + "targets": { + "class": "HDFDataset", + "files": [ + "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/work/i6_experiments/users/berger/recipe/returnn/hdf/BlissCorpusToTargetHdfJob.cEWylWI7BnXk/output/targets.hdf" + ], + "partition_epoch": 5, + "seq_ordering": "laplace:.1000", + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "targets", +} +update_on_device = True +window = 1 +config = {} + +locals().update(**config) + +import os +import sys + +sys.path.insert(0, "/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/recipe") +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + ConformerCTCModel, +) +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + ConformerCTCConfig, +) +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerEncoderConfig, +) +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from torch.nn.modules.activation import ReLU +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_iterative_zero_out import ( + ConformerBlockConfig, +) +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, +) +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from torch.nn.modules.activation import SiLU +from i6_models.parts.conformer.norm import LayerNormNC +from torch.nn.modules.activation import Sigmoid + +cfg = ConformerCTCConfig( + conformer_cfg=ConformerEncoderConfig( + num_layers=12, + frontend=ModuleFactoryV1( + module_class=VGG4LayerActFrontendV1, + cfg=VGG4LayerActFrontendV1Config( + in_features=50, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=ReLU(), + out_features=384, + ), + ), + block_cfg=ConformerBlockConfig( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=384, hidden_dim=1536, dropout=0.2, activation=SiLU() + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=384, num_att_heads=6, att_weights_dropout=0.1, dropout=0.2 + ), + conv_cfg=ConformerConvolutionV1Config( + channels=384, + kernel_size=31, + dropout=0.2, + activation=SiLU(), + norm=LayerNormNC((384,), eps=1e-05, elementwise_affine=True), + ), + layer_dropout=0, + modules=["ff", "conv", "mhsa", "ff"], + scales=[0.5, 1.0, 1.0, 0.5], + ), + num_layers_set=[12, 24, 36, 48], + layer_dropout_kwargs={"layer_dropout_stage_1": 0, "layer_dropout_stage_2": 0.3}, + layer_gate_activation=Sigmoid(), + ), + target_size=79, + recog_num_layers=48, + zero_out_kwargs={ + "num_steps_per_iter": [ + 14000, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + 5600, + ], + "num_zeroout_elements_per_iter": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + ], + "zeroout_val": -5, + }, +) +model_kwargs = {"cfg": cfg} + + +def get_model(epoch, step, **kwargs): + return ConformerCTCModel(epoch=epoch, step=step, **model_kwargs, **kwargs) + + +from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import ( + train_step, +) diff --git a/2024-dynamic-encoder-size/TED-LIUM-v2/simple_top_k/1_supernet_1_subnets/returnn.config b/2024-dynamic-encoder-size/TED-LIUM-v2/simple_top_k/1_supernet_1_subnet/returnn.config similarity index 100% rename from 2024-dynamic-encoder-size/TED-LIUM-v2/simple_top_k/1_supernet_1_subnets/returnn.config rename to 2024-dynamic-encoder-size/TED-LIUM-v2/simple_top_k/1_supernet_1_subnet/returnn.config