diff --git a/configs/recognition/arr_tsm/README.md b/configs/recognition/arr_tsm/README.md
new file mode 100644
index 0000000000..aae497f3c6
--- /dev/null
+++ b/configs/recognition/arr_tsm/README.md
@@ -0,0 +1,142 @@
+# TSM
+
+## Introduction
+
+[ALGORITHM]
+
+```BibTeX
+@inproceedings{lin2019tsm,
+  title={TSM: Temporal Shift Module for Efficient Video Understanding},
+  author={Lin, Ji and Gan, Chuang and Han, Song},
+  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+  year={2019}
+}
+```
+
+[BACKBONE]
+
+```BibTeX
+@inproceedings{NonLocal2018,
+  author = {Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He},
+  title = {Non-local Neural Networks},
+  booktitle = {CVPR},
+  year = {2018}
+}
+```
+
+## Model Zoo
+
+### Kinetics-400
+
+|config | resolution | gpus | backbone | pretrain | top1 acc| top5 acc | reference top1 acc | reference top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
+|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |340x256|8| ResNet50| ImageNet |70.24|89.56|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json)|
+|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.59|89.52|x|x|x|7079|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json)|
+|[tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.48|89.40|x|x|x|7076|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219-bf96e6cc.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json)|
+|[tsm_r50_video_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.25|89.66|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json)|
+|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |340x256|8x4| ResNet50 | ImageNet|72.9|90.44|[72.22](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|[90.37](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|11.5 (8x10 frames)| 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20200626-91a54551.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log.json)|
+|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50 | ImageNet|73.38|91.02|x|x|x|7079|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb_20200727-e1e0c785.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log.json)|
+|[tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) |340x256|8| ResNet50| ImageNet |72.09|90.37|[70.67](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh)|[89.98](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh)|47.0 (16x1 frames)| 10404 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_340x256_1x1x16_50e_kinetics400_rgb_20201011-2f27f229.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log.json)|
+|[tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) |short-side 256|8x4| ResNet50| ImageNet |71.89|90.73|x|x|x|10398|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log.json)|
+|[tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py)|short-side 320|8x4| ResNet50| ImageNet |72.03|90.25|71.81|90.36|x|8931|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200724-f00f1336.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log.json)|
+|[tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py)|short-side 320|8x4| ResNet50| ImageNet |70.70|89.90|x|x|x|10125|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200816-b93fd297.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log.json)|
+|[tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py)|short-side 320|8x4|ResNet50| ImageNet |71.60|90.34|x|x|x|8358|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb_20200724-d8ad84d2.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log.json)|
+|[tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py)|short-side 320|8|MobileNetV2| ImageNet |68.46|88.64|x|x|x|3385|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/tsm_mobilenetv2_dense_320p_1x1x8_100e_kinetics400_rgb_20210202-61135809.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log.json)|
+
+### Something-Something V1
+
+|config | resolution | gpus | backbone| pretrain | top1 acc (efficient/accurate)| top5 acc (efficient/accurate)| reference top1 acc (efficient/accurate)| reference top5 acc (efficient/accurate)| gpu_mem(M) | ckpt | log| json|
+|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 45.58 / 47.70|75.02 / 76.12|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log.json)|
+|[tsm_r50_flip_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 47.10 / 48.51|76.02 / 77.56|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/tsm_r50_flip_1x1x8_50e_sthv1_rgb_20210203-12596f16.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log.json)|
+|[tsm_r50_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 47.16 / 48.90|76.07 / 77.92|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb_20210324-481268d9.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.json)|
+|[tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 47.85 / 50.31|76.78 / 78.18|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb_20210324-76937692.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.json)|
+|[tsm_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py)|height 100|8| ResNet50 | ImageNet|47.62 / 49.28|76.63 / 77.82|[47.05 / 48.61](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[76.40 / 77.96](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|10390|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb_20201010-17fa49f6.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/20201010_221240.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/20201010_221240.log.json)|
+|[tsm_r101_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py)|height 100|8| ResNet101 | ImageNet|45.72 / 48.43|74.67 / 76.72|[46.64 / 48.13](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[75.40 / 77.31](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|9800|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb_20201010-43fedf2e.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/20201010_224055.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/20201010_224055.log.json)|
+
+### Something-Something V2
+
+|config | resolution | gpus | backbone | pretrain| top1 acc (efficient/accurate)| top5 acc (efficient/accurate)| reference top1 acc (efficient/accurate)| reference top5 acc (efficient/accurate)| gpu_mem(M) | ckpt | log| json|
+|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[tsm_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py) |height 240|8| ResNet50| ImageNet |57.86 / 61.12|84.67 / 86.26|[57.98 / 60.69](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[84.57 / 86.28](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7069 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/tsm_r50_1x1x8_50e_sthv2_rgb_20200912-033c4ac6.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20200912_140737.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20200912_140737.log.json)|
+|[tsm_r50_1x1x16_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py) |height 240|8| ResNet50| ImageNet |59.93 / 62.04|86.10 / 87.35|[58.90 / 60.98](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[85.29 / 86.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 10400| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/tsm_r50_1x1x16_50e_sthv2_rgb_20201010-16469c6f.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20201010_224215.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20201010_224215.log.json)|
+|[tsm_r101_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py) |height 240|8| ResNet101 | ImageNet|58.59 / 61.51|85.07 / 86.90|[58.89 / 61.36](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[85.14 / 87.00](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 9784 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/tsm_r101_1x1x8_50e_sthv2_rgb_20201010-98cdedb8.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20201010_224100.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20201010_224100.log.json)|
+
+### MixUp & CutMix on Something-Something V1
+
+| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | delta top1 acc (efficient/accurate) | delta top5 acc (efficient/accurate) | ckpt | log | json |
+| :----------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :---------------------------------: | :---------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [tsm_r50_mixup_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 46.35 / 48.49 | 75.07 / 76.88 | +0.77 / +0.79 | +0.05 / +0.70 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb-9eca48e5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.json) |
+| [tsm_r50_cutmix_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 45.92 / 47.46 | 75.23 / 76.71 | +0.34 / -0.24 | +0.21 / +0.59 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb-34934615.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.json) |
+
+Notes:
+
+1. The **gpus** column indicates the number of GPUs we used to get the checkpoint; the provided configs assume 8 GPUs by default.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the total batch size if you use a different number of GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu, as in the sketch below.
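+
+   The rule is linear in the total batch size (GPUs x videos per GPU). A minimal sketch, assuming the 8 GPUs x 8 videos/GPU, lr=0.01 default of the TSM configs in this folder; `scaled_lr` is an illustrative helper, not an MMAction2 API:
+
+   ```python
+   def scaled_lr(gpus, videos_per_gpu,
+                 base_lr=0.01, base_gpus=8, base_videos_per_gpu=8):
+       """Scale the base learning rate linearly with the total batch size."""
+       scale = (gpus * videos_per_gpu) / (base_gpus * base_videos_per_gpu)
+       return base_lr * scale
+
+   # The 16-segment configs in this folder fit 6 videos per GPU, hence lr=0.0075:
+   print(round(scaled_lr(gpus=8, videos_per_gpu=6), 6))  # 0.0075
+   ```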
+2. The **inference_time** is measured with this [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and timing only the model inference,
+   excluding IO and pre-processing. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 when calculating the inference time.
+3. The values in the columns named after "reference" are the results obtained by training with the original repo, using the same model settings.
+   The checkpoints for the reference repo can be downloaded [here](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_reference_ckpt.rar).
+4. There are two kinds of test settings for the Something-Something datasets: the efficient setting (center crop x 1 clip) and the accurate setting (three crops x 2 clips), following the [original repo](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd).
+   The provided config files use the efficient setting by default; it can be switched to the accurate setting as follows:
+
+```python
+...
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,  # `num_clips = 8` when using 8 segments
+        twice_sample=True,  # set `twice_sample=True` for twice sample in accurate setting
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    # dict(type='CenterCrop', crop_size=224),  # used for the efficient setting
+    dict(type='ThreeCrop', crop_size=256),  # used for the accurate setting
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+```
+
+For more details on data preparation, you can refer to Kinetics-400, Something-Something V1 and Something-Something V2 in [Data Preparation](/docs/data_preparation.md).
+
+5. When applying MixUp and CutMix, we use the hyperparameter `alpha=0.2`.
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the TSM model on the Kinetics-400 dataset with periodic validation, in a deterministic manner.
+
+```shell
+python tools/train.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
+    --work-dir work_dirs/tsm_r50_1x1x8_100e_kinetics400_rgb \
+    --validate --seed 0 --deterministic
+```
+
+For more details, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the TSM model on the Kinetics-400 dataset and dump the result to a JSON file.
+
+```shell
+python tools/test.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
+    --out result.json
+```
+
+For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
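+
+For quick recognition on a single video with one of the checkpoints above, the high-level Python API from [getting_started](/docs/getting_started.md) can also be used. A minimal sketch, assuming the MMAction2 0.x `init_recognizer`/`inference_recognizer` API; the video and label-map paths are placeholders:
+
+```python
+from mmaction.apis import inference_recognizer, init_recognizer
+
+# A video-inference config from this folder plus its Model-Zoo checkpoint.
+config_file = ('configs/recognition/arr_tsm/'
+               'tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py')
+checkpoint_file = ('checkpoints/tsm_mobilenetv2_dense_320p_1x1x8_100e_'
+                   'kinetics400_rgb_20210202-61135809.pth')
+
+model = init_recognizer(config_file, checkpoint_file, device='cuda:0')
+results = inference_recognizer(model, 'demo/demo.mp4', 'demo/label_map_k400.txt')
+print(results)  # top (label, score) pairs
+```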
diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..647cb609ac
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,92 @@
+_base_ = [
+    '../../_base_/models/tsm_mobilenet_v2.py',
+    '../../_base_/schedules/sgd_tsm_mobilenet_v2_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.01,  # this lr is used for 8 gpus
+)
+
+# runtime settings
+checkpoint_config = dict(interval=1)
+work_dir = './work_dirs/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py
new file mode 100644
index 0000000000..e0a3c4873b
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py
@@ -0,0 +1,97 @@
+_base_ = [
+    '../../_base_/models/tsm_mobilenet_v2.py',
+    '../../_base_/schedules/sgd_tsm_mobilenet_v2_100e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='DenseSampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='DenseSampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.01,  # this lr is used for 8 gpus
+)
+
+# runtime settings
+checkpoint_config = dict(interval=5)
+work_dir = './work_dirs/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb/'  # noqa
diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py
new file mode 100644
index 0000000000..a66c772a9b
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py
@@ -0,0 +1,34 @@
+_base_ = ['../../_base_/models/tsm_mobilenet_v2.py']
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='DenseSampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+
+data = dict(
+    videos_per_gpu=4,
+    workers_per_gpu=4,
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
diff --git a/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..a85104da17
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,97 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(
+        non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
+        non_local_cfg=dict(
+            sub_sample=True,
+            use_scale=False,
+            norm_cfg=dict(type='BN3d', requires_grad=True),
+            mode='dot_product')))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# runtime settings
+work_dir = './work_dirs/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..c501dcd6d0
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,97 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(
+        non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
+        non_local_cfg=dict(
+            sub_sample=True,
+            use_scale=False,
+            norm_cfg=dict(type='BN3d', requires_grad=True),
+            mode='embedded_gaussian')))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# runtime settings
+work_dir = './work_dirs/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/'  # noqa: E501
diff --git a/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..8713150fda
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,97 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(
+        non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
+        non_local_cfg=dict(
+            sub_sample=True,
+            use_scale=False,
+            norm_cfg=dict(type='BN3d', requires_grad=True),
+            mode='gaussian')))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# runtime settings
+work_dir = './work_dirs/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py
new file mode 100644
index 0000000000..02c43a3808
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py
@@ -0,0 +1,7 @@
+_base_ = ['./tsm_r50_1x1x8_50e_sthv1_rgb.py']
+
+# model settings
+model = dict(backbone=dict(pretrained='torchvision://resnet101', depth=101))
+
+# runtime settings
+work_dir = './work_dirs/tsm_r101_1x1x8_50e_sthv1_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py
new file mode 100644
index 0000000000..a4c5ce7d41
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py
@@ -0,0 +1,88 @@
+_base_ = ['./tsm_r50_1x1x8_50e_sthv2_rgb.py']
+
+# model settings
+model = dict(backbone=dict(pretrained='torchvision://resnet101', depth=101))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/sthv2/rawframes'
+data_root_val = 'data/sthv2/rawframes'
+ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt'
+ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt'
+ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.01,  # this lr is used for 8 gpus
+)
+# runtime settings
+work_dir = './work_dirs/tsm_r101_1x1x8_50e_sthv2_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..c53bee6c09
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py
@@ -0,0 +1,96 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(backbone=dict(num_segments=16), cls_head=dict(num_segments=16))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.0075,  # this lr is used for 8 gpus
+)
+
+# runtime settings
+checkpoint_config = dict(interval=5)
+work_dir = './work_dirs/tsm_r50_1x1x16_50e_kinetics400_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py
new file mode 100644
index 0000000000..590a8d1ac9
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py
@@ -0,0 +1,97 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(num_segments=16),
+    cls_head=dict(num_classes=174, num_segments=16))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/sthv1/rawframes'
+data_root_val = 'data/sthv1/rawframes'
+ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt'
+ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
+ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        filename_tmpl='{:05}.jpg',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        filename_tmpl='{:05}.jpg',
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        filename_tmpl='{:05}.jpg',
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.0075,  # this lr is used for 8 gpus
+    weight_decay=0.0005)
+
+# runtime settings
+work_dir = './work_dirs/tsm_r50_1x1x16_50e_sthv1_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py
new file mode 100644
index 0000000000..04bd982d77
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(num_segments=16),
+    cls_head=dict(num_classes=174, num_segments=16))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/sthv2/rawframes'
+data_root_val = 'data/sthv2/rawframes'
+ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt'
+ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt'
+ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=16,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.0075,  # this lr is used for 8 gpus
+    weight_decay=0.0005)
+
+# runtime settings
+work_dir = './work_dirs/tsm_r50_1x1x16_50e_sthv2_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py
new file mode 100644
index 0000000000..821d717738
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py
@@ -0,0 +1,92 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(cls_head=dict(num_classes=174))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/household/rawframes'
+data_root_val = 'data/household/rawframes'
+ann_file_train = 'data/household/household_train_list_rawframes.txt'
+ann_file_val = 'data/household/household_val_list_rawframes.txt'
+ann_file_test = 'data/household/household_test_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+evaluation = dict(
+    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+
+# optimizer
+optimizer = dict(
+    lr=0.0075,  # this lr is used for 8 gpus
+    weight_decay=0.0005)
+
+# runtime settings
+work_dir = './work_dirs/tsm_r50_1x1x8_50e_household_rgb/'
diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
new file mode 100644
index 0000000000..ec8ef9678e
--- /dev/null
+++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
@@ -0,0 +1,88 @@
+_base_ = [
+    '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1,
+        num_fixed_crops=13),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
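+# The test pipeline below matches validation (uniform 8-segment sampling with
+# a single 224 center crop); the dense configs in this folder use ThreeCrop
+# instead for a heavier, more accurate evaluation.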
+test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# runtime settings +checkpoint_config = dict(interval=5) +work_dir = './work_dirs/tsm_r50_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..4967fa23ac --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,93 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + 
pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..abf672adc2 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,113 @@ +_base_ = [ + '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='TSMHead', + num_classes=174, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True), + # model training and testing settings + train_cfg=dict( + blending=dict(type='CutmixBlending', num_classes=174, alpha=.2)), + test_cfg=dict(average_clips='prob')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', +
pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py new file mode 100644 index 0000000000..5f81caa280 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py @@ -0,0 +1,88 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='DenseSampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + val_dataloader=dict(videos_per_gpu=4), + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# runtime settings 
+work_dir = './work_dirs/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..00f40cbd58 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,97 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) + +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_flip_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..d03ba632b4 --- /dev/null +++ 
b/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,98 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) + +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='Imgaug', transforms='default'), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py new file mode 100644 index 0000000000..803d793223 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py @@ -0,0 +1,94 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +module_hooks = [ + dict( + type='GPUNormalize', 
+ hooked_module='backbone', + hook_pos='forward_pre', + input_format='NCHW', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) +] + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' + +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# runtime settings +checkpoint_config = dict(interval=5) +work_dir = './work_dirs/tsm_r50_gpu_normalize_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..73d6321081 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,113 @@ +_base_ = [ + '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='TSMHead', + num_classes=174, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True), + # model training and testing settings + train_cfg=dict( + blending=dict(type='MixupBlending', num_classes=174, alpha=.2)), + test_cfg=dict(average_clips='prob')) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 
'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..448908c5b4 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,94 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + 
dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Imgaug', transforms='default'), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py new file mode 100644 index 0000000000..0a172d0205 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py @@ -0,0 +1,95 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + 
dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict( + lr=0.02, # this lr is used for 8 gpus +) +# runtime settings +checkpoint_config = dict(interval=5) +work_dir = './work_dirs/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py new file mode 100644 index 0000000000..7c355ade2f --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py @@ -0,0 +1,31 @@ +_base_ = ['../../_base_/models/tsm_r50.py'] + +# dataset settings +dataset_type = 'VideoDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +test_pipeline = [ + dict(type='DecordInit', num_threads=1), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] + +data = dict( + videos_per_gpu=1, + workers_per_gpu=2, + test=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py new file mode 100644 index 0000000000..2984d37968 --- /dev/null +++ b/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py @@ -0,0 +1,8 @@ +_base_ = ['./tsm_r50_1x1x8_50e_kinetics400_rgb.py'] + +# model settings +model = dict( + backbone=dict(temporal_pool=True), cls_head=dict(temporal_pool=True)) + +# runtime settings +work_dir = './work_dirs/tsm_temporal_pool_r50_1x1x8_100e_kinetics400_rgb/' diff --git a/docker/Dockerfile b/docker/Dockerfile index 7a47d42e01..f8c89a472b 100644 --- 
a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,12 +12,12 @@ RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 lib && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# Inserted to use caches +ENV updated-on 29-March-21 + # Install mmcv-full RUN pip install mmcv-full==latest+torch1.6.0+cu101 -f https://download.openmmlab.com/mmcv/dist/index.html -# Inserted to use caches -ENV updated-on 26-March-21 - # Install MMAction2 RUN conda clean --all RUN git clone https://github.com/open-mmlab/mmaction2.git /mmaction2 diff --git a/docker/run.sh b/docker/run.sh index 139601e2ff..bb61fb66df 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -5,6 +5,10 @@ docker run --rm \ --privileged \ --volume="/dev:/dev" \ --volume="/mnt/hdd/video/household:/mmaction2/data/household" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/tools/data:/mmaction2/tools/data" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/configs/recognition/arr_tsm:/mmaction2/configs/recognition/arr_tsm" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/work_dirs:/mmaction2/work_dirs" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/mmaction/datasets:/mmaction2/mmaction/datasets" \ --runtime=nvidia \ --device /dev/snd:/dev/snd \ -e DISPLAY=$DISPLAY \ diff --git a/mmaction/datasets/rawframe_dataset.py b/mmaction/datasets/rawframe_dataset.py index 6c632851f2..be6fd83a88 100644 --- a/mmaction/datasets/rawframe_dataset.py +++ b/mmaction/datasets/rawframe_dataset.py @@ -93,7 +93,7 @@ def __init__(self, with_offset=False, multi_class=False, num_classes=None, - start_index=1, + start_index=0,  # was 1 modality='RGB', sample_by_class=False, power=None): diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py index b0c22a074b..0fa5121f51 100644 --- a/tools/data/build_file_list.py +++ b/tools/data/build_file_list.py @@ -155,8 +155,10 @@ def build_list(split): return rgb_list, flow_list train_rgb_list, train_flow_list = build_list(splits[0]) - test_rgb_list, test_flow_list = build_list(splits[1]) - return (train_rgb_list, test_rgb_list), (train_flow_list, test_flow_list) + valid_rgb_list, valid_flow_list = build_list(splits[1]) + test_rgb_list, test_flow_list = build_list(splits[2]) + return ((train_rgb_list, valid_rgb_list, test_rgb_list), + (train_flow_list, valid_flow_list, test_flow_list)) def main(): diff --git a/tools/data/household/generate_rawframes_filelist.sh b/tools/data/household/generate_rawframes_filelist.sh index 00c8d913fc..3b570231e7 100644 --- a/tools/data/household/generate_rawframes_filelist.sh +++ b/tools/data/household/generate_rawframes_filelist.sh @@ -3,6 +3,8 @@ cd ../../../ PYTHONPATH=. python tools/data/build_file_list.py household data/household/rawframes/ --num-split 1 --level 2 --subset train --format rawframes --shuffle PYTHONPATH=. python tools/data/build_file_list.py household data/household/rawframes/ --num-split 1 --level 2 --subset val --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py household data/household/rawframes/ --num-split 1 --level 2 --subset test --format rawframes --shuffle + echo "Filelist for rawframes generated." cd tools/data/household/ diff --git a/tools/data/household/generate_videos_filelist.sh b/tools/data/household/generate_videos_filelist.sh index ee5e56860f..e50bb4dec0 100644 --- a/tools/data/household/generate_videos_filelist.sh +++ b/tools/data/household/generate_videos_filelist.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash cd ../../../ -PYTHONPATH=.
python tools/data/build_file_list.py household data/household/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle -PYTHONPATH=. python tools/data/build_file_list.py household data/household/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py household data/household/videos/ --num-split 1 --level 2 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py household data/household/videos/ --num-split 1 --level 2 --subset val --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py household data/household/videos/ --num-split 1 --level 2 --subset test --format videos --shuffle echo "Filelist for videos generated." cd tools/data/household/ diff --git a/tools/data/parse_file_list.py b/tools/data/parse_file_list.py index a937d5cae9..d9bc292fe6 100644 --- a/tools/data/parse_file_list.py +++ b/tools/data/parse_file_list.py @@ -290,11 +290,11 @@ def parse_household_splits(level): """ # Read the annotations # yapf: disable - class_index_file = 'data/sthv2/annotations/shousehold-labels.json' # noqa + class_index_file = 'data/household/annotations/household-labels.json' # noqa # yapf: enable - train_file = 'data/sthv2/annotations/household-train.json' - val_file = 'data/sthv2/annotations/household-validation.json' - test_file = 'data/sthv2/annotations/household-test.json' + train_file = 'data/household/annotations/household-train.json' + val_file = 'data/household/annotations/household-validation.json' + test_file = 'data/household/annotations/household-test.json' with open(class_index_file, 'r') as fin: class_mapping = json.loads(fin.read()) @@ -309,9 +309,7 @@ def line_to_map(item, test_mode=False): if test_mode: return video else: - template = item['template'].replace('[', '') - template = template.replace(']', '') - label = int(class_mapping[template]) + label = int(class_mapping[item['label']]) return video, label with open(train_file, 'r') as fin: @@ -324,7 +322,7 @@ with open(test_file, 'r') as fin: items = json.loads(fin.read()) - test_list = [line_to_map(item, test_mode=True) for item in items] + test_list = [line_to_map(item) for item in items] splits = ((train_list, val_list, test_list), ) return splits
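
With `parse_household_splits` now indexing `class_mapping` directly via `item['label']` and treating the test split the same as train/val, a malformed annotation file only surfaces as a `KeyError` mid-run. Below is a minimal pre-flight check, assuming only the annotation paths and the `label` field visible in the diff above; the script name and structure are a hypothetical helper, not part of this patch.

```python
# check_household_annotations.py -- hypothetical helper, not part of this patch
import json

class_index_file = 'data/household/annotations/household-labels.json'
split_files = [
    'data/household/annotations/household-train.json',
    'data/household/annotations/household-validation.json',
    'data/household/annotations/household-test.json',
]

with open(class_index_file, 'r') as fin:
    class_mapping = json.loads(fin.read())

for split_file in split_files:
    with open(split_file, 'r') as fin:
        items = json.loads(fin.read())
    # Mirror line_to_map(): every entry must carry a 'label' key that is
    # present in the class mapping, otherwise list building will crash later.
    unknown = [item for item in items if item.get('label') not in class_mapping]
    print(f'{split_file}: {len(items)} entries, {len(unknown)} unknown labels')
```

Running this from the repository root before invoking the file-list scripts above catches label mismatches early.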