From 284a8460c8d2ba73aae97f00abfd2f58586f7070 Mon Sep 17 00:00:00 2001 From: Naoki-Wake Date: Mon, 5 Apr 2021 19:27:29 +0900 Subject: [PATCH] enable pretraining using sthv2 --- .gitignore | 1 + .../_base_/models/household_addlayer_r50.py | 21 ++++ .../models/household_nonaddlayer_r50.py | 21 ++++ ...m_mobilenetv2_1x1x8_50e_kinetics400_rgb.py | 92 -------------- ..._video_dense_1x1x8_100e_kinetics400_rgb.py | 97 --------------- ...erence_dense_1x1x8_100e_kinetics400_rgb.py | 34 ------ ...t_product_r50_1x1x8_50e_kinetics400_rgb.py | 97 --------------- ..._gaussian_r50_1x1x8_50e_kinetics400_rgb.py | 97 --------------- ..._gaussian_r50_1x1x8_50e_kinetics400_rgb.py | 97 --------------- .../arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py | 7 -- .../arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py | 88 -------------- .../tsm_r50_1x1x16_50e_kinetics400_rgb.py | 96 --------------- .../tsm_r50_1x1x8_50e_household_rgb.py | 11 +- ...0e_household_rgb_usepretrain_add_layer.py} | 34 +++--- ...household_rgb_usepretrain_nonadd_layer.py} | 31 ++--- .../tsm_r50_1x1x8_50e_kinetics400_rgb.py | 88 -------------- .../arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py | 93 -------------- .../tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py | 114 ----------------- ...sm_r50_dense_1x1x8_100e_kinetics400_rgb.py | 88 -------------- .../tsm_r50_flip_1x1x8_50e_sthv1_rgb.py | 97 --------------- ...50_flip_randaugment_1x1x8_50e_sthv1_rgb.py | 98 --------------- ...gpu_normalize_1x1x8_50e_kinetics400_rgb.py | 94 -------------- .../tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py | 113 ----------------- ...tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py | 94 -------------- ...tsm_r50_video_1x1x8_50e_kinetics400_rgb.py | 95 --------------- ...eo_inference_1x1x8_100e_kinetics400_rgb.py | 31 ----- ...oral_pool_r50_1x1x8_50e_kinetics400_rgb.py | 8 -- docker/run.sh | 7 +- mmaction/models/heads/__init__.py | 4 +- .../models/heads/household_head_addlayer.py | 115 ++++++++++++++++++ .../heads/household_head_nonaddlayer.py | 112 +++++++++++++++++ tools/data/build_file_list.py | 26 ++-- tools/data/parse_file_list.py | 64 ++++++++++ .../generate_rawframes_filelist.sh | 10 ++ .../generate_videos_filelist.sh | 8 ++ 35 files changed, 418 insertions(+), 1765 deletions(-) create mode 100644 configs/_base_/models/household_addlayer_r50.py create mode 100644 configs/_base_/models/household_nonaddlayer_r50.py delete mode 100644 configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py rename configs/recognition/arr_tsm/{tsm_r50_1x1x16_50e_sthv1_rgb.py => tsm_r50_1x1x8_50e_household_rgb_usepretrain_add_layer.py} (74%) rename configs/recognition/arr_tsm/{tsm_r50_1x1x16_50e_sthv2_rgb.py => tsm_r50_1x1x8_50e_household_rgb_usepretrain_nonadd_layer.py} (74%) delete mode 100644 configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py 
delete mode 100644 configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py delete mode 100644 configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py create mode 100644 mmaction/models/heads/household_head_addlayer.py create mode 100644 mmaction/models/heads/household_head_nonaddlayer.py create mode 100644 tools/data/sthv2_pretrain/generate_rawframes_filelist.sh create mode 100644 tools/data/sthv2_pretrain/generate_videos_filelist.sh diff --git a/.gitignore b/.gitignore index b01921b39f..2c025068dd 100644 --- a/.gitignore +++ b/.gitignore @@ -113,6 +113,7 @@ venv.bak/ *.log.json benchlist.txt work_dirs/ +work_dirs_analyses/ # Pytorch *.pth diff --git a/configs/_base_/models/household_addlayer_r50.py b/configs/_base_/models/household_addlayer_r50.py new file mode 100644 index 0000000000..6512063970 --- /dev/null +++ b/configs/_base_/models/household_addlayer_r50.py @@ -0,0 +1,21 @@ +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='HOUSEHOLDHead_ADDLAYER', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True), + # model training and testing settings + train_cfg=None, + test_cfg=dict(average_clips='prob')) diff --git a/configs/_base_/models/household_nonaddlayer_r50.py b/configs/_base_/models/household_nonaddlayer_r50.py new file mode 100644 index 0000000000..61106d713b --- /dev/null +++ b/configs/_base_/models/household_nonaddlayer_r50.py @@ -0,0 +1,21 @@ +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='HOUSEHOLDHead_NONADDLAYER', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True), + # model training and testing settings + train_cfg=None, + test_cfg=dict(average_clips='prob')) diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index 647cb609ac..0000000000 --- a/configs/recognition/arr_tsm/tsm_mobilenetv2_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,92 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_mobilenet_v2.py', - '../../_base_/schedules/sgd_tsm_mobilenet_v2_50e.py', - '../../_base_/default_runtime.py' -] - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 
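The diffstat also adds mmaction/models/heads/household_head_addlayer.py and household_head_nonaddlayer.py, whose bodies fall outside this excerpt; the two `_base_` model files just shown only reference them by registry type. Going by those configs, a head of roughly the following shape would satisfy them — a minimal sketch against mmaction2's v0.x head API (the `HEADS` registry and `TSMHead` base class), not the committed implementation; `hidden_dim` and `fc_extra` are invented names, and the NONADDLAYER twin would presumably be the same head without the extra layer.

import torch
import torch.nn as nn
from mmcv.cnn import normal_init

from ..registry import HEADS
from .tsm_head import TSMHead


@HEADS.register_module()
class HOUSEHOLDHead_ADDLAYER(TSMHead):
    """TSM head with one extra FC layer before the classifier, so the
    pretrained backbone transfers while the new layers train from scratch.

    Hypothetical reconstruction -- the real file is added by this patch
    but not shown in this excerpt.
    """

    def __init__(self, num_classes, in_channels, hidden_dim=512, **kwargs):
        super().__init__(num_classes, in_channels, **kwargs)
        self.fc_extra = nn.Linear(in_channels, hidden_dim)  # the added layer
        self.relu = nn.ReLU(inplace=True)
        # Rebuild the classifier on top of the new hidden dimension.
        self.fc_cls = nn.Linear(hidden_dim, num_classes)

    def init_weights(self):
        super().init_weights()
        normal_init(self.fc_extra, std=self.init_std)

    def forward(self, x, num_segs):
        # x: [N * num_segments, in_channels, H, W]
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        if self.dropout is not None:
            x = self.dropout(x)
        x = self.relu(self.fc_extra(x))
        cls_score = self.fc_cls(x)
        # Fold the segment axis back out and average it (AvgConsensus).
        cls_score = cls_score.view((-1, self.num_segments) +
                                   cls_score.size()[1:])
        return self.consensus(cls_score).squeeze(1)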
'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='ThreeCrop', crop_size=256), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict( - lr=0.01, # this lr is used for 8 gpus -) - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = './work_dirs/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py deleted file mode 100644 index e0a3c4873b..0000000000 --- a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_dense_1x1x8_100e_kinetics400_rgb.py +++ /dev/null @@ -1,97 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_mobilenet_v2.py', - '../../_base_/schedules/sgd_tsm_mobilenet_v2_100e.py', - '../../_base_/default_runtime.py' -] - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/videos_train' -data_root_val = 'data/kinetics400/videos_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - 
dict(type='DecordInit'), - dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='ThreeCrop', crop_size=256), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict( - lr=0.01, # this lr is used for 8 gpus -) - -# runtime settings -checkpoint_config = dict(interval=5) -work_dir = './work_dirs/tsm_mobilenetv2_dense_video_1x1x8_100e_kinetics400_rgb/' # noqa diff --git a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py deleted file mode 100644 index a66c772a9b..0000000000 --- a/configs/recognition/arr_tsm/tsm_mobilenetv2_video_inference_dense_1x1x8_100e_kinetics400_rgb.py +++ /dev/null @@ -1,34 +0,0 @@ -_base_ = ['../../_base_/models/tsm_mobilenet_v2.py'] - -# dataset settings -dataset_type = 'VideoDataset' -data_root_val = 'data/kinetics400/videos_val' -ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] - -data = dict( - videos_per_gpu=4, - 
workers_per_gpu=4, - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) diff --git a/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index a85104da17..0000000000 --- a/configs/recognition/arr_tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,97 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict( - backbone=dict( - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=False, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='dot_product'))) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -work_dir = './work_dirs/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py 
b/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index c501dcd6d0..0000000000 --- a/configs/recognition/arr_tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,97 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict( - backbone=dict( - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=False, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='embedded_gaussian'))) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -work_dir = './work_dirs/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/' # noqa: E501 diff --git a/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index 8713150fda..0000000000 --- 
a/configs/recognition/arr_tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,97 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict( - backbone=dict( - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=False, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='gaussian'))) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -work_dir = './work_dirs/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index 02c43a3808..0000000000 --- a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = ['./tsm_r50_1x1x8_50e_sthv1_rgb.py'] - -# model settings -model = dict(backbone=dict(pretrained='torchvision://resnet101', depth=101)) - -# runtime settings -work_dir = 
'./work_dirs/tsm_r101_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py b/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py deleted file mode 100644 index a4c5ce7d41..0000000000 --- a/configs/recognition/arr_tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py +++ /dev/null @@ -1,88 +0,0 @@ -_base_ = ['./tsm_r50_1x1x8_50e_sthv2_rgb.py'] - -# model settings -model = dict(backbone=dict(pretrained='torchvision://resnet101', depth=101)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv2/rawframes' -data_root_val = 'data/sthv2/rawframes' -ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt' -ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt' -ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict( - lr=0.01, # this lr is used for 8 gpus -) -# runtime settings -work_dir = './work_dirs/tsm_r101_1x1x8_50e_sthv2_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py deleted file mode 100644 index c53bee6c09..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py +++ /dev/null @@ -1,96 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict(backbone=dict(num_segments=16), cls_head=dict(num_segments=16)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 
'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=16, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=16, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=6, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict( - lr=0.0075, # this lr is used for 8 gpus -) - -# runtime settings -checkpoint_config = dict(interval=5) -work_dir = './work_dirs/tsm_r50_1x1x16_50e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py index 821d717738..33d3ee9c83 100644 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py +++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py @@ -1,11 +1,14 @@ +#_base_ = [ +# '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', +# '../../_base_/default_runtime.py' +#] _base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/models/household_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' ] - # model settings -model = dict(cls_head=dict(num_classes=174)) - +model = dict(cls_head=dict(num_classes=26))#174 +load_from = '/mmaction2/pretrained_models/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth' # model path can be found in model zoo # dataset 
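The hunk above is the heart of the change: the household config now inherits the household model base, shrinks the classifier from 174 Something-Something classes to the 26 household classes, and warm-starts from an sthv1-pretrained TSM checkpoint via `load_from`. As a rough sketch of how mmcv assembles this (its recursive `_base_` merge keeps every cls_head key except the one overridden here):

from mmcv import Config

# `_base_` files are read first, then the local
# `model = dict(cls_head=dict(num_classes=26))` is merged into them.
cfg = Config.fromfile(
    'configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb.py')

print(cfg.model['cls_head']['num_classes'])  # 26 (overrides the base value)
print(cfg.load_from)  # the sthv1 TSM checkpoint this run fine-tunes from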
settings dataset_type = 'RawframeDataset' data_root = 'data/household/rawframes' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_add_layer.py similarity index 74% rename from configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py rename to configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_add_layer.py index 590a8d1ac9..71ea03ff77 100644 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py +++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_add_layer.py @@ -1,24 +1,25 @@ +#_base_ = [ +# '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', +# '../../_base_/default_runtime.py' +#] _base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/models/household_addlayer_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' ] - # model settings -model = dict( - backbone=dict(num_segments=16), - cls_head=dict(num_classes=174, num_segments=16)) - +model = dict(cls_head=dict(num_classes=26))#174 +load_from = '/mmaction2/pretrained_models/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth' # model path can be found in model zoo # dataset settings dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +data_root = 'data/household/rawframes' +data_root_val = 'data/household/rawframes' +ann_file_train = 'data/household/household_train_list_rawframes.txt' +ann_file_val = 'data/household/household_val_list_rawframes.txt' +ann_file_test = 'data/household/household_test_list_rawframes.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), dict( @@ -39,7 +40,7 @@ type='SampleFrames', clip_len=1, frame_interval=1, - num_clips=16, + num_clips=8, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), @@ -54,7 +55,7 @@ type='SampleFrames', clip_len=1, frame_interval=1, - num_clips=16, + num_clips=8, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), @@ -71,19 +72,16 @@ type=dataset_type, ann_file=ann_file_train, data_prefix=data_root, - filename_tmpl='{:05}.jpg', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=ann_file_val, data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', pipeline=val_pipeline), test=dict( type=dataset_type, ann_file=ann_file_test, data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', pipeline=test_pipeline)) evaluation = dict( interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) @@ -94,4 +92,4 @@ weight_decay=0.0005) # runtime settings -work_dir = './work_dirs/tsm_r50_1x1x16_50e_sthv1_rgb/' +work_dir = './work_dirs/tsm_r50_1x1x8_50e_household_rgb_usepretrain_addlayer/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_nonadd_layer.py similarity index 74% rename from configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py rename to 
configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_nonadd_layer.py index 04bd982d77..0ea57a451a 100644 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py +++ b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_household_rgb_usepretrain_nonadd_layer.py @@ -1,24 +1,25 @@ +#_base_ = [ +# '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', +# '../../_base_/default_runtime.py' +#] _base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/models/household_nonaddlayer_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' ] - # model settings -model = dict( - backbone=dict(num_segments=16), - cls_head=dict(num_classes=174, num_segments=16)) - +model = dict(cls_head=dict(num_classes=26))#174 +load_from = '/mmaction2/pretrained_models/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth' # model path can be found in model zoo # dataset settings dataset_type = 'RawframeDataset' -data_root = 'data/sthv2/rawframes' -data_root_val = 'data/sthv2/rawframes' -ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt' -ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt' -ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt' +data_root = 'data/household/rawframes' +data_root_val = 'data/household/rawframes' +ann_file_train = 'data/household/household_train_list_rawframes.txt' +ann_file_val = 'data/household/household_val_list_rawframes.txt' +ann_file_test = 'data/household/household_test_list_rawframes.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), dict( @@ -39,7 +40,7 @@ type='SampleFrames', clip_len=1, frame_interval=1, - num_clips=16, + num_clips=8, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), @@ -54,7 +55,7 @@ type='SampleFrames', clip_len=1, frame_interval=1, - num_clips=16, + num_clips=8, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), @@ -91,4 +92,4 @@ weight_decay=0.0005) # runtime settings -work_dir = './work_dirs/tsm_r50_1x1x16_50e_sthv2_rgb/' +work_dir = './work_dirs/tsm_r50_1x1x8_50e_household_rgb_usepretrain_nonaddlayer/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index ec8ef9678e..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,88 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - 
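Both `usepretrain` variants above point `load_from` at a checkpoint whose classifier was trained for 174 classes while their heads now emit 26; mmcv loads `load_from` checkpoints non-strictly, so shape-mismatched tensors are reported and left at their fresh initialization rather than raising. A quick, hedged way to preview which head weights will actually transfer before launching training:

import torch

# Checkpoint path as used in the configs above.
ckpt = '/mmaction2/pretrained_models/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth'
state = torch.load(ckpt, map_location='cpu')
state = state.get('state_dict', state)  # mmaction checkpoints wrap the weights

for name, tensor in state.items():
    if 'cls_head' in name:
        # Expect fc_cls shaped (174, 2048) / (174,): incompatible with a
        # 26-way head, so it is skipped and re-learned on household data.
        print(name, tuple(tensor.shape))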
input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -checkpoint_config = dict(interval=5) -work_dir = './work_dirs/tsm_r50_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index 4967fa23ac..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,93 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict(cls_head=dict(num_classes=174)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - 
dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index abf672adc2..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,114 +0,0 @@ -_base_ = [ - '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' -] - -# model settings -# model settings# model settings -model = dict( - type='Recognizer2D', - backbone=dict( - type='ResNetTSM', - pretrained='torchvision://resnet50', - depth=50, - norm_eval=False, - shift_div=8), - cls_head=dict( - type='TSMHead', - num_classes=174, - in_channels=2048, - spatial_type='avg', - consensus=dict(type='AvgConsensus', dim=1), - dropout_ratio=0.5, - init_std=0.001, - is_shift=True), - # model training and testing settings - train_cfg=dict( - blending=dict(type='CutmixBlending', num_classes=174, alpha=.2)), - test_cfg=dict(average_clips='prob')) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - 
dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - twice_sample=True, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='ThreeCrop', crop_size=256), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py deleted file mode 100644 index 5f81caa280..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py +++ /dev/null @@ -1,88 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_100e.py', - '../../_base_/default_runtime.py' -] - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - 
dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - val_dataloader=dict(videos_per_gpu=4), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -work_dir = './work_dirs/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index 00f40cbd58..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,97 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict(cls_head=dict(num_classes=174)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' - -sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) - -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - 
filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_flip_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index d03ba632b4..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,98 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict(cls_head=dict(num_classes=174)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' - -sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) - -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), - dict(type='Imgaug', transforms='default'), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - 
filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index 803d793223..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,94 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -module_hooks = [ - dict( - type='GPUNormalize', - hooked_module='backbone', - hook_pos='forward_pre', - input_format='NCHW', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375]) -] - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/kinetics400/rawframes_train' -data_root_val = 'data/kinetics400/rawframes_val' -ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' - -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# runtime settings -checkpoint_config = dict(interval=5) -work_dir = './work_dirs/tsm_r50_gpu_normalize_1x1x8_100e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index 73d6321081..0000000000 
--- a/configs/recognition/arr_tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,113 +0,0 @@ -_base_ = [ - '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py' -] - -# model settings -model = dict( - type='Recognizer2D', - backbone=dict( - type='ResNetTSM', - pretrained='torchvision://resnet50', - depth=50, - norm_eval=False, - shift_div=8), - cls_head=dict( - type='TSMHead', - num_classes=174, - in_channels=2048, - spatial_type='avg', - consensus=dict(type='AvgConsensus', dim=1), - dropout_ratio=0.5, - init_std=0.001, - is_shift=True), - # model training and testing settings - train_cfg=dict( - blending=dict(type='MixupBlending', num_classes=174, alpha=.2)), - test_cfg=dict(average_clips='prob')) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - twice_sample=True, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='ThreeCrop', crop_size=256), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py b/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py deleted file mode 100644 index 448908c5b4..0000000000 --- 
a/configs/recognition/arr_tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py +++ /dev/null @@ -1,94 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# model settings -model = dict(cls_head=dict(num_classes=174)) - -# dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Imgaug', transforms='default'), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - filename_tmpl='{:05}.jpg', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - filename_tmpl='{:05}.jpg', - pipeline=test_pipeline)) -evaluation = dict( - interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(weight_decay=0.0005) - -# runtime settings -work_dir = './work_dirs/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index 0a172d0205..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,95 +0,0 @@ -_base_ = [ - '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', - '../../_base_/default_runtime.py' -] - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/videos_train' -data_root_val = 'data/kinetics400/videos_val' -ann_file_train = 
'data/kinetics400/kinetics400_train_list_videos.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict( - type='MultiScaleCrop', - input_size=224, - scales=(1, 0.875, 0.75, 0.66), - random_crop=False, - max_wh_scale_gap=1, - num_fixed_crops=13), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict( - lr=0.02, # this lr is used for 8 gpus -) -# runtime settings -checkpoint_config = dict(interval=5) -work_dir = './work_dirs/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb/' diff --git a/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py deleted file mode 100644 index 7c355ade2f..0000000000 --- a/configs/recognition/arr_tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py +++ /dev/null @@ -1,31 +0,0 @@ -_base_ = ['../../_base_/models/tsm_r50.py'] - -# dataset settings -dataset_type = 'VideoDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -test_pipeline = [ - dict(type='DecordInit', num_threads=1), - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs'], meta_keys=[]), - 
dict(type='ToTensor', keys=['imgs']) -] - -data = dict( - videos_per_gpu=1, - workers_per_gpu=2, - test=dict( - type=dataset_type, - ann_file=None, - data_prefix=None, - pipeline=test_pipeline)) diff --git a/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py deleted file mode 100644 index 2984d37968..0000000000 --- a/configs/recognition/arr_tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./tsm_r50_1x1x8_50e_kinetics400_rgb.py'] - -# model settings -model = dict( - backbone=dict(temporal_pool=True), cls_head=dict(temporal_pool=True)) - -# runtime settings -work_dir = './work_dirs/tsm_temporal_pool_r50_1x1x8_100e_kinetics400_rgb/' diff --git a/docker/run.sh b/docker/run.sh index bb61fb66df..8532996874 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -4,11 +4,15 @@ docker run --rm \ --network=host \ --privileged \ --volume="/dev:/dev" \ - --volume="/mnt/hdd/video/household:/mmaction2/data/household" \ + --volume="/mnt/ssd_2T/video/sthv2:/mmaction2/data/sthv2" \ + --volume="/mnt/ssd_2T/video/household:/mmaction2/data/household" \ --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/tools/data:/mmaction2/tools/data" \ --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/configs/recognition/arr_tsm:/mmaction2/configs/recognition/arr_tsm" \ --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/work_dirs:/mmaction2/work_dirs" \ --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/mmaction/datasets:/mmaction2/mmaction/datasets" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/configs/_base_/models:/mmaction2/configs/_base_/models" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/mmaction/models/heads:/mmaction2/mmaction/models/heads" \ + --volume="/home/ubuntu18/Codes/actionrecognition/mmaction2/pretrained_models:/mmaction2/pretrained_models" \ --runtime=nvidia \ --device /dev/snd:/dev/snd \ -e DISPLAY=$DISPLAY \ @@ -16,3 +20,4 @@ docker run --rm \ --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ -it naoki:mmaction2_actionrecognition xhost +local:docker +#--volume="/mnt/hdd/video/household:/mmaction2/data/household" \ \ No newline at end of file diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index f21c89d44c..7309639cd5 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -11,9 +11,11 @@ from .tsm_head import TSMHead from .tsn_head import TSNHead from .x3d_head import X3DHead +from .household_head_nonaddlayer import HOUSEHOLDHead_NONADDLAYER +from .household_head_addlayer import HOUSEHOLDHead_ADDLAYER __all__ = [ 'TSNHead', 'I3DHead', 'BaseHead', 'TSMHead', 'SlowFastHead', 'SSNHead', 'TPNHead', 'AudioTSNHead', 'X3DHead', 'BBoxHeadAVA', 'AVARoIHead', - 'FBOHead', 'LFBInferHead' + 'FBOHead', 'LFBInferHead', 'HOUSEHOLDHead_ADDLAYER', 'HOUSEHOLDHead_NONADDLAYER' ] diff --git a/mmaction/models/heads/household_head_addlayer.py b/mmaction/models/heads/household_head_addlayer.py new file mode 100644 index 0000000000..d8d3ef2bb6 --- /dev/null +++ b/mmaction/models/heads/household_head_addlayer.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn +from mmcv.cnn import normal_init + +from ..registry import HEADS +from .base import AvgConsensus, BaseHead + + +@HEADS.register_module() +class HOUSEHOLDHead_ADDLAYER(BaseHead): + """Class head for HOUSEHOLD on top of TSM. 
+ + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + num_segments (int): Number of frame segments. Default: 8. + loss_cls (dict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + consensus (dict): Consensus config dict. + dropout_ratio (float): Probability of dropout layer. Default: 0.8. + init_std (float): Std value for initialization. Default: 0.001. + is_shift (bool): Indicating whether the feature is shifted. + Default: True. + temporal_pool (bool): Indicating whether feature is temporal pooled. + Default: False. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes, + in_channels, + num_segments=8, + loss_cls=dict(type='CrossEntropyLoss'), + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.8, + init_std=0.001, + is_shift=True, + temporal_pool=False, + **kwargs): + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.default_class = 174  # 174-way sthv2 classifier; matches the pretrained checkpoint + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.num_segments = num_segments + self.init_std = init_std + self.is_shift = is_shift + self.temporal_pool = temporal_pool + + consensus_ = consensus.copy() + + consensus_type = consensus_.pop('type') + if consensus_type == 'AvgConsensus': + self.consensus = AvgConsensus(**consensus_) + else: + self.consensus = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + + self.fc_cls = nn.Linear(self.in_channels, self.default_class) + self.fc_cls_household = nn.Linear(self.default_class, self.num_classes) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d(1) + else: + self.avg_pool = None + + def init_weights(self): + """Initialize the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + normal_init(self.fc_cls_household, std=self.init_std) + + def forward(self, x, num_segs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + num_segs (int): Unused in this head. By default, `num_segs` + is equal to `clip_len * num_clips * num_crops`, which is + automatically generated in Recognizer forward phase and + unused in TSM models. The `self.num_segments` we need is a + hyperparameter to build TSM models. + Returns: + torch.Tensor: The classification scores for input samples.
+ """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = torch.flatten(x, 1) + # [N * num_segs, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_segs, num_classes] + x = self.fc_cls(x) + cls_score = self.fc_cls_household(x) + + if self.is_shift and self.temporal_pool: + # [2 * N, num_segs // 2, num_classes] + cls_score = cls_score.view((-1, self.num_segments // 2) + + cls_score.size()[1:]) + else: + # [N, num_segs, num_classes] + cls_score = cls_score.view((-1, self.num_segments) + + cls_score.size()[1:]) + # [N, 1, num_classes] + cls_score = self.consensus(cls_score) + # [N, num_classes] + return cls_score.squeeze(1) diff --git a/mmaction/models/heads/household_head_nonaddlayer.py b/mmaction/models/heads/household_head_nonaddlayer.py new file mode 100644 index 0000000000..b97eaf509a --- /dev/null +++ b/mmaction/models/heads/household_head_nonaddlayer.py @@ -0,0 +1,112 @@ +import torch +import torch.nn as nn +from mmcv.cnn import normal_init + +from ..registry import HEADS +from .base import AvgConsensus, BaseHead + + +@HEADS.register_module() +class HOUSEHOLDHead_NONADDLAYER(BaseHead): + """Class head for HOUSEHOLD on top of TSM. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + num_segments (int): Number of frame segments. Default: 8. + loss_cls (dict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + consensus (dict): Consensus config dict. + dropout_ratio (float): Probability of dropout layer. Default: 0.4. + init_std (float): Std value for Initiation. Default: 0.01. + is_shift (bool): Indicating whether the feature is shifted. + Default: True. + temporal_pool (bool): Indicating whether feature is temporal pooled. + Default: False. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes, + in_channels, + num_segments=8, + loss_cls=dict(type='CrossEntropyLoss'), + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.8, + init_std=0.001, + is_shift=True, + temporal_pool=False, + **kwargs): + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.default_class = 174 + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.num_segments = num_segments + self.init_std = init_std + self.is_shift = is_shift + self.temporal_pool = temporal_pool + + consensus_ = consensus.copy() + + consensus_type = consensus_.pop('type') + if consensus_type == 'AvgConsensus': + self.consensus = AvgConsensus(**consensus_) + else: + self.consensus = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + + self.fc_cls = nn.Linear(self.in_channels, self.default_class) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d(1) + else: + self.avg_pool = None + + def init_weights(self): + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x, num_segs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + num_segs (int): Useless in TSMHead. 
By default, `num_segs` + is equal to `clip_len * num_clips * num_crops`, which is + automatically generated in Recognizer forward phase and + unused in TSM models. The `self.num_segments` we need is a + hyperparameter to build TSM models. + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = torch.flatten(x, 1) + # [N * num_segs, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_segs, default_class] + cls_score = self.fc_cls(x) + + if self.is_shift and self.temporal_pool: + # [2 * N, num_segs // 2, default_class] + cls_score = cls_score.view((-1, self.num_segments // 2) + + cls_score.size()[1:]) + else: + # [N, num_segs, default_class] + cls_score = cls_score.view((-1, self.num_segments) + + cls_score.size()[1:]) + # [N, 1, default_class] + cls_score = self.consensus(cls_score) + # [N, default_class] + return cls_score.squeeze(1) diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py index 0fa5121f51..dc11eaff4c 100644 --- a/tools/data/build_file_list.py +++ b/tools/data/build_file_list.py @@ -11,7 +11,7 @@ parse_kinetics_splits, parse_mit_splits, parse_mmit_splits, parse_sthv1_splits, parse_sthv2_splits, - parse_household_splits, + parse_household_splits, parse_sthv2_pretrain_splits, parse_ucf101_splits) @@ -22,7 +22,7 @@ def parse_args(): type=str, choices=[ 'ucf101', 'kinetics400', 'kinetics600', 'kinetics700', 'thumos14', - 'sthv1', 'sthv2', 'mit', 'mmit', 'activitynet', 'hmdb51', 'jester', 'household' + 'sthv1', 'sthv2', 'mit', 'mmit', 'activitynet', 'hmdb51', 'jester', 'household', 'sthv2_pretrain' ], help='dataset to be built file list') parser.add_argument( @@ -84,7 +84,7 @@ def parse_args(): return args -def build_file_list(splits, frame_info, shuffle=False): +def build_file_list(splits, frame_info, shuffle=False, mode=None): """Build file list for a certain data split. 
Args: @@ -156,8 +156,12 @@ def build_list(split): train_rgb_list, train_flow_list = build_list(splits[0]) valid_rgb_list, valid_flow_list = build_list(splits[1]) - test_rgb_list, test_flow_list = build_list(splits[2]) - return (train_rgb_list, valid_rgb_list, test_rgb_list), (train_flow_list, valid_flow_list, test_flow_list) + # household provides an explicit test split; other datasets only train/val + if mode == 'household': + test_rgb_list, test_flow_list = build_list(splits[2]) + return (train_rgb_list, valid_rgb_list, test_rgb_list), (train_flow_list, valid_flow_list, test_flow_list) + else: + return (train_rgb_list, valid_rgb_list), (train_flow_list, valid_flow_list) @@ -197,6 +201,8 @@ def main(): splits = parse_sthv1_splits(args.level) elif args.dataset == 'sthv2': splits = parse_sthv2_splits(args.level) + elif args.dataset == 'sthv2_pretrain': + splits = parse_sthv2_pretrain_splits(args.level) elif args.dataset == 'household': splits = parse_household_splits(args.level) elif args.dataset == 'mit': @@ -212,13 +218,14 @@ def main(): else: raise ValueError( f"Supported datasets are 'ucf101, sthv1, sthv2', 'jester', " - f"'mmit', 'mit', 'kinetics400', 'kinetics600', 'kinetics700', 'household', but " + f"'mmit', 'mit', 'kinetics400', 'kinetics600', 'kinetics700', 'household', 'sthv2_pretrain', but " f'got {args.dataset}') assert len(splits) == args.num_split out_path = args.out_root_path + args.dataset - + if args.dataset == 'sthv2_pretrain': + out_path = args.out_root_path + 'sthv2' if len(splits) > 1: for i, split in enumerate(splits): file_lists = build_file_list( @@ -240,7 +247,7 @@ def main(): with open(osp.join(out_path, val_name), 'w') as f: json.dump(val_list, f) else: - lists = build_file_list(splits[0], frame_info, shuffle=args.shuffle) + lists = build_file_list(splits[0], frame_info, shuffle=args.shuffle, mode=args.dataset) if args.subset == 'train': ind = 0 @@ -253,6 +260,9 @@ def main(): f'but got {args.subset}.') filename = f'{args.dataset}_{args.subset}_list_{args.format}.txt' + if args.dataset == 'sthv2_pretrain': + filename = f'sthv2_{args.subset}_list_{args.format}.txt' + if args.output_format == 'txt': with open(osp.join(out_path, filename), 'w') as f: f.writelines(lists[0][ind]) diff --git a/tools/data/parse_file_list.py b/tools/data/parse_file_list.py index d9bc292fe6..ddc08ad2e9 100644 --- a/tools/data/parse_file_list.py +++ b/tools/data/parse_file_list.py @@ -278,6 +278,70 @@ def line_to_map(item, test_mode=False): splits = ((train_list, val_list, test_list), ) return splits +def parse_sthv2_pretrain_splits(level): + """Parse Something-Something dataset V2 into "train", "val", "test" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Something-Something V2 dataset. 
+ """ + # Read the annotations + # yapf: disable + class_index_remap_file = 'data/sthv2/annotations/something-something-v2-labels_remap.json' # noqa + class_index_file = 'data/sthv2/annotations/something-something-v2-labels.json' # noqa + # yapf: enable + train_file = 'data/sthv2/annotations/something-something-v2-train.json' + val_file = 'data/sthv2/annotations/something-something-v2-validation.json' + test_file = 'data/sthv2/annotations/something-something-v2-test.json' + + with open(class_index_file, 'r') as fin: + class_mapping = json.loads(fin.read()) + + with open(class_index_remap_file, 'r') as fin: + class_mapping_remap = json.loads(fin.read()) + + def line_to_map(item, test_mode=False): + video = item['id'] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + else: + template = item['template'].replace('[', '') + template = template.replace(']', '') + #label = int(class_mapping[template]) + orig_label = class_mapping[template] + if orig_label in class_mapping_remap.keys(): + label = int(class_mapping_remap[orig_label]) + return video, label + else: + return None + + + with open(train_file, 'r') as fin: + items = json.loads(fin.read()) + train_list = [line_to_map(item) for item in items] + train_list = list(filter(None, train_list)) + + with open(val_file, 'r') as fin: + items = json.loads(fin.read()) + val_list = [line_to_map(item) for item in items] + val_list = list(filter(None, val_list)) + + with open(test_file, 'r') as fin: + items = json.loads(fin.read()) + test_list = [line_to_map(item, test_mode=True) for item in items] + test_list = list(filter(None, test_list)) + + splits = ((train_list, val_list, test_list), ) + return splits + def parse_household_splits(level): """Parse household dataset V2 into "train", "val" splits. diff --git a/tools/data/sthv2_pretrain/generate_rawframes_filelist.sh b/tools/data/sthv2_pretrain/generate_rawframes_filelist.sh new file mode 100644 index 0000000000..a6a3e68d21 --- /dev/null +++ b/tools/data/sthv2_pretrain/generate_rawframes_filelist.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2_pretrain data/sthv2/rawframes/ --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2_pretrain data/sthv2/rawframes/ --num-split 1 --level 1 --subset val --format rawframes --shuffle + + +echo "Filelist for rawframes generated." + +cd tools/data/household/ diff --git a/tools/data/sthv2_pretrain/generate_videos_filelist.sh b/tools/data/sthv2_pretrain/generate_videos_filelist.sh new file mode 100644 index 0000000000..80d3cbc802 --- /dev/null +++ b/tools/data/sthv2_pretrain/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2_pretrain data/sthv2/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2_pretrain data/sthv2/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." + +cd tools/data/household/