From 54829b96502a58409f71ecef7d4a6ab6554f13a4 Mon Sep 17 00:00:00 2001 From: yan-gao-GY Date: Thu, 30 Nov 2023 13:49:52 +0000 Subject: [PATCH] Change dataset_preparation.py to command lines --- baselines/fedvssl/README.md | 74 ++++++++---- .../fedvssl/dataset_convert_to_json.py | 33 ++++++ .../fedvssl/fedvssl/dataset_preparation.py | 108 ------------------ 3 files changed, 84 insertions(+), 131 deletions(-) create mode 100644 baselines/fedvssl/fedvssl/dataset_convert_to_json.py delete mode 100644 baselines/fedvssl/fedvssl/dataset_preparation.py diff --git a/baselines/fedvssl/README.md b/baselines/fedvssl/README.md index 76f770a972fe..e57d048327ab 100644 --- a/baselines/fedvssl/README.md +++ b/baselines/fedvssl/README.md @@ -64,52 +64,87 @@ Please make sure you have installed CUDA 11.0 on your machine To construct the Python environment follow these steps: ```bash -# install the base Poetry environment +# Install the base Poetry environment poetry install -# activate the environment +# Activate the environment poetry shell -# install mmcv package +# Install mmcv package pip install mmcv-full==1.7.1 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html ``` ## Running the Experiments -To run FedVSSL with UCF-101 baseline, please ensure you have activated your Poetry environment (execute `poetry shell` from this directory). Then, download the `CtP` repo, download the datasets and preprocess it: +To run FedVSSL with UCF-101 baseline, please ensure you have activated your Poetry environment (execute `poetry shell` from this directory). +Then, download the `CtP` repo and install required packages: ```bash -# clone CtP repo +# Clone CtP repo git clone https://github.com/yan-gao-GY/CtP.git fedvssl/CtP sudo apt install unrar unzip +``` + +### Dataset preparation +Let's first download UCF-101 dataset and related annotation files: +```bash cd fedvssl -python dataset_preparation.py +mkdir -p data/ucf101/ -cd .. 
+# Downloading +wget https://www.crcv.ucf.edu/data/UCF101/UCF101.rar -O data/ucf101/UCF101.rar --no-check-certificate + +# Unzipping +unrar e data/ucf101/UCF101.rar data/ucf101/UCF101_raw/ + +# Downloading the train/test split +wget https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip -O data/ucf101/UCF101TrainTestSplits-RecognitionTask.zip --no-check-certificate + +# Unzipping +unzip data/ucf101/UCF101TrainTestSplits-RecognitionTask.zip -d data/ucf101/ ``` +Then, we use the scripts to pre-process the dataset: -Finally, we can launch the training. +```bash +# Pre-processing the dataset +python CtP/scripts/process_ucf101.py --raw_dir data/ucf101/UCF101_raw/ --ann_dir data/ucf101/ucfTrainTestlist/ --out_dir data/ucf101/ -### Federated SSL pre-training + +# Convert to .json files +python dataset_convert_to_json.py + +# (optional) +rm data/ucf101/UCF101.rar +rm -r data/ucf101/UCF101_raw/ +``` + +Then, we perform data partitioning for FL: -To run using FedVSSL: ```bash -# run federated SSL training with FedVSSL +python data_partitioning_ucf.py --json_path data/ucf101/annotations --output_path data/ucf101/annotations/client_distribution/ --num_clients 5 + +cd .. +``` + + +### Federated SSL pre-training +Finally, we can launch the training. To run using FedVSSL: +```bash +# Run federated SSL training with FedVSSL python -m fedvssl.main pre_training=true # this will run using the default settings. -# you can override settings directly from the command line +# You can override settings directly from the command line python -m fedvssl.main pre_training=true mix_coeff=1 rounds=100 # will set hyper-parameter alpha to 1 and the number of rounds to 100 ``` To run using FedAvg: ```bash -# this will run FedAvg baseline +# This will run FedAvg baseline # This is done so to match the experimental setup in the paper python -m fedvssl.main pre_training=true fedavg=true -# this config can also be overriden. +# This config can also be overridden. 
``` ### Downstream fine-tuning @@ -150,10 +185,8 @@ python -m fedvssl.main pre_training=true # this will run using the default setti This will create a folder named fedvssl_results to save the global checkpoints and the local clients' training logs. To check the results, please direct to `fedvssl_results/clientN/*.log.json` files in default, and check the loss changes during training. -After pre-training one can use the following command to run the fine-tuning. -```bash -python -m fedvssl.main pre_training=false pretrained_model_path=.npz -``` +After pre-training one can use the provided commands to run the fine-tuning. + The fine-tuning lasts for 150 epochs. | Method | FL-Setup| Clients| Pretrain Dataset | Finetune Dataset| Top-1% Acc. | Top 5% Acc.| @@ -166,11 +199,6 @@ The fine-tuning lasts for 150 epochs. We provide the checkpoints of the pre-trained SSL models on Kinectics-400. With them as starting points, one can run downstream fine-tuning on UCF-101 to obtain the expected results in the paper. -```bash -python -m fedvssl.main pre_training=false pretrained_model_path=.npz - -# following the table below to change the checkpoints path. -``` | Method | Checkpoint file | UCF Top-1 | |---------|-----------------------------------------------------------------------------------------------------|-----------| diff --git a/baselines/fedvssl/fedvssl/dataset_convert_to_json.py b/baselines/fedvssl/fedvssl/dataset_convert_to_json.py new file mode 100644 index 000000000000..95c4ffbd8bf1 --- /dev/null +++ b/baselines/fedvssl/fedvssl/dataset_convert_to_json.py @@ -0,0 +1,33 @@ +""" +Dataset pre-processing: convert .txt files to .json files +""" +import json + + +# We use the .json files for the annotations. 
+# One can convert the train_split_1.txt to train_split_1.json +# by using the following code: + +ann_path = [ + "data/ucf101/annotations/train_split_1.txt", + "data/ucf101/annotations/test_split_1.txt", +] +out_path = [ + "data/ucf101/annotations/train_split_1.json", + "data/ucf101/annotations/test_split_1.json", +] + +assert len(ann_path) == len(out_path) + +for i in range(len(ann_path)): + with open(ann_path[i], "r") as f: + lines = f.read().splitlines() + anns = [] + for line in lines: + if line.strip() == "": + continue + name, label = line.split(" ") + anns.append({"name": name, "label": int(label)}) + with open(out_path[i], "w") as f: + json.dump(anns, f, indent=2) + \ No newline at end of file diff --git a/baselines/fedvssl/fedvssl/dataset_preparation.py b/baselines/fedvssl/fedvssl/dataset_preparation.py deleted file mode 100644 index 9cc137d38f5d..000000000000 --- a/baselines/fedvssl/fedvssl/dataset_preparation.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Handle the dataset partitioning and (optionally) complex downloads. - -Please add here all the necessary logic to either download, uncompress, pre/post-process -your dataset (or all of the above). If the desired way of running your baseline is to -first download the dataset and partition it and then run the experiments, please -uncomment the lines below and tell us in the README.md (see the "Running the Experiment" -block) that this file should be executed first. -""" - -# make sure you have installed unrar package. -# One can install it using `sudo apt install unrar`. 
- -import json -import subprocess - -# Data downloading and preprocessing -# ---------------------------------- - -# first download the raw videos from the official website - -subprocess.run(["mkdir -p data/ucf101/"], shell=True) -subprocess.run( - [ - "wget https://www.crcv.ucf.edu/data/UCF101/UCF101.rar \ - -O data/ucf101/UCF101.rar \ - --no-check-certificate" - ], - shell=True, -) - -print("---Unzipping the compressed file---") -subprocess.run(["unrar e data/ucf101/UCF101.rar data/ucf101/UCF101_raw/"], shell=True) - -print("---Downloading the train/test split---") -subprocess.run( - [ - "wget \ - https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip \ - -O data/ucf101/UCF101TrainTestSplits-RecognitionTask.zip --no-check-certificate" - ], - shell=True, -) - -subprocess.run( - ["unzip data/ucf101/UCF101TrainTestSplits-RecognitionTask.zip -d data/ucf101/"], - shell=True, -) - -print("--Pre-processing the dataset script---") -subprocess.run( - [ - "python CtP/scripts/process_ucf101.py --raw_dir data/ucf101/UCF101_raw/ \ ---ann_dir data/ucf101/ucfTrainTestlist/ --out_dir data/ucf101/" - ], - shell=True, -) - - -# We use the .json files for the annotations. 
-# One can convert the train_split_1.txt to train_split_1.json -# by using the following code: - -ann_path = [ - "data/ucf101/annotations/train_split_1.txt", - "data/ucf101/annotations/test_split_1.txt", -] -out_path = [ - "data/ucf101/annotations/train_split_1.json", - "data/ucf101/annotations/test_split_1.json", -] - -assert len(ann_path) == len(out_path) - -for i in range(len(ann_path)): - with open(ann_path[i], "r") as f: - lines = f.read().splitlines() - anns = [] - for line in lines: - if line.strip() == "": - continue - name, label = line.split(" ") - anns.append({"name": name, "label": int(label)}) # +1)) - with open(out_path[i], "w") as f: - json.dump(anns, f, indent=2) - - -# optional -# ---------- -# rm data/ucf101/UCF101.rar -# rm -r data/ucf101/UCF101_raw/ - - -# Data partitioning for federated learning -# --------------------------------------- -# We provide `data_partitioning_ucf.py` -# to generate non-iid data from UCF-101 dataset. -# The above scripts will generate the client_x.json file, -# where "x" denotes the client number. -# To perform partitioning on UCF-101: - -subprocess.run( - [ - "python data_partitioning_ucf.py --json_path data/ucf101/annotations \ ---output_path data/ucf101/annotations/client_distribution/ \ ---num_clients 5" - ], - shell=True, -)