From 79b5f89d3e8ec7faf7d23c7fa0593a88898e818c Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Tue, 14 Jan 2020 14:05:55 -0600
Subject: [PATCH] drop the need for cocobu_fc; use zero-size tensors when use_fc or use_att is False.

---
 configs/a2i2.yml           |  1 -
 configs/topdown.yml        |  1 -
 configs/transformer.yml    |  1 -
 data/README.md             |  5 +----
 dataloader.py              | 10 +++++++---
 models/CaptionModel.py     |  2 +-
 models/TransformerModel.py |  2 +-
 7 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/configs/a2i2.yml b/configs/a2i2.yml
index 808582b4..9ea0eb32 100644
--- a/configs/a2i2.yml
+++ b/configs/a2i2.yml
@@ -1,7 +1,6 @@
 # base
 caption_model: att2in2
 input_json: data/cocotalk.json
-input_fc_dir: data/cocobu_fc
 input_att_dir: data/cocobu_att
 input_label_h5: data/cocotalk_label.h5
 learning_rate: 0.0005
diff --git a/configs/topdown.yml b/configs/topdown.yml
index 324892e8..3babc24e 100644
--- a/configs/topdown.yml
+++ b/configs/topdown.yml
@@ -1,7 +1,6 @@
 # base
 caption_model: topdown
 input_json: data/cocotalk.json
-input_fc_dir: data/cocobu_fc
 input_att_dir: data/cocobu_att
 input_label_h5: data/cocotalk_label.h5
 learning_rate: 0.0005
diff --git a/configs/transformer.yml b/configs/transformer.yml
index a08ef544..b7f90607 100644
--- a/configs/transformer.yml
+++ b/configs/transformer.yml
@@ -4,7 +4,6 @@ noamopt_warmup: 20000
 label_smoothing: 0.0
 input_json: data/cocotalk.json
 input_label_h5: data/cocotalk_label.h5
-input_fc_dir: data/cocobu_fc
 input_att_dir: data/cocobu_att
 seq_per_img: 5
 batch_size: 10
diff --git a/data/README.md b/data/README.md
index 78d972f6..0dc7b2c5 100644
--- a/data/README.md
+++ b/data/README.md
@@ -57,15 +57,12 @@ Then:
 python script/make_bu_data.py --output_dir data/cocobu
 ```
 
-This will create `data/cocobu_fc`, `data/cocobu_att` and `data/cocobu_box`. If you want to use bottom-up feature, you can just follow the following steps and replace all cocotalk with cocobu.
+This will create `data/cocobu_fc` (no longer necessary), `data/cocobu_att` and `data/cocobu_box`. If you want to use bottom-up features, you can just replace all `"cocotalk"` with `"cocobu"` in the training/test scripts.
 
 #### Download converted files
 
-bottomup-fc: [link](https://drive.google.com/file/d/1IpjCJ5LYC4kX2krxHcPgxAIipgA8uqTU/view?usp=sharing) (The fc features here are simply the average of the attention features)
-
 bottomup-att: [link](https://drive.google.com/file/d/1hun0tsel34aXO4CYyTRIvHJkcbZHwjrD/view?usp=sharing)
 
-
 ## Flickr30k. It's similar.
 
 
diff --git a/dataloader.py b/dataloader.py
index 95679a3b..9ae88be8 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -255,11 +255,15 @@ def __getitem__(self, index):
                 # sort the features by the size of boxes
                 att_feat = np.stack(sorted(att_feat, key=lambda x:x[-1], reverse=True))
         else:
-            att_feat = np.zeros((1,1,1), dtype='float32')
+            att_feat = np.zeros((0,0), dtype='float32')
         if self.use_fc:
-            fc_feat = self.fc_loader.get(str(self.info['images'][ix]['id']))
+            try:
+                fc_feat = self.fc_loader.get(str(self.info['images'][ix]['id']))
+            except Exception:
+                # Use the average of the attention features when no fc feature is provided (for bottom-up features)
+                fc_feat = att_feat.mean(0)
         else:
-            fc_feat = np.zeros((1), dtype='float32')
+            fc_feat = np.zeros((0), dtype='float32')
         if hasattr(self, 'h5_label_file'):
             seq = self.get_captions(ix, self.seq_per_img)
         else:
diff --git a/models/CaptionModel.py b/models/CaptionModel.py
index bb6c4198..c12c7333 100644
--- a/models/CaptionModel.py
+++ b/models/CaptionModel.py
@@ -235,7 +235,7 @@ def repeat_tensor(n, x):
         if x is not None:
             x = x.unsqueeze(1) # Bx1x...
             x = x.expand(-1, n, *([-1]*len(x.shape[2:]))) # Bxnx...
-            x = x.reshape(-1, *x.shape[2:]) # Bnx...
+            x = x.reshape(x.shape[0]*n, *x.shape[2:]) # Bnx...
         return x
 
     @staticmethod
diff --git a/models/TransformerModel.py b/models/TransformerModel.py
index 766c2ecc..8ecb332a 100644
--- a/models/TransformerModel.py
+++ b/models/TransformerModel.py
@@ -296,7 +296,7 @@ def _prepare_feature(self, fc_feats, att_feats, att_masks):
         att_feats, seq, att_masks, seq_mask = self._prepare_feature_forward(att_feats, att_masks)
         memory = self.model.encode(att_feats, att_masks)
 
-        return fc_feats[...,:1], att_feats[...,:1], memory, att_masks
+        return fc_feats[...,:0], att_feats[...,:0], memory, att_masks
 
     def _prepare_feature_forward(self, att_feats, att_masks=None, seq=None):
         att_feats, att_masks = self.clip_att(att_feats, att_masks)
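
Not part of the patch: a minimal sketch of why `repeat_tensor` in models/CaptionModel.py switches from `reshape(-1, ...)` to `reshape(x.shape[0]*n, ...)`. Once `use_fc` or `use_att` being False yields zero-size feature tensors, `-1` can no longer be inferred (for a 0-element tensor the inferred dimension is ambiguous), while the explicit product works for both empty and non-empty tensors. The helper below mirrors the post-patch code; the example shapes are made up.

```
import torch

def repeat_tensor(n, x):
    # Mirrors models/CaptionModel.py after this patch.
    if x is not None:
        x = x.unsqueeze(1)                             # Bx1x...
        x = x.expand(-1, n, *([-1]*len(x.shape[2:])))  # Bxnx...
        x = x.reshape(x.shape[0]*n, *x.shape[2:])      # Bnx...
    return x

fc_feats = torch.zeros(5, 0)   # zero-size fc features, i.e. use_fc is False
print(repeat_tensor(3, fc_feats).shape)   # torch.Size([15, 0])

# The old form raises here: a 0-element tensor cannot infer the -1 dimension.
# torch.zeros(5, 1, 0).expand(-1, 3, -1).reshape(-1, 0)  # RuntimeError
```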
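The dataloader fallback that makes the `cocobu_fc` files redundant can be checked the same way: per the README note this patch removes, the distributed fc features were simply the average of the attention features, which is exactly what the new `except` branch computes. A two-line sketch with made-up feature sizes:

```
import torch

att_feat = torch.rand(36, 2048)  # e.g. 36 bottom-up region features
fc_feat = att_feat.mean(0)       # shape (2048,): what data/cocobu_fc stored
```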