real_multiple-choice_dev-challenge.json
[{"submissionRound": 1, "challenge": {"overall": 61.95, "perAnswerType": {"other": 54.75, "number": 36.19, "yes/no": 77.03}}, "dev": {"overall": 61.68, "perAnswerType": {"other": 54.44, "number": 37.05, "yes/no": 76.68}}, "standard": {"overall": 61.97, "perAnswerType": {"other": 54.6, "number": 37.3, "yes/no": 76.86}}, "team_name_order": 1, "team_members": "Bolei Zhou (Facebook AI Research & MIT), Yuandong Tian (Facebook AI Research), Sainbayar Sukhbaatar (Facebook AI Research & NYU), Arthur Szlam (Facebook AI Research), Rob Fergus (Facebook AI Research).", "team_name": "Bolei", "ref": "http://arxiv.org/pdf/1512.02167.pdf", "method": "An improved version of bag of words plus standard deep features."},
{"submissionRound": 2, "challenge": {"overall": 65.17, "perAnswerType": {"other": 56.35, "number": 37.3, "yes/no": 82.67}}, "dev": {"overall": 64.81, "perAnswerType": {"other": 55.82, "number": 39.14, "yes/no": 82.13}}, "standard": {"overall": 65.07, "perAnswerType": {"other": 56.4, "number": 38.56, "yes/no": 81.95}}, "team_name_order": 2, "team_members": "Aaditya Prakash (Brandeis University)", "team_name": "Brandeis", "ref": "http://iamaaditya.github.io/research/vqa/", "method": "We propose a variant of the highway network designed for multi-modal learning tasks such as VQA. We alter the signal of the 'carry' gate with a multiplicand learned from word embeddings of the question. A multi-layered highway MLP learns the memory required to associate the image features with word vectors and thus achieves implicit soft attention over learned parameters."},
{"submissionRound": 3, "challenge": {"overall": 68.54, "perAnswerType": {"other": 61.58, "number": 41.42, "yes/no": 83.68}}, "dev": {"overall": 68.52, "perAnswerType": {"other": 61.34, "number": 42.84, "yes/no": 83.71}}, "standard": {"overall": 68.3, "perAnswerType": {"other": 61.18, "number": 42.07, "yes/no": 83.28}}, "team_name_order": 3, "team_members": "Gyu-tae Park* (Samsung Electronics, AI Lab), YoungChul Sohn* (Samsung Electronics, AI Lab), Kibeom Lee* (Samsung Electronics, AI Lab), Jong-Ryul Lee* (Samsung Electronics, AI Lab)", "team_name": "DLAIT", "ref": "", "method": "Multimodal attention networks with pretrained word embedding vectors and question-specific answer prediction mechanisms."},
{"submissionRound": 2, "challenge": {"overall": 64.16, "perAnswerType": {"other": 55.28, "number": 37.25, "yes/no": 81.49}}, "dev": {"overall": 64.01, "perAnswerType": {"other": 54.72, "number": 39.0, "yes/no": 81.5}}, "standard": {"overall": 64.18, "perAnswerType": {"other": 55.2, "number": 38.3, "yes/no": 81.25}}, "team_name_order": 4, "team_members": "Ilija Ilievski (Graduate School for Integrative Sciences and Engineering, National University of Singapore), Shuicheng Yan (Department of Electrical & Computer Engineering, National University of Singapore), Jiashi Feng (Department of Electrical & Computer Engineering, National University of Singapore)", "team_name": "FDA-NUS", "ref": "https://arxiv.org/abs/1604.01485", "method": "We propose a novel Focused Dynamic Attention (FDA) model to provide image content representations that are better aligned with the posed questions. Being aware of the key words in the question, FDA employs an off-the-shelf object detector to identify important regions and fuses the information from these regions and global features via an LSTM unit."},
{"submissionRound": 5, "challenge": {"overall": 66.87, "perAnswerType": {"other": 59.68, "number": 38.54, "yes/no": 82.6}}, "dev": {"overall": 66.66, "perAnswerType": {"other": 59.54, "number": 39.8, "yes/no": 82.1}}, "standard": {"overall": 66.72, "perAnswerType": {"other": 59.55, "number": 39.72, "yes/no": 81.95}}, "team_name_order": 5, "team_members": "Kuniaki Saito (University of Tokyo), Andrew Shin (University of Tokyo), Yoshitaka Ushiku (University of Tokyo), Tatsuya Harada (University of Tokyo)", "team_name": "MIL-UT", "ref": "", "method": "A Multimodal Dual-Network in which one network performs an addition of all input features to form a common embedding space, and the other performs multiplication. Inputs to each network consist of fc6 from VGG-19, and the uppermost fully-connected layer from Resnet-152 and Resnet-101. We implemented 19 such dual networks with varying dimensions and averaged their outputs."},
{"submissionRound": 1, "challenge": {"overall": 62.78, "perAnswerType": {"other": 52.72, "number": 36.94, "yes/no": 81.22}}, "dev": {"overall": 62.48, "perAnswerType": {"other": 52.16, "number": 38.94, "yes/no": 80.79}}, "standard": {"overall": 62.69, "perAnswerType": {"other": 52.79, "number": 38.79, "yes/no": 80.35}}, "team_name_order": 6, "team_members": "Mujtaba hasan (Indian Institute of Technology, Delhi)", "team_name": "Mujtaba hasan iitd", "ref": "", "method": "We use a finetuned VGG_19 for image representation and a novel combination of deep LSTMs and GRUs for text analysis, and train a fully connected layer on top for the final task. We use backpropagation for end-to-end training and testing. The weights of the proposed joint network are initialized with the pretrained CNN and GRU."},
{"submissionRound": 2, "challenge": {"overall": 69.37, "perAnswerType": {"other": 63.6, "number": 39.62, "yes/no": 83.82}}, "dev": {"overall": 69.4, "perAnswerType": {"other": 63.54, "number": 41.82, "yes/no": 83.54}}, "standard": {"overall": 69.26, "perAnswerType": {"other": 63.43, "number": 40.69, "yes/no": 83.33}}, "team_name_order": 7, "team_members": "Hyeonseob Nam (Naver Labs), Jeonghee Kim (Naver Labs)", "team_name": "Naver Labs", "ref": "", "method": "Dual Attention Networks (DANs) apply an attention mechanism on both image regions and question words through multiple stages. DANs focus on specific words that are relevant to the answers or the regions to attend to. A 152-layer Deep Residual Network is used to extract high-level image features."},
{"submissionRound": 1, "challenge": {"overall": 67.66, "perAnswerType": {"other": 61.65, "number": 39.84, "yes/no": 81.89}}, "dev": {"overall": 67.66, "perAnswerType": {"other": 61.46, "number": 41.14, "yes/no": 81.93}}, "standard": {"overall": 67.34, "perAnswerType": {"other": 61.01, "number": 39.95, "yes/no": 81.69}}, "team_name_order": 8, "team_members": "Hyeonwoo Noh (Department of Computer Science and Engineering, POSTECH, Korea), Bohyung Han (Department of Computer Science and Engineering, POSTECH, Korea)", "team_name": "POSTECH", "ref": "http://arxiv.org/abs/1606.03647", "method": "Training Recurrent Answering Units with Joint Loss Minimization for VQA. The model is trained only on the VQA dataset. ResNet101 features are used for the image representation, and a two-layer LSTM is employed to model questions."},
{"submissionRound": 5, "challenge": {"overall": 65.82, "perAnswerType": {"other": 57.63, "number": 38.27, "yes/no": 82.51}}, "dev": {"overall": 65.43, "perAnswerType": {"other": 57.12, "number": 38.69, "yes/no": 82.24}}, "standard": {"overall": 65.44, "perAnswerType": {"other": 57.18, "number": 37.85, "yes/no": 82.1}}, "team_name_order": 9, "team_members": "Ruiyu Li", "team_name": "SHB_1026", "ref": "", "method": "A deep reasoning network for VQA with question representation update."},
{"submissionRound": 6, "challenge": {"overall": 64.63, "perAnswerType": {"other": 57.78, "number": 37.95, "yes/no": 79.54}}, "dev": {"overall": 64.31, "perAnswerType": {"other": 57.22, "number": 39.06, "yes/no": 79.29}}, "standard": {"overall": 64.45, "perAnswerType": {"other": 57.59, "number": 38.62, "yes/no": 79.02}}, "team_name_order": 10, "team_members": "Byungju Kim (School of Electrical Engineering, KAIST, South Korea), and Junmo Kim (School of Electrical Engineering, KAIST, South Korea)", "team_name": "SIIT_KAIST", "ref": "", "method": "We used a Question Aware Network with Candidate Answer Recommendation, which recommends plausible answers using only the question. We also used bag-of-words to understand the question."},
{"submissionRound": 2, "temp_team_members": "{Fukui, Akira and Park, Dong Huk and Yang, Daylen and Rohrbach, Anna and Darrell, Trevor and Rohrbach, Marcus},", "challenge": {"overall": 70.52, "perAnswerType": {"other": 65.83, "number": 40.32, "yes/no": 83.83}}, "dev": {"overall": 70.24, "perAnswerType": {"other": 65.54, "number": 41.25, "yes/no": 83.41}}, "standard": {"overall": 70.1, "perAnswerType": {"other": 65.16, "number": 41.01, "yes/no": 83.26}}, "team_name_order": 11, "team_members": "Akira Fukui (UC Berkeley EECS, Sony Corp. Tokyo), Dong Huk Park (UC Berkeley EECS), Daylen Yang (UC Berkeley EECS), Anna Rohrbach (UC Berkeley EECS, Max Planck Institute for Informatics, Saarbrucken), Trevor Darrell (UC Berkeley EECS), Marcus Rohrbach (UC Berkeley EECS)", "team_name": "UC Berkeley & Sony", "ref": "https://arxiv.org/abs/1606.01847", "method": "We propose utilizing Multimodal Compact Bilinear pooling (MCB) to efficiently and expressively combine multimodal features. We present an architecture which uses MCB twice, once for predicting attention over spatial features and again to combine the attended representation with the question representation."},
{"submissionRound": 2, "challenge": {"overall": 63.52, "perAnswerType": {"other": 57.38, "number": 34.06, "yes/no": 78.32}}, "dev": {"overall": 63.3, "perAnswerType": {"other": 57.23, "number": 34.22, "yes/no": 78.09}}, "standard": {"overall": 63.53, "perAnswerType": {"other": 57.43, "number": 34.26, "yes/no": 78.08}}, "team_name_order": 12, "team_members": "Kevin J Shih (University of Illinois Urbana-Champaign), Saurabh Singh (University of Illinois Urbana-Champaign), Derek Hoiem (University of Illinois Urbana-Champaign)", "team_name": "UIUC", "ref": "http://vision.cs.illinois.edu/wtl/", "method": "We project image region features and fixed-length representations of question-answer pairs into a shared subspace where the inner product produces a relevance score. The relevance scores are then passed through a softmax to produce a distribution over all image regions, which is then used to take a weighted sum over all regions. The weighted sum is then used to produce a score for the question-answer pair, where our objective encourages higher scores for correct pairings."},
{"submissionRound": 2, "challenge": {"overall": 66.27, "perAnswerType": {"other": 60.22, "number": 39.22, "yes/no": 80.34}}, "dev": {"overall": 65.82, "perAnswerType": {"other": 59.78, "number": 40.03, "yes/no": 79.72}}, "standard": {"overall": 66.07, "perAnswerType": {"other": 59.95, "number": 39.5, "yes/no": 79.96}}, "team_name_order": 13, "team_members": "Jiasen Lu (Virginia Tech), Jianwei Yang (Virginia Tech), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "VTComputerVison", "ref": "https://arxiv.org/abs/1606.00061", "method": "We present a novel co-attention model for VQA that jointly reasons about image and question attention. Our model reasons about the question (and consequently the image, via the co-attention mechanism) in a hierarchical fashion using a novel 1-dimensional convolutional neural network (CNN) model."},
{"submissionRound": 3, "challenge": {"overall": 66.84, "perAnswerType": {"other": 59.77, "number": 40.26, "yes/no": 81.98}}, "dev": {"overall": 66.75, "perAnswerType": {"other": 59.85, "number": 40.47, "yes/no": 81.79}}, "standard": {"overall": 66.91, "perAnswerType": {"other": 60.03, "number": 40.7, "yes/no": 81.61}}, "team_name_order": 14, "team_members": "Kushal Kafle (Chester F. Carlson Center for Imaging Science, Rochester Institute of Technology) and Christopher Kanan (Chester F. Carlson Center for Imaging Science, Rochester Institute of Technology)", "team_name": "klab", "ref": "http://www.kushalkafle.com/kafle2016.pdf", "method": "Observing that the type of answer can be predicted from the question alone, we formulated a Bayesian framework to incorporate answer-type prediction into a VQA pipeline. The current result consists of an improved MLP model trained using data augmentation. The probabilities produced by this MLP model are then combined with a residual attention mechanism to get the predicted answers."},
{"submissionRound": 4, "challenge": {"overall": 67.76, "perAnswerType": {"other": 60.27, "number": 40.1, "yes/no": 83.67}}, "dev": {"overall": 67.51, "perAnswerType": {"other": 59.7, "number": 41.5, "yes/no": 83.54}}, "standard": {"overall": 67.54, "perAnswerType": {"other": 59.99, "number": 40.73, "yes/no": 83.18}}, "team_name_order": 15, "team_members": "Jin-Hwa Kim (Seoul National University), Sang-Woo Lee (Seoul National University), Dong-Hyun Kwak (Seoul National University), Min-Oh Heo (Seoul National University), Jeonghee Kim (Naver Labs, Naver Corp.), Jung-Woo Ha (Naver Labs, Naver Corp.), Byoung-Tak Zhang (Seoul National University)", "team_name": "snubi-naverlabs", "ref": "http://goo.gl/ZYQHR0", "method": "An ensemble of three-block-layered multimodal residual networks without data augmentation. GRUs initialized with Skip-Thought Vectors are used for question embedding, and ResNet-152 is used to extract visual feature vectors. Joint representations are learned by element-wise multiplication, which leads to an implicit attentional model without attentional parameters."},
{"submissionRound": 1, "temp_team_members": "{Jiasen Lu and Aishwarya Agrawal and Stanislaw Antol and Margaret Mitchell and C. Lawrence Zitnick and Dhruv Batra and Devi Parikh},", "challenge": {"overall": 29.88, "perAnswerType": {"other": 1.21, "number": 0.36, "yes/no": 70.97}}, "dev": {"overall": 29.66, "perAnswerType": {"other": 1.15, "number": 0.39, "yes/no": 70.81}}, "standard": {"overall": 29.72, "perAnswerType": {"other": 1.26, "number": 0.43, "yes/no": 70.53}}, "team_name_order": 16, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-all_yes", "ref": "", "method": "\"yes\" (prior) is picked as the predicted answer for all questions"},
{"submissionRound": 1, "temp_team_members": "{Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},", "challenge": {"overall": 63.18, "perAnswerType": {"other": 53.84, "number": 37.08, "yes/no": 80.85}}, "dev": {"overall": 62.7, "perAnswerType": {"other": 53.01, "number": 38.22, "yes/no": 80.52}}, "standard": {"overall": 63.09, "perAnswerType": {"other": 53.64, "number": 37.7, "yes/no": 80.59}}, "team_name_order": 17, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-deeperLSTM_NormlizeCNN", "ref": "", "method": "2-channel (image and question) model. Question channel (LSTM with 2 hidden layers) provides question representation and the image channel (activations from last hidden layer of VGGNet) provides image representation. The image features thus obtained are l2 normalized. Question and image features are pointwise multiplied and fed to a fully connected layer to obtain a softmax distribution over 1000 answers. The answer (from the provided multiple choices) having the highest activation is the predicted answer."},
{"submissionRound": 1, "temp_team_members": "{Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},", "challenge": {"overall": 57.56, "perAnswerType": {"other": 43.87, "number": 35.19, "yes/no": 79.32}}, "dev": {"overall": 57.18, "perAnswerType": {"other": 43.42, "number": 35.77, "yes/no": 78.95}}, "standard": {"overall": 57.57, "perAnswerType": {"other": 43.93, "number": 36.1, "yes/no": 79.02}}, "team_name_order": 18, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-lstm_cnn", "ref": "", "method": "2-channel (image and question) model. Question channel (LSTM with 1 hidden layer) provides question representation and the image channel (activations from last hidden layer of VGGNet) provides image representation. Question and image features are pointwise multiplied and fed to a fully connected layer to obtain a softmax distribution over 1000 answers. The answer (from the provided multiple choices) having the highest activation is the predicted answer."},
{"submissionRound": 1, "challenge": {"overall": 48.54, "perAnswerType": {"other": 33.46, "number": 26.06, "yes/no": 71.96}}, "dev": {"overall": 48.49, "perAnswerType": {"other": 33.56, "number": 26.0, "yes/no": 71.94}}, "standard": {"overall": 48.75, "perAnswerType": {"other": 34.09, "number": 25.81, "yes/no": 71.75}}, "team_name_order": 19, "team_members": "Aishwarya Agrawal (Virginia Tech), Jiasen Lu (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-nearest_neighbor", "ref": "", "method": "For every question in the VQA test-standard set, we find its k nearest neighbor questions in the training set using cosine similarity in Skip-Thought feature space. In this set of k questions and their associated images, we find the image which is most similar to the query image using cosine similarity in fc7 feature space. The most common ground truth answer of this most similar image and question pair is the predicted answer for the query image and question pair. We pick the multiple-choice answer which is most similar to this predicted answer."},
{"team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "challenge": {"overall": 39.39, "perAnswerType": {"other": 12.94, "number": 35.04, "yes/no": 71.38}}, "dev": {"overall": 39.45, "perAnswerType": {"other": 13.34, "number": 35.86, "yes/no": 71.02}}, "standard": {"overall": 39.38, "perAnswerType": {"other": 13.1, "number": 35.7, "yes/no": 71.15}}, "team_name_order": 20, "submissionRound": 6, "team_name": "vqateam-prior_per_qtype", "ref": "", "method": "We pick the most popular answer per question type from the training dataset and predict the multiple choice which is most similar to this most popular answer."},
{"submissionRound": 1, "temp_team_members": "{Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},", "challenge": {"overall": 55.27, "perAnswerType": {"other": 39.63, "number": 35.94, "yes/no": 78.52}}, "dev": {"overall": 54.76, "perAnswerType": {"other": 38.8, "number": 36.78, "yes/no": 78.22}}, "standard": {"overall": 55.01, "perAnswerType": {"other": 39.44, "number": 35.86, "yes/no": 78.12}}, "team_name_order": 21, "team_members": "Jiasen Lu (Virginia Tech), Aishwarya Agrawal (Virginia Tech), Stanislaw Antol (Virginia Tech), Margaret Mitchell (Microsoft Research), C. Lawrence Zitnick (Facebook AI Research), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "team_name": "vqateam-q_lstm_alone", "ref": "", "method": "1-channel model (question channel only). An LSTM is used to extract representations for questions, which are then fed to a fully connected layer to obtain a softmax distribution over 1000 answers. The answer (from the provided multiple choices) having the highest activation is the predicted answer."},
{"date": "2016-06-24"}]
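Each element of the array above follows the same schema: submission metadata (submissionRound, team_name_order, team_name, team_members, ref, method, and sometimes temp_team_members) plus accuracy breakdowns for the test-challenge, test-dev, and test-standard splits under the keys challenge, dev, and standard, each holding an overall score and perAnswerType scores for the other, number, and yes/no answer types; the trailing {"date": ...} element is only a timestamp. As a minimal sketch of how this file can be consumed (the local filename and the ranking criterion below are illustrative choices, not part of the leaderboard itself), the following Python snippet loads the array and prints the submissions ranked by overall test-challenge accuracy:

import json

# Sketch: assumes the JSON above is saved locally under its repository name.
with open("real_multiple-choice_dev-challenge.json") as f:
    entries = json.load(f)

# The trailing {"date": ...} element is a timestamp, not a submission.
submissions = [e for e in entries if "challenge" in e]

# Rank by overall accuracy on the test-challenge split, best first.
ranked = sorted(submissions, key=lambda e: e["challenge"]["overall"], reverse=True)
for rank, entry in enumerate(ranked, start=1):
    scores = entry["challenge"]["perAnswerType"]
    print(f"{rank:2d}. {entry['team_name']:<32} "
          f"overall={entry['challenge']['overall']:.2f} "
          f"yes/no={scores['yes/no']:.2f} "
          f"number={scores['number']:.2f} "
          f"other={scores['other']:.2f}")

Run against the data above, this listing would start with UC Berkeley & Sony (70.52 overall on test-challenge) and end with the all-"yes" baseline vqateam-all_yes (29.88).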