abstract_open-ended.json
[{"team_members": "Damien Teney (University of Adelaide), Lingqiao Liu (University of Adelaide), Anton van den Hengel (University of Adelaide)", "standard": {"overall": 70.42, "perAnswerType": {"other": 56.28, "number": 76.47, "yes/no": 81.26}}, "team_name_order": 1, "submissionRound": 5, "team_name": "ACVT_Adelaide", "ref": "http://arxiv.org/abs/1609.05600", "method": "VQA with graph representations of scene and question, using language pre-parsing and pretrained word embeddings."}, {"team_members": "None", "standard": {"overall": 69.73, "perAnswerType": {"other": 62.08, "number": 58.82, "yes/no": 80.7}}, "team_name_order": 2, "submissionRound": 1, "team_name": "MIL", "ref": "", "method": "constract DualNet and ensemble 5 model"}, {"team_members": "None", "standard": {"overall": 53.49, "perAnswerType": {"other": 34.08, "number": 44.35, "yes/no": 74.67}}, "team_name_order": 3, "submissionRound": 2, "team_name": "PicSOM", "ref": "", "method": "Attempt 1"}, {"team_members": "Jin-Hwa Kim (Seoul National University), Sang-Woo Lee (Seoul National University), Dong-Hyun Kwak (Seoul National University), Min-Oh Heo (Seoul National University), Jeonghee Kim (Naver Labs, Naver Corp.), Jung-Woo Ha (Naver Labs, Naver Corp.), Byoung-Tak Zhang (Seoul National University)", "standard": {"overall": 62.56, "perAnswerType": {"other": 48.94, "number": 51.57, "yes/no": 79.06}}, "team_name_order": 4, "submissionRound": 2, "team_name": "snubi-naverlabs", "ref": "http://goo.gl/ZYQHR0", "method": "A single multimodal residual networks three-block layered without data augmentation. GRUs initialized with Skip-Thought Vectors for question embedding and ResNet-152 for extracting visual feature vectors from abstract images are used. Joint representations are learned by element-wise multiplication, which leads to implicit attentional model without attentional parameters."}, {"team_members": "None", "standard": {"overall": 29.15, "perAnswerType": {"other": 1.67, "number": 0.22, "yes/no": 64.9}}, "team_name_order": 5, "submissionRound": 1, "team_name": "vt-all_yes", "ref": "", "method": ""yes" (prior) is picked as the predicted answer for all questions"}, {"team_members": "Peng Zhang (Virginia Tech), Yash Goyal (Virginia Tech), Douglas Summers-Stay (Army Research Laboratory), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "standard": {"overall": 35.25, "perAnswerType": {"other": 1.31, "number": 0.21, "yes/no": 79.14}}, "team_name_order": 6, "submissionRound": 2, "team_name": "vt_arl_binary", "ref": "http://arxiv.org/pdf/1511.05099v4.pdf", "method": "We first identify primary object and secondary object from questions, which tell us which regions should be paid attention on images. And we extract image features based on that. Then we verify the visual concepts by encoding the questions via LSTM, combing image features, and feeding into MLP."}, {"team_members": "Yash Goyal (Virginia Tech), Peng Zhang (Virginia Tech), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "standard": {"overall": 65.02, "perAnswerType": {"other": 56.41, "number": 52.54, "yes/no": 77.45}}, "team_name_order": 7, "submissionRound": 1, "team_name": "vt_qLSTM-globalImage", "ref": "http://arxiv.org/abs/1511.05099", "method": "This model uses holistic image features for abstract scenes such as objects occurrence, categories occurrence, instances (for large and small objects), expressions and poses (for humans), and LSTM embedding for questions. 
Question and image features are point-wise multiplied and passed though a 2-layer MLP to obtain softmax distribution over 270 answers."}, {"team_members": "Yash Goyal (Virginia Tech), Peng Zhang (Virginia Tech), Dhruv Batra (Virginia Tech), Devi Parikh (Virginia Tech)", "standard": {"overall": 57.19, "perAnswerType": {"other": 38.79, "number": 49.55, "yes/no": 76.88}}, "team_name_order": 8, "submissionRound": 1, "team_name": "vt_qLSTMalone", "ref": "http://arxiv.org/abs/1511.05099", "method": "This model extracts LSTM embedding for questions, passes them though a 2-layer MLP to obtain softmax distribution over most frequent 270 answers in the training dataset."}, {"date": "2018-07-28"}]
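For reference, here is a minimal Python sketch of how this leaderboard file could be loaded and ranked by overall accuracy. The file path and the script itself are assumptions for illustration, not part of the original data; the key names ("team_name", "standard", "perAnswerType") are taken from the records above.

```python
import json

# Load the leaderboard entries (path is an assumption; adjust to where the file lives).
with open("abstract_open-ended.json") as f:
    records = json.load(f)

# The last element is a metadata stub ({"date": ...}); keep only real submissions.
entries = [r for r in records if "team_name" in r]

# Rank teams by overall accuracy on the standard test set.
ranked = sorted(entries, key=lambda r: r["standard"]["overall"], reverse=True)
for rank, e in enumerate(ranked, start=1):
    per_type = e["standard"]["perAnswerType"]
    print(f"{rank}. {e['team_name']}: overall {e['standard']['overall']:.2f} "
          f"(yes/no {per_type['yes/no']:.2f}, number {per_type['number']:.2f}, "
          f"other {per_type['other']:.2f})")
```

Run against the array above, this would list ACVT_Adelaide first (70.42 overall), followed by MIL (69.73), and so on down to vt-all_yes (29.15).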