Add complete codes & presentation file
ohsuz committed May 23, 2021
1 parent c0bdeda commit 4122bc7
Showing 35 changed files with 5,090 additions and 0 deletions.
35 changes: 35 additions & 0 deletions README.md
@@ -0,0 +1,35 @@
# 📖 Stage 3 - MRC (Machine Reading Comprehension) ✏️
> Hosted by [2021 Boostcamp AI Tech](https://boostcamp.connect.or.kr/)
<p align="center">
<img width="250" src="https://user-images.githubusercontent.com/59340911/119260977-29b29480-bc10-11eb-8543-cf7ef73ddcd4.png">
</p>

| [권태양](https://github.com/sunnight9507) | [류재희](https://github.com/JaeheeRyu) | [박종헌](https://github.com/PJHgh) | [오수지](https://github.com/ohsuz) | [이현규](https://github.com/LeeHyeonKyu) | [정익효](https://github.com/dlrgy22) |
| :----------: | :--------: | :---------: | :---------: | :---------: | :---------: |
| ![image](https://user-images.githubusercontent.com/59340911/119260030-eeae6200-bc0b-11eb-92e3-23e69ba35984.png) | ![image](https://user-images.githubusercontent.com/59340911/119260176-8f9d1d00-bc0c-11eb-9a7b-32a33c1a1072.png)| ![50470448](https://user-images.githubusercontent.com/55614265/116680649-e2592f80-a9e6-11eb-8f9e-631c15313c5d.png) | ![image](https://user-images.githubusercontent.com/59340911/119260225-cffc9b00-bc0c-11eb-9fe4-9bf9efd0716f.png)| ![image](https://user-images.githubusercontent.com/59340911/119260159-84e28800-bc0c-11eb-8164-6810a92bff38.png)| ![image](https://user-images.githubusercontent.com/59340911/119260159-84e28800-bc0c-11eb-8164-6810a92bff38.png)|
| [Notion](https://www.notion.so/Sunny-1349e293c9f74de092dce9ee359bd77c) | [Notion](https://www.notion.so/AI-Tech-72ce6764e1974a91b2c25d633288e0e4) | [Notion](https://www.notion.so/Boostcamp-deef2c0783f24c0b8022ba30b5782986) | [Notion](https://www.ohsuz.dev/) | [Notion](https://www.notion.so/thinkwisely/Naver-Boost-Camp-AI-Tech-ba743126e68749d58bdbb7af0580c8ee) | |

## 🥇 Final Scores
||Public Leaderboard|Private Leaderboard|
| :----------: | :--------: | :--------: |
||![image](https://user-images.githubusercontent.com/59340911/119259895-4dbfa700-bc0b-11eb-9633-6eb4d4c633d7.png)|![image](https://user-images.githubusercontent.com/59340911/119259913-5d3ef000-bc0b-11eb-8c10-2f667e960d31.png)|
|Rank|1st|1st|
|EM|72.92|70.00|
|F1|81.52|79.78|

---

## 💻 Task Description
> More details can be found at [AI Stages](http://boostcamp.stages.ai/competitions/31/overview/description)

When you have a question like "What is the oldest tree in Korea?", you have probably typed it into a search engine, and these days search engines return surprisingly accurate answers. How is that possible?

Question Answering (QA) is a research field focused on building artificial intelligence models that answer a wide variety of questions. Among its subfields, Open-Domain Question Answering (ODQA) is particularly challenging because the system must first find the documents that can answer the question, using only a pre-built knowledge resource.

![image](https://user-images.githubusercontent.com/59340911/119260267-118d4600-bc0d-11eb-95bc-6ea68f7b0df4.png)

The model we build in this competition consists of two stages. The first stage, the "retriever", finds documents related to the question; the second stage, the "reader", reads the documents retrieved in the first stage and extracts the answer from them. If these two stages are connected properly, we can build a question answering system that can handle even very tough questions. The team whose model produces the most accurate answers wins this stage.
![image](https://user-images.githubusercontent.com/59340911/119260915-f1ab5180-bc0f-11eb-9ddc-cad4585bc8ce.png)
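
As a rough illustration of the retriever + reader flow described above (this is not the competition code; the toy lexical retriever and the `deepset/xlm-roberta-large-squad2` checkpoint below are stand-ins):

```python
# Toy sketch of the two-stage pipeline; not the actual competition code.
from transformers import pipeline

def retrieve(question, passages, top_k=5):
    # Stage 1 (retriever): rank candidate passages by naive word overlap with the
    # question. A real system would query a sparse (BM25/TF-IDF) or dense index.
    return sorted(passages, key=lambda p: len(set(question.split()) & set(p.split())), reverse=True)[:top_k]

def answer(question, passages, top_k=5):
    # Stage 2 (reader): run an extractive QA model over each retrieved passage
    # and keep the highest-scoring answer span.
    reader = pipeline("question-answering", model="deepset/xlm-roberta-large-squad2")
    candidates = [reader(question=question, context=p) for p in retrieve(question, passages, top_k)]
    return max(candidates, key=lambda c: c["score"])["answer"]
```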

---
Empty file added TEAM-IKYO/README.md
Empty file.
87 changes: 87 additions & 0 deletions TEAM-IKYO/code/arguments.py
@@ -0,0 +1,87 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
default="xlm-roberta-large",
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
use_custom_model: str = field(
default='QAConvModelV2',
metadata={"help": "Choose one of ['ConvModel', 'QueryAttentionModel', 'QAConvModelV1', 'QAConvModelV2']"}
)
use_pretrained_model: bool = field(
default=False,
metadata={"help": "use_pretrained_koquard_model"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
retrieval_type: Optional[str] = field(
default="elastic", metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
retrieval_elastic_index: Optional[str] = field(
default="wiki-index-split-800", metadata={"help": "Elastic search index name[wiki-index, wiki-index-split-400, wiki-index-split-800(best), wiki-index-split-1000]"}
)
retrieval_elastic_num: Optional[int] = field(
default=35,
metadata={"help": "The number of context or passage from Elastic search"},
)

@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
dataset_name: Optional[str] = field(
default="question_type", metadata={"help": "Choose one of ['basic', 'preprocessed', 'concat', 'korquad', 'only_korquad', 'quetion_type', 'ai_hub', 'random_masking', 'token_masking']"}
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": "Whether to pad all samples to `max_seq_length`. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
"be faster on GPU but will be slower on TPU)."
},
)
doc_stride: int = field(
default=128,
metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
)
max_answer_length: int = field(
default=30,
metadata={
"help": "The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another."
},
)
train_retrieval: bool = field(
default=True,
metadata={"help": "Whether to train sparse/dense embedding (prepare for retrieval)."},
)
eval_retrieval: bool = field(
default=True,
metadata={"help":"Whether to run passage retrieval using sparse/dense embedding )."},
)
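
These dataclasses follow the standard HuggingFace pattern, so a plausible way to consume them (a sketch only; the team's actual training script may wire things differently) is with `HfArgumentParser`:

```python
# Hypothetical usage of the argument dataclasses above.
from transformers import HfArgumentParser, TrainingArguments

from arguments import ModelArguments, DataTrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.model_name_or_path, data_args.max_seq_length, training_args.output_dir)
```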

126 changes: 126 additions & 0 deletions TEAM-IKYO/code/data_processing.py
@@ -0,0 +1,126 @@
from datasets import load_metric, load_from_disk
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer

class DataProcessor():
def __init__(self, tokenizer, max_length = 384, doc_stride = 128):
self.tokenizer = tokenizer
self.max_length = max_length
self.doc_stride = doc_stride

def prepare_train_features(self, examples):
tokenized_examples = self.tokenizer(
examples["question"],
examples["context"],
truncation="only_second",
max_length=self.max_length,
stride=self.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
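        # A long context is split into overlapping chunks (stride = doc_stride), so one
        # example can yield several features; the mappings popped below tie each feature
        # back to its source example and to character offsets within the context.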

sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
offset_mapping = tokenized_examples.pop("offset_mapping")

tokenized_examples["start_positions"] = []
tokenized_examples["end_positions"] = []
if 'question_type' in examples.keys() :
tokenized_examples['question_type'] = []

for i, offsets in enumerate(offset_mapping):
input_ids = tokenized_examples["input_ids"][i]
sequence_ids = tokenized_examples.sequence_ids(i)
cls_index = input_ids.index(self.tokenizer.cls_token_id)

sample_index = sample_mapping[i]
answers = examples["answers"][sample_index]

if 'question_type' in examples.keys() :
tokenized_examples['question_type'].append(examples['question_type'][sample_index])

if len(answers["answer_start"]) == 0:
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
else:
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

token_start_index = 0
while sequence_ids[token_start_index] != (1):
token_start_index += 1

token_end_index = len(input_ids) - 1
while sequence_ids[token_end_index] != (1):
token_end_index -= 1
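                # If the answer is not fully contained in this chunk, label it with the
                # [CLS] index; otherwise move the token indices onto the exact answer span.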

if not (
offsets[token_start_index][0] <= start_char
and offsets[token_end_index][1] >= end_char
):
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
else:
while (
token_start_index < len(offsets)
and offsets[token_start_index][0] <= start_char
):
token_start_index += 1
tokenized_examples["start_positions"].append(token_start_index - 1)
while offsets[token_end_index][1] >= end_char:
token_end_index -= 1
tokenized_examples["end_positions"].append(token_end_index + 1)

return tokenized_examples

def prepare_validation_features(self, examples):
tokenized_examples = self.tokenizer(
examples["question"],
examples["context"],
truncation="only_second",
max_length=self.max_length,
stride=self.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
tokenized_examples["example_id"] = []

if 'question_type' in examples.keys() :
tokenized_examples['question_type'] = []

for i in range(len(tokenized_examples['input_ids'])):
sequence_ids = tokenized_examples.sequence_ids(i)
context_index = 1
sample_index = sample_mapping[i]
tokenized_examples["example_id"].append(examples["id"][sample_index])

if 'question_type' in examples.keys() :
tokenized_examples['question_type'].append(examples['question_type'][sample_index])

tokenized_examples["offset_mapping"][i] = [
(o if sequence_ids[k] == context_index else None)
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
]

return tokenized_examples

def train_tokenizer(self, train_dataset, column_names):
train_dataset = train_dataset.map(
self.prepare_train_features,
batched=True,
num_proc=4,
remove_columns=column_names,
)

return train_dataset

def val_tokenzier(self, val_dataset, column_names):
val_dataset = val_dataset.map(
self.prepare_validation_features,
batched=True,
num_proc=4,
remove_columns=column_names,
)

return val_dataset
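
A minimal usage sketch of `DataProcessor` (the dataset path is a placeholder, not the team's actual path):

```python
# Hypothetical usage of DataProcessor; paths and checkpoint are illustrative.
from datasets import load_from_disk
from transformers import AutoTokenizer

from data_processing import DataProcessor

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
datasets = load_from_disk("/path/to/train_dataset")  # placeholder path

processor = DataProcessor(tokenizer, max_length=384, doc_stride=128)
column_names = datasets["train"].column_names
train_features = processor.train_tokenizer(datasets["train"], column_names)
val_features = processor.val_tokenzier(datasets["validation"], column_names)
```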
37 changes: 37 additions & 0 deletions TEAM-IKYO/code/elasticsearch_retrieval.py
@@ -0,0 +1,37 @@
import json
import os
import time

from elasticsearch import Elasticsearch
from datasets import load_from_disk
from torch.utils.data import DataLoader, TensorDataset
from subprocess import Popen, PIPE, STDOUT
from tqdm import tqdm

def elastic_setting(index_name='wiki-index'):
config = {'host':'localhost', 'port':9200}
es = Elasticsearch([config])

return es, index_name


def search_es(es_obj, index_name, question_text, n_results):
# search query
query = {
'query': {
'match': {
'document_text': question_text
}
}
}
    # n_results: how many top hits to return
res = es_obj.search(index=index_name, body=query, size=n_results)

return res


def elastic_retrieval(es, index_name, question_text, n_results):
res = search_es(es, index_name, question_text, n_results)
    # Collect only the matched contexts (with their scores) into a list.
context_list = list((hit['_source']['document_text'], hit['_score']) for hit in res['hits']['hits'])
return context_list
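
A minimal usage sketch, assuming a local Elasticsearch node on port 9200 and that the `wiki-index-split-800` index has already been built:

```python
# Hypothetical usage of the helpers above; the index must already exist.
from elasticsearch_retrieval import elastic_setting, elastic_retrieval

es, index_name = elastic_setting("wiki-index-split-800")
hits = elastic_retrieval(es, index_name, "What is the oldest tree in Korea?", n_results=5)
for passage, score in hits:
    print(f"{score:.2f}\t{passage[:80]}")
```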