Add files via upload

TEAM-IKYO · May 23, 2021 · 872e738 · 872e738
1 parent d8eed9d
commit 872e738
Showing 1 changed file with 211 additions and 0 deletions.
diff --git a/TEAM-IKYO-BOOK/Question Type Dataset Maker.ipynb b/TEAM-IKYO-BOOK/Question Type Dataset Maker.ipynb
@@ -0,0 +1,211 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 5,
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.7"
+    },
+    "colab": {
+      "name": "Make_Dataset_Code.ipynb의 사본",
+      "provenance": [],
+      "collapsed_sections": []
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6df75117-5ef4-48c4-805e-23783e4ba22e"
+      },
+      "source": [
+        "# Processing the AI Hub dataset"
+      ],
+      "id": "6df75117-5ef4-48c4-805e-23783e4ba22e"
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40"
+      },
+      "source": [
+        "import json\n",
+        "\n",
+        "# Load the raw AI Hub SQuAD-style JSON file (path is relative to the working directory).\n",
+        "with open('outer_datas/ko_nia_normal_squad_all.json', encoding='utf-8') as json_file:\n",
+        "    normal_data = json.load(json_file)\n",
+        "\n",
+        "# Show a compact summary instead of echoing the whole corpus, which would\n",
+        "# flood the cell output and bloat the saved notebook.\n",
+        "{'top_level_keys': list(normal_data), 'num_documents': len(normal_data['data'])}"
+      ],
+      "id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "914ecf4a-895c-428b-93c0-f334bfca7386"
+      },
+      "source": [
+        "# Parse the JSON and flatten it into parallel lists (one entry per question).\n",
+        "from tqdm import tqdm\n",
+        "\n",
+        "# Integer label assigned to each value of the 'classtype' field.\n",
+        "question_type_map = {'work_how':0, 'work_what':1, 'work_when':2, 'work_where':3, 'work_who':4, 'work_why':5}\n",
+        "context_lst = []\n",
+        "question_lst = []\n",
+        "answers_lst = []\n",
+        "qa_id_lst = []\n",
+        "question_type_lst = []\n",
+        "\n",
+        "for datas in tqdm(normal_data['data']) :\n",
+        "    # Walk every paragraph (not only the first) so no QA pairs are silently dropped.\n",
+        "    for paragraph in datas['paragraphs'] :\n",
+        "        context = paragraph['context']\n",
+        "        for qas in paragraph['qas'] :\n",
+        "            # Read every field before appending so a missing key cannot leave\n",
+        "            # the parallel lists with different lengths.\n",
+        "            question = qas['question']\n",
+        "            # Only the first annotated answer of each question is kept.\n",
+        "            answer_start = qas['answers'][0]['answer_start']\n",
+        "            answer_text = qas['answers'][0]['text']\n",
+        "            qa_id = qas['id']\n",
+        "            # Raises KeyError if a classtype outside question_type_map appears.\n",
+        "            question_type = question_type_map[qas['classtype']]\n",
+        "\n",
+        "            context_lst.append(context)\n",
+        "            question_lst.append(question)\n",
+        "            answers_lst.append({'answer_start':[answer_start], 'text':[answer_text]})\n",
+        "            qa_id_lst.append(qa_id)\n",
+        "            question_type_lst.append(question_type)"
+      ],
+      "id": "914ecf4a-895c-428b-93c0-f334bfca7386",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9"
+      },
+      "source": [
+        "# Build a Hugging Face Dataset from the parsed column lists.\n",
+        "from datasets import Dataset, Features, Sequence, Value, DatasetDict\n",
+        "\n",
+        "ai_hub_columns = {\n",
+        "    'id': qa_id_lst,\n",
+        "    'context': context_lst,\n",
+        "    'answers': answers_lst,\n",
+        "    'question': question_lst,\n",
+        "    'question_type': question_type_lst,\n",
+        "}\n",
+        "ai_hub_dataset = Dataset.from_dict(ai_hub_columns)"
+      ],
+      "id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "44f8fd91-aa76-4cf8-b658-915c97a850f5"
+      },
+      "source": [
+        "# Define helper functions and related objects.\n",
+        "import re\n",
+        "import pickle\n",
+        "\n",
+        "def save_pickle(save_path, data_set):\n",
+        "    \"\"\"Serialize data_set to save_path using pickle.\n",
+        "\n",
+        "    Args:\n",
+        "        save_path: Destination file path; opened in binary write mode, so an\n",
+        "            existing file at that path is overwritten.\n",
+        "        data_set: Any picklable object.\n",
+        "\n",
+        "    Returns:\n",
+        "        None.\n",
+        "    \"\"\"\n",
+        "    # The context manager closes the file even if pickle.dump raises.\n",
+        "    with open(save_path, \"wb\") as file:\n",
+        "        pickle.dump(data_set, file)\n",
+        "    return None\n",
+        "\n",
+        "def preprocess(text):\n",
+        "    \"\"\"Normalize whitespace in text and delete characters outside an allow-list.\n",
+        "\n",
+        "    Substitutions are applied in order: newline to space, a literal backslash\n",
+        "    followed by 'n' to space, each whitespace run to one space, '#' to space,\n",
+        "    and finally every character not matched by the allow-list class is removed.\n",
+        "    \"\"\"\n",
+        "    # NOTE(review): in the last pattern, the unescaped '-' between the double quote\n",
+        "    # and the left curly quote forms a range (U+0022..U+201C), so many more\n",
+        "    # characters are kept than the listed symbols suggest. Left as-is so the\n",
+        "    # generated dataset does not change; confirm whether a literal '-' was intended.\n",
+        "    substitutions = (\n",
+        "        (r'\\n', ' '),\n",
+        "        (r\"\\\\n\", \" \"),\n",
+        "        (r\"\\s+\", \" \"),\n",
+        "        (r'#', ' '),\n",
+        "        (r\"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\\s\\.\\?!》《≪≫\\'<>〈〉:‘’%,『』「」＜＞・\\\"-“”∧]\", \"\"),\n",
+        "    )\n",
+        "    for pattern, replacement in substitutions:\n",
+        "        text = re.sub(pattern, replacement, text)\n",
+        "    return text\n",
+        "\n",
+        "def run_preprocess(data_dict):\n",
+        "    \"\"\"Clean one example's context and shift its first answer offset to match.\n",
+        "\n",
+        "    The context is split at the first answer's start index and each part is\n",
+        "    passed through preprocess separately; the answer start is then reduced by\n",
+        "    the number of characters the leading part lost. data_dict is modified in\n",
+        "    place and also returned.\n",
+        "    \"\"\"\n",
+        "    raw_context = data_dict[\"context\"]\n",
+        "    answers = data_dict[\"answers\"]\n",
+        "    answer_start = answers[\"answer_start\"][0]\n",
+        "\n",
+        "    head, tail = raw_context[:answer_start], raw_context[answer_start:]\n",
+        "    cleaned_head = preprocess(head)\n",
+        "    cleaned_tail = preprocess(tail)\n",
+        "\n",
+        "    # Characters removed before the answer move its position to the left.\n",
+        "    removed_before_answer = len(head) - len(cleaned_head)\n",
+        "    data_dict[\"context\"] = cleaned_head + cleaned_tail\n",
+        "    # NOTE(review): answers[\"text\"] is left untouched, so it may no longer match\n",
+        "    # the cleaned context if the answer span itself contained altered characters.\n",
+        "    answers[\"answer_start\"][0] = answer_start - removed_before_answer\n",
+        "    return data_dict\n",
+        "\n",
+        "# Explicit schema passed to Dataset.from_pandas when the splits are rebuilt.\n",
+        "new_f = Features(\n",
+        "    {\n",
+        "        'answers': Sequence(\n",
+        "            feature={\n",
+        "                'text': Value(dtype='string', id=None),\n",
+        "                'answer_start': Value(dtype='int32', id=None),\n",
+        "            },\n",
+        "            length=-1,\n",
+        "            id=None,\n",
+        "        ),\n",
+        "        'context': Value(dtype='string', id=None),\n",
+        "        'id': Value(dtype='string', id=None),\n",
+        "        'question': Value(dtype='string', id=None),\n",
+        "        'question_type': Value(dtype='int32', id=None),\n",
+        "    }\n",
+        ")"
+      ],
+      "id": "44f8fd91-aa76-4cf8-b658-915c97a850f5",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "5a5e5d85-1c52-4eb4-b023-619e8f075802"
+      },
+      "source": [
+        "# Build the index arrays that split the data into train and validation sets.\n",
+        "import numpy as np\n",
+        "\n",
+        "SPLIT_SEED = 42     # fixed seed so the same split is produced on every run\n",
+        "VALID_SIZE = 1000   # number of examples held out for validation\n",
+        "\n",
+        "split_rng = np.random.default_rng(SPLIT_SEED)\n",
+        "# Sampling without replacement raises ValueError if the dataset has fewer than VALID_SIZE rows.\n",
+        "valid_indeces = split_rng.choice(len(ai_hub_dataset), VALID_SIZE, replace=False)\n",
+        "# Every index not sampled for validation goes to train; setdiff1d returns them sorted.\n",
+        "train_indeces = np.setdiff1d(np.arange(len(ai_hub_dataset)), valid_indeces)"
+      ],
+      "id": "5a5e5d85-1c52-4eb4-b023-619e8f075802",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "bf7454df-40d8-49a1-899e-0753d609c77d"
+      },
+      "source": [
+        "# Preprocess each split and assemble the final DatasetDict.\n",
+        "import pandas as pd\n",
+        "\n",
+        "train_records = [run_preprocess(row) for row in ai_hub_dataset.select(train_indeces)]\n",
+        "valid_records = [run_preprocess(row) for row in ai_hub_dataset.select(valid_indeces)]\n",
+        "\n",
+        "ai_hub_train_datset = pd.DataFrame(train_records)\n",
+        "ai_hub_valid_datset = pd.DataFrame(valid_records)\n",
+        "# NOTE: this rebinds ai_hub_dataset from a Dataset to a DatasetDict, so this cell\n",
+        "# cannot be re-run without first re-creating the original Dataset.\n",
+        "ai_hub_dataset = DatasetDict({\n",
+        "    'train': Dataset.from_pandas(ai_hub_train_datset, features=new_f),\n",
+        "    'validation': Dataset.from_pandas(ai_hub_valid_datset, features=new_f),\n",
+        "})"
+      ],
+      "id": "bf7454df-40d8-49a1-899e-0753d609c77d",
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598"
+      },
+      "source": [
+        "# Save the DatasetDict to disk. NOTE: the output path is environment-specific and must be adjusted.\n",
+        "save_pickle(save_path=\"/opt/ml/outer_datas/ai_hub_dataset.pkl\", data_set=ai_hub_dataset)"
+      ],
+      "id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598",
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}