Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
ohsuz authored May 23, 2021
1 parent d8eed9d commit 872e738
Showing 1 changed file with 211 additions and 0 deletions.
211 changes: 211 additions & 0 deletions TEAM-IKYO-BOOK/Question Type Dataset Maker.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"colab": {
"name": "Make_Dataset_Code.ipynb의 사본",
"provenance": [],
"collapsed_sections": []
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "6df75117-5ef4-48c4-805e-23783e4ba22e"
},
"source": [
"# Processing AI hub dataset"
],
"id": "6df75117-5ef4-48c4-805e-23783e4ba22e"
},
{
"cell_type": "code",
"metadata": {
"id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40"
},
"source": [
"import json\n",
"\n",
"# Load the AI Hub corpus file into memory as parsed JSON.\n",
"with open('outer_datas/ko_nia_normal_squad_all.json', encoding='utf-8') as json_file:\n",
"    normal_data = json.load(json_file)\n",
"\n",
"# Display only the number of top-level entries: echoing the whole parsed corpus\n",
"# floods the cell output and bloats the saved notebook.\n",
"len(normal_data['data'])"
],
"id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "914ecf4a-895c-428b-93c0-f334bfca7386"
},
"source": [
"# Parse the loaded JSON and flatten it into parallel, per-question lists.\n",
"from tqdm import tqdm\n",
"\n",
"# Integer label for each value of a question's 'classtype' field.\n",
"question_type_map = {'work_how':0, 'work_what':1, 'work_when':2, 'work_where':3, 'work_who':4, 'work_why':5}\n",
"context_lst = []\n",
"question_lst = []\n",
"answers_lst = []\n",
"qa_id_lst = []\n",
"question_type_lst = []\n",
"\n",
"for datas in tqdm(normal_data['data']):\n",
"    # Visit every paragraph of the entry: reading only paragraphs[0] silently\n",
"    # drops the questions attached to any additional paragraph.\n",
"    for paragraph in datas['paragraphs']:\n",
"        context = paragraph['context']\n",
"        for qas in paragraph['qas']:\n",
"            # Resolve every field before appending, so a malformed entry raises\n",
"            # without leaving the five lists at different lengths.\n",
"            question = qas['question']\n",
"            first_answer = qas['answers'][0]  # only the first listed answer is kept\n",
"            answer_start = first_answer['answer_start']\n",
"            answer_text = first_answer['text']\n",
"            qa_id = qas['id']\n",
"            # KeyError here means a classtype that question_type_map does not list.\n",
"            question_type = question_type_map[qas['classtype']]\n",
"\n",
"            context_lst.append(context)\n",
"            question_lst.append(question)\n",
"            # Single-element lists: one start offset and one text per question.\n",
"            answers_lst.append({'answer_start':[answer_start], 'text':[answer_text]})\n",
"            qa_id_lst.append(qa_id)\n",
"            question_type_lst.append(question_type)"
],
"id": "914ecf4a-895c-428b-93c0-f334bfca7386",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9"
},
"source": [
"# Assemble the parallel lists into a single datasets.Dataset, one column per list.\n",
"from datasets import Dataset, Features, Sequence, Value, DatasetDict\n",
"\n",
"ai_hub_columns = {\n",
"    'id': qa_id_lst,\n",
"    'context': context_lst,\n",
"    'answers': answers_lst,\n",
"    'question': question_lst,\n",
"    'question_type': question_type_lst,\n",
"}\n",
"ai_hub_dataset = Dataset.from_dict(ai_hub_columns)"
],
"id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "44f8fd91-aa76-4cf8-b658-915c97a850f5"
},
"source": [
"# Define the helper functions and the dataset schema.\n",
"import re\n",
"import pickle\n",
"\n",
"def save_pickle(save_path, data_set):\n",
"    \"\"\"Serialize an object to a file with pickle.\n",
"\n",
"    Args:\n",
"        save_path: Destination path, opened in binary write mode.\n",
"        data_set: Object to pickle.\n",
"\n",
"    Returns:\n",
"        None.\n",
"    \"\"\"\n",
"    # The with-block closes the handle even if pickle.dump raises.\n",
"    with open(save_path, \"wb\") as file:\n",
"        pickle.dump(data_set, file)\n",
"    return None\n",
"\n",
"def preprocess(text):\n",
"    \"\"\"Clean a string: drop disallowed characters and squeeze whitespace.\n",
"\n",
"    Newlines, literal backslash + n sequences and '#' are turned into spaces,\n",
"    every character outside the allow-list is deleted, and runs of whitespace\n",
"    are collapsed to a single space as the final step.\n",
"\n",
"    Args:\n",
"        text: Input string.\n",
"\n",
"    Returns:\n",
"        The cleaned string.\n",
"    \"\"\"\n",
"    text = re.sub(r'\\n', ' ', text)\n",
"    # Two-character sequences (a backslash followed by n) left in the raw text.\n",
"    text = re.sub(r\"\\\\n\", \" \", text)\n",
"    text = re.sub(r'#', ' ', text)\n",
"    # The hyphen is escaped so it is a literal character. Unescaped, it started a\n",
"    # range from U+0022 to U+201C, which kept nearly every ASCII symbol and all\n",
"    # Latin letters instead of filtering them out.\n",
"    text = re.sub(r\"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\\s\\.\\?!》《≪≫\\'<>〈〉:‘’%,『』「」<>・\\\"\\-“”∧]\", \"\", text)\n",
"    # Collapse whitespace last so the substitutions above cannot leave double spaces.\n",
"    text = re.sub(r\"\\s+\", \" \", text)\n",
"    return text\n",
"\n",
"def run_preprocess(data_dict):\n",
"    \"\"\"Clean one example's context and realign its first answer to the cleaned text.\n",
"\n",
"    The context is split at the answer start and each side is cleaned separately,\n",
"    so the number of characters removed ahead of the answer is the shift to apply.\n",
"\n",
"    Args:\n",
"        data_dict: Example with a \"context\" string and an \"answers\" dict holding\n",
"            \"answer_start\" and \"text\" lists; only index 0 of each list is used.\n",
"\n",
"    Returns:\n",
"        The same dict, modified in place.\n",
"    \"\"\"\n",
"    context = data_dict[\"context\"]\n",
"    answers = data_dict[\"answers\"]\n",
"    start_ids = answers[\"answer_start\"][0]\n",
"\n",
"    before = context[:start_ids]\n",
"    process_before = preprocess(before)\n",
"    process_after = preprocess(context[start_ids:])\n",
"\n",
"    # Characters dropped ahead of the answer move its start left by the same amount.\n",
"    ids_move = len(before) - len(process_before)\n",
"    data_dict[\"context\"] = process_before + process_after\n",
"    answers[\"answer_start\"][0] = start_ids - ids_move\n",
"\n",
"    # Cleaning can also alter characters inside the answer span. Refresh the stored\n",
"    # text only when its cleaned form is exactly what now sits at the new start, so\n",
"    # the text and the context slice at answer_start stay in agreement.\n",
"    process_answer = preprocess(answers[\"text\"][0])\n",
"    if process_answer and process_after.startswith(process_answer):\n",
"        answers[\"text\"][0] = process_answer\n",
"    return data_dict\n",
"\n",
"# Explicit column schema, passed as features= when the frames become Datasets again.\n",
"new_f = Features({\n",
"    'answers': Sequence(\n",
"        feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')},\n",
"        length=-1,\n",
"    ),\n",
"    'context': Value(dtype='string'),\n",
"    'id': Value(dtype='string'),\n",
"    'question': Value(dtype='string'),\n",
"    'question_type': Value(dtype='int32'),\n",
"})"
],
"id": "44f8fd91-aa76-4cf8-b658-915c97a850f5",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5a5e5d85-1c52-4eb4-b023-619e8f075802"
},
"source": [
"# Build reproducible row indices for the train / validation split.\n",
"import numpy as np\n",
"\n",
"SPLIT_SEED = 42    # fixed seed so the same rows are held out on every run\n",
"VALID_SIZE = 1000  # number of examples held out for validation\n",
"\n",
"num_examples = len(ai_hub_dataset)\n",
"split_rng = np.random.default_rng(SPLIT_SEED)\n",
"# Cap the request: sampling without replacement raises when it exceeds the population.\n",
"valid_indeces = split_rng.choice(num_examples, min(VALID_SIZE, num_examples), replace=False)\n",
"# Every remaining row goes to training, in ascending order; the explicit dtype keeps\n",
"# the array integer-typed even when it ends up empty.\n",
"train_indeces = np.array(sorted(set(range(num_examples)) - set(valid_indeces.tolist())), dtype=np.int64)"
],
"id": "5a5e5d85-1c52-4eb4-b023-619e8f075802",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bf7454df-40d8-49a1-899e-0753d609c77d"
},
"source": [
"# Clean both splits and bundle them into the final DatasetDict.\n",
"import pandas as pd\n",
"\n",
"ai_hub_train_datset = pd.DataFrame([run_preprocess(row) for row in ai_hub_dataset.select(train_indeces)])\n",
"ai_hub_valid_datset = pd.DataFrame([run_preprocess(row) for row in ai_hub_dataset.select(valid_indeces)])\n",
"\n",
"# NOTE(review): this rebinds ai_hub_dataset from the flat Dataset to a DatasetDict, so\n",
"# re-running this cell afterwards would operate on the DatasetDict, not the original rows.\n",
"ai_hub_dataset = DatasetDict({\n",
"    'train': Dataset.from_pandas(ai_hub_train_datset, features=new_f),\n",
"    'validation': Dataset.from_pandas(ai_hub_valid_datset, features=new_f),\n",
"})"
],
"id": "bf7454df-40d8-49a1-899e-0753d609c77d",
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598"
},
"source": [
"# Save the DatasetDict as a pickle. The output path needs to be adjusted per environment.\n",
"output_path = \"/opt/ml/outer_datas/ai_hub_dataset.pkl\"\n",
"save_pickle(output_path, ai_hub_dataset)"
],
"id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598",
"execution_count": null,
"outputs": []
}
]
}

0 comments on commit 872e738

Please sign in to comment.