-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
211 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 5, | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.7" | ||
}, | ||
"colab": { | ||
"name": "Make_Dataset_Code.ipynb의 사본", | ||
"provenance": [], | ||
"collapsed_sections": [] | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "6df75117-5ef4-48c4-805e-23783e4ba22e" | ||
}, | ||
"source": [ | ||
"# Processing AI hub dataset" | ||
], | ||
"id": "6df75117-5ef4-48c4-805e-23783e4ba22e" | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40" | ||
}, | ||
"source": [ | ||
"import json\n", | ||
"with open('outer_datas/ko_nia_normal_squad_all.json', encoding='utf-8') as json_file :\n", | ||
" normal_data = json.load(json_file)\n", | ||
"\n", | ||
"normal_data" | ||
], | ||
"id": "9e3dfb6f-3eb1-41b9-99d1-32ac72a1ba40", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "914ecf4a-895c-428b-93c0-f334bfca7386" | ||
}, | ||
"source": [ | ||
"# Json Parsing해서 List로 저장하기.\n", | ||
"from tqdm import tqdm\n", | ||
"\n", | ||
"question_type_map = {'work_how':0, 'work_what':1, 'work_when':2, 'work_where':3, 'work_who':4, 'work_why':5}\n", | ||
"context_lst = []\n", | ||
"question_lst = []\n", | ||
"answers_lst = []\n", | ||
"qa_id_lst = []\n", | ||
"question_type_lst = []\n", | ||
"\n", | ||
"for datas in tqdm(normal_data['data']) :\n", | ||
" context = datas['paragraphs'][0]['context']\n", | ||
" for qas in datas['paragraphs'][0]['qas'] :\n", | ||
" question = qas['question']\n", | ||
" answer_start = qas['answers'][0]['answer_start']\n", | ||
" answer_text = qas['answers'][0]['text']\n", | ||
" qa_id = qas['id']\n", | ||
" question_type = question_type_map[qas['classtype']]\n", | ||
"\n", | ||
" context_lst.append(context)\n", | ||
" question_lst.append(question)\n", | ||
" answers_lst.append({'answer_start':[answer_start], 'text':[answer_text]})\n", | ||
" qa_id_lst.append(qa_id)\n", | ||
" question_type_lst.append(question_type)" | ||
], | ||
"id": "914ecf4a-895c-428b-93c0-f334bfca7386", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9" | ||
}, | ||
"source": [ | ||
"# Dataset 형태로 만들기.\n", | ||
"from datasets import Dataset, Features, Sequence, Value, DatasetDict\n", | ||
"\n", | ||
"ai_hub_dataset = Dataset.from_dict({'id' : qa_id_lst,\n", | ||
" 'context' : context_lst,\n", | ||
" 'answers' : answers_lst,\n", | ||
" 'question' : question_lst,\n", | ||
" 'question_type' : question_type_lst\n", | ||
" })" | ||
], | ||
"id": "c3a03d9a-9063-4dbc-a576-1dbdeefd2de9", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "44f8fd91-aa76-4cf8-b658-915c97a850f5" | ||
}, | ||
"source": [ | ||
"# 각종 함수 등 정의.\n", | ||
"import re\n", | ||
"import pickle\n", | ||
"\n", | ||
"def save_pickle(save_path, data_set):\n", | ||
" file = open(save_path, \"wb\")\n", | ||
" pickle.dump(data_set, file)\n", | ||
" file.close()\n", | ||
" return None\n", | ||
"\n", | ||
"def preprocess(text):\n", | ||
" text = re.sub(r'\\n', ' ', text)\n", | ||
" text = re.sub(r\"\\\\n\", \" \", text)\n", | ||
" text = re.sub(r\"\\s+\", \" \", text)\n", | ||
" text = re.sub(r'#', ' ', text)\n", | ||
" text = re.sub(r\"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\\s\\.\\?!》《≪≫\\'<>〈〉:‘’%,『』「」<>・\\\"-“”∧]\", \"\", text)\n", | ||
" return text\n", | ||
"\n", | ||
"def run_preprocess(data_dict):\n", | ||
" context = data_dict[\"context\"]\n", | ||
" start_ids = data_dict[\"answers\"][\"answer_start\"][0]\n", | ||
" before = data_dict[\"context\"][:start_ids]\n", | ||
" after = data_dict[\"context\"][start_ids:]\n", | ||
" process_before = preprocess(before)\n", | ||
" process_after = preprocess(after)\n", | ||
" process_data = process_before + process_after\n", | ||
" ids_move = len(before) - len(process_before)\n", | ||
" data_dict[\"context\"] = process_data\n", | ||
" data_dict[\"answers\"][\"answer_start\"][0] = start_ids - ids_move\n", | ||
" return data_dict\n", | ||
"\n", | ||
"new_f = Features({'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),\n", | ||
" 'context': Value(dtype='string', id=None),\n", | ||
" 'id': Value(dtype='string', id=None),\n", | ||
" 'question': Value(dtype='string', id=None),\n", | ||
" 'question_type' : Value(dtype='int32', id=None)\n", | ||
" })" | ||
], | ||
"id": "44f8fd91-aa76-4cf8-b658-915c97a850f5", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "5a5e5d85-1c52-4eb4-b023-619e8f075802" | ||
}, | ||
"source": [ | ||
"# Train과 Validation을 나누기 위한 Index 생성 작업.\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"valid_indeces = np.random.choice(len(ai_hub_dataset), 1000, replace=False)\n", | ||
"train_indeces = np.array(list(set(range(len(ai_hub_dataset))) - set(valid_indeces)))" | ||
], | ||
"id": "5a5e5d85-1c52-4eb4-b023-619e8f075802", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "bf7454df-40d8-49a1-899e-0753d609c77d" | ||
}, | ||
"source": [ | ||
"# 최종적으로 Dataset Dict로 만듦.\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"ai_hub_train_datset, ai_hub_valid_datset = [], []\n", | ||
"for data in ai_hub_dataset.select(train_indeces) :\n", | ||
" ai_hub_train_datset.append(run_preprocess(data))\n", | ||
"for data in ai_hub_dataset.select(valid_indeces) :\n", | ||
" ai_hub_valid_datset.append(run_preprocess(data))\n", | ||
"\n", | ||
"ai_hub_train_datset = pd.DataFrame(ai_hub_train_datset)\n", | ||
"ai_hub_valid_datset = pd.DataFrame(ai_hub_valid_datset)\n", | ||
"ai_hub_dataset = DatasetDict({'train': Dataset.from_pandas(ai_hub_train_datset, features=new_f), 'validation': Dataset.from_pandas(ai_hub_valid_datset, features=new_f)})" | ||
], | ||
"id": "bf7454df-40d8-49a1-899e-0753d609c77d", | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598" | ||
}, | ||
"source": [ | ||
"# Dataset Dict 저장하기. 파일 경로 수정 필요.\n", | ||
"save_pickle(\"/opt/ml/outer_datas/ai_hub_dataset.pkl\", ai_hub_dataset)" | ||
], | ||
"id": "2f45edd3-e307-4c44-8ea3-21d7f52c1598", | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |