Commit

change dir name
ohsuz committed May 23, 2021
1 parent 49dfb9f commit 37f7175
Showing 33 changed files with 5,055 additions and 0 deletions.
Empty file added TEAM-IKYO-CODE/README.md
Empty file.
87 changes: 87 additions & 0 deletions TEAM-IKYO-CODE/code/arguments.py
@@ -0,0 +1,87 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
default="xlm-roberta-large",
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
use_custom_model: str = field(
default='QAConvModelV2',
metadata={"help": "Choose one of ['ConvModel', 'QueryAttentionModel', 'QAConvModelV1', 'QAConvModelV2']"}
)
    use_pretrained_model: bool = field(
        default=False,
        metadata={"help": "Whether to load a model already fine-tuned on KorQuAD"}
    )
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
    retrieval_type: Optional[str] = field(
        default="elastic", metadata={"help": "Retrieval method to use; currently 'elastic' (Elasticsearch) is supported"}
    )
    retrieval_elastic_index: Optional[str] = field(
        default="wiki-index-split-800", metadata={"help": "Elasticsearch index name. One of [wiki-index, wiki-index-split-400, wiki-index-split-800 (best), wiki-index-split-1000]"}
    )
    retrieval_elastic_num: Optional[int] = field(
        default=35,
        metadata={"help": "The number of passages to retrieve from Elasticsearch"},
    )

@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
    dataset_name: Optional[str] = field(
        default="question_type", metadata={"help": "Choose one of ['basic', 'preprocessed', 'concat', 'korquad', 'only_korquad', 'question_type', 'ai_hub', 'random_masking', 'token_masking']"}
    )
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": "Whether to pad all samples to `max_seq_length`. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
"be faster on GPU but will be slower on TPU)."
},
)
doc_stride: int = field(
default=128,
metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
)
max_answer_length: int = field(
default=30,
metadata={
"help": "The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another."
},
)
train_retrieval: bool = field(
default=True,
metadata={"help": "Whether to train sparse/dense embedding (prepare for retrieval)."},
)
    eval_retrieval: bool = field(
        default=True,
        metadata={"help": "Whether to run passage retrieval using sparse/dense embeddings."},
    )
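
A minimal sketch of how these dataclasses are typically consumed, assuming the project parses them together with transformers' TrainingArguments via HfArgumentParser (the script name and command-line flags below are illustrative, not taken from the repo):

from transformers import HfArgumentParser, TrainingArguments

# e.g. python train.py --model_name_or_path xlm-roberta-large --dataset_name question_type --output_dir ./outputs
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

print(model_args.model_name_or_path)  # "xlm-roberta-large" unless overridden
print(data_args.max_seq_length)       # 384 by default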

126 changes: 126 additions & 0 deletions TEAM-IKYO-CODE/code/data_processing.py
@@ -0,0 +1,126 @@

class DataProcessor:
    """Tokenizes QA examples into training and validation features."""

    def __init__(self, tokenizer, max_length=384, doc_stride=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.doc_stride = doc_stride

def prepare_train_features(self, examples):
        # Tokenize question/context pairs, truncating only the context ("only_second")
        # and letting long contexts overflow into multiple features.
        tokenized_examples = self.tokenizer(
examples["question"],
examples["context"],
truncation="only_second",
max_length=self.max_length,
stride=self.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)

        # Long contexts overflow into multiple features; map each feature back to its source example.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        if 'question_type' in examples:
            tokenized_examples['question_type'] = []

for i, offsets in enumerate(offset_mapping):
input_ids = tokenized_examples["input_ids"][i]
sequence_ids = tokenized_examples.sequence_ids(i)
cls_index = input_ids.index(self.tokenizer.cls_token_id)

sample_index = sample_mapping[i]
answers = examples["answers"][sample_index]

            if 'question_type' in examples:
                tokenized_examples['question_type'].append(examples['question_type'][sample_index])

            # Unanswerable questions are labeled with the CLS index.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
else:
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

                # Move to the first and last tokens of the context (sequence id 1).
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # If the answer is not fully inside this span, label with the CLS index.
                if not (
                    offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char
                ):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move token_start_index and token_end_index to the answer's token boundaries.
                    while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                    ):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

return tokenized_examples

def prepare_validation_features(self, examples):
tokenized_examples = self.tokenizer(
examples["question"],
examples["context"],
truncation="only_second",
max_length=self.max_length,
stride=self.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
tokenized_examples["example_id"] = []

        if 'question_type' in examples:
            tokenized_examples['question_type'] = []

        for i in range(len(tokenized_examples['input_ids'])):
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1  # the context is the second sequence in each question/context pair
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            if 'question_type' in examples:
                tokenized_examples['question_type'].append(examples['question_type'][sample_index])

            # Null out offsets of non-context tokens so post-processing can skip them.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

return tokenized_examples

def train_tokenizer(self, train_dataset, column_names):
train_dataset = train_dataset.map(
self.prepare_train_features,
batched=True,
num_proc=4,
remove_columns=column_names,
)

return train_dataset

    def val_tokenizer(self, val_dataset, column_names):
val_dataset = val_dataset.map(
self.prepare_validation_features,
batched=True,
num_proc=4,
remove_columns=column_names,
)

return val_dataset
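
A minimal usage sketch for DataProcessor, assuming a dataset saved with datasets' save_to_disk that has "question", "context", and "answers" columns (the dataset path below is hypothetical):

from datasets import load_from_disk
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
processor = DataProcessor(tokenizer, max_length=384, doc_stride=128)

datasets = load_from_disk("./data/train_dataset")  # hypothetical path
column_names = datasets["train"].column_names

# Long contexts are split into overlapping features; labels are token indices
# into each feature rather than character offsets.
train_features = processor.train_tokenizer(datasets["train"], column_names)
val_features = processor.val_tokenizer(datasets["validation"], column_names)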
37 changes: 37 additions & 0 deletions TEAM-IKYO-CODE/code/elasticsearch_retrieval.py
@@ -0,0 +1,37 @@
from elasticsearch import Elasticsearch

def elastic_setting(index_name='wiki-index'):
    # Connect to a local Elasticsearch node; assumes the service is already running on port 9200.
    config = {'host': 'localhost', 'port': 9200}
    es = Elasticsearch([config])

    return es, index_name
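
For context, a hedged sketch of creating a compatible index — the mapping here is an assumption inferred from the 'document_text' field queried below, not the project's actual index settings:

index_config = {"mappings": {"properties": {"document_text": {"type": "text"}}}}
es, index_name = elastic_setting('wiki-index')
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_config)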


def search_es(es_obj, index_name, question_text, n_results):
    # Build a full-text match query against the 'document_text' field.
query = {
'query': {
'match': {
'document_text': question_text
}
}
}
    # n_results: how many of the top-scoring documents to return
res = es_obj.search(index=index_name, body=query, size=n_results)

return res


def elastic_retrieval(es, index_name, question_text, n_results):
res = search_es(es, index_name, question_text, n_results)
    # Keep only the matched contexts, paired with their relevance scores.
    context_list = [(hit['_source']['document_text'], hit['_score']) for hit in res['hits']['hits']]
    return context_list
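
A short end-to-end sketch, assuming an Elasticsearch node is running on localhost:9200 and the named index is already populated (the question string is illustrative):

es, index_name = elastic_setting('wiki-index-split-800')
results = elastic_retrieval(es, index_name, "대한민국의 수도는 어디인가?", n_results=35)  # "What is the capital of South Korea?"

# Each entry is a (document_text, relevance_score) pair, best match first.
for text, score in results[:3]:
    print(f"{score:.2f}  {text[:80]}")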