forked from dsindex/ntagger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
126 lines (110 loc) · 5.19 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from __future__ import absolute_import, division, print_function
import os
import pdb
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def prepare_dataset(config, filepath, DatasetClass, sampling=False, num_workers=1, batch_size=0):
    """Build a DataLoader over ``DatasetClass(config, filepath)``.

    Args:
        config: dict whose 'opt' entry holds runtime options (reads opt.batch_size
            and, when present, opt.distributed).
        filepath: data file path handed to the dataset constructor.
        DatasetClass: dataset type, instantiated as DatasetClass(config, filepath).
        sampling: when True, shuffle via RandomSampler; otherwise iterate in order.
        num_workers: number of DataLoader worker processes.
        batch_size: overrides opt.batch_size when > 0.

    Returns:
        A torch DataLoader over the freshly constructed dataset.
    """
    opt = config['opt']
    dataset = DatasetClass(config, filepath)
    sampler = RandomSampler(dataset) if sampling else SequentialSampler(dataset)
    # NOTE(review): in distributed mode the sampler chosen above is discarded and
    # DistributedSampler's own (shuffling) default applies regardless of
    # `sampling` -- confirm this is intended.
    if getattr(opt, 'distributed', False):
        sampler = DistributedSampler(dataset)
    bz = batch_size if batch_size > 0 else opt.batch_size
    loader = DataLoader(dataset, batch_size=bz, num_workers=num_workers,
                        sampler=sampler, pin_memory=True)
    logging.getLogger(__name__).info("[{} data loaded]".format(filepath))
    return loader
class CoNLLGloveDataset(Dataset):
    """Pre-encoded CoNLL data for a GloVe-based tagger.

    Each non-empty input line carries four tab-separated fields of
    space-separated values: label ids, token ids, pos ids, and the raw
    tokens (used only to derive ELMo character ids).
    """
    def __init__(self, config, path):
        # local import: allennlp is only required when this dataset is used
        from allennlp.modules.elmo import batch_to_ids
        pad_ids = [config['pad_token_id']] * config['char_n_ctx']
        all_token_ids = []
        all_pos_ids = []
        all_char_ids = []
        all_label_ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # robustness: skip blank lines instead of crashing on items[1]
                    continue
                items = line.split('\t')
                token_ids = [int(d) for d in items[1].split()]
                pos_ids = [int(d) for d in items[2].split()]
                # using ELMo.batch_to_ids, compute character ids: ex) 'The' [259, 85, 105, 102, 260, 261, 261, ...]
                # (actually byte-based, char_vocab_size == 262, char_padding_idx == 261)
                tokens = items[3].split()
                char_ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
                # pad character rows up to the (already padded) token length
                for _ in range(len(token_ids) - len(char_ids)):
                    char_ids.append(pad_ids)
                label_ids = [int(d) for d in items[0].split()]
                all_token_ids.append(token_ids)
                all_pos_ids.append(pos_ids)
                all_char_ids.append(char_ids)
                all_label_ids.append(label_ids)
        # NOTE(review): torch.tensor() requires every sequence to share one
        # length; assumes upstream preprocessing padded all lines to a fixed
        # n_ctx -- confirm against the preprocessing step.
        all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
        all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
        all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
        all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)
        self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
        self.y = all_label_ids
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        # returns ((token_ids, pos_ids, char_ids), label_ids)
        return self.x[idx], self.y[idx]
class CoNLLBertDataset(Dataset):
    """CoNLL features pre-tokenized for BERT, loaded from a torch-saved file."""
    def __init__(self, config, path):
        # The file holds a torch.save()'d list of arbitrary pickled feature
        # objects, so weights_only=False is required (this matches the
        # pre-2.6 torch.load default). Pickle can execute code: only load
        # files you trust.
        features = torch.load(path, weights_only=False)
        # convert per-feature lists to stacked long tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_pos_ids = torch.tensor([f.pos_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
        self.x = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_pos_ids)
        self.y = all_label_ids
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        # returns ((input_ids, input_mask, segment_ids, pos_ids), label_ids)
        return self.x[idx], self.y[idx]
class CoNLLElmoDataset(Dataset):
    """Pre-encoded CoNLL data for an ELMo-based tagger.

    Each non-empty input line carries four tab-separated fields of
    space-separated values: label ids, token ids, pos ids, and the raw
    tokens (used to derive ELMo character ids).
    """
    def __init__(self, config, path):
        # local import: allennlp is only required when this dataset is used
        from allennlp.modules.elmo import batch_to_ids
        pad_ids = [config['pad_token_id']] * config['char_n_ctx']
        all_token_ids = []
        all_pos_ids = []
        all_char_ids = []
        all_label_ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # robustness: skip blank lines instead of crashing on items[1]
                    continue
                items = line.split('\t')
                token_ids = [int(d) for d in items[1].split()]
                pos_ids = [int(d) for d in items[2].split()]
                # compute ELMo character ids
                tokens = items[3].split()
                char_ids = batch_to_ids([tokens])[0].detach().cpu().numpy().tolist()
                # pad character rows up to the (already padded) token length
                for _ in range(len(token_ids) - len(char_ids)):
                    char_ids.append(pad_ids)
                label_ids = [int(d) for d in items[0].split()]
                all_token_ids.append(token_ids)
                all_pos_ids.append(pos_ids)
                all_char_ids.append(char_ids)
                all_label_ids.append(label_ids)
        # NOTE(review): torch.tensor() requires every sequence to share one
        # length; assumes upstream preprocessing padded all lines to a fixed
        # n_ctx -- confirm against the preprocessing step.
        all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)
        all_pos_ids = torch.tensor(all_pos_ids, dtype=torch.long)
        all_char_ids = torch.tensor(all_char_ids, dtype=torch.long)
        all_label_ids = torch.tensor(all_label_ids, dtype=torch.long)
        self.x = TensorDataset(all_token_ids, all_pos_ids, all_char_ids)
        self.y = all_label_ids
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        # returns ((token_ids, pos_ids, char_ids), label_ids)
        return self.x[idx], self.y[idx]