Merge pull request #11 from uf-hobi-informatics-lab/origin/advDev
update fp16, predict function in task.py, and data loader
bugface authored Sep 24, 2020
2 parents 427310d + 0c448fb commit f4e5936
Showing 6 changed files with 121 additions and 74 deletions.
47 changes: 31 additions & 16 deletions data_utils.py
@@ -8,22 +8,22 @@
1. load and save data (checked)
2. merge non-seq and seq data (checked)
3. convert data to tensor (checked)
4. prepare 5-CV (todo)
"""

from torch import tensor, float32
from torch import tensor, float32, long
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from config import ModelType
from config import ModelType, ModelLossMode


class SeqEHRDataLoader:

def __init__(self, data, model_type, task='train'):
# TODO switch to pad_packed_seq and pack_padded_seq then we can use batch size
def __init__(self, data, model_type, loss_mode, batch_size, task='train'):
self.batch_size = 1
self.data = data
self.task = task
self.model_type = model_type
self.batch_size = batch_size
self.loss_mode = loss_mode

def __create_tensor_dataset(self):
nonseq, seq, label = [], [], []
@@ -33,11 +33,18 @@ def __create_tensor_dataset(self):
seq.append(each[1])
label.append(each[2])

return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=float32)
)
if self.loss_mode is ModelLossMode.BIN:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=float32)
)
else:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=long)
)

def __create_tensor_dataset_with_time(self):
nonseq, seq, time, label = [], [], [], []
@@ -48,12 +55,20 @@ def __create_tensor_dataset_with_time(self):
time.append(each[2])
label.append(each[3])

return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=float32)
)
if self.loss_mode is ModelLossMode.BIN:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=float32)
)
else:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=long)
)

def create_data_loader(self):
if self.model_type is ModelType.M_TLSTM:
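The loader now threads loss_mode and batch_size through to the tensor construction: with ModelLossMode.BIN the labels stay float32 (one-hot targets for binary cross-entropy), while any other mode casts them to long for cross-entropy. A minimal usage sketch, assuming the pickled files referenced in run.sh hold lists of (nonseq, seq, time, label) tuples for the T-LSTM model; the path and the choice of loss mode here are illustrative only:

import pickle

from config import ModelType, ModelLossMode
from data_utils import SeqEHRDataLoader

with open('./data/train.pkl', 'rb') as f:
    train_data = pickle.load(f)  # assumed to be a list of (nonseq, seq, time, label) samples

# BIN keeps labels as float32; MUL would cast them to torch.long instead
train_loader = SeqEHRDataLoader(
    train_data, ModelType.M_TLSTM, ModelLossMode.BIN, batch_size=1, task='train'
).create_data_loader()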
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
torch>=1.2.0
torch>=1.6.0
transformers>=2.11.0
numpy>1.15.0
scikit-learn~=0.23.0
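The torch requirement moves from 1.2 to 1.6 because the native mixed-precision API (torch.cuda.amp.autocast and GradScaler) used by the updated trainer first shipped in PyTorch 1.6. A small optional import-time check along these lines makes the failure mode explicit on older installs (illustrative, not part of the commit):

import torch

try:
    from torch.cuda.amp import GradScaler, autocast  # available from PyTorch 1.6
except ImportError:
    raise ImportError(
        "PyTorch >= 1.6 is required for fp16 training; found {}".format(torch.__version__))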
11 changes: 6 additions & 5 deletions run.sh
@@ -2,11 +2,11 @@
export CUDA_VISIBLE_DEVICES=-1

# # define data path
train_data='./data/hdp_sample_data/new_labeled_train_cbp.pkl'
test_data='./data/hdp_sample_data/new_labeled_test_cbp.pkl'
new_model='./model/hdp'
res_output='./result/hdp'
log='log.txt'
train_data='./data/train.pkl'
test_data='./data/test.pkl'
new_model='./model'
res_output='./result'
mlog='./log.txt'

# # run experiment
# # train and test
@@ -28,6 +28,7 @@ python task.py \
--nonseq_hidden_dim 64 \
--seq_hidden_dim 64 \
--mix_hidden_dim 64 \
--log $mlog \
--nonseq_representation_dim 64 \
--mix_output_dim 2 \
--loss_mode bin
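run.sh now passes the log file explicitly via --log $mlog, matching the renamed mlog variable above. The logger wiring inside task.py is not shown in this diff, so the following is only a hedged sketch of how such a flag is typically consumed; the helper and logger names are illustrative, grounded only in the fact that training.py calls self.args.logger.info(...):

import argparse
import logging

def build_logger(log_path):
    # illustrative helper: mirror messages to the console and to the --log file
    logger = logging.getLogger("seq_ehr")
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(logging.FileHandler(log_path))
    return logger

parser = argparse.ArgumentParser()
parser.add_argument("--log", default="./log.txt", type=str, help="path of the training log file")
args, _ = parser.parse_known_args()
args.logger = build_logger(args.log)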
30 changes: 20 additions & 10 deletions seq_ehr_model.py
@@ -26,7 +26,7 @@ class MixModelConfig(object):

def __init__(self, seq_input_dim, nonseq_input_dim, dropout_rate=0.1,
nonseq_hidden_dim=128, seq_hidden_dim=128, mix_hidden_dim=128,
nonseq_output_dim=64, mix_output_dim=2, loss_mode=ModelLossMode.BIN):
nonseq_output_dim=64, mix_output_dim=2, loss_mode=ModelLossMode.BIN, **kwargs):
super(MixModelConfig, self).__init__()
self.seq_input_dim = seq_input_dim
self.seq_hidden_dim = seq_hidden_dim
@@ -38,6 +38,12 @@ def __init__(self, seq_input_dim, nonseq_input_dim, dropout_rate=0.1,
self.mix_output_dim = mix_output_dim
self.loss_mode = loss_mode

for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
raise Warning("Can't set {} with value {} for {}".format(key, value, self))

def __str__(self):
s = ""
for k, v in self.__dict__.items():
@@ -86,25 +92,29 @@ def forward(self, x=None):
else:
# seq rep dim = (1, B, h)
_, (seq_rep, _) = self.seq_model(seq_x)
seq_rep = seq_rep.squeeze(0) # (B, h)
# (B, h)
seq_rep = seq_rep.squeeze(0)

# non_seq_rep: (B, h) seq_rep: (B, h)
m_rep = torch.cat([non_seq_rep, seq_rep], dim=1)

# TODO we need to work on this part of the network: test different non-linear function; test number of layers
m_rep = torch.tanh(F.dropout(self.merge_layer(m_rep), p=self.dropout_rate))
raw_rep = self.merge_layer(m_rep)
m_rep = torch.tanh(F.dropout(raw_rep, p=self.dropout_rate))

logits = self.classifier(m_rep) # (B, 2)
# (B, 2)
logits = self.classifier(m_rep)
pred_prob = F.softmax(logits, dim=-1)

# y dim (B, 2)
if self.loss_mode is ModelLossMode.BIN:
# loss = F.binary_cross_entropy(pred_prob, y)
loss = F.binary_cross_entropy_with_logits(logits, y)
# y dim (B, 2)
loss = F.binary_cross_entropy(pred_prob, y)
# loss = F.binary_cross_entropy_with_logits(logits, y)
elif self.loss_mode is ModelLossMode.MUL:
y_hat = y.type(torch.long)
loss = F.cross_entropy(logits, y_hat)
# y dim (B, 1)
# y_hat = y.type(torch.long)
loss = F.cross_entropy(logits, y)
else:
raise NotImplementedError("loss mode only support bin or mul but get {}".format(self.loss_mode.value))

return loss, pred_prob, torch.argmax(pred_prob, dim=-1)
return loss, pred_prob, torch.argmax(pred_prob, dim=-1), m_rep
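The forward pass now returns the mixed representation m_rep alongside the loss, probabilities, and predicted tags, and the two loss modes expect differently typed targets: BIN applies F.binary_cross_entropy to the softmaxed probabilities against one-hot float32 targets of shape (B, 2), while MUL applies F.cross_entropy to the raw logits against long class indices of shape (B,). A standalone toy illustration of that label contract (random tensors, not the project's data):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)                 # (B, 2) raw classifier outputs
pred_prob = F.softmax(logits, dim=-1)

# BIN: one-hot float targets with shape (B, 2)
y_bin = torch.tensor([[1., 0.], [0., 1.], [1., 0.], [0., 1.]])
loss_bin = F.binary_cross_entropy(pred_prob, y_bin)

# MUL: integer class indices with shape (B,) and dtype long
y_mul = torch.tensor([0, 1, 0, 1])
loss_mul = F.cross_entropy(logits, y_mul)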
16 changes: 8 additions & 8 deletions task.py
@@ -10,7 +10,7 @@


def main(args):
# general set up
    # general setup (random seed for reproducibility; the default seed is 13)
random.seed(13)
np.random.seed(13)
torch.manual_seed(13)
@@ -37,9 +37,12 @@ def main(args):
# collect input dim for model init (seq, dim)
args.nonseq_input_dim = train_data[0][0].shape
args.seq_input_dim = train_data[0][1].shape

# create data loader (pin_memory is set to True) -> (B, S, T)
train_data_loader = SeqEHRDataLoader(train_data, args.model_type, task='train').create_data_loader()
test_data_loader = SeqEHRDataLoader(test_data, args.model_type, task='test').create_data_loader()
train_data_loader = SeqEHRDataLoader(
train_data, args.model_type, args.loss_mode, args.batch_size, task='train').create_data_loader()
test_data_loader = SeqEHRDataLoader(
test_data, args.model_type, args.loss_mode, args.batch_size, task='test').create_data_loader()
args.total_step = len(train_data_loader)

# init task runner
@@ -94,14 +97,11 @@ def main(args):
parser.add_argument("--log_step", default=-1, type=int,
help='steps before logging after run training. If -1, log every epoch')
parser.add_argument("--mix_output_dim", default=2, type=int, help='mix model output dim')
parser.add_argument("--batch_size", default=1, type=int, help='how many patients data we feed in each iteration')
parser.add_argument("--loss_mode", default='bin', type=str,
help='using "bin" for Softmax+BCELoss or "mul" for CrossEntropyLoss')
# TODO: enable mix-percision training
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument("--fp16_opt_level", type=str, default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")
help="Whether to use 16-bit float precision (PyTorch 1.6 naive implementation)")

args = parser.parse_args()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
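run.sh exports CUDA_VISIBLE_DEVICES=-1, so a CPU run can still be launched with --fp16 set, while the native torch.cuda.amp autocast/GradScaler path targets CUDA devices. A small guard of the following shape (an illustrative addition, not part of this commit) keeps the two settings consistent:

import torch

def resolve_fp16(fp16_requested):
    # illustrative: fall back to full precision when no CUDA device is visible,
    # since torch.cuda.amp autocast/GradScaler are CUDA-oriented in PyTorch 1.6
    return fp16_requested and torch.cuda.is_available()

# usage sketch: args.fp16 = resolve_fp16(args.fp16)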
89 changes: 55 additions & 34 deletions training.py
@@ -45,22 +45,28 @@ def train(self, train_data_loader):
# load batch to GPU or CPU
batch = tuple(b.to(self.args.device) for b in batch)
# the last element is label
loss, _, _ = self.model(batch)
if self.args.fp16:
with self.autocast:
loss, _, _, _ = self.model(batch)
else:
loss, _, _, _ = self.model(batch)

if self.args.fp16:
with self.amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(self.amp.master_params(self.optimizer), self.args.max_grad_norm)
loss = self.scaler.scale(loss)
loss.backward()
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
self.scaler.step(self.optimizer)
self.scaler.update()
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
tr_loss += loss.item()

self.optimizer.step()
self.optimizer.step()

if self.args.do_warmup:
self.scheduler.step()

tr_loss += loss.item()
global_step += 1

if self.args.log_step > 0 and (step + 1) % self.args.log_step == 0:
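The fp16 branch above follows PyTorch 1.6's native AMP recipe: run the forward pass under autocast, backpropagate the scaled loss, unscale before gradient clipping, then let the scaler drive the optimizer step. A self-contained toy version of that recipe for reference (generic model and data, requires a CUDA device):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(16, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

for _ in range(10):
    x = torch.randn(8, 16, device="cuda")
    y = torch.randint(0, 2, (8,), device="cuda")
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():                    # forward pass in mixed precision
        loss = F.cross_entropy(model(x), y)
    scaler.scale(loss).backward()                      # backprop the scaled loss
    scaler.unscale_(optimizer)                         # unscale so clipping sees true gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)                             # step is skipped if inf/NaN gradients appear
    scaler.update()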
@@ -80,37 +86,48 @@ def predict(self, test_data_loader, do_eval=True):
:param do_eval: if true try to run evaluation (GS must be provided)
"""
batch_iter = tqdm(iterable=test_data_loader, desc='Batch', disable=False)
yt_probs, yp_probs, yt_tags, yp_tags, eval_loss = self._eval(batch_iter)
yt_probs, yp_probs, yt_tags, yp_tags, eval_loss, representations = self._eval(batch_iter)

res_path = None
if self.args.result_path:
self.args.logger.info("Results are reported in {}".format(self.args.result_path))
res_path = Path(self.args.result_path)
res_path.mkdir(parents=True, exist_ok=True)
raw_res_fn = res_path / "raw_results.tsv"

with open(raw_res_fn, "w") as f:
header = "\t".join(
["\t".join([str(i) for i in range(len(yp_probs[0]))]), "predict_label", "true_label"])
f.write(header + "\n")
for each in zip(yp_probs, yp_tags, yt_tags):
probs = "\t".join([str(e) for e in each[0]])
line = "\t".join([probs, str(each[1]), str(each[2])]) + "\n"
f.write(line)

if do_eval:
if self.args.loss_mode is ModelLossMode.BIN:
# BIN use acc and ROC-AUC
acc = self._get_acc(yt=yt_tags, yp=yp_tags)
auc_score, auc_score_1, sensitivity, specificity, J_idx = self._get_auc(yt=yt_probs, yp=yp_probs)
eval_res = "accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}"\
eval_res = "accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}\n"\
.format(acc, auc_score, sensitivity, specificity, J_idx)
else:
# ModelLossMode.MUL use acc and PRF
acc = self._get_acc(yt=yt_tags, yp=yp_tags)
pre, rec, f1 = self._get_prf(yt=yt_tags, yp=yp_tags)
eval_res = "accuracy:{:.4f}\nprecision:{:.4f}\nrecall:{:.4f}\nF1-micro:{:.4f}"\
eval_res = "accuracy:{:.4f}\nprecision:{:.4f}\nrecall:{:.4f}\nF1-micro:{:.4f}\n"\
.format(acc, pre, rec, f1)

if self.args.result_path:
self.args.logger.info("Results are reported in {}".format(self.args.result_path))
res_path = Path(self.args.result_path)
res_path.mkdir(parents=True, exist_ok=True)
raw_res_fn = res_path / "raw_results.tsv"
eval_metric_fn = res_path / "evaluation.txt"

with open(raw_res_fn, "w") as f:
header = "\t".join(["\t".join([str(i) for i in range(len(yp_probs[0]))]), "predict_label", "true_label"])
f.write(header + "\n")
for each in zip(yp_probs, yp_tags, yt_tags):
probs = "\t".join([str(e) for e in each[0]])
line = "\t".join([probs, str(each[1]), str(each[2])]) + "\n"
f.write(line)
try:
auc_score, auc_score_1, sensitivity, specificity, J_idx = self._get_auc(yt=yt_probs, yp=yp_probs)
eval_res += \
"accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}\n" \
.format(acc, auc_score, sensitivity, specificity, J_idx)
except Exception:
pass

if res_path:
eval_metric_fn = res_path / "evaluation.txt"
with open(eval_metric_fn, "w") as f:
f.write(eval_res)

@@ -181,45 +198,49 @@ def _init_new_model(self):
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps,
num_training_steps=t_total)

# mix precision training TODO: update to pytorch naive implementation
# mix precision training
self.scaler = None
self.autocast = None
if self.args.fp16:
try:
from apex import amp
self.amp = amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
self.model, self.optimizer = self.amp.initialize(self.model, self.optimizer,
opt_level=self.args.fp16_opt_level)
self.autocast = torch.cuda.amp.autocast()
self.scaler = torch.cuda.amp.GradScaler()
except Exception:
raise ImportError("You need to update to PyTorch 1.6, the current PyTorch version is {}"
.format(torch.__version__))

def _eval(self, batch_iter):
self.model.eval()
eval_loss = 0.
global_step = 0
yt_probs, yp_probs, yt_tags, yp_tags = None, None, None, None
yt_probs, yp_probs, yt_tags, yp_tags, reps = None, None, None, None, None
for step, batch in enumerate(batch_iter):
batch = tuple(b.to(self.args.device) for b in batch)
with torch.no_grad():
loss, pred_probs, pred_tags = self.model(batch)
loss, pred_probs, pred_tags, rep = self.model(batch)
eval_loss += loss.item()
global_step += 1

pred_probs = pred_probs.detach().cpu().numpy() # to cpu as np array
pred_tags = pred_tags.detach().cpu().numpy()
true_probs = batch[-1].detach().cpu().numpy()
true_tags = np.argmax(true_probs, axis=-1)
rep = rep.detach().cpu().numpy()

if yt_probs is None:
yt_probs = true_probs
yp_probs = pred_probs
yt_tags = true_tags
yp_tags = pred_tags
reps = rep
else:
yp_probs = np.concatenate([yp_probs, pred_probs], axis=0)
yp_tags = np.concatenate([yp_tags, pred_tags], axis=0)
yt_probs = np.concatenate([yt_probs, true_probs], axis=0)
yt_tags = np.concatenate([yt_tags, true_tags], axis=0)
reps = np.concatenate([reps, rep], axis=0)

return yt_probs, yp_probs, yt_tags, yp_tags, eval_loss/global_step
return yt_probs, yp_probs, yt_tags, yp_tags, eval_loss/global_step, reps

@staticmethod
def _get_auc(yt, yp):
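_eval() and predict() now also return the concatenated mixed representations produced by the model's merge layer, one row per patient. The diff does not show how they are consumed downstream; a hypothetical helper for persisting them next to the raw results could look like this (the file name and placement are assumptions):

import numpy as np
from pathlib import Path

def save_representations(representations, result_path):
    # hypothetical: dump the (N, mix_hidden_dim) patient representations returned by _eval
    res_path = Path(result_path)
    res_path.mkdir(parents=True, exist_ok=True)
    np.save(str(res_path / "representations.npy"), np.asarray(representations))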
