Merge pull request #11 from uf-hobi-informatics-lab/origin/advDev
update fp16, predict function in task.py, and data loader
bugface authored Sep 24, 2020
2 parents 427310d + 0c448fb commit f4e5936
Showing 6 changed files with 121 additions and 74 deletions.
47 changes: 31 additions & 16 deletions data_utils.py
@@ -8,22 +8,22 @@
1. load and save data (checked)
2. merge non-seq and seq data (checked)
3. convert data to tensor (checked)
4. prepare 5-CV (todo)
"""

from torch import tensor, float32
from torch import tensor, float32, long
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from config import ModelType
from config import ModelType, ModelLossMode


class SeqEHRDataLoader:

def __init__(self, data, model_type, task='train'):
# TODO switch to pad_packed_seq and pack_padded_seq then we can use batch size
def __init__(self, data, model_type, loss_mode, batch_size, task='train'):
self.batch_size = 1
self.data = data
self.task = task
self.model_type = model_type
self.batch_size = batch_size
self.loss_mode = loss_mode

def __create_tensor_dataset(self):
nonseq, seq, label = [], [], []
@@ -33,11 +33,18 @@ def __create_tensor_dataset(self):
seq.append(each[1])
label.append(each[2])

return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=float32)
)
if self.loss_mode is ModelLossMode.BIN:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=float32)
)
else:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(label, dtype=long)
)

def __create_tensor_dataset_with_time(self):
nonseq, seq, time, label = [], [], [], []
@@ -48,12 +55,20 @@ def __create_tensor_dataset_with_time(self):
time.append(each[2])
label.append(each[3])

return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=float32)
)
if self.loss_mode is ModelLossMode.BIN:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=float32)
)
else:
return TensorDataset(
tensor(nonseq, dtype=float32),
tensor(seq, dtype=float32),
tensor(time, dtype=float32),
tensor(label, dtype=long)
)

def create_data_loader(self):
if self.model_type is ModelType.M_TLSTM:
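The loader now threads loss_mode and batch_size through to the tensor construction: with ModelLossMode.BIN the labels stay float32 (one-hot targets for binary cross-entropy), while any other mode casts them to long for cross-entropy. A minimal usage sketch, assuming the pickled files referenced in run.sh hold lists of (nonseq, seq, time, label) tuples for the T-LSTM model; the path and the choice of loss mode here are illustrative only:

import pickle

from config import ModelType, ModelLossMode
from data_utils import SeqEHRDataLoader

with open('./data/train.pkl', 'rb') as f:
    train_data = pickle.load(f)  # assumed to be a list of (nonseq, seq, time, label) samples

# BIN keeps labels as float32; MUL would cast them to torch.long instead
train_loader = SeqEHRDataLoader(
    train_data, ModelType.M_TLSTM, ModelLossMode.BIN, batch_size=1, task='train'
).create_data_loader()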
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
torch>=1.2.0
torch>=1.6.0
transformers>=2.11.0
numpy>1.15.0
scikit-learn~=0.23.0
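The torch requirement moves from 1.2 to 1.6 because the native mixed-precision API (torch.cuda.amp.autocast and GradScaler) used by the updated trainer first shipped in PyTorch 1.6. A small optional import-time check along these lines makes the failure mode explicit on older installs (illustrative, not part of the commit):

import torch

try:
    from torch.cuda.amp import GradScaler, autocast  # available from PyTorch 1.6
except ImportError:
    raise ImportError(
        "PyTorch >= 1.6 is required for fp16 training; found {}".format(torch.__version__))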
11 changes: 6 additions & 5 deletions run.sh
@@ -2,11 +2,11 @@
export CUDA_VISIBLE_DEVICES=-1

# # define data path
train_data='./data/hdp_sample_data/new_labeled_train_cbp.pkl'
test_data='./data/hdp_sample_data/new_labeled_test_cbp.pkl'
new_model='./model/hdp'
res_output='./result/hdp'
log='log.txt'
train_data='./data/train.pkl'
test_data='./data/test.pkl'
new_model='./model'
res_output='./result'
mlog='./log.txt'

# # run experiment
# # train and test
@@ -28,6 +28,7 @@ python task.py \
--nonseq_hidden_dim 64 \
--seq_hidden_dim 64 \
--mix_hidden_dim 64 \
--log $mlog \
--nonseq_representation_dim 64 \
--mix_output_dim 2 \
--loss_mode bin
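run.sh now passes the log file explicitly via --log $mlog, matching the renamed mlog variable above. The logger wiring inside task.py is not shown in this diff, so the following is only a hedged sketch of how such a flag is typically consumed; the helper and logger names are illustrative, grounded only in the fact that training.py calls self.args.logger.info(...):

import argparse
import logging

def build_logger(log_path):
    # illustrative helper: mirror messages to the console and to the --log file
    logger = logging.getLogger("seq_ehr")
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(logging.FileHandler(log_path))
    return logger

parser = argparse.ArgumentParser()
parser.add_argument("--log", default="./log.txt", type=str, help="path of the training log file")
args, _ = parser.parse_known_args()
args.logger = build_logger(args.log)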
30 changes: 20 additions & 10 deletions seq_ehr_model.py
@@ -26,7 +26,7 @@ class MixModelConfig(object):

def __init__(self, seq_input_dim, nonseq_input_dim, dropout_rate=0.1,
nonseq_hidden_dim=128, seq_hidden_dim=128, mix_hidden_dim=128,
nonseq_output_dim=64, mix_output_dim=2, loss_mode=ModelLossMode.BIN):
nonseq_output_dim=64, mix_output_dim=2, loss_mode=ModelLossMode.BIN, **kwargs):
super(MixModelConfig, self).__init__()
self.seq_input_dim = seq_input_dim
self.seq_hidden_dim = seq_hidden_dim
@@ -38,6 +38,12 @@ def __init__(self, seq_input_dim, nonseq_input_dim, dropout_rate=0.1,
self.mix_output_dim = mix_output_dim
self.loss_mode = loss_mode

for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
raise Warning("Can't set {} with value {} for {}".format(key, value, self))

def __str__(self):
s = ""
for k, v in self.__dict__.items():
@@ -86,25 +92,29 @@ def forward(self, x=None):
else:
# seq rep dim = (1, B, h)
_, (seq_rep, _) = self.seq_model(seq_x)
seq_rep = seq_rep.squeeze(0) # (B, h)
# (B, h)
seq_rep = seq_rep.squeeze(0)

# non_seq_rep: (B, h) seq_rep: (B, h)
m_rep = torch.cat([non_seq_rep, seq_rep], dim=1)

# TODO we need to work on this part of the network: test different non-linear function; test number of layers
m_rep = torch.tanh(F.dropout(self.merge_layer(m_rep), p=self.dropout_rate))
raw_rep = self.merge_layer(m_rep)
m_rep = torch.tanh(F.dropout(raw_rep, p=self.dropout_rate))

logits = self.classifier(m_rep) # (B, 2)
# (B, 2)
logits = self.classifier(m_rep)
pred_prob = F.softmax(logits, dim=-1)

# y dim (B, 2)
if self.loss_mode is ModelLossMode.BIN:
# loss = F.binary_cross_entropy(pred_prob, y)
loss = F.binary_cross_entropy_with_logits(logits, y)
# y dim (B, 2)
loss = F.binary_cross_entropy(pred_prob, y)
# loss = F.binary_cross_entropy_with_logits(logits, y)
elif self.loss_mode is ModelLossMode.MUL:
y_hat = y.type(torch.long)
loss = F.cross_entropy(logits, y_hat)
# y dim (B, 1)
# y_hat = y.type(torch.long)
loss = F.cross_entropy(logits, y)
else:
raise NotImplementedError("loss mode only support bin or mul but get {}".format(self.loss_mode.value))

return loss, pred_prob, torch.argmax(pred_prob, dim=-1)
return loss, pred_prob, torch.argmax(pred_prob, dim=-1), m_rep
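The forward pass now returns the mixed representation m_rep alongside the loss, probabilities, and predicted tags, and the two loss modes expect differently typed targets: BIN applies F.binary_cross_entropy to the softmaxed probabilities against one-hot float32 targets of shape (B, 2), while MUL applies F.cross_entropy to the raw logits against long class indices of shape (B,). A standalone toy illustration of that label contract (random tensors, not the project's data):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)                 # (B, 2) raw classifier outputs
pred_prob = F.softmax(logits, dim=-1)

# BIN: one-hot float targets with shape (B, 2)
y_bin = torch.tensor([[1., 0.], [0., 1.], [1., 0.], [0., 1.]])
loss_bin = F.binary_cross_entropy(pred_prob, y_bin)

# MUL: integer class indices with shape (B,) and dtype long
y_mul = torch.tensor([0, 1, 0, 1])
loss_mul = F.cross_entropy(logits, y_mul)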
16 changes: 8 additions & 8 deletions task.py
@@ -10,7 +10,7 @@


def main(args):
# general set up
    # general setup (random seed for reproducibility; the default seed is 13)
random.seed(13)
np.random.seed(13)
torch.manual_seed(13)
@@ -37,9 +37,12 @@ def main(args):
# collect input dim for model init (seq, dim)
args.nonseq_input_dim = train_data[0][0].shape
args.seq_input_dim = train_data[0][1].shape

# create data loader (pin_memory is set to True) -> (B, S, T)
train_data_loader = SeqEHRDataLoader(train_data, args.model_type, task='train').create_data_loader()
test_data_loader = SeqEHRDataLoader(test_data, args.model_type, task='test').create_data_loader()
train_data_loader = SeqEHRDataLoader(
train_data, args.model_type, args.loss_mode, args.batch_size, task='train').create_data_loader()
test_data_loader = SeqEHRDataLoader(
test_data, args.model_type, args.loss_mode, args.batch_size, task='test').create_data_loader()
args.total_step = len(train_data_loader)

# init task runner
@@ -94,14 +97,11 @@ def main(args):
parser.add_argument("--log_step", default=-1, type=int,
help='steps before logging after run training. If -1, log every epoch')
parser.add_argument("--mix_output_dim", default=2, type=int, help='mix model output dim')
parser.add_argument("--batch_size", default=1, type=int, help='how many patients data we feed in each iteration')
parser.add_argument("--loss_mode", default='bin', type=str,
help='using "bin" for Softmax+BCELoss or "mul" for CrossEntropyLoss')
# TODO: enable mix-percision training
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument("--fp16_opt_level", type=str, default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")
help="Whether to use 16-bit float precision (PyTorch 1.6 naive implementation)")

args = parser.parse_args()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
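run.sh exports CUDA_VISIBLE_DEVICES=-1, so a CPU run can still be launched with --fp16 set, while the native torch.cuda.amp autocast/GradScaler path targets CUDA devices. A small guard of the following shape (an illustrative addition, not part of this commit) keeps the two settings consistent:

import torch

def resolve_fp16(fp16_requested):
    # illustrative: fall back to full precision when no CUDA device is visible,
    # since torch.cuda.amp autocast/GradScaler are CUDA-oriented in PyTorch 1.6
    return fp16_requested and torch.cuda.is_available()

# usage sketch: args.fp16 = resolve_fp16(args.fp16)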
89 changes: 55 additions & 34 deletions training.py
@@ -45,22 +45,28 @@ def train(self, train_data_loader):
# load batch to GPU or CPU
batch = tuple(b.to(self.args.device) for b in batch)
# the last element is label
loss, _, _ = self.model(batch)
if self.args.fp16:
with self.autocast:
loss, _, _, _ = self.model(batch)
else:
loss, _, _, _ = self.model(batch)

if self.args.fp16:
with self.amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(self.amp.master_params(self.optimizer), self.args.max_grad_norm)
loss = self.scaler.scale(loss)
loss.backward()
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
self.scaler.step(self.optimizer)
self.scaler.update()
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
tr_loss += loss.item()

self.optimizer.step()
self.optimizer.step()

if self.args.do_warmup:
self.scheduler.step()

tr_loss += loss.item()
global_step += 1

if self.args.log_step > 0 and (step + 1) % self.args.log_step == 0:
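The fp16 branch above follows PyTorch 1.6's native AMP recipe: run the forward pass under autocast, backpropagate the scaled loss, unscale before gradient clipping, then let the scaler drive the optimizer step. A self-contained toy version of that recipe for reference (generic model and data, requires a CUDA device):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(16, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

for _ in range(10):
    x = torch.randn(8, 16, device="cuda")
    y = torch.randint(0, 2, (8,), device="cuda")
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():                    # forward pass in mixed precision
        loss = F.cross_entropy(model(x), y)
    scaler.scale(loss).backward()                      # backprop the scaled loss
    scaler.unscale_(optimizer)                         # unscale so clipping sees true gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)                             # step is skipped if inf/NaN gradients appear
    scaler.update()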
@@ -80,37 +86,48 @@ def predict(self, test_data_loader, do_eval=True):
:param do_eval: if true try to run evaluation (GS must be provided)
"""
batch_iter = tqdm(iterable=test_data_loader, desc='Batch', disable=False)
yt_probs, yp_probs, yt_tags, yp_tags, eval_loss = self._eval(batch_iter)
yt_probs, yp_probs, yt_tags, yp_tags, eval_loss, representations = self._eval(batch_iter)

res_path = None
if self.args.result_path:
self.args.logger.info("Results are reported in {}".format(self.args.result_path))
res_path = Path(self.args.result_path)
res_path.mkdir(parents=True, exist_ok=True)
raw_res_fn = res_path / "raw_results.tsv"

with open(raw_res_fn, "w") as f:
header = "\t".join(
["\t".join([str(i) for i in range(len(yp_probs[0]))]), "predict_label", "true_label"])
f.write(header + "\n")
for each in zip(yp_probs, yp_tags, yt_tags):
probs = "\t".join([str(e) for e in each[0]])
line = "\t".join([probs, str(each[1]), str(each[2])]) + "\n"
f.write(line)

if do_eval:
if self.args.loss_mode is ModelLossMode.BIN:
# BIN use acc and ROC-AUC
acc = self._get_acc(yt=yt_tags, yp=yp_tags)
auc_score, auc_score_1, sensitivity, specificity, J_idx = self._get_auc(yt=yt_probs, yp=yp_probs)
eval_res = "accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}"\
eval_res = "accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}\n"\
.format(acc, auc_score, sensitivity, specificity, J_idx)
else:
# ModelLossMode.MUL use acc and PRF
acc = self._get_acc(yt=yt_tags, yp=yp_tags)
pre, rec, f1 = self._get_prf(yt=yt_tags, yp=yp_tags)
eval_res = "accuracy:{:.4f}\nprecision:{:.4f}\nrecall:{:.4f}\nF1-micro:{:.4f}"\
eval_res = "accuracy:{:.4f}\nprecision:{:.4f}\nrecall:{:.4f}\nF1-micro:{:.4f}\n"\
.format(acc, pre, rec, f1)

if self.args.result_path:
self.args.logger.info("Results are reported in {}".format(self.args.result_path))
res_path = Path(self.args.result_path)
res_path.mkdir(parents=True, exist_ok=True)
raw_res_fn = res_path / "raw_results.tsv"
eval_metric_fn = res_path / "evaluation.txt"

with open(raw_res_fn, "w") as f:
header = "\t".join(["\t".join([str(i) for i in range(len(yp_probs[0]))]), "predict_label", "true_label"])
f.write(header + "\n")
for each in zip(yp_probs, yp_tags, yt_tags):
probs = "\t".join([str(e) for e in each[0]])
line = "\t".join([probs, str(each[1]), str(each[2])]) + "\n"
f.write(line)
try:
auc_score, auc_score_1, sensitivity, specificity, J_idx = self._get_auc(yt=yt_probs, yp=yp_probs)
eval_res += \
"accuracy:{:.4f}\nauc_score:{:.4f}\nsensitivity:{:.4f}\nspecificity:{:.4f}\nJ_index:{}\n" \
.format(acc, auc_score, sensitivity, specificity, J_idx)
except Exception:
pass

if res_path:
eval_metric_fn = res_path / "evaluation.txt"
with open(eval_metric_fn, "w") as f:
f.write(eval_res)

@@ -181,45 +198,49 @@ def _init_new_model(self):
self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps,
num_training_steps=t_total)

# mix precision training TODO: update to pytorch naive implementation
# mix precision training
self.scaler = None
self.autocast = None
if self.args.fp16:
try:
from apex import amp
self.amp = amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
self.model, self.optimizer = self.amp.initialize(self.model, self.optimizer,
opt_level=self.args.fp16_opt_level)
self.autocast = torch.cuda.amp.autocast()
self.scaler = torch.cuda.amp.GradScaler()
except Exception:
raise ImportError("You need to update to PyTorch 1.6, the current PyTorch version is {}"
.format(torch.__version__))

def _eval(self, batch_iter):
self.model.eval()
eval_loss = 0.
global_step = 0
yt_probs, yp_probs, yt_tags, yp_tags = None, None, None, None
yt_probs, yp_probs, yt_tags, yp_tags, reps = None, None, None, None, None
for step, batch in enumerate(batch_iter):
batch = tuple(b.to(self.args.device) for b in batch)
with torch.no_grad():
loss, pred_probs, pred_tags = self.model(batch)
loss, pred_probs, pred_tags, rep = self.model(batch)
eval_loss += loss.item()
global_step += 1

pred_probs = pred_probs.detach().cpu().numpy() # to cpu as np array
pred_tags = pred_tags.detach().cpu().numpy()
true_probs = batch[-1].detach().cpu().numpy()
true_tags = np.argmax(true_probs, axis=-1)
rep = rep.detach().cpu().numpy()

if yt_probs is None:
yt_probs = true_probs
yp_probs = pred_probs
yt_tags = true_tags
yp_tags = pred_tags
reps = rep
else:
yp_probs = np.concatenate([yp_probs, pred_probs], axis=0)
yp_tags = np.concatenate([yp_tags, pred_tags], axis=0)
yt_probs = np.concatenate([yt_probs, true_probs], axis=0)
yt_tags = np.concatenate([yt_tags, true_tags], axis=0)
reps = np.concatenate([reps, rep], axis=0)

return yt_probs, yp_probs, yt_tags, yp_tags, eval_loss/global_step
return yt_probs, yp_probs, yt_tags, yp_tags, eval_loss/global_step, reps

@staticmethod
def _get_auc(yt, yp):
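_eval() and predict() now also return the concatenated mixed representations produced by the model's merge layer, one row per patient. The diff does not show how they are consumed downstream; a hypothetical helper for persisting them next to the raw results could look like this (the file name and placement are assumptions):

import numpy as np
from pathlib import Path

def save_representations(representations, result_path):
    # hypothetical: dump the (N, mix_hidden_dim) patient representations returned by _eval
    res_path = Path(result_path)
    res_path.mkdir(parents=True, exist_ok=True)
    np.save(str(res_path / "representations.npy"), np.asarray(representations))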
