Commit
Merge pull request #12 from uf-hobi-informatics-lab/origin/advDev
refactor project layout; add embedding models
bugface authored Nov 28, 2020
2 parents f4e5936 + 77e6cc9 commit 066f7bc
Showing 22 changed files with 846 additions and 147 deletions.
Empty file added Embeddings/__init__.py
Empty file.
123 changes: 123 additions & 0 deletions Embeddings/embedding_models.py
@@ -0,0 +1,123 @@
"""
a simple model to handle EHR sequence data with embeddings
we support LSTM, GRU, and TLSTM as the sequence learning framework
"""


import torch
from torch import nn
import sys
sys.path.append("../")

from TLSTM.tlstm import TLSTMCell
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode


class SeqEmbEHRConfig:
def __init__(self, input_dim=10, output_dim=1, hidden_dim=128, emb_dim=32, drop_prob=0.1, emb_freeze=False,
model_type=ModelType.M_GRU, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM):
self.input_dim = input_dim
self.emb_dim = emb_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.model_type = model_type
self.loss_type = loss_type
self.merge_type = merge_type
self.drop_prob = drop_prob
self.emb_freeze = emb_freeze

def __str__(self):
s = ""
for k, v in self.__dict__.items():
s += "{}={}\n".format(k, v)
return s


class SeqEmbEHR(nn.Module):

def __init__(self, config, emb_weights=None):
super().__init__()

self.merge_type = config.merge_type
self.loss_type = config.loss_type
self.model_type = config.model_type

self.classifier = nn.Linear(config.hidden_dim, config.output_dim)
self.drop_output = nn.Dropout(p=config.drop_prob)

# could be replaced by EmbeddingBag
self.embedding_layer = nn.Embedding.from_pretrained(
torch.tensor(emb_weights, dtype=torch.float32), freeze=config.emb_freeze)
self.emb_dim = self.embedding_layer.embedding_dim
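# A hedged sketch of the EmbeddingBag alternative mentioned above (not wired in here):
# EmbeddingBag folds the per-visit reduction into the lookup itself, e.g.
#   nn.EmbeddingBag.from_pretrained(torch.tensor(emb_weights, dtype=torch.float32),
#                                   mode="sum", freeze=config.emb_freeze)
# the trade-off is that EmbeddingBag expects flattened index/offset inputs, so forward()
# would need to reshape the (B, S, F) id tensor before the lookup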

if self.merge_type is EmbeddingReductionMode.AVG:
# we do not apply the adjusting linear transformation in the average case
self.adjust_layer = None
elif self.merge_type is EmbeddingReductionMode.FUSE:
raise NotImplementedError("TODO: keep all embedding weights as features")
else:
self.adjust_layer = nn.Linear(self.emb_dim, self.emb_dim)

if self.model_type is ModelType.M_TLSTM:
# TLSTM hidden state dim = (B, h)
self.seq_model = TLSTMCell(config.emb_dim, config.hidden_dim)
elif self.model_type is ModelType.M_LSTM:
# LSTM: h_n has dim (num_layers * num_directions, batch, hidden_size)
self.seq_model = nn.LSTM(config.emb_dim, config.hidden_dim, batch_first=True)
elif self.model_type is ModelType.M_GRU:
# GRU: h_n has dim (num_layers * num_directions, batch, hidden_size)
self.seq_model = nn.GRU(config.emb_dim, config.hidden_dim, batch_first=True)
else:
raise NotImplementedError(
"We only support model lstm, gru, tlstm but get {}".format(
self.model_type.value))

def forward(self, seqs, labels, times=None):
# seqs (B, S, F) - batch, seq, feature as ids
# labels (B, L)

# (B, S, F) => (B, S, F, E)
x = self.embedding_layer(seqs)
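# illustrative shapes only (assumed dims): with B=2 patients, S=4 visits, F=3 codes per visit,
# and emb_dim E=10, seqs (2, 4, 3) -> x (2, 4, 3, 10); after the merge below x is (2, 4, 10)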

# merge F and E
if self.merge_type is EmbeddingReductionMode.SUM:
x = torch.sum(x, dim=2)
x = self.adjust_layer(x)
elif self.merge_type is EmbeddingReductionMode.MAX:
# torch.max returns a tuple: (values, indices)
x = torch.max(x, dim=2)[0]
x = self.adjust_layer(x)
elif self.merge_type is EmbeddingReductionMode.AVG:
x = torch.mean(x, dim=2)
elif self.merge_type is EmbeddingReductionMode.FUSE:
raise NotImplementedError("TODO: keep all embedding weights as features")
else:
raise ValueError("Unsupported embedding reduction mode: {}".format(self.merge_type))

# sequence model
if self.model_type is ModelType.M_TLSTM:
# TLSTM consumes the elapsed-time tensor along with the features; its hidden state is already (B, h)
h_f, (h_t, c_t) = self.seq_model(x, times)
elif self.model_type is ModelType.M_GRU:
h_f, h_t = self.seq_model(x)
h_t = h_t.squeeze(0)
else:
# nn.LSTM takes no time input; h_t comes back as (1, B, h) with a single layer, so drop the layer dim
h_f, (h_t, c_t) = self.seq_model(x)
h_t = h_t.squeeze(0)

raw_rep = self.drop_output(h_t)

# output
outputs = self.classifier(raw_rep)
pred_prob = nn.functional.softmax(outputs, dim=-1)

# calc loss
if self.loss_type is ModelLossMode.BIN:
# y dim (B, 2)
loss = nn.functional.binary_cross_entropy_with_logits(outputs, labels)
elif self.loss_type is ModelLossMode.MUL:
# y dim (B, 1)
loss = nn.functional.cross_entropy(outputs, labels)
else:
raise NotImplementedError("loss mode only supports bin or mul but got {}".format(self.loss_type.value))

return loss, pred_prob, torch.argmax(outputs, dim=-1), raw_rep
111 changes: 111 additions & 0 deletions Embeddings/medical_embeddings.py
@@ -0,0 +1,111 @@
import sys
sys.path.append("../")

from common_utils.utils import load_text, save_text
import numpy as np


def code2index(codes, c2i):
"""
the input codes should be a list of lists holding the real medical codes
the outer list is over time steps; the inner list is over features within a step
c2i is the code-to-index mapping
the function returns a list of lists with every code mapped to its index
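e.g. (illustrative, hypothetical codes) codes=[["401.9", "E11.9"], ["250.00"]] with
c2i={"401.9": 1, "E11.9": 2, "250.00": 3} returns [[1, 2], [3]]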
"""
# codes missing from the vocabulary fall back to the 'unk' index
return [[c2i.get(c, c2i["unk"]) for c in temp] for temp in codes]


def load_embeddings(embedding_file):
"""
function used to load pre-trained embeddings
the input file should follow the word2vec/fastText pre-trained embedding format (a txt file):
each line is for a unique code
each line starts with the code followed by its embedding values, separated by spaces
we reserve 'pad' at index 0 with all values set to 0
we reserve 'unk' at the last index with values randomly initialized ~ N(0, 1)
pad is used for padding
unk is used for codes that are not in the code vocab
:param embedding_file: the pre-trained embedding file
:return: numpy embedding matrix, code2index dict, index2code dict
"""
raw_embeddings = load_text(embedding_file).strip()
lines = raw_embeddings.split("\n")

emb_dim = -1
code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(lines) + 1

embeddings = []
for idx, line in enumerate(lines):
info = line.split(" ")
tok = info[0]
code2index[tok] = idx + 1
vector = [float(each) for each in info[1:]]
embeddings.append(vector)

if idx == 0:
emb_dim = len(vector)
else:
assert emb_dim == len(vector), \
"expect embeddings have same dim but get {} and {}".format(emb_dim, len(vector))

embeddings.insert(0, list(np.zeros(emb_dim)))
np.random.seed(13)
embeddings.append(list(np.random.normal(0, 1, size=emb_dim)))
index2code = {v: k for k, v in code2index.items()}

return np.asarray(embeddings), code2index, index2code


def random_generate_embeddings(vocab, emb_dim=50):
"""
The function creates a randomly initialized embedding matrix based on a pre-defined vocab
:param emb_dim: embedding dimension
:param vocab: a list of medical codes (ICD or RXCUI)
:return: numpy embedding matrix, code2index dict, index2code dict
"""
vocab = sorted(list(set(vocab)))

code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(vocab) + 1

embeddings = np.zeros(emb_dim).reshape(1, -1)

for idx, code in enumerate(vocab):
code2index[code] = idx + 1

np.random.seed(2)
embeddings = np.concatenate([embeddings, np.random.rand(len(vocab)+1, emb_dim)], axis=0)

index2code = {v: k for k, v in code2index.items()}

return embeddings, code2index, index2code


def main(vocab_file, dim, output_file):
"""
in the vocab file, each line should be a unique medical code
"""
codes = load_text(vocab_file).strip().split("\n")
embeddings, code2index, index2code = random_generate_embeddings(codes, emb_dim=dim)

outputs = []
for code, index in code2index.items():
vector = embeddings[index]
str_vec = " ".join([str(each) for each in vector])
line = "{} {}".format(code, str_vec)
outputs.append(line)

outputs = "\n".join(outputs)
save_text(outputs, output_file)


if __name__ == '__main__':
vocab_file, emb_dim, emb_file = sys.argv[1:]
emb_dim = int(emb_dim)

main(vocab_file, emb_dim, emb_file)
144 changes: 144 additions & 0 deletions Embeddings/test_embeddings.py
@@ -0,0 +1,144 @@
import numpy as np
import torch
from torch import nn
import sys
sys.path.append("../")

from common_utils.utils import pkl_load
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode
from Embeddings.embedding_models import SeqEmbEHR, SeqEmbEHRConfig


def ohe2idx(data):
# convert one-hot encoded features to index lists, e.g. np.array([0, 1, 0, 1]) => [1, 3]
uniques = set()
nd = []
for each in data:
d1 = []
for e1 in each:
d2 = []
for e2 in e1:
idxs = list(np.where(e2 == 1)[0])
for i in idxs:
uniques.add(i)
d2.append(idxs)
d1.append(np.array(d2))
nd.append(np.array(d1))
return nd, uniques


def random_generate_embeddings(vocab, emb_dim=50):
"""
The function creates a randomly initialized embedding matrix based on a pre-defined vocab
(duplicated from Embeddings/medical_embeddings.py so this test script is self-contained)
:param vocab: a list of medical codes (ICD or RXCUI)
:param emb_dim: embedding dimension
:return: numpy embedding matrix, code2index dict, index2code dict
"""
vocab = sorted(list(set(vocab)))

code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(vocab) + 1

embeddings = np.zeros(emb_dim).reshape(1, -1)

for idx, code in enumerate(vocab):
code2index[code] = idx + 1

np.random.seed(2)
embeddings = np.concatenate([embeddings, np.random.rand(len(vocab)+1, emb_dim)], axis=0)

index2code = {v: k for k, v in code2index.items()}

return embeddings, code2index, index2code


if __name__ == '__main__':
trs = pkl_load("../data/tlstm_sync/data_train.pkl")
ttrs = pkl_load("../data/tlstm_sync/elapsed_train.pkl")
trsl = pkl_load("../data/tlstm_sync/label_train.pkl")
ntrs, s1 = ohe2idx(trs)

tss = pkl_load("../data/tlstm_sync/data_test.pkl")
ttss = pkl_load("../data/tlstm_sync/elapsed_test.pkl")
tssl = pkl_load("../data/tlstm_sync/label_test.pkl")
ntss, s2 = ohe2idx(tss)

# create an embedding matrix with dim 10
emb, c2i, i2c = random_generate_embeddings(s1.union(s2), 10)

conf = SeqEmbEHRConfig(
input_dim=10, output_dim=2, hidden_dim=64, emb_dim=10, drop_prob=0.1,
model_type=ModelType.M_TLSTM, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM)
model = SeqEmbEHR(config=conf, emb_weights=emb)

lr = 0.001
epn = 50
mgn = 2.0

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
idxes = list(range(len(ntrs)))
tr_loss = .0

# ### Training
for ep in range(epn):
np.random.shuffle(idxes)
for idx in idxes:
model.zero_grad()
model.train()

feature = ntrs[idx]
labels = trsl[idx]
time = ttrs[idx]
time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

feature_tensor = torch.tensor(feature, dtype=torch.long)
time_tensor = torch.tensor(time, dtype=torch.float32)
label_tensor = torch.tensor(labels, dtype=torch.float32)

loss, _, _, _ = model(feature_tensor, label_tensor, time_tensor)
tr_loss += loss.item()
loss.backward()
optimizer.step()
print("epoch: {}; training loss: {}".format(ep + 1, tr_loss / (ep + 1)))


# ### evaluation
model.eval()

idxes = list(range(len(ntss)))
y_preds, y_trues, gs_labels, pred_labels = None, None, None, None

for idx in idxes:
feature = ntss[idx]
labels = tssl[idx]
time = ttss[idx]
time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

feature_tensor = torch.tensor(feature, dtype=torch.long)
time_tensor = torch.tensor(time, dtype=torch.float32)
label_tensor = torch.tensor(labels, dtype=torch.float32)

with torch.no_grad():
_, logits, y_pred, _ = model(feature_tensor, label_tensor, time_tensor)

logits = logits.detach().cpu().numpy()
y_pred = y_pred.detach().cpu().numpy()

if y_preds is None:
pred_labels = logits
y_preds = y_pred
gs_labels = labels
y_trues = labels[:, 1]
else:
pred_labels = np.concatenate([pred_labels, logits], axis=0)
y_preds = np.concatenate([y_preds, y_pred], axis=0)
gs_labels = np.concatenate([gs_labels, labels], axis=0)
y_trues = np.concatenate([y_trues, labels[:, 1]], axis=0)

from sklearn.metrics import roc_auc_score, accuracy_score
total_acc = accuracy_score(y_trues, y_preds)
total_auc = roc_auc_score(gs_labels, pred_labels, average='micro')
total_auc_macro = roc_auc_score(gs_labels, pred_labels, average='macro')
print("Accuracy = {:.3f}".format(total_acc))
print("AUC = {:.3f}".format(total_auc))
print("AUC Macro = {:.3f}".format(total_auc_macro))
Empty file added MixStaticSeq/__init__.py
Empty file.