Commit
Merge pull request #12 from uf-hobi-informatics-lab/origin/advDev
refactor project layout; add embedding models
bugface authored Nov 28, 2020
2 parents f4e5936 + 77e6cc9 commit 066f7bc
Showing 22 changed files with 846 additions and 147 deletions.
Empty file added Embeddings/__init__.py
Empty file.
123 changes: 123 additions & 0 deletions Embeddings/embedding_models.py
@@ -0,0 +1,123 @@
"""
a simple model to handle EHR sequence data with embeddings
we support LSTM, GRU, and TLSTM as the sequence learning framework
"""


import torch
from torch import nn
import sys
sys.path.append("../")

from TLSTM.tlstm import TLSTMCell
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode


class SeqEmbEHRConfig:
def __init__(self, input_dim=10, output_dim=1, hidden_dim=128, emb_dim=32, drop_prob=0.1, emb_freeze=False,
model_type=ModelType.M_GRU, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM):
self.input_dim = input_dim
self.emb_dim = emb_dim
self.output_dim = output_dim
self.hidden_dim = hidden_dim
self.model_type = model_type
self.loss_type = loss_type
self.merge_type = merge_type
self.drop_prob = drop_prob
self.emb_freeze = emb_freeze

def __str__(self):
s = ""
for k, v in self.__dict__.items():
s += "{}={}\n".format(k, v)
return s


class SeqEmbEHR(nn.Module):

def __init__(self, config, emb_weights=None):
super().__init__()

self.merge_type = config.merge_type
self.loss_type = config.loss_type
self.model_type = config.model_type

self.classifier = nn.Linear(config.hidden_dim, config.output_dim)
self.drop_output = nn.Dropout(p=config.drop_prob)

# could be replaced by EmbeddingBag
self.embedding_layer = nn.Embedding.from_pretrained(
torch.tensor(emb_weights, dtype=torch.float32), freeze=config.emb_freeze)
self.emb_dim = self.embedding_layer.embedding_dim
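# A hedged sketch of the EmbeddingBag alternative mentioned above (not wired in here):
# EmbeddingBag folds the per-visit reduction into the lookup itself, e.g.
#   nn.EmbeddingBag.from_pretrained(torch.tensor(emb_weights, dtype=torch.float32),
#                                   mode="sum", freeze=config.emb_freeze)
# the trade-off is that EmbeddingBag expects flattened index/offset inputs, so forward()
# would need to reshape the (B, S, F) id tensor before the lookup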

if self.merge_type is EmbeddingReductionMode.AVG:
# we do not apply the adjusting linear transformation in the average case
self.adjust_layer = None
elif self.merge_type is EmbeddingReductionMode.FUSE:
raise NotImplementedError("TODO: keep all embedding weights as features")
else:
self.adjust_layer = nn.Linear(self.emb_dim, self.emb_dim)

if self.model_type is ModelType.M_TLSTM:
# TLSTM hidden state dim = (B, h)
self.seq_model = TLSTMCell(config.emb_dim, config.hidden_dim)
elif self.model_type is ModelType.M_LSTM:
# LSTM: h_n has dim (num_layers * num_directions, batch, hidden_size)
self.seq_model = nn.LSTM(config.emb_dim, config.hidden_dim, batch_first=True)
elif self.model_type is ModelType.M_GRU:
# GRU: h_n has dim (num_layers * num_directions, batch, hidden_size)
self.seq_model = nn.GRU(config.emb_dim, config.hidden_dim, batch_first=True)
else:
raise NotImplementedError(
"We only support model lstm, gru, tlstm but get {}".format(
self.model_type.value))

def forward(self, seqs, labels, times=None):
# seqs (B, S, F) - batch, seq, feature as ids
# labels (B, L)

# (B, S, F) => (B, S, F, E)
x = self.embedding_layer(seqs)
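# illustrative shapes only (assumed dims): with B=2 patients, S=4 visits, F=3 codes per visit,
# and emb_dim E=10, seqs (2, 4, 3) -> x (2, 4, 3, 10); after the merge below x is (2, 4, 10)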

# merge F and E
if self.merge_type is EmbeddingReductionMode.SUM:
x = torch.sum(x, dim=2)
x = self.adjust_layer(x)
elif self.merge_type is EmbeddingReductionMode.MAX:
# torch.max returns a tuple: (values, indices)
x = torch.max(x, dim=2)[0]
x = self.adjust_layer(x)
elif self.merge_type is EmbeddingReductionMode.AVG:
x = torch.mean(x, dim=2)
elif self.merge_type is EmbeddingReductionMode.FUSE:
raise NotImplementedError("TODO: keep all embedding weights as features")
else:
raise ValueError("Unsupported embedding reduction mode: {}".format(self.merge_type))

# sequence model
if self.model_type is ModelType.M_TLSTM:
# TLSTM consumes the elapsed-time tensor along with the features; its hidden state is already (B, h)
h_f, (h_t, c_t) = self.seq_model(x, times)
elif self.model_type is ModelType.M_GRU:
h_f, h_t = self.seq_model(x)
h_t = h_t.squeeze(0)
else:
# nn.LSTM takes no time input; h_t comes back as (1, B, h) with a single layer, so drop the layer dim
h_f, (h_t, c_t) = self.seq_model(x)
h_t = h_t.squeeze(0)

raw_rep = self.drop_output(h_t)

# output
outputs = self.classifier(raw_rep)
pred_prob = nn.functional.softmax(outputs, dim=-1)

# calc loss
if self.loss_type is ModelLossMode.BIN:
# y dim (B, 2)
loss = nn.functional.binary_cross_entropy_with_logits(outputs, labels)
elif self.loss_type is ModelLossMode.MUL:
# y dim (B, 1)
loss = nn.functional.cross_entropy(outputs, labels)
else:
raise NotImplementedError("loss mode only supports bin or mul but got {}".format(self.loss_type.value))

return loss, pred_prob, torch.argmax(outputs, dim=-1), raw_rep
111 changes: 111 additions & 0 deletions Embeddings/medical_embeddings.py
@@ -0,0 +1,111 @@
import sys
sys.path.append("../")

from common_utils.utils import load_text, save_text
import numpy as np


def code2index(codes, c2i):
"""
the input codes should be a list of lists holding the real medical codes
the outer list is over time steps; the inner list is over features within a step
c2i is the code-to-index mapping
the function returns a list of lists with every code mapped to its index
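e.g. (illustrative, hypothetical codes) codes=[["401.9", "E11.9"], ["250.00"]] with
c2i={"401.9": 1, "E11.9": 2, "250.00": 3} returns [[1, 2], [3]]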
"""
# codes missing from the vocabulary fall back to the 'unk' index
return [[c2i.get(c, c2i["unk"]) for c in temp] for temp in codes]


def load_embeddings(embedding_file):
"""
function used to load pre-trained embeddings
the input file should follow the word2vec/fastText pre-trained embedding format (a txt file):
each line is for a unique code
each line starts with the code followed by its embedding values, separated by spaces
we reserve 'pad' at index 0 with all values set to 0
we reserve 'unk' at the last index with values randomly initialized ~ N(0, 1)
pad is used for padding
unk is used for codes that are not in the code vocab
:param embedding_file: the pre-trained embedding file
:return: numpy embedding matrix, code2index dict, index2code dict
"""
raw_embeddings = load_text(embedding_file).strip()
lines = raw_embeddings.split("\n")

emb_dim = -1
code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(lines) + 1

embeddings = []
for idx, line in enumerate(lines):
info = line.split(" ")
tok = info[0]
code2index[tok] = idx + 1
vector = [float(each) for each in info[1:]]
embeddings.append(vector)

if idx == 0:
emb_dim = len(vector)
else:
assert emb_dim == len(vector), \
"expect embeddings have same dim but get {} and {}".format(emb_dim, len(vector))

embeddings.insert(0, list(np.zeros(emb_dim)))
np.random.seed(13)
embeddings.append(list(np.random.normal(0, 1, size=emb_dim)))
index2code = {v: k for k, v in code2index.items()}

return np.asarray(embeddings), code2index, index2code


def random_generate_embeddings(vocab, emb_dim=50):
"""
The function creates a randomly initialized embedding matrix based on a pre-defined vocab
:param emb_dim: embedding dimension
:param vocab: a list of medical codes (ICD or RXCUI)
:return: numpy embedding matrix, code2index dict, index2code dict
"""
vocab = sorted(list(set(vocab)))

code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(vocab) + 1

embeddings = np.zeros(emb_dim).reshape(1, -1)

for idx, code in enumerate(vocab):
code2index[code] = idx + 1

np.random.seed(2)
embeddings = np.concatenate([embeddings, np.random.rand(len(vocab)+1, emb_dim)], axis=0)

index2code = {v: k for k, v in code2index.items()}

return embeddings, code2index, index2code


def main(vocab_file, dim, output_file):
"""
in the vocab file, each line should be a unique medical code
"""
codes = load_text(vocab_file).strip().split("\n")
embeddings, code2index, index2code = random_generate_embeddings(codes, emb_dim=dim)

outputs = []
for code, index in code2index.items():
vector = embeddings[index]
str_vec = " ".join([str(each) for each in vector])
line = "{} {}".format(code, str_vec)
outputs.append(line)

outputs = "\n".join(outputs)
save_text(outputs, output_file)


if __name__ == '__main__':
vocab_file, emb_dim, emb_file = sys.argv[1:]
emb_dim = int(emb_dim)

main(vocab_file, emb_dim, emb_file)
144 changes: 144 additions & 0 deletions Embeddings/test_embeddings.py
@@ -0,0 +1,144 @@
import numpy as np
import torch
from torch import nn
import sys
sys.path.append("../")

from common_utils.utils import pkl_load
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode
from Embeddings.embedding_models import SeqEmbEHR, SeqEmbEHRConfig


def ohe2idx(data):
# convert one-hot encoded features to index lists, e.g. np.array([0, 1, 0, 1]) => [1, 3]
uniques = set()
nd = []
for each in data:
d1 = []
for e1 in each:
d2 = []
for e2 in e1:
idxs = list(np.where(e2 == 1)[0])
for i in idxs:
uniques.add(i)
d2.append(idxs)
d1.append(np.array(d2))
nd.append(np.array(d1))
return nd, uniques


def random_generate_embeddings(vocab, emb_dim=50):
"""
The function creates a randomly initialized embedding matrix based on a pre-defined vocab
(duplicated from Embeddings/medical_embeddings.py so this test script is self-contained)
:param vocab: a list of medical codes (ICD or RXCUI)
:param emb_dim: embedding dimension
:return: numpy embedding matrix, code2index dict, index2code dict
"""
vocab = sorted(list(set(vocab)))

code2index = dict()
code2index['pad'] = 0
code2index['unk'] = len(vocab) + 1

embeddings = np.zeros(emb_dim).reshape(1, -1)

for idx, code in enumerate(vocab):
code2index[code] = idx + 1

np.random.seed(2)
embeddings = np.concatenate([embeddings, np.random.rand(len(vocab)+1, emb_dim)], axis=0)

index2code = {v: k for k, v in code2index.items()}

return embeddings, code2index, index2code


if __name__ == '__main__':
trs = pkl_load("../data/tlstm_sync/data_train.pkl")
ttrs = pkl_load("../data/tlstm_sync/elapsed_train.pkl")
trsl = pkl_load("../data/tlstm_sync/label_train.pkl")
ntrs, s1 = ohe2idx(trs)

tss = pkl_load("../data/tlstm_sync/data_test.pkl")
ttss = pkl_load("../data/tlstm_sync/elapsed_test.pkl")
tssl = pkl_load("../data/tlstm_sync/label_test.pkl")
ntss, s2 = ohe2idx(tss)

# create an embedding matrix with dim 10
emb, c2i, i2c = random_generate_embeddings(s1.union(s2), 10)

conf = SeqEmbEHRConfig(
input_dim=10, output_dim=2, hidden_dim=64, emb_dim=10, drop_prob=0.1,
model_type=ModelType.M_TLSTM, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM)
model = SeqEmbEHR(config=conf, emb_weights=emb)

lr = 0.001
epn = 50
mgn = 2.0

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
idxes = list(range(len(ntrs)))
tr_loss = .0

# ### Training
for ep in range(epn):
np.random.shuffle(idxes)
for idx in idxes:
model.zero_grad()
model.train()

feature = ntrs[idx]
labels = trsl[idx]
time = ttrs[idx]
time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

feature_tensor = torch.tensor(feature, dtype=torch.long)
time_tensor = torch.tensor(time, dtype=torch.float32)
label_tensor = torch.tensor(labels, dtype=torch.float32)

loss, _, _, _ = model(feature_tensor, label_tensor, time_tensor)
tr_loss += loss.item()
loss.backward()
optimizer.step()
print("epoch: {}; training loss: {}".format(ep + 1, tr_loss / (ep + 1)))


# ### evaluation
model.eval()

idxes = list(range(len(ntss)))
y_preds, y_trues, gs_labels, pred_labels = None, None, None, None

for idx in idxes:
feature = ntss[idx]
labels = tssl[idx]
time = ttss[idx]
time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

feature_tensor = torch.tensor(feature, dtype=torch.long)
time_tensor = torch.tensor(time, dtype=torch.float32)
label_tensor = torch.tensor(labels, dtype=torch.float32)

with torch.no_grad():
_, logits, y_pred, _ = model(feature_tensor, label_tensor, time_tensor)

logits = logits.detach().cpu().numpy()
y_pred = y_pred.detach().cpu().numpy()

if y_preds is None:
pred_labels = logits
y_preds = y_pred
gs_labels = labels
y_trues = labels[:, 1]
else:
pred_labels = np.concatenate([pred_labels, logits], axis=0)
y_preds = np.concatenate([y_preds, y_pred], axis=0)
gs_labels = np.concatenate([gs_labels, labels], axis=0)
y_trues = np.concatenate([y_trues, labels[:, 1]], axis=0)

from sklearn.metrics import roc_auc_score, accuracy_score
total_acc = accuracy_score(y_trues, y_preds)
total_auc = roc_auc_score(gs_labels, pred_labels, average='micro')
total_auc_macro = roc_auc_score(gs_labels, pred_labels, average='macro')
print("Accuracy = {:.3f}".format(total_acc))
print("AUC = {:.3f}".format(total_auc))
print("AUC Macro = {:.3f}".format(total_auc_macro))
Empty file added MixStaticSeq/__init__.py
Empty file.