Merge pull request #12 from uf-hobi-informatics-lab/origin/advDev

refactor project layout; add embedding models

Showing 22 changed files with 846 additions and 147 deletions.
Empty file.
@@ -0,0 +1,123 @@
""" | ||
a simple model to handle EHR seq data with embeddings | ||
we support LSTM, GRU, TLSTM, and TCN as learning framework | ||
""" | ||

import torch
from torch import nn
import sys
sys.path.append("../")

from TLSTM.tlstm import TLSTMCell
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode

class SeqEmbEHRConfig:
    def __init__(self, input_dim=10, output_dim=1, hidden_dim=128, emb_dim=32, drop_prob=0.1, emb_freeze=False,
                 model_type=ModelType.M_GRU, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM):
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.model_type = model_type
        self.loss_type = loss_type
        self.merge_type = merge_type
        self.drop_prob = drop_prob
        self.emb_freeze = emb_freeze

    def __str__(self):
        s = ""
        for k, v in self.__dict__.items():
            s += "{}={}\n".format(k, v)
        return s


class SeqEmbEHR(nn.Module):

    def __init__(self, config, emb_weights=None):
        super().__init__()

        self.merge_type = config.merge_type
        self.loss_type = config.loss_type
        self.model_type = config.model_type

        self.classifier = nn.Linear(config.hidden_dim, config.output_dim)
        self.drop_output = nn.Dropout(p=config.drop_prob)

        # could be replaced by nn.EmbeddingBag
        # emb_weights must be a 2-D array of shape (vocab_size, emb_dim)
        self.embedding_layer = nn.Embedding.from_pretrained(
            torch.tensor(emb_weights, dtype=torch.float32), freeze=config.emb_freeze)
        self.emb_dim = self.embedding_layer.embedding_dim

        if self.merge_type is EmbeddingReductionMode.AVG:
            # we do not apply an adjusting linear transformation in the average case
            self.adjust_layer = None
        elif self.merge_type is EmbeddingReductionMode.FUSE:
            raise NotImplementedError("TODO: keep all embedding weights as features")
        else:
            self.adjust_layer = nn.Linear(self.emb_dim, self.emb_dim)

        if self.model_type is ModelType.M_TLSTM:
            # TLSTM hidden state dim = (B, h)
            self.seq_model = TLSTMCell(config.emb_dim, config.hidden_dim)
        elif self.model_type is ModelType.M_LSTM:
            # LSTM hidden state dim = (num_layers * num_directions, batch, hidden_size)
            self.seq_model = nn.LSTM(config.emb_dim, config.hidden_dim, batch_first=True)
        elif self.model_type is ModelType.M_GRU:
            # GRU hidden state dim = (num_layers * num_directions, batch, hidden_size)
            self.seq_model = nn.GRU(config.emb_dim, config.hidden_dim, batch_first=True)
        else:
            raise NotImplementedError(
                "We only support lstm, gru, and tlstm but got {}".format(
                    self.model_type.value))

    def forward(self, seqs, labels, times=None):
        # seqs: (B, S, F) - batch, sequence steps, feature slots as embedding ids
        # labels: (B, L)

        # (B, S, F) => (B, S, F, E)
        x = self.embedding_layer(seqs)

        # merge the F and E dimensions
        if self.merge_type is EmbeddingReductionMode.SUM:
            x = torch.sum(x, dim=2)
            x = self.adjust_layer(x)
        elif self.merge_type is EmbeddingReductionMode.MAX:
            # torch.max returns a tuple: (values, indices)
            x = torch.max(x, dim=2)[0]
            x = self.adjust_layer(x)
        elif self.merge_type is EmbeddingReductionMode.AVG:
            x = torch.mean(x, dim=2)
        elif self.merge_type is EmbeddingReductionMode.FUSE:
            raise NotImplementedError("TODO: keep all embedding weights as features")
        else:
            raise ValueError("Unsupported merge mode: {}".format(self.merge_type))

        # sequence model
        if self.model_type is ModelType.M_TLSTM:
            # TLSTM consumes the elapsed-time tensor; its hidden state is already (B, h)
            h_f, (h_t, c_t) = self.seq_model(x, times)
        elif self.model_type is ModelType.M_GRU:
            h_f, h_t = self.seq_model(x)
            h_t = h_t.squeeze(0)
        else:
            # nn.LSTM; hidden state is (num_layers * num_directions, B, h)
            h_f, (h_t, c_t) = self.seq_model(x)
            h_t = h_t.squeeze(0)

        raw_rep = self.drop_output(h_t)

        # output
        outputs = self.classifier(raw_rep)
        pred_prob = nn.functional.softmax(outputs, dim=-1)

        # calc loss
        if self.loss_type is ModelLossMode.BIN:
            # labels dim (B, 2) as one-hot floats
            loss = nn.functional.binary_cross_entropy_with_logits(outputs, labels)
        elif self.loss_type is ModelLossMode.MUL:
            # labels dim (B,) as class indices
            loss = nn.functional.cross_entropy(outputs, labels)
        else:
            raise NotImplementedError(
                "loss mode only supports bin or mul but got {}".format(self.loss_type.value))

        return loss, pred_prob, torch.argmax(outputs, dim=-1), raw_rep
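Below is a minimal usage sketch for SeqEmbEHR. It is not part of the commit; the embedding matrix, batch shapes, and the GRU/SUM settings are illustrative assumptions.

import numpy as np
import torch
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode
from Embeddings.embedding_models import SeqEmbEHR, SeqEmbEHRConfig

# hypothetical vocabulary of 22 rows (pad + 20 codes + unk) with 8-dim embeddings
emb_weights = np.random.rand(22, 8)
config = SeqEmbEHRConfig(input_dim=8, output_dim=2, hidden_dim=16, emb_dim=8,
                         model_type=ModelType.M_GRU, loss_type=ModelLossMode.BIN,
                         merge_type=EmbeddingReductionMode.SUM)
model = SeqEmbEHR(config=config, emb_weights=emb_weights)

# batch of 4 patients, 5 visits, 3 code slots per visit (ids index into emb_weights)
seqs = torch.randint(0, 22, (4, 5, 3), dtype=torch.long)
labels = torch.zeros(4, 2)
labels[:, 1] = 1.0  # one-hot labels for the BIN loss mode
loss, pred_prob, pred_label, rep = model(seqs, labels)
print(loss.item(), pred_prob.shape, pred_label.shape, rep.shape)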
@@ -0,0 +1,111 @@
import sys
sys.path.append("../")

from common_utils.utils import load_text, save_text
import numpy as np


def code2index(codes, c2i):
    """
    The input codes should be a list of lists holding the real codes:
    the outer list is over time steps; the inner list is over features.
    c2i is the code-to-index mapping.
    The function returns a list of lists with all codes mapped to indexes.
    """
    return [[c2i[c] for c in temp] for temp in codes]
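# illustrative example (assumed codes and mapping, not in the original file):
#   code2index([["E11", "I10"], ["I10"]], {"E11": 3, "I10": 7}) -> [[3, 7], [7]]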


def load_embeddings(embedding_file):
    """
    Load pre-trained embeddings.
    The input file should follow the word2vec/fastText text format:
    each line is for a unique code and starts with the code followed by its embedding vector.
    We add 'pad' at index 0 with all values set to 0, and 'unk' at the last index with values
    randomly initialized ~ N(0, 1); 'pad' is for padding, 'unk' is for codes not in the vocab.
    :param embedding_file: the pre-trained embedding file
    :return: embedding matrix (list of vectors), code2index, index2code
    """
    raw_embeddings = load_text(embedding_file).strip()
    lines = raw_embeddings.split("\n")

    emb_dim = -1
    code2index = dict()
    code2index['pad'] = 0
    code2index['unk'] = len(lines) + 1

    embeddings = []
    for idx, line in enumerate(lines):
        info = line.split(" ")
        tok = info[0]
        code2index[tok] = idx + 1
        vector = [float(each) for each in info[1:]]
        embeddings.append(vector)

        if idx == 0:
            emb_dim = len(vector)
        else:
            assert emb_dim == len(vector), \
                "expect embeddings to have the same dim but got {} and {}".format(emb_dim, len(vector))

    embeddings.insert(0, list(np.zeros(emb_dim)))
    np.random.seed(13)
    embeddings.append(list(np.random.normal(0, 1, size=emb_dim)))
    index2code = {v: k for k, v in code2index.items()}

    return embeddings, code2index, index2code
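# Expected input layout (illustrative; word2vec text format without a header line):
#   E11 0.12 -0.30 0.58 ...
#   I10 0.04 0.77 -0.19 ...
# For this two-line file, load_embeddings would return
# code2index = {'pad': 0, 'E11': 1, 'I10': 2, 'unk': 3}, with a zero vector prepended
# for 'pad' and a random N(0, 1) vector appended for 'unk'.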


def random_generate_embeddings(vocab, emb_dim=50):
    """
    Create randomly initialized embeddings from a pre-defined vocab.
    :param emb_dim: embedding dimension
    :param vocab: a list of medical codes (ICD or RXCUI)
    :return: numpy embedding matrix, code2index, index2code
    """
    vocab = sorted(list(set(vocab)))

    code2index = dict()
    code2index['pad'] = 0
    code2index['unk'] = len(vocab) + 1

    # row 0 is the all-zero 'pad' vector
    embeddings = np.zeros(emb_dim).reshape(1, -1)

    for idx, code in enumerate(vocab):
        code2index[code] = idx + 1

    # append random rows for the vocab codes plus one extra row for 'unk'
    np.random.seed(2)
    embeddings = np.concatenate([embeddings, np.random.rand(len(vocab) + 1, emb_dim)], axis=0)

    index2code = {v: k for k, v in code2index.items()}

    return embeddings, code2index, index2code
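# illustrative example (assumed): random_generate_embeddings(['E11', 'I10'], emb_dim=4)
# returns a (4, 4) matrix (rows: pad, 'E11', 'I10', unk) plus the two index mappings.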


def main(vocab_file, dim, output_file):
    """
    Each line in the vocab file should be a unique medical code.
    """
    codes = load_text(vocab_file).strip().split("\n")
    embeddings, code2index, index2code = random_generate_embeddings(codes, emb_dim=dim)

    outputs = []
    for code, index in code2index.items():
        vector = embeddings[index]
        str_vec = " ".join([str(each) for each in vector])
        line = "{} {}".format(code, str_vec)
        outputs.append(line)

    outputs = "\n".join(outputs)
    save_text(outputs, output_file)


if __name__ == '__main__':
    import sys
    vocab_file, emb_dim, emb_file = sys.argv[1:]
    emb_dim = int(emb_dim)

    main(vocab_file, emb_dim, emb_file)
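For reference, the script takes a vocabulary file, an embedding dimension, and an output path on the command line; the script and file names below are hypothetical, since the diff view does not show file paths.

python Embeddings/generate_random_embeddings.py code_vocab.txt 50 random_emb.txt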
@@ -0,0 +1,144 @@
import numpy as np
import torch
from torch import nn
import sys
sys.path.append("../")

from common_utils.utils import pkl_load
from common_utils.config import ModelType, ModelLossMode, EmbeddingReductionMode
from Embeddings.embedding_models import SeqEmbEHR, SeqEmbEHRConfig


def ohe2idx(data):
    # convert one-hot encoded features to index lists, e.g. np.array([0, 1, 0, 1]) => [1, 3]
    # also collect the set of unique indexes seen across the data
    uniques = set()
    nd = []
    for each in data:
        d1 = []
        for e1 in each:
            d2 = []
            for e2 in e1:
                idxs = list(np.where(e2 == 1)[0])
                for i in idxs:
                    uniques.add(i)
                d2.append(idxs)
            d1.append(np.array(d2))
        nd.append(np.array(d1))
    return nd, uniques
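# illustrative example (assumed shapes): one batch of 1 patient with 2 visits over a 4-code vocab
#   batch = np.array([[[0, 1, 0, 1],
#                      [1, 0, 0, 0]]])  # shape (1, 2, 4)
#   ohe2idx([batch]) -> (list with one array of per-visit index lists [[1, 3], [0]], {0, 1, 3})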


def random_generate_embeddings(vocab, emb_dim=50):
    """
    Create randomly initialized embeddings from a pre-defined vocab.
    :param vocab: a list of medical codes (ICD or RXCUI)
    :return: numpy embedding matrix, code2index, index2code
    """
    vocab = sorted(list(set(vocab)))

    code2index = dict()
    code2index['pad'] = 0
    code2index['unk'] = len(vocab) + 1

    embeddings = np.zeros(emb_dim).reshape(1, -1)

    for idx, code in enumerate(vocab):
        code2index[code] = idx + 1

    np.random.seed(2)
    embeddings = np.concatenate([embeddings, np.random.rand(len(vocab) + 1, emb_dim)], axis=0)

    index2code = {v: k for k, v in code2index.items()}

    return embeddings, code2index, index2code


if __name__ == '__main__':
    trs = pkl_load("../data/tlstm_sync/data_train.pkl")
    ttrs = pkl_load("../data/tlstm_sync/elapsed_train.pkl")
    trsl = pkl_load("../data/tlstm_sync/label_train.pkl")
    ntrs, s1 = ohe2idx(trs)

    tss = pkl_load("../data/tlstm_sync/data_test.pkl")
    ttss = pkl_load("../data/tlstm_sync/elapsed_test.pkl")
    tssl = pkl_load("../data/tlstm_sync/label_test.pkl")
    ntss, s2 = ohe2idx(tss)

    # create an embedding matrix with dim 10 over the union of train/test codes
    emb, c2i, i2c = random_generate_embeddings(s1.union(s2), 10)

    conf = SeqEmbEHRConfig(
        input_dim=10, output_dim=2, hidden_dim=64, emb_dim=10, drop_prob=0.1,
        model_type=ModelType.M_TLSTM, loss_type=ModelLossMode.BIN, merge_type=EmbeddingReductionMode.SUM)
    model = SeqEmbEHR(config=conf, emb_weights=emb)

    lr = 0.001
    epn = 50
    mgn = 2.0

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    idxes = list(range(len(ntrs)))
    tr_loss = .0

    # ### Training
    for ep in range(epn):
        np.random.shuffle(idxes)
        for idx in idxes:
            model.zero_grad()
            model.train()

            feature = ntrs[idx]
            labels = trsl[idx]
            time = ttrs[idx]
            time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

            feature_tensor = torch.tensor(feature, dtype=torch.long)
            time_tensor = torch.tensor(time, dtype=torch.float32)
            label_tensor = torch.tensor(labels, dtype=torch.float32)

            loss, _, _, _ = model(feature_tensor, label_tensor, time_tensor)
            tr_loss += loss.item()
            loss.backward()
            optimizer.step()
        print("epoch: {}; training loss: {}".format(ep + 1, tr_loss / (ep + 1)))

    # ### Evaluation
    model.eval()

    idxes = list(range(len(ntss)))
    y_preds, y_trues, gs_labels, pred_labels = None, None, None, None

    for idx in idxes:
        feature = ntss[idx]
        labels = tssl[idx]
        time = ttss[idx]
        time = np.reshape(time, [time.shape[0], time.shape[2], time.shape[1]])

        feature_tensor = torch.tensor(feature, dtype=torch.long)
        time_tensor = torch.tensor(time, dtype=torch.float32)
        label_tensor = torch.tensor(labels, dtype=torch.float32)

        with torch.no_grad():
            _, logits, y_pred, _ = model(feature_tensor, label_tensor, time_tensor)

        logits = logits.detach().cpu().numpy()
        y_pred = y_pred.detach().cpu().numpy()

        if y_preds is None:
            pred_labels = logits
            y_preds = y_pred
            gs_labels = labels
            y_trues = labels[:, 1]
        else:
            pred_labels = np.concatenate([pred_labels, logits], axis=0)
            y_preds = np.concatenate([y_preds, y_pred], axis=0)
            gs_labels = np.concatenate([gs_labels, labels], axis=0)
            y_trues = np.concatenate([y_trues, labels[:, 1]], axis=0)

    from sklearn.metrics import roc_auc_score, accuracy_score
    total_acc = accuracy_score(y_trues, y_preds)
    total_auc = roc_auc_score(gs_labels, pred_labels, average='micro')
    total_auc_macro = roc_auc_score(gs_labels, pred_labels, average='macro')
    print("Accuracy = {:.3f}".format(total_acc))
    print("AUC = {:.3f}".format(total_auc))
    print("AUC Macro = {:.3f}".format(total_auc_macro))
Empty file.