From c213f4869fc2984e6ab33dce3465099a991a20d9 Mon Sep 17 00:00:00 2001
From: macournoyer
Date: Tue, 3 Nov 2015 13:40:34 -0500
Subject: [PATCH] Split eval into another file.

---
 e.lua     |  4 ++++
 eval.lua  | 43 +++++++++++++++++++++++++++++++++++++++++++
 train.lua | 57 ++++++++++++---------------------------------------------
 3 files changed, 59 insertions(+), 45 deletions(-)
 create mode 100644 eval.lua

diff --git a/e.lua b/e.lua
index abd3f3c..62a9807 100644
--- a/e.lua
+++ b/e.lua
@@ -1,3 +1,7 @@
+require 'torch'
+require 'nn'
+require 'rnn'
+
 e = {}
 
 torch.include('e', 'cornell_movie_dialogs.lua')
diff --git a/eval.lua b/eval.lua
new file mode 100644
index 0000000..db80a8b
--- /dev/null
+++ b/eval.lua
@@ -0,0 +1,43 @@
+require 'e'
+local tokenizer = require "tokenizer"
+
+dataset = e.DataSet("data/cornell_movie_dialogs.t7",
+                    e.CornellMovieDialogs("data/cornell_movie_dialogs"))
+
+EOS = dataset.word2id["</s>"]
+
+print("-- Loading model")
+model = torch.load("data/model.t7")
+
+function output2wordId(t)
+  local max = t:max()
+  for i = 1, t:size(1) do
+    if t[i] == max then
+      return i
+    end
+  end
+end
+
+function say(text)
+  local inputs = {}
+  for t, word in tokenizer.tokenize(text) do
+    local t = dataset.word2id[word:lower()]
+    table.insert(inputs, t)
+  end
+
+  model:forget()
+
+  for i = #inputs, 1, -1 do
+    local input = inputs[i]
+    model:forward(torch.Tensor{input})
+  end
+
+  local input = EOS
+  repeat
+    local output = model:forward(torch.Tensor{input})
+    io.write(dataset.id2word[output2wordId(output)] .. " ")
+    input = output2wordId(output)
+  until input == EOS
+
+  print("")
+end
diff --git a/train.lua b/train.lua
index c4f89d5..00cddc2 100644
--- a/train.lua
+++ b/train.lua
@@ -1,13 +1,11 @@
-require 'nn'
-require 'rnn'
-require 'xlua'
 require 'e'
+require 'xlua'
 
 -- Data
--- local dataset = e.DataSet("data/cornell_movie_dialogs.t7",
---                           e.CornellMovieDialogs("data/cornell_movie_dialogs"))
-dataset = e.DataSet("data/cornell_movie_dialogs_tiny.t7",
-                    e.CornellMovieDialogs("data/cornell_movie_dialogs"), 1000)
+dataset = e.DataSet("data/cornell_movie_dialogs.t7",
+                    e.CornellMovieDialogs("data/cornell_movie_dialogs"))
+-- dataset = e.DataSet("data/cornell_movie_dialogs_tiny.t7",
+--                     e.CornellMovieDialogs("data/cornell_movie_dialogs"), 1000)
 
 EOS = dataset.word2id["</s>"]
 
@@ -22,8 +20,8 @@ model:add(nn.LookupTable(dataset.wordsCount, inputSize))
 model:add(nn.SplitTable(1,2))
 model:add(nn.Sequencer(nn.FastLSTM(inputSize, hiddenSize)))
 model:add(nn.Sequencer(nn.Dropout(dropout)))
--- model:add(nn.Sequencer(nn.FastLSTM(hiddenSize, hiddenSize)))
--- model:add(nn.Sequencer(nn.Dropout(dropout)))
+model:add(nn.Sequencer(nn.FastLSTM(hiddenSize, hiddenSize)))
+model:add(nn.Sequencer(nn.Dropout(dropout)))
 model:add(nn.Sequencer(nn.Linear(hiddenSize, dataset.wordsCount)))
 model:add(nn.JoinTable(1,2))
 model:add(nn.LogSoftMax())
@@ -70,44 +68,13 @@ for epoch = 1, epochCount do
     model:forget()
 
     xlua.progress(i, #dataset.examples)
-  end
-
-  print("-- Saving model")
-  torch.save("data/model.t7", model)
-end
 
--- Testing
-function output2wordId(t)
-  local max = t:max()
-  for i = 1, t:size(1) do
-    if t[i] == max then
-      return i
+    -- TODO remove this when training is faster
+    if i % 1000 == 0 then
+      torch.save("data/model.t7", model)
     end
   end
-end
-
-local tokenizer = require "tokenizer"
 
-function say(text)
-  local inputs = {}
-  for t, word in tokenizer.tokenize(text) do
-    local t = dataset.word2id[word:lower()]
-    table.insert(inputs, t)
-  end
-
-  model:forget()
-
-  for i = #inputs, 1, -1 do
-    local input = inputs[i]
-    model:forward(torch.Tensor{input})
-  end
-
-  local input = EOS
-  repeat
-    local output = model:forward(torch.Tensor{input})
-    io.write(dataset.id2word[output2wordId(output)] .. " ")
-    input = output2wordId(output)
-  until input == EOS
-
-  print("")
+  print("-- Saving model")
+  torch.save("data/model.t7", model)
 end
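
Note: a quick way to try the new eval.lua, assuming a model has already been
trained and saved to data/model.t7 by train.lua (the session below is
illustrative, not part of the patch):

    $ th -i eval.lua
    th> say("Hello, how are you?")

th -i runs the script and then drops into the Torch REPL, where say() is
available as a global.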
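Note on output2wordId: it hand-rolls an argmax over the log-probabilities, and
say() calls it twice per step, scanning the output tensor each time. A shorter
sketch using Tensor:max(dim), which returns both the max values and their
indices; this is not part of the patch, but its behavior should be identical:

    function output2wordId(t)
      -- max along dimension 1 of a 1-D tensor returns
      -- size-1 value and index tensors
      local _, id = t:max(1)
      return id[1]
    end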
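Also, the repeat/until loop in say() stops only when the model emits EOS, so an
undertrained model can generate forever. A hedged drop-in variant of that loop
with a length cap; maxLength is an assumption, not something the patch defines:

    local maxLength = 20  -- assumed cap, not in the patch
    local input = EOS
    local count = 0
    repeat
      local output = model:forward(torch.Tensor{input})
      input = output2wordId(output)  -- argmax computed once per step
      io.write(dataset.id2word[input] .. " ")
      count = count + 1
    until input == EOS or count >= maxLength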
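Finally, the new periodic save in train.lua (every 1000 examples, per the TODO)
overwrites data/model.t7 in place, so a crash mid-write can corrupt the only
checkpoint. A write-then-rename sketch using plain Lua's os.rename; again an
assumption, not something the patch does:

    -- Save to a temporary file first so an interrupted write
    -- cannot clobber the previous checkpoint.
    local tmp = "data/model.t7.tmp"
    torch.save(tmp, model)
    os.rename(tmp, "data/model.t7")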