multilayer support, dataset improvements and more #38

Open
chenb67 wants to merge 43 commits into master
Commits (43)
b32fb17
add support for multilayer LSTM
chenb67 Jun 15, 2016
38bc326
update dataset handling to support vocab size, shuffle before every …
chenb67 Jun 20, 2016
c8467b3
fix bug with initial dataset parsing
chenb67 Jun 20, 2016
f57a66b
clean params memory when saving
chenb67 Jun 20, 2016
d890d60
fix bug with vocabulary creation (penlight bug)
chenb67 Jun 21, 2016
2265f55
add weight decay
chenb67 Jun 21, 2016
d0211c3
add support for validation set in dataset + eval function in seq2seq …
chenb67 Jun 22, 2016
1ffcb70
fix potential bug with samples loading / csv loading
chenb67 Jun 23, 2016
2584439
fix next bug + eval end of batch bug
chenb67 Jun 23, 2016
f6a7a38
fix dataset end of batch handling
chenb67 Jun 23, 2016
8d42e0a
add dropout + gradnorm monitoring
chenb67 Jun 26, 2016
43906b8
fix eval with dropout
chenb67 Jun 26, 2016
31b7115
fix eval procedure
chenb67 Jun 27, 2016
d4cf6d8
simplify criterion usage
chenb67 Jun 27, 2016
86124b7
fix loss calculation for minibatches
chenb67 Jun 27, 2016
cfecf99
use minibatches for validation set
chenb67 Jun 27, 2016
3d2aa19
add option to select param for early stopping
chenb67 Jun 28, 2016
899c8e4
change to SeqLSTM
chenb67 Jul 3, 2016
61c4438
fix optimParams memory efficiency between epochs
chenb67 Jul 3, 2016
38cb321
move criterion to CPU before saving
chenb67 Jul 3, 2016
b93c224
add support for multilayer LSTM
chenb67 Jun 15, 2016
e5fa3b1
update dataset handling to support vocab size, shuffle before every …
chenb67 Jun 20, 2016
586bd1b
fix bug with initial dataset parsing
chenb67 Jun 20, 2016
eb735d1
clean params memory when saving
chenb67 Jun 20, 2016
379508a
fix bug with vocabulary creation (penlight bug)
chenb67 Jun 21, 2016
3c400d5
add weight decay
chenb67 Jun 21, 2016
fe7bb24
add support for validation set in dataset + eval function in seq2seq …
chenb67 Jun 22, 2016
eccfd8a
fix potential bug with samples loading / csv loading
chenb67 Jun 23, 2016
5d598c2
fix next bug + eval end of batch bug
chenb67 Jun 23, 2016
ce0a9f5
fix dataset end of batch handling
chenb67 Jun 23, 2016
7082b0d
add dropout + gradnorm monitoring
chenb67 Jun 26, 2016
fa59e54
fix eval with dropout
chenb67 Jun 26, 2016
35f40a6
fix eval procedure
chenb67 Jun 27, 2016
80059eb
simplify criterion usage
chenb67 Jun 27, 2016
b1a0e59
fix loss calculation for minibatches
chenb67 Jun 27, 2016
b864799
use minibatches for validation set
chenb67 Jun 27, 2016
df6154f
add option to select param for early stopping
chenb67 Jun 28, 2016
b20c13e
change to SeqLSTM
chenb67 Jul 3, 2016
0414609
fix optimParams memory efficiency between epochs
chenb67 Jul 3, 2016
b72b6c2
move criterion to CPU before saving
chenb67 Jul 3, 2016
9f53e82
fix merge issues
chenb67 Jul 3, 2016
94fce61
merge master
chenb67 Jul 3, 2016
8a57f72
make seqlstm off by default and controlled via the flag --seqLstm
chenb67 Jul 12, 2016
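
The last commit gates the SeqLSTM path behind the --seqLstm flag. Below is a minimal sketch of how stacked LSTM layers and such a flag could be wired with the Element-Research rnn package; the --numLayers and --hiddenSize option names and the vocabulary size are assumptions for illustration only, not taken from this PR's seq2seq.lua (which is not shown on this page).

-- Minimal sketch (assumed names, not the PR's actual seq2seq.lua) of stacking
-- LSTM layers behind the --seqLstm flag named in the last commit.
require 'nn'
require 'rnn'

local cmd = torch.CmdLine()
cmd:option('--seqLstm', false, 'use nn.SeqLSTM (flag name from the commit message)')
cmd:option('--numLayers', 2, 'number of stacked LSTM layers (assumed option name)')
cmd:option('--hiddenSize', 512, 'LSTM hidden size (assumed option name)')
local opt = cmd:parse(arg or {})

local enc = nn.Sequential()
enc:add(nn.LookupTable(10000, opt.hiddenSize))  -- vocabulary size is a placeholder
for i = 1, opt.numLayers do
  if opt.seqLstm then
    -- nn.SeqLSTM consumes a whole (seqLen x batchSize x hiddenSize) tensor at once
    enc:add(nn.SeqLSTM(opt.hiddenSize, opt.hiddenSize))
  else
    -- nn.Sequencer applies a step-wise nn.LSTM over the time dimension
    enc:add(nn.Sequencer(nn.LSTM(opt.hiddenSize, opt.hiddenSize)))
  end
end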
13 changes: 11 additions & 2 deletions cornell_movie_dialogs.lua
@@ -67,6 +67,15 @@ function CornellMovieDialogs:load()
end

xlua.progress(TOTAL_LINES, TOTAL_LINES)

return conversations

print("-- Saving context-response samples ...")

contextResponse = {contexts = {},responses = {}}
for cnum,conv in ipairs(conversations) do
for i = 2, #conv do
table.insert(contextResponse.contexts,conv[i-1]['text'])
table.insert(contextResponse.responses,conv[i]['text'])
end
end
csvigo.save(self.dir .. "/contextResponse.csv",contextResponse)
end
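
A small sketch of the round trip this new code sets up: csvigo.save writes the contexts/responses columns, and dataset.lua (below) reads them back with csvigo.load{mode='large'}, where row 1 is the header. The file path and sentences here are throwaway examples, not part of the PR.

-- Sketch: write context/response pairs the way the code above does, then read
-- them back the way DataSet does (csvigo 'large' mode = list of rows, header first).
require 'csvigo'

local contextResponse = {
  contexts  = {"hi there.", "how are you?"},
  responses = {"how are you?", "fine, thanks."},
}
csvigo.save("/tmp/contextResponse.csv", contextResponse)

local rows = csvigo.load{path = "/tmp/contextResponse.csv", mode = 'large'}
-- Column order is not guaranteed, which is why DataSet:readSamples() checks
-- whether rows[1][2] == 'responses' before pairing columns.
for i = 2, #rows do
  print(rows[i][1], rows[i][2])
end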
235 changes: 137 additions & 98 deletions dataset.lua
@@ -14,30 +14,35 @@ local DataSet = torch.class("neuralconvo.DataSet")
local xlua = require "xlua"
local tokenizer = require "tokenizer"
local list = require "pl.List"
local utils = require "pl.utils"
local function_arg = utils.function_arg

function DataSet:__init(loader, options)

function DataSet:__init(samples_file, options)
options = options or {}

self.examplesFilename = "data/examples.t7"

-- Reject words once vocab size reaches this threshold
self.maxVocabSize = options.maxVocabSize or 0
self.vocabSize = options.vocabSize or -1

-- Maximum number of words in an example sentence
self.maxExampleLen = options.maxExampleLen or 25

-- Load only first few examples (approximately)
self.loadFirst = options.loadFirst
self.loadFirst = options.dataset or 0

-- Train/Dev/Test split
self.devSplit = options.valSetSize or 0
self.trainSplit = 1 - self.devSplit

self.examples = {}
self.word2id = {}
self.id2word = {}
self.wordsCount = 0

self:load(loader)
self.devExamples = {}
self.examplesCount = 0
self.samples_file = csvigo.load{path=samples_file,mode='large'}
end

function DataSet:load(loader)
function DataSet:load(vocabOnly)
local filename = "data/vocab.t7"

if path.exists(filename) then
@@ -49,79 +54,98 @@ function DataSet:load(loader)
self.goToken = data.goToken
self.eosToken = data.eosToken
self.unknownToken = data.unknownToken
self.examplesCount = data.examplesCount
else
print("" .. filename .. " not found")
self:visit(loader:load())
print("Writing " .. filename .. " ...")
self:buildVocab()
print("\nWriting " .. filename .. " ...")
torch.save(filename, {
word2id = self.word2id,
id2word = self.id2word,
wordsCount = self.wordsCount,
goToken = self.goToken,
eosToken = self.eosToken,
unknownToken = self.unknownToken,
examplesCount = self.examplesCount
})
unknownToken = self.unknownToken
})
end
if vocabOnly then
return
end
print "-- Loading samples"
self:readSamples()
end
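
For reference, this is how the reworked constructor and the vocabOnly flag get used elsewhere in this PR (eval.lua below calls load(true)); the option values here are illustrative only.

-- Usage sketch for the reworked DataSet (option names from this diff, values illustrative):
require 'neuralconvo'

local dataset = neuralconvo.DataSet("data/cornell_movie_dialogs/contextResponse.csv", {
  vocabSize  = 20000,  -- cap enforced in buildVocab()
  dataset    = 0,      -- 0 = load all samples (stored as self.loadFirst)
  valSetSize = 0.1,    -- fraction routed to devExamples in processSample()
})
dataset:load(true)     -- vocabOnly: build or read data/vocab.t7, skip readSamples()
-- dataset:load()      -- full load: vocab plus all context/response samples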

function DataSet:visit(conversations)
self.examples = {}

function DataSet:buildVocab()
-- Table for keeping track of word frequency
self.wordFreqs = {}
self.word2id = {}
self.id2word = {}
self.wordsCount = 0

-- Add magic tokens
self.goToken = self:makeWordId("<go>") -- Start of sequence
self.eosToken = self:makeWordId("<eos>") -- End of sequence
self.unknownToken = self:makeWordId("<unknown>") -- Word dropped from vocabulary

print("-- Pre-processing data")

local total = self.loadFirst or #conversations * 2
self.goToken = self:addWordToVocab("<go>") -- Start of sequence
self.eosToken = self:addWordToVocab("<eos>") -- End of sequence
self.unknownToken = self:addWordToVocab("<unknown>") -- Word dropped from vocabulary

for i, conversation in ipairs(conversations) do
if i > total then break end
self:visitConversation(conversation)
xlua.progress(i, total)
print("-- Build vocab")

local nb_samples = #self.samples_file
if self.loadFirst > 0 then
nb_samples = self.loadFirst
end

-- Revisit from the perspective of 2nd character
for i, conversation in ipairs(conversations) do
if #conversations + i > total then break end
self:visitConversation(conversation, 2)
xlua.progress(#conversations + i, total)

for i=2, nb_samples do
self:countWords(self.samples_file[i][1])
self:countWords(self.samples_file[i][2])
if i % 1000 == 0 then
xlua.progress(i,nb_samples)
end
end

local sortedCounts = f_sortv(self.wordFreqs,function(x,y) return x>y end)

for word,freq in sortedCounts do
nWordId = self:addWordToVocab(word)
if self.vocabSize > 0 and nWordId >= self.vocabSize then
break
end
end
end
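
A toy illustration of the capping above: words are counted, sorted by frequency with the f_sortv helper defined at the bottom of this file, and only the first vocabSize entries get ids; everything outside the cap later maps to <unknown> in visitText(). The frequency table below is made up.

-- Toy run of the vocab cap (uses the f_sortv helper defined at the end of this file):
local freqs = { the = 50, cat = 7, sat = 3, mat = 3, zebra = 1 }
local vocabSize = 3
local kept = {}
for word, _ in f_sortv(freqs, function(x, y) return x > y end) do
  table.insert(kept, word)
  if #kept >= vocabSize then break end
end
-- kept holds the most frequent words ({"the", "cat", ...}; ties break arbitrarily);
-- anything not kept falls back to the <unknown> id when visitText() runs.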

function DataSet:shuffleExamples()
print("-- Shuffling ")
newIdxs = torch.randperm(#self.examples)
local sExamples = {}
for i, sample in ipairs(self.examples) do
sExamples[i] = self.examples[newIdxs[i]]
end
self.examples = sExamples

self.examplesCount = #self.examples
self:writeExamplesToFile()
self.examples = nil

collectgarbage()
end

function DataSet:writeExamplesToFile()
print("Writing " .. self.examplesFilename .. " ...")
local file = torch.DiskFile(self.examplesFilename, "w")

for i, example in ipairs(self.examples) do
file:writeObject(example)
xlua.progress(i, #self.examples)
function DataSet:readSamples()
local nb_samples = #self.samples_file
if self.loadFirst > 0 then
nb_samples = self.loadFirst
end

local responses_idx,contexts_idx = 1,2
if self.samples_file[1][2] == 'responses' then
responses_idx,contexts_idx = 2,1
end

for i=2, nb_samples do
self:processSample(self.samples_file[i][contexts_idx],self.samples_file[i][responses_idx])
if i % 1000 == 0 then
xlua.progress(i,nb_samples)
end
end

file:close()
self.examplesCount = #self.examples
end

function DataSet:batches(size)
local file = torch.DiskFile(self.examplesFilename, "r")
file:quiet()
function DataSet:batches(dataSource,size)
local done = false
local cursor = 1

return function()
if done then
@@ -132,11 +156,11 @@ function DataSet:batches(size)
local maxInputSeqLen,maxTargetOutputSeqLen = 0,0

for i = 1, size do
local example = file:readObject()
local example = dataSource[cursor]
cursor = cursor + 1
if example == nil then
done = true
file:close()
return examples
break
end
inputSeq,targetSeq = unpack(example)
if inputSeq:size(1) > maxInputSeqLen then
@@ -150,15 +174,9 @@
end

local encoderInputs,decoderInputs,decoderTargets = nil,nil,nil
if size == 1 then
encoderInputs = torch.IntTensor(maxInputSeqLen):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1):fill(0)
else
encoderInputs = torch.IntTensor(maxInputSeqLen,size):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
end
encoderInputs = torch.IntTensor(maxInputSeqLen,size):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)

for samplenb = 1, #inputSeqs do
for word = 1,inputSeqs[samplenb]:size(1) do
@@ -197,42 +215,42 @@
end
end
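
With this change, batches() is fed a table of examples rather than a DiskFile and always pads every batch to its longest sequence. A consumption sketch follows, assuming a DataSet instance named dataset and that the iterator yields the three tensors built above, as in the original training loop.

-- Consumption sketch (assumes the iterator returns the three tensors built above):
local batchSize = 10
for encoderInputs, decoderInputs, decoderTargets in dataset:batches(dataset.examples, batchSize) do
  -- encoderInputs:  maxInputSeqLen      x batchSize  (reversed inputs, zero-padded)
  -- decoderInputs:  maxTargetSeqLen - 1 x batchSize  (starts with <go>)
  -- decoderTargets: maxTargetSeqLen - 1 x batchSize  (ends with <eos>)
  -- forward/backward pass would go here
end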

function DataSet:visitConversation(lines, start)
start = start or 1

for i = start, #lines, 2 do
local input = lines[i]
local target = lines[i+1]
function DataSet:processSample(sampleInput, sampleTarget)
if sampleTarget then
local inputIds = self:visitText(sampleInput)
local targetIds = self:visitText(sampleTarget)

if target then
local inputIds = self:visitText(input.text)
local targetIds = self:visitText(target.text, 2)

if inputIds and targetIds then
-- Revert inputs
inputIds = list.reverse(inputIds)

table.insert(targetIds, 1, self.goToken)
table.insert(targetIds, self.eosToken)
if inputIds and targetIds then
-- Revert inputs
inputIds = list.reverse(inputIds)

table.insert(targetIds, 1, self.goToken)
table.insert(targetIds, self.eosToken)

if torch.uniform() >= self.devSplit then
table.insert(self.examples, { torch.IntTensor(inputIds), torch.IntTensor(targetIds) })
else
table.insert(self.devExamples, { torch.IntTensor(inputIds), torch.IntTensor(targetIds) })
end
end
end
end
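
The split above is sampled per example: with valSetSize = 0.1, roughly 10% of pairs end up in devExamples. Standalone, the check is just the following (the value is illustrative).

-- Equivalent standalone check (valSetSize value is illustrative):
require 'torch'
local devSplit = 0.1
local isTrain = torch.uniform() >= devSplit   -- true for ~90% of draws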

function DataSet:visitText(text, additionalTokens)
function DataSet:visitText(text)
local words = {}
additionalTokens = additionalTokens or 0

if text == "" then
return
end

for t, word in tokenizer.tokenize(text) do
table.insert(words, self:makeWordId(word))
local cWord = self.word2id[word:lower()]
if not cWord then
cWord = self.unknownToken
end
table.insert(words, cWord)
-- Only keep the first sentence
if t == "endpunct" or #words >= self.maxExampleLen - additionalTokens then
if t == "endpunct" or #words >= self.maxExampleLen then
break
end
end
@@ -244,22 +262,43 @@ function DataSet:visitText(text, additionalTokens)
return words
end

function DataSet:makeWordId(word)
if self.maxVocabSize > 0 and self.wordsCount >= self.maxVocabSize then
-- We've reached the maximum size for the vocab. Replace w/ unknown token
return self.unknownToken
end

word = word:lower()

local id = self.word2id[word]

if not id then
self.wordsCount = self.wordsCount + 1
id = self.wordsCount
self.id2word[id] = word
self.word2id[word] = id
function DataSet:countWords(sentence)
--if text == "" then
-- return
--end
for t, word in tokenizer.tokenize(sentence) do
local lword = word:lower()
if self.wordFreqs[lword] == nil then
self.wordFreqs[lword] = 0
end
self.wordFreqs[lword] = self.wordFreqs[lword] + 1
end
end

return id
function DataSet:addWordToVocab(word)
word = word:lower()
self.wordsCount = self.wordsCount + 1
self.word2id[word] = self.wordsCount
self.id2word[self.wordsCount] = word
return self.wordsCount
end

-- penlight from luarocks is outdated; a fixed version of sortv is below
--- return an iterator to a table sorted by its values
-- @within Iterating
-- @tab t the table
-- @func f an optional comparison function (f(x,y) is true if x < y)
-- @usage for k,v in tablex.sortv(t) do print(k,v) end
-- @return an iterator to traverse elements sorted by the values
function f_sortv(t,f)
f = function_arg(2, f or '<')
local keys = {}
for k in pairs(t) do keys[#keys + 1] = k end
table.sort(keys,function(x, y) return f(t[x], t[y]) end)
local i = 0
return function()
i = i + 1
return keys[i], t[keys[i]]
end
end
8 changes: 5 additions & 3 deletions eval.lua
@@ -3,14 +3,16 @@ local tokenizer = require "tokenizer"
local list = require "pl.List"
local options = {}


cmd = torch.CmdLine()
cmd:text('Options:')
cmd:option('--debug', false, 'show debug info')
cmd:text()
options = cmd:parse(arg)

-- Data
dataset = neuralconvo.DataSet()

local dataset = neuralconvo.DataSet("data/cornell_movie_dialogs/contextResponse.csv")
dataset:load(true)

print("-- Loading model")
model = torch.load("data/model.t7")
@@ -51,7 +53,7 @@ function say(text)
table.insert(wordIds, id)
end

local input = torch.Tensor(list.reverse(wordIds))
local input = torch.Tensor({list.reverse(wordIds)}):t()
local wordIds, probabilities = model:eval(input)

print("neuralconvo> " .. pred2sent(wordIds))
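
The one-line change to say() above exists because the model now expects (seqLen x batchSize) inputs; wrapping the reversed id list and transposing turns a single sentence into a one-column matrix. A tiny standalone shape check, with made-up ids:

-- Shape check for the reshaped eval input (ids are made up):
require 'torch'
local list = require 'pl.List'
local wordIds = {4, 8, 15}
local input = torch.Tensor({list.reverse(wordIds)}):t()
print(input:size())   -- 3x1: seqLen x batch of 1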
1 change: 1 addition & 0 deletions neuralconvo.lua
@@ -1,6 +1,7 @@
require 'torch'
require 'nn'
require 'rnn'
require 'csvigo'

neuralconvo = {}
