multilayer support, dataset improvements and more #38

Open
chenb67 wants to merge 43 commits into master
Commits (43)
b32fb17
add support for multilayer LSTM
chenb67 Jun 15, 2016
38bc326
update dataset handling to support vocab size, shuffle before every …
chenb67 Jun 20, 2016
c8467b3
fix bug with initial dataset parsing
chenb67 Jun 20, 2016
f57a66b
clean params memory when saving
chenb67 Jun 20, 2016
d890d60
fix bug with vocabulary creation (penlight bug)
chenb67 Jun 21, 2016
2265f55
add weight decay
chenb67 Jun 21, 2016
d0211c3
add support for validation set in dataset + eval function in seq2seq …
chenb67 Jun 22, 2016
1ffcb70
fix potential bug with samples loading / csv loading
chenb67 Jun 23, 2016
2584439
fix next bug + eval end of batch bug
chenb67 Jun 23, 2016
f6a7a38
fix dataset end of batch handling
chenb67 Jun 23, 2016
8d42e0a
add dropout + gradnorm monitoring
chenb67 Jun 26, 2016
43906b8
fix eval with dropout
chenb67 Jun 26, 2016
31b7115
fix eval procedure
chenb67 Jun 27, 2016
d4cf6d8
simplify criterion usage
chenb67 Jun 27, 2016
86124b7
fix loss calculation for minibatches
chenb67 Jun 27, 2016
cfecf99
use minibatches for validation set
chenb67 Jun 27, 2016
3d2aa19
add option to select param for early stopping
chenb67 Jun 28, 2016
899c8e4
change to SeqLSTM
chenb67 Jul 3, 2016
61c4438
fix optimParams memory efficiency between epochs
chenb67 Jul 3, 2016
38cb321
move criterion to CPU before saving
chenb67 Jul 3, 2016
b93c224
add support for multilayer LSTM
chenb67 Jun 15, 2016
e5fa3b1
update dataset handling to support vocab size, shuffle before every …
chenb67 Jun 20, 2016
586bd1b
fix bug with initial dataset parsing
chenb67 Jun 20, 2016
eb735d1
clean params memory when saving
chenb67 Jun 20, 2016
379508a
fix bug with vocabulary creation (penlight bug)
chenb67 Jun 21, 2016
3c400d5
add weight decay
chenb67 Jun 21, 2016
fe7bb24
add support for validation set in dataset + eval function in seq2seq …
chenb67 Jun 22, 2016
eccfd8a
fix potential bug with samples loading / csv loading
chenb67 Jun 23, 2016
5d598c2
fix next bug + eval end of batch bug
chenb67 Jun 23, 2016
ce0a9f5
fix dataset end of batch handling
chenb67 Jun 23, 2016
7082b0d
add dropout + gradnorm monitoring
chenb67 Jun 26, 2016
fa59e54
fix eval with dropout
chenb67 Jun 26, 2016
35f40a6
fix eval procedure
chenb67 Jun 27, 2016
80059eb
simplify criterion usage
chenb67 Jun 27, 2016
b1a0e59
fix loss calculation for minibatches
chenb67 Jun 27, 2016
b864799
use minibatches for validation set
chenb67 Jun 27, 2016
df6154f
add option to select param for early stopping
chenb67 Jun 28, 2016
b20c13e
change to SeqLSTM
chenb67 Jul 3, 2016
0414609
fix optimParams memory efficiency between epochs
chenb67 Jul 3, 2016
b72b6c2
move criterion to CPU before saving
chenb67 Jul 3, 2016
9f53e82
fix merge issues
chenb67 Jul 3, 2016
94fce61
merge master
chenb67 Jul 3, 2016
8a57f72
make seqlstm off by default and controlled via the flag --seqLstm
chenb67 Jul 12, 2016
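
The last commit gates the SeqLSTM path behind the --seqLstm flag. Below is a minimal sketch of how stacked LSTM layers and such a flag could be wired with the Element-Research rnn package; the --numLayers and --hiddenSize option names and the vocabulary size are assumptions for illustration only, not taken from this PR's seq2seq.lua (which is not shown on this page).

-- Minimal sketch (assumed names, not the PR's actual seq2seq.lua) of stacking
-- LSTM layers behind the --seqLstm flag named in the last commit.
require 'nn'
require 'rnn'

local cmd = torch.CmdLine()
cmd:option('--seqLstm', false, 'use nn.SeqLSTM (flag name from the commit message)')
cmd:option('--numLayers', 2, 'number of stacked LSTM layers (assumed option name)')
cmd:option('--hiddenSize', 512, 'LSTM hidden size (assumed option name)')
local opt = cmd:parse(arg or {})

local enc = nn.Sequential()
enc:add(nn.LookupTable(10000, opt.hiddenSize))  -- vocabulary size is a placeholder
for i = 1, opt.numLayers do
  if opt.seqLstm then
    -- nn.SeqLSTM consumes a whole (seqLen x batchSize x hiddenSize) tensor at once
    enc:add(nn.SeqLSTM(opt.hiddenSize, opt.hiddenSize))
  else
    -- nn.Sequencer applies a step-wise nn.LSTM over the time dimension
    enc:add(nn.Sequencer(nn.LSTM(opt.hiddenSize, opt.hiddenSize)))
  end
end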
13 changes: 11 additions & 2 deletions cornell_movie_dialogs.lua
@@ -67,6 +67,15 @@ function CornellMovieDialogs:load()
end

xlua.progress(TOTAL_LINES, TOTAL_LINES)

return conversations

print("-- Saving context-response samples ...")

contextResponse = {contexts = {},responses = {}}
for cnum,conv in ipairs(conversations) do
for i = 2, #conv do
table.insert(contextResponse.contexts,conv[i-1]['text'])
table.insert(contextResponse.responses,conv[i]['text'])
end
end
csvigo.save(self.dir .. "/contextResponse.csv",contextResponse)
end
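
A small sketch of the round trip this new code sets up: csvigo.save writes the contexts/responses columns, and dataset.lua (below) reads them back with csvigo.load{mode='large'}, where row 1 is the header. The file path and sentences here are throwaway examples, not part of the PR.

-- Sketch: write context/response pairs the way the code above does, then read
-- them back the way DataSet does (csvigo 'large' mode = list of rows, header first).
require 'csvigo'

local contextResponse = {
  contexts  = {"hi there.", "how are you?"},
  responses = {"how are you?", "fine, thanks."},
}
csvigo.save("/tmp/contextResponse.csv", contextResponse)

local rows = csvigo.load{path = "/tmp/contextResponse.csv", mode = 'large'}
-- Column order is not guaranteed, which is why DataSet:readSamples() checks
-- whether rows[1][2] == 'responses' before pairing columns.
for i = 2, #rows do
  print(rows[i][1], rows[i][2])
end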
235 changes: 137 additions & 98 deletions dataset.lua
@@ -14,30 +14,35 @@ local DataSet = torch.class("neuralconvo.DataSet")
local xlua = require "xlua"
local tokenizer = require "tokenizer"
local list = require "pl.List"
local utils = require "pl.utils"
local function_arg = utils.function_arg

function DataSet:__init(loader, options)

function DataSet:__init(samples_file, options)
options = options or {}

self.examplesFilename = "data/examples.t7"

-- Reject words once vocab size reaches this threshold
self.maxVocabSize = options.maxVocabSize or 0
self.vocabSize = options.vocabSize or -1

-- Maximum number of words in an example sentence
self.maxExampleLen = options.maxExampleLen or 25

-- Load only first few examples (approximately)
self.loadFirst = options.loadFirst
self.loadFirst = options.dataset or 0

-- Train/Dev/Test split
self.devSplit = options.valSetSize or 0
self.trainSplit = 1 - self.devSplit

self.examples = {}
self.word2id = {}
self.id2word = {}
self.wordsCount = 0

self:load(loader)
self.devExamples = {}
self.examplesCount = 0
self.samples_file = csvigo.load{path=samples_file,mode='large'}
end

function DataSet:load(loader)
function DataSet:load(vocabOnly)
local filename = "data/vocab.t7"

if path.exists(filename) then
@@ -49,79 +54,98 @@ function DataSet:load(loader)
self.goToken = data.goToken
self.eosToken = data.eosToken
self.unknownToken = data.unknownToken
self.examplesCount = data.examplesCount
else
print("" .. filename .. " not found")
self:visit(loader:load())
print("Writing " .. filename .. " ...")
self:buildVocab()
print("\nWriting " .. filename .. " ...")
torch.save(filename, {
word2id = self.word2id,
id2word = self.id2word,
wordsCount = self.wordsCount,
goToken = self.goToken,
eosToken = self.eosToken,
unknownToken = self.unknownToken,
examplesCount = self.examplesCount
})
unknownToken = self.unknownToken
})
end
if vocabOnly then
return
end
print "-- Loading samples"
self:readSamples()
end
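
For reference, this is how the reworked constructor and the vocabOnly flag get used elsewhere in this PR (eval.lua below calls load(true)); the option values here are illustrative only.

-- Usage sketch for the reworked DataSet (option names from this diff, values illustrative):
require 'neuralconvo'

local dataset = neuralconvo.DataSet("data/cornell_movie_dialogs/contextResponse.csv", {
  vocabSize  = 20000,  -- cap enforced in buildVocab()
  dataset    = 0,      -- 0 = load all samples (stored as self.loadFirst)
  valSetSize = 0.1,    -- fraction routed to devExamples in processSample()
})
dataset:load(true)     -- vocabOnly: build or read data/vocab.t7, skip readSamples()
-- dataset:load()      -- full load: vocab plus all context/response samples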

function DataSet:visit(conversations)
self.examples = {}

function DataSet:buildVocab()
-- Table for keeping track of word frequency
self.wordFreqs = {}
self.word2id = {}
self.id2word = {}
self.wordsCount = 0

-- Add magic tokens
self.goToken = self:makeWordId("<go>") -- Start of sequence
self.eosToken = self:makeWordId("<eos>") -- End of sequence
self.unknownToken = self:makeWordId("<unknown>") -- Word dropped from vocabulary

print("-- Pre-processing data")

local total = self.loadFirst or #conversations * 2
self.goToken = self:addWordToVocab("<go>") -- Start of sequence
self.eosToken = self:addWordToVocab("<eos>") -- End of sequence
self.unknownToken = self:addWordToVocab("<unknown>") -- Word dropped from vocabulary

for i, conversation in ipairs(conversations) do
if i > total then break end
self:visitConversation(conversation)
xlua.progress(i, total)
print("-- Build vocab")

local nb_samples = #self.samples_file
if self.loadFirst > 0 then
nb_samples = self.loadFirst
end

-- Revisit from the perspective of 2nd character
for i, conversation in ipairs(conversations) do
if #conversations + i > total then break end
self:visitConversation(conversation, 2)
xlua.progress(#conversations + i, total)

for i=2, nb_samples do
self:countWords(self.samples_file[i][1])
self:countWords(self.samples_file[i][2])
if i % 1000 == 0 then
xlua.progress(i,nb_samples)
end
end

local sortedCounts = f_sortv(self.wordFreqs,function(x,y) return x>y end)

for word,freq in sortedCounts do
nWordId = self:addWordToVocab(word)
if self.vocabSize > 0 and nWordId >= self.vocabSize then
break
end
end
end
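
A toy illustration of the capping above: words are counted, sorted by frequency with the f_sortv helper defined at the bottom of this file, and only the first vocabSize entries get ids; everything outside the cap later maps to <unknown> in visitText(). The frequency table below is made up.

-- Toy run of the vocab cap (uses the f_sortv helper defined at the end of this file):
local freqs = { the = 50, cat = 7, sat = 3, mat = 3, zebra = 1 }
local vocabSize = 3
local kept = {}
for word, _ in f_sortv(freqs, function(x, y) return x > y end) do
  table.insert(kept, word)
  if #kept >= vocabSize then break end
end
-- kept holds the most frequent words ({"the", "cat", ...}; ties break arbitrarily);
-- anything not kept falls back to the <unknown> id when visitText() runs.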

function DataSet:shuffleExamples()
print("-- Shuffling ")
newIdxs = torch.randperm(#self.examples)
local sExamples = {}
for i, sample in ipairs(self.examples) do
sExamples[i] = self.examples[newIdxs[i]]
end
self.examples = sExamples

self.examplesCount = #self.examples
self:writeExamplesToFile()
self.examples = nil

collectgarbage()
end

function DataSet:writeExamplesToFile()
print("Writing " .. self.examplesFilename .. " ...")
local file = torch.DiskFile(self.examplesFilename, "w")

for i, example in ipairs(self.examples) do
file:writeObject(example)
xlua.progress(i, #self.examples)
function DataSet:readSamples()
local nb_samples = #self.samples_file
if self.loadFirst > 0 then
nb_samples = self.loadFirst
end

local responses_idx,contexts_idx = 1,2
if self.samples_file[1][2] == 'responses' then
responses_idx,contexts_idx = 2,1
end

for i=2, nb_samples do
self:processSample(self.samples_file[i][contexts_idx],self.samples_file[i][responses_idx])
if i % 1000 == 0 then
xlua.progress(i,nb_samples)
end
end

file:close()
self.examplesCount = #self.examples
end

function DataSet:batches(size)
local file = torch.DiskFile(self.examplesFilename, "r")
file:quiet()
function DataSet:batches(dataSource,size)
local done = false
local cursor = 1

return function()
if done then
@@ -132,11 +156,11 @@ function DataSet:batches(size)
local maxInputSeqLen,maxTargetOutputSeqLen = 0,0

for i = 1, size do
local example = file:readObject()
local example = dataSource[cursor]
cursor = cursor + 1
if example == nil then
done = true
file:close()
return examples
break
end
inputSeq,targetSeq = unpack(example)
if inputSeq:size(1) > maxInputSeqLen then
@@ -150,15 +174,9 @@
end

local encoderInputs,decoderInputs,decoderTargets = nil,nil,nil
if size == 1 then
encoderInputs = torch.IntTensor(maxInputSeqLen):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1):fill(0)
else
encoderInputs = torch.IntTensor(maxInputSeqLen,size):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
end
encoderInputs = torch.IntTensor(maxInputSeqLen,size):fill(0)
decoderInputs = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)
decoderTargets = torch.IntTensor(maxTargetOutputSeqLen-1,size):fill(0)

for samplenb = 1, #inputSeqs do
for word = 1,inputSeqs[samplenb]:size(1) do
@@ -197,42 +215,42 @@
end
end
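
With this change, batches() is fed a table of examples rather than a DiskFile and always pads every batch to its longest sequence. A consumption sketch follows, assuming a DataSet instance named dataset and that the iterator yields the three tensors built above, as in the original training loop.

-- Consumption sketch (assumes the iterator returns the three tensors built above):
local batchSize = 10
for encoderInputs, decoderInputs, decoderTargets in dataset:batches(dataset.examples, batchSize) do
  -- encoderInputs:  maxInputSeqLen      x batchSize  (reversed inputs, zero-padded)
  -- decoderInputs:  maxTargetSeqLen - 1 x batchSize  (starts with <go>)
  -- decoderTargets: maxTargetSeqLen - 1 x batchSize  (ends with <eos>)
  -- forward/backward pass would go here
end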

function DataSet:visitConversation(lines, start)
start = start or 1

for i = start, #lines, 2 do
local input = lines[i]
local target = lines[i+1]
function DataSet:processSample(sampleInput, sampleTarget)
if sampleTarget then
local inputIds = self:visitText(sampleInput)
local targetIds = self:visitText(sampleTarget)

if target then
local inputIds = self:visitText(input.text)
local targetIds = self:visitText(target.text, 2)

if inputIds and targetIds then
-- Revert inputs
inputIds = list.reverse(inputIds)

table.insert(targetIds, 1, self.goToken)
table.insert(targetIds, self.eosToken)
if inputIds and targetIds then
-- Revert inputs
inputIds = list.reverse(inputIds)

table.insert(targetIds, 1, self.goToken)
table.insert(targetIds, self.eosToken)

if torch.uniform() >= self.devSplit then
table.insert(self.examples, { torch.IntTensor(inputIds), torch.IntTensor(targetIds) })
else
table.insert(self.devExamples, { torch.IntTensor(inputIds), torch.IntTensor(targetIds) })
end
end
end
end
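
The split above is sampled per example: with valSetSize = 0.1, roughly 10% of pairs end up in devExamples. Standalone, the check is just the following (the value is illustrative).

-- Equivalent standalone check (valSetSize value is illustrative):
require 'torch'
local devSplit = 0.1
local isTrain = torch.uniform() >= devSplit   -- true for ~90% of draws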

function DataSet:visitText(text, additionalTokens)
function DataSet:visitText(text)
local words = {}
additionalTokens = additionalTokens or 0

if text == "" then
return
end

for t, word in tokenizer.tokenize(text) do
table.insert(words, self:makeWordId(word))
local cWord = self.word2id[word:lower()]
if not cWord then
cWord = self.unknownToken
end
table.insert(words, cWord)
-- Only keep the first sentence
if t == "endpunct" or #words >= self.maxExampleLen - additionalTokens then
if t == "endpunct" or #words >= self.maxExampleLen then
break
end
end
@@ -244,22 +262,43 @@ function DataSet:visitText(text, additionalTokens)
return words
end

function DataSet:makeWordId(word)
if self.maxVocabSize > 0 and self.wordsCount >= self.maxVocabSize then
-- We've reached the maximum size for the vocab. Replace w/ unknown token
return self.unknownToken
end

word = word:lower()

local id = self.word2id[word]

if not id then
self.wordsCount = self.wordsCount + 1
id = self.wordsCount
self.id2word[id] = word
self.word2id[word] = id
function DataSet:countWords(sentence)
--if text == "" then
-- return
--end
for t, word in tokenizer.tokenize(sentence) do
local lword = word:lower()
if self.wordFreqs[lword] == nil then
self.wordFreqs[lword] = 0
end
self.wordFreqs[lword] = self.wordFreqs[lword] + 1
end
end

return id
function DataSet:addWordToVocab(word)
word = word:lower()
self.wordsCount = self.wordsCount + 1
self.word2id[word] = self.wordsCount
self.id2word[self.wordsCount] = word
return self.wordsCount
end

-- penlight from luarocks is outdated; a fixed version of sortv is below
--- return an iterator to a table sorted by its values
-- @within Iterating
-- @tab t the table
-- @func f an optional comparison function (f(x,y) is true if x < y)
-- @usage for k,v in tablex.sortv(t) do print(k,v) end
-- @return an iterator to traverse elements sorted by the values
function f_sortv(t,f)
f = function_arg(2, f or '<')
local keys = {}
for k in pairs(t) do keys[#keys + 1] = k end
table.sort(keys,function(x, y) return f(t[x], t[y]) end)
local i = 0
return function()
i = i + 1
return keys[i], t[keys[i]]
end
end
8 changes: 5 additions & 3 deletions eval.lua
@@ -3,14 +3,16 @@ local tokenizer = require "tokenizer"
local list = require "pl.List"
local options = {}


cmd = torch.CmdLine()
cmd:text('Options:')
cmd:option('--debug', false, 'show debug info')
cmd:text()
options = cmd:parse(arg)

-- Data
dataset = neuralconvo.DataSet()

local dataset = neuralconvo.DataSet("data/cornell_movie_dialogs/contextResponse.csv")
dataset:load(true)

print("-- Loading model")
model = torch.load("data/model.t7")
@@ -51,7 +53,7 @@ function say(text)
table.insert(wordIds, id)
end

local input = torch.Tensor(list.reverse(wordIds))
local input = torch.Tensor({list.reverse(wordIds)}):t()
local wordIds, probabilities = model:eval(input)

print("neuralconvo> " .. pred2sent(wordIds))
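
The one-line change to say() above exists because the model now expects (seqLen x batchSize) inputs; wrapping the reversed id list and transposing turns a single sentence into a one-column matrix. A tiny standalone shape check, with made-up ids:

-- Shape check for the reshaped eval input (ids are made up):
require 'torch'
local list = require 'pl.List'
local wordIds = {4, 8, 15}
local input = torch.Tensor({list.reverse(wordIds)}):t()
print(input:size())   -- 3x1: seqLen x batch of 1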
1 change: 1 addition & 0 deletions neuralconvo.lua
@@ -1,6 +1,7 @@
require 'torch'
require 'nn'
require 'rnn'
require 'csvigo'

neuralconvo = {}
