batch run loader
Yoon Kim committed Aug 17, 2015
1 parent e26ea49 commit d72d88b
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 22 deletions.
README.md: 11 changes (6 additions, 5 deletions)
@@ -36,32 +36,33 @@ Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10
widely used by the language modeling community) is given as the default.

### Model
Here are some example scripts.
Here are some example scripts. Add `-gpuid 0` to use a GPU (which is
required to get any reasonable speed with the CNN)

Large character-level model (`LSTM-CharCNN-Large` in the paper).
This is the default: should get ~82 on valid and ~79 on test.
```
th main.lua -gpuid 0 -savefile char-large
th main.lua -savefile char-large
```

Small character-level model (`LSTM-CharCNN-Small` in the paper).
This should get ~96 on valid and ~93 on test.
```
th main.lua -gpuid 0 -savefile char-small -rnn_size 300 -highway_layers 1
th main.lua -savefile char-small -rnn_size 300 -highway_layers 1
-kernels '{1,2,3,4,5,6}' -feature_maps '{25,50,75,100,125,150}'
```

Large word-level model (`LSTM-Word-Large` in the paper).
This should get ~89 on valid and ~85 on test.
```
th main.lua -gpuid 0 -savefile word-large -word_vec_size 650 -highway_layers 0
th main.lua -savefile word-large -word_vec_size 650 -highway_layers 0
-use_chars 0 -use_words 1
```

Small word-level model (`LSTM-Word-Small` in the paper).
This should get ~101 on valid and ~98 on test.
```
th main.lua -gpuid 0 -savefile word-small -word_vec_size 200 -highway_layers 0
th main.lua -savefile word-small -word_vec_size 200 -highway_layers 0
-use_chars 0 -use_words 1 -rnn_size 200
```
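As the updated README text above notes, `-gpuid 0` can be added to any of these commands to run on a GPU. For example, the large character-level model on a GPU (this is simply the command from the first block with the flag restored):
```
th main.lua -gpuid 0 -savefile char-large
```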

main.lua: 2 changes (1 addition, 1 deletion)
@@ -61,7 +61,7 @@ cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid c
cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
-- GPU/CPU
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
cmd:option('-time', 0, 'print batch times')
cmd:option('-time', 1, 'print batch times')
cmd:text()

-- parse input params
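The `-time` option above only toggles printing of per-batch timings. As a rough sketch of how such a flag is typically wired into a Torch training loop (illustrative only, not the actual code in main.lua; `opt` is assumed to be the table returned by `cmd:parse`, and `num_batches` and `train_one_batch` are placeholders):
```
require 'torch'

-- Illustrative only: time each batch and print the duration when opt.time is set.
local timer = torch.Timer()
for batch = 1, num_batches do          -- num_batches: placeholder for the loader's batch count
  timer:reset()
  train_one_batch(batch)               -- placeholder for the forward/backward pass
  if opt.time > 0 then
    print(string.format('batch %d took %.3f seconds', batch, timer:time().real))
  end
end
```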
util/BatchLoaderUnk.lua: 28 changes (12 additions, 16 deletions)
@@ -33,14 +33,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
self.vocab_size = #self.idx2word
print(string.format('Word vocab size: %d, Char vocab size: %d', #self.idx2word, #self.idx2char))
-- create word-char mappings
self.max_word_l = 0
for i = 1, #self.idx2word do
self.max_word_l = math.max(self.max_word_l, self.idx2word[i]:len()) -- get max word length
end
self.max_word_l = self.max_word_l + 2*self.padding -- pad at start and end
if self.max_word_l ~= nil then
self.max_word_l = math.min(self.max_word_l, max_word_l)
end
self.max_word_l = all_data_char[1]:size(2)
-- cut off the end for train/valid sets so that it divides evenly
-- test set is not cut off
self.batch_size = batch_size
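With this change, `create` no longer rescans the vocabulary for the longest word; `self.max_word_l` is read off the width of the preprocessed character tensor, which `text_to_tensor` now sizes to the (capped) maximum word length. A minimal sketch of why `all_data_char[1]:size(2)` recovers that value, with made-up shapes:
```
require 'torch'

-- Hypothetical shapes: 5 word tokens in the training split, max word length 12.
-- The char tensor is preallocated as (num_tokens x max_word_l), so its second
-- dimension is exactly the max word length used during preprocessing.
local all_data_char = { torch.ones(5, 12):long() }  -- index 1 = training split
print(all_data_char[1]:size(2))                     -- prints 12
```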
@@ -117,6 +110,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local output_tensors = {} -- output tensors for train/val/test
local output_chars = {} -- output character for train/val/test sets (not tensors yet)
local vocab_count = {} -- vocab count
local max_word_l_tmp = 0
local idx2word = {tokens.UNK} -- unknown word token
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
@@ -128,23 +122,23 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
-- First count all the words in the string.
local counts = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK)

line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
for word in line:gmatch'([^%s]+)' do
counts = counts + 1
max_word_l_tmp = math.max(max_word_l_tmp, word:len())
end
counts = counts + 1
end
f:close()

-- if actual max word length is less than the limit, use that
max_word_l = math.min(max_word_l_tmp, max_word_l)

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(counts)
output_chars[split] = torch.LongTensor(counts, max_word_l):ones()

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.

output_chars[split] = torch.ones(counts, max_word_l):long()
print(output_chars[split]:size(2))
f = io.open(input_files[split], 'r')
local word_num = 0
for line in f:lines() do
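The counting pass now also tracks the longest word seen in the raw text (`max_word_l_tmp`) and caps it at the configured limit before the character tensor is preallocated. A small self-contained sketch of that first pass over one line, with made-up values (`'|'` stands in for `tokens.UNK`, 30 for the configured limit):
```
local stringx = require('pl.stringx')      -- Penlight, already used by the loader (see stringx.replace above)

local line = 'the quick <unk> fox'
line = stringx.replace(line, '<unk>', '|') -- '|' stands in for tokens.UNK

local counts, max_word_l_tmp = 0, 0
for word in line:gmatch'([^%s]+)' do
  counts = counts + 1
  max_word_l_tmp = math.max(max_word_l_tmp, word:len())
end
counts = counts + 1                        -- one extra slot per line for the EOS token

local max_word_l = math.min(max_word_l_tmp, 30)  -- 30 stands in for the configured limit
print(counts, max_word_l)                        -- 5   5
```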
@@ -181,7 +175,9 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
end
append(rword)
end
append("+")
if tokens.EOS ~= '' then
append(tokens.EOS)
end
end
end
print "done"
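The final hunk makes the end-of-sentence marker conditional: rather than always appending a literal '+', the loader appends `tokens.EOS` only when it is non-empty. This matches the `-EOS` help text in main.lua above ('+' for PTB, blank for other corpora), so, for example (invocation style borrowed from the README, other options omitted):
```
th main.lua -EOS '+'    # PTB-style data: mark the end of each sentence with '+'
th main.lua -EOS ''     # other corpora: do not append an end-of-sentence token
```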
