
Commit: readme
Yoon Kim committed Aug 17, 2015
1 parent d72d88b commit 11fd291
Showing 4 changed files with 38 additions and 13 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -1,8 +1,8 @@
## Neural Language Modeling with Characters
A neural language model (NLM) built on character inputs only.
-The model employs a convolutional neural network (CNN) over character
-embeddings to use as inputs into an long short-term memory (LSTM)
-recurrent neural network language model (RNNLM). Also optionally
+The model employs a convolutional neural network (CNN) over characters
+to use as inputs into an long short-term memory (LSTM)
+recurrent neural network language model (RNN-LM). Also optionally
passes the output from the CNN through a [Highway Network](http://arxiv.org/abs/1507.06228),
which improves performance.

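For reference, a highway layer of the kind mentioned above gates its own input: a sigmoid "transform" gate t decides how much of a nonlinear transform of the input passes through, and (1 - t) of the raw input is carried over unchanged. A minimal `nngraph` sketch is below; the ReLU nonlinearity and the `highway_layer` name are illustrative assumptions, not necessarily what this repo implements.

```lua
-- Sketch of one highway layer: y = t * g(W_h x + b_h) + (1 - t) * x,
-- with transform gate t = sigmoid(W_t x + b_t).
require 'nn'
require 'nngraph'

local function highway_layer(size)
  local x = nn.Identity()()
  local t = nn.Sigmoid()(nn.Linear(size, size)(x))           -- transform gate
  local carry = nn.AddConstant(1)(nn.MulConstant(-1)(t))     -- 1 - t
  local h = nn.ReLU()(nn.Linear(size, size)(x))              -- candidate transform g
  local y = nn.CAddTable()({ nn.CMulTable()({t, h}),
                             nn.CMulTable()({carry, x}) })
  return nn.gModule({x}, {y})
end
```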
@@ -12,6 +12,12 @@ will be posted on arXiv very soon.
Much of the base code is from Andrej Karpathy's excellent character RNN implementation,
available at https://github.com/karpathy/char-rnn

+Also, the repo name 'word-char-rnn' is bit of a misnomer, as the primary motivation
+is to use character-level inputs only. But as a baseline we implemented the
+word-level models (and also experimented with models whereby the input
+is a concatenation of the word embedding and the output from a character CNN),
+hence the name.

### Requirements
Code is written in Lua and requires Torch. It additionally requires
the `nngraph` and `optim` packages, which can be installed via:
@@ -36,7 +42,7 @@ Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10
widely used by the language modeling community) is given as the default.

### Model
-Here are some example scripts. Add `-gpuid 0` to use a GPU (which is
+Here are some example scripts. Add `-gpuid 0` to each line to use a GPU (which is
required to get any reasonable speed with the CNN)

Large character-level model (`LSTM-CharCNN-Large` in the paper).
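To make the `-gpuid 0` note concrete (the example script lines themselves are collapsed in this hunk), a GPU run might be launched roughly as follows; only `-gpuid`, `-EOS`, and `-time` appear in this commit's diffs, and any other flags the actual scripts pass are not shown here.

```
th main.lua -gpuid 0 -EOS '+' -time 1
```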
2 changes: 1 addition & 1 deletion main.lua
@@ -61,7 +61,7 @@ cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid c
cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
-- GPU/CPU
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
-cmd:option('-time', 1, 'print batch times')
+cmd:option('-time', 0, 'print batch times')
cmd:text()

-- parse input params
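The two options above follow Torch's standard `CmdLine` pattern; they end up in an `opt` table roughly as sketched below (a generic illustration of the pattern, not this repo's exact code):

```lua
-- Generic torch.CmdLine usage mirroring the cmd:option(...) calls above.
require 'torch'

local cmd = torch.CmdLine()
cmd:option('-gpuid', -1, 'which gpu to use. -1 = use CPU')
cmd:option('-time', 0, 'print batch times')
cmd:text()
local opt = cmd:parse(arg)          -- arg holds the command-line arguments

if opt.gpuid >= 0 then              -- the usual Torch idiom for switching to GPU
  require 'cutorch'
  require 'cunn'
  cutorch.setDevice(opt.gpuid + 1)  -- Torch device ids are 1-indexed
end
```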
4 changes: 3 additions & 1 deletion model/TDNN.lua
@@ -23,7 +23,9 @@ function TDNN.tdnn(length, input_size, feature_maps, kernels)
local conv = nn.TemporalConvolution(input_size, feature_maps[i], kernels[i])
local conv_layer = conv(input)
conv.name = 'conv_filter_' .. kernels[i] .. '_' .. feature_maps[i]
-pool_layer = nn.Max(2)(nn.Tanh()(conv_layer))
+--pool_layer = nn.Max(2)(nn.Tanh()(conv_layer))
+pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+pool_layer = nn.Squeeze()(pool_layer)
else
-- Use CuDNN for temporal convolution.
if not cudnn then require 'cudnn' end
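The change above replaces `nn.Max(2)` with a `TemporalMaxPooling` window spanning the whole convolved sequence, followed by `nn.Squeeze` to drop the resulting singleton time dimension; both amount to max-over-time pooling of each feature map. A self-contained sketch of one such convolution branch follows, assuming (as is standard for `nn.TemporalConvolution` with stride 1) that `reduced_l = length - kernel_w + 1`; the concrete sizes are made up for illustration.

```lua
-- One character-CNN filter width with max-over-time pooling (illustrative sketch).
require 'nn'
require 'nngraph'

local length, input_size, n_maps, kernel_w = 35, 15, 100, 5
local reduced_l = length - kernel_w + 1          -- frames remaining after convolution

local input = nn.Identity()()                    -- batch x length x input_size
local conv  = nn.TemporalConvolution(input_size, n_maps, kernel_w)(input)
-- take the max over all reduced_l time steps, then drop the singleton time dim
local pool  = nn.Squeeze()(nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv)))
local net   = nn.gModule({input}, {pool})

local out = net:forward(torch.randn(20, length, input_size))
print(out:size())                                -- 20 x n_maps
```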
31 changes: 24 additions & 7 deletions util/BatchLoaderUnk.lua
@@ -115,6 +115,24 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local char2idx = {}; char2idx[tokens.ZEROPAD] = 1; char2idx[tokens.START] = 2; char2idx[tokens.END] = 3

+-- first go through train/valid/test to get max word length
+-- if actual max word length (e.g. 19 for PTB) is smaller than specified
+-- we use that instead. this is inefficient, but only a one-off thing
+for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
+f = io.open(input_files[split], 'r')
+for line in f:lines() do
+for word in line:gmatch'([^%s]+)' do
+max_word_l_tmp = math.max(max_word_l_tmp, word:len())
+end
+end
+f:close()
+end
+
+print('After first pass of data, max word length is: ' .. max_word_l_tmp)
+-- if actual max word length is less than the limit, use that
+max_word_l = math.min(max_word_l_tmp, max_word_l)

for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
output = {}
output_char = {}
@@ -123,26 +141,25 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local counts = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
+line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
+line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
for word in line:gmatch'([^%s]+)' do
-counts = counts + 1
-max_word_l_tmp = math.max(max_word_l_tmp, word:len())
-end
+counts = counts + 1
+end
f:close()

--- if actual max word length is less than the limit, use that
-max_word_l = math.min(max_word_l_tmp, max_word_l)

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(counts)
output_chars[split] = torch.ones(counts, max_word_l):long()
print(output_chars[split]:size(2))
f = io.open(input_files[split], 'r')
local word_num = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK)
line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
line = stringx.replace(line, tokens.END, '')
for rword in line:gmatch'([^%s]+)' do
function append(word)
word_num = word_num + 1
@@ -175,8 +192,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
end
append(rword)
end
-if tokens.EOS ~= '' then
-append(tokens.EOS)
+if tokens.EOS ~= '' then --PTB does not have <eos> so we add a character for <eos> tokens
+append(tokens.EOS) --other datasets with periods or <eos> already present do not need this
end
end
end
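The preallocation above, `torch.ones(counts, max_word_l)`, makes index 1 (the zero-padding character) the default fill for every row; `append` then writes each word's start-of-word marker, character indices, and end-of-word marker into its row. The body of `append` is collapsed in this diff, so the per-word sketch below is an assumption about the layout rather than the repo's actual code.

```lua
-- Hypothetical sketch: turn one word into a row of the (counts x max_word_l) char tensor.
-- Indices 1, 2, 3 are the zero-pad, start-of-word, and end-of-word tokens, as set up
-- in char2idx above; char2idx is assumed to already hold every character in the data.
local function word_to_chars(word, tokens, char2idx, max_word_l)
  local chars = torch.LongTensor(max_word_l):fill(1)    -- 1 = zero-pad everywhere
  chars[1] = char2idx[tokens.START]                     -- start-of-word marker
  local limit = math.min(word:len(), max_word_l - 2)    -- leave room for the two markers
  for i = 1, limit do
    chars[i + 1] = char2idx[word:sub(i, i)]
  end
  chars[limit + 2] = char2idx[tokens.END]               -- end-of-word marker
  return chars
end
```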
