diff --git a/README.md b/README.md
index 4b084c1..585124b 100644
--- a/README.md
+++ b/README.md
@@ -36,32 +36,33 @@ Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10
 widely used by the language modeling community) is given as the default.
 
 ### Model
-Here are some example scripts.
+Here are some example scripts. Add `-gpuid 0` to use a GPU (which is
+required to get any reasonable speed with the CNN).
 
 Large character-level model (`LSTM-CharCNN-Large` in the paper).
 This is the default: should get ~82 on valid and ~79 on test.
 ```
-th main.lua -gpuid 0 -savefile char-large
+th main.lua -savefile char-large
 ```
 
 Small character-level model (`LSTM-CharCNN-Small` in the paper).
 This should get ~96 on valid and ~93 on test.
 ```
-th main.lua -gpuid 0 -savefile char-small -rnn_size 300 -highway_layers 1
+th main.lua -savefile char-small -rnn_size 300 -highway_layers 1
 -kernels '{1,2,3,4,5,6}' -feature_maps '{25,50,75,100,125,150}'
 ```
 
 Large word-level model (`LSTM-Word-Large` in the paper).
 This should get ~89 on valid and ~85 on test.
 ```
-th main.lua -gpuid 0 -savefile word-large -word_vec_size 650 -highway_layers 0
+th main.lua -savefile word-large -word_vec_size 650 -highway_layers 0
 -use_chars 0 -use_words 1
 ```
 
 Small word-level model (`LSTM-Word-Small` in the paper).
 This should get ~101 on valid and ~98 on test.
 ```
-th main.lua -gpuid 0 -savefile word-small -word_vec_size 200 -highway_layers 0
+th main.lua -savefile word-small -word_vec_size 200 -highway_layers 0
 -use_chars 0 -use_words 1 -rnn_size 200
 ```
 
diff --git a/main.lua b/main.lua
index 58c984c..9e19a4b 100644
--- a/main.lua
+++ b/main.lua
@@ -61,7 +61,7 @@ cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid c
 cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
 -- GPU/CPU
 cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
-cmd:option('-time', 0, 'print batch times')
+cmd:option('-time', 1, 'print batch times')
 cmd:text()
 
 -- parse input params
diff --git a/util/BatchLoaderUnk.lua b/util/BatchLoaderUnk.lua
index 71f070a..43d5bac 100644
--- a/util/BatchLoaderUnk.lua
+++ b/util/BatchLoaderUnk.lua
@@ -33,14 +33,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
     self.vocab_size = #self.idx2word
     print(string.format('Word vocab size: %d, Char vocab size: %d', #self.idx2word, #self.idx2char))
     -- create word-char mappings
-    self.max_word_l = 0
-    for i = 1, #self.idx2word do
-        self.max_word_l = math.max(self.max_word_l, self.idx2word[i]:len()) -- get max word length
-    end
-    self.max_word_l = self.max_word_l + 2*self.padding -- pad at start and end
-    if self.max_word_l ~= nil then
-        self.max_word_l = math.min(self.max_word_l, max_word_l)
-    end
+    self.max_word_l = all_data_char[1]:size(2)
     -- cut off the end for train/valid sets so that it divides evenly
     -- test set is not cut off
     self.batch_size = batch_size
@@ -117,6 +110,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
     local output_tensors = {} -- output tensors for train/val/test
     local output_chars = {} -- output character for train/val/test sets (not tensors yet)
     local vocab_count = {} -- vocab count
+    local max_word_l_tmp = 0
     local idx2word = {tokens.UNK} -- unknown word token
     local word2idx = {}; word2idx[tokens.UNK] = 1
     local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
@@ -128,23 +122,23 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
         -- First count all the words in the string.
         local counts = 0
         for line in f:lines() do
-            line = stringx.replace(line, '<unk>', tokens.UNK)
-
+            line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
             for word in line:gmatch'([^%s]+)' do
                 counts = counts + 1
+                max_word_l_tmp = math.max(max_word_l_tmp, word:len())
             end
             counts = counts + 1
         end
         f:close()
+
+        -- if actual max word length is less than the limit, use that
+        max_word_l = math.min(max_word_l_tmp, max_word_l)
 
         -- Next preallocate the tensors we will need.
         -- Watch out the second one needs a lot of RAM.
         output_tensors[split] = torch.LongTensor(counts)
-        output_chars[split] = torch.LongTensor(counts, max_word_l):ones()
-
-        -- Next preallocate the tensors we will need.
-        -- Watch out the second one needs a lot of RAM.
-
+        output_chars[split] = torch.ones(counts, max_word_l):long()
+        print(output_chars[split]:size(2))
         f = io.open(input_files[split], 'r')
         local word_num = 0
         for line in f:lines() do
@@ -181,7 +175,9 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
                 end
                 append(rword)
             end
-            append("+")
+            if tokens.EOS ~= '' then
+                append(tokens.EOS)
+            end
         end
     end
     print "done"
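
For context on the preprocessing hunks above: `text_to_tensor` now tracks the longest word it actually sees while counting tokens and shrinks `max_word_l` to that value whenever it falls below the configured cap, and `BatchLoaderUnk.create` then reads the width back from the saved character tensor (`all_data_char[1]:size(2)`) instead of rescanning the vocabulary. A minimal standalone sketch of the truncation logic in plain Lua (the word list and the cap of 65 are illustrative assumptions, not values taken from a real run):
```
-- Sketch of the max-word-length truncation from text_to_tensor above.
-- Runs in plain Lua without Torch; the words and the cap are made up.
local max_word_l = 65     -- configured upper bound (the max_word_l cap)
local max_word_l_tmp = 0  -- longest word observed while counting tokens

for _, word in ipairs({'the', 'treebank', 'community'}) do
    max_word_l_tmp = math.max(max_word_l_tmp, word:len())
end

-- if the actual max word length is less than the limit, use that,
-- so the counts x max_word_l character tensor is no wider than needed
max_word_l = math.min(max_word_l_tmp, max_word_l)
print(max_word_l) -- 9 ('community' is the longest word here)
```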
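
The final hunk stops hard-coding the PTB end-of-sentence marker: `append("+")` becomes `append(tokens.EOS)`, guarded so that nothing is appended when the token is blank. Assuming `tokens.EOS` is populated from the `-EOS` option (whose help text above says to use a blank for non-PTB data), a hypothetical invocation for such a corpus would be:
```
th main.lua -savefile char-large -EOS ''
```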