batch run loader
Yoon Kim committed Aug 17, 2015
1 parent e26ea49 commit d72d88b
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 22 deletions.
README.md: 11 changes (6 additions, 5 deletions)
@@ -36,32 +36,33 @@ Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10
widely used by the language modeling community) is given as the default.

### Model
Here are some example scripts.
Here are some example scripts. Add `-gpuid 0` to use a GPU (which is
required to get any reasonable speed with the CNN)

Large character-level model (`LSTM-CharCNN-Large` in the paper).
This is the default: should get ~82 on valid and ~79 on test.
```
th main.lua -gpuid 0 -savefile char-large
th main.lua -savefile char-large
```

Small character-level model (`LSTM-CharCNN-Small` in the paper).
This should get ~96 on valid and ~93 on test.
```
th main.lua -gpuid 0 -savefile char-small -rnn_size 300 -highway_layers 1
th main.lua -savefile char-small -rnn_size 300 -highway_layers 1
-kernels '{1,2,3,4,5,6}' -feature_maps '{25,50,75,100,125,150}'
```

Large word-level model (`LSTM-Word-Large` in the paper).
This should get ~89 on valid and ~85 on test.
```
th main.lua -gpuid 0 -savefile word-large -word_vec_size 650 -highway_layers 0
th main.lua -savefile word-large -word_vec_size 650 -highway_layers 0
-use_chars 0 -use_words 1
```

Small word-level model (`LSTM-Word-Small` in the paper).
This should get ~101 on valid and ~98 on test.
```
th main.lua -gpuid 0 -savefile word-small -word_vec_size 200 -highway_layers 0
th main.lua -savefile word-small -word_vec_size 200 -highway_layers 0
-use_chars 0 -use_words 1 -rnn_size 200
```
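As the updated README text above notes, `-gpuid 0` can be added to any of these commands to run on a GPU. For example, the large character-level model on a GPU (this is simply the command from the first block with the flag restored):
```
th main.lua -gpuid 0 -savefile char-large
```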

main.lua: 2 changes (1 addition, 1 deletion)
@@ -61,7 +61,7 @@ cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid c
cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
-- GPU/CPU
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
cmd:option('-time', 0, 'print batch times')
cmd:option('-time', 1, 'print batch times')
cmd:text()

-- parse input params
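The `-time` option above only toggles printing of per-batch timings. As a rough sketch of how such a flag is typically wired into a Torch training loop (illustrative only, not the actual code in main.lua; `opt` is assumed to be the table returned by `cmd:parse`, and `num_batches` and `train_one_batch` are placeholders):
```
require 'torch'

-- Illustrative only: time each batch and print the duration when opt.time is set.
local timer = torch.Timer()
for batch = 1, num_batches do          -- num_batches: placeholder for the loader's batch count
  timer:reset()
  train_one_batch(batch)               -- placeholder for the forward/backward pass
  if opt.time > 0 then
    print(string.format('batch %d took %.3f seconds', batch, timer:time().real))
  end
end
```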
util/BatchLoaderUnk.lua: 28 changes (12 additions, 16 deletions)
@@ -33,14 +33,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
self.vocab_size = #self.idx2word
print(string.format('Word vocab size: %d, Char vocab size: %d', #self.idx2word, #self.idx2char))
-- create word-char mappings
self.max_word_l = 0
for i = 1, #self.idx2word do
self.max_word_l = math.max(self.max_word_l, self.idx2word[i]:len()) -- get max word length
end
self.max_word_l = self.max_word_l + 2*self.padding -- pad at start and end
if self.max_word_l ~= nil then
self.max_word_l = math.min(self.max_word_l, max_word_l)
end
self.max_word_l = all_data_char[1]:size(2)
-- cut off the end for train/valid sets so that it divides evenly
-- test set is not cut off
self.batch_size = batch_size
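With this change, `create` no longer rescans the vocabulary for the longest word; `self.max_word_l` is read off the width of the preprocessed character tensor, which `text_to_tensor` now sizes to the (capped) maximum word length. A minimal sketch of why `all_data_char[1]:size(2)` recovers that value, with made-up shapes:
```
require 'torch'

-- Hypothetical shapes: 5 word tokens in the training split, max word length 12.
-- The char tensor is preallocated as (num_tokens x max_word_l), so its second
-- dimension is exactly the max word length used during preprocessing.
local all_data_char = { torch.ones(5, 12):long() }  -- index 1 = training split
print(all_data_char[1]:size(2))                     -- prints 12
```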
@@ -117,6 +110,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local output_tensors = {} -- output tensors for train/val/test
local output_chars = {} -- output character for train/val/test sets (not tensors yet)
local vocab_count = {} -- vocab count
local max_word_l_tmp = 0
local idx2word = {tokens.UNK} -- unknown word token
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
@@ -128,23 +122,23 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
-- First count all the words in the string.
local counts = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK)

line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
for word in line:gmatch'([^%s]+)' do
counts = counts + 1
max_word_l_tmp = math.max(max_word_l_tmp, word:len())
end
counts = counts + 1
end
f:close()

-- if actual max word length is less than the limit, use that
max_word_l = math.min(max_word_l_tmp, max_word_l)

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(counts)
output_chars[split] = torch.LongTensor(counts, max_word_l):ones()

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.

output_chars[split] = torch.ones(counts, max_word_l):long()
print(output_chars[split]:size(2))
f = io.open(input_files[split], 'r')
local word_num = 0
for line in f:lines() do
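The counting pass now also tracks the longest word seen in the raw text (`max_word_l_tmp`) and caps it at the configured limit before the character tensor is preallocated. A small self-contained sketch of that first pass over one line, with made-up values (`'|'` stands in for `tokens.UNK`, 30 for the configured limit):
```
local stringx = require('pl.stringx')      -- Penlight, already used by the loader (see stringx.replace above)

local line = 'the quick <unk> fox'
line = stringx.replace(line, '<unk>', '|') -- '|' stands in for tokens.UNK

local counts, max_word_l_tmp = 0, 0
for word in line:gmatch'([^%s]+)' do
  counts = counts + 1
  max_word_l_tmp = math.max(max_word_l_tmp, word:len())
end
counts = counts + 1                        -- one extra slot per line for the EOS token

local max_word_l = math.min(max_word_l_tmp, 30)  -- 30 stands in for the configured limit
print(counts, max_word_l)                        -- 5   5
```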
@@ -181,7 +175,9 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
end
append(rword)
end
append("+")
if tokens.EOS ~= '' then
append(tokens.EOS)
end
end
end
print "done"
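The final hunk makes the end-of-sentence marker conditional: rather than always appending a literal '+', the loader appends `tokens.EOS` only when it is non-empty. This matches the `-EOS` help text in main.lua above ('+' for PTB, blank for other corpora), so, for example (invocation style borrowed from the README, other options omitted):
```
th main.lua -EOS '+'    # PTB-style data: mark the end of each sentence with '+'
th main.lua -EOS ''     # other corpora: do not append an end-of-sentence token
```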
