
Commit: readme
Yoon Kim committed Aug 17, 2015
1 parent d72d88b commit 11fd291
Showing 4 changed files with 38 additions and 13 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -1,8 +1,8 @@
## Neural Language Modeling with Characters
A neural language model (NLM) built on character inputs only.
-The model employs a convolutional neural network (CNN) over character
-embeddings to use as inputs into an long short-term memory (LSTM)
-recurrent neural network language model (RNNLM). Also optionally
+The model employs a convolutional neural network (CNN) over characters
+to use as inputs into an long short-term memory (LSTM)
+recurrent neural network language model (RNN-LM). Also optionally
passes the output from the CNN through a [Highway Network](http://arxiv.org/abs/1507.06228),
which improves performance.

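For reference, a highway layer of the kind mentioned above gates its own input: a sigmoid "transform" gate t decides how much of a nonlinear transform of the input passes through, and (1 - t) of the raw input is carried over unchanged. A minimal `nngraph` sketch is below; the ReLU nonlinearity and the `highway_layer` name are illustrative assumptions, not necessarily what this repo implements.

```lua
-- Sketch of one highway layer: y = t * g(W_h x + b_h) + (1 - t) * x,
-- with transform gate t = sigmoid(W_t x + b_t).
require 'nn'
require 'nngraph'

local function highway_layer(size)
  local x = nn.Identity()()
  local t = nn.Sigmoid()(nn.Linear(size, size)(x))           -- transform gate
  local carry = nn.AddConstant(1)(nn.MulConstant(-1)(t))     -- 1 - t
  local h = nn.ReLU()(nn.Linear(size, size)(x))              -- candidate transform g
  local y = nn.CAddTable()({ nn.CMulTable()({t, h}),
                             nn.CMulTable()({carry, x}) })
  return nn.gModule({x}, {y})
end
```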
@@ -12,6 +12,12 @@ will be posted on arXiv very soon.
Much of the base code is from Andrej Karpathy's excellent character RNN implementation,
available at https://github.com/karpathy/char-rnn

+Also, the repo name 'word-char-rnn' is bit of a misnomer, as the primary motivation
+is to use character-level inputs only. But as a baseline we implemented the
+word-level models (and also experimented with models whereby the input
+is a concatenation of the word embedding and the output from a character CNN),
+hence the name.

### Requirements
Code is written in Lua and requires Torch. It additionally requires
the `nngraph` and `optim` packages, which can be installed via:
@@ -36,7 +42,7 @@ Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10
widely used by the language modeling community) is given as the default.

### Model
-Here are some example scripts. Add `-gpuid 0` to use a GPU (which is
+Here are some example scripts. Add `-gpuid 0` to each line to use a GPU (which is
required to get any reasonable speed with the CNN)

Large character-level model (`LSTM-CharCNN-Large` in the paper).
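To make the `-gpuid 0` note concrete (the example script lines themselves are collapsed in this hunk), a GPU run might be launched roughly as follows; only `-gpuid`, `-EOS`, and `-time` appear in this commit's diffs, and any other flags the actual scripts pass are not shown here.

```
th main.lua -gpuid 0 -EOS '+' -time 1
```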
2 changes: 1 addition & 1 deletion main.lua
@@ -61,7 +61,7 @@ cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid c
cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
-- GPU/CPU
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
-cmd:option('-time', 1, 'print batch times')
+cmd:option('-time', 0, 'print batch times')
cmd:text()

-- parse input params
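The two options above follow Torch's standard `CmdLine` pattern; they end up in an `opt` table roughly as sketched below (a generic illustration of the pattern, not this repo's exact code):

```lua
-- Generic torch.CmdLine usage mirroring the cmd:option(...) calls above.
require 'torch'

local cmd = torch.CmdLine()
cmd:option('-gpuid', -1, 'which gpu to use. -1 = use CPU')
cmd:option('-time', 0, 'print batch times')
cmd:text()
local opt = cmd:parse(arg)          -- arg holds the command-line arguments

if opt.gpuid >= 0 then              -- the usual Torch idiom for switching to GPU
  require 'cutorch'
  require 'cunn'
  cutorch.setDevice(opt.gpuid + 1)  -- Torch device ids are 1-indexed
end
```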
4 changes: 3 additions & 1 deletion model/TDNN.lua
@@ -23,7 +23,9 @@ function TDNN.tdnn(length, input_size, feature_maps, kernels)
local conv = nn.TemporalConvolution(input_size, feature_maps[i], kernels[i])
local conv_layer = conv(input)
conv.name = 'conv_filter_' .. kernels[i] .. '_' .. feature_maps[i]
-pool_layer = nn.Max(2)(nn.Tanh()(conv_layer))
+--pool_layer = nn.Max(2)(nn.Tanh()(conv_layer))
+pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+pool_layer = nn.Squeeze()(pool_layer)
else
-- Use CuDNN for temporal convolution.
if not cudnn then require 'cudnn' end
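The change above replaces `nn.Max(2)` with a `TemporalMaxPooling` window spanning the whole convolved sequence, followed by `nn.Squeeze` to drop the resulting singleton time dimension; both amount to max-over-time pooling of each feature map. A self-contained sketch of one such convolution branch follows, assuming (as is standard for `nn.TemporalConvolution` with stride 1) that `reduced_l = length - kernel_w + 1`; the concrete sizes are made up for illustration.

```lua
-- One character-CNN filter width with max-over-time pooling (illustrative sketch).
require 'nn'
require 'nngraph'

local length, input_size, n_maps, kernel_w = 35, 15, 100, 5
local reduced_l = length - kernel_w + 1          -- frames remaining after convolution

local input = nn.Identity()()                    -- batch x length x input_size
local conv  = nn.TemporalConvolution(input_size, n_maps, kernel_w)(input)
-- take the max over all reduced_l time steps, then drop the singleton time dim
local pool  = nn.Squeeze()(nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv)))
local net   = nn.gModule({input}, {pool})

local out = net:forward(torch.randn(20, length, input_size))
print(out:size())                                -- 20 x n_maps
```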
31 changes: 24 additions & 7 deletions util/BatchLoaderUnk.lua
@@ -115,6 +115,24 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local char2idx = {}; char2idx[tokens.ZEROPAD] = 1; char2idx[tokens.START] = 2; char2idx[tokens.END] = 3

+-- first go through train/valid/test to get max word length
+-- if actual max word length (e.g. 19 for PTB) is smaller than specified
+-- we use that instead. this is inefficient, but only a one-off thing
+for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
+f = io.open(input_files[split], 'r')
+for line in f:lines() do
+for word in line:gmatch'([^%s]+)' do
+max_word_l_tmp = math.max(max_word_l_tmp, word:len())
+end
+end
+f:close()
+end
+
+print('After first pass of data, max word length is: ' .. max_word_l_tmp)
+-- if actual max word length is less than the limit, use that
+max_word_l = math.min(max_word_l_tmp, max_word_l)

for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
output = {}
output_char = {}
@@ -123,26 +141,25 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
local counts = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
+line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
+line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
for word in line:gmatch'([^%s]+)' do
-counts = counts + 1
-max_word_l_tmp = math.max(max_word_l_tmp, word:len())
-end
+counts = counts + 1
+end
f:close()

--- if actual max word length is less than the limit, use that
-max_word_l = math.min(max_word_l_tmp, max_word_l)

-- Next preallocate the tensors we will need.
-- Watch out the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(counts)
output_chars[split] = torch.ones(counts, max_word_l):long()
print(output_chars[split]:size(2))
f = io.open(input_files[split], 'r')
local word_num = 0
for line in f:lines() do
line = stringx.replace(line, '<unk>', tokens.UNK)
line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
line = stringx.replace(line, tokens.END, '')
for rword in line:gmatch'([^%s]+)' do
function append(word)
word_num = word_num + 1
@@ -175,8 +192,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
end
append(rword)
end
-if tokens.EOS ~= '' then
-append(tokens.EOS)
+if tokens.EOS ~= '' then --PTB does not have <eos> so we add a character for <eos> tokens
+append(tokens.EOS) --other datasets with periods or <eos> already present do not need this
end
end
end
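The preallocation above, `torch.ones(counts, max_word_l)`, makes index 1 (the zero-padding character) the default fill for every row; `append` then writes each word's start-of-word marker, character indices, and end-of-word marker into its row. The body of `append` is collapsed in this diff, so the per-word sketch below is an assumption about the layout rather than the repo's actual code.

```lua
-- Hypothetical sketch: turn one word into a row of the (counts x max_word_l) char tensor.
-- Indices 1, 2, 3 are the zero-pad, start-of-word, and end-of-word tokens, as set up
-- in char2idx above; char2idx is assumed to already hold every character in the data.
local function word_to_chars(word, tokens, char2idx, max_word_l)
  local chars = torch.LongTensor(max_word_l):fill(1)    -- 1 = zero-pad everywhere
  chars[1] = char2idx[tokens.START]                     -- start-of-word marker
  local limit = math.min(word:len(), max_word_l - 2)    -- leave room for the two markers
  for i = 1, limit do
    chars[i + 1] = char2idx[word:sub(i, i)]
  end
  chars[limit + 2] = char2idx[tokens.END]               -- end-of-word marker
  return chars
end
```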
