
Commit

Merge remote branch 'yoonkim/master'
Conflicts:
	main.lua
yjernite committed Aug 20, 2015
2 parents 99a9907 + 6446794 commit 6d66160
Showing 5 changed files with 69 additions and 30 deletions.
35 changes: 21 additions & 14 deletions README.md
@@ -1,23 +1,16 @@
## Neural Language Modeling with Characters
## Character-Aware Neural Language Models
A neural language model (NLM) built on character inputs only. Predictions
are still made at the word-level. The model employs a convolutional neural network (CNN) over characters,
whose output is used as the input to a long short-term memory (LSTM)
recurrent neural network language model (RNN-LM). Optionally, the CNN output is first
passed through a [Highway Network](http://arxiv.org/abs/1507.06228),
which improves performance.
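For intuition, here is a minimal sketch of a single highway layer in Torch/`nngraph` (an illustration only, assuming `nn` and `nngraph` are installed; the repository's own `model/HighwayMLP.lua` may differ in details such as the nonlinearity and gate bias):
```
-- Sketch of one highway layer: y = t .* g(Wx + b) + (1 - t) .* x
-- (illustrative; the size and nonlinearities here are assumptions)
require 'nn'
require 'nngraph'

local function highway_layer(size)
  local x = nn.Identity()()
  local t = nn.Sigmoid()(nn.Linear(size, size)(x))        -- transform gate
  local h = nn.ReLU()(nn.Linear(size, size)(x))           -- candidate output g(Wx + b)
  local carry = nn.AddConstant(1)(nn.MulConstant(-1)(t))  -- 1 - t
  local y = nn.CAddTable()({nn.CMulTable()({t, h}),
                            nn.CMulTable()({carry, x})})
  return nn.gModule({x}, {y})
end

print(highway_layer(100):forward(torch.randn(100)):size())
```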

Note: Code is messy/experimental. Cleaner (and faster) code coming. Paper
will be posted on arXiv very soon.
Note: Paper will be posted on arXiv very soon.

Much of the base code is from Andrej Karpathy's excellent character RNN implementation,
available at https://github.com/karpathy/char-rnn

Also, the repo name 'word-char-rnn' is a bit of a misnomer, as the primary motivation
is to use character-level inputs only. But as a baseline we implemented the
word-level models (and also experimented with models where the input
is a concatenation of the word embedding and the output from a character CNN),
hence the name.

### Requirements
Code is written in Lua and requires Torch. It also requires
the `nngraph` and `optim` packages, which can be installed via:
@@ -45,9 +38,23 @@ Data should be put into the `data/` directory, split into `train.txt`,
`valid.txt`, and `test.txt`

Each line of the .txt file should be a sentence. The English Penn
Treebank data (Tomas Mikolov's pre-processed version with vocab size equal to 10K,
Treebank (PTB) data (Tomas Mikolov's pre-processed version with vocab size equal to 10K,
widely used by the language modeling community) is given as the default.

The paper also runs the models on non-English data (Czech, French, German, Russian, and Spanish) from the ICML 2014
paper [Compositional Morphology for Word Representations and Language Modelling](http://arxiv.org/abs/1405.4273)
by Jan Botha and Phil Blunsom. The data can be downloaded from [Jan's website](https://bothameister.github.io).
We also provide a script that downloads the data and saves it in the relevant folders (see `get_data.sh`).

#### Note on PTB
The PTB data above does not have end-of-sentence tokens for each sentence, and hence these must be
appended manually. This can be done by adding `-EOS '+'` to the training command (you can of course
use a character other than `+` to represent the end-of-sentence token---we recommend a single
unused character).

Jan's datasets already have end-of-sentence tokens for each line, so you do not need to
add the `-EOS` flag (this is equivalent to `-EOS ''`, which is the default).
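As an illustration of what the flag amounts to (assumed behavior, simplified from the actual data loader), each line simply gets the chosen symbol appended before the words are indexed:
```
-- Assumed effect of -EOS '+': append the end-of-sentence symbol to every line.
local eos = '+'
local line = 'the cat sat on the mat'
if eos ~= '' then
  line = line .. ' ' .. eos
end
print(line)  --> the cat sat on the mat +
```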

### Model
Here are some example scripts. Add `-gpuid 0` to each line to use a GPU (which is
required to get any reasonable speed with the CNN), and `-cudnn 1` to use the
@@ -57,27 +64,27 @@ cudnn package.
Large character-level model (LSTM-CharCNN-Large in the paper).
This is the default: it should get a perplexity of ~82 on valid and ~79 on test.
```
th main.lua -savefile char-large
th main.lua -savefile char-large -EOS '+'
```
Small character-level model (LSTM-CharCNN-Small in the paper).
This should get ~96 on valid and ~93 on test.
```
th main.lua -savefile char-small -rnn_size 300 -highway_layers 1
-kernels '{1,2,3,4,5,6}' -feature_maps '{25,50,75,100,125,150}'
-kernels '{1,2,3,4,5,6}' -feature_maps '{25,50,75,100,125,150}' -EOS '+'
```

#### Word-level models
Large word-level model (LSTM-Word-Large in the paper).
This should get ~89 on valid and ~85 on test.
```
th main.lua -savefile word-large -word_vec_size 650 -highway_layers 0
-use_chars 0 -use_words 1
-use_chars 0 -use_words 1 -EOS '+'
```
Small word-level model (LSTM-Word-Small in the paper).
This should get ~101 on valid and ~98 on test.
```
th main.lua -savefile word-small -word_vec_size 200 -highway_layers 0
-use_chars 0 -use_words 1 -rnn_size 200
-use_chars 0 -use_words 1 -rnn_size 200 -EOS '+'
```

#### Combining both
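The collapsed section above presumably refers to the word+character models mentioned earlier, where the LSTM input is the concatenation of a word embedding and the character-CNN output. A minimal sketch of that concatenation (sizes are placeholders, not the paper's settings):
```
-- Illustration only: concatenate a word embedding with character-CNN features.
require 'nn'

local word_vec_size, char_feat_size = 200, 525
local word_emb   = torch.randn(word_vec_size)   -- stand-in for a LookupTable row
local char_feats = torch.randn(char_feat_size)  -- stand-in for the CharCNN output
local joined = nn.JoinTable(1):forward({word_emb, char_feats})
print(joined:size(1))  --> 725
```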
17 changes: 12 additions & 5 deletions evaluate.lua
@@ -16,9 +16,6 @@ require 'util.misc'

BatchLoader = require 'util.BatchLoaderUnk'
model_utils = require 'util.model_utils'
HighwayMLP = require 'model.HighwayMLP'
TDNN = require 'model.TDNN'
LSTMTDNN = require 'model.LSTMTDNN'

local stringx = require('pl.stringx')

@@ -32,7 +29,6 @@ cmd:option('-model', 'final-results/en-large-word-model.t7', 'model checkpoint f
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
cmd:text()


-- parse input params
opt2 = cmd:parse(arg)
if opt2.gpuid >= 0 then
@@ -41,6 +37,17 @@ if opt2.gpuid >= 0 then
require 'cunn'
cutorch.setDevice(opt2.gpuid + 1)
end

if opt.cudnn == 1 then
assert(opt2.gpuid >= 0, 'GPU must be used if using cudnn')
print('using cudnn')
require 'cudnn'
end

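-- load the model classes here, after the cudnn block above, because of cudnn
-- (see the matching comment in main.lua)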
HighwayMLP = require 'model.HighwayMLP'
TDNN = require 'model.TDNN'
LSTMTDNN = require 'model.LSTMTDNN'

checkpoint = torch.load(opt2.model)
opt = checkpoint.opt
protos = checkpoint.protos
@@ -131,4 +138,4 @@ test_results.vocab = {idx2word, word2idx, idx2char, char2idx}
test_results.opt = opt
test_results.val_losses = checkpoint.val_losses
torch.save(opt2.savefile, test_results)
collectgarbage()
collectgarbage()
29 changes: 29 additions & 0 deletions get_data.sh
@@ -0,0 +1,29 @@
#!/bin/bash
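# Fetch the preprocessed ICML 2014 datasets (Botha & Blunsom) and put each
# language's splits under data/<lang>/ as train.txt, valid.txt, and test.txt,
# the layout that main.lua expects.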

wget https://github.com/bothameister/bothameister.github.io/raw/master/icml14-data.tar.bz2
tar xf icml14-data.tar.bz2

mkdir data/de/
cp en-de/1m-mono/train.in data/de/train.txt
cp en-de/1m-mono/test.in data/de/valid.txt
cp en-de/1m-mono/finaltest.in data/de/test.txt

mkdir data/es/
cp en-es/1m-mono/train.in data/es/train.txt
cp en-es/1m-mono/test.in data/es/valid.txt
cp en-es/1m-mono/finaltest.in data/es/test.txt

mkdir data/cs/
cp en-cs/1m-mono/train.in data/cs/train.txt
cp en-cs/1m-mono/test.in data/cs/valid.txt
cp en-cs/1m-mono/finaltest.in data/cs/test.txt

mkdir data/fr/
cp en-fr/1m-mono/train.in data/fr/train.txt
cp en-fr/1m-mono/test.in data/fr/valid.txt
cp en-fr/1m-mono/finaltest.in data/fr/test.txt

mkdir data/ru/
cp en-ru/1m-mono/train.in data/ru/train.txt
cp en-ru/1m-mono/test.in data/ru/valid.txt
cp en-ru/1m-mono/finaltest.in data/ru/test.txt
2 changes: 1 addition & 1 deletion introspect.lua
@@ -25,7 +25,7 @@ cmd:text('Perform model introspection')
cmd:text()
cmd:text('Options')
-- data
cmd:option('-model','final-results/en-large-model.t7', 'model file')
cmd:option('-model','final-results/en-large-word-model.t7', 'model file')
cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU')
cmd:option('-savefile', 'chargrams.tsv', 'save max chargrams to')
cmd:text()
16 changes: 6 additions & 10 deletions main.lua
@@ -1,5 +1,6 @@
--[[
Trains a word+character-level multi-layer rnn language model
Trains a word-level or character-level (for inputs) lstm language model
Predictions are still made at the word-level.
Much of the code is borrowed from the following implementations
https://github.com/karpathy/char-rnn
@@ -52,10 +53,10 @@ cmd:option('-threads', 16, 'number of threads')
-- bookkeeping
cmd:option('-seed',3435,'torch manual random number generator seed')
cmd:option('-print_every',100,'how many steps/minibatches between printing out the loss')
cmd:option('-checkpoint_dir', 'cv-ptb', 'output directory where checkpoints get written')
cmd:option('-checkpoint_dir', 'cv', 'output directory where checkpoints get written')
cmd:option('-savefile','char','filename to autosave the checkpoint to. Will be inside checkpoint_dir/')
cmd:option('-checkpoint', 'checkpoint.t7', 'start from a checkpoint if a valid checkpoint.t7 file is given')
cmd:option('-EOS', '+', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
cmd:option('-EOS', '', '<EOS> symbol. should be a single unused character (like +) for PTB and blank for others')
-- GPU/CPU
cmd:option('-gpuid',-1,'which gpu to use. -1 = use CPU')
cmd:option('-cudnn', 0,'use cudnn (1=yes). this should greatly speed up convolutions')
@@ -87,7 +88,8 @@ if opt.cudnn == 1 then
require 'cudnn'
end

-- load models
-- load models. we do this here instead of before
-- because of cudnn
TDNN = require 'model.TDNN'
LSTMTDNN = require 'model.LSTMTDNN'
HighwayMLP = require 'model.HighwayMLP'
@@ -362,12 +364,6 @@ for i = 1, iterations do
end
end

some_function()
another_function()
coroutine.resume( some_coroutine )
ProFi:stop()
ProFi:writeReport( 'MyProfilingReport.txt' )

--evaluate on full test set. this just uses the model from the last epoch
--rather than best-performing model. it is also incredibly inefficient
--because of batch size issues. for faster evaluation, use evaluate.lua, i.e.
