Commit
Merge remote branch 'srush/morpho'
yjernite committed Aug 19, 2015
2 parents a93e61e + 4d7a8a7 commit cb0c8dd
Showing 2 changed files with 80 additions and 6 deletions.
24 changes: 24 additions & 0 deletions morpho.py
@@ -0,0 +1,24 @@
import sys, os

out = open("/tmp/words", 'w')

data_dir = sys.argv[1]
morfessor_dir = sys.argv[2]

# dump every token in train/valid/test to a flat word list for Morfessor
for f in ["train.txt", "valid.txt", "test.txt"]:
    for l in open(data_dir + "/" + f):
        words = l.strip().split()
        for w in words:
            print >>out, w
out.close()  # flush the word list before Morfessor reads it

# segment the word list into morphemes with Morfessor 1.0
os.system("perl %s/bin/morfessor1.0.pl -data /tmp/words > /tmp/morph" % morfessor_dir)
f = open("/tmp/morph")
words = {}
for line in f:
    if line[0] == "#":  # skip Morfessor comment lines
        continue
    # drop the leading count column and the '+' separators between morphs
    word_parts = line.replace("+", "").strip().split()[1:]
    words["".join(word_parts)] = word_parts

# emit one "word factor1 factor2 ..." line per word type
for word, factors in words.iteritems():
    print word, " ".join(factors)
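The script writes its word/factor table to stdout, and the loader change below reads it from morpho.txt inside the data directory, so the output has to be redirected there. A minimal sketch of the intended pipeline (the corpus and Morfessor paths are assumptions, not part of the commit):

# Sketch only: produce data_dir/morpho.txt where BatchLoaderUnk expects it.
# "data/ptb" and "/opt/morfessor" are assumed example paths.
import subprocess

data_dir = "data/ptb"
morfessor_dir = "/opt/morfessor"
with open(data_dir + "/morpho.txt", "w") as out:
    subprocess.call(["python", "morpho.py", data_dir, morfessor_dir], stdout=out)
# each line of morpho.txt now looks like: "uncovered un cover ed" (hypothetical)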
62 changes: 56 additions & 6 deletions util/BatchLoaderUnk.lua
@@ -6,6 +6,8 @@ local BatchLoaderUnk = {}
local stringx = require('pl.stringx')
BatchLoaderUnk.__index = BatchLoaderUnk

max_factor_l = 5 -- maximum number of morpheme factors kept per word

function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
local self = {}
setmetatable(self, BatchLoaderUnk)
@@ -14,20 +16,25 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
local train_file = path.join(data_dir, 'train.txt')
local valid_file = path.join(data_dir, 'valid.txt')
local test_file = path.join(data_dir, 'test.txt')
local in_morpho_file = path.join(data_dir, 'morpho.txt')
local input_files = {train_file, valid_file, test_file}
local vocab_file = path.join(data_dir, 'vocab.t7')
local tensor_file = path.join(data_dir, 'data.t7')
local char_file = path.join(data_dir, 'data_char.t7')
local out_morpho_file = path.join(data_dir, 'morpho.t7')

-- construct a tensor with all the data
if not (path.exists(vocab_file) or path.exists(tensor_file) or path.exists(char_file)) then
print('one-time setup: preprocessing input train/valid/test files in dir: ' .. data_dir)
BatchLoaderUnk.text_to_tensor(input_files, in_morpho_file, vocab_file, tensor_file, char_file,
out_morpho_file, max_word_l)
end

print('loading data files...')
local all_data = torch.load(tensor_file) -- train, valid, test tensors
local all_data_char = torch.load(char_file) -- train, valid, test character indices
local all_data_morpho = torch.load(out_morpho_file) -- train, valid, test morpheme factor indices
local vocab_mapping = torch.load(vocab_file)
self.idx2word, self.word2idx, self.idx2char, self.char2idx = table.unpack(vocab_mapping)
self.vocab_size = #self.idx2word
@@ -51,13 +58,16 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
ydata:sub(1,-2):copy(data:sub(2,-1))
ydata[-1] = data[1]
local data_char = torch.zeros(data:size(1), self.max_word_l):long()
local data_morpho = torch.ones(data:size(1), max_factor_l):long() -- factor slots default to the ZEROPAD index
for i = 1, data:size(1) do
data_char[i] = self:expand(all_data_char[split][i]:totable())
data_morpho[i] = all_data_morpho[split][i]
end
if split < 3 then
x_batches = data:view(batch_size, -1):split(seq_length, 2)
y_batches = ydata:view(batch_size, -1):split(seq_length, 2)
x_char_batches = data_char:view(batch_size, -1, self.max_word_l):split(seq_length,2)
x_morpho_batches = data_morpho:view(batch_size, -1, max_factor_l):split(seq_length,2)
nbatches = #x_batches
self.split_sizes[split] = nbatches
assert(#x_batches == #y_batches)
@@ -66,10 +76,12 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
x_batches = {data:resize(1, data:size(1)):expand(batch_size, data:size(2))}
y_batches = {ydata:resize(1, ydata:size(1)):expand(batch_size, ydata:size(2))}
data_char = data_char:resize(1, data_char:size(1), data_char:size(2))
data_morpho = data_morpho:resize(1, data_morpho:size(1), data_morpho:size(2))
x_char_batches = {data_char:expand(batch_size, data_char:size(2), data_char:size(3))}
x_morpho_batches = {data_morpho:expand(batch_size, data_morpho:size(2), data_morpho:size(3))}
self.split_sizes[split] = 1
end
self.all_batches[split] = {x_batches, y_batches, x_char_batches, x_morpho_batches}
end
self.batch_idx = {0,0,0}
print(string.format('data load done. Number of batches in train: %d, val: %d, test: %d', self.split_sizes[1], self.split_sizes[2], self.split_sizes[3]))
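For orientation, a sketch (not from the commit) of the view-and-split batching used above, with NumPy standing in for Torch tensors; the new morpho batches mirror the character batches, with max_factor_l replacing max_word_l as the trailing dimension:

# Shape mechanics of data:view(batch_size, -1, k):split(seq_length, 2),
# written with NumPy as a stand-in; the sizes are assumed for illustration.
import numpy as np

batch_size, seq_length, max_factor_l = 20, 35, 5
nbatches = 3
data_morpho = np.ones((batch_size * nbatches * seq_length, max_factor_l), dtype=np.int64)
# view(batch_size, -1, max_factor_l): one row of factor indices per token
viewed = data_morpho.reshape(batch_size, -1, max_factor_l)
# split(seq_length, 2): chop the token axis into seq_length-wide batches
x_morpho_batches = np.split(viewed, viewed.shape[1] // seq_length, axis=1)
assert x_morpho_batches[0].shape == (batch_size, seq_length, max_factor_l)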
@@ -103,20 +115,51 @@ function BatchLoaderUnk:next_batch(split_idx)
return self.all_batches[split_idx][1][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx], self.all_batches[split_idx][4][idx] -- word, label, char, and morpho batches
end

function BatchLoaderUnk.text_to_tensor(input_files, morpho_file, out_vocabfile, out_tensorfile,
out_charfile, out_morphofile, max_word_l)
print('Processing text into tensors...')
local tokens = opt.tokens -- inherit global constants for tokens
local f, rawdata
local output_tensors = {} -- output tensors for train/val/test
local output_chars = {} -- output character tensors for train/val/test sets
local output_factors = {} -- output morpheme factor tensors for train/val/test sets
local vocab_count = {} -- vocab count
local max_word_l_tmp = 0 -- max word length of the corpus
local idx2word = {tokens.UNK} -- unknown word token
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local char2idx = {}; char2idx[tokens.ZEROPAD] = 1; char2idx[tokens.START] = 2; char2idx[tokens.END] = 3
local idx2factor = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local factor2idx = {}; factor2idx[tokens.ZEROPAD] = 1; factor2idx[tokens.START] = 2; factor2idx[tokens.END] = 3
local split_counts = {}

local morpho_dict = {} -- maps a word's vocab index to its tensor of factor indices

f = io.open(morpho_file, 'r')
for line in f:lines() do
local n = 0
local wordidx
for factor in line:gmatch'([^%s]+)' do
if n == 0 then
-- the first token on a line is the word itself
local word = factor
if word2idx[word] == nil then
idx2word[#idx2word + 1] = word
word2idx[word] = #idx2word
end
wordidx = word2idx[word]
morpho_dict[wordidx] = torch.ones(max_factor_l) -- factor slots default to the ZEROPAD index
else
-- the remaining tokens are the word's morpheme factors
if factor2idx[factor] == nil then
idx2factor[#idx2factor + 1] = factor
factor2idx[factor] = #idx2factor
end
if n <= max_factor_l then -- ignore factors beyond the max_factor_l limit
morpho_dict[wordidx][n] = factor2idx[factor]
end
end
n = n + 1
end
end
f:close()
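As a concrete illustration (not part of the commit), a short Python rendering of what the loop above builds from a single morpho.txt line; the word, its factors, and the special-token names are hypothetical:

max_factor_l = 5
factor2idx = {"<zeropad>": 1, "<start>": 2, "<end>": 3}  # assumed token names
idx2factor = ["<zeropad>", "<start>", "<end>"]

line = "uncovered un cover ed"  # hypothetical morpho.txt entry
parts = line.split()
word, factors = parts[0], parts[1:]
row = [1] * max_factor_l  # unused slots keep the ZEROPAD index, 1
for n, factor in enumerate(factors, start=1):
    if factor not in factor2idx:
        idx2factor.append(factor)
        factor2idx[factor] = len(idx2factor)
    if n <= max_factor_l:
        row[n - 1] = factor2idx[factor]
# row == [4, 5, 6, 1, 1]: three factor indices plus two padding slots

morpho_dict then maps the word's vocabulary index to this padded row of factor indices.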

-- first go through train/valid/test to get max word length
-- if actual max word length (e.g. 19 for PTB) is smaller than specified
-- we use that instead. this is inefficient, but only a one-off thing so should be fine
@@ -147,11 +190,12 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfile, out_charfile, max_word_l)
-- if actual max word length is less than the limit, use that
max_word_l = math.min(max_word_l_tmp, max_word_l)

for split = 1, 3 do -- split = 1 (train), 2 (val), or 3 (test)
-- Preallocate the tensors we will need.
-- Watch out: the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(split_counts[split])
output_chars[split] = torch.ones(split_counts[split], max_word_l):long()
output_factors[split] = torch.ones(split_counts[split], max_factor_l):long()

f = io.open(input_files[split], 'r')
local word_num = 0
Expand All @@ -170,13 +214,17 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
if string.sub(word,1,1) == tokens.UNK and word:len() > 1 then -- unk token with character info available
word = string.sub(word, 3)
output_tensors[split][word_num] = word2idx[tokens.UNK]
output_factors[split][word_num] = morpho_dict[word2idx[tokens.UNK]] or torch.ones(max_factor_l) -- pad-only row if UNK was never segmented
else
if word2idx[word]==nil then
idx2word[#idx2word + 1] = word -- create word-idx/idx-word mappings
word2idx[word] = #idx2word
end
output_tensors[split][word_num] = word2idx[word]
output_factors[split][word_num] = morpho_dict[word2idx[word]] or torch.ones(max_factor_l) -- pad-only row for words absent from the morpho file
end


for char in word:gmatch'.' do
if char2idx[char]==nil then
idx2char[#idx2char + 1] = char -- create char-idx/idx-char mappings
@@ -204,6 +252,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfile, out_charfile, max_word_l)
torch.save(out_tensorfile, output_tensors)
print('saving ' .. out_charfile)
torch.save(out_charfile, output_chars)
print('saving ' .. out_morphofile)
torch.save(out_morphofile, output_factors)
end

return BatchLoaderUnk