Commit
Merge remote branch 'srush/morpho'
yjernite committed Aug 19, 2015
2 parents a93e61e + 4d7a8a7 commit cb0c8dd
Showing 2 changed files with 80 additions and 6 deletions.
24 changes: 24 additions & 0 deletions morpho.py
@@ -0,0 +1,24 @@
import sys, os

out = open("/tmp/words", 'w')

data_dir = sys.argv[1]
morfessor_dir = sys.argv[2]

# dump every token in train/valid/test to a flat word list for Morfessor
for f in ["train.txt", "valid.txt", "test.txt"]:
    for l in open(data_dir + "/" + f):
        words = l.strip().split()
        for w in words:
            print >>out, w
out.close()  # flush the word list before Morfessor reads it

# segment the word list into morphemes with Morfessor 1.0
os.system("perl %s/bin/morfessor1.0.pl -data /tmp/words > /tmp/morph" % morfessor_dir)
f = open("/tmp/morph")
words = {}
for line in f:
    if line[0] == "#":  # skip Morfessor comment lines
        continue
    # drop the leading count column and the '+' separators between morphs
    word_parts = line.replace("+", "").strip().split()[1:]
    words["".join(word_parts)] = word_parts

# emit one "word factor1 factor2 ..." line per word type
for word, factors in words.iteritems():
    print word, " ".join(factors)
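The script writes its word/factor table to stdout, and the loader change below reads it from morpho.txt inside the data directory, so the output has to be redirected there. A minimal sketch of the intended pipeline (the corpus and Morfessor paths are assumptions, not part of the commit):

# Sketch only: produce data_dir/morpho.txt where BatchLoaderUnk expects it.
# "data/ptb" and "/opt/morfessor" are assumed example paths.
import subprocess

data_dir = "data/ptb"
morfessor_dir = "/opt/morfessor"
with open(data_dir + "/morpho.txt", "w") as out:
    subprocess.call(["python", "morpho.py", data_dir, morfessor_dir], stdout=out)
# each line of morpho.txt now looks like: "uncovered un cover ed" (hypothetical)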
62 changes: 56 additions & 6 deletions util/BatchLoaderUnk.lua
@@ -6,6 +6,8 @@ local BatchLoaderUnk = {}
local stringx = require('pl.stringx')
BatchLoaderUnk.__index = BatchLoaderUnk

max_factor_l = 5 -- maximum number of morpheme factors kept per word

function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
local self = {}
setmetatable(self, BatchLoaderUnk)
@@ -14,20 +16,25 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
local train_file = path.join(data_dir, 'train.txt')
local valid_file = path.join(data_dir, 'valid.txt')
local test_file = path.join(data_dir, 'test.txt')
local in_morpho_file = path.join(data_dir, 'morpho.txt')
local input_files = {train_file, valid_file, test_file}
local vocab_file = path.join(data_dir, 'vocab.t7')
local tensor_file = path.join(data_dir, 'data.t7')
local char_file = path.join(data_dir, 'data_char.t7')
local out_morpho_file = path.join(data_dir, 'morpho.t7')

-- construct a tensor with all the data
if not (path.exists(vocab_file) or path.exists(tensor_file) or path.exists(char_file)) then
print('one-time setup: preprocessing input train/valid/test files in dir: ' .. data_dir)
BatchLoaderUnk.text_to_tensor(input_files, in_morpho_file, vocab_file, tensor_file, char_file,
out_morpho_file, max_word_l)
end

print('loading data files...')
local all_data = torch.load(tensor_file) -- train, valid, test tensors
local all_data_char = torch.load(char_file) -- train, valid, test character indices
local all_data_morpho = torch.load(out_morpho_file) -- train, valid, test morpheme factor indices
local vocab_mapping = torch.load(vocab_file)
self.idx2word, self.word2idx, self.idx2char, self.char2idx = table.unpack(vocab_mapping)
self.vocab_size = #self.idx2word
@@ -51,13 +58,16 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
ydata:sub(1,-2):copy(data:sub(2,-1))
ydata[-1] = data[1]
local data_char = torch.zeros(data:size(1), self.max_word_l):long()
local data_morpho = torch.ones(data:size(1), max_factor_l):long() -- factor slots default to the ZEROPAD index
for i = 1, data:size(1) do
data_char[i] = self:expand(all_data_char[split][i]:totable())
data_morpho[i] = all_data_morpho[split][i]
end
if split < 3 then
x_batches = data:view(batch_size, -1):split(seq_length, 2)
y_batches = ydata:view(batch_size, -1):split(seq_length, 2)
x_char_batches = data_char:view(batch_size, -1, self.max_word_l):split(seq_length,2)
x_morpho_batches = data_morpho:view(batch_size, -1, max_factor_l):split(seq_length,2)
nbatches = #x_batches
self.split_sizes[split] = nbatches
assert(#x_batches == #y_batches)
@@ -66,10 +76,12 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_word_l)
x_batches = {data:resize(1, data:size(1)):expand(batch_size, data:size(2))}
y_batches = {ydata:resize(1, ydata:size(1)):expand(batch_size, ydata:size(2))}
data_char = data_char:resize(1, data_char:size(1), data_char:size(2))
data_morpho = data_morpho:resize(1, data_morpho:size(1), data_morpho:size(2))
x_char_batches = {data_char:expand(batch_size, data_char:size(2), data_char:size(3))}
x_morpho_batches = {data_morpho:expand(batch_size, data_morpho:size(2), data_morpho:size(3))}
self.split_sizes[split] = 1
end
self.all_batches[split] = {x_batches, y_batches, x_char_batches, x_morpho_batches}
end
self.batch_idx = {0,0,0}
print(string.format('data load done. Number of batches in train: %d, val: %d, test: %d', self.split_sizes[1], self.split_sizes[2], self.split_sizes[3]))
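For orientation, a sketch (not from the commit) of the view-and-split batching used above, with NumPy standing in for Torch tensors; the new morpho batches mirror the character batches, with max_factor_l replacing max_word_l as the trailing dimension:

# Shape mechanics of data:view(batch_size, -1, k):split(seq_length, 2),
# written with NumPy as a stand-in; the sizes are assumed for illustration.
import numpy as np

batch_size, seq_length, max_factor_l = 20, 35, 5
nbatches = 3
data_morpho = np.ones((batch_size * nbatches * seq_length, max_factor_l), dtype=np.int64)
# view(batch_size, -1, max_factor_l): one row of factor indices per token
viewed = data_morpho.reshape(batch_size, -1, max_factor_l)
# split(seq_length, 2): chop the token axis into seq_length-wide batches
x_morpho_batches = np.split(viewed, viewed.shape[1] // seq_length, axis=1)
assert x_morpho_batches[0].shape == (batch_size, seq_length, max_factor_l)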
@@ -103,20 +115,51 @@ function BatchLoaderUnk:next_batch(split_idx)
return self.all_batches[split_idx][1][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx], self.all_batches[split_idx][4][idx] -- word, label, char, and morpho batches
end

function BatchLoaderUnk.text_to_tensor(input_files, morpho_file, out_vocabfile, out_tensorfile,
out_charfile, out_morphofile, max_word_l)
print('Processing text into tensors...')
local tokens = opt.tokens -- inherit global constants for tokens
local f, rawdata
local output_tensors = {} -- output tensors for train/val/test
local output_chars = {} -- output character tensors for train/val/test sets
local output_factors = {} -- output morpheme factor tensors for train/val/test sets
local vocab_count = {} -- vocab count
local max_word_l_tmp = 0 -- max word length of the corpus
local idx2word = {tokens.UNK} -- unknown word token
local word2idx = {}; word2idx[tokens.UNK] = 1
local idx2char = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local char2idx = {}; char2idx[tokens.ZEROPAD] = 1; char2idx[tokens.START] = 2; char2idx[tokens.END] = 3
local idx2factor = {tokens.ZEROPAD, tokens.START, tokens.END} -- zero-pad, start-of-word, end-of-word tokens
local factor2idx = {}; factor2idx[tokens.ZEROPAD] = 1; factor2idx[tokens.START] = 2; factor2idx[tokens.END] = 3
local split_counts = {}

local morpho_dict = {} -- maps a word's vocab index to its tensor of factor indices

f = io.open(morpho_file, 'r')
for line in f:lines() do
local n = 0
local wordidx
for factor in line:gmatch'([^%s]+)' do
if n == 0 then
-- the first token on a line is the word itself
local word = factor
if word2idx[word] == nil then
idx2word[#idx2word + 1] = word
word2idx[word] = #idx2word
end
wordidx = word2idx[word]
morpho_dict[wordidx] = torch.ones(max_factor_l) -- factor slots default to the ZEROPAD index
else
-- the remaining tokens are the word's morpheme factors
if factor2idx[factor] == nil then
idx2factor[#idx2factor + 1] = factor
factor2idx[factor] = #idx2factor
end
if n <= max_factor_l then -- ignore factors beyond the max_factor_l limit
morpho_dict[wordidx][n] = factor2idx[factor]
end
end
n = n + 1
end
end
f:close()
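As a concrete illustration (not part of the commit), a short Python rendering of what the loop above builds from a single morpho.txt line; the word, its factors, and the special-token names are hypothetical:

max_factor_l = 5
factor2idx = {"<zeropad>": 1, "<start>": 2, "<end>": 3}  # assumed token names
idx2factor = ["<zeropad>", "<start>", "<end>"]

line = "uncovered un cover ed"  # hypothetical morpho.txt entry
parts = line.split()
word, factors = parts[0], parts[1:]
row = [1] * max_factor_l  # unused slots keep the ZEROPAD index, 1
for n, factor in enumerate(factors, start=1):
    if factor not in factor2idx:
        idx2factor.append(factor)
        factor2idx[factor] = len(idx2factor)
    if n <= max_factor_l:
        row[n - 1] = factor2idx[factor]
# row == [4, 5, 6, 1, 1]: three factor indices plus two padding slots

morpho_dict then maps the word's vocabulary index to this padded row of factor indices.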

-- first go through train/valid/test to get max word length
-- if actual max word length (e.g. 19 for PTB) is smaller than specified
-- we use that instead. this is inefficient, but only a one-off thing so should be fine
@@ -147,11 +190,12 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfile, out_charfile, max_word_l)
-- if actual max word length is less than the limit, use that
max_word_l = math.min(max_word_l_tmp, max_word_l)

for split = 1, 3 do -- split = 1 (train), 2 (val), or 3 (test)
-- Preallocate the tensors we will need.
-- Watch out: the second one needs a lot of RAM.
output_tensors[split] = torch.LongTensor(split_counts[split])
output_chars[split] = torch.ones(split_counts[split], max_word_l):long()
output_factors[split] = torch.ones(split_counts[split], max_factor_l):long()

f = io.open(input_files[split], 'r')
local word_num = 0
Expand All @@ -170,13 +214,17 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
if string.sub(word,1,1) == tokens.UNK and word:len() > 1 then -- unk token with character info available
word = string.sub(word, 3)
output_tensors[split][word_num] = word2idx[tokens.UNK]
output_factors[split][word_num] = morpho_dict[word2idx[tokens.UNK]] or torch.ones(max_factor_l) -- pad-only row if UNK was never segmented
else
if word2idx[word]==nil then
idx2word[#idx2word + 1] = word -- create word-idx/idx-word mappings
word2idx[word] = #idx2word
end
output_tensors[split][word_num] = word2idx[word]
output_factors[split][word_num] = morpho_dict[word2idx[word]] or torch.ones(max_factor_l) -- pad-only row for words absent from the morpho file
end


for char in word:gmatch'.' do
if char2idx[char]==nil then
idx2char[#idx2char + 1] = char -- create char-idx/idx-char mappings
@@ -204,6 +252,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfile, out_charfile, max_word_l)
torch.save(out_tensorfile, output_tensors)
print('saving ' .. out_charfile)
torch.save(out_charfile, output_chars)
print('saving ' .. out_morphofile)
torch.save(out_morphofile, output_factors)
end

return BatchLoaderUnk