diff --git a/main/src/main/python/embeddings/wordEmbeddingMap.py b/main/src/main/python/embeddings/wordEmbeddingMap.py
index a82c2108a..327b4befc 100644
--- a/main/src/main/python/embeddings/wordEmbeddingMap.py
+++ b/main/src/main/python/embeddings/wordEmbeddingMap.py
@@ -1,29 +1,29 @@
 import numpy as np
+import math
 
 class WordEmbeddingMap:
     def __init__(self, config):
-        self.emb_dict = self.load(config)
-        self.dim = self.emb_dict.shape[-1]
-
-    def load(self):
-        emb_matrix = None
-        emb_dict = dict()
-        for line in open(config.get_string("glove.matrixResourceName")):
-            if not len(line.split()) == 2:
-                if "\t" in line:
-                    delimiter = "\t"
-                else:
-                    delimiter = " "
-                line_split = line.rstrip().split(delimiter)
-                # extract word and vector
-                word = line_split[0]
-                x = np.array([float(i) for i in line_split[1:]])
-                vector = (x /np.linalg.norm(x))
-                embedding_size = vector.shape[0]
-                emb_dict[word] = vector
-        base = math.sqrt(6/embedding_size)
-        emb_dict[""] = np.random.uniform(-base,base,(embedding_size))
-        return emb_dict
+        self.emb_dict, self.dim = load(config)
 
     def isOutOfVocabulary(self, word):
-        return word not in self.emb_dict
\ No newline at end of file
+        return word not in self.emb_dict
+
+def load(config):
+    emb_matrix = None
+    emb_dict = dict()
+    for line in open(config.get_string("glove.matrixResourceName")):
+        if not len(line.split()) == 2:
+            if "\t" in line:
+                delimiter = "\t"
+            else:
+                delimiter = " "
+            line_split = line.rstrip().split(delimiter)
+            # extract word and vector
+            word = line_split[0]
+            x = np.array([float(i) for i in line_split[1:]])
+            vector = (x /np.linalg.norm(x))
+            embedding_size = vector.shape[0]
+            emb_dict[word] = vector
+    base = math.sqrt(6/embedding_size)
+    emb_dict[""] = np.random.uniform(-base,base,(embedding_size))
+    return emb_dict, embedding_size
\ No newline at end of file
diff --git a/main/src/main/python/pytorch/constEmbeddingsGlove.py b/main/src/main/python/pytorch/constEmbeddingsGlove.py
index be32c2f39..52547b6a8 100644
--- a/main/src/main/python/pytorch/constEmbeddingsGlove.py
+++ b/main/src/main/python/pytorch/constEmbeddingsGlove.py
@@ -1,16 +1,18 @@
 from dataclasses import dataclass
 import torch.nn as nn
 from embeddings.wordEmbeddingMap import *
+from pyhocon import ConfigFactory
 
 @dataclass
 class ConstEmbeddingParameters:
     emb: nn.Embedding
     w2i: dict
 
-def ConstEmbeddingsGlove:
+class _ConstEmbeddingsGlove:
     def __init__(self):
         self.SINGLETON_WORD_EMBEDDING_MAP = None
-        self.load('../resources/org/clulab/glove.conf')
+        config = ConfigFactory.parse_file('../resources/org/clulab/glove.conf')
+        self.load(config)
         self.dim = self.SINGLETON_WORD_EMBEDDING_MAP.dim
 
     def load(self, config):
@@ -25,3 +27,5 @@ def mkConstLookupParams(self, words):
         emd = nn.Embedding.from_pretrained(weight)
         emd.weight.requires_grad=False
         return ConstEmbeddingParameters(emb ,w2i)
+
+ConstEmbeddingsGlove = _ConstEmbeddingsGlove()
diff --git a/main/src/main/python/pytorch/embeddingLayer.py b/main/src/main/python/pytorch/embeddingLayer.py
index a506bfa02..2ebea6f7b 100644
--- a/main/src/main/python/pytorch/embeddingLayer.py
+++ b/main/src/main/python/pytorch/embeddingLayer.py
@@ -1,8 +1,9 @@
-from initialLayer import InitialLayer
+from pytorch.initialLayer import InitialLayer
 import random
-from utils import *
+from pytorch.utils import *
 import torch.nn as nn
 import torch
+from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove
 
 DEFAULT_DROPOUT_PROB: float = DEFAULT_DROPOUT_PROBABILITY
 DEFAULT_LEARNED_WORD_EMBEDDING_SIZE: int = 128
@@ -16,7 +17,7 @@
 DEFAULT_USE_IS_PREDICATE: int = -1
 
 class EmbeddingLayer(InitialLayer):
-    def __init__(w2i, # word to index
+    def __init__(self, w2i, # word to index
                  w2f, # word to frequency
                  c2i, # character to index
                  tag2i, # POS tag to index
@@ -68,7 +69,7 @@ def __init__(w2i, # word to index
         positionDim = 1 if distanceLookupParameters and useIsPredicate else 0
         predicateDim = positionEmbeddingSize if positionLookupParameters else 0
 
-        self.outDim = ConstEmbeddingsGlove().dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim
+        self.outDim = ConstEmbeddingsGlove.dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim
         random.seed(RANDOM_SEED)
 
     def forward(self, sentence, constEmbeddings, doDropout):
@@ -81,9 +82,9 @@ def forward(self, sentence, constEmbeddings, doDropout):
         # const word embeddings such as GloVe
         constEmbeddingsExpressions = self.mkConstEmbeddings(words, constEmbeddings)
         assert(constEmbeddingsExpressions.size(0) == len(words))
-        if(tags) assert(len(tags) == len(words))
-        if(nes) assert(len(nes) == len(words))
-        if(headPositions) assert(len(headPositions) == len(words))
+        if(tags): assert(len(tags) == len(words))
+        if(nes): assert(len(nes) == len(words))
+        if(headPositions): assert(len(headPositions) == len(words))
 
         # build the word embeddings one by one
         embeddings = self.mkEmbeddings(words, constEmbeddingsExpressions, tags, nes, headPositions)
@@ -250,7 +251,7 @@ def load(cls, x2i):
 
     @classmethod
     def initialize(cls, config, paramPrefix, wordCounter):
-        if(not config.__contains__(paramPrefix)):
+        if(not config.contains(paramPrefix)):
            return None
 
        learnedWordEmbeddingSize = config.get_int(paramPrefix + ".learnedWordEmbeddingSize",DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
@@ -260,9 +261,9 @@ def initialize(cls, config, paramPrefix, wordCounter):
        neTagEmbeddingSize = config.get_int(paramPrefix + ".neTagEmbeddingSize",DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = config.get_int(paramPrefix + ".distanceEmbeddingSize",DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = config.get_int(paramPrefix + ".distanceWindowSize",DEFAULT_DISTANCE_WINDOW_SIZE)
-       useIsPredicate = config.getArgBoolean(paramPrefix + ".useIsPredicate",DEFAULT_USE_IS_PREDICATE == 1)
+       useIsPredicate = config.get_bool(paramPrefix + ".useIsPredicate",DEFAULT_USE_IS_PREDICATE == 1)
        positionEmbeddingSize = config.get_int(paramPrefix + ".positionEmbeddingSize",DEFAULT_POSITION_EMBEDDING_SIZE)
-       dropoutProb = config.get_float(paramPrefix + ".dropoutProb",EmbeddingLayer.DEFAULT_DROPOUT_PROB)
+       dropoutProb = config.get_float(paramPrefix + ".dropoutProb",DEFAULT_DROPOUT_PROB)
 
        wordList = [UNK_WORD] + sorted(wordCounter.keys())
        w2i = {w:i for i, w in enumerate(wordList)}
@@ -293,7 +294,7 @@ def initialize(cls, config, paramPrefix, wordCounter):
        distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize) if distanceEmbeddingSize > 0 else None
        positionLookupParameters = nn.Embedding(101, positionEmbeddingSize) if positionEmbeddingSize > 0 else None
 
-       return cls(w2i, w2f, c2i, tag2i, ne2i,
+       return cls(w2i, wordCounter, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
@@ -331,6 +332,8 @@ def initialize(cls, config, paramPrefix, wordCounter):
 
 
 
 
 
 
+
+
diff --git a/main/src/main/python/pytorch/forwardLayer.py b/main/src/main/python/pytorch/forwardLayer.py
index 2f43be5a7..92b7a133d 100644
--- a/main/src/main/python/pytorch/forwardLayer.py
+++ b/main/src/main/python/pytorch/forwardLayer.py
@@ -3,14 +3,13 @@
 from torch.autograd import Variable
 import torch.nn.functional as F
 
-from finalLayer import FinalLayer
-from greedyForwardLayer import GreedyForwardLayer
-from viterbiForwardLayer import ViterbiForwardLayer
+from pytorch.finalLayer import FinalLayer
 
-from utils import *
+from pytorch.utils import *
 
 class ForwardLayer(FinalLayer):
     def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
+        super().__init__()
         self.inputSize = inputSize
         self.isDual = isDual
         self.t2i = t2i
@@ -73,6 +72,8 @@ def forward(inputExpressions, doDropout, headPositionsOpt = None):
 
     @staticmethod
     def load(x2i):
+        from pytorch.greedyForwardLayer import GreedyForwardLayer
+        from pytorch.viterbiForwardLayer import ViterbiForwardLayer
         inferenceType = x2i["inferenceType"]
         if inferenceType == TYPE_VITERBI:
             pass
@@ -85,7 +86,9 @@ def load(x2i):
 
     @staticmethod
     def initialize(config, paramPrefix, labelCounter, isDual, inputSize):
-        if(not config.__contains__(paramPrefix)):
+        from pytorch.greedyForwardLayer import GreedyForwardLayer
+        from pytorch.viterbiForwardLayer import ViterbiForwardLayer
+        if(not config.contains(paramPrefix)):
             return None
 
         inferenceType = config.get_string(paramPrefix + ".inference", "greedy")
diff --git a/main/src/main/python/pytorch/greedyForwardLayer.py b/main/src/main/python/pytorch/greedyForwardLayer.py
index be776a6b5..2d9ddeeae 100644
--- a/main/src/main/python/pytorch/greedyForwardLayer.py
+++ b/main/src/main/python/pytorch/greedyForwardLayer.py
@@ -1,5 +1,5 @@
-from forwardLayer import *
-from utils import *
+from pytorch.forwardLayer import *
+from pytorch.utils import *
 import numpy as np
 
 class GreedyForwardLayer(ForwardLayer):
@@ -23,7 +23,7 @@ def saveX2i(self):
         return x2i
 
     def __str__(self):
-        return f"GreedyForwardLayer({inDim}, {outDim})"
+        return f"GreedyForwardLayer({self.inDim}, {self.outDim})"
 
     def inference(self, emissionScores):
         labelIds = np.argmax(lattice.data.numpy(), axis=1).tolist()
diff --git a/main/src/main/python/pytorch/layers.py b/main/src/main/python/pytorch/layers.py
index 363ff29a9..4c1a1889d 100644
--- a/main/src/main/python/pytorch/layers.py
+++ b/main/src/main/python/pytorch/layers.py
@@ -1,7 +1,9 @@
 import torch.nn as nn
-from utils import *
-from embeddingLayer import EmbeddingLayer
-from constEmbeddingsGlove import ConstEmbeddingsGlove
+from pytorch.utils import *
+from pytorch.embeddingLayer import EmbeddingLayer
+from pytorch.rnnLayer import RnnLayer
+from pytorch.forwardLayer import ForwardLayer
+from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove
 
 class Layers(object):
     def __init__(self, initialLayer, intermediateLayers, finalLayer):
@@ -14,8 +16,7 @@ def __init__(self, initialLayer, intermediateLayers, finalLayer):
         else:
             self.outDim = None
 
-        if initialLayer and intermediateLayers and finalLayer:
-            self.nonEmpty = True
+        self.nonEmpty = initialLayer is not None and intermediateLayers is not None and finalLayer is not None
         self.isEmpty = not self.nonEmpty
 
         self.initialLayer = initialLayer
@@ -25,43 +26,53 @@ def __init__(self, initialLayer, intermediateLayers, finalLayer):
     def __str__(self):
         s = ""
         started = False
-        if(initialLayer.nonEmpty):
-            s += "initial = " + initialLayer
+        if(self.initialLayer is not None):
+            s += "initial = " + str(self.initialLayer)
             started = True
-        for i in intermediateLayers.indices:
-            if(started) s += " "
-            s += s"intermediate ({i+1}) = " + intermediateLayers[i]
+        for i in range(len(self.intermediateLayers)):
+            if(started): s += " "
+            s += f"intermediate ({i+1}) = " + str(self.intermediateLayers[i])
             started = True
-        if(finalLayer.nonEmpty):
-            if(started) s += " "
-            s += "final = " + finalLayer
+        if(self.finalLayer is not None):
+            if(started): s += " "
+            s += "final = " + str(self.finalLayer)
         return s
 
+    def get_parameters(self):
+        parameters = list()
+        if self.initialLayer is not None:
+            parameters += [p for p in self.initialLayer.parameters() if p.requires_grad]
+        for il in self.intermediateLayers:
+            parameters += [p for p in il.parameters() if p.requires_grad]
+        if self.finalLayer is not None:
+            parameters += [p for p in self.finalLayer.parameters() if p.requires_grad]
+        return parameters
+
     def forward(self, sentence, constEmbeddings, doDropout):
         if self.initialLayer.isEmpty:
             raise RuntimeError(f"ERROR: you can't call forward() on a Layers object that does not have an initial layer: {self}!")
         states = self.initialLayer(sentence, constEmbeddings, doDropout)
         for intermediateLayer in self.intermediateLayers:
             states = intermediateLayer(states, doDropout)
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             states = self.finalLayer(states, sentence.headPositions, doDropout)
 
         return states
 
     def forwardFrom(self, inStates, headPositions, doDropout):
-        if self.initialLayer.nonEmpty:
+        if self.initialLayer is not None:
             raise RuntimeError(f"ERROR: you can't call forwardFrom() on a Layers object that has an initial layer: {self}")
         states = inStates
         for intermediateLayer in self.intermediateLayers:
             states = intermediateLayer(states, doDropout)
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             states = self.finalLayer(states, sentence.headPositions, doDropout)
 
         return states
 
     def saveX2i(self):
         x2i = dict()
-        if self.initialLayer.nonEmpty:
+        if self.initialLayer is not None:
             x2i['hasInitial'] = 1
             x2i['initialLayer'] = self.initialLayer.saveX2i()
         else:
@@ -70,7 +81,7 @@ def saveX2i(self):
         x2i['intermediateLayers'] = list()
         for il in self.intermediateLayers:
             x2i['intermediateLayers'].append(il.saveX2i())
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             x2i['hasFinal'] = 1
             x2i['finalLayer'] = self.finalLayer.saveX2i()
         else:
@@ -227,7 +238,7 @@ def parse(layers, sentence, constEmbeddings):
 
     @staticmethod
     def loss(layers, taskId, sentence, goldLabels):
         # Zheng: I am not sure this is the suitable way to load embeddings or not, need help...
-        constEmbeddings = ConstEmbeddingsGlove().mkConstLookupParams(sentence.words)
+        constEmbeddings = ConstEmbeddingsGlove.mkConstLookupParams(sentence.words)
         states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=True) # use dropout during training!
         return layers[taskId+1].finalLayer.loss(states, goldLabels)
diff --git a/main/src/main/python/pytorch/metal.py b/main/src/main/python/pytorch/metal.py
index ace00e73e..c95e7747f 100644
--- a/main/src/main/python/pytorch/metal.py
+++ b/main/src/main/python/pytorch/metal.py
@@ -1,29 +1,33 @@
 from pytorch.utils import *
 from collections import Counter
 from sequences.rowReaders import *
+from pytorch.layers import Layers
 
-class Metal():
+from torch.optim import SGD, Adam, RMSprop
+
+class Metal(object):
     """docstring for Metal"""
     def __init__(self, taskManager, modelOpt):
+        self.taskManager = taskManager
+
         # One Layers object per task; model(0) contains the Layers shared between all tasks (if any)
         if modelOpt:
             self.model = modelOpt
         else:
             self.model = self.initialize()
 
-        self.taskManager = taskManager
 
     def initialize(self):
-        taskWords, taskLabels = mkVocabularies()
+        taskWords, taskLabels = self.mkVocabularies()
 
-        layersPerTask = [None for _ in range(taskManager.taskCount + 1)]
+        layersPerTask = [None for _ in range(self.taskManager.taskCount + 1)]
 
-        layersPerTask[0] = Layers.apply(taskManager, "mtl.layers", taskWords[0], None, False, None)
+        layersPerTask[0] = Layers.apply(self.taskManager, "mtl.layers", taskWords[0], None, False, None)
 
         inputSize = layersPerTask[0].outDim
 
-        for i in taskManager.indices:
-            layersPerTask[i+1] = Layers.apply(taskManager, f"mtl.task{i+1}.layers", taskWords[i + 1], taskLabels[i + 1], taskManager.tasks[i].isDual, inputSize)
+        for i in self.taskManager.indices:
+            layersPerTask[i+1] = Layers.apply(self.taskManager, f"mtl.task{i+1}.layers", taskWords[i + 1], taskLabels[i + 1], self.taskManager.tasks[i].isDual, inputSize)
 
         for i in range(len(layersPerTask)):
             print (f"Summary of layersPerTask({i}):")
@@ -33,17 +37,17 @@ def initialize(self):
 
     def mkVocabularies(self):
         # index 0 reserved for the shared Layers; tid + 1 corresponds to each task
-        labels = [Counter() for _ in range(taskManager.taskCount + 1)]
+        labels = [Counter() for _ in range(self.taskManager.taskCount + 1)]
         for i in range(1, len(labels)): # labels(0) not used, since only task-specific layers have a final layer
             labels[i][START_TAG] += 1
             labels[i][STOP_TAG] += 1
 
-        words = [Counter() for _ in range(taskManager.taskCount + 1)]
+        words = [Counter() for _ in range(self.taskManager.taskCount + 1)]
 
         reader = MetalRowReader()
 
-        for tid in taskManager.indices:
-            for sentence in taskManager.tasks[tid].trainSentences:
+        for tid in self.taskManager.indices:
+            for sentence in self.taskManager.tasks[tid].trainSentences:
                 annotatedSentences = reader.toAnnotatedSentences(sentence)
 
                 for asent in annotatedSentences:
@@ -56,3 +60,46 @@ def mkVocabularies(self):
 
         return words, labels
 
+    def train(self, modelNamePrefix):
+        learningRate = self.taskManager.get_float("mtl.learningRate", 0.001)
+        trainerType = self.taskManager.get_string("mtl.trainer", "adam")
+        batchSize = self.taskManager.get_int("mtl.batchSize", 1)
+        assert(batchSize>0)
+
+        parameters = list()
+        for layers in self.model:
+            parameters += layers.get_parameters()
+
+        if trainerType == "adam":
+            trainer = Adam(parameters, lr=learningRate)
+        elif trainerType == "rmsprop":
+            trainer = RMSprop(parameters, lr=learningRate)
+        elif trainerType == "sgd":
+            trainer = SGD(parameters, lr=learningRate)
+        else:
+            raise RuntimeError(f"ERROR: unknown trainer {trainerType}!")
+
+        reader = MetalRowReader()
+
+        cumulativeLoss = 0.0
+        numTagged = 0
+
+        maxAvgAcc = 0.0
+        maxAvgF1 = 0.0
+        bestEpoch = 0
+
+        allEpochScores = list()
+        epochPatience = self.taskManager.epochPatience
+
+        for epoch in range(0, self.taskManager.maxEpochs):
+            if epochPatience <= 0:
+                break
+
+
+
+
+
+
+
+
+
diff --git a/main/src/main/python/pytorch/rnnLayer.py b/main/src/main/python/pytorch/rnnLayer.py
index c5aef820a..ee1896f8e 100644
--- a/main/src/main/python/pytorch/rnnLayer.py
+++ b/main/src/main/python/pytorch/rnnLayer.py
@@ -1,5 +1,5 @@
-from intermediateLayer import IntermediateLayer
-from utils import *
+from pytorch.intermediateLayer import IntermediateLayer
+from pytorch.utils import *
 import torch
 import torch.nn as nn
 
@@ -12,7 +12,7 @@ def __init__(self,
                  rnnType,
                  wordRnnBuilder,
                  dropoutProb):
-
+        super().__init__()
         self.inDim = self.inputSize = inputSize
         self.numLayers = numLayers
         self.rnnStateSize = rnnStateSize
@@ -64,7 +64,7 @@ def load(cls, x2i):
 
     @classmethod
     def initialize(cls, config, paramPrefix, inputSize):
-        if(not config.__contains__(paramPrefix)):
+        if(not config.contains(paramPrefix)):
             return None
 
         numLayers = config.get_int(paramPrefix + ".numLayers", 1)
@@ -73,9 +73,9 @@ def initialize(cls, config, paramPrefix, inputSize):
         rnnType = config.get_string(paramPrefix + ".type", "lstm")
         dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROBABILITY)
 
-        builder = mkBuilder(rnnType, numLayers, inputSize, rnnStateSize)
+        builder = mkBuilder(rnnType, numLayers, inputSize, rnnStateSize, dropoutProb)
 
-        return (inputSize, numLayers, rnnStateSize, useHighwayConnections, rnnType, builder, dropoutProb)
+        return cls(inputSize, numLayers, rnnStateSize, useHighwayConnections, rnnType, builder, dropoutProb)
 
 def mkBuilder(rnnType, numLayers, inputSize, rnnStateSize, dropoutProb):
     if rnnType == 'gru':
diff --git a/main/src/main/python/pytorch/taskManager.py b/main/src/main/python/pytorch/taskManager.py
index f5d1ae868..25e669eb0 100644
--- a/main/src/main/python/pytorch/taskManager.py
+++ b/main/src/main/python/pytorch/taskManager.py
@@ -6,7 +6,7 @@
 TYPE_BASIC = 0
 TYPE_DUAL = 1
 
-class TaskManager:
+class TaskManager():
 
     def __init__(self, config, seed):
 
@@ -31,6 +31,27 @@ def __init__(self, config, seed):
         # Training shards from all tasks
         self.shards = self.mkShards()
 
+    def contains(self, paramPrefix):
+        return self.config.__contains__(paramPrefix)
+
+    def get_int(self, x, default=None):
+        return self.config.get_int(x, default)
+
+    def get_string(self, x, default=None):
+        return self.config.get_string(x, default)
+
+    def get_float(self, x, default=None):
+        return self.config.get_float(x, default)
+
+    def get_bool(self, x, default=None):
+        return self.config.get_bool(x, default)
+
+    def get_list(self, x, default=None):
+        return self.config.get_list(x, default)
+
+    def get_config(self, x, default=None):
+        return self.config.get_config(x, default)
+
     # Construct training shards by interleaving shards from all tasks
     def mkShards(self):
         shardsByTasks = list()
diff --git a/main/src/main/python/pytorch/utils.py b/main/src/main/python/pytorch/utils.py
index dd1709a8f..049ca8845 100644
--- a/main/src/main/python/pytorch/utils.py
+++ b/main/src/main/python/pytorch/utils.py
@@ -56,14 +56,16 @@ def readString2Ids(s2iFilename):
             if not line.startswith("#"):
                 k, v = line.strip().split('\t')
                 s2i[k] = int(v)
+    return s2i
 
 def readChar2Ids(s2iFilename):
     s2i = dict()
     with open(s2iFilename) as f:
         for line in f:
-            if not line.startswith("#"):
+            if not line.startswith("#") and line.rstrip():
                 k, v = line.strip().split('\t')
-                s2i[char(int(k))] = int(v)
+                s2i[chr(int(k))] = int(v)
+    return s2i
 
 def transduce(embeddings, builder):
@@ -75,14 +77,14 @@ def transduce(embeddings, builder):
         if bi_direct:
             (h, c) = (torch.zeros(2, 1, hidden_dim), torch.zeros(2, 1, hidden_dim))
             output, (result, c) = builder(embeddings.view(len(word), 1, -1), (h, c))
-        else;
+        else:
             (h, c) = (torch.zeros(1, 1, hidden_dim), torch.zeros(1, 1, hidden_dim))
             output, (result, c) = builder(embeddings.view(len(word), 1, -1), (h, c))
     elif mode == 'GRU':
         if bi_direct:
             h = torch.zeros(2, 1, hidden_dim)
             output, result = builder(embeddings.view(len(word), 1, -1), h)
-        else;
+        else:
             h = torch.zeros(1, 1, hidden_dim)
             output, result = builder(embeddings.view(len(word), 1, -1), h)
 
diff --git a/main/src/main/python/pytorch/viterbiForwardLayer.py b/main/src/main/python/pytorch/viterbiForwardLayer.py
index 636b130c4..4b025293b 100644
--- a/main/src/main/python/pytorch/viterbiForwardLayer.py
+++ b/main/src/main/python/pytorch/viterbiForwardLayer.py
@@ -1,7 +1,7 @@
-from forwardLayer import *
-from utils import *
+from pytorch.forwardLayer import *
+from pytorch.utils import *
 
-class GreedyForwardLayer(ForwardLayer):
+class ViterbiForwardLayer(ForwardLayer):
     def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
         super().__init__(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans)
 
diff --git a/main/src/main/python/run.py b/main/src/main/python/run.py
index c75532f0e..fc4e1385a 100644
--- a/main/src/main/python/run.py
+++ b/main/src/main/python/run.py
@@ -20,8 +20,8 @@
         modelName = args.model_file
 
         print (taskManager.debugTraversal())
-        mtl = Metal(taskManager, None, None)
-        # mtl.train(modelName)
+        mtl = Metal(taskManager, None)
+        mtl.train(modelName)
     elif args.test:
         pass
     elif args.shell:
diff --git a/main/src/main/python/sequences/columnReader.py b/main/src/main/python/sequences/columnReader.py
index 0f8c04610..e162316f7 100644
--- a/main/src/main/python/sequences/columnReader.py
+++ b/main/src/main/python/sequences/columnReader.py
@@ -44,4 +44,4 @@ def __init__(self, tokens):
     def get(self, idx):
         if(idx >= self.length):
             raise RuntimeError(f"ERROR: trying to read field #{idx}, which does not exist in this row: {tokens}!")
-        return tokens[idx]
+        return self.tokens[idx]
diff --git a/main/src/main/python/sequences/rowReaders.py b/main/src/main/python/sequences/rowReaders.py
index 0aa409756..58a15cb71 100644
--- a/main/src/main/python/sequences/rowReaders.py
+++ b/main/src/main/python/sequences/rowReaders.py
@@ -26,18 +26,18 @@ def __init__(self):
         self.LABEL_START_OFFSET = 3
 
     def toAnnotatedSentences(self, rows):
-        if (len(rows.head) == 2):
-            self.parseSimple(rows)
-        elif (len(rows.head) == 4):
-            self.parseSimpleExtended(rows)
-        elif (len(rows.head) >= 5):
-            self.parseFull(rows)
+        if (rows[0].length == 2):
+            return self.parseSimple(rows)
+        elif (rows[0].length == 4):
+            return self.parseSimpleExtended(rows)
+        elif (rows[0].length >= 5):
+            return self.parseFull(rows)
         else:
             raise RuntimeError("ERROR: the Metal format expects 2, 4, or 5+ columns!")
 
     # Parser for the simple format: word, label
-    def parseSimple(rows):
-        assert(len(rows.head) == 2)
+    def parseSimple(self, rows):
+        assert(rows[0].length == 2)
         words = list()
         labels = list()
 
@@ -45,11 +45,11 @@ def parseSimple(rows):
             words += [row.get(self.WORD_POSITION)]
             labels += [row.get(self.WORD_POSITION + 1)]
 
-        return AnnotatedSentence(words), labels
+        return [(AnnotatedSentence(words), labels)]
 
     # Parser for the simple extended format: word, POS tag, NE label, label
-    def parseSimpleExtended(rows):
-        assert(len(rows.head) == 4)
+    def parseSimpleExtended(self, rows):
+        assert(rows[0].length == 4)
         words = list()
         posTags = list()
         neLabels = list()
@@ -61,12 +61,12 @@ def parseSimpleExtended(rows):
             neLabels += [row.get(self.NE_LABEL_POSITION)]
             labels += [row.get(self.LABEL_START_OFFSET)]
 
-        return AnnotatedSentence(words), posTags, neLabels, labels
+        return [(AnnotatedSentence(words), posTags, neLabels, labels)]
 
     # Parser for the full format: word, POS tag, NE label, (label head)+
-    def parseFull(rows):
-        assert(len(rows.head) >= 5)
-        numSent = (len(rows.head) - 3) / 2
+    def parseFull(self, rows):
+        assert(rows[0].length >= 5)
+        numSent = (rows[0].length - 3) / 2
         assert(numSent >= 1)
 
         words = list()