diff --git a/main/src/main/python/embeddings/wordEmbeddingMap.py b/main/src/main/python/embeddings/wordEmbeddingMap.py
index a82c2108a..327b4befc 100644
--- a/main/src/main/python/embeddings/wordEmbeddingMap.py
+++ b/main/src/main/python/embeddings/wordEmbeddingMap.py
@@ -1,29 +1,29 @@
 import numpy as np
+import math
 
 class WordEmbeddingMap:
     def __init__(self, config):
-        self.emb_dict = self.load(config)
-        self.dim = self.emb_dict.shape[-1]
-
-    def load(self):
-        emb_matrix = None
-        emb_dict = dict()
-        for line in open(config.get_string("glove.matrixResourceName")):
-            if not len(line.split()) == 2:
-                if "\t" in line:
-                    delimiter = "\t"
-                else:
-                    delimiter = " "
-                line_split = line.rstrip().split(delimiter)
-                # extract word and vector
-                word = line_split[0]
-                x = np.array([float(i) for i in line_split[1:]])
-                vector = (x /np.linalg.norm(x))
-                embedding_size = vector.shape[0]
-                emb_dict[word] = vector
-        base = math.sqrt(6/embedding_size)
-        emb_dict[""] = np.random.uniform(-base,base,(embedding_size))
-        return emb_dict
+        self.emb_dict, self.dim = load(config)
 
     def isOutOfVocabulary(self, word):
-        return word not in self.emb_dict
\ No newline at end of file
+        return word not in self.emb_dict
+
+def load(config):
+    emb_matrix = None
+    emb_dict = dict()
+    for line in open(config.get_string("glove.matrixResourceName")):
+        if not len(line.split()) == 2:
+            if "\t" in line:
+                delimiter = "\t"
+            else:
+                delimiter = " "
+            line_split = line.rstrip().split(delimiter)
+            # extract word and vector
+            word = line_split[0]
+            x = np.array([float(i) for i in line_split[1:]])
+            vector = (x /np.linalg.norm(x))
+            embedding_size = vector.shape[0]
+            emb_dict[word] = vector
+    base = math.sqrt(6/embedding_size)
+    emb_dict[""] = np.random.uniform(-base,base,(embedding_size))
+    return emb_dict, embedding_size
\ No newline at end of file
diff --git a/main/src/main/python/pytorch/constEmbeddingsGlove.py b/main/src/main/python/pytorch/constEmbeddingsGlove.py
index be32c2f39..52547b6a8 100644
--- a/main/src/main/python/pytorch/constEmbeddingsGlove.py
+++ b/main/src/main/python/pytorch/constEmbeddingsGlove.py
@@ -1,16 +1,18 @@
 from dataclasses import dataclass
 import torch.nn as nn
 from embeddings.wordEmbeddingMap import *
+from pyhocon import ConfigFactory
 
 @dataclass
 class ConstEmbeddingParameters:
     emb: nn.Embedding
     w2i: dict
 
-def ConstEmbeddingsGlove:
+class _ConstEmbeddingsGlove:
     def __init__(self):
         self.SINGLETON_WORD_EMBEDDING_MAP = None
-        self.load('../resources/org/clulab/glove.conf')
+        config = ConfigFactory.parse_file('../resources/org/clulab/glove.conf')
+        self.load(config)
         self.dim = self.SINGLETON_WORD_EMBEDDING_MAP.dim
 
     def load(self, config):
@@ -25,3 +27,5 @@ def mkConstLookupParams(self, words):
         emd = nn.Embedding.from_pretrained(weight)
         emd.weight.requires_grad=False
         return ConstEmbeddingParameters(emb ,w2i)
+
+ConstEmbeddingsGlove = _ConstEmbeddingsGlove()
diff --git a/main/src/main/python/pytorch/embeddingLayer.py b/main/src/main/python/pytorch/embeddingLayer.py
index a506bfa02..2ebea6f7b 100644
--- a/main/src/main/python/pytorch/embeddingLayer.py
+++ b/main/src/main/python/pytorch/embeddingLayer.py
@@ -1,8 +1,9 @@
-from initialLayer import InitialLayer
+from pytorch.initialLayer import InitialLayer
 import random
-from utils import *
+from pytorch.utils import *
 import torch.nn as nn
 import torch
+from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove
 
 DEFAULT_DROPOUT_PROB: float = DEFAULT_DROPOUT_PROBABILITY
 DEFAULT_LEARNED_WORD_EMBEDDING_SIZE: int = 128
@@ -16,7 +17,7 @@
 DEFAULT_USE_IS_PREDICATE: int = -1
 
 class EmbeddingLayer(InitialLayer):
-    def __init__(w2i, # word to index
+    def __init__(self, w2i, # word to index
                  w2f, # word to frequency
                  c2i, # character to index
                  tag2i, # POS tag to index
@@ -68,7 +69,7 @@ def __init__(w2i, # word to index
         positionDim = 1 if distanceLookupParameters and useIsPredicate else 0
         predicateDim = positionEmbeddingSize if positionLookupParameters else 0
 
-        self.outDim = ConstEmbeddingsGlove().dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim
+        self.outDim = ConstEmbeddingsGlove.dim + learnedWordEmbeddingSize + charRnnStateSize * 2 + posTagDim + neTagDim + distanceDim + positionDim + predicateDim
         random.seed(RANDOM_SEED)
 
     def forward(self, sentence, constEmbeddings, doDropout):
@@ -81,9 +82,9 @@ def forward(self, sentence, constEmbeddings, doDropout):
         # const word embeddings such as GloVe
         constEmbeddingsExpressions = self.mkConstEmbeddings(words, constEmbeddings)
         assert(constEmbeddingsExpressions.size(0) == len(words))
-        if(tags) assert(len(tags) == len(words))
-        if(nes) assert(len(nes) == len(words))
-        if(headPositions) assert(len(headPositions) == len(words))
+        if(tags): assert(len(tags) == len(words))
+        if(nes): assert(len(nes) == len(words))
+        if(headPositions): assert(len(headPositions) == len(words))
 
         # build the word embeddings one by one
         embeddings = self.mkEmbeddings(words, constEmbeddingsExpressions, tags, nes, headPositions)
@@ -250,7 +251,7 @@ def load(cls, x2i):
 
     @classmethod
     def initialize(cls, config, paramPrefix, wordCounter):
-        if(not config.__contains__(paramPrefix)):
+        if(not config.contains(paramPrefix)):
            return None
 
        learnedWordEmbeddingSize = config.get_int(paramPrefix + ".learnedWordEmbeddingSize",DEFAULT_LEARNED_WORD_EMBEDDING_SIZE)
@@ -260,9 +261,9 @@ def initialize(cls, config, paramPrefix, wordCounter):
        neTagEmbeddingSize = config.get_int(paramPrefix + ".neTagEmbeddingSize",DEFAULT_NE_TAG_EMBEDDING_SIZE)
        distanceEmbeddingSize = config.get_int(paramPrefix + ".distanceEmbeddingSize",DEFAULT_DISTANCE_EMBEDDING_SIZE)
        distanceWindowSize = config.get_int(paramPrefix + ".distanceWindowSize",DEFAULT_DISTANCE_WINDOW_SIZE)
-       useIsPredicate = config.getArgBoolean(paramPrefix + ".useIsPredicate",DEFAULT_USE_IS_PREDICATE == 1)
+       useIsPredicate = config.get_bool(paramPrefix + ".useIsPredicate",DEFAULT_USE_IS_PREDICATE == 1)
        positionEmbeddingSize = config.get_int(paramPrefix + ".positionEmbeddingSize",DEFAULT_POSITION_EMBEDDING_SIZE)
-       dropoutProb = config.get_float(paramPrefix + ".dropoutProb",EmbeddingLayer.DEFAULT_DROPOUT_PROB)
+       dropoutProb = config.get_float(paramPrefix + ".dropoutProb",DEFAULT_DROPOUT_PROB)
 
        wordList = [UNK_WORD] + sorted(wordCounter.keys())
        w2i = {w:i for i, w in enumerate(wordList)}
@@ -293,7 +294,7 @@ def initialize(cls, config, paramPrefix, wordCounter):
        distanceLookupParameters = nn.Embedding(distanceWindowSize * 2 + 3, distanceEmbeddingSize) if distanceEmbeddingSize > 0 else None
        positionLookupParameters = nn.Embedding(101, positionEmbeddingSize) if positionEmbeddingSize > 0 else None
 
-       return cls(w2i, w2f, c2i, tag2i, ne2i,
+       return cls(w2i, wordCounter, c2i, tag2i, ne2i,
                   learnedWordEmbeddingSize,
                   charEmbeddingSize,
                   charRnnStateSize,
@@ -331,6 +332,8 @@ def initialize(cls, config, paramPrefix, wordCounter):
 
 
 
 
 
 
+
+
diff --git a/main/src/main/python/pytorch/forwardLayer.py b/main/src/main/python/pytorch/forwardLayer.py
index 2f43be5a7..92b7a133d 100644
--- a/main/src/main/python/pytorch/forwardLayer.py
+++ b/main/src/main/python/pytorch/forwardLayer.py
@@ -3,14 +3,13 @@
 from torch.autograd import Variable
 import torch.nn.functional as F
 
-from finalLayer import FinalLayer
-from greedyForwardLayer import GreedyForwardLayer
-from viterbiForwardLayer import ViterbiForwardLayer
+from pytorch.finalLayer import FinalLayer
 
-from utils import *
+from pytorch.utils import *
 
 class ForwardLayer(FinalLayer):
     def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
+        super().__init__()
         self.inputSize = inputSize
         self.isDual = isDual
         self.t2i = t2i
@@ -73,6 +72,8 @@ def forward(inputExpressions, doDropout, headPositionsOpt = None):
 
     @staticmethod
     def load(x2i):
+        from pytorch.greedyForwardLayer import GreedyForwardLayer
+        from pytorch.viterbiForwardLayer import ViterbiForwardLayer
         inferenceType = x2i["inferenceType"]
         if inferenceType == TYPE_VITERBI:
             pass
@@ -85,7 +86,9 @@ def load(x2i):
 
     @staticmethod
     def initialize(config, paramPrefix, labelCounter, isDual, inputSize):
-        if(not config.__contains__(paramPrefix)):
+        from pytorch.greedyForwardLayer import GreedyForwardLayer
+        from pytorch.viterbiForwardLayer import ViterbiForwardLayer
+        if(not config.contains(paramPrefix)):
             return None
 
         inferenceType = config.get_string(paramPrefix + ".inference", "greedy")
diff --git a/main/src/main/python/pytorch/greedyForwardLayer.py b/main/src/main/python/pytorch/greedyForwardLayer.py
index be776a6b5..2d9ddeeae 100644
--- a/main/src/main/python/pytorch/greedyForwardLayer.py
+++ b/main/src/main/python/pytorch/greedyForwardLayer.py
@@ -1,5 +1,5 @@
-from forwardLayer import *
-from utils import *
+from pytorch.forwardLayer import *
+from pytorch.utils import *
 import numpy as np
 
 class GreedyForwardLayer(ForwardLayer):
@@ -23,7 +23,7 @@ def saveX2i(self):
         return x2i
 
     def __str__(self):
-        return f"GreedyForwardLayer({inDim}, {outDim})"
+        return f"GreedyForwardLayer({self.inDim}, {self.outDim})"
 
     def inference(self, emissionScores):
         labelIds = np.argmax(lattice.data.numpy(), axis=1).tolist()
diff --git a/main/src/main/python/pytorch/layers.py b/main/src/main/python/pytorch/layers.py
index 363ff29a9..4c1a1889d 100644
--- a/main/src/main/python/pytorch/layers.py
+++ b/main/src/main/python/pytorch/layers.py
@@ -1,7 +1,9 @@
 import torch.nn as nn
-from utils import *
-from embeddingLayer import EmbeddingLayer
-from constEmbeddingsGlove import ConstEmbeddingsGlove
+from pytorch.utils import *
+from pytorch.embeddingLayer import EmbeddingLayer
+from pytorch.rnnLayer import RnnLayer
+from pytorch.forwardLayer import ForwardLayer
+from pytorch.constEmbeddingsGlove import ConstEmbeddingsGlove
 
 class Layers(object):
     def __init__(self, initialLayer, intermediateLayers, finalLayer):
@@ -14,8 +16,7 @@ def __init__(self, initialLayer, intermediateLayers, finalLayer):
         else:
             self.outDim = None
 
-        if initialLayer and intermediateLayers and finalLayer:
-            self.nonEmpty = True
+        self.nonEmpty = initialLayer is not None and intermediateLayers is not None and finalLayer is not None
         self.isEmpty = not self.nonEmpty
 
         self.initialLayer = initialLayer
@@ -25,43 +26,53 @@ def __init__(self, initialLayer, intermediateLayers, finalLayer):
     def __str__(self):
         s = ""
         started = False
-        if(initialLayer.nonEmpty):
-            s += "initial = " + initialLayer
+        if(self.initialLayer is not None):
+            s += "initial = " + str(self.initialLayer)
             started = True
-        for i in intermediateLayers.indices:
-            if(started) s += " "
-            s += s"intermediate ({i+1}) = " + intermediateLayers[i]
+        for i in range(len(self.intermediateLayers)):
+            if(started): s += " "
+            s += f"intermediate ({i+1}) = " + str(self.intermediateLayers[i])
             started = True
-        if(finalLayer.nonEmpty):
-            if(started) s += " "
-            s += "final = " + finalLayer
+        if(self.finalLayer is not None):
+            if(started): s += " "
+            s += "final = " + str(self.finalLayer)
         return s
 
+    def get_parameters(self):
+        parameters = list()
+        if self.initialLayer is not None:
+            parameters += [p for p in self.initialLayer.parameters() if p.requires_grad]
+        for il in self.intermediateLayers:
+            parameters += [p for p in il.parameters() if p.requires_grad]
+        if self.finalLayer is not None:
+            parameters += [p for p in self.finalLayer.parameters() if p.requires_grad]
+        return parameters
+
     def forward(self, sentence, constEmbeddings, doDropout):
         if self.initialLayer.isEmpty:
             raise RuntimeError(f"ERROR: you can't call forward() on a Layers object that does not have an initial layer: {self}!")
         states = self.initialLayer(sentence, constEmbeddings, doDropout)
         for intermediateLayer in self.intermediateLayers:
             states = intermediateLayer(states, doDropout)
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             states = self.finalLayer(states, sentence.headPositions, doDropout)
 
         return states
 
     def forwardFrom(self, inStates, headPositions, doDropout):
-        if self.initialLayer.nonEmpty:
+        if self.initialLayer is not None:
             raise RuntimeError(f"ERROR: you can't call forwardFrom() on a Layers object that has an initial layer: {self}")
         states = inStates
         for intermediateLayer in self.intermediateLayers:
             states = intermediateLayer(states, doDropout)
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             states = self.finalLayer(states, sentence.headPositions, doDropout)
 
         return states
 
     def saveX2i(self):
         x2i = dict()
-        if self.initialLayer.nonEmpty:
+        if self.initialLayer is not None:
             x2i['hasInitial'] = 1
             x2i['initialLayer'] = self.initialLayer.saveX2i()
         else:
@@ -70,7 +81,7 @@ def saveX2i(self):
         x2i['intermediateLayers'] = list()
         for il in self.intermediateLayers:
             x2i['intermediateLayers'].append(il.saveX2i())
-        if self.finalLayer.nonEmpty:
+        if self.finalLayer is not None:
             x2i['hasFinal'] = 1
             x2i['finalLayer'] = self.finalLayer.saveX2i()
         else:
@@ -227,7 +238,7 @@ def parse(layers, sentence, constEmbeddings):
 
     @staticmethod
     def loss(layers, taskId, sentence, goldLabels):
         # Zheng: I am not sure this is the suitable way to load embeddings or not, need help...
-        constEmbeddings = ConstEmbeddingsGlove().mkConstLookupParams(sentence.words)
+        constEmbeddings = ConstEmbeddingsGlove.mkConstLookupParams(sentence.words)
         states = Layers.forwardForTask(layers, taskId, sentence, constEmbeddings, doDropout=True) # use dropout during training!
         return layers[taskId+1].finalLayer.loss(states, goldLabels)
diff --git a/main/src/main/python/pytorch/metal.py b/main/src/main/python/pytorch/metal.py
index ace00e73e..c95e7747f 100644
--- a/main/src/main/python/pytorch/metal.py
+++ b/main/src/main/python/pytorch/metal.py
@@ -1,29 +1,33 @@
 from pytorch.utils import *
 from collections import Counter
 from sequences.rowReaders import *
+from pytorch.layers import Layers
 
-class Metal():
+from torch.optim import SGD, Adam, RMSprop
+
+class Metal(object):
     """docstring for Metal"""
     def __init__(self, taskManager, modelOpt):
+        self.taskManager = taskManager
+
         # One Layers object per task; model(0) contains the Layers shared between all tasks (if any)
         if modelOpt:
             self.model = modelOpt
         else:
             self.model = self.initialize()
 
-        self.taskManager = taskManager
 
     def initialize(self):
-        taskWords, taskLabels = mkVocabularies()
+        taskWords, taskLabels = self.mkVocabularies()
 
-        layersPerTask = [None for _ in range(taskManager.taskCount + 1)]
+        layersPerTask = [None for _ in range(self.taskManager.taskCount + 1)]
 
-        layersPerTask[0] = Layers.apply(taskManager, "mtl.layers", taskWords[0], None, False, None)
+        layersPerTask[0] = Layers.apply(self.taskManager, "mtl.layers", taskWords[0], None, False, None)
 
         inputSize = layersPerTask[0].outDim
 
-        for i in taskManager.indices:
-            layersPerTask[i+1] = Layers.apply(taskManager, f"mtl.task{i+1}.layers", taskWords[i + 1], taskLabels[i + 1], taskManager.tasks[i].isDual, inputSize)
+        for i in self.taskManager.indices:
+            layersPerTask[i+1] = Layers.apply(self.taskManager, f"mtl.task{i+1}.layers", taskWords[i + 1], taskLabels[i + 1], self.taskManager.tasks[i].isDual, inputSize)
 
         for i in range(len(layersPerTask)):
             print (f"Summary of layersPerTask({i}):")
@@ -33,17 +37,17 @@ def initialize(self):
 
     def mkVocabularies(self):
         # index 0 reserved for the shared Layers; tid + 1 corresponds to each task
-        labels = [Counter() for _ in range(taskManager.taskCount + 1)]
+        labels = [Counter() for _ in range(self.taskManager.taskCount + 1)]
         for i in range(1, len(labels)): # labels(0) not used, since only task-specific layers have a final layer
             labels[i][START_TAG] += 1
             labels[i][STOP_TAG] += 1
 
-        words = [Counter() for _ in range(taskManager.taskCount + 1)]
+        words = [Counter() for _ in range(self.taskManager.taskCount + 1)]
 
         reader = MetalRowReader()
 
-        for tid in taskManager.indices:
-            for sentence in taskManager.tasks[tid].trainSentences:
+        for tid in self.taskManager.indices:
+            for sentence in self.taskManager.tasks[tid].trainSentences:
                 annotatedSentences = reader.toAnnotatedSentences(sentence)
 
                 for asent in annotatedSentences:
@@ -56,3 +60,46 @@ def mkVocabularies(self):
 
         return words, labels
 
+    def train(self, modelNamePrefix):
+        learningRate = self.taskManager.get_float("mtl.learningRate", 0.001)
+        trainerType = self.taskManager.get_string("mtl.trainer", "adam")
+        batchSize = self.taskManager.get_int("mtl.batchSize", 1)
+        assert(batchSize>0)
+
+        parameters = list()
+        for layers in self.model:
+            parameters += layers.get_parameters()
+
+        if trainerType == "adam":
+            trainer = Adam(parameters, lr=learningRate)
+        elif trainerType == "rmsprop":
+            trainer = RMSprop(parameters, lr=learningRate)
+        elif trainerType == "sgd":
+            trainer = SGD(parameters, lr=learningRate)
+        else:
+            raise RuntimeError(f"ERROR: unknown trainer {trainerType}!")
+
+        reader = MetalRowReader()
+
+        cumulativeLoss = 0.0
+        numTagged = 0
+
+        maxAvgAcc = 0.0
+        maxAvgF1 = 0.0
+        bestEpoch = 0
+
+        allEpochScores = list()
+        epochPatience = self.taskManager.epochPatience
+
+        for epoch in range(0, self.taskManager.maxEpochs):
+            if epochPatience <= 0:
+                break
+
+
+
+
+
+
+
+
+
diff --git a/main/src/main/python/pytorch/rnnLayer.py b/main/src/main/python/pytorch/rnnLayer.py
index c5aef820a..ee1896f8e 100644
--- a/main/src/main/python/pytorch/rnnLayer.py
+++ b/main/src/main/python/pytorch/rnnLayer.py
@@ -1,5 +1,5 @@
-from intermediateLayer import IntermediateLayer
-from utils import *
+from pytorch.intermediateLayer import IntermediateLayer
+from pytorch.utils import *
 import torch
 import torch.nn as nn
 
@@ -12,7 +12,7 @@ def __init__(self,
                  rnnType,
                  wordRnnBuilder,
                  dropoutProb):
-
+        super().__init__()
         self.inDim = self.inputSize = inputSize
         self.numLayers = numLayers
         self.rnnStateSize = rnnStateSize
@@ -64,7 +64,7 @@ def load(cls, x2i):
 
     @classmethod
     def initialize(cls, config, paramPrefix, inputSize):
-        if(not config.__contains__(paramPrefix)):
+        if(not config.contains(paramPrefix)):
             return None
 
         numLayers = config.get_int(paramPrefix + ".numLayers", 1)
@@ -73,9 +73,9 @@ def initialize(cls, config, paramPrefix, inputSize):
         rnnType = config.get_string(paramPrefix + ".type", "lstm")
         dropoutProb = config.get_float(paramPrefix + ".dropoutProb", DEFAULT_DROPOUT_PROBABILITY)
 
-        builder = mkBuilder(rnnType, numLayers, inputSize, rnnStateSize)
+        builder = mkBuilder(rnnType, numLayers, inputSize, rnnStateSize, dropoutProb)
 
-        return (inputSize, numLayers, rnnStateSize, useHighwayConnections, rnnType, builder, dropoutProb)
+        return cls(inputSize, numLayers, rnnStateSize, useHighwayConnections, rnnType, builder, dropoutProb)
 
 def mkBuilder(rnnType, numLayers, inputSize, rnnStateSize, dropoutProb):
     if rnnType == 'gru':
diff --git a/main/src/main/python/pytorch/taskManager.py b/main/src/main/python/pytorch/taskManager.py
index f5d1ae868..25e669eb0 100644
--- a/main/src/main/python/pytorch/taskManager.py
+++ b/main/src/main/python/pytorch/taskManager.py
@@ -6,7 +6,7 @@
 TYPE_BASIC = 0
 TYPE_DUAL = 1
 
-class TaskManager:
+class TaskManager():
 
     def __init__(self, config, seed):
 
@@ -31,6 +31,27 @@ def __init__(self, config, seed):
         # Training shards from all tasks
         self.shards = self.mkShards()
 
+    def contains(self, paramPrefix):
+        return self.config.__contains__(paramPrefix)
+
+    def get_int(self, x, default=None):
+        return self.config.get_int(x, default)
+
+    def get_string(self, x, default=None):
+        return self.config.get_string(x, default)
+
+    def get_float(self, x, default=None):
+        return self.config.get_float(x, default)
+
+    def get_bool(self, x, default=None):
+        return self.config.get_bool(x, default)
+
+    def get_list(self, x, default=None):
+        return self.config.get_list(x, default)
+
+    def get_config(self, x, default=None):
+        return self.config.get_config(x, default)
+
     # Construct training shards by interleaving shards from all tasks
     def mkShards(self):
         shardsByTasks = list()
diff --git a/main/src/main/python/pytorch/utils.py b/main/src/main/python/pytorch/utils.py
index dd1709a8f..049ca8845 100644
--- a/main/src/main/python/pytorch/utils.py
+++ b/main/src/main/python/pytorch/utils.py
@@ -56,14 +56,16 @@ def readString2Ids(s2iFilename):
             if not line.startswith("#"):
                 k, v = line.strip().split('\t')
                 s2i[k] = int(v)
+    return s2i
 
 def readChar2Ids(s2iFilename):
     s2i = dict()
     with open(s2iFilename) as f:
         for line in f:
-            if not line.startswith("#"):
+            if not line.startswith("#") and line.rstrip():
                 k, v = line.strip().split('\t')
-                s2i[char(int(k))] = int(v)
+                s2i[chr(int(k))] = int(v)
+    return s2i
 
 def transduce(embeddings, builder):
@@ -75,14 +77,14 @@ def transduce(embeddings, builder):
         if bi_direct:
             (h, c) = (torch.zeros(2, 1, hidden_dim), torch.zeros(2, 1, hidden_dim))
             output, (result, c) = builder(embeddings.view(len(word), 1, -1), (h, c))
-        else;
+        else:
             (h, c) = (torch.zeros(1, 1, hidden_dim), torch.zeros(1, 1, hidden_dim))
             output, (result, c) = builder(embeddings.view(len(word), 1, -1), (h, c))
     elif mode == 'GRU':
         if bi_direct:
             h = torch.zeros(2, 1, hidden_dim)
             output, result = builder(embeddings.view(len(word), 1, -1), h)
-        else;
+        else:
             h = torch.zeros(1, 1, hidden_dim)
             output, result = builder(embeddings.view(len(word), 1, -1), h)
 
diff --git a/main/src/main/python/pytorch/viterbiForwardLayer.py b/main/src/main/python/pytorch/viterbiForwardLayer.py
index 636b130c4..4b025293b 100644
--- a/main/src/main/python/pytorch/viterbiForwardLayer.py
+++ b/main/src/main/python/pytorch/viterbiForwardLayer.py
@@ -1,7 +1,7 @@
-from forwardLayer import *
-from utils import *
+from pytorch.forwardLayer import *
+from pytorch.utils import *
 
-class GreedyForwardLayer(ForwardLayer):
+class ViterbiForwardLayer(ForwardLayer):
     def __init__(self, inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans = None):
         super().__init__(inputSize, isDual, t2i, i2t, actualInputSize, nonlinearity, dropoutProb, spans)
 
diff --git a/main/src/main/python/run.py b/main/src/main/python/run.py
index c75532f0e..fc4e1385a 100644
--- a/main/src/main/python/run.py
+++ b/main/src/main/python/run.py
@@ -20,8 +20,8 @@
         modelName = args.model_file
 
         print (taskManager.debugTraversal())
-        mtl = Metal(taskManager, None, None)
-        # mtl.train(modelName)
+        mtl = Metal(taskManager, None)
+        mtl.train(modelName)
     elif args.test:
         pass
     elif args.shell:
diff --git a/main/src/main/python/sequences/columnReader.py b/main/src/main/python/sequences/columnReader.py
index 0f8c04610..e162316f7 100644
--- a/main/src/main/python/sequences/columnReader.py
+++ b/main/src/main/python/sequences/columnReader.py
@@ -44,4 +44,4 @@ def __init__(self, tokens):
     def get(self, idx):
         if(idx >= self.length):
             raise RuntimeError(f"ERROR: trying to read field #{idx}, which does not exist in this row: {tokens}!")
-        return tokens[idx]
+        return self.tokens[idx]
diff --git a/main/src/main/python/sequences/rowReaders.py b/main/src/main/python/sequences/rowReaders.py
index 0aa409756..58a15cb71 100644
--- a/main/src/main/python/sequences/rowReaders.py
+++ b/main/src/main/python/sequences/rowReaders.py
@@ -26,18 +26,18 @@ def __init__(self):
         self.LABEL_START_OFFSET = 3
 
     def toAnnotatedSentences(self, rows):
-        if (len(rows.head) == 2):
-            self.parseSimple(rows)
-        elif (len(rows.head) == 4):
-            self.parseSimpleExtended(rows)
-        elif (len(rows.head) >= 5):
-            self.parseFull(rows)
+        if (rows[0].length == 2):
+            return self.parseSimple(rows)
+        elif (rows[0].length == 4):
+            return self.parseSimpleExtended(rows)
+        elif (rows[0].length >= 5):
+            return self.parseFull(rows)
         else:
             raise RuntimeError("ERROR: the Metal format expects 2, 4, or 5+ columns!")
 
     # Parser for the simple format: word, label
-    def parseSimple(rows):
-        assert(len(rows.head) == 2)
+    def parseSimple(self, rows):
+        assert(rows[0].length == 2)
         words = list()
         labels = list()
 
@@ -45,11 +45,11 @@ def parseSimple(rows):
             words += [row.get(self.WORD_POSITION)]
             labels += [row.get(self.WORD_POSITION + 1)]
 
-        return AnnotatedSentence(words), labels
+        return [(AnnotatedSentence(words), labels)]
 
     # Parser for the simple extended format: word, POS tag, NE label, label
-    def parseSimpleExtended(rows):
-        assert(len(rows.head) == 4)
+    def parseSimpleExtended(self, rows):
+        assert(rows[0].length == 4)
         words = list()
         posTags = list()
         neLabels = list()
@@ -61,12 +61,12 @@ def parseSimpleExtended(rows):
             neLabels += [row.get(self.NE_LABEL_POSITION)]
             labels += [row.get(self.LABEL_START_OFFSET)]
 
-        return AnnotatedSentence(words), posTags, neLabels, labels
+        return [(AnnotatedSentence(words), posTags, neLabels, labels)]
 
     # Parser for the full format: word, POS tag, NE label, (label head)+
-    def parseFull(rows):
-        assert(len(rows.head) >= 5)
-        numSent = (len(rows.head) - 3) / 2
+    def parseFull(self, rows):
+        assert(rows[0].length >= 5)
+        numSent = (rows[0].length - 3) / 2
         assert(numSent >= 1)
 
         words = list()