From d238ac529ccc77ae16b2ff0dfbdef9d94f5f872e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 12 Jul 2023 14:41:56 +0900 Subject: [PATCH 01/24] add BERT+BidLSTM and BERT+BidLSTM+CRF base models --- delft/applications/grobidTagger.py | 4 +- delft/sequenceLabelling/models.py | 129 ++++++++++++++++++++++++++++- 2 files changed, 131 insertions(+), 2 deletions(-) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 3c29651b..cfd7f355 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -349,7 +349,9 @@ class Tasks: word_embeddings_examples = ['glove-840B', 'fasttext-crawl', 'word2vec'] architectures_transformers_based = [ - 'BERT', 'BERT_FEATURES', 'BERT_CRF', 'BERT_ChainCRF', 'BERT_CRF_FEATURES', 'BERT_ChainCRF_FEATURES', 'BERT_CRF_CHAR', 'BERT_CRF_CHAR_FEATURES' + 'BERT', 'BERT_FEATURES', 'BERT_CRF', 'BERT_ChainCRF', 'BERT_CRF_FEATURES', 'BERT_ChainCRF_FEATURES', + 'BERT_CRF_CHAR', 'BERT_CRF_CHAR_FEATURES', + 'BERT_BidLSTM', 'BERT_BidLSTM_CRF', 'BERT_BidLSTM_ChainCRF' ] architectures = architectures_word_embeddings + architectures_transformers_based diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index e54d34e7..f30f7f51 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -182,6 +182,30 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei load_pretrained_weights=load_pretrained_weights, local_path=local_path, preprocessor=preprocessor) + elif config.architecture == BERT_BidLSTM.name: + preprocessor.return_bert_embeddings = True + config.labels = preprocessor.vocab_tag + return BERT_BidLSTM(config, + ntags, + load_pretrained_weights=load_pretrained_weights, + local_path=local_path, + preprocessor=preprocessor) + elif config.architecture == BERT_BidLSTM_CRF.name: + preprocessor.return_bert_embeddings = True + config.labels = preprocessor.vocab_tag + return BERT_BidLSTM_CRF(config, + ntags, + load_pretrained_weights=load_pretrained_weights, + local_path=local_path, + preprocessor=preprocessor) + elif config.architecture == BERT_BidLSTM_ChainCRF.name: + preprocessor.return_bert_embeddings = True + config.labels = preprocessor.vocab_tag + return BERT_BidLSTM_ChainCRF(config, + ntags, + load_pretrained_weights=load_pretrained_weights, + local_path=local_path, + preprocessor=preprocessor) else: raise (OSError('Model name does exist: ' + config.architecture)) @@ -1026,7 +1050,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights=True, local_path: self.crf = ChainCRF() pred = self.crf(x) - self.model = Model(inputs=[input_ids_in, features_input, token_type_ids, attention_mask], outputs=[x]) + self.model = Model(inputs=[input_ids_in, features_input, token_type_ids, attention_mask], outputs=[pred]) self.config = config def get_generator(self): @@ -1158,3 +1182,106 @@ def __init__(self, config, ntags=None, load_pretrained_weights=True, local_path: def get_generator(self): return DataGeneratorTransformers + +class BERT_BidLSTM(BaseModel): + """ + """ + + name = 'BERT_BidLSTM' + + def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): + super().__init__(config, ntags, load_pretrained_weights, local_path) + + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + + input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') + token_type_ids = Input(shape=(None,), 
name='input_token_type', dtype='int32') + attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] + embedding_layer = Dropout(0.1)(embedding_layer) + + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + return_sequences=True, + recurrent_dropout=config.recurrent_dropout))(embedding_layer) + bid_lstm = Dropout(config.dropout)(bid_lstm) + + label_logits = Dense(ntags, activation='softmax')(bid_lstm) + + self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[label_logits]) + self.config = config + + def get_generator(self): + return DataGeneratorTransformers + + +class BERT_BidLSTM_CRF(BaseModel): + """ + + """ + + name = 'BERT_BidLSTM_CRF' + + def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): + super().__init__(config, ntags, load_pretrained_weights, local_path) + + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + + input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') + token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') + attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] + embedding_layer = Dropout(0.1)(embedding_layer) + + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + return_sequences=True, + recurrent_dropout=config.recurrent_dropout))(embedding_layer) + bid_lstm = Dropout(config.dropout)(bid_lstm) + + base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) + + self.model = CRFModelWrapperForBERT(base_model, ntags) + self.model.build(input_shape=[(None, None, ), (None, None, ), (None, None, )]) + self.config = config + + def get_generator(self): + return DataGeneratorTransformers + + +class BERT_BidLSTM_ChainCRF(BaseModel): + """ + + """ + + name = 'BERT_BidLSTM_ChainCRF' + + def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): + super().__init__(config, ntags, load_pretrained_weights, local_path) + + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + + input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') + token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') + attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] + embedding_layer = Dropout(0.1)(embedding_layer) + + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + return_sequences=True, + recurrent_dropout=config.recurrent_dropout))(embedding_layer) + bid_lstm = Dropout(config.dropout)(bid_lstm) + + + self.crf = ChainCRF() + pred = self.crf(bid_lstm) + + self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[pred]) + self.config = config + + def get_generator(self): + 
return DataGeneratorTransformers \ No newline at end of file From f634d24a583c9384b7fa0c1aa666cedf68abfcad Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 12 Jul 2023 16:47:52 +0900 Subject: [PATCH 02/24] typo --- delft/utilities/Transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/utilities/Transformer.py b/delft/utilities/Transformer.py index ab1c9522..77ed01ec 100644 --- a/delft/utilities/Transformer.py +++ b/delft/utilities/Transformer.py @@ -148,7 +148,7 @@ def save_tokenizer(self, output_directory): def instantiate_layer(self, load_pretrained_weights=True) -> Union[object, TFAutoModel, TFBertModel]: """ - Instanciate a transformer to be loaded in a Keras layer using the availability method of the pre-trained transformer. + Instantiate a transformer to be loaded in a Keras layer using the availability method of the pre-trained transformer. """ if self.loading_method == LOADING_METHOD_HUGGINGFACE_NAME: if load_pretrained_weights: From 19493c5cc792992b1442e53c8033f336312f724f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 12 Jul 2023 16:48:19 +0900 Subject: [PATCH 03/24] Fix LSTM size, add Dense layer for ChainCRF --- delft/sequenceLabelling/models.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index f30f7f51..2288ce4c 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1202,7 +1202,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) @@ -1236,10 +1236,11 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) + bid_lstm = Dense(embedding_layer.shape[-1], activation='tanh')(bid_lstm) base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) @@ -1247,6 +1248,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc self.model.build(input_shape=[(None, None, ), (None, None, ), (None, None, )]) self.config = config + def get_generator(self): return DataGeneratorTransformers @@ -1271,11 +1273,12 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) - + bid_lstm = Dense(embedding_layer.shape[-1], 
activation='tanh')(bid_lstm) + bid_lstm = Dense(ntags)(bid_lstm) self.crf = ChainCRF() pred = self.crf(bid_lstm) From fcb764ab11f582d73dc97336918076f7d6c20107 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 12 Jul 2023 16:52:56 +0900 Subject: [PATCH 04/24] add crf related flags in configuration --- delft/sequenceLabelling/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 2288ce4c..fd3e23af 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -193,6 +193,7 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei elif config.architecture == BERT_BidLSTM_CRF.name: preprocessor.return_bert_embeddings = True config.labels = preprocessor.vocab_tag + config.use_crf = True return BERT_BidLSTM_CRF(config, ntags, load_pretrained_weights=load_pretrained_weights, @@ -201,6 +202,8 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei elif config.architecture == BERT_BidLSTM_ChainCRF.name: preprocessor.return_bert_embeddings = True config.labels = preprocessor.vocab_tag + config.use_crf = True + config.use_chain_crf = True return BERT_BidLSTM_ChainCRF(config, ntags, load_pretrained_weights=load_pretrained_weights, From 536e0de14d4fe7c0312c0a82bbbbe9923e7e8892 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 13 Jul 2023 19:08:48 +0900 Subject: [PATCH 05/24] reduce the size of the LSTM to try avoiding OOM --- delft/sequenceLabelling/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index fd3e23af..a631d783 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1205,7 +1205,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) @@ -1239,7 +1239,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) @@ -1276,7 +1276,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] embedding_layer = Dropout(0.1)(embedding_layer) - bid_lstm = Bidirectional(LSTM(units=embedding_layer.shape[-1], + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], return_sequences=True, recurrent_dropout=config.recurrent_dropout))(embedding_layer) bid_lstm = Dropout(config.dropout)(bid_lstm) From 78c8054f00ef4831ba4605f4125b5cfe56367074 Mon Sep 17 00:00:00 2001 From: Luca 
Foppiano Date: Thu, 10 Aug 2023 08:32:41 +0900 Subject: [PATCH 06/24] freze bert and concatenate embeddings --- delft/sequenceLabelling/models.py | 50 +++++++++++++++++------ delft/utilities/Embeddings.py | 66 +++++++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 16 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index a631d783..0c9da970 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1187,11 +1187,41 @@ def get_generator(self): return DataGeneratorTransformers class BERT_BidLSTM(BaseModel): - """ - """ name = 'BERT_BidLSTM' + def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): + super().__init__(config, ntags, load_pretrained_weights, local_path) + + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + + input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') + token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') + attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + embedding_layers = transformer_layers(input_ids_in, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + training=False)[-4:] + concatenated_embeddings = Concatenate([layer for layer in embedding_layers]) + + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + return_sequences=True, + recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) + bid_lstm = Dropout(config.dropout)(bid_lstm) + + label_logits = Dense(ntags, activation='softmax')(bid_lstm) + + self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[label_logits]) + self.config = config + + def get_generator(self): + return DataGeneratorTransformers + + +class BidLSTM_BERT(BaseModel): + name = 'BidLSTM_BERT' + def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) @@ -1220,10 +1250,6 @@ def get_generator(self): class BERT_BidLSTM_CRF(BaseModel): - """ - - """ - name = 'BERT_BidLSTM_CRF' def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): @@ -1235,15 +1261,13 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') - #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] - embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] - embedding_layer = Dropout(0.1)(embedding_layer) - - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], + concatenated_embeddings = Concatenate()([layer for layer in transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask, + training=False)[-4:]]) + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(embedding_layer) + recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) bid_lstm = Dropout(config.dropout)(bid_lstm) - bid_lstm = Dense(embedding_layer.shape[-1], activation='tanh')(bid_lstm) + bid_lstm = 
Dense(concatenated_embeddings.shape[-1], activation='tanh')(bid_lstm) base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) diff --git a/delft/utilities/Embeddings.py b/delft/utilities/Embeddings.py index 7ad7bd92..cff4694d 100644 --- a/delft/utilities/Embeddings.py +++ b/delft/utilities/Embeddings.py @@ -18,6 +18,9 @@ from tqdm import tqdm from pathlib import Path +from delft.sequenceLabelling.config import ModelConfig +from delft.sequenceLabelling.preprocess import BERTPreprocessor, Preprocessor +from delft.utilities.Transformer import Transformer from delft.utilities.simple_elmo import ElmoModel, elmo logging.basicConfig() @@ -81,7 +84,7 @@ def __init__(self, name, # below init for using ELMo embeddings self.use_ELMo = use_ELMo self.elmo_model_name = elmo_model_name - if elmo_model_name == None: + if elmo_model_name is None: self.elmo_model_name = 'elmo-'+self.lang if use_ELMo: #tf.compat.v1.disable_eager_execution() @@ -479,12 +482,12 @@ def get_elmo_embedding_path(self, description): destination_dir = os.path.join("data/models/ELMo", self.elmo_model_name) if not os.path.exists(destination_dir): os.makedirs(destination_dir) - try: + try: shutil.move(embeddings_path, destination_file) weights_file = destination_file except OSError: print ("Copy of ELMo weights file to ELMo directory path", destination_file, "failed") - + if "url_weights" not in description or description["url_weights"] == None or len(description["url_weights"]) == 0: print("no download url available for this ELMo model weights embeddings resource, please review the embedding registry for", name) print("ELMo weights used:", weights_file) @@ -761,3 +764,60 @@ def load_resource_registry(path='delft/resources-registry.json'): """ registry_json = open(path).read() return json.loads(registry_json) + +class ContextualizedEmbeddings(Embeddings): + + def __init__(self, transformer_name: str, registry: dict, max_sequence_length: int): + + super().__init__(transformer_name, use_cache=False, use_ELMo=False, resource_registry=registry, load=False) + self.embed_size = 768 + self.transformer = Transformer(transformer_name, registry) + self.model = self.transformer.instantiate_layer(load_pretrained_weights=True, output_hidden_states=True) + self.transformer_config = self.transformer.transformer_config + self.transformer.init_preprocessor(max_sequence_length=max_sequence_length) + self.preprocessor = BERTPreprocessor(self.transformer.tokenizer) + # self.transformer.tokenizer.empty_features_vector()) + # preprocessor.empty_char_vector()) + + + # def get_sentence_vectors(self, token_list): + # token_vecs = hidden_states[-2][0] + # + # # Calculate the average of all 22 token vectors. 
+ # sentence_embedding = torch.mean(token_vecs, dim=0) + # + + def get_sentence_vector(self, text_tokens): + (target_ids, target_type_ids, target_attention_mask, target_chars, + target_features, target_labels, input_tokens) = self.preprocessor.tokenize_and_align_features_and_labels(text_tokens) + + self.model.eval() + # segments_ids = [1] * len(target_ids) + outputs = self.model(target_ids, target_type_ids) + hidden_states = outputs[2] + + # tokens, batches, vector size + token_embeddings = tf.stack(hidden_states, axis=0) + + # layers, tokens, batches, vector size + token_embeddings = tf.squeeze(token_embeddings, axis=1) + + # layers, tokens, vector size + token_embeddings = tf.transpose(token_embeddings, perm=[1, 0, 2]) + + # layers, tokens, vector size + token_vecs_cat = [] + + for token in token_embeddings: + cat_vec = tf.concat((token[-1], token[-2], token[-3], token[-4]), dim=0) + token_vecs_cat.append(cat_vec) + + # Sum + # sum_vec = torch.sum(token[-4:], dim=0) + # token_vecs_sum.append(sum_vec) + + print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0]))) + + return token_vecs_cat + + From 3cd4810e978e0d59ed954ec250fc7e484a80b05b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 08:59:11 +0900 Subject: [PATCH 07/24] freze bert layer --- delft/resources-registry.json | 20 +++++++++++++++----- delft/sequenceLabelling/models.py | 18 +++++++++++------- delft/utilities/Transformer.py | 10 +++++++--- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/delft/resources-registry.json b/delft/resources-registry.json index 0cb5596d..d45b55be 100644 --- a/delft/resources-registry.json +++ b/delft/resources-registry.json @@ -4,7 +4,7 @@ "embeddings": [ { "name": "glove-840B", - "path": "/media/lopez/T5/embeddings/glove.840B.300d.txt", + "path": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/glove/glove.840B.300d.txt", "type": "glove", "format": "vec", "lang": "en", @@ -77,10 +77,18 @@ ], "transformers": [ { - "name": "dmis-lab/biobert-base-cased-v1.2", - "path-config": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/bert_config.json", - "path-weights": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/model.ckpt-1000000", - "path-vocab": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/vocab.txt", + "name": "allenai/scibert_scivocab_cased/dir", + "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/scibert/scibert_scivocab_cased_hf", + "lang": "en" + }, + { + "name": "portiz/matbert", + "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matbert", + "lang": "en" + }, + { + "name": "m3rg-iitd/matscibert/dir", + "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matscibert", "lang": "en" } ], @@ -89,6 +97,8 @@ "name": "elmo-en", "path-config": "/media/lopez/T51/embeddings/elmo_2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", "path_weights": "/media/lopez/T51/embeddings/elmo_2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", + "path-config_": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", + "path_weights_": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", "path-vocab": "data/models/ELMo/en/vocab.txt", "path-cache": "data/models/ELMo/en/", "cache-training": true, diff --git 
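These commits freeze the BERT layer and concatenate its last four hidden states before the BiLSTM. A minimal, self-contained sketch of that idea follows (this is not the DeLFT code itself; the model name, dropout rate and LSTM size are illustrative assumptions):

import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Bidirectional, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from transformers import TFAutoModel

def build_frozen_bert_bidlstm(model_name="bert-base-cased", ntags=10, lstm_units=100):
    # ask the transformer to return every layer's hidden state
    transformer = TFAutoModel.from_pretrained(model_name, output_hidden_states=True)
    transformer.trainable = False  # freeze all transformer weights

    input_ids = Input(shape=(None,), dtype="int32", name="input_ids")
    token_type_ids = Input(shape=(None,), dtype="int32", name="token_type_ids")
    attention_mask = Input(shape=(None,), dtype="int32", name="attention_mask")

    outputs = transformer(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    # hidden_states is a tuple of (num_layers + 1) tensors of shape (batch, seq_len, hidden_size);
    # keep the last four and concatenate them along the feature axis
    x = Concatenate()(list(outputs.hidden_states[-4:]))
    x = Dropout(0.5)(x)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Dense(ntags, activation="softmax")(x)
    return Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=x)

With the transformer frozen, only the BiLSTM and the classification head are trained, which is in line with the memory concerns addressed a few commits earlier by reducing the LSTM size.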
a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 0c9da970..0c56905d 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -276,10 +276,12 @@ def print_summary(self): def init_transformer(self, config: ModelConfig, load_pretrained_weights: bool, local_path: str, - preprocessor: Preprocessor): + preprocessor: Preprocessor, + output_hidden_states=False): transformer = Transformer(config.transformer_name, resource_registry=self.registry, delft_local_path=local_path) print(config.transformer_name, "will be used, loaded via", transformer.loading_method) - transformer_model = transformer.instantiate_layer(load_pretrained_weights=load_pretrained_weights) + transformer_model = transformer.instantiate_layer(load_pretrained_weights=load_pretrained_weights, + output_hidden_states=output_hidden_states) self.transformer_config = transformer.transformer_config transformer.init_preprocessor(max_sequence_length=config.max_sequence_length) @@ -1193,17 +1195,19 @@ class BERT_BidLSTM(BaseModel): def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) - + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, + output_hidden_states=True) + transformer_layers.bert.trainable=False input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') - embedding_layers = transformer_layers(input_ids_in, + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask, - training=False)[-4:] - concatenated_embeddings = Concatenate([layer for layer in embedding_layers]) + training=False) + last_hidden_states = embedding_layer.hidden_states[-4:] + concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, diff --git a/delft/utilities/Transformer.py b/delft/utilities/Transformer.py index 77ed01ec..9a5dd575 100644 --- a/delft/utilities/Transformer.py +++ b/delft/utilities/Transformer.py @@ -146,13 +146,15 @@ def init_preprocessor(self, max_sequence_length: int, def save_tokenizer(self, output_directory): self.tokenizer.save_pretrained(output_directory) - def instantiate_layer(self, load_pretrained_weights=True) -> Union[object, TFAutoModel, TFBertModel]: + def instantiate_layer(self, load_pretrained_weights=True, output_hidden_states=False) -> Union[object, TFAutoModel, TFBertModel]: """ Instantiate a transformer to be loaded in a Keras layer using the availability method of the pre-trained transformer. 
""" if self.loading_method == LOADING_METHOD_HUGGINGFACE_NAME: if load_pretrained_weights: - transformer_model = TFAutoModel.from_pretrained(self.name, from_pt=True) + transformer_model = TFAutoModel.from_pretrained(self.name, + from_pt=True, + output_hidden_states=output_hidden_states) self.transformer_config = transformer_model.config return transformer_model else: @@ -162,7 +164,9 @@ def instantiate_layer(self, load_pretrained_weights=True) -> Union[object, TFAut elif self.loading_method == LOADING_METHOD_LOCAL_MODEL_DIR: if load_pretrained_weights: - transformer_model = TFAutoModel.from_pretrained(self.local_dir_path, from_pt=True) + transformer_model = TFAutoModel.from_pretrained(self.local_dir_path, + from_pt=True, + output_hidden_states=output_hidden_states) self.transformer_config = transformer_model.config return transformer_model else: From 40f8648c15e1f24b6fe8550d8aace4b945384e4b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 09:04:43 +0900 Subject: [PATCH 08/24] add frozen bert to bert_lstm_crf --- delft/sequenceLabelling/models.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 0c56905d..cfd2dc7c 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1197,15 +1197,15 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, output_hidden_states=True) - transformer_layers.bert.trainable=False + transformer_layers.bert.trainable = False + input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, - attention_mask=attention_mask, - training=False) + attention_mask=attention_mask) last_hidden_states = embedding_layer.hidden_states[-4:] concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) @@ -1259,19 +1259,26 @@ class BERT_BidLSTM_CRF(BaseModel): def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, + output_hidden_states=True) + transformer_layers.bert.trainable = False input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') - concatenated_embeddings = Concatenate()([layer for layer in transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask, - training=False)[-4:]]) + embedding_layer = transformer_layers(input_ids_in, + token_type_ids=token_type_ids, + attention_mask=attention_mask) + + last_hidden_states = embedding_layer.hidden_states[-4:] + concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, 
recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) bid_lstm = Dropout(config.dropout)(bid_lstm) - bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='tanh')(bid_lstm) + bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='softmax')(bid_lstm) base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) From 86cddbf69e0b818c8d4d1d4552b762efd7a666e7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 18:05:32 +0900 Subject: [PATCH 09/24] update bert_bidlstm_chaincrf --- delft/sequenceLabelling/models.py | 82 +++++++++++++++++-------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index cfd2dc7c..437e7a34 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -190,6 +190,14 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei load_pretrained_weights=load_pretrained_weights, local_path=local_path, preprocessor=preprocessor) + # elif config.architecture == BidLSTM_BERT.name: + # preprocessor.return_bert_embeddings = True + # config.labels = preprocessor.vocab_tag + # return BidLSTM_BERT(config, + # ntags, + # load_pretrained_weights=load_pretrained_weights, + # local_path=local_path, + # preprocessor=preprocessor) elif config.architecture == BERT_BidLSTM_CRF.name: preprocessor.return_bert_embeddings = True config.labels = preprocessor.vocab_tag @@ -1223,34 +1231,29 @@ def get_generator(self): return DataGeneratorTransformers -class BidLSTM_BERT(BaseModel): - name = 'BidLSTM_BERT' - - def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): - super().__init__(config, ntags, load_pretrained_weights, local_path) - - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) - - input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') - token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') - attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') - - #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] - embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] - embedding_layer = Dropout(0.1)(embedding_layer) - - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], - return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(embedding_layer) - bid_lstm = Dropout(config.dropout)(bid_lstm) - - label_logits = Dense(ntags, activation='softmax')(bid_lstm) - - self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[label_logits]) - self.config = config - - def get_generator(self): - return DataGeneratorTransformers +# class BidLSTM_BERT(BaseModel): +# name = 'BidLSTM_BERT' +# +# def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): +# super().__init__(config, ntags, load_pretrained_weights, local_path, preprocessor=preprocessor) +# +# # build input, directly feed with word embedding by the data generator +# word_input = Input(shape=(None, config.word_embedding_size), name='word_input') +# length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input') +# +# x = Dropout(config.dropout)(word_input) +# +# x = 
Bidirectional(LSTM(units=config.num_word_lstm_units, +# return_sequences=True, +# recurrent_dropout=config.recurrent_dropout))(x) +# x = Dropout(config.dropout)(x) +# x = Dense(config.num_word_lstm_units, activation='tanh')(x) +# +# self.model = Model(inputs=[word_input, length_input], outputs=[x]) +# self.config = config +# +# def get_generator(self): +# return DataGenerator class BERT_BidLSTM_CRF(BaseModel): @@ -1279,6 +1282,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) bid_lstm = Dropout(config.dropout)(bid_lstm) bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='softmax')(bid_lstm) + bid_lstm = Dropout(config.dropout)(bid_lstm) base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) @@ -1292,30 +1296,32 @@ def get_generator(self): class BERT_BidLSTM_ChainCRF(BaseModel): - """ - - """ name = 'BERT_BidLSTM_ChainCRF' def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor) + transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, + output_hidden_states=True) + transformer_layers.bert.trainable = False input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') - #embedding_layer = transformer_model(input_ids_in, token_type_ids=token_type_ids)[0] - embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] - embedding_layer = Dropout(0.1)(embedding_layer) + embedding_layer = transformer_layers(input_ids_in, + token_type_ids=token_type_ids, + attention_mask=attention_mask) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, #embedding_layer.shape[-1], + last_hidden_states = embedding_layer.hidden_states[-4:] + concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) + + bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(embedding_layer) + recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) bid_lstm = Dropout(config.dropout)(bid_lstm) - bid_lstm = Dense(embedding_layer.shape[-1], activation='tanh')(bid_lstm) + bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='softmax')(bid_lstm) bid_lstm = Dense(ntags)(bid_lstm) self.crf = ChainCRF() From f12b8b3fd88f3536b50e9674ea704121d08178c1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 18:05:42 +0900 Subject: [PATCH 10/24] update tensorflow addons --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7ccd20e6..b0ed251f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,4 @@ pandas==1.3.5 transformers==4.25.1 torch==1.10.1 pytest -tensorflow-addons==0.19.0 +tensorflow-addons==0.21.0 From 33eed9b343c0281909127ccac900f6652d2636b3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 18:14:54 +0900 Subject: [PATCH 11/24] reverted wrong committed file --- delft/resources-registry.json | 20 +++++--------------- 1 file changed, 
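The CRF-topped variants are trained through a CRF wrapper, and the tensorflow-addons dependency bumped just above provides CRF primitives such as crf_log_likelihood. As an illustration of the principle only (not the DeLFT CRFModelWrapperForBERT or ChainCRF code; the crf_nll helper and the zero-initialised transition matrix are assumptions for the sketch), the CRF negative log-likelihood over per-token logits can be computed as:

import tensorflow as tf
import tensorflow_addons as tfa

def crf_nll(logits, tag_ids, sequence_lengths, transition_params):
    # logits: (batch, seq_len, ntags), tag_ids: (batch, seq_len),
    # sequence_lengths: (batch,), transition_params: (ntags, ntags)
    log_likelihood, _ = tfa.text.crf_log_likelihood(
        logits, tag_ids, sequence_lengths, transition_params)
    return -tf.reduce_mean(log_likelihood)

# the transition matrix is a trainable variable learned jointly with the network, e.g.
# transitions = tf.Variable(tf.zeros((ntags, ntags)), name="crf_transitions")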
5 insertions(+), 15 deletions(-) diff --git a/delft/resources-registry.json b/delft/resources-registry.json index d45b55be..0cb5596d 100644 --- a/delft/resources-registry.json +++ b/delft/resources-registry.json @@ -4,7 +4,7 @@ "embeddings": [ { "name": "glove-840B", - "path": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/glove/glove.840B.300d.txt", + "path": "/media/lopez/T5/embeddings/glove.840B.300d.txt", "type": "glove", "format": "vec", "lang": "en", @@ -77,18 +77,10 @@ ], "transformers": [ { - "name": "allenai/scibert_scivocab_cased/dir", - "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/scibert/scibert_scivocab_cased_hf", - "lang": "en" - }, - { - "name": "portiz/matbert", - "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matbert", - "lang": "en" - }, - { - "name": "m3rg-iitd/matscibert/dir", - "model_dir": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matscibert", + "name": "dmis-lab/biobert-base-cased-v1.2", + "path-config": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/bert_config.json", + "path-weights": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/model.ckpt-1000000", + "path-vocab": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/vocab.txt", "lang": "en" } ], @@ -97,8 +89,6 @@ "name": "elmo-en", "path-config": "/media/lopez/T51/embeddings/elmo_2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", "path_weights": "/media/lopez/T51/embeddings/elmo_2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", - "path-config_": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", - "path_weights_": "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", "path-vocab": "data/models/ELMo/en/vocab.txt", "path-cache": "data/models/ELMo/en/", "cache-training": true, From 22240ad736a6178f3d771613a00682db239f25c8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 11 Aug 2023 10:39:35 +0900 Subject: [PATCH 12/24] hacky solution for selecting the default config values --- delft/applications/grobidTagger.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index ee5bb0c9..b8f83d2d 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -13,6 +13,20 @@ MODEL_LIST = ['affiliation-address', 'citation', 'date', 'header', 'name-citation', 'name-header', 'software', 'figure', 'table', 'reference-segmenter'] +# config = { +# "architectures": { +# "BERT.*": +# { +# "citation": { +# "max_sequence_length": 200, +# "batch_size": 20 +# } +# }, +# "BERT_BidLSTM.*": { +# +# } +# } + def configure(model, architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1): """ @@ -26,7 +40,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat multiprocessing = True early_stop = True - if architecture and "BERT" in architecture: + if architecture and "BERT" in architecture and "BidLSTM" not in architecture: # architectures with some transformer layer/embeddings inside # non-default settings per model From 51a0d5623ca8bf802a6ae228654dee4133f94fe7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 11 Aug 2023 10:47:38 
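The "hacky solution" above amounts to routing configuration defaults on the architecture name: plain fine-tuned transformers get small batches and a low learning rate, while the frozen-BERT + BidLSTM variants keep the RNN-style defaults. A compact sketch of the predicate and of the learning-rate choice used in the following commit (only is_transformer_architecture comes from the patch; the default_learning_rate helper and its placement are illustrative):

def is_transformer_architecture(architecture):
    # BERT_BidLSTM* architectures embed a frozen transformer, so they are trained
    # with RNN-style settings rather than as a fine-tuned transformer
    return architecture and "BERT" in architecture and "BidLSTM" not in architecture

def default_learning_rate(architecture):
    # values taken from wrapper.py: 2e-5 for fine-tuned transformers, 0.001 otherwise
    return 2e-5 if is_transformer_architecture(architecture) else 0.001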
+0900 Subject: [PATCH 13/24] fix the learning rate for the hacky solution --- delft/applications/grobidTagger.py | 3 ++- delft/sequenceLabelling/wrapper.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index b8f83d2d..23643261 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -8,6 +8,7 @@ from delft.sequenceLabelling import Sequence from delft.sequenceLabelling.reader import load_data_and_labels_crf_file +from delft.sequenceLabelling.wrapper import is_transformer_architecture from delft.utilities.Utilities import longest_row MODEL_LIST = ['affiliation-address', 'citation', 'date', 'header', 'name-citation', 'name-header', 'software', 'figure', 'table', 'reference-segmenter'] @@ -40,7 +41,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat multiprocessing = True early_stop = True - if architecture and "BERT" in architecture and "BidLSTM" not in architecture: + if is_transformer_architecture(architecture): # architectures with some transformer layer/embeddings inside # non-default settings per model diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 29e09111..153bd5e5 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -54,6 +54,11 @@ import transformers transformers.logging.set_verbosity(transformers.logging.ERROR) + +def is_transformer_architecture(architecture): + return architecture and "BERT" in architecture and "BidLSTM" not in architecture + + class Sequence(object): # number of parallel worker for the data generator @@ -114,7 +119,7 @@ def __init__(self, word_emb_size = 0 if learning_rate is None: - if transformer_name is None: + if is_transformer_architecture(architecture) is False: learning_rate = 0.001 else: learning_rate = 2e-5 From 359446c9d007ac98a2dd1a8f6cf4d89b5df81052 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 11 Aug 2023 20:13:12 +0900 Subject: [PATCH 14/24] add examples superconductors --- delft/applications/grobidTagger.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 23643261..fd489af3 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -502,6 +502,10 @@ class Tasks: someTexts.append("Wilcoxon signed-ranks tests were performed to calculate statistical significance of comparisons between alignment programs, which include ProbCons (version 1.10) (23), MAFFT (version 5.667) (11) with several options, MUSCLE (version 3.52) (10) and ClustalW (version 1.83) (7).") someTexts.append("All statistical analyses were done using computer software Prism 6 for Windows (version 6.02; GraphPad Software, San Diego, CA, USA). One-Way ANOVA was used to detect differences amongst the groups. To account for the non-normal distribution of the data, all data were sorted by rank status prior to ANOVA statistical analysis. ") someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).") + elif model == 'superconductors': + someTexts.append("We are studying the material La 3 A 2 Ge 2 (A = Ir, Rh). 
The critical temperature T C = 4.7 K discovered for La 3 Ir 2 Ge 2 in this work is by about 1.2 K higher than that found for La 3 Rh 2 Ge 2.") + someTexts.append("In just a few months, the superconducting transition temperature (Tc) was increased to 55 K in the electron-doped system, as well as 25 K in hole-doped La1−x SrxOFeAs compound. Soon after, single crystals of LnFeAs(O1−x Fx) (Ln = Pr, Nd, Sm) were grown successfully by the NaCl/KCl flux method, though the sub-millimeter sizes limit the experimental studies on them. Therefore, FeAs-based single crystals with high crystalline quality, homogeneity and large sizes are highly desired for precise measurements of the properties. Very recently, the BaFe2As2 compound in a tetragonal ThCr2Si2-type structure with infinite Fe–As layers was reported. By replacing the alkaline earth elements (Ba and Sr) with alkali elements (Na, K, and Cs), superconductivity up to 38 K was discovered both in hole-doped and electron-doped samples. Tc leties from 2.7 K in CsFe2As2 to 38 K in A1−xKxFe2As2 (A = Ba, Sr). Meanwhile, superconductivity could also be induced in the parent phase by high pressure or by replacing some of the Fe by Co. More excitingly, large single crystals could be obtained by the Sn flux method in this family to study the rather low melting temperature and the intermetallic characteristics.") + if architecture.find("FEATURE") == -1: result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo) From 3e07c97cb096d609ad772d00cb61f3b777290c5d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 11 Aug 2023 20:13:31 +0900 Subject: [PATCH 15/24] use the same method everywhere to know if a model is using transformers --- delft/applications/datasetTagger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index c414b62a..ec700dd0 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -8,6 +8,7 @@ from delft.sequenceLabelling import Sequence from delft.sequenceLabelling.reader import load_data_and_labels_json_offsets +from delft.sequenceLabelling.wrapper import is_transformer_architecture from delft.utilities.misc import parse_number_ranges def configure(architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None, @@ -20,7 +21,7 @@ def configure(architecture, output_path=None, max_sequence_length=-1, batch_size multiprocessing = True early_stop = True - if "BERT" in architecture: + if is_transformer_architecture(architecture): # architectures with some transformer layer/embeddings inside if batch_size == -1: #default From fb509d1efa6a6fd0edcdc594095d8b136a441e6a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 11 Aug 2023 20:16:30 +0900 Subject: [PATCH 16/24] tag startwith --- delft/applications/grobidTagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index fd489af3..64e3be13 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -502,7 +502,7 @@ class Tasks: someTexts.append("Wilcoxon signed-ranks tests were performed to calculate statistical significance of comparisons between alignment programs, which include ProbCons (version 1.10) (23), MAFFT (version 5.667) (11) with several options, MUSCLE (version 3.52) (10) and ClustalW (version 1.83) (7).") someTexts.append("All statistical analyses were done using 
computer software Prism 6 for Windows (version 6.02; GraphPad Software, San Diego, CA, USA). One-Way ANOVA was used to detect differences amongst the groups. To account for the non-normal distribution of the data, all data were sorted by rank status prior to ANOVA statistical analysis. ") someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).") - elif model == 'superconductors': + elif model.startswith('superconductors'): someTexts.append("We are studying the material La 3 A 2 Ge 2 (A = Ir, Rh). The critical temperature T C = 4.7 K discovered for La 3 Ir 2 Ge 2 in this work is by about 1.2 K higher than that found for La 3 Rh 2 Ge 2.") someTexts.append("In just a few months, the superconducting transition temperature (Tc) was increased to 55 K in the electron-doped system, as well as 25 K in hole-doped La1−x SrxOFeAs compound. Soon after, single crystals of LnFeAs(O1−x Fx) (Ln = Pr, Nd, Sm) were grown successfully by the NaCl/KCl flux method, though the sub-millimeter sizes limit the experimental studies on them. Therefore, FeAs-based single crystals with high crystalline quality, homogeneity and large sizes are highly desired for precise measurements of the properties. Very recently, the BaFe2As2 compound in a tetragonal ThCr2Si2-type structure with infinite Fe–As layers was reported. By replacing the alkaline earth elements (Ba and Sr) with alkali elements (Na, K, and Cs), superconductivity up to 38 K was discovered both in hole-doped and electron-doped samples. Tc leties from 2.7 K in CsFe2As2 to 38 K in A1−xKxFe2As2 (A = Ba, Sr). Meanwhile, superconductivity could also be induced in the parent phase by high pressure or by replacing some of the Fe by Co. More excitingly, large single crystals could be obtained by the Sn flux method in this family to study the rather low melting temperature and the intermetallic characteristics.") From 679b9a49a98c52ca452a69c41524fea90a9be45b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 15 Aug 2023 19:49:47 +0900 Subject: [PATCH 17/24] remove unused fields --- delft/sequenceLabelling/preprocess.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 2f33e521..8c3fea32 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -590,7 +590,6 @@ def __init__(self, return_casing=False, return_features=False, return_chars=False, - return_bert_embeddings=False, max_char_length=30, feature_preprocessor: FeaturesPreprocessor = None, ): @@ -601,7 +600,6 @@ def __init__(self, self.return_casing = return_casing self.return_features = return_features self.return_chars = return_chars - self.return_bert_embeddings = return_bert_embeddings self.vocab_char = None self.vocab_tag = None self.vocab_case = [k for k, v in case_index.items()] From 8976db3b0c3b97160e322c29b18d2ed868215eb1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 15 Aug 2023 19:49:59 +0900 Subject: [PATCH 18/24] add character embedding channel --- delft/sequenceLabelling/models.py | 166 ++++++++++++++++-------------- 1 file changed, 91 insertions(+), 75 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 437e7a34..a4d5c384 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -98,7 +98,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei return BidLSTM_CRF_CASING(config, ntags) elif 
config.architecture == BERT.name: - preprocessor.return_bert_embeddings = True config.labels = preprocessor.vocab_tag return BERT(config, ntags, @@ -107,7 +106,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_FEATURES.name: - preprocessor.return_bert_embeddings = True preprocessor.return_features = True config.labels = preprocessor.vocab_tag return BERT_FEATURES(config, @@ -117,7 +115,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_CRF.name: - preprocessor.return_bert_embeddings = True config.use_crf = True config.labels = preprocessor.vocab_tag return BERT_CRF(config, @@ -127,7 +124,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_ChainCRF.name: - preprocessor.return_bert_embeddings = True config.use_crf = True config.use_chain_crf = True config.labels = preprocessor.vocab_tag @@ -138,7 +134,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_CRF_FEATURES.name: - preprocessor.return_bert_embeddings = True preprocessor.return_features = True config.use_crf = True config.labels = preprocessor.vocab_tag @@ -149,7 +144,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_ChainCRF_FEATURES.name: - preprocessor.return_bert_embeddings = True preprocessor.return_features = True config.use_crf = True config.use_chain_crf = True @@ -161,7 +155,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_CRF_CHAR.name: - preprocessor.return_bert_embeddings = True preprocessor.return_chars = True config.use_crf = True config.labels = preprocessor.vocab_tag @@ -172,7 +165,6 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_CRF_CHAR_FEATURES.name: - preprocessor.return_bert_embeddings = True preprocessor.return_features = True preprocessor.return_chars = True config.use_crf = True @@ -183,35 +175,30 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei local_path=local_path, preprocessor=preprocessor) elif config.architecture == BERT_BidLSTM.name: - preprocessor.return_bert_embeddings = True + preprocessor.return_word_embeddings = False + preprocessor.return_chars = True config.labels = preprocessor.vocab_tag return BERT_BidLSTM(config, ntags, load_pretrained_weights=load_pretrained_weights, local_path=local_path, preprocessor=preprocessor) - # elif config.architecture == BidLSTM_BERT.name: - # preprocessor.return_bert_embeddings = True - # config.labels = preprocessor.vocab_tag - # return BidLSTM_BERT(config, - # ntags, - # load_pretrained_weights=load_pretrained_weights, - # local_path=local_path, - # preprocessor=preprocessor) elif config.architecture == BERT_BidLSTM_CRF.name: - preprocessor.return_bert_embeddings = True - config.labels = preprocessor.vocab_tag + preprocessor.return_word_embeddings = False + preprocessor.return_chars = True config.use_crf = True + config.labels = preprocessor.vocab_tag return BERT_BidLSTM_CRF(config, ntags, load_pretrained_weights=load_pretrained_weights, local_path=local_path, 
preprocessor=preprocessor) elif config.architecture == BERT_BidLSTM_ChainCRF.name: - preprocessor.return_bert_embeddings = True - config.labels = preprocessor.vocab_tag + preprocessor.return_word_embeddings = False + preprocessor.return_chars = True config.use_crf = True config.use_chain_crf = True + config.labels = preprocessor.vocab_tag return BERT_BidLSTM_ChainCRF(config, ntags, load_pretrained_weights=load_pretrained_weights, @@ -387,7 +374,6 @@ def __init__(self, config, ntags=None): self.model = CRFModelWrapperDefault(base_model, ntags) self.model.build(input_shape=[(None, None, config.word_embedding_size), (None, None, config.max_char_length), (None, None, 1)]) - #self.model.summary() self.config = config @@ -1203,7 +1189,10 @@ class BERT_BidLSTM(BaseModel): def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, + transformer_layers = self.init_transformer(config, + load_pretrained_weights, + local_path, + preprocessor, output_hidden_states=True) transformer_layers.bert.trainable = False @@ -1211,83 +1200,96 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + # build character based embedding + char_input = Input(shape=(None, config.max_char_length), + dtype='int32', + name='char_input') + char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size, + output_dim=config.char_embedding_size, + mask_zero=True, + name='char_embeddings' + ))(char_input) + + chars = TimeDistributed(Bidirectional( + LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask) - last_hidden_states = embedding_layer.hidden_states[-4:] - concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) + last_hidden_states = [layer for layer in embedding_layer.hidden_states[-4:]] + last_hidden_states.append(chars) + x = Concatenate()(last_hidden_states) + x = Dropout(config.dropout)(x) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + x = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) - bid_lstm = Dropout(config.dropout)(bid_lstm) + recurrent_dropout=config.recurrent_dropout))(x) + x = Dropout(config.dropout)(x) - label_logits = Dense(ntags, activation='softmax')(bid_lstm) + x = Dense(ntags, activation='softmax')(x) - self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[label_logits]) + self.model = Model( + inputs=[input_ids_in, char_input, token_type_ids, attention_mask], + outputs=[x] + ) self.config = config def get_generator(self): return DataGeneratorTransformers -# class BidLSTM_BERT(BaseModel): -# name = 'BidLSTM_BERT' -# -# def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): -# super().__init__(config, ntags, load_pretrained_weights, local_path, preprocessor=preprocessor) -# -# # build input, directly feed with word embedding by the data generator -# word_input = Input(shape=(None, 
config.word_embedding_size), name='word_input') -# length_input = Input(batch_shape=(None, 1), dtype='int32', name='length_input') -# -# x = Dropout(config.dropout)(word_input) -# -# x = Bidirectional(LSTM(units=config.num_word_lstm_units, -# return_sequences=True, -# recurrent_dropout=config.recurrent_dropout))(x) -# x = Dropout(config.dropout)(x) -# x = Dense(config.num_word_lstm_units, activation='tanh')(x) -# -# self.model = Model(inputs=[word_input, length_input], outputs=[x]) -# self.config = config -# -# def get_generator(self): -# return DataGenerator - - class BERT_BidLSTM_CRF(BaseModel): name = 'BERT_BidLSTM_CRF' def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, local_path: str = None, preprocessor=None): super().__init__(config, ntags, load_pretrained_weights, local_path) - transformer_layers = self.init_transformer(config, load_pretrained_weights, local_path, preprocessor, - output_hidden_states=True) + transformer_layers = self.init_transformer(config, + load_pretrained_weights, + local_path, + preprocessor, + output_hidden_states=True) transformer_layers.bert.trainable = False input_ids_in = Input(shape=(None,), name='input_token', dtype='int32') token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', dtype='int32') + + # build character based embedding + char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input') + char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size, + output_dim=config.char_embedding_size, + mask_zero=True, + name='char_embeddings' + ))(char_input) + + chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask) - last_hidden_states = embedding_layer.hidden_states[-4:] - concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) + last_hidden_states = [layer for layer in embedding_layer.hidden_states[-4:]] + last_hidden_states.append(chars) + x = Concatenate()(last_hidden_states) + x = Dropout(config.dropout)(x) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + x = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) - bid_lstm = Dropout(config.dropout)(bid_lstm) - bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='softmax')(bid_lstm) - bid_lstm = Dropout(config.dropout)(bid_lstm) + recurrent_dropout=config.recurrent_dropout))(x) + x = Dropout(config.dropout)(x) + x = Dense(config.num_word_lstm_units, activation='softmax')(x) + x = Dropout(config.dropout)(x) - base_model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[bid_lstm]) + base_model = Model( + inputs=[input_ids_in, char_input, token_type_ids, attention_mask], + outputs=[x]) self.model = CRFModelWrapperForBERT(base_model, ntags) - self.model.build(input_shape=[(None, None, ), (None, None, ), (None, None, )]) + self.model.build( + input_shape=[(None, None, ), (None, None, config.max_char_length), (None, None, ), (None, None, )]) self.config = config @@ -1310,24 +1312,38 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc token_type_ids = Input(shape=(None,), name='input_token_type', dtype='int32') attention_mask = Input(shape=(None,), name='input_attention_mask', 
dtype='int32') + # build character based embedding + char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input') + char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size, + output_dim=config.char_embedding_size, + mask_zero=True, + name='char_embeddings' + ))(char_input) + + chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) + embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask) - last_hidden_states = embedding_layer.hidden_states[-4:] - concatenated_embeddings = Concatenate()([layer for layer in last_hidden_states]) + last_hidden_states = [layer for layer in embedding_layer.hidden_states[-4:]] + last_hidden_states.append(chars) + x = Concatenate()(last_hidden_states) + x = Dropout(config.dropout)(x) - bid_lstm = Bidirectional(LSTM(units=config.num_word_lstm_units, + x = Bidirectional(LSTM(units=config.num_word_lstm_units, return_sequences=True, - recurrent_dropout=config.recurrent_dropout))(concatenated_embeddings) - bid_lstm = Dropout(config.dropout)(bid_lstm) - bid_lstm = Dense(concatenated_embeddings.shape[-1], activation='softmax')(bid_lstm) - bid_lstm = Dense(ntags)(bid_lstm) + recurrent_dropout=config.recurrent_dropout))(x) + x = Dropout(config.dropout)(x) + x = Dense(config.num_word_lstm_units, activation='softmax')(x) + x = Dense(ntags)(x) self.crf = ChainCRF() - pred = self.crf(bid_lstm) + pred = self.crf(x) - self.model = Model(inputs=[input_ids_in, token_type_ids, attention_mask], outputs=[pred]) + self.model = Model( + inputs=[input_ids_in, char_input, token_type_ids, attention_mask], + outputs=[pred]) self.config = config def get_generator(self): From 5e9596016fad60293f3ff0aa4d2af4442765693d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 16 Aug 2023 11:18:25 +0900 Subject: [PATCH 19/24] LSTM output the same size of a single embedding --- delft/sequenceLabelling/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index a4d5c384..47da4cc0 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1222,7 +1222,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc x = Concatenate()(last_hidden_states) x = Dropout(config.dropout)(x) - x = Bidirectional(LSTM(units=config.num_word_lstm_units, + x = Bidirectional(LSTM(units=768, return_sequences=True, recurrent_dropout=config.recurrent_dropout))(x) x = Dropout(config.dropout)(x) From 87144fcdadd4e577831ceef5e02e03c0fa9ecf9e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 16 Aug 2023 11:29:22 +0900 Subject: [PATCH 20/24] revert change --- delft/sequenceLabelling/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 47da4cc0..74086c1c 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -1222,7 +1222,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc x = Concatenate()(last_hidden_states) x = Dropout(config.dropout)(x) - x = Bidirectional(LSTM(units=768, + x = Bidirectional(LSTM(units=config.word_embedding_size, return_sequences=True, recurrent_dropout=config.recurrent_dropout))(x) x = Dropout(config.dropout)(x) From 9ecc4007dc772d659c5ecc8fa44b3afa6f5484fd Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 18 Aug 2023 
11:19:55 +0900 Subject: [PATCH 21/24] update --- delft/sequenceLabelling/models.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 74086c1c..258bef6d 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -866,7 +866,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc x = Dropout(config.dropout)(x) label_logits = Dense(ntags, activation='softmax')(x) - self.model = Model(inputs=[input_ids_in, features_input, token_type_ids, attention_mask], outputs=[label_logits]) + self.model = Model(inputs=[input_ids_in, features_input, token_type_ids, attention_mask], outputs=[label_logits]) self.config = config def get_generator(self): @@ -1320,14 +1320,15 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc name='char_embeddings' ))(char_input) - chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) + chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)), + name="chars_rnn")(char_embeddings) embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask) last_hidden_states = [layer for layer in embedding_layer.hidden_states[-4:]] - last_hidden_states.append(chars) + last_hidden_states.append(chars) # some issue arise with this line x = Concatenate()(last_hidden_states) x = Dropout(config.dropout)(x) @@ -1335,7 +1336,7 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc return_sequences=True, recurrent_dropout=config.recurrent_dropout))(x) x = Dropout(config.dropout)(x) - x = Dense(config.num_word_lstm_units, activation='softmax')(x) + x = Dense(config.num_word_lstm_units, activation='tanh')(x) x = Dense(ntags)(x) self.crf = ChainCRF() From 3550c5b4914c2ad33179a999b6ca1cc8004c7e75 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 22 Aug 2023 10:55:57 +0900 Subject: [PATCH 22/24] cleanup --- delft/utilities/Embeddings.py | 57 ----------------------------------- 1 file changed, 57 deletions(-) diff --git a/delft/utilities/Embeddings.py b/delft/utilities/Embeddings.py index cff4694d..225bb603 100644 --- a/delft/utilities/Embeddings.py +++ b/delft/utilities/Embeddings.py @@ -764,60 +764,3 @@ def load_resource_registry(path='delft/resources-registry.json'): """ registry_json = open(path).read() return json.loads(registry_json) - -class ContextualizedEmbeddings(Embeddings): - - def __init__(self, transformer_name: str, registry: dict, max_sequence_length: int): - - super().__init__(transformer_name, use_cache=False, use_ELMo=False, resource_registry=registry, load=False) - self.embed_size = 768 - self.transformer = Transformer(transformer_name, registry) - self.model = self.transformer.instantiate_layer(load_pretrained_weights=True, output_hidden_states=True) - self.transformer_config = self.transformer.transformer_config - self.transformer.init_preprocessor(max_sequence_length=max_sequence_length) - self.preprocessor = BERTPreprocessor(self.transformer.tokenizer) - # self.transformer.tokenizer.empty_features_vector()) - # preprocessor.empty_char_vector()) - - - # def get_sentence_vectors(self, token_list): - # token_vecs = hidden_states[-2][0] - # - # # Calculate the average of all 22 token vectors. 
- # sentence_embedding = torch.mean(token_vecs, dim=0) - # - - def get_sentence_vector(self, text_tokens): - (target_ids, target_type_ids, target_attention_mask, target_chars, - target_features, target_labels, input_tokens) = self.preprocessor.tokenize_and_align_features_and_labels(text_tokens) - - self.model.eval() - # segments_ids = [1] * len(target_ids) - outputs = self.model(target_ids, target_type_ids) - hidden_states = outputs[2] - - # tokens, batches, vector size - token_embeddings = tf.stack(hidden_states, axis=0) - - # layers, tokens, batches, vector size - token_embeddings = tf.squeeze(token_embeddings, axis=1) - - # layers, tokens, vector size - token_embeddings = tf.transpose(token_embeddings, perm=[1, 0, 2]) - - # layers, tokens, vector size - token_vecs_cat = [] - - for token in token_embeddings: - cat_vec = tf.concat((token[-1], token[-2], token[-3], token[-4]), dim=0) - token_vecs_cat.append(cat_vec) - - # Sum - # sum_vec = torch.sum(token[-4:], dim=0) - # token_vecs_sum.append(sum_vec) - - print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0]))) - - return token_vecs_cat - - From 43318b5a477454026ab585e4c976be298d63f411 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 22 Aug 2023 10:56:23 +0900 Subject: [PATCH 23/24] remove chain embedding channel temporarly for ChainCRF --- delft/sequenceLabelling/models.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/delft/sequenceLabelling/models.py b/delft/sequenceLabelling/models.py index 258bef6d..555fb931 100644 --- a/delft/sequenceLabelling/models.py +++ b/delft/sequenceLabelling/models.py @@ -185,7 +185,7 @@ def get_model(config: ModelConfig, preprocessor, ntags=None, load_pretrained_wei preprocessor=preprocessor) elif config.architecture == BERT_BidLSTM_CRF.name: preprocessor.return_word_embeddings = False - preprocessor.return_chars = True + # preprocessor.return_chars = True config.use_crf = True config.labels = preprocessor.vocab_tag return BERT_BidLSTM_CRF(config, @@ -1258,21 +1258,21 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc # build character based embedding - char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input') - char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size, - output_dim=config.char_embedding_size, - mask_zero=True, - name='char_embeddings' - ))(char_input) + # char_input = Input(shape=(None, config.max_char_length), dtype='int32', name='char_input') + # char_embeddings = TimeDistributed(Embedding(input_dim=config.char_vocab_size, + # output_dim=config.char_embedding_size, + # mask_zero=True, + # name='char_embeddings' + # ))(char_input) - chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) + # chars = TimeDistributed(Bidirectional(LSTM(config.num_char_lstm_units, return_sequences=False)))(char_embeddings) embedding_layer = transformer_layers(input_ids_in, token_type_ids=token_type_ids, attention_mask=attention_mask) last_hidden_states = [layer for layer in embedding_layer.hidden_states[-4:]] - last_hidden_states.append(chars) + # last_hidden_states.append(chars) x = Concatenate()(last_hidden_states) x = Dropout(config.dropout)(x) @@ -1284,12 +1284,14 @@ def __init__(self, config, ntags=None, load_pretrained_weights: bool = True, loc x = Dropout(config.dropout)(x) base_model = Model( - inputs=[input_ids_in, char_input, token_type_ids, attention_mask], + inputs=[input_ids_in, 
token_type_ids, attention_mask], + # inputs=[input_ids_in, char_input, token_type_ids, attention_mask], outputs=[x]) self.model = CRFModelWrapperForBERT(base_model, ntags) self.model.build( - input_shape=[(None, None, ), (None, None, config.max_char_length), (None, None, ), (None, None, )]) + input_shape=[(None, None, ), (None, None, ), (None, None, )]) + # input_shape=[(None, None, ), (None, None, config.max_char_length), (None, None, ), (None, None, )]) self.config = config From 4a546b4274e5d0b1d51f7f0f4a48e46061fd677c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 22 Aug 2023 10:56:31 +0900 Subject: [PATCH 24/24] typo --- delft/sequenceLabelling/preprocess.py | 2 +- delft/sequenceLabelling/wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 8c3fea32..9ab6f40e 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -357,7 +357,7 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ and offset[0] == 0 and len(self.tokenizer.convert_ids_to_tokens(input_ids[i])) == 1 and not empty_token): - # another trick to support sentence piece tokenizer: sometimes a out of vocabulary + # another trick to support sentence piece tokenizer: sometimes an out of vocabulary # character is tokenized as several known bytes, leading to 2 tokens for instance # with the second one staring from offset 0 too. In order to align correctly the # original string, we need to skip this extra spurious token by looking at it decoded diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 153bd5e5..cd95ef9c 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -163,7 +163,7 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va features_all = concatenate_or_none((f_train, f_valid), axis=0) if incremental: - if self.model == None and self.models == None: + if self.model is None and self.models is None: print("error: you must load a model first for an incremental training") return print("Incremental training from loaded model", self.model_config.model_name)
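
For reference, the sketch below condenses the architecture that this patch series converges on: a frozen BERT encoder whose last four hidden states are concatenated, optionally together with a character-level BiLSTM embedding, then fed to a word-level BiLSTM and a softmax tagging head. It is a minimal stand-alone approximation and not code from the patches: the function name build_bert_bidlstm, the default hyper-parameter values, and the assumption that character indices arrive already aligned with the wordpiece positions are illustrative only; the actual models additionally rely on DeLFT's ModelConfig, CRFModelWrapperForBERT and DataGeneratorTransformers.

    # Minimal sketch of the BERT_BidLSTM family built in this patch series.
    # Names and default values below are illustrative, not taken from DeLFT.
    import tensorflow as tf
    from tensorflow.keras.layers import (Input, Embedding, TimeDistributed, Bidirectional,
                                         LSTM, Dense, Dropout, Concatenate)
    from tensorflow.keras.models import Model
    from transformers import TFAutoModel


    def build_bert_bidlstm(ntags: int,
                           transformer_name: str = "bert-base-cased",
                           max_char_length: int = 30,
                           char_vocab_size: int = 200,
                           char_embedding_size: int = 25,
                           num_char_lstm_units: int = 25,
                           num_word_lstm_units: int = 100,
                           dropout: float = 0.5,
                           use_chars: bool = True) -> Model:
        # Frozen transformer exposing all hidden states, as in the patches.
        transformer = TFAutoModel.from_pretrained(transformer_name, output_hidden_states=True)
        transformer.trainable = False

        input_ids = Input(shape=(None,), dtype="int32", name="input_token")
        token_type_ids = Input(shape=(None,), dtype="int32", name="input_token_type")
        attention_mask = Input(shape=(None,), dtype="int32", name="input_attention_mask")

        outputs = transformer(input_ids, token_type_ids=token_type_ids,
                              attention_mask=attention_mask)
        # Last four transformer layers, each of shape (batch, seq_len, hidden_size).
        features = list(outputs.hidden_states[-4:])
        inputs = [input_ids, token_type_ids, attention_mask]

        if use_chars:
            # Character channel: assumes the data generator supplies character
            # indices aligned position-by-position with the wordpiece sequence.
            char_input = Input(shape=(None, max_char_length), dtype="int32", name="char_input")
            char_embeddings = TimeDistributed(
                Embedding(input_dim=char_vocab_size,
                          output_dim=char_embedding_size,
                          mask_zero=True))(char_input)
            chars = TimeDistributed(
                Bidirectional(LSTM(num_char_lstm_units, return_sequences=False)))(char_embeddings)
            features.append(chars)
            inputs.insert(1, char_input)

        x = Concatenate()(features)
        x = Dropout(dropout)(x)
        x = Bidirectional(LSTM(num_word_lstm_units, return_sequences=True))(x)
        x = Dropout(dropout)(x)
        label_probs = Dense(ntags, activation="softmax")(x)

        return Model(inputs=inputs, outputs=[label_probs])


    if __name__ == "__main__":
        model = build_bert_bidlstm(ntags=10)
        model.summary()

Swapping the softmax head for a CRF or ChainCRF output, as BERT_BidLSTM_CRF and BERT_BidLSTM_ChainCRF do, only changes the top layer; the shared trunk above is otherwise the same.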