Merge pull request #36 from aodn/feat/tailor-dependency
use tfbert model and update environment
vietnguyengit authored Dec 18, 2024
2 parents 1541cef + fac6fc0 commit c53ebdf
Showing 9 changed files with 281 additions and 279 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build_deploy_edge.yml
@@ -41,6 +41,7 @@ jobs:
with:
context: .
# Only building for AMD64 for now

# platforms: linux/amd64,linux/arm64
push: true
tags: |
11 changes: 6 additions & 5 deletions data_discovery_ai/pipeline/pipeline.py
@@ -167,17 +167,18 @@ def prepare_sample_set(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        vocabs = self.params["preprocessor"]["vocabs"].split(", ")
        labelled_ds = preprocessor.identify_km_sample(raw_data, vocabs)
        preprocessed_samples = preprocessor.sample_preprocessor(labelled_ds, vocabs)
-       sample_set = preprocessor.calculate_embedding(preprocessed_samples)

        # drop empty keywords rows
-       filtered_sample_set = sample_set[
-           sample_set["keywords"].apply(lambda x: x != [])
+       filtered_sample_set = preprocessed_samples[
+           preprocessed_samples["keywords"].apply(lambda x: x != [])
        ]

+       sample_set = preprocessor.calculate_embedding(filtered_sample_set)

        full_path = os.path.join(self.temp_dir, KEYWORD_SAMPLE_FILE)

-       preprocessor.save_to_file(filtered_sample_set, full_path)
-       return filtered_sample_set
+       preprocessor.save_to_file(sample_set, full_path)
+       return sample_set

    def prepare_train_test_sets(self, sample_set: pd.DataFrame) -> TrainTestData:
        """
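Note on the prepare_sample_set change: rows with empty keywords are now dropped before calculate_embedding runs, so embeddings are only computed for rows that survive the filter. A toy sketch of the same filter-then-embed ordering (fake_embedding below is a hypothetical stand-in for preprocessor.calculate_embedding, not code from the repository):

import pandas as pd

def fake_embedding(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical stand-in for preprocessor.calculate_embedding:
    # attach a dummy feature vector to every remaining row.
    df = df.copy()
    df["embedding"] = [[0.0, 0.0, 0.0, 0.0] for _ in range(len(df))]
    return df

samples = pd.DataFrame(
    {
        "information": ["title a [SEP] description a", "title b [SEP] description b"],
        "keywords": [["temperature"], []],  # second row has no keywords
    }
)

# Filter first, then embed: only the rows that are kept reach the embedding step.
filtered = samples[samples["keywords"].apply(lambda x: x != [])]
sample_set = fake_embedding(filtered)
print(len(samples), "->", len(sample_set))  # prints: 2 -> 1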
Binary file modified data_discovery_ai/resources/KeywordClassifier/development.keras
Binary file modified data_discovery_ai/resources/KeywordClassifier/keyword_label.pkl
Binary file not shown.
41 changes: 20 additions & 21 deletions data_discovery_ai/utils/preprocessor.py
@@ -11,10 +11,9 @@
import configparser
from typing import Any, List, Tuple, Union, Dict, Optional

- import torch
from sklearn.preprocessing import MultiLabelBinarizer
- from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
+ from transformers import AutoTokenizer, TFBertModel
+ import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
@@ -30,6 +29,11 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

+ # hide warning information from transformers
+ from transformers import logging as tf_logging
+
+ tf_logging.set_verbosity_error()


class Concept:
def __init__(self, value: str, url: str, vocab_type: str) -> None:
@@ -115,7 +119,6 @@ def identify_ddm_sample(raw_data: pd.DataFrame) -> pd.DataFrame:
        + preprocessed_data["description"]
        + " [SEP] "
        + preprocessed_data["lineage"]
-       + " [SEP]"
    )

    # only focus on onGoing records
@@ -162,9 +165,7 @@ def sample_preprocessor(sampleSet: pd.DataFrame, vocabs: List[str]) -> pd.DataFr
sampleSet["keywords"] = sampleSet["keywords"].apply(
lambda x: keywords_formatter(x, vocabs)
)
sampleSet["information"] = (
sampleSet["title"] + " [SEP] " + sampleSet["description"] + " [SEP]"
)
sampleSet["information"] = sampleSet["title"] + " [SEP] " + sampleSet["description"]
return sampleSet
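
Both [SEP] changes above drop a trailing literal " [SEP]" from the assembled text. A plausible reason, shown in a small sketch (the example strings are illustrative only): the BERT tokenizer appends its own [SEP] token to every encoded sequence, so a separator written at the end of the raw string gets duplicated.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

with_trailing_sep = tokenizer("a title [SEP] a description [SEP]")["input_ids"]
without_trailing_sep = tokenizer("a title [SEP] a description")["input_ids"]

print(tokenizer.convert_ids_to_tokens(with_trailing_sep))
# expected: ['[CLS]', 'a', 'title', '[SEP]', 'a', 'description', '[SEP]', '[SEP]']
print(tokenizer.convert_ids_to_tokens(without_trailing_sep))
# expected: ['[CLS]', 'a', 'title', '[SEP]', 'a', 'description', '[SEP]']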


@@ -216,20 +217,18 @@ def get_description_embedding(text: str) -> np.ndarray:
    Output:
        text_embedding: np.ndarray. A numpy array representing the text embedding as a feature vector.
    """
-   tokenizer = BertTokenizer.from_pretrained(
-       "bert-base-uncased", clean_up_tokenization_spaces=False
-   )
-   model = BertModel.from_pretrained("bert-base-uncased")
-
-   inputs = tokenizer(
-       text, return_tensors="pt", max_length=512, truncation=True, padding="max_length"
-   )
-
-   with torch.no_grad():
-       outputs = model(**inputs)
-       cls_embedding = outputs.last_hidden_state[:, 0, :]
-       text_embedding = cls_embedding.squeeze().numpy()
-   return text_embedding
+   # https://huggingface.co/docs/transformers/v4.47.1/en/model_doc/bert#transformers.TFBertModel
+   tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+   # use in Tensorflow https://huggingface.co/google-bert/bert-base-uncased
+   model = TFBertModel.from_pretrained("bert-base-uncased")
+
+   # https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.return_tensors, set as 'tf' to return tensorflow tensor
+   inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)
+   outputs = model(inputs)
+   text_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+   # output as a 1D array, shape (768,)
+   return text_embedding.squeeze()


def calculate_embedding(ds: pd.DataFrame) -> pd.DataFrame:
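The embedding step now runs through TensorFlow instead of PyTorch: AutoTokenizer returns TF tensors (return_tensors="tf"), TFBertModel replaces BertModel, and the [CLS] vector at position 0 of last_hidden_state is squeezed into a 1D array of length 768; no torch.no_grad() context is needed because a plain Keras forward pass does not track gradients. A minimal usage sketch of the same pattern (the embed_text function name and the example sentence are illustrative, not from the repository):

import numpy as np
from transformers import AutoTokenizer, TFBertModel

def embed_text(text: str) -> np.ndarray:
    # Return the [CLS] embedding of the input text as a 1D vector of length 768.
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = TFBertModel.from_pretrained("bert-base-uncased")
    inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)
    outputs = model(inputs)
    # last_hidden_state has shape (batch_size, sequence_length, hidden_size); index 0 is [CLS]
    return outputs.last_hidden_state[:, 0, :].numpy().squeeze()

vector = embed_text("Sea surface temperature observations from moored buoys")
print(vector.shape)  # (768,)

As in the changed function, the tokenizer and model are loaded inside the call here for simplicity; in practice they would typically be created once and reused, since from_pretrained reloads the weights on every call.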
2 changes: 1 addition & 1 deletion environment.yml
@@ -3,7 +3,7 @@ channels:
  - conda-forge
  - defaults
dependencies:
-  - python>=3.10
+  - python=3.10
  - pip>=24.0
  - poetry
  - requests
