Merge pull request #36 from aodn/feat/tailor-dependency
use tfbert model and update environment
vietnguyengit authored Dec 18, 2024
2 parents 1541cef + fac6fc0 commit c53ebdf
Showing 9 changed files with 281 additions and 279 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build_deploy_edge.yml
@@ -41,6 +41,7 @@ jobs:
with:
context: .
# Only building for AMD64 for now

# platforms: linux/amd64,linux/arm64
push: true
tags: |
11 changes: 6 additions & 5 deletions data_discovery_ai/pipeline/pipeline.py
@@ -167,17 +167,18 @@ def prepare_sample_set(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        vocabs = self.params["preprocessor"]["vocabs"].split(", ")
        labelled_ds = preprocessor.identify_km_sample(raw_data, vocabs)
        preprocessed_samples = preprocessor.sample_preprocessor(labelled_ds, vocabs)
-       sample_set = preprocessor.calculate_embedding(preprocessed_samples)

        # drop empty keywords rows
-       filtered_sample_set = sample_set[
-           sample_set["keywords"].apply(lambda x: x != [])
+       filtered_sample_set = preprocessed_samples[
+           preprocessed_samples["keywords"].apply(lambda x: x != [])
        ]

+       sample_set = preprocessor.calculate_embedding(filtered_sample_set)

        full_path = os.path.join(self.temp_dir, KEYWORD_SAMPLE_FILE)

-       preprocessor.save_to_file(filtered_sample_set, full_path)
-       return filtered_sample_set
+       preprocessor.save_to_file(sample_set, full_path)
+       return sample_set

    def prepare_train_test_sets(self, sample_set: pd.DataFrame) -> TrainTestData:
        """
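Note on the prepare_sample_set change: rows with empty keywords are now dropped before calculate_embedding runs, so embeddings are only computed for rows that survive the filter. A toy sketch of the same filter-then-embed ordering (fake_embedding below is a hypothetical stand-in for preprocessor.calculate_embedding, not code from the repository):

import pandas as pd

def fake_embedding(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical stand-in for preprocessor.calculate_embedding:
    # attach a dummy feature vector to every remaining row.
    df = df.copy()
    df["embedding"] = [[0.0, 0.0, 0.0, 0.0] for _ in range(len(df))]
    return df

samples = pd.DataFrame(
    {
        "information": ["title a [SEP] description a", "title b [SEP] description b"],
        "keywords": [["temperature"], []],  # second row has no keywords
    }
)

# Filter first, then embed: only the rows that are kept reach the embedding step.
filtered = samples[samples["keywords"].apply(lambda x: x != [])]
sample_set = fake_embedding(filtered)
print(len(samples), "->", len(sample_set))  # prints: 2 -> 1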
Binary file modified data_discovery_ai/resources/KeywordClassifier/development.keras
Binary file modified data_discovery_ai/resources/KeywordClassifier/keyword_label.pkl
Binary file not shown.
41 changes: 20 additions & 21 deletions data_discovery_ai/utils/preprocessor.py
@@ -11,10 +11,9 @@
import configparser
from typing import Any, List, Tuple, Union, Dict, Optional

- import torch
from sklearn.preprocessing import MultiLabelBinarizer
- from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
+ from transformers import AutoTokenizer, TFBertModel
+ import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
@@ -30,6 +29,11 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

+ # hide warning information from transformers
+ from transformers import logging as tf_logging
+
+ tf_logging.set_verbosity_error()


class Concept:
def __init__(self, value: str, url: str, vocab_type: str) -> None:
@@ -115,7 +119,6 @@ def identify_ddm_sample(raw_data: pd.DataFrame) -> pd.DataFrame:
        + preprocessed_data["description"]
        + " [SEP] "
        + preprocessed_data["lineage"]
-       + " [SEP]"
    )

    # only focus on onGoing records
@@ -162,9 +165,7 @@ def sample_preprocessor(sampleSet: pd.DataFrame, vocabs: List[str]) -> pd.DataFr
sampleSet["keywords"] = sampleSet["keywords"].apply(
lambda x: keywords_formatter(x, vocabs)
)
sampleSet["information"] = (
sampleSet["title"] + " [SEP] " + sampleSet["description"] + " [SEP]"
)
sampleSet["information"] = sampleSet["title"] + " [SEP] " + sampleSet["description"]
return sampleSet
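
Both [SEP] changes above drop a trailing literal " [SEP]" from the assembled text. A plausible reason, shown in a small sketch (the example strings are illustrative only): the BERT tokenizer appends its own [SEP] token to every encoded sequence, so a separator written at the end of the raw string gets duplicated.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

with_trailing_sep = tokenizer("a title [SEP] a description [SEP]")["input_ids"]
without_trailing_sep = tokenizer("a title [SEP] a description")["input_ids"]

print(tokenizer.convert_ids_to_tokens(with_trailing_sep))
# expected: ['[CLS]', 'a', 'title', '[SEP]', 'a', 'description', '[SEP]', '[SEP]']
print(tokenizer.convert_ids_to_tokens(without_trailing_sep))
# expected: ['[CLS]', 'a', 'title', '[SEP]', 'a', 'description', '[SEP]']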


@@ -216,20 +217,18 @@ def get_description_embedding(text: str) -> np.ndarray:
    Output:
        text_embedding: np.ndarray. A numpy array representing the text embedding as a feature vector.
    """
-   tokenizer = BertTokenizer.from_pretrained(
-       "bert-base-uncased", clean_up_tokenization_spaces=False
-   )
-   model = BertModel.from_pretrained("bert-base-uncased")
-
-   inputs = tokenizer(
-       text, return_tensors="pt", max_length=512, truncation=True, padding="max_length"
-   )
-
-   with torch.no_grad():
-       outputs = model(**inputs)
-       cls_embedding = outputs.last_hidden_state[:, 0, :]
-       text_embedding = cls_embedding.squeeze().numpy()
-   return text_embedding
+   # https://huggingface.co/docs/transformers/v4.47.1/en/model_doc/bert#transformers.TFBertModel
+   tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+   # use in Tensorflow https://huggingface.co/google-bert/bert-base-uncased
+   model = TFBertModel.from_pretrained("bert-base-uncased")
+
+   # https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.return_tensors, set as 'tf' to return tensorflow tensor
+   inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)
+   outputs = model(inputs)
+   text_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+   # output as a 1D array, shape (768,)
+   return text_embedding.squeeze()


def calculate_embedding(ds: pd.DataFrame) -> pd.DataFrame:
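The embedding step now runs through TensorFlow instead of PyTorch: AutoTokenizer returns TF tensors (return_tensors="tf"), TFBertModel replaces BertModel, and the [CLS] vector at position 0 of last_hidden_state is squeezed into a 1D array of length 768; no torch.no_grad() context is needed because a plain Keras forward pass does not track gradients. A minimal usage sketch of the same pattern (the embed_text function name and the example sentence are illustrative, not from the repository):

import numpy as np
from transformers import AutoTokenizer, TFBertModel

def embed_text(text: str) -> np.ndarray:
    # Return the [CLS] embedding of the input text as a 1D vector of length 768.
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = TFBertModel.from_pretrained("bert-base-uncased")
    inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)
    outputs = model(inputs)
    # last_hidden_state has shape (batch_size, sequence_length, hidden_size); index 0 is [CLS]
    return outputs.last_hidden_state[:, 0, :].numpy().squeeze()

vector = embed_text("Sea surface temperature observations from moored buoys")
print(vector.shape)  # (768,)

As in the changed function, the tokenizer and model are loaded inside the call here for simplicity; in practice they would typically be created once and reused, since from_pretrained reloads the weights on every call.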
2 changes: 1 addition & 1 deletion environment.yml
@@ -3,7 +3,7 @@ channels:
  - conda-forge
  - defaults
dependencies:
-  - python>=3.10
+  - python=3.10
  - pip>=24.0
  - poetry
  - requests
