main.py

import os
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from datetime import date
import pickle
import re
import pandas as pd

from firebase import add_model, update_model,upload_model

CONTRACTION_MAP = {"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd": 'she would', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have',
    "she's": 'she is', "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so as', "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is', "there'd": 'there would', "there'd've": 'there would have', "there's": 'there is', "they'd": 'they would', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have', "to've": 'to have', "wasn't": 'was not', "we'd": 'we would', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not', "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is', "what've": 'what have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where's": 'where is', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is', "who've": 'who have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have', "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd": 'you would', "you'd've": 'you would have', "you'll": 'you will', "you'll've": 'you will have', "you're": 'you are', "you've": 'you have'}


def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    text = text.split()
    for i in range(len(text)):
        word = text[i]
        if word in contraction_mapping:
            text[i] = contraction_mapping[word]
    text = " ".join(text)
    text = text.replace("'s",'')
    return text

def preprocess(data):
    new_list = []
    for text in data:
        text = text.lower()
        clean_text = re.sub(r'[^a-zA-Z0-9. ]','',expand_contractions(text))
        new_list.append(clean_text)
    final_text = pad_sequences(tokenizer.texts_to_sequences(new_list),maxlen = 30, padding='pre')
    return final_text

def retrain_model(data,model_version):
    update_model(model_version,{'status':'Preprocessing Data'})
    nondup_data = pd.json_normalize(data)
    nondup_data.columns = ['Text', 'oh_label']

    new_list_1 = []
    for text in nondup_data['Text']:
        new_list_1.append(re.sub(r'[^a-zA-Z0-9. ]', '', expand_contractions(text)))

    nondup_data['Updated_Text'] = new_list_1
    nondup_data['Updated_Text'] = nondup_data['Updated_Text'].str.lower()

    nondup_data.reset_index(inplace=True, drop=True)
    master_data = pd.DataFrame([nondup_data.Updated_Text, nondup_data.oh_label]).transpose()
    master_data = master_data.rename(columns={"Updated_Text": "text", "oh_label": "label"})

    train_data = master_data
    test_data = pd.read_csv('test_data.csv')
    train_data['text'] = train_data['text'].astype(str)
    test_data['text'] = test_data['text'].astype(str)

    MAXLENGTH = 30

    x_train = pad_sequences(tokenizer.texts_to_sequences(train_data.text), maxlen=MAXLENGTH, padding='pre')
    x_test = pad_sequences(tokenizer.texts_to_sequences(test_data.text), maxlen=MAXLENGTH, padding='pre')

    y_train = train_data.label

    encoder = LabelEncoder()
    encoder.fit(train_data.label.to_list())

    y_train = encoder.transform(train_data.label.to_list())
    y_test = encoder.transform(test_data.label.to_list())

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    update_model(model_version,{'status':'Training Started'})
    lstm_history = model.fit(x_train, y_train, batch_size=512, epochs=2,validation_data=(x_test, y_test))

    update_model(model_version,{'status':'Saving Model'})
    save_path = f"retrain_model/model_{model_version}.h5"
    model.save(save_path)
    
    update_model(model_version,{'status':'Uploading Model'})
    dest_path = upload_model(save_path)
    # delete the local file
    os.remove(save_path)
    os.rmdir('retrain_model')
    
    print(lstm_history.history)
    update_model(model_version,{'location':dest_path,'status':'Trained', "history": lstm_history.history})

    print(f"Model {model_version} trained and saved")


with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = keras.models.load_model("model_bidir_lstm.h5")

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def get():
    return { 'message': 'Welcome to yBully api !' }

@app.post("/predict")
async def get_prediction(request: Request):
    body = await request.json()
    preprocessed_data = preprocess(body['data'])
    
    predictions = model.predict(preprocessed_data)
    predictions = predictions.tolist()

    ans = []
    for i, pred in enumerate(predictions):
        ans.append({"text": body["data"][i], "confidence": pred[0]})
    
    return {'predictions': ans}

@app.post("/retrain")
async def retrain_req(request: Request, background_tasks:BackgroundTasks):
    body = await request.json()
    data = body['data']
    
    model_version = add_model()
    background_tasks.add_task(retrain_model, data, model_version)
    return {"message": "Model retraining started", "model_version":model_version}