From d0846018e821587dfbb2fcc5163780decf2d18c9 Mon Sep 17 00:00:00 2001
From: Jaycent Gunawan Ongris
Date: Tue, 11 Jun 2024 20:39:18 +0700
Subject: [PATCH 1/2] fix: remove translator, convert to batch processing

---
 app.py   | 23 +++++++++++------------
 utils.py | 17 ++++++++---------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/app.py b/app.py
index a2f77ff..066d474 100644
--- a/app.py
+++ b/app.py
@@ -1,5 +1,5 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from utils import get_model, get_tokenizer, predict_sentiment
+from utils import get_model, get_tokenizer, predict_results
 from flask import Flask, request, jsonify, abort
 from dotenv import load_dotenv
 from functools import wraps
@@ -44,21 +44,20 @@ def predict():
     # statements: list of reviews
     report = {'Positive': 0, 'Negative': 0}
 
-    def process_statement(statement):
+    def process_statements(statements):
         try:
-            sentiment = predict_sentiment(statement, tokenizer, model, MAX_LENGTH)
-            return sentiment
+            logits = predict_results(statements, tokenizer, model, MAX_LENGTH)
+            return logits
         except Exception as e:
-            print(f"Error occurred: {e}, statement: {statement}")
+            print(f"Error occurred: {e}")
             return None
 
-    with ThreadPoolExecutor() as executor:
-        futures = {executor.submit(process_statement, statement): statement for statement in statements}
-
-        for future in as_completed(futures):
-            result = future.result()
-            if result:
-                report[result] += 1
+    predictions = process_statements(statements)
+    for pred in predictions:
+        if pred[0] > pred[1]:
+            report['Negative'] += 1
+        else:
+            report['Positive'] += 1
 
     return jsonify(report)
 
diff --git a/utils.py b/utils.py
index 599d47a..b814994 100644
--- a/utils.py
+++ b/utils.py
@@ -7,23 +7,22 @@ def get_tokenizer(model_name):
 def get_model(pretrained_path):
     return TFBertForSequenceClassification.from_pretrained(pretrained_path)
 
+# dump
 def translate_to_indo(text):
     translator = GoogleTranslator(source='en', target='id')
     translated_text = translator.translate(text)
     return translated_text
 
-def predict_sentiment(text, tokenizer, model, max_length):
-    translated_text = translate_to_indo(text)
-    tokenized_text = tokenizer(
-        text=translated_text,
+def predict_results(texts, tokenizer, model, max_length):
+    tokenized_texts = tokenizer(
+        text=texts,
         add_special_tokens=True,
         max_length=max_length,
         truncation=True,
         padding='max_length',
         return_tensors='tf'
     )
-    input_ids = tokenized_text['input_ids']
-    attention_mask = tokenized_text['attention_mask']
-    prediction = model.predict([input_ids, attention_mask])
-    sentiment = "Positive" if prediction[0][0][1] >= 1 else "Negative"
-    return sentiment
\ No newline at end of file
+    input_ids = tokenized_texts['input_ids']
+    attention_masks = tokenized_texts['attention_mask']
+    predictions = model.predict([input_ids, attention_masks], use_multiprocessing=True, workers=2)
+    return predictions.logits
\ No newline at end of file

From ae93e1a95bcc01ec7832f6cf7f2258d3963d0f8f Mon Sep 17 00:00:00 2001
From: iyoubee
Date: Wed, 12 Jun 2024 12:43:27 +0700
Subject: [PATCH 2/2] feat: make the batch processing more reliable

---
 app.py | 60 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/app.py b/app.py
index 066d474..c2ea987 100644
--- a/app.py
+++ b/app.py
@@ -20,6 +20,33 @@
 # Get API key from environment variable
 API_KEY = os.getenv('API_KEY')
 
+def process_statements(statements):
+    try:
+        logits = predict_results(statements, tokenizer, model, MAX_LENGTH)
+        return logits
+    except Exception as e:
+        print(f"Error occurred: {e}")
+        return None
+
+def process_statement(statement):
+    try:
+        # Tokenize the statement
+        _ = tokenizer(
+            text=statement,
+            add_special_tokens=True,
+            max_length=MAX_LENGTH,
+            truncation=True,
+            padding='max_length',
+            return_tensors='tf'
+        )
+        return statement, True  # Return the statement and True if it's valid
+    except ValueError as e:
+        print(f"Skipping invalid statement: {statement}. Error: {e}")
+        return statement, False  # Return the statement and False if it's invalid
+    except Exception as e:
+        print(f"Error occurred during tokenization: {e}")
+        return statement, False  # Return the statement and False if an error occurs
+
 def require_api_key(f):
     @wraps(f)
     def decorated_function(*args, **kwargs):
@@ -44,20 +71,27 @@ def predict():
     # statements: list of reviews
     report = {'Positive': 0, 'Negative': 0}
 
-    def process_statements(statements):
-        try:
-            logits = predict_results(statements, tokenizer, model, MAX_LENGTH)
-            return logits
-        except Exception as e:
-            print(f"Error occurred: {e}")
-            return None
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(process_statement, statement) for statement in statements]
+
+        valid_statements = []
+        for future in as_completed(futures):
+            statement, is_valid = future.result()
+            if is_valid:
+                valid_statements.append(statement)
 
-    predictions = process_statements(statements)
-    for pred in predictions:
-        if pred[0] > pred[1]:
-            report['Negative'] += 1
-        else:
-            report['Positive'] += 1
+    if not valid_statements:
+        return jsonify({"error": "No valid statements provided"}), 400
+
+    predictions = process_statements(valid_statements)
+    if predictions is not None:
+        for pred in predictions:
+            if pred[0] > pred[1]:
+                report['Negative'] += 1
+            else:
+                report['Positive'] += 1
+    else:
+        return jsonify({"error": "Error processing statements"}), 500
 
     return jsonify(report)
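
Reviewer note: below is a minimal, hypothetical smoke test for the endpoint this series hardens. The route path, HTTP method, request key, API-key header name, and host/port are assumptions inferred from the visible hunks (`predict()`, the `statements` list, `require_api_key`), not confirmed by the diff.

```python
# Hypothetical smoke test for the patched endpoint. A sketch, not part of
# the series. Assumed (not shown in the diff): POST /predict, a JSON body
# with a "statements" key, an "x-api-key" header, and localhost:5000.
import requests

resp = requests.post(
    "http://localhost:5000/predict",
    headers={"x-api-key": "YOUR_API_KEY"},
    json={"statements": ["great product", "terrible service"]},
)

# Expected responses per the diff:
#   200 -> {"Positive": <int>, "Negative": <int>}
#   400 -> {"error": "No valid statements provided"}
#   500 -> {"error": "Error processing statements"}
print(resp.status_code, resp.json())
```

One thing worth noting about the counting loop: `pred[0] > pred[1]` compares raw logits under an assumed label order of index 0 = Negative, index 1 = Positive. Since softmax is monotonic, comparing logits directly yields the same argmax as comparing probabilities, so skipping normalization is fine; but if the checkpoint was trained with the opposite label order, the Positive/Negative counts will be swapped.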