From 42d789bdbfda992b33c56df7e536aa0c400f0e1c Mon Sep 17 00:00:00 2001 From: Olivier Filangi Date: Wed, 23 Oct 2024 16:49:45 +0200 Subject: [PATCH] fix name score according model. colorize message --- config/simple.json | 2 +- llm_semantic_annotator/__init__.py | 24 +++- llm_semantic_annotator/__main__.py | 120 +++++++++--------- .../abstract/abstract_manager.py | 11 +- llm_semantic_annotator/misc/utils.py | 38 +++--- requirements.txt | 1 + 6 files changed, 106 insertions(+), 90 deletions(-) diff --git a/config/simple.json b/config/simple.json index 0227932..f085c8e 100644 --- a/config/simple.json +++ b/config/simple.json @@ -1,6 +1,6 @@ { "encodeur" : "sentence-transformers/all-MiniLM-L6-v2", - "threshold_similarity_tag_chunk" : 0.65, + "threshold_similarity_tag_chunk" : 0.62, "threshold_similarity_tag" : 0.80, "batch_size" : 32, diff --git a/llm_semantic_annotator/__init__.py b/llm_semantic_annotator/__init__.py index 2f9193f..d7f17ca 100644 --- a/llm_semantic_annotator/__init__.py +++ b/llm_semantic_annotator/__init__.py @@ -24,4 +24,26 @@ from .core import main_build_dataset_abstracts_annotation from .core import get_scores_files -from .similarity_evaluator import similarity_evaluator_main \ No newline at end of file +from .similarity_evaluator import similarity_evaluator_main + + +from colorama import init, Fore, Back, Style +# Initialiser colorama +init(autoreset=True) + +def custom_exception_handler(exc_type, exc_value, exc_traceback): + # Formater le message d'exception + error_msg = f"{exc_type.__name__}: {exc_value}" + + # Afficher le message en rouge + print(f"{Fore.RED}{Back.WHITE}{Style.BRIGHT}{error_msg}{Style.RESET_ALL}") + + # Afficher la traceback en jaune + import traceback + for line in traceback.format_tb(exc_traceback): + print(f"{Fore.YELLOW}{line}{Style.RESET_ALL}") + +# Remplacer le gestionnaire d'exceptions par défaut +import sys +sys.excepthook = custom_exception_handler + diff --git a/llm_semantic_annotator/__main__.py 
b/llm_semantic_annotator/__main__.py index db40b14..70cac5e 100644 --- a/llm_semantic_annotator/__main__.py +++ b/llm_semantic_annotator/__main__.py @@ -1,95 +1,89 @@ -import json, sys - -from llm_semantic_annotator import get_retention_dir -from llm_semantic_annotator import main_populate_owl_tag_embeddings -from llm_semantic_annotator import main_populate_abstract_embeddings -from llm_semantic_annotator import main_populate_gbif_taxon_tag_embeddings -from llm_semantic_annotator import main_populate_ncbi_taxon_tag_embeddings -from llm_semantic_annotator import main_compute_tag_chunk_similarities -from llm_semantic_annotator import similarity_evaluator_main -from llm_semantic_annotator import main_display_summary -from llm_semantic_annotator import main_build_graph -from llm_semantic_annotator import main_build_dataset_abstracts_annotation - +import json +import sys +import os from rich import print import argparse +from llm_semantic_annotator import ( + get_retention_dir, + main_populate_owl_tag_embeddings, + main_populate_abstract_embeddings, + main_populate_gbif_taxon_tag_embeddings, + main_populate_ncbi_taxon_tag_embeddings, + main_compute_tag_chunk_similarities, + similarity_evaluator_main, + main_display_summary, + main_build_graph, + main_build_dataset_abstracts_annotation +) + def load_config(config_file): - """Charge la configuration à partir d'un fichier JSON.""" + """Load configuration from a JSON file.""" try: with open(config_file, 'r') as f: return json.load(f) except FileNotFoundError: - print(f"Le fichier de configuration {config_file} est introuvable.") + print(f"[bold red]Error:[/bold red] Configuration file {config_file} not found.") sys.exit(1) except json.JSONDecodeError: - print(f"Erreur de décodage JSON dans le fichier {config_file}.") + print(f"[bold red]Error:[/bold red] JSON decoding error in file {config_file}.") sys.exit(1) def parse_arguments(): - """Analyse les arguments de la ligne de commande.""" - parser = 
argparse.ArgumentParser(description="Programme avec plusieurs types d'exécution.") + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Program with multiple execution types.") parser.add_argument( "config_file", - help="Chemin vers le fichier de configuration JSON." + help="Path to the JSON configuration file." ) parser.add_argument( "execution_type", - choices=["populate_owl_tag_embeddings", - "populate_gbif_taxon_tag_embeddings", - "populate_ncbi_taxon_tag_embeddings", - "populate_abstract_embeddings", - "compute_tag_chunk_similarities", - "display_summary", - "build_rdf_graph", - "build_dataset_abstracts_annotations", - "evaluate_encoder"], - help="Type d'exécution à effectuer." + choices=[ + "populate_owl_tag_embeddings", + "populate_gbif_taxon_tag_embeddings", + "populate_ncbi_taxon_tag_embeddings", + "populate_abstract_embeddings", + "compute_tag_chunk_similarities", + "display_summary", + "build_rdf_graph", + "build_dataset_abstracts_annotations", + "evaluate_encoder" + ], + help="Type of execution to perform." 
) - parser.add_argument('--force', action='store_true', - help="Forcer l'exécution sans demander de confirmation") - + help="Force execution without asking for confirmation") return parser.parse_args() def main(): - import os args = parse_arguments() config = load_config(args.config_file) config['retention_dir'] = get_retention_dir(args.config_file) + config['force'] = args.force - if args.force: - config['force'] = True - else: - config['force'] = False + execution_functions = { + "populate_owl_tag_embeddings": main_populate_owl_tag_embeddings, + "populate_gbif_taxon_tag_embeddings": main_populate_gbif_taxon_tag_embeddings, + "populate_ncbi_taxon_tag_embeddings": main_populate_ncbi_taxon_tag_embeddings, + "populate_abstract_embeddings": main_populate_abstract_embeddings, + "compute_tag_chunk_similarities": main_compute_tag_chunk_similarities, + "display_summary": main_display_summary, + "build_rdf_graph": main_build_graph, + "build_dataset_abstracts_annotations": main_build_dataset_abstracts_annotation, + "evaluate_encoder": similarity_evaluator_main + } - if args.execution_type == "populate_owl_tag_embeddings": - main_populate_owl_tag_embeddings(config) - elif args.execution_type == "populate_gbif_taxon_tag_embeddings": - main_populate_gbif_taxon_tag_embeddings(config) - elif args.execution_type == "populate_ncbi_taxon_tag_embeddings": - main_populate_ncbi_taxon_tag_embeddings(config) - elif args.execution_type == "populate_abstract_embeddings": - main_populate_abstract_embeddings(config) - elif args.execution_type == "compute_tag_chunk_similarities": - main_compute_tag_chunk_similarities(config) - elif args.execution_type == "display_summary": - main_display_summary(config) - elif args.execution_type == "build_rdf_graph": - main_build_graph(config) - elif args.execution_type == "build_dataset_abstracts_annotations": - main_build_dataset_abstracts_annotation(config) - elif args.execution_type == "evaluate_encoder": - similarity_evaluator_main(config) - else: - 
raise ValueError("Type d'exécution non reconnu.") + try: + execution_function = execution_functions[args.execution_type] + print(f"[bold green]Executing:[/bold green] {args.execution_type}") + execution_function(config) + except KeyError: + print(f"[bold red]Error:[/bold red] Unrecognized execution type: {args.execution_type}") + sys.exit(1) + except Exception as e: + print(f"[bold red]Error during execution:[/bold red] {str(e)}") + sys.exit(1) if __name__ == "__main__": main() - - - - - - diff --git a/llm_semantic_annotator/abstract/abstract_manager.py b/llm_semantic_annotator/abstract/abstract_manager.py index dd9cbc7..38495da 100644 --- a/llm_semantic_annotator/abstract/abstract_manager.py +++ b/llm_semantic_annotator/abstract/abstract_manager.py @@ -260,8 +260,10 @@ def build_dataset_abstracts_annotations(self): for filename in files: if pattern.search(filename): abstracts_json = os.path.join(root, filename) - abstracts_gen = filename.split('.json')[0] - abstracts_scores = self.mem.get_filename_pth(abstracts_gen).split('.pth')[0]+"_scores.json" + abstracts_origin_gen = filename.split('.json')[0] + abstracts_gen = self.mem.get_filename_pth(abstracts_origin_gen).split('.pth')[0] + abstracts_scores = abstracts_gen+"_scores.json" + abstracts_annotations_results_file = abstracts_gen+"_queryresults.json" print(abstracts_json) abstracts_data = self._get_data_abstracts_file(abstracts_json) abstracts_annot = load_results(abstracts_scores) @@ -305,7 +307,6 @@ def build_dataset_abstracts_annotations(self): 'reference_id' : reference_id_list }) if not df.empty: - outf = self.config['retention_dir']+f"/QueryResultEntry_{abstracts_gen}.csv" - print(outf) - df.to_csv(outf, index=False) + print(abstracts_annotations_results_file) + df.to_csv(abstracts_annotations_results_file, index=False) diff --git a/llm_semantic_annotator/misc/utils.py b/llm_semantic_annotator/misc/utils.py index c524e80..950389c 100644 --- a/llm_semantic_annotator/misc/utils.py +++ 
b/llm_semantic_annotator/misc/utils.py @@ -1,61 +1,59 @@ -import os,csv,json +import os, csv, json from pathlib import Path -def save_results(data,filename): +def save_results(data, filename): """ - Sauvegarde les résultats dans un fichier JSON. + Saves the results to a JSON file. """ with open(filename, 'w') as f: json.dump(data, f) - print(f"Résultats sauvegardés dans {filename}") + print(f"Results saved in {filename}") def load_results(filename): """ - Charge les résultats depuis un fichier JSON s'il existe. + Loads the results from a JSON file if it exists. """ if os.path.exists(filename): with open(filename, 'r') as f: return json.load(f) - return None + raise FileNotFoundError(f"The file {filename} does not exist.") def list_of_dicts_to_csv(data, filename): - # Vérifier si la liste n'est pas vide + # Check if the list is not empty if not data: - print("La liste est vide.") + print("The list is empty.") return - # Obtenir les en-têtes (toutes les clés uniques de tous les dictionnaires) + # Get headers (all unique keys from all dictionaries) headers = set().union(*(d.keys() for d in data)) - # Ouvrir le fichier en mode écriture + # Open the file in write mode with open(filename, 'w', newline='') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=headers) - # Écrire les en-têtes + # Write the headers writer.writeheader() - # Écrire les données + # Write the data for row in data: writer.writerow(row) - - def dict_to_csv(dictionary, filename): - # Déterminer les en-têtes (clés du dictionnaire) + # Determine the headers (keys of the dictionary) headers = list(dictionary.keys()) - # Ouvrir le fichier en mode écriture + # Open the file in write mode with open(filename, 'w', newline='') as csvfile: - # Créer un objet writer CSV + # Create a CSV writer object writer = csv.DictWriter(csvfile, fieldnames=headers) - # Écrire les en-têtes + # Write the headers writer.writeheader() - # Écrire les données + # Write the data writer.writerow(dictionary) -def 
get_retention_dir(config_file) : +def get_retention_dir(config_file): config_base_name = os.path.basename(config_file) config_name_without_ext = os.path.splitext(config_base_name)[0] retention_dir = os.path.join(os.getcwd(), f"{config_name_without_ext}_workdir") diff --git a/requirements.txt b/requirements.txt index 09b66f8..849664a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ pandas tabulate np pytest +colorama