From 5df0659b65973a60678233f50d8d583df5f796fc Mon Sep 17 00:00:00 2001 From: Olivier Filangi Date: Wed, 16 Oct 2024 18:31:01 +0200 Subject: [PATCH] add id in json structure --- llm_semantic_annotator/core.py | 6 ++++-- llm_semantic_annotator/misc/console.py | 16 ++++++++++++++-- llm_semantic_annotator/misc/stats.py | 7 ++++++- .../similarity/model_embedding_manager.py | 6 +++--- llm_semantic_annotator/tag/owl_tag_manager.py | 1 + llm_semantic_annotator/tag/taxon_tag_manager.py | 4 ++++ 6 files changed, 32 insertions(+), 8 deletions(-) diff --git a/llm_semantic_annotator/core.py b/llm_semantic_annotator/core.py index 746e818..beb7736 100644 --- a/llm_semantic_annotator/core.py +++ b/llm_semantic_annotator/core.py @@ -83,7 +83,9 @@ def main_compute_tag_chunk_similarities(config_all): keep_tag_embeddings = {} for tags_pth_file in tags_pth_files: - tag_embeddings = mem.load_filepth(tags_pth_file) + tag_embeddings_all = mem.load_filepth(tags_pth_file) + + tag_embeddings = { ele : tag_embeddings_all[ele]['emb'] for ele in tag_embeddings_all } for doi,res in mem.compare_tags_with_chunks(tag_embeddings, chunk_embeddings).items(): if doi not in results_complete_similarities: @@ -98,7 +100,7 @@ def main_compute_tag_chunk_similarities(config_all): for doi in chunk_embeddings: results_complete_similarities[doi] = mem.remove_similar_tags_by_doi(keep_tag_embeddings,results_complete_similarities[doi]) - + print(results_complete_similarities) if len(results_complete_similarities)>0: prefix_file_name=abstracts_pth_file.split(".pth")[0].split("_").pop() print("prefix_file_name:",prefix_file_name) diff --git a/llm_semantic_annotator/misc/console.py b/llm_semantic_annotator/misc/console.py index 384997a..04f36fa 100644 --- a/llm_semantic_annotator/misc/console.py +++ b/llm_semantic_annotator/misc/console.py @@ -42,7 +42,14 @@ def display_ontologies_summary(prefix_file_name,results_complete_similarities,re for doi, complete_similarities in results_complete_similarities.items(): for tag, similarity in complete_similarities.items(): - ontology_tag = tag.split('__')[1] + + try: + ontology_tag = tag.split('__')[1] # Extraire le préfixe entre les doubles underscores + except: + ontology_tag = tag + finally: + pass + if ontology_tag not in ontology: ontology.append(ontology_tag) count_ontology.append(1) @@ -52,7 +59,12 @@ def display_ontologies_summary(prefix_file_name,results_complete_similarities,re count_ontology[index] += 1 similarity_ontology[index].append(similarity) - t = tag.split('__')[2] + try: + t = tag.split('__')[2] + except: + t = tag + finally: + pass if t not in tag_list: ontology_tag_list.append(ontology_tag) diff --git a/llm_semantic_annotator/misc/stats.py b/llm_semantic_annotator/misc/stats.py index c844b7a..b50d866 100644 --- a/llm_semantic_annotator/misc/stats.py +++ b/llm_semantic_annotator/misc/stats.py @@ -7,7 +7,12 @@ def display_ontologies_distribution(data): labels = [] for doi, item in data.items(): for key in item.keys(): - ontology = key.split('__')[1] # Extraire le préfixe entre les doubles underscores + try: + ontology = key.split('__')[1] # Extraire le préfixe entre les doubles underscores + except: + ontology = key + finally: + pass ontologies.append(ontology) labels.append(key) diff --git a/llm_semantic_annotator/similarity/model_embedding_manager.py b/llm_semantic_annotator/similarity/model_embedding_manager.py index 227ba3c..8fea3fb 100644 --- a/llm_semantic_annotator/similarity/model_embedding_manager.py +++ b/llm_semantic_annotator/similarity/model_embedding_manager.py @@ -91,7 +91,7 @@ def get_filename_pth(self,name_embeddings): return f"{self.retention_dir}/{name_embeddings}-{self.model_suffix}.pth" def load_filepth(self,filename_embeddings): - return torch.load(filename_embeddings,weights_only=True) + return torch.load(filename_embeddings,weights_only=False) def load_pth(self,name_embeddings): filename = self.get_filename_pth(name_embeddings) @@ -100,7 +100,7 @@ def load_pth(self,name_embeddings): if os.path.exists(filename): print(f"load embeddings - {filename}") - tag_embeddings = torch.load(filename,weights_only=True) + tag_embeddings = torch.load(filename,weights_only=False) return tag_embeddings def save_pth(self,tag_embeddings,name_embeddings): @@ -209,7 +209,7 @@ def encode_tags(self,tags): tags_embedding={} print("set encoding.....") for idx,item in tqdm(enumerate(embeddings)): - tags_embedding[tags[idx]['label']] = item + tags_embedding[tags[idx]['term']] = { 'label' : tags[idx]['rdfs_label'] , 'emb' : item } return tags_embedding diff --git a/llm_semantic_annotator/tag/owl_tag_manager.py b/llm_semantic_annotator/tag/owl_tag_manager.py index d119d43..287a9f0 100644 --- a/llm_semantic_annotator/tag/owl_tag_manager.py +++ b/llm_semantic_annotator/tag/owl_tag_manager.py @@ -146,6 +146,7 @@ def build_corpus( nb_record+=1 df = pd.DataFrame({ + 'term' : [ ele['term'] for ele in tags ], 'label': [ ele['label'] for ele in tags ], 'rdfs:label': [ ele['rdfs_label'] for ele in tags ], 'description': [ ele['description'] for ele in tags ] diff --git a/llm_semantic_annotator/tag/taxon_tag_manager.py b/llm_semantic_annotator/tag/taxon_tag_manager.py index 0f45f75..6e46dfa 100644 --- a/llm_semantic_annotator/tag/taxon_tag_manager.py +++ b/llm_semantic_annotator/tag/taxon_tag_manager.py @@ -225,6 +225,7 @@ def manage_gbif_taxon_tags(self): if tag_count % self.tags_per_file == 0: df = pd.DataFrame({ + 'term': [ ele['term'] for ele in tags ], 'label': [ ele['label'] for ele in tags ], 'rdfs:label': [ ele['rdfs_label'] for ele in tags ], 'description': [ ele['description'] for ele in tags ] @@ -240,6 +241,7 @@ def manage_gbif_taxon_tags(self): # Sauvegarder les abstracts restants if tags: df = pd.DataFrame({ + 'term': [ ele['term'] for ele in tags ], 'label': [ ele['label'] for ele in tags ], 'rdfs:label': [ ele['rdfs_label'] for ele in tags ], 'description': [ ele['description'] for ele in tags ] @@ -422,6 +424,7 @@ def manage_ncbi_taxon_tags(self): if tag_count % self.tags_per_file == 0: df = pd.DataFrame({ + 'term': [ ele['term'] for ele in tags ], 'label': [ ele['label'] for ele in tags ], 'rdfs:label': [ ele['rdfs_label'] for ele in tags ], 'description': [ ele['description'] for ele in tags ] @@ -437,6 +440,7 @@ def manage_ncbi_taxon_tags(self): # Sauvegarder les abstracts restants if tags: df = pd.DataFrame({ + 'term': [ ele['term'] for ele in tags ], 'label': [ ele['label'] for ele in tags ], 'rdfs:label': [ ele['rdfs_label'] for ele in tags ], 'description': [ ele['description'] for ele in tags ]