Skip to content

Commit

Permalink
add id in json structure
Browse files (browse the repository at this point in the history)
  • Loading branch information
ofilangi committed Oct 16, 2024
1 parent 523e3f3 commit 5df0659
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 8 deletions.
6 changes: 4 additions & 2 deletions llm_semantic_annotator/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def main_compute_tag_chunk_similarities(config_all):
keep_tag_embeddings = {}

for tags_pth_file in tags_pth_files:
tag_embeddings = mem.load_filepth(tags_pth_file)
tag_embeddings_all = mem.load_filepth(tags_pth_file)

tag_embeddings = { ele : tag_embeddings_all[ele]['emb'] for ele in tag_embeddings_all }

for doi,res in mem.compare_tags_with_chunks(tag_embeddings, chunk_embeddings).items():
if doi not in results_complete_similarities:
Expand All @@ -98,7 +100,7 @@ def main_compute_tag_chunk_similarities(config_all):

for doi in chunk_embeddings:
results_complete_similarities[doi] = mem.remove_similar_tags_by_doi(keep_tag_embeddings,results_complete_similarities[doi])

print(results_complete_similarities)
if len(results_complete_similarities)>0:
prefix_file_name=abstracts_pth_file.split(".pth")[0].split("_").pop()
print("prefix_file_name:",prefix_file_name)
Expand Down
16 changes: 14 additions & 2 deletions llm_semantic_annotator/misc/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,14 @@ def display_ontologies_summary(prefix_file_name,results_complete_similarities,re
for doi, complete_similarities in results_complete_similarities.items():

for tag, similarity in complete_similarities.items():
ontology_tag = tag.split('__')[1]

try:
ontology_tag = tag.split('__')[1] # Extraire le préfixe entre les doubles underscores
except:
ontology_tag = tag
finally:
pass

if ontology_tag not in ontology:
ontology.append(ontology_tag)
count_ontology.append(1)
Expand All @@ -52,7 +59,12 @@ def display_ontologies_summary(prefix_file_name,results_complete_similarities,re
count_ontology[index] += 1
similarity_ontology[index].append(similarity)

t = tag.split('__')[2]
try:
t = tag.split('__')[2]
except:
t = tag
finally:
pass

if t not in tag_list:
ontology_tag_list.append(ontology_tag)
Expand Down
7 changes: 6 additions & 1 deletion llm_semantic_annotator/misc/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ def display_ontologies_distribution(data):
labels = []
for doi, item in data.items():
for key in item.keys():
ontology = key.split('__')[1] # Extraire le préfixe entre les doubles underscores
try:
ontology = key.split('__')[1] # Extraire le préfixe entre les doubles underscores
except:
ontology = key
finally:
pass
ontologies.append(ontology)
labels.append(key)

Expand Down
6 changes: 3 additions & 3 deletions llm_semantic_annotator/similarity/model_embedding_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def get_filename_pth(self,name_embeddings):
return f"{self.retention_dir}/{name_embeddings}-{self.model_suffix}.pth"

def load_filepth(self,filename_embeddings):
return torch.load(filename_embeddings,weights_only=True)
return torch.load(filename_embeddings,weights_only=False)

def load_pth(self,name_embeddings):
filename = self.get_filename_pth(name_embeddings)
Expand All @@ -100,7 +100,7 @@ def load_pth(self,name_embeddings):

if os.path.exists(filename):
print(f"load embeddings - {filename}")
tag_embeddings = torch.load(filename,weights_only=True)
tag_embeddings = torch.load(filename,weights_only=False)
return tag_embeddings

def save_pth(self,tag_embeddings,name_embeddings):
Expand Down Expand Up @@ -209,7 +209,7 @@ def encode_tags(self,tags):
tags_embedding={}
print("set encoding.....")
for idx,item in tqdm(enumerate(embeddings)):
tags_embedding[tags[idx]['label']] = item
tags_embedding[tags[idx]['term']] = { 'label' : tags[idx]['rdfs_label'] , 'emb' : item }

return tags_embedding

Expand Down
1 change: 1 addition & 0 deletions llm_semantic_annotator/tag/owl_tag_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def build_corpus(
nb_record+=1

df = pd.DataFrame({
'term' : [ ele['term'] for ele in tags ],
'label': [ ele['label'] for ele in tags ],
'rdfs:label': [ ele['rdfs_label'] for ele in tags ],
'description': [ ele['description'] for ele in tags ]
Expand Down
4 changes: 4 additions & 0 deletions llm_semantic_annotator/tag/taxon_tag_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def manage_gbif_taxon_tags(self):

if tag_count % self.tags_per_file == 0:
df = pd.DataFrame({
'term': [ ele['term'] for ele in tags ],
'label': [ ele['label'] for ele in tags ],
'rdfs:label': [ ele['rdfs_label'] for ele in tags ],
'description': [ ele['description'] for ele in tags ]
Expand All @@ -240,6 +241,7 @@ def manage_gbif_taxon_tags(self):
# Sauvegarder les abstracts restants
if tags:
df = pd.DataFrame({
'term': [ ele['term'] for ele in tags ],
'label': [ ele['label'] for ele in tags ],
'rdfs:label': [ ele['rdfs_label'] for ele in tags ],
'description': [ ele['description'] for ele in tags ]
Expand Down Expand Up @@ -422,6 +424,7 @@ def manage_ncbi_taxon_tags(self):

if tag_count % self.tags_per_file == 0:
df = pd.DataFrame({
'term': [ ele['term'] for ele in tags ],
'label': [ ele['label'] for ele in tags ],
'rdfs:label': [ ele['rdfs_label'] for ele in tags ],
'description': [ ele['description'] for ele in tags ]
Expand All @@ -437,6 +440,7 @@ def manage_ncbi_taxon_tags(self):
# Sauvegarder les abstracts restants
if tags:
df = pd.DataFrame({
'term': [ ele['term'] for ele in tags ],
'label': [ ele['label'] for ele in tags ],
'rdfs:label': [ ele['rdfs_label'] for ele in tags ],
'description': [ ele['description'] for ele in tags ]
Expand Down

0 comments on commit 5df0659

Please sign in to comment.