Skip to content

Commit

Permalink
Refactor entity and relation extraction in iText2KG
Browse files Browse the repository at this point in the history
  • Loading branch information
lairgiyassir committed Jul 10, 2024
1 parent c2d879c commit 01a9064
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 3 deletions.
14 changes: 13 additions & 1 deletion itext2kg/graph_integration/itext2kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,32 @@ def extract_entities_for_all_sections(self, sections:List[str], ent_threshold =
print("[INFO] Extracting Entities from the Document", i+1)
entities = self.ientities_extractor.extract_entities(context= sections[i])
processed_entities, global_entities = self.matcher.process_lists(list1 = entities, list2=global_entities, for_entity_or_relation="entity", threshold=ent_threshold)
return global_entities
return self.data_handler.handle_data(global_entities, data_type="entity")


def extract_relations_for_all_sections(self, sections:List[str], entities, rel_threshold = 0.8):
    """Extract relations from every section and merge them into a single list.

    For each section the relations extractor is queried with the names of the
    known entities. From the second section on, the freshly extracted relations
    are first passed through ``self.matcher.process_lists`` together with the
    relations collected so far. Relations reported by
    ``self.data_handler.find_relations_with_isolated_entities`` are then removed
    and replaced by whatever ``extract_relations_for_isolated_entities`` returns
    for that section.

    Args:
        sections: Text sections to process, in order.
        entities: Iterable of entity dicts; only their ``"name"`` key is read.
        rel_threshold: Threshold forwarded to ``self.matcher.process_lists``.

    Returns:
        A flat list with the relations kept for all sections; an empty list
        when ``sections`` is empty.
    """
    if not sections:
        return []

    # Keep the entity names in their own variable: they must stay the same for
    # every section and must never be replaced by extracted relations.
    entity_names = [entity["name"] for entity in entities]
    global_relationships = []

    for index, section in enumerate(sections):
        print("[INFO] Extracting Relations from the Document", index + 1)
        section_relationships = self.irelations_extractor.extract_relations(context=section, entities=entity_names)

        if index > 0:
            # The first section has nothing to be matched against yet.
            section_relationships, _ = self.matcher.process_lists(list1=section_relationships, list2=global_relationships, for_entity_or_relation="relation", threshold=rel_threshold)

        relations_with_isolated_entities = self.data_handler.find_relations_with_isolated_entities(global_entities=entity_names, relations=section_relationships)
        if relations_with_isolated_entities:
            corrected_relations = self.irelations_extractor.extract_relations_for_isolated_entities(context=section, entities=entity_names, relations_with_isolated_entities=relations_with_isolated_entities)
            # Concatenate (not nest) the corrected relations so the result stays a flat list.
            section_relationships = [rel for rel in section_relationships if rel not in relations_with_isolated_entities] + list(corrected_relations or [])

        global_relationships.extend(section_relationships)

    # NOTE: the relations are returned as collected; the
    # data_handler.handle_data(..., data_type="relation") post-processing step
    # was disabled in the original code and is still not applied here.
    return global_relationships


Expand Down
26 changes: 24 additions & 2 deletions itext2kg/irelation_extraction/irelations_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ def extract_relations(self, context: str, entities: List[str], embeddings: bool
formatted_context = f"context : \n -- '{context}' \n entities : \n -- {entities}"
IE_query = '''
# Directives
- Extract relationships between the provided entities based on the context.
- Adhere completely to the provided entities list.
- Do not add any entity outside the provided list.
- Extract ONE predicate per subject and object.
- ALL entities in the provided list should have a relation.
'''

relationships = self.langchain_output_parser.extract_information_as_json_for_context(output_data_structure = RelationshipsExtractor, context=formatted_context, IE_query=IE_query)
Expand All @@ -45,6 +44,29 @@ def extract_relations(self, context: str, entities: List[str], embeddings: bool
return []

return list(map(lambda rel : self.__add_embeddings_as_property(entity = rel, embeddings=embeddings, property_name=property_name, entity_name_name=entity_name_name) , relationships["relationships"]))



def extract_relations_for_isolated_entities(self, context: str, entities: List[str], relations_with_isolated_entities:List[str], embeddings: bool = True, property_name = "properties", entity_name_name = "name", max_retries: int = 3):
    """Re-extract relations to replace those that reference isolated entities.

    Builds a prompt naming the offending ``startNode -> endNode`` pairs and
    asks ``self.langchain_output_parser`` for a new set of relations restricted
    to ``entities``. The query is repeated while the reply is ``None`` or has
    no ``"relationships"`` key, up to ``max_retries`` additional attempts.

    Args:
        context: Text the relations are extracted from.
        entities: Entity names the relations must be limited to.
        relations_with_isolated_entities: Relations to correct; each item is
            subscripted with ``"startNode"`` and ``"endNode"``.
        embeddings: Forwarded to ``__add_embeddings_as_property``.
        property_name: Forwarded to ``__add_embeddings_as_property``.
        entity_name_name: Forwarded to ``__add_embeddings_as_property``.
        max_retries: Number of extra queries allowed after a failed one.

    Returns:
        The re-extracted relations, each passed through
        ``__add_embeddings_as_property``; an empty list when ``entities`` is
        empty or no usable reply was obtained.
    """
    print("Some relations with isolated entities were detected ... trying to solve them!")
    if not entities:
        # Nothing to relate, so skip the query entirely.
        return []

    formatted_context = f"context : \n -- '{context}' \n entities : \n -- {entities}"
    isolated_pairs = [f"{rel['startNode']} -> {rel['endNode']}" for rel in relations_with_isolated_entities]
    IE_query = (
        "\n# Directives\n"
        f"The relation {isolated_pairs} contains missed entities in the provided entities list. "
        "Try to re-extract a relation from the context based on the provided entities.\n"
    )

    relationships = None
    for attempt in range(max_retries + 1):
        if attempt:
            print("we are retrying ....")
        relationships = self.langchain_output_parser.extract_information_as_json_for_context(output_data_structure = RelationshipsExtractor, context=formatted_context, IE_query=IE_query)
        print(relationships)
        # Test for None first: a missing reply has no keys to inspect.
        if relationships is not None and "relationships" in relationships:
            break
    else:
        # Every attempt failed; give up instead of indexing an unusable reply.
        return []

    return [
        self.__add_embeddings_as_property(entity = rel, embeddings=embeddings, property_name=property_name, entity_name_name=entity_name_name)
        for rel in relationships["relationships"]
    ]




0 comments on commit 01a9064

Please sign in to comment.