From d0e50ff05d9f02c93bb424c34979d0add5528d1d Mon Sep 17 00:00:00 2001 From: Yassir LAIRGI Date: Tue, 16 Jul 2024 03:11:20 +0200 Subject: [PATCH] Adding Examples of use in Jupyter notebook --- .gitignore | 2 +- examples/examples_of_use.ipynb | 463 +++++++++++++++++++++++++++++++++ 2 files changed, 464 insertions(+), 1 deletion(-) create mode 100644 examples/examples_of_use.ipynb diff --git a/.gitignore b/.gitignore index 9f0c3fd..70353f3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ __pycache__/ *$py.class # Notebooks -examples/ +examples/itext2kg.ipynb # C extensions *.so diff --git a/examples/examples_of_use.ipynb b/examples/examples_of_use.ipynb new file mode 100644 index 0000000..8b6d579 --- /dev/null +++ b/examples/examples_of_use.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "\n", + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload your documents\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we load a scientific article, `bioclip.pdf`, from the `datasets/scientific_articles` directory. `PyPDFLoader` splits the document into individual pages (chunks), and we keep only the first 16 pages for further processing in order to exclude the references." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFLoader\n", + "\n", + "loader = PyPDFLoader(f\"../datasets/scientific_articles/bioclip.pdf\")\n", + "pages = loader.load_and_split()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# excluding references\n", + "pages = pages[:16]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document Distiller\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section passes an OpenAI API key to `DocumentsDisiller`, which extracts structured information from the loaded pages (replace the `##` placeholder below with your own key). The distilled information is then formatted into semantic blocks, which serve as input for knowledge graph construction." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_API_KEY = \"##\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from itext2kg.documents_distiller import DocumentsDisiller, Article\n", + "from itext2kg.graph_integration import iText2KG\n", + "\n", + "\n", + "document_distiller = DocumentsDisiller(openai_api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "IE_query = '''\n", + "# DIRECTIVES : \n", + "- Act like an experienced information extractor. 
\n", + "- You have a chunk of a scientific paper.\n", + "- If you do not find the right information, keep its place empty.\n", + "'''\n", + "# Curly braces are replaced with square brackets to avoid an error in the query\n", + "distilled_doc = document_distiller.distill(documents=[page.page_content.replace(\"{\", '[').replace(\"}\", \"]\") for page in pages], IE_query=IE_query, output_data_structure=Article)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "semantic_blocks = [f\"{key} - {value}\".replace(\"{\", \"[\").replace(\"}\", \"]\") for key, value in distilled_doc.items()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# iText2KG\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section initializes an instance of iText2KG and uses it to construct a knowledge graph. Two methods are applied: one using local context for higher precision (highly recommended), and another using global context to enrich the graph despite being less precise." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Local entities as context (more precise method - highly recommended)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "itext2kg = iText2KG(openai_api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] Extracting Entities from the Document 1\n", + "{'entities': [{'label': 'Model', 'name': 'BIOCLIP'}, {'label': 'Domain', 'name': 'Tree of Life'}]}\n", + "[INFO] Extracting Relations from the Document 1\n", + "{'relationships': [{'startNode': 'BIOCLIP', 'endNode': 'tree of life', 'name': 'is a vision foundation model for'}]}\n", + "Some isolated entities without relations were detected ... 
trying to solve them!\n", + "{'relationships': []}\n", + "{'name': 'BIOCLIP', 'label': 'entity', 'properties': {'embeddings': array([ 0.03529191, -0.00025876, -0.02022829, ..., -0.02821856,\n", + " -0.02163173, 0.02438248])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- BIOCLIP -merged--> bioclip \n", + "[INFO] Extracting Entities from the Document 2\n", + "{'entities': [{'label': 'Person', 'name': 'Samuel Stevens'}, {'label': 'Person', 'name': 'Jiaman Wu'}, {'label': 'Person', 'name': 'Matthew J Thompson'}, {'label': 'Person', 'name': 'Elizabeth G Campolongo'}, {'label': 'Person', 'name': 'Chan Hee Song'}, {'label': 'Person', 'name': 'David Edward Carlyn'}, {'label': 'Person', 'name': 'Li Dong'}, {'label': 'Person', 'name': 'Wasila M Dahdul'}, {'label': 'Person', 'name': 'Charles Stewart'}, {'label': 'Person', 'name': 'Tanya Berger-Wolf'}, {'label': 'Person', 'name': 'Wei-Lun Chao'}, {'label': 'Person', 'name': 'Yu Su'}, {'label': 'Organization', 'name': 'The Ohio State University'}, {'label': 'Organization', 'name': 'Microsoft Research'}, {'label': 'Organization', 'name': 'University of California, Irvine'}, {'label': 'Organization', 'name': 'Rensselaer Polytechnic Institute'}]}\n", + "[INFO] Extracting Relations from the Document 2\n", + "{'relationships': [{'startNode': 'Samuel Stevens', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Jiaman Wu', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Matthew J Thompson', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Elizabeth G Campolongo', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Chan Hee Song', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'David Edward Carlyn', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Li Dong', 'endNode': 'Microsoft Research', 'name': 'affiliated_with'}, 
{'startNode': 'Wasila M Dahdul', 'endNode': 'University of California, Irvine', 'name': 'affiliated_with'}, {'startNode': 'Charles Stewart', 'endNode': 'Rensselaer Polytechnic Institute', 'name': 'affiliated_with'}, {'startNode': 'Tanya Berger-Wolf', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Wei-Lun Chao', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Yu Su', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}]}\n", + "Some isolated entities without relations were detected ... trying to solve them!\n", + "{'relationships': [{'startNode': 'Samuel Stevens', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Jiaman Wu', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Matthew J Thompson', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Elizabeth G Campolongo', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Chan Hee Song', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'David Edward Carlyn', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Li Dong', 'endNode': 'Microsoft Research', 'name': 'affiliated_with'}, {'startNode': 'Wasila M Dahdul', 'endNode': 'University of California, Irvine', 'name': 'affiliated_with'}, {'startNode': 'Charles Stewart', 'endNode': 'Rensselaer Polytechnic Institute', 'name': 'affiliated_with'}, {'startNode': 'Tanya Berger-Wolf', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Wei-Lun Chao', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}, {'startNode': 'Yu Su', 'endNode': 'The Ohio State University', 'name': 'affiliated_with'}]}\n", + "{'name': 'Samuel Stevens', 'label': 'entity', 'properties': {'embeddings': array([ 0.01319295, -0.00159933, -0.0065599 , ..., 0.01359126,\n", + " -0.02688989, 
-0.01259955])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Samuel Stevens -merged--> samuel stevens \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Jiaman Wu', 'label': 'entity', 'properties': {'embeddings': array([-0.02124246, 0.02894346, -0.01605723, ..., -0.00375747,\n", + " -0.00074927, -0.02002907])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Jiaman Wu -merged--> jiaman wu \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Matthew J Thompson', 'label': 'entity', 'properties': {'embeddings': array([ 0.019909 , -0.00540362, -0.02152748, ..., -0.00784874,\n", + " -0.02345921, 0.00867973])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Matthew J Thompson -merged--> matthew j thompson \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Elizabeth G Campolongo', 'label': 'entity', 'properties': {'embeddings': array([-0.00955247, -0.00485551, -0.021499 , ..., 0.0042451 ,\n", + " 0.00260216, -0.00203139])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Elizabeth G Campolongo -merged--> elizabeth g campolongo \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Chan Hee Song', 'label': 'entity', 'properties': {'embeddings': array([-0.01169247, -0.01854723, -0.02497609, ..., -0.0130586 ,\n", + " 0.01284163, -0.02478322])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Chan Hee Song -merged--> chan hee song \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00575567, 0.05847129, -0.02463111, ..., 0.00478194,\n", + " 0.01209479, -0.00122801])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'David Edward Carlyn', 'label': 'entity', 'properties': {'embeddings': array([ 0.00356838, 0.03543546, -0.03322074, ..., -0.0161544 ,\n", + " 0.01265319, -0.0067785 ])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- David Edward Carlyn -merged--> david edward carlyn \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00575931, 0.05853131, -0.02459829, ..., 0.00478563,\n", + " 0.01214146, -0.00120725])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Li Dong', 'label': 'entity', 'properties': {'embeddings': array([-0.01724486, 0.00562865, -0.0085898 , ..., -0.00198226,\n", + " -0.00758643, -0.02024681])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Li Dong -merged--> li dong \n", + "{'name': 'Microsoft Research', 'label': 'entity', 'properties': {'embeddings': array([-0.00819754, 0.00010021, -0.01921595, ..., -0.02712951,\n", + " 0.00384556, -0.00264811])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Microsoft Research -merged--> microsoft research \n", + "{'name': 'Wasila M Dahdul', 'label': 'entity', 'properties': {'embeddings': array([-0.00469269, 0.01187814, -0.01905607, ..., 0.01227608,\n", + " -0.00190429, 0.00049836])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Wasila M Dahdul -merged--> wasila m dahdul \n", + "{'name': 'University of California, Irvine', 'label': 'entity', 'properties': {'embeddings': array([-0.00452201, 0.01641323, -0.02941302, ..., -0.00701032,\n", + " 0.03432583, -0.00893237])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- University of California, Irvine -merged--> university of california, irvine \n", + "{'name': 'Charles Stewart', 'label': 'entity', 'properties': {'embeddings': array([ 0.02761898, -0.0221835 , -0.01609331, ..., 0.00171572,\n", + " -0.00287381, 0.00434687])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Charles Stewart -merged--> charles stewart \n", + "{'name': 'Rensselaer Polytechnic Institute', 'label': 'entity', 'properties': {'embeddings': array([-0.02283968, 0.00605274, -0.02625268, ..., -0.00856298,\n", + " -0.00080183, 0.0124026 ])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Rensselaer Polytechnic Institute -merged--> rensselaer polytechnic institute \n", + "{'name': 'Tanya Berger-Wolf', 'label': 'entity', 'properties': {'embeddings': array([ 0.01396098, -0.0019643 , -0.03048589, ..., -0.01547452,\n", + " -0.00822937, 0.01690536])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Tanya Berger-Wolf -merged--> tanya berger wolf \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Wei-Lun Chao', 'label': 'entity', 'properties': {'embeddings': array([-0.01098281, 0.00211335, -0.02397597, ..., -0.00561499,\n", + " -0.00096192, -0.00828448])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Wei-Lun Chao -merged--> wei lun chao \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Yu Su', 'label': 'entity', 'properties': {'embeddings': array([-0.00584009, 0.0171121 , 0.0013933 , ..., -0.00952155,\n", + " -0.01110717, -0.00173476])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Yu Su -merged--> yu su \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Samuel Stevens', 'label': 'entity', 'properties': {'embeddings': array([ 0.01322924, -0.00158661, -0.00655774, ..., 0.01360304,\n", + " -0.02688103, -0.01265229])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Samuel Stevens -merged--> samuel stevens \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Jiaman Wu', 'label': 'entity', 'properties': {'embeddings': array([-0.02125024, 0.02895405, -0.01611166, ..., -0.00376289,\n", + " -0.00072628, -0.0200364 ])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Jiaman Wu -merged--> jiaman wu \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00578016, 0.05847981, -0.02461893, ..., 0.00478658,\n", + " 0.01211232, -0.00122129])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Matthew J Thompson', 'label': 'entity', 'properties': {'embeddings': array([ 0.01987364, -0.00540347, -0.02152688, ..., -0.00784852,\n", + " -0.02345855, 0.00869689])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Matthew J Thompson -merged--> matthew j thompson \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00574703, 0.05843203, -0.02462786, ..., 0.00478525,\n", + " 0.01212473, -0.00120222])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Elizabeth G Campolongo', 'label': 'entity', 'properties': {'embeddings': array([-0.00955247, -0.00485551, -0.021499 , ..., 0.0042451 ,\n", + " 0.00260216, -0.00203139])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Elizabeth G Campolongo -merged--> elizabeth g campolongo \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00574703, 0.05843203, -0.02462786, ..., 0.00478525,\n", + " 0.01212473, -0.00120222])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Chan Hee Song', 'label': 'entity', 'properties': {'embeddings': array([-0.01169247, -0.01854723, -0.02497609, ..., -0.0130586 ,\n", + " 0.01284163, -0.02478322])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Chan Hee Song -merged--> chan hee song \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'David Edward Carlyn', 'label': 'entity', 'properties': {'embeddings': array([ 0.00353234, 0.03547408, -0.03322641, ..., -0.0161653 ,\n", + " 0.0126635 , -0.00675929])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- David Edward Carlyn -merged--> david edward carlyn \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576045, 0.05851134, -0.02460315, ..., 0.0047984 ,\n", + " 0.01212809, -0.00122917])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Li Dong', 'label': 'entity', 'properties': {'embeddings': array([-0.01724486, 0.00562865, -0.0085898 , ..., -0.00198226,\n", + " -0.00758643, -0.02024681])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Li Dong -merged--> li dong \n", + "{'name': 'Microsoft Research', 'label': 'entity', 'properties': {'embeddings': array([-0.00815963, 0.00013193, -0.01923476, ..., -0.02711041,\n", + " 0.00384317, -0.00265756])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Microsoft Research -merged--> microsoft research \n", + "{'name': 'Wasila M Dahdul', 'label': 'entity', 'properties': {'embeddings': array([-0.00467227, 0.0117989 , -0.01903435, ..., 0.01228676,\n", + " -0.00189987, 0.00050898])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Wasila M Dahdul -merged--> wasila m dahdul \n", + "{'name': 'University of California, Irvine', 'label': 'entity', 'properties': {'embeddings': array([-0.00448329, 0.01646532, -0.02938869, ..., -0.00701611,\n", + " 0.03433467, -0.00894265])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- University of California, Irvine -merged--> university of california, irvine \n", + "{'name': 'Charles Stewart', 'label': 'entity', 'properties': {'embeddings': array([ 0.02761898, -0.0221835 , -0.01609331, ..., 0.00171572,\n", + " -0.00287381, 0.00434687])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Charles Stewart -merged--> charles stewart \n", + "{'name': 'Rensselaer Polytechnic Institute', 'label': 'entity', 'properties': {'embeddings': array([-0.02283968, 0.00605274, -0.02625268, ..., -0.00856298,\n", + " -0.00080183, 0.0124026 ])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Rensselaer Polytechnic Institute -merged--> rensselaer polytechnic institute \n", + "{'name': 'Tanya Berger-Wolf', 'label': 'entity', 'properties': {'embeddings': array([ 0.01394409, -0.00197252, -0.03055129, ..., -0.01551549,\n", + " -0.00822917, 0.01693802])}}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- Tanya Berger-Wolf -merged--> tanya berger wolf \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00577644, 0.05845048, -0.02469872, ..., 0.00478676,\n", + " 0.01212855, -0.00120359])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Wei-Lun Chao', 'label': 'entity', 'properties': {'embeddings': array([-0.01101528, 0.00213179, -0.02404084, ..., -0.00559415,\n", + " -0.00098145, -0.00826352])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Wei-Lun Chao -merged--> wei lun chao \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "{'name': 'Yu Su', 'label': 'entity', 'properties': {'embeddings': array([-0.00584009, 0.0171121 , 0.0013933 , ..., -0.00952155,\n", + " -0.01110717, -0.00173476])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Yu Su -merged--> yu su \n", + "{'name': 'The Ohio State University', 'label': 'entity', 'properties': {'embeddings': array([ 0.00576804, 0.05847684, -0.02464922, ..., 0.0048021 ,\n", + " 0.01214325, -0.00123207])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- The Ohio State University -merged--> the ohio state university \n", + "[INFO] Extracting Entities from the Document 3\n", + "{'entities': [{'label': 'Dataset', 'name': 'TREEOFLIFE-10M'}, {'label': 'Model', 'name': 'BIOCLIP'}, {'label': 'Domain', 'name': 'biology images'}, {'label': 'Task', 'name': 'fine-grained biology classification tasks'}, {'label': 'Concept', 'name': 'hierarchical representation'}, {'label': 'Concept', 'name': 'tree of life'}]}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- treeoflife 10m -merged--> tree of life \n", + "[INFO] Extracting Relations from the Document 3\n", + "{'relationships': [{'startNode': 'bioclip', 'endNode': 'tree of life', 'name': 'is a foundation model for'}, {'startNode': 'bioclip', 'endNode': 'biology images', 'name': 'trained on'}, {'startNode': 'bioclip', 'endNode': 'fine grained biology classification tasks', 'name': 'outperforms existing baselines in'}, {'startNode': 'bioclip', 'endNode': 'hierarchical representation', 'name': 'has learned a'}, {'startNode': 'hierarchical representation', 'endNode': 'tree of life', 'name': 'conforms to'}]}\n", + "[INFO] Wohoo ! Relation using embeddings is matched --- is a foundation model for -merged--> is a vision foundation model for \n", + "[INFO] Extracting Entities from the Document 4\n", + "{'entities': [{'label': 'Dataset', 'name': 'TREEOFLIFE-10M'}, {'label': 'Dataset', 'name': 'RARE SPECIES'}, {'label': 'Model', 'name': 'BIOCLIP'}, {'label': 'Technique', 'name': 'multimodal contrastive learning'}, {'label': 'Technique', 'name': 'zero-shot classification'}, {'label': 'Technique', 'name': 'few-shot classification'}, {'label': 'Technique', 'name': 'fine-grained classification'}, {'label': 'Technique', 'name': 'mixed text type training strategy'}, {'label': 'Technique', 'name': 'hierarchical representation'}, {'label': 'Organization', 'name': 'Hugging Face'}, {'label': 'Organization', 'name': 'OpenAI'}, {'label': 'Technique', 'name': 'cross-entropy methods'}, {'label': 'Technique', 'name': 'hierarchical cross-entropy methods'}]}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- treeoflife 10m -merged--> tree of life \n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- fine grained classification -merged--> fine grained biology classification tasks \n", + "[INFO] Extracting Relations from the Document 4\n", + "{'relationships': [{'startNode': 'bioclip', 'endNode': 'tree of life', 'name': 'is a foundation model for'}, {'startNode': 'bioclip', 'endNode': 'multimodal contrastive learning', 'name': 'utilizes'}, {'startNode': 'bioclip', 'endNode': 'zero shot classification', 'name': 'achieves strong performance in'}, {'startNode': 'bioclip', 'endNode': 'few shot classification', 'name': 'achieves strong performance in'}, {'startNode': 'bioclip', 'endNode': 'fine grained biology classification tasks', 'name': 'outperforms existing baselines in'}, {'startNode': 'bioclip', 'endNode': 'mixed text type training strategy', 'name': 'is enhanced by'}, {'startNode': 'bioclip', 'endNode': 'hierarchical representation', 'name': 'learns'}, {'startNode': 'bioclip', 'endNode': 'rare species', 'name': 'classifies effectively'}, {'startNode': 'multimodal contrastive learning', 'endNode': 'hierarchical representation', 'name': 'is used for learning'}, {'startNode': 'zero shot classification', 'endNode': 'rare species', 'name': 'is used for classifying'}, {'startNode': 'openai', 'endNode': 'bioclip', 'name': 'provides initial model weights for'}, {'startNode': 'cross entropy methods', 'endNode': 'hierarchical cross entropy methods', 'name': 'are outperformed by'}]}\n", + "[INFO] Wohoo ! Relation using embeddings is matched --- is a foundation model for -merged--> is a vision foundation model for \n", + "[INFO] Wohoo ! Relation using embeddings is matched --- learns -merged--> has learned a \n", + "Some isolated entities without relations were detected ... 
trying to solve them!\n", + "{'relationships': [{'startNode': 'Hugging Face', 'endNode': 'TREEOFLIFE-10M dataset', 'name': 'hosts'}, {'startNode': 'Hugging Face', 'endNode': 'RARE SPECIES dataset', 'name': 'hosts'}]}\n", + "{'name': 'Hugging Face', 'label': 'entity', 'properties': {'embeddings': array([-0.02319396, 0.02293837, -0.00191486, ..., -0.01873727,\n", + " -0.00598219, -0.01356176])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Hugging Face -merged--> hugging face \n", + "{'name': 'TREEOFLIFE-10M dataset', 'label': 'entity', 'properties': {'embeddings': array([-0.03017091, 0.0142082 , -0.01843049, ..., -0.00654659,\n", + " -0.01063361, 0.01785659])}}\n", + "{'name': 'Hugging Face', 'label': 'entity', 'properties': {'embeddings': array([-0.02319396, 0.02293837, -0.00191486, ..., -0.01873727,\n", + " -0.00598219, -0.01356176])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- Hugging Face -merged--> hugging face \n", + "{'name': 'RARE SPECIES dataset', 'label': 'entity', 'properties': {'embeddings': array([-0.03784592, 0.03343168, -0.02633557, ..., 0.00614247,\n", + " 0.0018063 , 0.02260428])}}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- RARE SPECIES dataset -merged--> rare species \n", + "[INFO] Extracting Entities from the Document 5\n", + "{'entities': [{'label': 'Species', 'name': 'Onoclea sensibilis'}, {'label': 'Species', 'name': 'Onoclea hintonii'}, {'label': 'Methodology', 'name': 'Computational methods'}, {'label': 'Technique', 'name': 'Extracting biologically relevant information from images'}, {'label': 'Data Structure', 'name': 'Tree of life taxonomy'}, {'label': 'Methodology', 'name': 'Supervised classification'}, {'label': 'Model', 'name': 'CLIP'}, {'label': 'Model', 'name': 'OpenCLIP'}, {'label': 'Model', 'name': 'ResNet'}, {'label': 'Model', 'name': 'Swin Transformer'}, {'label': 'Model', 'name': 'ALIGN'}, {'label': 'Model', 'name': 'BASIC'}]}\n", + "[INFO] Wohoo ! 
Entity using embeddings is matched --- tree of life taxonomy -merged--> tree of life \n", + "[INFO] Extracting Relations from the Document 5\n", + "{'relationships': [{'startNode': 'supervised classification', 'endNode': 'resnet', 'name': 'is a method used by'}, {'startNode': 'supervised classification', 'endNode': 'swin transformer', 'name': 'is a method used by'}, {'startNode': 'clip', 'endNode': 'openclip', 'name': 'is the basis for'}, {'startNode': 'clip', 'endNode': 'tree of life', 'name': 'is adapted for use in'}, {'startNode': 'openclip', 'endNode': 'tree of life', 'name': 'is adapted for use in'}, {'startNode': 'computational methods', 'endNode': 'extracting biologically relevant information from images', 'name': 'are used for'}, {'startNode': 'clip', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'openclip', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'resnet', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'swin transformer', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'align', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'basic', 'endNode': 'extracting biologically relevant information from images', 'name': 'is used for'}, {'startNode': 'onoclea sensibilis', 'endNode': 'onoclea hintonii', 'name': 'is closely related to'}]}\n", + "[INFO] Extracting Entities from the Document 6\n", + "{'entities': [{'label': 'Dataset', 'name': 'TREEOFLIFE-10M'}, {'label': 'Model', 'name': 'BIOCLIP'}, {'label': 'Technique', 'name': 'contrastive pre-training objective'}, {'label': 'Technique', 'name': 'mixed text type training strategy'}, {'label': 'Dataset', 'name': 'BIOSCAN-1M'}, {'label': 'Technique', 'name': 'multimodal contrastive learning'}, {'label': 
'Technique', 'name': 'flattening the taxonomy'}, {'label': 'Technique', 'name': 'cosine learning rate schedule'}, {'label': 'Data Structure', 'name': 'ViT-B/16 vision transformer image encoder'}, {'label': 'Data Structure', 'name': '77-token causal autoregressive transformer text encoder'}, {'label': 'Technique', 'name': 'zero-shot classification'}, {'label': 'Technique', 'name': 'few-shot learning'}, {'label': 'Technique', 'name': 'out-of-distribution generalization'}, {'label': 'Technique', 'name': 'species-classification objective'}, {'label': 'Dataset', 'name': 'iNat21'}, {'label': 'Technique', 'name': 'hierarchical classification'}]}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- treeoflife 10m -merged--> tree of life \n", + "[INFO] Wohoo ! Entity using embeddings is matched --- few shot learning -merged--> few shot classification \n", + "[INFO] Wohoo ! Entity using embeddings is matched --- hierarchical classification -merged--> hierarchical representation \n", + "[INFO] Extracting Relations from the Document 6\n", + "{'relationships': [{'startNode': 'bioclip', 'endNode': 'tree of life', 'name': 'is a foundation model for'}, {'startNode': 'bioclip', 'endNode': 'contrastive pre training objective', 'name': 'utilizes'}, {'startNode': 'bioclip', 'endNode': 'mixed text type training strategy', 'name': 'employs'}, {'startNode': 'bioclip', 'endNode': 'zero shot classification', 'name': 'achieves strong performance in'}, {'startNode': 'bioclip', 'endNode': 'few shot classification', 'name': 'achieves strong performance in'}, {'startNode': 'bioclip', 'endNode': 'out of distribution generalization', 'name': 'demonstrates'}, {'startNode': 'bioclip', 'endNode': 'species classification objective', 'name': 'is trained on'}, {'startNode': 'bioclip', 'endNode': 'hierarchical representation', 'name': 'learns'}, {'startNode': 'contrastive pre training objective', 'endNode': 'multimodal contrastive learning', 'name': 'is a type of'}, {'startNode': 'mixed text 
type training strategy', 'endNode': 'flattening the taxonomy', 'name': 'incorporates'}, {'startNode': 'bioclip', 'endNode': 'cosine learning rate schedule', 'name': 'is trained using'}, {'startNode': 'bioclip', 'endNode': 'vit b/16 vision transformer image encoder', 'name': 'uses'}, {'startNode': 'bioclip', 'endNode': '77 token causal autoregressive transformer text encoder', 'name': 'uses'}]}\n", + "[INFO] Wohoo ! Relation using embeddings is matched --- is a foundation model for -merged--> is a vision foundation model for \n", + "[INFO] Wohoo ! Relation using embeddings is matched --- is trained on -merged--> trained on \n", + "[INFO] Wohoo ! Relation using embeddings is matched --- learns -merged--> has learned a \n", + "[INFO] Wohoo ! Relation using embeddings is matched --- is trained using -merged--> trained on \n", + "Some isolated entities without relations were detected ... trying to solve them!\n", + "{'relationships': [{'startNode': 'bioscan 1m', 'endNode': 'TREEOFLIFE-10M', 'name': 'is part of'}, {'startNode': 'inat21', 'endNode': 'TREEOFLIFE-10M', 'name': 'contributes to'}]}\n", + "{'name': 'TREEOFLIFE-10M', 'label': 'entity', 'properties': {'embeddings': array([-0.02074538, -0.00246318, -0.01296366, ..., -0.01759039,\n", + " -0.00745565, 0.01343074])}}\n", + "{'name': 'TREEOFLIFE-10M', 'label': 'entity', 'properties': {'embeddings': array([-0.02076307, -0.00246319, -0.01295488, ..., -0.01757282,\n", + " -0.00743804, 0.01340434])}}\n", + "[INFO] Extracting Entities from the Document 7\n", + "{'entities': [{'label': 'Dataset', 'name': 'TREEOFLIFE-10M'}, {'label': 'Dataset Component', 'name': 'BIOSCAN-1M'}, {'label': 'Methodology', 'name': 'mixed text type training strategy'}, {'label': 'Model', 'name': 'BIOCLIP'}, {'label': 'Process', 'name': 'classification'}, {'label': 'Process', 'name': 'data collection and labeling'}, {'label': 'Challenge', 'name': 'classifying rare species'}, {'label': 'Challenge', 'name': 'taxonomic classification dynamics'}, 
{'label': 'Constraint', 'name': 'computational constraints'}, {'label': 'Research Area', 'name': 'zero-shot generalization'}, {'label': 'Research Area', 'name': 'text diversity and model performance'}]}\n", + "[INFO] Wohoo ! Entity using embeddings is matched --- treeoflife 10m -merged--> tree of life \n", + "[INFO] Wohoo ! Entity using embeddings is matched --- classifying rare species -merged--> rare species \n", + "[INFO] Wohoo ! Entity using embeddings is matched --- zero shot generalization -merged--> zero shot classification \n", + "[INFO] Extracting Relations from the Document 7\n", + "{'relationships': [{'startNode': 'bioclip', 'endNode': 'tree of life', 'name': 'is a foundation model for'}, {'startNode': 'bioclip', 'endNode': 'classification', 'name': 'is used for'}, {'startNode': 'bioclip', 'endNode': 'zero shot classification', 'name': 'achieves strong performance in'}, {'startNode': 'bioclip', 'endNode': 'rare species', 'name': 'classifies effectively'}, {'startNode': 'bioclip', 'endNode': 'mixed text type training strategy', 'name': 'utilizes'}, {'startNode': 'bioclip', 'endNode': 'taxonomic classification dynamics', 'name': 'adapts to'}, {'startNode': 'bioclip', 'endNode': 'computational constraints', 'name': 'is limited by'}, {'startNode': 'bioclip', 'endNode': 'text diversity and model performance', 'name': 'is influenced by'}, {'startNode': 'data collection and labeling', 'endNode': 'bioclip', 'name': 'supports training of'}, {'startNode': 'bioscan 1m', 'endNode': 'bioclip', 'name': 'provides data for'}, {'startNode': 'bioscan 1m', 'endNode': 'data collection and labeling', 'name': 'is a result of'}]}\n", + "[INFO] Wohoo ! 
Relation using embeddings is matched --- is a foundation model for -merged--> is a vision foundation model for \n" + ] + } + ], + "source": [ + "global_ent, global_rel = itext2kg.build_graph(sections=semantic_blocks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global entities as context (less precise but more enriched graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "global_entities = itext2kg.extract_entities_for_all_sections(sections=semantic_blocks)\n", + "global_relations = itext2kg.extract_relations_for_all_sections(sections=semantic_blocks, entities=global_entities)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Draw the graph\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This final section visualizes the constructed knowledge graph with GraphIntegrator. It accesses a Neo4j graph database using the connection settings defined in the next cell (URI, username and password) and renders the entities and relationships extracted from the document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "from itext2kg.graph_integration import GraphIntegrator\n", + "\n", + "# Neo4j settings come from the environment (password is prompted for if unset) so no secret is stored in the notebook.\n", + "URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "USERNAME = os.environ.get(\"NEO4J_USERNAME\", \"neo4j\")\n", + "PASSWORD = os.environ.get(\"NEO4J_PASSWORD\") or getpass(\"Neo4j password: \")\n", + "\n", + "new_graph = {\"nodes\": global_ent, \"relationships\": global_rel}\n", + "\n", + "GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}