diff --git a/ch8/ner-with-haystack.ipynb b/ch8/ner-with-haystack.ipynb index 535b773..bf29672 100644 --- a/ch8/ner-with-haystack.ipynb +++ b/ch8/ner-with-haystack.ipynb @@ -105,65 +105,128 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 37, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'content_type': 'text/html',\n", - " 'url': 'https://www.britannica.com/topic/Chinese-New-Year',\n", - " 'named_entities': [NamedEntityAnnotation(entity='MISC', start=0, end=16, score=np.float32(0.8706437)),\n", - " NamedEntityAnnotation(entity='MISC', start=30, end=44, score=np.float32(0.91554195)),\n", - " NamedEntityAnnotation(entity='LOC', start=61, end=66, score=np.float32(0.9186233)),\n", - " NamedEntityAnnotation(entity='MISC', start=67, end=74, score=np.float32(0.66412055)),\n", - " NamedEntityAnnotation(entity='MISC', start=82, end=96, score=np.float32(0.8688859)),\n", - " NamedEntityAnnotation(entity='MISC', start=121, end=137, score=np.float32(0.9623936)),\n", - " NamedEntityAnnotation(entity='LOC', start=165, end=170, score=np.float32(0.99973375)),\n", - " NamedEntityAnnotation(entity='MISC', start=175, end=182, score=np.float32(0.99974746)),\n", - " NamedEntityAnnotation(entity='MISC', start=311, end=318, score=np.float32(0.9997546)),\n", - " NamedEntityAnnotation(entity='MISC', start=414, end=428, score=np.float32(0.9671516)),\n", - " NamedEntityAnnotation(entity='LOC', start=523, end=528, score=np.float32(0.9997794)),\n", - " NamedEntityAnnotation(entity='MISC', start=588, end=604, score=np.float32(0.9242876)),\n", - " NamedEntityAnnotation(entity='MISC', start=650, end=665, score=np.float32(0.7919462)),\n", - " NamedEntityAnnotation(entity='MISC', start=713, end=729, score=np.float32(0.9364104)),\n", - " NamedEntityAnnotation(entity='MISC', start=761, end=777, score=np.float32(0.9695166)),\n", - " NamedEntityAnnotation(entity='PER', start=859, end=861, score=np.float32(0.73694575)),\n", - " NamedEntityAnnotation(entity='MISC', start=865, end=871, score=np.float32(0.7323043)),\n", - " NamedEntityAnnotation(entity='MISC', start=1304, end=1320, score=np.float32(0.9760999)),\n", - " NamedEntityAnnotation(entity='MISC', start=1443, end=1459, score=np.float32(0.93961865)),\n", - " NamedEntityAnnotation(entity='MISC', start=1673, end=1689, score=np.float32(0.93505925)),\n", - " NamedEntityAnnotation(entity='MISC', start=1704, end=1720, score=np.float32(0.92978096)),\n", - " NamedEntityAnnotation(entity='MISC', start=1838, end=1845, score=np.float32(0.9997471))]}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "extracted_documents[0].meta" + "import pandas as pd\n", + "\n", + "# Function to extract entity annotations into a DataFrame\n", + "# Function to extract uniquely identified named entities into a DataFrame with URL\n", + "def extract_named_entities_with_ids_and_url(documents):\n", + " extracted_data = []\n", + " for document in documents:\n", + " content = document.content\n", + " doc_id = document.id\n", + " url = document.meta.get('url', 'N/A') # Default to 'N/A' if URL is not available\n", + " named_entities = document.meta.get('named_entities', [])\n", + " for entity in named_entities:\n", + " word = content[entity.start:entity.end]\n", + " extracted_data.append({\n", + " 'document_id': doc_id,\n", + " 'word': word,\n", + " 'entity_type': entity.entity,\n", + " 'score': float(entity.score),\n", + " 'url': url\n", + " })\n", + " \n", + " # Convert to pandas DataFrame\n", + " df = pd.DataFrame(extracted_data)\n", + " return df\n", + "\n", + "# Extract and display named entities with unique IDs and URLs\n", + "df_entities = extract_named_entities_with_ids_and_url(extracted_documents)\n", + "df_entities.drop_duplicates(subset=['word', 'entity_type','score'], inplace=True)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + " | document_id | \n", + "word | \n", + "score | \n", + "url | \n", + "
---|---|---|---|---|
entity_type | \n", + "\n", + " | \n", + " | \n", + " | \n", + " |
LOC | \n", + "5 | \n", + "5 | \n", + "5 | \n", + "5 | \n", + "
MISC | \n", + "55 | \n", + "55 | \n", + "55 | \n", + "55 | \n", + "
ORG | \n", + "6 | \n", + "6 | \n", + "6 | \n", + "6 | \n", + "
PER | \n", + "4 | \n", + "4 | \n", + "4 | \n", + "4 | \n", + "